├── .github └── workflows │ └── lint_and_tests.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── data ├── nllb200 │ ├── LICENSE │ └── README.md ├── tatoeba │ └── v1 │ │ ├── README.md │ │ ├── tatoeba.afr-eng.afr │ │ ├── tatoeba.afr-eng.eng │ │ ├── tatoeba.amh-eng.amh │ │ ├── tatoeba.amh-eng.eng │ │ ├── tatoeba.ang-eng.ang │ │ ├── tatoeba.ang-eng.eng │ │ ├── tatoeba.ara-eng.ara │ │ ├── tatoeba.ara-eng.eng │ │ ├── tatoeba.arq-eng.arq │ │ ├── tatoeba.arq-eng.eng │ │ ├── tatoeba.arz-eng.arz │ │ ├── tatoeba.arz-eng.eng │ │ ├── tatoeba.ast-eng.ast │ │ ├── tatoeba.ast-eng.eng │ │ ├── tatoeba.awa-eng.awa │ │ ├── tatoeba.awa-eng.eng │ │ ├── tatoeba.aze-eng.aze │ │ ├── tatoeba.aze-eng.eng │ │ ├── tatoeba.bel-eng.bel │ │ ├── tatoeba.bel-eng.eng │ │ ├── tatoeba.ben-eng.ben │ │ ├── tatoeba.ben-eng.eng │ │ ├── tatoeba.ber-eng.ber │ │ ├── tatoeba.ber-eng.eng │ │ ├── tatoeba.bos-eng.bos │ │ ├── tatoeba.bos-eng.eng │ │ ├── tatoeba.bre-eng.bre │ │ ├── tatoeba.bre-eng.eng │ │ ├── tatoeba.bul-eng.bul │ │ ├── tatoeba.bul-eng.eng │ │ ├── tatoeba.cat-eng.cat │ │ ├── tatoeba.cat-eng.eng │ │ ├── tatoeba.cbk-eng.cbk │ │ ├── tatoeba.cbk-eng.eng │ │ ├── tatoeba.ceb-eng.ceb │ │ ├── tatoeba.ceb-eng.eng │ │ ├── tatoeba.ces-eng.ces │ │ ├── tatoeba.ces-eng.eng │ │ ├── tatoeba.cha-eng.cha │ │ ├── tatoeba.cha-eng.eng │ │ ├── tatoeba.cmn-eng.cmn │ │ ├── tatoeba.cmn-eng.eng │ │ ├── tatoeba.cor-eng.cor │ │ ├── tatoeba.cor-eng.eng │ │ ├── tatoeba.csb-eng.csb │ │ ├── tatoeba.csb-eng.eng │ │ ├── tatoeba.cym-eng.cym │ │ ├── tatoeba.cym-eng.eng │ │ ├── tatoeba.dan-eng.dan │ │ ├── tatoeba.dan-eng.eng │ │ ├── tatoeba.deu-eng.deu │ │ ├── tatoeba.deu-eng.eng │ │ ├── tatoeba.dsb-eng.dsb │ │ ├── tatoeba.dsb-eng.eng │ │ ├── tatoeba.dtp-eng.dtp │ │ ├── tatoeba.dtp-eng.eng │ │ ├── tatoeba.ell-eng.ell │ │ ├── tatoeba.ell-eng.eng │ │ ├── tatoeba.epo-eng.eng │ │ ├── tatoeba.epo-eng.epo │ │ ├── tatoeba.est-eng.eng │ │ ├── tatoeba.est-eng.est │ │ ├── tatoeba.eus-eng.eng │ │ ├── tatoeba.eus-eng.eus │ │ ├── tatoeba.fao-eng.eng │ │ ├── tatoeba.fao-eng.fao │ │ ├── tatoeba.fin-eng.eng │ │ ├── tatoeba.fin-eng.fin │ │ ├── tatoeba.fra-eng.eng │ │ ├── tatoeba.fra-eng.fra │ │ ├── tatoeba.fry-eng.eng │ │ ├── tatoeba.fry-eng.fry │ │ ├── tatoeba.gla-eng.eng │ │ ├── tatoeba.gla-eng.gla │ │ ├── tatoeba.gle-eng.eng │ │ ├── tatoeba.gle-eng.gle │ │ ├── tatoeba.glg-eng.eng │ │ ├── tatoeba.glg-eng.glg │ │ ├── tatoeba.gsw-eng.eng │ │ ├── tatoeba.gsw-eng.gsw │ │ ├── tatoeba.heb-eng.eng │ │ ├── tatoeba.heb-eng.heb │ │ ├── tatoeba.hin-eng.eng │ │ ├── tatoeba.hin-eng.hin │ │ ├── tatoeba.hrv-eng.eng │ │ ├── tatoeba.hrv-eng.hrv │ │ ├── tatoeba.hsb-eng.eng │ │ ├── tatoeba.hsb-eng.hsb │ │ ├── tatoeba.hun-eng.eng │ │ ├── tatoeba.hun-eng.hun │ │ ├── tatoeba.hye-eng.eng │ │ ├── tatoeba.hye-eng.hye │ │ ├── tatoeba.ido-eng.eng │ │ ├── tatoeba.ido-eng.ido │ │ ├── tatoeba.ile-eng.eng │ │ ├── tatoeba.ile-eng.ile │ │ ├── tatoeba.ina-eng.eng │ │ ├── tatoeba.ina-eng.ina │ │ ├── tatoeba.ind-eng.eng │ │ ├── tatoeba.ind-eng.ind │ │ ├── tatoeba.isl-eng.eng │ │ ├── tatoeba.isl-eng.isl │ │ ├── tatoeba.ita-eng.eng │ │ ├── tatoeba.ita-eng.ita │ │ ├── tatoeba.jav-eng.eng │ │ ├── tatoeba.jav-eng.jav │ │ ├── tatoeba.jpn-eng.eng │ │ ├── tatoeba.jpn-eng.jpn │ │ ├── tatoeba.kab-eng.eng │ │ ├── tatoeba.kab-eng.kab │ │ ├── tatoeba.kat-eng.eng │ │ ├── tatoeba.kat-eng.kat │ │ ├── tatoeba.kaz-eng.eng │ │ ├── tatoeba.kaz-eng.kaz │ │ ├── tatoeba.khm-eng.eng │ │ ├── tatoeba.khm-eng.khm │ │ ├── tatoeba.kor-eng.eng │ │ ├── tatoeba.kor-eng.kor │ │ ├── 
tatoeba.kur-eng.eng │ │ ├── tatoeba.kur-eng.kur │ │ ├── tatoeba.kzj-eng.eng │ │ ├── tatoeba.kzj-eng.kzj │ │ ├── tatoeba.lat-eng.eng │ │ ├── tatoeba.lat-eng.lat │ │ ├── tatoeba.lfn-eng.eng │ │ ├── tatoeba.lfn-eng.lfn │ │ ├── tatoeba.lit-eng.eng │ │ ├── tatoeba.lit-eng.lit │ │ ├── tatoeba.lvs-eng.eng │ │ ├── tatoeba.lvs-eng.lvs │ │ ├── tatoeba.mal-eng.eng │ │ ├── tatoeba.mal-eng.mal │ │ ├── tatoeba.mar-eng.eng │ │ ├── tatoeba.mar-eng.mar │ │ ├── tatoeba.max-eng.eng │ │ ├── tatoeba.max-eng.max │ │ ├── tatoeba.mhr-eng.eng │ │ ├── tatoeba.mhr-eng.mhr │ │ ├── tatoeba.mkd-eng.eng │ │ ├── tatoeba.mkd-eng.mkd │ │ ├── tatoeba.mon-eng.eng │ │ ├── tatoeba.mon-eng.mon │ │ ├── tatoeba.nds-eng.eng │ │ ├── tatoeba.nds-eng.nds │ │ ├── tatoeba.nld-eng.eng │ │ ├── tatoeba.nld-eng.nld │ │ ├── tatoeba.nno-eng.eng │ │ ├── tatoeba.nno-eng.nno │ │ ├── tatoeba.nob-eng.eng │ │ ├── tatoeba.nob-eng.nob │ │ ├── tatoeba.nov-eng.eng │ │ ├── tatoeba.nov-eng.nov │ │ ├── tatoeba.oci-eng.eng │ │ ├── tatoeba.oci-eng.oci │ │ ├── tatoeba.orv-eng.eng │ │ ├── tatoeba.orv-eng.orv │ │ ├── tatoeba.pam-eng.eng │ │ ├── tatoeba.pam-eng.pam │ │ ├── tatoeba.pes-eng.eng │ │ ├── tatoeba.pes-eng.pes │ │ ├── tatoeba.pms-eng.eng │ │ ├── tatoeba.pms-eng.pms │ │ ├── tatoeba.pol-eng.eng │ │ ├── tatoeba.pol-eng.pol │ │ ├── tatoeba.por-eng.eng │ │ ├── tatoeba.por-eng.por │ │ ├── tatoeba.ron-eng.eng │ │ ├── tatoeba.ron-eng.ron │ │ ├── tatoeba.rus-eng.eng │ │ ├── tatoeba.rus-eng.rus │ │ ├── tatoeba.slk-eng.eng │ │ ├── tatoeba.slk-eng.slk │ │ ├── tatoeba.slv-eng.eng │ │ ├── tatoeba.slv-eng.slv │ │ ├── tatoeba.spa-eng.eng │ │ ├── tatoeba.spa-eng.spa │ │ ├── tatoeba.sqi-eng.eng │ │ ├── tatoeba.sqi-eng.sqi │ │ ├── tatoeba.srp-eng.eng │ │ ├── tatoeba.srp-eng.srp │ │ ├── tatoeba.swe-eng.eng │ │ ├── tatoeba.swe-eng.swe │ │ ├── tatoeba.swg-eng.eng │ │ ├── tatoeba.swg-eng.swg │ │ ├── tatoeba.swh-eng.eng │ │ ├── tatoeba.swh-eng.swh │ │ ├── tatoeba.tam-eng.eng │ │ ├── tatoeba.tam-eng.tam │ │ ├── tatoeba.tat-eng.eng │ │ ├── tatoeba.tat-eng.tat │ │ ├── tatoeba.tel-eng.eng │ │ ├── tatoeba.tel-eng.tel │ │ ├── tatoeba.tgl-eng.eng │ │ ├── tatoeba.tgl-eng.tgl │ │ ├── tatoeba.tha-eng.eng │ │ ├── tatoeba.tha-eng.tha │ │ ├── tatoeba.tuk-eng.eng │ │ ├── tatoeba.tuk-eng.tuk │ │ ├── tatoeba.tur-eng.eng │ │ ├── tatoeba.tur-eng.tur │ │ ├── tatoeba.tzl-eng.eng │ │ ├── tatoeba.tzl-eng.tzl │ │ ├── tatoeba.uig-eng.eng │ │ ├── tatoeba.uig-eng.uig │ │ ├── tatoeba.ukr-eng.eng │ │ ├── tatoeba.ukr-eng.ukr │ │ ├── tatoeba.urd-eng.eng │ │ ├── tatoeba.urd-eng.urd │ │ ├── tatoeba.uzb-eng.eng │ │ ├── tatoeba.uzb-eng.uzb │ │ ├── tatoeba.vie-eng.eng │ │ ├── tatoeba.vie-eng.vie │ │ ├── tatoeba.war-eng.eng │ │ ├── tatoeba.war-eng.war │ │ ├── tatoeba.wuu-eng.eng │ │ ├── tatoeba.wuu-eng.wuu │ │ ├── tatoeba.xho-eng.eng │ │ ├── tatoeba.xho-eng.xho │ │ ├── tatoeba.yid-eng.eng │ │ ├── tatoeba.yid-eng.yid │ │ ├── tatoeba.yue-eng.eng │ │ ├── tatoeba.yue-eng.yue │ │ ├── tatoeba.zsm-eng.eng │ │ └── tatoeba.zsm-eng.zsm └── wmt22_african │ ├── LICENSE │ └── README.md ├── docker ├── Dockerfile ├── README.md ├── app.py └── decode.py ├── install_external_tools.sh ├── install_models.sh ├── laser_encoders ├── README.md ├── __init__.py ├── download_models.py ├── language_list.py ├── laser_tokenizer.py ├── models.py ├── test_laser_tokenizer.py ├── test_models_initialization.py └── validate_models.py ├── nllb ├── README.md ├── download_models.sh └── nllb_laser3.png ├── pyproject.toml ├── remove_external_tools.sh ├── source ├── embed.py ├── eval.py ├── lib │ ├── indexing.py │ ├── romanize_lc.py │ └── 
text_processing.py ├── mine_bitexts.py ├── nli.py ├── paraphrase.py ├── pxsim.py ├── sent_classif.py ├── similarity_search.py └── xsim.py ├── tasks ├── CCMatrix │ ├── MatrixMine.pdf │ ├── README.md │ └── dl_cc_matrix.py ├── SentimentAnalysis │ ├── README.md │ └── SentimentAnalysis.ipynb ├── WikiMatrix │ ├── README.md │ ├── WikiMatrix-bleu.pdf │ ├── WikiMatrix-sizes.pdf │ ├── extract.py │ └── list_of_bitexts.txt ├── bucc │ ├── README.md │ ├── bucc.py │ └── bucc.sh ├── clustering │ ├── LaserClusteringExample.ipynb │ └── README.md ├── embed │ ├── README.md │ └── embed.sh ├── librivox-s2s │ └── README.md ├── mldoc │ ├── README.md │ ├── mldoc.py │ └── mldoc.sh ├── pxsim │ ├── README.md │ └── eval.sh ├── similarity │ ├── README.md │ └── wmt.sh ├── wmt22 │ ├── README.md │ └── download_models.sh ├── xnli │ ├── README.md │ ├── xnli.py │ └── xnli.sh ├── xsim │ ├── README.md │ └── eval.sh └── xsimplusplus │ ├── README.md │ └── eval.sh └── utils ├── requirements.txt ├── setup.py └── src ├── __init__.py ├── cleaner_splitter.py ├── demojizer.py ├── remove_non_printing_char.py └── sentence_split.py /.github/workflows/lint_and_tests.yml: -------------------------------------------------------------------------------- 1 | name: lint_and_tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | strategy: 8 | max-parallel: 1 9 | matrix: 10 | platform: [ubuntu-latest] 11 | python-version: [3.8] 12 | 13 | runs-on: ${{ matrix.platform }} 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - name: Install dependencies 19 | run: | 20 | python --version 21 | python -m pip install --upgrade 'pip>=23.2.1' 22 | python -m pip show pip 23 | python -m pip install -e '.[dev]' 24 | 25 | - name: isort 26 | run: cd laser_encoders && isort --check --diff . 27 | 28 | - name: black 29 | run: cd laser_encoders && black --check --diff . 30 | 31 | - name: pytest 32 | run: pytest laser_encoders 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | source/__pycache__ 2 | source/lib/__pycache__ 3 | models 4 | tools-external 5 | tasks/mldoc/MLDoc 6 | embed 7 | tasks/bucc/downloaded 8 | tasks/similarity/dev/ 9 | tasks/xnli/XNLI-1.0* 10 | tasks/xnli/multinli_1.0* 11 | .??*swp 12 | .idea 13 | __pycache__ 14 | nllb 15 | dist 16 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to LASER 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | Minor changes and improvements will be released on an ongoing basis. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `master`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 
16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | ## Coding Style 29 | * 4 spaces for indentation rather than tabs 30 | * 80 character line length 31 | * PEP8 formatting 32 | 33 | ## License 34 | By contributing to LASER, you agree that your contributions will be licensed 35 | under the LICENSE file in the root directory of this source tree. 36 | 37 | 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For Language-Agnostic SEntence Representations (LASER) software 4 | 5 | Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /data/tatoeba/v1/README.md: -------------------------------------------------------------------------------- 1 | # LASER Language-Agnostic SEntence Representations 2 | 3 | LASER is a library to calculate and use multilingual sentence embeddings. 4 | 5 | # Tatoeba multilingual test set 6 | 7 | We provide here the test set for 112 languages that we used in the paper [1]. 8 | This data is extracted from the [Tatoeba corpus](https://tatoeba.org/eng/), dated Saturday 2018/11/17. 9 | 10 | For each language, we have selected 1000 English sentences and their translations, if available. 11 | Please check [this paper](https://arxiv.org/abs/1812.10464) for a description of the languages, their families and scripts, as well as baseline results.
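Each language pair is stored as two plain-text files with one sentence per line: `tatoeba.<lang>-eng.<lang>` holds the non-English side and `tatoeba.<lang>-eng.eng` holds the English side. As a minimal sketch (not part of the repository; the helper name `load_tatoeba_pair` is hypothetical, and line-by-line alignment plus UTF-8 encoding of the two files are assumed), one pair can be loaded like this:

```python
# Minimal sketch: read one Tatoeba language pair from data/tatoeba/v1
# as a list of (foreign, english) sentence tuples.
from pathlib import Path


def load_tatoeba_pair(lang: str, root: str = "data/tatoeba/v1"):
    base = Path(root)
    # File naming follows the layout shown in the directory listing above.
    src = (base / f"tatoeba.{lang}-eng.{lang}").read_text(encoding="utf-8").splitlines()
    eng = (base / f"tatoeba.{lang}-eng.eng").read_text(encoding="utf-8").splitlines()
    # The two files are assumed to be line-aligned.
    assert len(src) == len(eng), "source and English files should have the same number of lines"
    return list(zip(src, eng))


if __name__ == "__main__":
    pairs = load_tatoeba_pair("amh")
    print(len(pairs), "sentence pairs")
    print(pairs[0])
```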
12 | 13 | Please note that the English sentences are not identical for all language pairs. 14 | This means that the results are not directly comparable across languages. In particular, 15 | the sentences tend to have less variety for several low-resource languages, 16 | e.g. "Tom needed water", "Tom needs water", "Tom is getting water", .... 17 | 18 | # License 19 | 20 | Please see [here](https://tatoeba.org/eng/terms_of_use) for the license of the Tatoeba corpus. 21 | 22 | # References 23 | 24 | [1] Mikel Artetxe, Holger Schwenk, 25 | Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond, 26 | [arXiv Dec 26 2018](https://arxiv.org/abs/1812.10464) 27 | 28 | 29 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.amh-eng.amh: -------------------------------------------------------------------------------- 1 | በውነት ያስፈራል። 2 | ቶም ብዙ ጊዜ ከውሻው ጋር ይናገራል። 3 | ተማሪ አይደለሁም። 4 | እሱም አማርኛን እየተመርኩ ነው። 5 | ዘግይቼ እትምሀርት ቤት ምን ጊዜም አልደርስም። 6 | አንተ ተማሪ ነህ። 7 | ሐኪም የለሁም። 8 | ገደልኩው። 9 | ገደልኩዋት። 10 | እንግሊዝኛን እየተማርኩ ነው። 11 | ፈረንሳይኛን እየተመርኩ ነው። 12 | አንቺ ማን ነሽ? 13 | ወደ ትምህርት ቤት እየሄድክ ነው? 14 | ሰዉ ወጣት ነው። 15 | ሰዉ ወጣት አይደለም። 16 | አላውቅም። 17 | ሴቷ ወጣት ነች። 18 | ወጣት አይደለችም። 19 | በሳምንት ከሁለት እስካስር ብር ነው። 20 | ከቤቱ እስከ ባቡር ጣቢያ ድረስ ሄደ። 21 | እዚያ እሄዳለሁ። 22 | ጀግና ነኝ። 23 | በዚህ ሆቴል ውስጥ ባዶ ክፍል አለዎት? 24 | ስምህ ማን ነው? 25 | አስተርጓሚ ነኝ። 26 | እኔ አስተርጓሚ ነኝ። 27 | ወደ ትምሕርት ቤት ይሄዳል። 28 | ልጆች የሕይወታችን አበባዎች ናቸው። 29 | ወደ ትምሕርት ቤት እሄዳለሁ። 30 | እማራለሁ። 31 | አባትሽ ረጅም ነው። 32 | እንደማምነው ከሆነ ስህተት አለ። 33 | እኔ ተማሪ አይደለሁም። 34 | ሴቷ ዳቦ እየበላች ነው። 35 | እኔም ወደ ትምህርት ቤት እየሄድኩ ነው። 36 | አንድ መጽሐፍ እጽፋለሁ። 37 | መፍቻውን ስጠኝ። 38 | ቁጥሩን ስጠኝ። 39 | መሄድ ይፈልጋል። 40 | አትረብሽ። 41 | ሙዚቃውን መስማት ደስ አለን። 42 | እሷ እንድትረዳህ ትፈልጋለህ? 43 | ቦስቶን ውስጥ ለመጉብኘት ብዙ ቦታዎች አሉ። 44 | ወደ ተማሪው ቤት ሄደ። 45 | ይህን አልበልም። 46 | አንፈልገውም። 47 | እኔም ተማሪ ነኝ። 48 | የምትሄደው በተሳሳተ አቅጣጫ ነው። 49 | ስራ እየፈለግሁኝ ነው። 50 | ጨረቃው ይበራል። 51 | ነጭ ድመት አለኝ። 52 | እኔ ከካናዳ ነኝ። 53 | ምን ማዘዝ ትፈልጋለህ? 54 | ሴቶች ዓለምን ይለውጣሉ። 55 | እኔ ከክዮቶ ነኝ። 56 | እኔ ከሲንጋፖር ነኝ። 57 | እኔ ከዛምቢያ ነኝ። 58 | መራመድ እችላለሁ። 59 | እኔ ከሳፖሮ ነኝ። 60 | ሁለት ድመቶች አሉኝ። 61 | እኔ ከአሜሪካ ነኝ። 62 | እኔ ከአውስትራልያ ነኝ። 63 | ምን እየፍለግህ ነው? 64 | እኔ ከኮሎምቢያ ነኝ። 65 | ሴቷ ቆንጆ ናት። 66 | እኔ ከብራዚል ነኝ። 67 | ተመሣሣይ እወስዳለሁ። 68 | ጥሩ ሴት ነች። 69 | ማንኪያውን ስጠኝ። 70 | አጸደ እጫወታለሁ። 71 | እኔ ከቶክዮ ነኝ። 72 | ገንዘቡን አገኘሁ። 73 | ወደ አዲስ አበባ መቼ ነው የምትሄድ? 74 | ተማሪ ነኝ። 75 | አስተማሪ ነኝ። 76 | የእናት ቋንቋዬ ከእናቴ የተቀበልኩት በጣም ደጉ በረከት ነው። 77 | አስተማሪ የለሁም። 78 | አንደኛው ፎቅላይ ነው። 79 | ሰውዬው አባቱን መጥራት ፈለገ። 80 | አንተ አስተማሪ ነህ። 81 | ቀጠሮው ከማን ጋር ነው? 82 | እሱ በጣም ጥሩ ሐኪም ነው። 83 | አትረብሽ! 84 | አንተ ተማሪ ነህ? 85 | የኔ መጽሃፍ እዚህ ነው። 86 | ወደ ሱቅ እሄዳለሁ። 87 | ድመት አለኝ። 88 | ተማሪዎቹን ታያለሽ። 89 | ስሜ ጆን ነው። 90 | ምንኛ ቋንቋ ይናገራል? 91 | ሆቴል ትፈልጋለች። 92 | በሁለት ሰኣት ውስጥ፣ ወደ ቤት እንመጣለን። 93 | ቤቱን ዛሬ ታያላችሁ። 94 | ሴቷ ወጣት አይደለችም። 95 | ፈረሱን አልፈልግም። 96 | አንድ ቤት አላችሁ? 97 | ሩሲያው ትልቅ ነው። 98 | ቶም ትልቅ ነው። 99 | ወጣት አይደለም። 100 | አነጋገፌ ትክክል ነውን? 101 | ሙት አይደለሁም። 102 | እንኳን ለአለም አቀፍ የሴቶች ቀን አደረሰን 103 | ሐቁን አላውቅም። 104 | በአሁኑ ወቅት ቡርጅ ከሊፋ የአለማችን ረጅሙ ሰማይ ጠቀስ ሕንጻ ነው። 105 | ቤቶቹ እዚህ ናቸው። 106 | እርስዎስ? 107 | እኔም አስተማሪ ነኝ። 108 | እኔ ተማሪ ነኝ። እርስዎስ? 109 | አሜሪካዊ ነኝ። 110 | ትናንትና መጣሁ። 111 | ሶስት ሰዓት ነው። 112 | ኢትዮጵያዊ ነህ? 113 | ይህ ሰው ማን ነው? 114 | እናትሽን ትወጃታለሽ፧ 115 | እኔ ተማሪ ነኝ። አንተስ? 116 | ተማሪዎች ነን። 117 | በአስመራ ውስጥ ተወለድኩ። 118 | አልፈልገውም። 119 | አስተማሪዎች ናችሁ። 120 | እርሳስ አለዎት? 121 | ወደ ትምህርት ቤት መሄድ አልፈልግም። 122 | እኔ ማነኝ? 
123 | ልጁ ይህች ቆንጆ ልጃገረድን ይወዳል። 124 | አሰልቺ ነኝ። 125 | ውሃን አልጠጣሁም። 126 | ትፈልገዋለች። 127 | ትፈልግሃለች። 128 | ወደ ትምሕርት ቤት እየሄድኩ ነው። 129 | እሱ ዛፍ ስር ተኝቷል። 130 | መጽሃፉን ስጠኝ። 131 | አነጋግሬ ለየት ያለ ሊሆን ይችላል። 132 | እሞክራለሁ። 133 | ተቈጥቼ ነበር። 134 | ጋዜጠኛ ነኝ። 135 | ፓሊስ ነኝ። 136 | እኔ ፓሊስ ነኝ። 137 | ነፍጡን ስጠኝ። 138 | እኔ ተማሪ ነኝ። 139 | ሰው ነኝ። 140 | እንግዳ ነኝ። 141 | መጽሃፉ ቀይ ነው። 142 | ሴት ነኝ። 143 | እበላለሁ። 144 | እርሱ ወዴት ነው፧ 145 | መኪናቸው ጥሩ አይደለም። 146 | አንተ ትልቅ ነህ። 147 | ጥሩ ተማሪ ነው። 148 | ሐኪም ነኝ። 149 | አንተ ትደክማለህ። እኔም እደክማለሁ። 150 | ጥሩ ተማሪ ነች። 151 | ቋንቋ ሁሉ እወዳለሁ 152 | ያቺ መኪና ስንት ነው? 153 | በባሕሩ ውስጥ ደሴቶች አሉ። 154 | ሦስት ልጆች አሉኝ። 155 | ስሜ ጀክ ነው። 156 | ምንም ሰው አየ 157 | ስሜ ያማዳ ነው። 158 | ሦስት ቋንቋዎች አልናገርም 159 | ቶም ሁልጊዜ ጧት ቡና ይጠጣል 160 | ስሜ አሕመድ ነው። 161 | በሒሳብ በጣም ጐበዝ ነው። 162 | ስሜ ቁዘይ አይደለም። 163 | ባለፈው ሳምንት ከአሜሪካ አንዳንድ ማስታወሻ ላኩለት። 164 | መጽሃፉን ሰጠው። 165 | ጤናማ ልጅ ነበርኩ። 166 | ምንም አያውቁም። 167 | የት መብላት ተፈልጋለህ? 168 | እኔ ሐኪም ነኝ። 169 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.amh-eng.eng: -------------------------------------------------------------------------------- 1 | It's truly frightening. 2 | Tom often talks to his dog. 3 | I'm not a student. 4 | I'm also learning Amharic. 5 | I'm never late for school. 6 | You are a student. 7 | I'm not a doctor. 8 | I killed him. 9 | I killed her. 10 | I'm learning English. 11 | I'm learning French. 12 | Who are you? 13 | Are you going to school? 14 | The man is young. 15 | The man is not young. 16 | I don't know. 17 | The woman is young. 18 | She is not young. 19 | It is from two to ten birrs a week. 20 | He went from his house up to the railroad station. 21 | I'll go there. 22 | I'm a hero. 23 | Do you have an empty room in this hotel? 24 | What is your name? 25 | I'm a translator. 26 | I am a translator. 27 | He goes to school. 28 | Children are the flowers of our lives. 29 | I will go to school. 30 | I will learn. 31 | Your father is tall. 32 | I believe there is a mistake here. 33 | I am not a student. 34 | The woman is eating bread. 35 | I'm also going to school. 36 | I'm writing a book. 37 | Give me the key. 38 | Give me the number. 39 | He wants to go. 40 | Do not disturb. 41 | We enjoyed listening to the music. 42 | Do you want her to help you? 43 | There are a lot of places to visit in Boston. 44 | He went to the student's house. 45 | I'm not eating this. 46 | We don't want it. 47 | I'm also a student. 48 | You're going the wrong direction. 49 | I'm looking for work. 50 | The moon is shining. 51 | I have a white cat. 52 | I'm from Canada. 53 | What would you like to order? 54 | Women change the world. 55 | I'm from Kyoto. 56 | I'm from Singapore. 57 | I'm from Zambia. 58 | I can walk. 59 | I'm from Sapporo. 60 | I have two cats. 61 | I'm from America. 62 | I'm from Australia. 63 | What are you looking for? 64 | I'm from Colombia. 65 | The woman is beautiful. 66 | I'm from Brazil. 67 | I'll have the same. 68 | She's a good woman. 69 | Give me the spoon. 70 | I'm playing in the garden. 71 | I'm from Tokyo. 72 | I found the money. 73 | When are you going to Addis Ababa? 74 | I'm a student. 75 | I'm a teacher. 76 | My native language is the most beautiful gift from my mother. 77 | I'm not a teacher. 78 | It's on the first floor. 79 | The man wanted to call his father. 80 | You're a teacher. 81 | With whom is the appointment? 82 | He is a very good doctor. 83 | Do not disturb! 84 | Are you a student? 85 | My book is here. 86 | I'm going to a store. 87 | I have a cat. 88 | You'll see the students. 89 | My name is John. 
90 | What language is he speaking? 91 | She's looking for a hotel. 92 | We'll come home in two hours. 93 | You'll see the house today. 94 | The woman is not young. 95 | I don't want the horse. 96 | Do you have a house? 97 | Russia is big. 98 | Tom is big. 99 | He is not young. 100 | Am I pronouncing this correctly? 101 | I'm not dead. 102 | Happy International Women's Day! 103 | I don't know the truth. 104 | Burj Khalifa is currently the tallest skyscraper in the world. 105 | The houses are here. 106 | How about you? 107 | I'm also a teacher. 108 | I'm a student. And you? 109 | I am an American. 110 | I came yesterday. 111 | It's three o'clock. 112 | Are you an Ethiopian? 113 | Who is this man? 114 | Do you love your mother? 115 | I am a student, and you? 116 | We are students. 117 | I was born in Asmara. 118 | I don't want it. 119 | You are teachers. 120 | Do you have a pencil? 121 | I don't want to go to school. 122 | Who am I? 123 | The boy loves this beautiful girl. 124 | I am boring. 125 | I didn't drink the water. 126 | She wants him. 127 | She wants you. 128 | I'm going to school. 129 | He was sleeping under the tree. 130 | Give me the book. 131 | My accent is probably strange. 132 | I will try. 133 | I was angry. 134 | I'm a journalist. 135 | I'm a policeman. 136 | I am a policeman. 137 | Give me the gun. 138 | I am a student. 139 | I'm a man. 140 | I'm a foreigner. 141 | His book is red. 142 | I'm a woman. 143 | I will eat. 144 | Where is he? 145 | Their car is not good. 146 | You are big. 147 | He is a good student. 148 | I'm a doctor. 149 | You're tired. I'm also tired. 150 | She is a good student. 151 | I like all languages. 152 | How much does that car cost? 153 | There are islands in the sea. 154 | I have three children. 155 | My name is Jack. 156 | He saw nobody. 157 | My name is Yamada. 158 | I don't speak three languages. 159 | Tom always drinks coffee in the morning. 160 | My name is Ahmad. 161 | In mathematics, he is an ace. 162 | My name is not Kuzey. 163 | Last week, I mailed him some souvenirs from the U.S. 164 | He gave him the book. 165 | I was a healthy child. 166 | They don't know anything. 167 | Where do you want to eat? 168 | I am a doctor. 169 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.ast-eng.ast: -------------------------------------------------------------------------------- 1 | El pan faise con farina, agua y formientu. 2 | Tom ye un neñu traviesu abondo. 3 | Pa mercar una entrá vas tener qu'esperar como poco una hora. 4 | El xefe va enoxase conmigo. 5 | ¿Por qué nun fuiste a la oficina? 6 | Los nenos, que taben frayaos, quedaron dormíos darréu. 7 | Préstame xugar al fútbol. 8 | Tolos días, sacando xueves ya vienres, xuntaba colos amigos na tertulia. 9 | La música ye el alma de la xeometría. 10 | Tom y Mary dixeron que marchaban. 11 | Na cama aou durmi la mio moza nun fae fríu. 12 | Yes mui guapa. 13 | Nun fai falta dir. 14 | ¡Ay, ho! ¿Pues cómo nun fuiste quién d'algamalu inda? 15 | Tengo fame. 16 | Préstame tar contigo. 17 | Una bisarma anda Europa – la bisarma del comunismu. 18 | Tengo de subir al árbol. 19 | Tienes de facelo meyor. 20 | Traime esi vasu lleche. 21 | Ficisti too lo que pudisti. 22 | Tienes de garrar l'autobús. 23 | Tien la zuna de escargatiar nes ñarres. 24 | ¿Crees que ye posible? 25 | ¿Quies dir de compres? 26 | Tienes que dir a chucate. 27 | Prestome ayudar a Tom a mercar lo que-y facía falta. 28 | Ta sangrándome la rodiya. 
29 | El Burj Khalifa ye por agora el rascacielos más altu del mundu. 30 | Vimos daqué estraño nel cielu. 31 | ¿Qué cosa cosadiella ye? Una vieya con un diente, apiella a tola xente. La campana. 32 | Toi xintando pasta. 33 | El gatu salió del cestu. 34 | Ye la verdá. 35 | ¿Apetezte venir al baile? 36 | Enseñáronme fotos prestoses asgaya. 37 | Pasé una tarde pistonuda. 38 | Toi bien namorada. 39 | Toi arguyosu de lo bien que cocina mio pá. 40 | ¡Munches gracies! 41 | A la fiesta vinieron sólo seis persones. 42 | Sólo queremos d'atopar a Tom. 43 | ¿Quién yes tú? 44 | Ellos llegaron tarde por causa de la tormenta. 45 | Esto ye lo que tenía que te dicir. Ye de lo más cenciello. Nun se pue ver bien namás que col corazón. Lo esencial nun se ve colos güeyos. 46 | ¿D'ú vienes? 47 | El maestru, al nun oyer la campana, nun paró de falar. 48 | ¿Ulu? 49 | ¿Ula? 50 | ¿Ulo? 51 | ¿Cuál ye'l to profesor favoritu? 52 | ¡Democracia real yá! 53 | Lleva-y les llaves al to hermanu. 54 | ¿Tienes dalgún llibru pa lleer? 55 | Diz Tom que nun se arrepiente de res. 56 | Presumes más qu'un ratón encima un quesu. 57 | Tom deprendióme munches coses útiles. 58 | ¿Nun te dixe que zarraras la puerta? 59 | Ha acabar too llueu abondo. 60 | La verdá ye que yo nunca nun quixe dir allí. 61 | Inda nun atopé trabayu. 62 | ¿Qué fais ho? 63 | Nun diba ser quien de dalo fecho ensin ti. 64 | ¿Qué fais ne? 65 | ¿Préstate correr? 66 | Esa película ye afayadiza pa xente de toles edaes. 67 | La xente miente dacuando. 68 | Nun me da más si me cree daquién o non. 69 | ¡Que semeya más guapa! 70 | El cielu del atapecer ye roxu. 71 | El Burj Khalifa ye actualmente el rascacielos más altu del mundu. 72 | "¿Ye ella moza?" "Sí." 73 | ¡¿Tu yes faltosu o fais-te, ho?! 74 | Tuvi una xinta llixera. 75 | Valíame más tener dexao la pistola au la atopé. 76 | Andorra ye un pequeñu principat, asitiáu entre España y Francia. 77 | Nun gastes más perres de les que ganes. 78 | Esa muyer encaprichose contigo. 79 | El lladrón tenía ferramientes afayaices pa forzar peslleres. 80 | Siento muncho haber tao tantu tiempu ensín escribite. 81 | Por favor, dime lo que quies. 82 | La idea nun ye nueva. 83 | ¿Necesites les llaves? 84 | ¡Fai el favor de vestite, que vamos llegar tarde! 85 | ¡Bon nataliegu! 86 | El fueu siempre ye peligrosu. 87 | Pa daquién que se supón ye un espertu, nun paez que usté sepa muncho. 88 | María ponse toa nerviosa cuando Tom conduz demasiao rápido. 89 | Vivimos xuntos. 90 | Ella frotose los güeyos. 91 | Ella ta amurniada. 92 | Préstame trabayar. 93 | ¿Cual ye'l to equipu favoritu? 94 | Nun te queda otra que facelo. 95 | El músicu ye tan popular nel estranxeru como en Xapón. 96 | Ye más fácil divertise que trabayar. 97 | El unicorniu ye un monstruu fabulosu. 98 | Pa nuesu que tás nel altor, seya’l to nome santificáu. Amiye’l to reinu. 99 | El gatu metióse pente les tables. 100 | ¿Qué faes aende? 101 | Dio-ylu el to xenru, que-y fía bona falta. 102 | Les cartes unviáronles el mes pasáu. 103 | Vémonos la selmana que vien. 104 | Nun falo castellanu. 105 | Ella dió-y un regalu. 106 | ¿Mancástete? 107 | Nin tú nin naide nun vais quitame de dir. 108 | Pa entamar, tienes que dexar de fumar. 109 | Dixo que diba a echa-yos una mano. 110 | ¿Danos permisu pa facer un enllaz de la so páxina al nuestru sitiu corporativu? 111 | Nun soi l'únicu que piensa asina. 112 | Desque-y lo igües, has llavar les manes que tiénesles abondo puerques. 113 | Voi pa la rula. ¿Qué quies de xinta? ¿Xarda o parrochina? 114 | ¡A-neno! ¿Nun oyes? 
¡Apurre-y el martiellu ya ten pol clavo! 115 | Rapaz, aballa el xiculate, que nun ensame. 116 | Fexo un furacón grande abondo pa mangar n' ello una carrada de gouños. 117 | El budismu orixinose na India. 118 | Escribió un llibru sobre China. 119 | Van de compres. 120 | Tom nun trouxo les sos llaves 121 | Dexáronme facer lo que me diera pola gana. 122 | Bien fechu, Tom. 123 | Rindióse. 124 | Nun falé con Tom desque salió del hospital 125 | La nena dexó escapar al páxaru. 126 | Ía prestame que fixeras una prueba de sangre. 127 | El nenu mancóse. 128 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.ast-eng.eng: -------------------------------------------------------------------------------- 1 | Bread is made from flour, water and yeast. 2 | Tom is quite a mischievous child. 3 | You'll have to wait at least an hour to get a ticket. 4 | The boss is going to get mad at me. 5 | Why didn't you go to the office? 6 | The kids, who were exhausted, fell asleep right away 7 | I like to play soccer. 8 | Every day, except Thursdays and Fridays, he gathered with his friends for chatting 9 | Music is the soul of geometry. 10 | Tom and Mary said that they were leaving. 11 | In the bed where my girlfriend sleeps it is not cold 12 | You're very pretty. 13 | You don't need to go. 14 | My, my! How come you haven't been able to reach it, yet? 15 | I am hungry. 16 | I like to be with you. 17 | A spectre is haunting Europe - the spectre of communism. 18 | I need to climb the tree. 19 | You've got to do better. 20 | Bring me that glass of milk. 21 | You've done all you can. 22 | You've got a bus to catch. 23 | He's got the bad habit of picking his nose. 24 | Do you think it's possible? 25 | Do you want to go shopping? 26 | You need to get to bed. 27 | I was happy to be able to help Tom buy what he needed. 28 | My knee is bleeding. 29 | Burj Khalifa is currently the tallest skyscraper in the world. 30 | We saw a strange object in the sky. 31 | Guess the riddle. An old woman with one tooth, summons everybody. The bell. 32 | I'm eating pasta. 33 | My cat came out of the basket. 34 | It's the truth. 35 | How would you like to go to a dance? 36 | They showed me a lot of nice pictures. 37 | I had a lovely night. 38 | I'm so in love! 39 | I'm proud that my father is a good cook. 40 | Thanks a lot! 41 | Only six people came to the party. 42 | We just want to find Tom. 43 | Who are you? 44 | They arrived late because of the storm. 45 | And now here is my secret, a very simple secret: It is only with the heart that one can see rightly; what is essential is invisible to the eye. 46 | Where are you from? 47 | The teacher, without hearing the bell, didn't stop talking. 48 | Where is he? 49 | Where is she? 50 | Where is it? 51 | Who's your favorite teacher? 52 | Free democracy now! 53 | Take the keys to your brother. 54 | Do you have any books to read? 55 | Tom says he doesn't have any regrets. 56 | You brag more than a mouse on cheese. 57 | Tom taught me a lot of useful things. 58 | Didn't I tell you to close the door? 59 | It'll all be over soon. 60 | I never actually wanted to go there. 61 | I still haven't found work. 62 | What do you make? 63 | I wouldn't have been able to do it without you. 64 | What do you do? 65 | Do you like to run? 66 | That movie is suitable for people of all ages. 67 | Sometimes, people lie. 68 | I don't care if anyone believes me or not. 69 | What a beautiful picture! 70 | The sky at dusk is red. 
71 | Currently Burj Khalifa is the tallest skyscraper in the world. 72 | "Is she young?" "Yes, she is." 73 | You're retarded, or something?! 74 | I ate a light lunch. 75 | I should've left the gun where I found it. 76 | Andorra is a small principality situated between Spain and France. 77 | Don't spend more than you earn. 78 | That woman is infatuated with you. 79 | The thief had special tools for picking locks. 80 | I am sorry that I have not written to you for such a long time. 81 | Please let me know what you want. 82 | The idea isn't new. 83 | You need the keys? 84 | Make the favor of dressing up because we're going to get late! 85 | Happy birthday to you! 86 | Fire is always dangerous. 87 | For someone who's supposed to be an expert, you don't seem to know much. 88 | Mary gets nervous when Tom drives too fast. 89 | We live together. 90 | She rubbed her eyes. 91 | She's in a depression. 92 | I like to work. 93 | Which is your favorite team? 94 | You had no choice but to do it. 95 | The musician is famous abroad as well as in Japan. 96 | It is easier to have fun than to work. 97 | The unicorn is a fabulous monster. 98 | Our Father who art above, hallowed be thy name, thy kingdom come. 99 | The cat went through the boards. 100 | What are you doing there? 101 | Your son-in-law gave it to him, for he needed it badly. 102 | They sent the letters last month. 103 | We see each other next week. 104 | I don't speak Spanish. 105 | She gave him a present. 106 | Did you hurt yourself? 107 | Neither you nor anyone will prevent me from going 108 | First, you have to stop smoking. 109 | He said he would give a helping hand to them. 110 | Would you please make a hyperlink to our corporate site from your page? 111 | I am not alone in thinking so. 112 | Once you have fixed it for him, do wash your hands for they are dirty indeed. 113 | I'm on my way to the fish market. What would you like for lunch? Mackerel or sardine? 114 | Kid! Can't you hear? Hand him the hammer and hold the nail! 115 | Kid, stir the chocolate, so that it doesn't stick. 116 | He made a hole big enough to put inside a chart-load of pebbles 117 | Buddhism had its beginnings in India. 118 | He wrote a book on China. 119 | They are going shopping. 120 | Tom didn't bring his keys. 121 | They let me do whatever I wanted. 122 | You did well, Tom. 123 | He gave up. 124 | I haven't spoken with Tom since he got out of the hospital. 125 | The girl let the bird loose. 126 | I'd like you to have a blood test. 127 | The kid got hurt. 128 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.awa-eng.awa: -------------------------------------------------------------------------------- 1 | का देस ह! 2 | केन सैको स बड़का है। 3 | अलजेरिया मोरा देस ह। 4 | मेक्सिको मा स्पेनिश बोली जात है। 5 | फ़्रांस माँ फ़्रांसीसी बोली जात है। 6 | अंग्रेजी कैनडा मा बोली जात ह का? 7 | कनाडा माँ अंग्रेजी बोली जात है। 8 | स्पेनिश मेक्सिको मा बोली जात ह का? 9 | सिंगापुर मा अंग्रेजी बोली जात है। 10 | आस्ट्रेलिया मा अंग्रेजी बोली जात है। 11 | स्वित्ज़रलैंड मा फ़्रांसीसी बोली जात है। 12 | सुन्नर देस ह। 13 | हिआँ भवा एइसा मइँ कहेउँ नाहीं। 14 | मोका तीस मिनट द्या। 15 | नाशी भाखा चीन मा बोली जात है। 16 | माओरी भाखा न्यूज़ीलैंड मा बोली जात है। 17 | कोंकणी महाराष्ट्र, गोआ अउर करनाटक मा बोली जात है। 18 | उ दुकान रविवार क बन्द रहत ह। 19 | मोरे चाचा क इटली मां एक घर ह। 20 | अउर काहे नाहीं? 21 | "काहे?" "काहे नाहीं?" 22 | ई काहे ह? 23 | कल काहे नाहीं? 
24 | मइँ हुवाँ गवा रहा। 25 | ई जनावरन है। 26 | ऊ दक्खिन स आया। 27 | टाम क जनावरन पसन्द है। 28 | पोलेंड बड़ा देस ह। 29 | मोका आनिमे पसन्द है। 30 | टाम क जनावरन पसन्द रहीं। 31 | पान्डा सुन्नर जनावरन है। 32 | उ जनावर ह। 33 | टाम जनावर ह। 34 | वालोनिया सुन्नर देस ह। 35 | इटली एक बहोत सुन्नर देस ह। 36 | कौआ उड़ी गवा। 37 | आर्मिनिया पहाड़ी देस ह। 38 | का सुन्नर ठउर ह! 39 | इ ठउर सुन्नर ह। 40 | टाम क ठउर ल्या। 41 | इ ठउर मोका पसन्द ह। 42 | इ फ़ाक्ता सैनफ्रन्सिस्को स न्यूयार्क उड़ेस। 43 | ऊ एक सेब खात है। 44 | मोरा नाउँ वांग ह। 45 | ई एक किताब है। 46 | युक्रेन बड़ा देस ह। 47 | किताब है। 48 | का चलत ह? 49 | वेल्श सुन्नर भाषा ह। 50 | उ नाउँ केन ह। 51 | हर एक देस क आपन आपन इतिहास होत ह। 52 | टाम रकत दिहेस। 53 | रकत नाहीं ह। 54 | पानी ह का? 55 | टाम क पानी क जरूरत रही। 56 | टाम क पानी क जरूरत ह। 57 | मोका पानी क जरूरत ह। 58 | समइ खतम। 59 | मोका समइ द्या। 60 | ओका समइ द्या। 61 | अंग्रेजी एक भाखा ह। 62 | समइ लागत ह। 63 | टाम क समइ क जरूरत रही। 64 | समइ ह? 65 | अउर थोड़ा समइ ल्या। 66 | तोहरी भाखा तोहार धरम ह। 67 | आपन समइ ल्या, योशिदा। 68 | टाम मसहूर ह। 69 | टाम मसहूर रहा। 70 | टाम मसहूर ह का? 71 | उ घर मसहूर ह। 72 | इ घर मसहूर ह। 73 | टाम मसहूर नाहीं रहा। 74 | टाम बहोत मसहूर ह। 75 | मइँ ज्यादा मसहूर अहइँ। 76 | मइँ कम मसहूर अहइँ। 77 | बाहेर अँधियारा ह। 78 | फुन का भवा? 79 | अराम करा। 80 | राजा इ सुबह सिकार करइ गवा। 81 | अराम करा! 82 | अराम कर ल्या। 83 | टाम क अराम क जरूरत रही। 84 | टाम क अराम क जरूरत ह। 85 | टाम क थोडी़ अराम क जरूरत ह। 86 | मोका अराम क जरूरत ह। 87 | इ मोरी बिटिया ह। 88 | मोरी एक बिटिया ह। 89 | मोरी दुइ बिटियन ह। 90 | उ मोरी बिटिया ह। 91 | मोरी बिटिया क अल्ट्रामेन पसन्द ह। 92 | तोहरी बिटियन सुन्नर है। 93 | मेरी तोहार बिटिया ह का? 94 | इ तोहार बिटियन ह का? 95 | टाम क एक बिटिया रही। 96 | रुमी पहिली बिटिया ह। 97 | सुज़ुकी जी क तीन बिटियन ह। 98 | मोरी बड़ी बिटिया विवाहित ह। 99 | मोरा नाउँ हाशिमोतो ह। 100 | ओकर नाउँ टाम रहा। 101 | रूस बड़ा ह। 102 | टाम बड़ा ह। 103 | इ बड़ा ह। 104 | टाम बड़ा ह का? 105 | इ मछरी बड़ी ह। 106 | उ घर बड़ा ह। 107 | मोरा नाउँ साशा ह। 108 | मोरा नाउँ यामादा ह। 109 | तोहरा घर बड़ा ह। 110 | बाजार बड़ा ह। 111 | उ ओकर नाउँ जेन राखेस। 112 | हइदराबाद प १९४८ तलक एक निजाम क राज रहा। 113 | सूरज बड़ा ह। 114 | सहर बड़ा ह। 115 | अमरीका बहोत बड़ा ह। 116 | घर बड़ा ह। 117 | ओकायामा बड़ा सहर ह का? 118 | ओकर नाउँ का ह? 119 | मोरा नाउँ अहमद ह। 120 | अकास साफ ह। 121 | मोरा नाउँ जान ह। 122 | मोरा नाउँ टाम ह। 123 | उ ओकर नाउँ रहा। 124 | बहोत सुन्नर रहा। 125 | अजय गरीब ह। 126 | टोनी खुस रहा। 127 | केन खुस ह। 128 | ऊ खुस ह। 129 | खुसी का ह? 130 | मइँ खुस अहउँ। 131 | टाम खुस ह। 132 | टाम खुस रहा। 133 | मइँ खुस रहा। 134 | टाम खुस ह का? 135 | मोका खुस करा। 136 | टाम खुस रहा का? 137 | मोरा नाउँ शु ह। 138 | ओकर नाउँ टाम ह। 139 | तू मोका रकत द्या, मइँ तोका आज़ादी देबउँ। 140 | मइँ गाभीन अहउँ। 141 | मइँ गाभीन अहउँ का? 142 | ओकर नाउँ का रहा? 143 | उ गाभीन ह। 144 | मेरी गाभीन ह। 145 | ओकर नाउँ निना ह? 146 | मोका मोरी भाखा पसन्द ह। 147 | टाम, मइँ गाभीन अहउँ। 148 | पान्डा गाभीन ह। 149 | मोरा नाउँ हेनरी ह। 150 | भाखा नाहीं, रास्ट्र नाहीं। 151 | जिन्नगी सुन्नर ह। 152 | ऊ हमै मालुम है। 153 | टाम ह मोरा नाउँ। 154 | जिन्नगी सपन ह। 155 | इ मोरी जिन्नगी ह। 156 | स्वीडन क आपन भाखा ह। 157 | मोका मोरी जिन्नगी पस्न्द ह। 158 | जिन्नगी सर्कस ह। 159 | माको आक्सिटन भाखा पसन्द ह। 160 | अंग्रेजी जर्मेनिक भाखा ह। 161 | जिन्नगी एक भेंट ह। 162 | टाम क आपन जिन्नगी पसन्द ह। 163 | मेरी क आपन जिन्नगी पसन्द ह। 164 | मनई क जिन्नगी पवित्र ह। 165 | इ मोरी जिन्नगी ह! 166 | तेहरान कहां ह? 167 | अकास बड़ा ह। 168 | अकास साफ भवा। 169 | अकास लाल भवा। 170 | तू का जवाब दिहस? 
171 | निउए एक देस ह। 172 | आइरिश सुन्नर भाखा ह। 173 | हां, मुला उ? 174 | हुवाँ कउनो नाहीं रहा। 175 | अरबी आसान भाखा ह। 176 | हमेसा मइँ काहे? 177 | उ सुन्नर ह। 178 | इ सु्न्नर ह। 179 | मेहररूअन सुन्नर ह। 180 | लारी सुन्नर ह। 181 | मेरी सुन्नर ह। 182 | मइँ टाम हन्टर। 183 | जापान सुन्नर देस ह। 184 | पाकिस्तान मुस्लिम देस ह। 185 | अमरीका सुन्नर ह। 186 | सुन्नर रहा। 187 | टरकी सुन्नर देस ह। 188 | टाम सिकारी ह। 189 | उ सुन्नर रहा। 190 | आस्ट्रेलिया सुन्नर देस ह। 191 | इटली सुन्नर देस ह। 192 | ब्राजिल बड़ा देस ह। 193 | इ एक सुन्नर देस ह। 194 | का सुन्नर सहर ह! 195 | ठीक है। 196 | फुन राज्यन अउ छोटे देसन विकसित हुएन। 197 | इ गुलाब सुन्नर ह। 198 | मोरा नाउँ जेक ह। 199 | इ नदी सुन्नर ह। 200 | मोरा नाउँ हाप्किन्स ह। 201 | मोरा नाउँ हिसाशी ह। 202 | मोरा नाउँ यातारोउ ह। 203 | नाउँ का ह? 204 | उ बहोत सुन्नर ह। 205 | कछू किसान रहेन, कछू सिकारी रहेन। 206 | ई इस्कूल है। 207 | उ मेहरारू सुन्नर ह। 208 | मोरा नाउँ फरशाद ह। 209 | का ह आहके अधिकार? 210 | प्राग बहोत सुन्नर ह। 211 | मोरा नाउँ लुइस ह। 212 | उ सहर सुन्नर ह। 213 | मोरा नाउँ सेली ह। 214 | मोरा नाउँ जिसुंग ह। 215 | सूचोउ बहोत सुन्नर ह। 216 | उ घर सुन्नर ह। 217 | का सुन्नर घर ह! 218 | इ एक नाउँ ह। 219 | हां, उ सुन्नर ह। 220 | मेरी बहोत सुन्नर ह। 221 | सबहिं मेहररूअन सुन्नर ह। 222 | मोरा नाउँ सउन्दरराजन ह। 223 | सचमुच सुन्नर ह। 224 | ओकर नाउँ मेरी ह। 225 | बहोत सुन्नर ह। 226 | उ ठउर मोका पसन्द ह। 227 | मे अप्रैल क पाछे आवत ह। 228 | ओकरे पाछे का भवा? 229 | ओकरे पाछे उ घर गएस। 230 | अल्लाह महान ह! 231 | चीन बड़ा देस ह। 232 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.awa-eng.eng: -------------------------------------------------------------------------------- 1 | What a country! 2 | Ken is older than Seiko. 3 | Algeria is my country. 4 | Spanish is spoken in Mexico. 5 | French is spoken in France. 6 | Is English spoken in Canada? 7 | English is spoken in Canada. 8 | Is Spanish spoken in Mexico? 9 | English is spoken in Singapore. 10 | English is spoken in Australia. 11 | French is spoken in Switzerland. 12 | It's a beautiful country. 13 | I didn't say it happened here. 14 | Give me thirty minutes. 15 | The Naxi language is spoken in China. 16 | The Maori language is spoken in New Zealand. 17 | Konkani is spoken in Maharashtra, Goa and Karnataka. 18 | The shop is closed on Sundays. 19 | My uncle has a house in Italy. 20 | And why not? 21 | "Why?" "Why not?" 22 | Why is this? 23 | Why not tomorrow? 24 | I had gone there. 25 | These are animals. 26 | He came from the south. 27 | Tom likes animals. 28 | Poland is a big country. 29 | I like animes. 30 | Tom liked animals. 31 | Pandas are beautiful animals. 32 | He's an animal. 33 | Tom is an animal. 34 | Wallonia is a beautiful country. 35 | Italy is a very beautiful country. 36 | The crow flew away. 37 | Armenia is a mountainous country. 38 | What a beautiful place! 39 | This place is beautiful. 40 | Take Tom's place. 41 | I like this place. 42 | This pigeon flew from San Francisco to New York. 43 | He's eating an apple. 44 | My name is Wang. 45 | This is a book. 46 | Ukraine is a big country. 47 | It is a book. 48 | What's going on? 49 | Welsh is a beautiful language. 50 | That name is Ken. 51 | Every country has its own history. 52 | Tom gave blood. 53 | It's not blood. 54 | Is there water? 55 | Tom needed water. 56 | Tom needs water. 57 | I need water. 58 | Time is up. 59 | Give me time. 60 | Give him time. 61 | English is a language. 62 | It takes time. 63 | Tom needed time. 64 | Is there time? 65 | Take some time. 
66 | Your language is your religion. 67 | Take your time, Yoshida. 68 | Tom is famous. 69 | Tom was famous. 70 | Is Tom famous? 71 | That house is famous. 72 | This house is famous. 73 | Tom was not famous. 74 | Tom is very famous. 75 | I'm more famous. 76 | I'm less famous. 77 | It is dark outside. 78 | Then what happened? 79 | Take a rest. 80 | The king went hunting this morning. 81 | Take a rest! 82 | Get some rest. 83 | Tom needed rest. 84 | Tom needs rest. 85 | Tom needs some rest. 86 | I need to rest. 87 | This is my daughter. 88 | I have a daughter. 89 | I have two daughters. 90 | She's my daughter. 91 | My daughter likes Ultraman. 92 | Your daughters are beautiful. 93 | Is Mary your daughter? 94 | Are these your daughters? 95 | Tom had one daughter. 96 | Rumi is the first daughter. 97 | Mr Suzuki has three daughters. 98 | Her older daughter is married. 99 | My name is Hashimoto. 100 | His name was Tom. 101 | Russia is big. 102 | Tom is big. 103 | This is big. 104 | Is Tom big? 105 | This fish is big. 106 | That house is big. 107 | My name is Sascha. 108 | My name is Yamada. 109 | Your house is big. 110 | The market is big. 111 | They named her Jane. 112 | Hyderabad was ruled by a nizam until 1948. 113 | The sun is big. 114 | The city is big. 115 | America is very big. 116 | The house is big. 117 | Is Okayama a big city? 118 | What is his name? 119 | My name is Ahmad. 120 | The skies are clear. 121 | My name is John. 122 | My name is Tom. 123 | That was his name. 124 | It was very beautiful. 125 | Ajay is poor. 126 | Tony was happy. 127 | Ken is happy. 128 | She is happy. 129 | What is happiness? 130 | I am happy. 131 | Tom is happy. 132 | Tom was happy. 133 | I was happy. 134 | Is Tom happy? 135 | Make me happy. 136 | Was Tom happy? 137 | My name is Shu. 138 | His name's Tom. 139 | Give me your blood, I will give you freedom. 140 | I am pregnant. 141 | Am I pregnant? 142 | What was his name? 143 | She is pregnant. 144 | Mary is pregnant. 145 | Her name is Nina? 146 | I like my language. 147 | Tom, I'm pregnant. 148 | The panda is pregnant. 149 | My name is Henry. 150 | No language, no nation. 151 | Life is beautiful. 152 | We know him. 153 | Tom is my name. 154 | Life is a dream. 155 | It's my life. 156 | Sweden has its own language. 157 | I like my life. 158 | Life is a circus. 159 | I like the Occitan language. 160 | English is a Germanic language. 161 | Life is a gift. 162 | Tom likes his life. 163 | Mary likes her life. 164 | Human life is sacred. 165 | It's my life! 166 | Where is Tehran? 167 | The sky is big. 168 | The sky cleared up. 169 | The sky was red. 170 | What did you answer? 171 | Niue is a country. 172 | Irish is a beautiful language. 173 | Yes, but that? 174 | There was no one there. 175 | Arabic is a simple language. 176 | Why always me? 177 | She is beautiful. 178 | This is beautiful. 179 | Women are beautiful. 180 | Laurie is beautiful. 181 | Mary is beautiful. 182 | I'm Tom Hunter. 183 | Japan is a beautiful country. 184 | Pakistan is a Muslim country. 185 | America is beautiful. 186 | It was beautiful. 187 | Turkey is a beautiful country. 188 | Tom is a hunter. 189 | That was beautiful. 190 | Australia is a beautiful country. 191 | Italy is a beautiful country. 192 | Brazil is a big country. 193 | This is a beautiful country. 194 | What a beautiful city! 195 | It's all right! 196 | Kingdoms and small countries then developed. 197 | This rose is beautiful. 198 | My name is Jack. 199 | This river is beautiful. 200 | My name is Hopkins. 201 | My name is Hisashi. 
202 | My name is Yatarou. 203 | What's its name? 204 | She is very beautiful. 205 | Some were farmers, some were hunters. 206 | This is a school. 207 | The woman is beautiful. 208 | My name is Farshad. 209 | What are their rights? 210 | Prague is very beautiful. 211 | My name is Luis. 212 | The town is beautiful. 213 | My name is Sally. 214 | My name is Jisung. 215 | Suzhou is very beautiful. 216 | The house is beautiful. 217 | What a beautiful house! 218 | This is a name. 219 | Yes, that's beautiful. 220 | Mary is very beautiful. 221 | All women are beautiful. 222 | My name is Soundararajan. 223 | It's really beautiful. 224 | Her name is Mary. 225 | It's very beautiful. 226 | I like that place. 227 | May comes after April. 228 | What happened after that? 229 | After that, he went home. 230 | Allah is great! 231 | China is a large country. 232 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.cha-eng.cha: -------------------------------------------------------------------------------- 1 | Kao malago' hao bumaila yan guåhu? 2 | Para guiya u åpåsi todu. 3 | Guidza i taotao ni matmo korason nya ninangga. 4 | Ohala homlo' hao ti åpmam. 5 | Ågang i polisia! 6 | Humanao si nana-nya para estados unidos para u edzak fino' ingles. 7 | Sa' ni hafa hongga ginen guidza, hu fongge' gue' ta'lo. 8 | Gaige si tåta gi kusina? 9 | Taimanu na linekka' ogsŏ' fuji? 10 | Man ma'udai gi tren. 11 | Anai mafiti pat magualu sakkan-hu, hu tutuhon mamåhan kandi ni salape'hu mismo ya todo i tiempu hu fåfahan måsmelu. 12 | Maila' mågi, ga'lågȗ. 13 | Gof respetadzon attista siha idza frånsia. 14 | Mangangasi si Tom. 15 | Umadotgan i hugua kuåttȗ. 16 | Debi bai få'pos på'guguha'? 17 | Bai karera gi ega'an. 18 | Dzan hagu lokkue'. 19 | Esti na måkina ma fåbrika idza frånsia. 20 | Siempre dzamu gue'. 21 | Sen triste dzu' tumungo' na mumatai si tatamu. 22 | Adzu siha na tånŏ' eståba påttěn frånsia. 23 | Adahen maolek, na gi papa' enao siha na kondishon, tåya remedio ki tafan ñodda' otrŏ fafahan. 24 | Todȗ taotåguě man mafa'nyågŏ librě jzan manačaigua mamiresě respetȗ jzan tininas. Man manå'ě abilidåt rason jzan kunsiensa jza ufan átråta unŏ jzan otrȗ kȗmŏ mohon manye'lŏ. 25 | Maolek pågȗ. Cha'mȗ luhan. Sinya un angkokku dzŏ' gåtos pot gåtos. 26 | Dångkulŏ gumå'hu. 27 | Atanon i kastidzu. 28 | Sinya un kånnu' todo malago'mȗ. 29 | Ni ti tinaka' hugua såkkan mumedzing fumino' pottuges si Melissa. 30 | Håfa na'ånmŏ? 31 | Man komite ham pot prutekshon famagu'on. 32 | Guåha dångkulŏn plåsan kareta gi me'nan estasion tren. 33 | Nina'matman i gobetnadot ni ineppen i komishon. 34 | Båsȗn taifondȗ. 35 | Kuåntŏ i kumishŏn? 36 | Sigi ha' mås man netbiŏs. 37 | Guåhŏ bai falag i tasi, lao hågŏ, otronya påtti, un falag i eskuela. 38 | "Nobelu? Hu chatli'ě' nobelu." Ai musa, asi'ě' guě' ni chatfino'nya. 39 | Håfa gi ofisina? 40 | Kinalesan kabadzȗ i dos guihi na puengi. 41 | Guaha hu hungok. Imposipblě, tåya' taotao gi halom guma'. 42 | Gi sena gi sigenti dia, ha introdusi dzu' gi asaguånya. 43 | Dzanggěn lataftaf mohon kahulŏ' si Sam, ti umadingo ni bås. 44 | Enfin, hu uma i tåguan tataotaonya gi 'inai. 45 | I kabådzu tai kangilung; i guaka yan kinilu man gai kangilung. 46 | En faktutura tulȗ' pot gåtus (3%) na kumishȗn. 47 | Pumåra macho'chȗ i bisikletåhu. 48 | Mahuchom esti na kåha. 49 | An bumiåhe dzu', ganyahu plinen. 50 | Hafa adai, guahu si Nancy. 51 | Man ririses i famagu'on på'gŏ. 52 | Man dångkulȗ esti siha na ga'lågŏ. 53 | Meggai na mansåbě ma konsidedera esti na te'ori. 
54 | Ti presiso un kuenta enao na fåktŏ. 55 | Ha nyangon dzu' gi sigenti oga'an: Ta buetta hit Paris. Ilegnya ti guailadzi bai såga na maisa gi adzu na lugåt na'mase'. Depotsi taiguenao. 56 | Pådzon-niha i man rikȗ ma dispresia i mamobblî. 57 | Poddung i presiun kåtně. 58 | Adzuda yȗ' sumoda'ě kŏtbåta ni umaddza dzan esti na såkȗ. 59 | Naturåt para guidza fumino' franses. 60 | Maolek esti na gaseta, no? 61 | Umetupak si Jim ginen i bantalå'an. 62 | Cha'ot-hu kuåttŏ nai bula man-tsitsipa. 63 | Umaksidentî sa' båba mañugon. 64 | Humahalom dzȗ' hådzî tumutuhon i fåbulas tsispas. 65 | Ni håfa na lengguåhě un estudia ti siña hao sin diksionårio. 66 | Minegaiña na taotao ma kontra gera. 67 | Månu na pen anako'ña, ini pat enao? 68 | Ma na'atlibes i preposiȗn lai åntěs di ma påsa. 69 | Ha esgaihon dzu' si George asta i gimå'hu. 70 | Ma'å'ñao si Mary ga'lågȗ. 71 | I mediu tempo na hotnaleru siha impottante påtten niha gi kinalamten i ekonomia. 72 | Hafa adai tatatmanu hao? 73 | Agupa' påguan notze buena. 74 | Na' adzao dzȗ' fan salåppî'. 75 | I Komision ma konklusa na ahě' i ineppi. 76 | Todȗ mannyichi'. 77 | Guåha traidot entre hita. 78 | I "A" sinku biåhe anako'nya ki i "B". 79 | Sa' håfa na tanto taotao mabisisita Kyoto? 80 | Tådza' guåha bai enakompanya hao gi pokatmŏ? 81 | Un fabobot pat un kokontra i planuña ni ha proponě gi miting? 82 | Tanga didide' si biuhu. 83 | Enao guě' mås dångkulȗ propbleman måmě. 84 | Geftao dzan plasidȗ si Tom. 85 | Onların ziyan olmasına izin verme. 86 | Ni ngai'an nahong un linguåhe. 87 | I la manche/English Channel sumepåpara inglatera dzan frånsia. 88 | Numangu si Tom gi saddŏk 89 | I geran gof tumutuhŏn i dekådan 1990. 90 | Numangu si Tom kada' ha'åně. 91 | Ha osgi si tatånya. 92 | Måsa i mansåna 93 | Guse'nya ha na'fonhådzan che'cho'nya gi nigap. 94 | Punȗ' i kandit. 95 | Chatmata guě'. 96 | I palåbran "amour" ginen i ottsitan i trobadores, sinoke humudzongya "ameur". 97 | Inglatera un tåno' anai ti man inå'atmas i polisia. 98 | Håfa uttimoña esti na dråma? 99 | Sa' håfa na i mattsing mås kalan taotao ke otro gå'ga'? 100 | Mappot ma esplikan ñaihon håfa kumeke ilegña. 101 | Umaliansa islan dzan frånsia. 102 | Hu nesisita i kemmŏn. 103 | I presidenten frånsia ha bisita Okinawa. 104 | Ti menosña interisåntě esti na lepblŏ ki enao. 105 | Ni håfa kinano'ña asta ke guě' ma reskåta. 106 | Si Tom ha fafata na påtgon riku guě'. 107 | Ti pot håfa lao umo'otru i asaguanya yanggen ha sangan nyanyaihon dzu'. Lao diberas dimasiau lokkue ha mensionan nyaihon dzu'. 108 | Taimanu masångan-ña "good bye" gi fino' Aleman? 109 | I hues ha talabira i desision finåt. 110 | Muna'huyong si Yu'us i tåno'. 111 | Kao siña hu ayuda hao? 112 | Deposita esti na tsek gi akuentahu tsek. 113 | Mås ki hu 'agradesi i inadzudåmŏ. 114 | Bai hu konne' hao guatu guihi. 115 | Oi? Gagaigě hao ha'? 116 | Me'nan esti guě' hafa hu chocho'guě diåriȗ. 117 | Ha såsangan si Tom na tulo simåna guě' ti chumochȗ. 118 | Ilegnya si Tom na karetånya adzu i ta li'ě' gi nigap ni guaguan yan agaga'. 119 | Ti guidzadza libiånŏ esti na tso'tsu'. 120 | Håfa tatamanu hao? 121 | Åpmam tiempo ti hu li'e hao. 122 | Håyi nå'ån-mu? 123 | Nå'ån-hu si Jack. 124 | Taotao månu hao? 125 | Taotao månu hao, Karen? 126 | Ta fañocho! 127 | Ginen gumera frånsia dzan rusia. 128 | Malagȗ' dzu' malak frånsia. 129 | Buen biåhe. 130 | Mångge i kemmon? 131 | Hu guiaya hao. 132 | Bula håsuli iyo-ku hovercraft. 133 | Si Mary mås gagȗ' entrě man-ga'tsongya eskuela. 134 | Umaguaiya si Nicholas yan si Maria. 135 | Umatungo' i dos palao'an. 
136 | Pot fabot tuge' fan påpa'. 137 | Machocho'chȗ' si Harry gi tienda oran tanoris. 138 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.cha-eng.eng: -------------------------------------------------------------------------------- 1 | Would you like to dance with me? 2 | She will pay for everything. 3 | He is a man whose heart is filled with hope. 4 | I hope that you will get well soon. 5 | Call the police! 6 | Her mother went to the United States to learn English. 7 | Because I didn't hear from him, I wrote to him again. 8 | Is dad in the kitchen? 9 | How high is Mt. Fuji? 10 | They got into the train. 11 | When I was 17 or 18 years old, I started buying candy with my own money and I always bought marshmallows. 12 | Come here doggie. 13 | Artists are highly respected in France. 14 | Tom is teasing. 15 | The two rooms are connected. 16 | Do I need to leave immediately? 17 | I leave in the morning. 18 | Don't mention it. 19 | This machine was manufactured in France. 20 | You'll come to like her. 21 | It saddens me greatly to know that your father died. 22 | Those countries used to belong to France. 23 | Bear in mind that, under such circumstances, we have no alternative but to find another buyer. 24 | All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. 25 | It's OK now. Don't worry. You can depend on me one hundred percent. 26 | My house is big. 27 | That castle is beautiful. 28 | You can eat all you want. 29 | Melissa became fluent in Portuguese in less than two years. 30 | What is your name? 31 | We are in commission concerning the protection of children's rights. 32 | There is a large parking lot in front of the station. 33 | The governor was surprised by the commission's response. 34 | It's a cup without a saucer. 35 | How much is the commission? 36 | They became more and more nervous. 37 | I will go to the sea, but you, on the other hand, will go to school. 38 | "Novels? I hate novels." Oh muse, forgive her blasphemy. 39 | How's things at the office? 40 | The couple took a horse carriage that evening. 41 | "I heard something." "Impossible, there's nobody in the house." 42 | The next day, at suppertime, I was introduced to her husband. 43 | If Sam had woken up earlier, he wouldn't have missed the bus. 44 | At last, I brought her exhausted body upon the beach. 45 | The horse doesn't have horns; the ox and the sheep have horns. 46 | We charge a commission of 3%. 47 | My bicycle has gone out of commission. 48 | This register is out of commission. 49 | When I travel, I prefer to travel by air. 50 | Hello, I am Nancy. 51 | The students are having a recess now. 52 | These dogs are big. 53 | Most experts think a lot of his theory. 54 | You need not take account of the fact. 55 | The following morning she said close to my ears: "We're going back to Paris tonight. He says that there's no way that he'll leave me alone in such a sad place. That's how it is." 56 | The rich are apt to look down upon the poor. 57 | The price of meat dropped. 58 | Help me pick out a tie to go with this suit. 59 | It comes natural to him to speak French. 60 | It's a very good newspaper, isn't it? 61 | Jim went fishing from the pier. 62 | Being in a room full of smokers is my pet peeve. 63 | The accident was due to bad driving. 64 | I wonder who started that rumor. 65 | Whatever language you study, you cannot do without a dictionary. 
66 | The people at large are against war. 67 | Which is longer, this pen or that one? 68 | The bill was eviscerated before being passed by the legislature. 69 | George accompanied me home. 70 | Merry is scared of dogs. 71 | Part-time workers play an important role in the development of the economy. 72 | Hello, how are you? 73 | Tomorrow is Christmas Day. 74 | Would you lend me some money? 75 | The commission concluded that the answer was no. 76 | They all smiled. 77 | We have a traitor among us. 78 | A is 5 times as long as B. 79 | Why do so many people visit Kyoto? 80 | May we accompany you on your walk? 81 | Are you for or against the plan he put forward at the meeting? 82 | My grandfather is a bit hard of hearing. 83 | That was our biggest problem. 84 | Tom is generous and kind. 85 | Don't let them go to waste. 86 | One language is never enough. 87 | The English Channel separates England and France. 88 | Tom is swimming in the river. 89 | The 1990s began with the Gulf War. 90 | Tom goes swimming every day. 91 | She takes after her father. 92 | The apple is ripe. 93 | She must have finished the work yesterday. 94 | He turned off the light. 95 | He has poor eyesight. 96 | The French word "amour" comes from the Occitan language through the troubadours, otherwise it would be "ameur". 97 | England is a land where the policemen carry no revolvers. 98 | How does this drama end? 99 | Why have the apes evolved more than other animals? 100 | It is difficult to convey the meaning exactly. 101 | Iceland entered into an alliance with France. 102 | I have to use the bathroom. 103 | The President of France visited Okinawa. 104 | This book is not less amusing than that one. 105 | She did not eat anything until she was rescued. 106 | Tom claimed he was the son of a rich man. 107 | For some reason, her husband seemed to dislike that she spoke of me. In truth, she talked too much about me. 108 | How do you say "good bye" in German? 109 | The judge reversed the final decision. 110 | God created the world. 111 | May I help you? 112 | Deposit this check in my checking account. 113 | I am more than grateful to you for your help. 114 | I'll take you there. 115 | Hello? Are you still here? 116 | I swear this is what I do every morning. 117 | Tom claimed that he had not eaten for three weeks. 118 | Tom claimed that he owned that expensive-looking red car we saw yesterday. 119 | This work is not necessarily easy. 120 | How are you? 121 | Long time, no see. 122 | What's your name? 123 | My name is Jack. 124 | Where are you from? 125 | Where are you from, Karen? 126 | Enjoy your meal! 127 | France was at war with Russia. 128 | I would like to go to France. 129 | I hope you have a good trip. 130 | Where's the toilet? 131 | I love you. 132 | My hovercraft is full of eels. 133 | Mary's the laziest of her schoolmates. 134 | Nicholas and Maria love each other. 135 | The two women know each other. 136 | Please write it down. 137 | Harry works part-time at the local supermarket. 138 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.gsw-eng.eng: -------------------------------------------------------------------------------- 1 | My name is Hopkins. 2 | My name is Yamada. 3 | I'm fine, thanks. 4 | Thanks, I'm fine. 5 | I don't feel well. 6 | What about you? 7 | I lost my wallet. 8 | Long time, no see. 9 | Pleased to meet you. 10 | Is there a hospital nearby? 11 | The cat is white. 12 | Have a nice day. 13 | What is your name? 14 | What is your last name? 
15 | Have a nice meal! 16 | What is your native language? 17 | Have a good journey! 18 | I don't understand. 19 | Speak more slowly, please! 20 | The book is violet. 21 | Speak more slowly, please. 22 | Please write it down. 23 | The house is pretty. 24 | The flower is beautiful. 25 | I thank you. 26 | You are welcome! 27 | You are welcome. 28 | To be or not to be, that is the question. 29 | Can you speak more slowly? 30 | I love you. 31 | I love you! 32 | Get well soon! 33 | The book is red. 34 | Leave me alone! 35 | Leave me alone. 36 | She is Betty. 37 | We are not jealous. 38 | Can we go? 39 | Call the police! 40 | Call the police. 41 | My hovercraft is full of eels. 42 | One language is never enough. 43 | I come from Boston. 44 | See you later! 45 | How are you? 46 | I am single. 47 | Are you married? 48 | What is your occupation? 49 | What's your occupation? 50 | Shall we go? 51 | Can you help me? 52 | We are not rich. 53 | Nice to meet you. 54 | I am thirsty. 55 | Can I help you? 56 | Where are you from? 57 | Enjoy your meal! 58 | I come from Saitama. 59 | Have a nice evening. 60 | Have a nice weekend. 61 | All the best! 62 | Have a nice day! 63 | I am fine, thank you. 64 | I am hungry. 65 | Nice to meet you too. 66 | I don't know. 67 | How old are you? 68 | They come from Beijing. 69 | Do you come from Tokyo? 70 | What's your name? 71 | Do you come from Berlin? 72 | You come from Taipei. 73 | The cat is brown. 74 | You come from Sweden. 75 | The girl is beautiful. 76 | The flower is blue. 77 | The cat is old. 78 | He comes from Hangzhou. 79 | The flower is yellow. 80 | The check, please. 81 | The dog is white. 82 | The dog is beautiful. 83 | The dog is red. 84 | The dog is black. 85 | The flower is red. 86 | I can speak English a little. 87 | I do not understand. 88 | The cat is black. 89 | I am sorry. 90 | Can you repeat that? 91 | Thank you very much! 92 | Don't mention it. 93 | The woman is young. 94 | The woman is not young. 95 | He comes from Geneva. 96 | Do you come from Beijing? 97 | The book is old. 98 | The book is black. 99 | The book is blue. 100 | The book is yellow. 101 | The book is green. 102 | The book is brown. 103 | The book is orange. 104 | The house is red. 105 | The book is pink. 106 | Tom has a bicycle. 107 | I am cold. 108 | The cat is adorable. 109 | The book is white. 110 | I have a slight headache. 111 | I have a headache. 112 | The woman is beautiful. 113 | She comes from Germany. 114 | What is your first name? 115 | He has a bicycle. 116 | I have a bicycle. 117 | My name is Jack. 118 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.gsw-eng.gsw: -------------------------------------------------------------------------------- 1 | Mi name isch Hopkins. 2 | Mi name isch Yamada. 3 | Mir gaats guet, danke. 4 | Danke, mir gaats guet. 5 | Mir gaats nöd so guet. 6 | Und dir? 7 | Ich han mis Portmone verloore. 8 | Mr hänn is schon lang nümme g'seh. 9 | Fröit mi. 10 | Häts da es Spitaal i de Nööchi? 11 | S Büsi isch wiis. 12 | Ich wünsch Ihne e schöne Daag. 13 | Wiä häissät Si? 14 | Wiä häissisch zum Nachnamä? 15 | En Guete! 16 | Was isch dini Mueterschpraach? 17 | Gueti Reis! 18 | Ich verstand nit. 19 | Bitte schwätze Sie langsamer. 20 | S Buech isch violett. 21 | Bitte schwätz langsamer. 22 | Bitte schriibe Sie das uf. 23 | S Huus isch schöön. 24 | D Blueme isch schöön. 25 | Merci. 26 | Nüt z'dangge. 27 | Gäärn gscheh. 28 | Sy, oder Nödsy, das isch hie d Frag. 29 | Chönd Si langsamer redä? 
30 | Ich liib dich. 31 | I ha Di gärn. 32 | Gueti Besserig. 33 | S Buech isch root. 34 | Löhn Si mi in Ruh. 35 | Loh mi in Ruh. 36 | Si isch d Betty. 37 | Mir sind nöd eifersüchtig. 38 | Chömer? 39 | Riefe Sie dr Polizei! 40 | Rief dr Polizei! 41 | Mis Luftchüssiboot isch volle Aal. 42 | Ai Sprooch isch nie gnueg. 43 | Ich chumm us Boston. 44 | Bis schpööter! 45 | Wiä häschs? 46 | Ich bi ledig. 47 | Bisch verhäiraatet? 48 | Was isch din Prueff? 49 | Was sind Si vo Prueff? 50 | Gömer? 51 | Chönd ir mir hälfä? 52 | Mir sind nöd riich. 53 | Froit mi. 54 | Ich ha Durscht. 55 | Chan ich Ihnä hälfä? 56 | Vo wo sind Si? 57 | En guetä mitenand! 58 | Ich chumm us Saitama. 59 | En schöönen Aabig. 60 | Es schööns Wuchenend! 61 | Alles Gueti! 62 | En schööne Taag! 63 | Mir gaats guet, dankächöön. 64 | Ich ha Hunger. 65 | Ich au. 66 | Ich wäiss es nöd. 67 | Wiä alt bisch? 68 | Si chommät us Beijing. 69 | Chunnsch us Tokio? 70 | Wiä häissisch? 71 | Chunnsch us Berlin? 72 | Ir chommät us Taipei. 73 | S Büsi isch bruun. 74 | Si chommät us Schweden. 75 | S Mäitli isch schöön. 76 | D Blueme isch blau. 77 | S Büsi isch alt. 78 | Er chunnt us Hangzhou. 79 | D Blueme isch gäl. 80 | D Rächnig, bitte. 81 | Dä Hund isch wiis. 82 | Dä Hund isch schöön. 83 | Dä Hund isch root. 84 | Dä Hund isch schwarz. 85 | D Blueme isch root. 86 | Ich red Änglisch. 87 | Ich verschtaa nöd. 88 | S Büsi isch schwarz. 89 | S tuet mir läid. 90 | Chasch das wiederholä? 91 | Dankäschöön! 92 | Bitteschöön. 93 | D Frau isch jung. 94 | D Frau isch nöd jung. 95 | Er chunnt us Gämf. 96 | Chunnsch us Beijing? 97 | S Buech isch alt. 98 | S Buech isch schwarz. 99 | S Buech isch blau. 100 | S Buech isch gäl. 101 | S Buech isch grüen. 102 | S Buech isch bruun. 103 | S Buech isch orange. 104 | S Huus isch root. 105 | S Buech isch rosa. 106 | Tom hät es Velo. 107 | Ich ha chalt. 108 | S Büsi isch härzig. 109 | S Buech isch wiis. 110 | Ich han es bitzeli Chopfweh. 111 | Ich han Chopfweh. 112 | D Frau isch schöön. 113 | Si chunnt us Düütschland. 114 | Wiä häissisch zum Vornamä? 115 | Er hät es Velo. 116 | Ich ha es Velo. 117 | Mi name isch Jack. 118 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.swg-eng.eng: -------------------------------------------------------------------------------- 1 | He's a doctor. 2 | She's a doctor. 3 | Tom is on the third floor. 4 | I go on foot. 5 | Do you still love me? 6 | What was so valuable? 7 | Is it that urgent? 8 | You can't help me. 9 | The siren is broken. 10 | Turtles don't have teeth. 11 | I can't blame Tom for hating me. 12 | She read his letter again and again. 13 | Tom could've done it by himself. 14 | Stop, people are already looking. 15 | I don't remember that guy's name. 16 | Have you ever eaten in a restaurant alone? 17 | I don't want to drive. 18 | Cows are considered sacred by Hindus. 19 | Hunger is the best cook. 20 | Why did you attack him? 21 | My cat ran away and never came back. 22 | Tom goes to work by bicycle. 23 | The book is violet. 24 | The book is orange. 25 | The book is old. 26 | What did she actually say? 27 | He is ugly. 28 | When I get bored, I just contradict my wife. 29 | A fish rots from the head down. 30 | What time does this museum close? 31 | If you want to swim, swim. 32 | We have dinner at seven. 33 | Tom wasn't very happy. 34 | It cannot be difficult; otherwise, women could not do it. 35 | Revenge is sweet. 36 | This bed is too soft for me. 37 | "I think you should go out with Tom." "No, I don't like him." 
38 | Tom would like to work less and make more money. 39 | The walls have ears. 40 | I'd never seen anything like that before. 41 | More haste, less speed. 42 | My family is my life. 43 | Mary used her smartphone as a mirror to touch-up her makeup. 44 | I still have time to do that before 2:30. 45 | We can stop whenever we want. 46 | Could you stop checking your phone every 30 seconds? 47 | Burj Khalifa is currently the tallest skyscraper in the world. 48 | I did that for us. 49 | I wasn't afraid. 50 | The dog is hungry. 51 | Aki is my dog. 52 | We live in a complicated world. 53 | He said that he loved me too. 54 | We figured it out. 55 | We've figured it out. 56 | I have an alibi. 57 | Do you like black cats? 58 | I want to go home. 59 | Not everything with two cheeks is a face. 60 | She comes from Italy. 61 | Did you see the way she was looking at you? 62 | Together they had eight children. 63 | Showering as a couple saves water and time. 64 | Tom spent a year in Germany. 65 | Today I will not cook. 66 | Everyone says it isn't possible. Someone has come here not knowing this and simply done it! 67 | Are these your own books? 68 | We saw Tom last night. 69 | I don't need to do that as often as I used to. 70 | Tom has been out of work for over a month. 71 | Tom died from lack of oxygen. 72 | Tom ran past us. 73 | The movie wasn't as interesting as I expected it to be. 74 | I feel like such an idiot. 75 | We were at a loss what to do. 76 | You should help your father. 77 | Your plan is terrible. 78 | What do you have hidden behind your back? 79 | Can't you swim at all? 80 | We are Swabians. That justifies everything. 81 | I bought a new suit of clothes. 82 | The woman is young. 83 | I love you. 84 | I can't drive the car today, I have to drink! 85 | Get down from that ladder. 86 | What do I know? 87 | It isn't good. 88 | It's not gold. 89 | It's not your money that I want. 90 | It's Monday, you know. 91 | It's my money. 92 | It's my day off. 93 | It's very hot. 94 | I like loud music. 95 | Tom is finicky. 96 | Tom has a bicycle. 97 | I have a bicycle. 98 | She is Betty. 99 | We are not jealous. 100 | The book is pink. 101 | Where are all my friends? 102 | I'm afraid it's too late now. 103 | Shut your mouth. 104 | I saw Tom swimming across the river. 105 | His name's Tom. 106 | May we swim here? 107 | I didn't swim. 108 | One simply can't be good to everyone. 109 | If only everyone were the way I should be. 110 | We're not eating. 111 | How much sugar is healthy? 112 | Tom saw some dead fish floating on the lake. 113 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.swg-eng.swg: -------------------------------------------------------------------------------- 1 | Er isch a Dogdr. 2 | Si isch a dogdore. 3 | Tom isch em dritta Stock. 4 | I gang z'Fuas. 5 | Mogsch du mi no? 6 | Was isch so viel wärt gwä? 7 | Isch dees so pressant? 8 | Du kasch mir net helfa. 9 | Dui Sirena war hee. 10 | Schildkreta hent koi Zee. 11 | I ka des Tom net vrdenka, dass er mi absolut net mog. 12 | Sie hot den Briaf emmer wiedr nochamaol gläsa. 13 | Tom hett des aloi gmacht. 14 | Her auf, d'Leit guggad scho. 15 | I woiss nemme, wie der Sempl hoisst. 16 | Hosch du amal alloi em Wirtshaus gässa? 17 | I han koi Luscht zom fahra. 18 | Kia send de Hendus hoilig. 19 | Dr Honger isch dr beschde Koch. 20 | Worom hasch du den ogriffa? 21 | Mai Katz isch entlaufa ond nemme wiederkomma. 22 | Tom goht zu sainra Arbed mit am Fahrrädle. 23 | Des Buach isch violett. 
24 | Des Buach isch oraschfarbig. 25 | Des isch a alds Buach. 26 | Was hot se aigendlich gschwätzt? 27 | Er isch wiaschd. 28 | Wenn mir mol langweilig isch, widersprech i oifach meim Weib. 29 | Dr Fisch schdenkt vom Kopf her. 30 | Om wieviel Uhr macht des Museum zua? 31 | Wenn du schwemma willsch, no schwemm doch! 32 | Mir veschbrad om siebane. 33 | Tom isch ned arg glicklich gwä. 34 | Schwär koas ned sai – sonschd kenndad's d' Weiber ned. 35 | Rache isch siaß. 36 | Des Bedd isch mir zwoich. 37 | „I moin, du sottescht a mol mid Tom ausganga.“ – Noi, den mog i ned.“ 38 | Tom dät gära weniger schaffa ond mee hoimbrenga. 39 | D’Wend hend Ohra. 40 | So ebbes han i no nia gsäa. 41 | No ned huddla. 42 | Mai Familje isch mai Lääba. 43 | Mit dem Wischkäschtle als Spiegl hat sich Maria nachgschmingt. 44 | No ka ies schaffa bis om halber drui. 45 | Mir kennat emmer aŭfheera. 46 | Kenntscht du aufhöra, elle 30 Sekonda dain Telefon rausziaga? 47 | Da Burj Khalifa isch jetzat da hegschde Wolkagratzr vo dr Weld. 48 | I han dees fir oos gmacht. 49 | I han koi Angschs ghett. 50 | Der Hond isch hongrig. 51 | Aki isch mai hond. 52 | Wir leben in einer komplizierten Welt. 53 | Er hod mir gsagd, dass er mi aŭ liaba dud. 54 | Mir hend a Lesong gfonda. 55 | Mir hend ons a Lesung ausdengd. 56 | I hau a Alibi. 57 | Kend ihr schwaarze Katza leida? 58 | I will hoim. 59 | Nes älles mid zwoi Bagga isch a Gsichd. 60 | Dui kommt aus Italien. 61 | Hendr gsäa, wia se eich aguggt hot? 62 | Zamma hend se acht Kender ghet. 63 | Duscha zu zwoid schbard Wasser ond Zeit. 64 | Tom isch oi joor en Deitschland gwä. 65 | Heit koch i nix. 66 | Älle sagad, des gohd ned. No isch oiner komma ond hat des ned gwissd ond hod's oifach gmachd! 67 | Send dees eire oigene Biaĉr? 68 | Mir hend Tom geschdern Obend gsäa. 69 | I muaß des nemme so oft macha wia frier. 70 | Tom hot scho ibr oin Monat koi Arbet mee. 71 | Tom isch an Sauorschtoffmangl gschdorba. 72 | Tom isch an ons vorbeigsaut. 73 | Der Film war et so intressant wia i denkt han. 74 | I glaub i spenn. 75 | Mir hend net gwissd, was mr do soddat. 76 | Du sottescht daim Vadder helfa. 77 | Dai Plan isch firchterlich. 78 | Was hoschd du henter daim Buggel vrsteckt? 79 | Kosch du gar et schwemma? 80 | Mir send Schwoba. Des rechtferdigt älles. 81 | I han mir an nuja Ozug kauft. 82 | Dia Frau isch jong. 83 | I mog di. 84 | I koa heit ned fahra, i muass drenka! 85 | Ronder von dr Loider! 86 | Was woiß i? 87 | Sisch net guad. 88 | Sisch net Gold. 89 | Sisch net dai Geld, was i will. 90 | Sisch Meedig, net woar? 91 | Des isch mai Geld. 92 | Sisch mai freier Dag. 93 | Sisch säär hoiß. 94 | A lauda Muusig gfellt mir. 95 | Dr Thomas isch pengalig. 96 | Dr Tom hots Fahrrädle. 97 | I hans Fahrrädle. 98 | Si isch d'Betty. 99 | Mir send net eifrsichtig. 100 | Des Buach isch rosa. 101 | Wo send älle maine Fraind? 102 | I moin, sisch jetzetle z'schpät drzua. 103 | Halt dai Gosch! 104 | I han Tom ibr da Fluss sĉwemma gsäa. 105 | Er hoißt Tom. 106 | Derfet mir hier schwemma? 107 | I ben et gschwomma. 108 | Mr koas oifach ned ällna reachd macha. 109 | Wenn no älle so wärad, wie i sai sodd. 110 | Mir ässat net. 111 | Wie viel Zukkr isch xond? 112 | Tom hot a baar dode Fisch aufm See schwemma gsäh. 113 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.tuk-eng.eng: -------------------------------------------------------------------------------- 1 | Stars twinkled in the sky. 2 | I don't remember. 3 | The horse is white. 4 | I have a book. 
5 | A quarrel between husband and wife is like a spring day's drizzle. 6 | Can you speak Turkmen? 7 | Do you have the book? 8 | I am a volunteer. 9 | I want to drink tea. 10 | He is afraid. 11 | Where is your honey that excites the bees, Pyragy? 12 | I am happy. 13 | I am boring. 14 | Is she beautiful? 15 | I will endure. 16 | I'm not old. 17 | I have two cats. 18 | I have a dog. 19 | Do you understand? 20 | Do you have a cat? 21 | Do you have a match? 22 | I was angry. 23 | I will come. 24 | He will come. 25 | I don't understand. 26 | I'm going to come back. 27 | I didn't read. 28 | So be it. 29 | I will come with you. 30 | You are to come with me. 31 | I knew you'd come. 32 | They'll come back. 33 | I'll come outside. 34 | I'll come now. 35 | I'll come home. 36 | They will come. 37 | I'm going to go. 38 | We're going to go. 39 | I'm going to go home. 40 | I'm going to go to the movies. 41 | Of course I will go. 42 | Who will you go with? 43 | Are you going to go? 44 | I'm going to go there. 45 | I'll go by car. 46 | I'm going to go with you. 47 | Where will you go? 48 | I'll go now. 49 | Yeah, I’ll go. 50 | I'll go see. 51 | Will you go with us? 52 | Will you go by train? 53 | Are you going to a movie? 54 | Are you going to go there? 55 | Are you going to go now? 56 | Are you going to go with Tom? 57 | I will try. 58 | Will you go, too? 59 | I will learn. 60 | I will eat. 61 | Will you go shopping with me? 62 | We're still doing well. 63 | It happened over a year ago. 64 | I've been learning to drive. 65 | I have a fear of the dark. 66 | The bird is in heaven. 67 | They're cooking now. 68 | The bird is in the sky. 69 | Where's Tom right now? 70 | How many girls are there in this picture? 71 | I'm always listening to music; I can't live without it. 72 | It was nice to meet you. 73 | I will never see him. 74 | I don't fear death. 75 | I'm learning Turkmen. 76 | Burj Khalifa is currently the tallest skyscraper in the world. 77 | You won't like it here. 78 | I don't know you. 79 | I have to go to bed. 80 | The horse is black. 81 | It is raining. 82 | I need a taxi! 83 | There are islands in the sea. 84 | I am not a teacher. 85 | This is not a sentence. 86 | This is not a fish. 87 | This is not a table. 88 | I read a book. 89 | I want to drive. 90 | I want to live. 91 | He didn't understand me. 92 | I want to sleep. 93 | I'm listening to music. 94 | I was listening. 95 | I'm cooking now. 96 | They were listening to the radio. 97 | Are you listening? 98 | I'm listening to you. 99 | Tom has been listening. 100 | Tom was listening. 101 | Were you listening? 102 | He was listening to music. 103 | The boy is listening to music. 104 | She is listening to him. 105 | Are you listening to him? 106 | The man is strong. 107 | He is listening to the radio. 108 | I am listening to a song. 109 | Laurie is listening to music. 110 | There is a mistake in the sentence. 111 | Are you listening to the radio? 112 | Tom is listening now. 113 | Tom is listening to music. 114 | Emily was listening to music. 115 | I don't have a clue. 116 | I cannot make noise. The baby is sleeping. 117 | Were you listening to the radio yesterday? 118 | We're speaking English. 119 | Aimee is listening to music now. 120 | We're speaking Turkmen. 121 | I don't know. 122 | I can not. 123 | I don't know where my keys are. 124 | I'm an engineer. 125 | I can't say. 126 | I'm listening to the radio. 127 | I'm not pretty. 128 | I can't remember. 129 | He turned on the light. 130 | I don't remember! 131 | We live on the earth. 
132 | I'm not a doctor. 133 | Turtles don't have teeth. 134 | I would like to ask for a translator. 135 | I don't want meat. 136 | I think everybody should learn another language. 137 | Will you come with me to the concert? 138 | Where do you come from? 139 | I am a teacher. 140 | What's your name? 141 | Do you love your mother? 142 | Greece is an old country. 143 | I don't want to go to school. 144 | She's cooking now. 145 | I won't come. 146 | It's very interesting. 147 | I'm afraid of them. 148 | I am sorry. 149 | What do you do? 150 | The girl looked at him too. 151 | I am a graduate student. 152 | Maral's mother is forty-three. 153 | I want to cry. 154 | I bought a book. 155 | This is my choice. 156 | This is my bicycle. 157 | This is my notebook. 158 | This is my dog. 159 | Do you have paper? 160 | This is my daughter. 161 | This is not my home. 162 | Literature is the future of a nation. 163 | How are you? 164 | She's my wife. 165 | I am a housewife. 166 | Where were you? 167 | He is our driver. 168 | This is my mother. 169 | A monarch had six sons. 170 | I haven't got books. 171 | No pain, no gain. 172 | Where are you? 173 | I have the right to criticise. 174 | If you push a button, either you will die or you will live. 175 | I am not married. 176 | I'm not married. 177 | No, I am not married. 178 | That isn't good. 179 | He was listening to music in his room. 180 | Yes, of course. 181 | Happy International Women's Day! 182 | He is not a child. 183 | I don't want to eat. 184 | Tom and Mary are listening. 185 | The horse is not white. 186 | Where are the books? 187 | I have a stomachache. 188 | Better late than never. 189 | He is reading. 190 | I am also unemployed. 191 | Children are the flowers of our lives. 192 | This is my cousin. 193 | I am a student. 194 | Tom is very young. 195 | I have a dream. 196 | I have a headache. 197 | I have a chill. 198 | I have a cough. 199 | However, I won't know whether he came or not. 200 | I have a family. 201 | I have a car. 202 | He has twenty children. 203 | I can't smoke. 204 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.tuk-eng.tuk: -------------------------------------------------------------------------------- 1 | Asmanda ýyldyzlar petreşip görünýärdi. 2 | Aňmaýaryn. 3 | At ak. 4 | Meniň kitabym bar. 5 | Är-aýalyň uruşy – ýaz gününiň ýagyşy. 6 | Türkmençe gepläp bilýäňmi? 7 | Kitap sende barmy? 8 | Men meýletinçi. 9 | Men çaý içmek isleýän. 10 | Ol gorkuly. 11 | Ary örüzen balyň kaýda, Pyragy? 12 | Men şat. 13 | Men gyzyksyz. 14 | Ol owadanmy? 15 | Men çydajak. 16 | Men garry däl. 17 | Iki pişigim bar. 18 | Meniň bir itim bar. 19 | Düşündiňmi? 20 | Seniň pişigiň barmy? 21 | Seniň otluçöpüň barmy? 22 | Men gazaplydym. 23 | Men geljek. 24 | Ol geljek. 25 | Men düşünemok. 26 | Men gaýdyp geljek. 27 | Okamadym. 28 | Bolýar. 29 | Men siziň bilen geljek. 30 | Sen meniň bilen geljek. 31 | Sen geljegiňi bilýärdim. 32 | Olar gaýdyp geljek. 33 | Men daşaryk çykjak. 34 | Men häzir geljek. 35 | Men öýe geljek. 36 | Olar geljek. 37 | Men gitjek. 38 | Biz gitjek. 39 | Men öýe gitjek. 40 | Men kinoteatra gitjek. 41 | Elbetde men gitjek. 42 | Sen kim bilen gitjek? 43 | Sen gitjekmi? 44 | Men ol ýere gitjek. 45 | Men awtomobilde gitjek. 46 | Men seniň bilen gitjek. 47 | Sen nirä gitjek? 48 | Men häzir gitjek. 49 | Hawa, men gitjek. 50 | Men görmäge gitjek. 51 | Sen biziň bilen gitjekmi? 52 | Sen otly bilen gitjekmi? 53 | Sen kinoteatra gitjekmi? 54 | Sen ol ýere gitjekmi? 
55 | Sen häzir gitjekmi? 56 | Sen Tom bilen gitjekmi? 57 | Men synanyşjak. 58 | Sen hem gitjekmi? 59 | Men öwrenjek. 60 | Men iýjek. 61 | Sen meniň bilen söwda gitjekmi? 62 | Biz häli ýagşy. 63 | Ol bir ýyldan gowrak wagt boldy. 64 | Men sürmegi öwrenip ýörün. 65 | Garaňkydan gorkýaryn. 66 | Guş jennetde. 67 | Olar häzir bişirýärler. 68 | Guş gökde. 69 | Tom häzir nirede? 70 | Bu suratda näçe gyz bar? 71 | Men hemişe saz diňleýärin; onsuz ýaşap bilmem. 72 | Tanşanymyza örän şat boldum. 73 | Men ony hiç haçan görjek däl. 74 | Men ölümden gorkamok. 75 | Men Türkmençe öwrenýärin. 76 | Häzirki wagtda Burj Khalifa dünýäniň iň uzyn binasydyr. 77 | Sen ony bu ýerde islejek däl. 78 | Men seni tanamok. 79 | Maňa uklamak gerek. 80 | At gara. 81 | Ýagyş ýagýar. 82 | Maňa taksi gerek! 83 | Deňizde adalar bar. 84 | Men mugallym däl. 85 | Bu bir sözlem däl. 86 | Bu bir balyk däl. 87 | Bu bir stol däl. 88 | Men bir kitap okadym. 89 | Men sürmek isleýärin. 90 | Men ýaşamak isleýärin. 91 | Ol maňa düşünmedi. 92 | Men uklamakçy. 93 | Men saz diňleýärin. 94 | Men diňleýärdim. 95 | Men häzir bişirýärin. 96 | Olar radio diňleýärdiler. 97 | Sen diňleýärsiňmi? 98 | Men seni diňleýärin. 99 | Tom diňleýär. 100 | Tom diňleýärdi. 101 | Sen diňleýärdiňmi? 102 | Ol saz diňleýärdi. 103 | Çaga saz diňleýär. 104 | Ol ony diňleýär. 105 | Sen ony diňleýärsiňmi? 106 | Är kişi güýçli. 107 | Ol radio diňleýär. 108 | Men bir aýdym diňleýärin. 109 | Laurie saz diňleýär. 110 | Sözlemde bir ýalňyşlyk bar. 111 | Sen radio diňleýärsiňmi? 112 | Tom häzir diňleýär. 113 | Tom saz diňleýär. 114 | Emily saz diňleýärdi. 115 | Meniň habarym ýok. 116 | Men galmagal edip bilemok. Çaga ýatyr. 117 | Sen düýn radio diňleýärdiňmi? 118 | Biz Iňlisçe gepleýäris. 119 | Aimee häzir saz diňleýär. 120 | Biz Türkmençe gepleýäris. 121 | Men bilmeýärin. 122 | Men edip bilemok. 123 | Men açarlarymyň nirededigini bilemok. 124 | Men inžener. 125 | Men diýip bilemok. 126 | Men radio diňleýärin. 127 | Men owadan däldirin. 128 | Men aňyp bilemok. 129 | Ol yşygy ýakdy. 130 | Men aňmaýaryn. 131 | Biz dünýäde ýaşaýarys. 132 | Men doktor däl. 133 | Pyşbagalaryň dişleri ýok. 134 | Maňa terjimeçi gerek. 135 | Et islemeýärin. 136 | Mençe her bir adam başga dil öwrenmeli. 137 | Sen meniň bilen konserte geljekmi? 138 | Sen nireli? 139 | Men mugallym. 140 | Seň adyň näme? 141 | Sen özyn eneyn söýarsiň? 142 | Gresiýa köne ülke. 143 | Mekdebe gitmek islemeýärin. 144 | Ol häzir bişirýär. 145 | Men geljek däl. 146 | Örän gyzykly. 147 | Men olardan gorkýaryn. 148 | Bagyşlaň. 149 | Käriň näme? 150 | Gyzjagaz hem oňa bakdy. 151 | Men aspirantura talyby. 152 | Maralyň ejesi kyrk üç ýaşynda. 153 | Men eňremek isleýärin. 154 | Kitap satyn aldym. 155 | Bu meniň saýlamam. 156 | Bu meniň welosipedim. 157 | Bu meniň depderim. 158 | Bu meniň itim. 159 | Seniň kagyzyň barmy? 160 | Bu meniň gyzym. 161 | Bu meniň öýim däl. 162 | Edebiýat bir milletiň geljegi. 163 | Ýagdaýlaryň nähili? 164 | Ol meniň aýalym. 165 | Men öý hojalykçy aýal. 166 | Sen niredediň? 167 | Ol biziň sürüjimiz. 168 | Ol meň ejem. 169 | Bir patşanyň alty ogly bar eken. 170 | Meniň kitaplarym ýok. 171 | Zähmet soňy rähnet. 172 | Sen nirede? 173 | Meniň tankyt etmäge hakym bar. 174 | Eger bir düwmä bassaň, ýa sen sagaljak, ýa sen öljek. 175 | Men öýlenemok. 176 | Öýlenemok. 177 | Ýok. Öýlenemok. 178 | Ol ýagşy däl. 179 | Ol otagynda saz diňleýärdi. 180 | Hawa, elbetde. 181 | Halkara ayal-gyzlar bayraminiz gutly bolsun! 182 | Ol çaga däl. 183 | Iýmek islemeýärin. 184 | Tom we Mary diňleýärler. 185 | At ak däl. 
186 | Kitaplar nirede? 187 | Meniň garnym agyrýar. 188 | Hiçden giç ýagşy. 189 | Ol okap ýatyr. 190 | Men hem işsiz. 191 | Çagalar durmuşymyzyň gülleridir. 192 | Bu meniň çykanym. 193 | Men talyby. 194 | Tom örän ýaş. 195 | Meniň bir arzuwym bar. 196 | Meniň kelläm agyrýar. 197 | Men üşeýärin. 198 | Men üsgürýärin. 199 | Onuň gelenini-gelmedigini weli men bijek däl. 200 | Meniň bir maşgalam bar. 201 | Meniň bir awtomobilim bar. 202 | Onuň ýigrimi çagasy bar. 203 | Men tüsseleýip bilemok. 204 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.tzl-eng.eng: -------------------------------------------------------------------------------- 1 | I like sleeping. 2 | I'm a teacher. 3 | I talk in my sleep. 4 | It was love at first sight. 5 | Do you believe in love at first sight? 6 | I like to sing. 7 | I love to dance. 8 | I love golf. 9 | This way, please. 10 | Is there a God? 11 | Where are you going? 12 | Do write to me soon! 13 | It's one o'clock. 14 | What time is it? 15 | I am lost. 16 | How about you? 17 | How are you? 18 | I have to buy some new shoes. 19 | I am 12 years old. 20 | I feel terrible. 21 | I don't know. 22 | What's wrong with me? 23 | You are welcome! 24 | What's the dealio? 25 | What's the matter? 26 | I love you. 27 | I'm as tall as Tom. 28 | I hope so! 29 | What would happen? 30 | Do you speak Talossan? 31 | I am a teacher. 32 | I teach mathematics and physics. 33 | He may not be happy. 34 | He is available now. 35 | How could you? 36 | Where are you going to? 37 | I need assistance. 38 | It is a beautiful language. 39 | Thank you very much! 40 | Can you help me? 41 | I am going to the mall. 42 | I want that. 43 | There he is. 44 | There he is! 45 | Tom wondered why Mary wouldn't French kiss him. 46 | Are you waiting for the bus? 47 | I have an appointment. 48 | I have an appointment with Tom. 49 | What do I owe you? 50 | I am going to school. 51 | I am going to work. 52 | What are you doing here so late? 53 | I'm trying to find a new job. 54 | Who is that young woman? 55 | Come as soon as you can. 56 | He is also a new student. 57 | Get it done as soon as possible. 58 | I came as soon as I could. 59 | He is from Egypt. 60 | I will be back soon. 61 | She is from Japan. 62 | He will come down soon. 63 | It's the first door on the right. 64 | Let us go. 65 | What a surprise! 66 | He will come back soon. 67 | We will begin as soon as possible. 68 | One language is never enough. 69 | My hovercraft is full of eels. 70 | Call the police! 71 | Do you want to see Tom? 72 | I loved the old man. 73 | We did it! 74 | Break a leg! 75 | Who is there? 76 | I cut off the head and the arms and the legs. 77 | Nice to meet you! 78 | Have a nice day! 79 | The officers were satisfied. 80 | My manner had convinced them. 81 | I am 19 years old. 82 | I am 18 years old. 83 | I am 24 years old. 84 | I am twelve years old. 85 | I am 20 years old. 86 | I am 30 years old. 87 | None of your business. 88 | I am hungry. 89 | You don't say. 90 | You don't say! 91 | I'm 25 years old. 92 | I don't understand. 93 | I have my doubts. 94 | Happy birthday to you! 95 | I love you! 96 | I hate you. 97 | I hate working. 98 | I hate you! 99 | I hate studying. 100 | I'm 45 years old. 101 | I like it. 102 | I like him very much. 103 | I like traveling. 104 | I love to travel. 
105 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.tzl-eng.tzl: -------------------------------------------------------------------------------- 1 | Me piaça dormarh. 2 | Éu sint ün profeßeir. 3 | Praiçéu. 4 | C'esteva fieschada. 5 | ¿Credás in la fieschada? 6 | Me piaça cantarh. 7 | Améu dançarh. 8 | Améu el golf. 9 | Vetz à'ici, sch'o Voi piaça. 10 | ¿Ja'iens Díeu? 11 | ¿Aduve vas't? 12 | ¡Scriitz-me frü! 13 | C'e viensa þora. 14 | ¿Qet sint las quantas? 15 | Téu amistat. 16 | ¿Es tu? 17 | ¿Com'estás't? 18 | Téu à comprarh dels cauçadours ujs. 19 | Éu téu 12 ars. 20 | Eu sentiéu terival. 21 | Éu non säp. 22 | ¿Qet me paßa? 23 | Non per acest. 24 | ¿Qet paßeva? 25 | ¿Qet paßa? 26 | T'améu. 27 | Éu sint sa inalt qe Tom. 28 | ¡Oc'halà! 29 | ¿Qet paßadra? 30 | ¿Parletz-voi Talossan? 31 | Éu sint ‘n ensegnhistà. 32 | Ensegnhéu dels maþematici es dels füçici. 33 | Salacor o non isch feliceu. 34 | O isch avalaval nun. 35 | ¿Come c'e pouçival? 36 | ¿Aduve vetz-voi? 37 | Neceßéu dal aßistançéu. 38 | C'e'n glheþ bel. 39 | Muitas graschcias. 40 | ¿Put-tu m'atxutarh? 41 | Véu àl friul. 42 | Eu volt acest. 43 | O isch là. 44 | ¡O isch là! 45 | Tom undereva perqet Mary non lo viac'hadra. 46 | ¿Sustinetz-voi el bus? 47 | Téu 'n apüntamaintsch. 48 | Téu 'n apüntamaintsch cün Tom. 49 | ¿Quançeu t'eigéu? 50 | Véu àl scuola. 51 | Véu àl traval. 52 | ¿Qet façás't aicì sa schpeit? 53 | Éu atent à trovarh 'n noveu posteu. 54 | ¿Qi'st aceasta xhuvencula? 55 | Va aicì prontu. 56 | O isch ocsà 'n studint noveu. 57 | Fäts-en prontu. 58 | Veneveu prontu. 59 | O isch dal Misiria. 60 | Revenarhéu frü. 61 | A isch dal Cipangu. 62 | Descendarha frü. 63 | C'e la prüma poarta àl drept. 64 | Qe noi venadrent. 65 | ¡Cacsa surpriça! 66 | Revenhara frü. 67 | Començarhent prontu. 68 | Viens glheþ isch txamais aßei. 69 | Va voltigeir isch pien d'anguiglhas. 70 | ¡Clametz àl militzia! 71 | ¿Volt-tu vidarh Tom? 72 | Ameveu el senesch. 73 | Riuschlevent! 74 | ¡Bun-escasença! 75 | ¿Qi’st là? 76 | Escapçeveu el cäps es els brätslilor es las gambas. 77 | ¡Encantat! 78 | ¡Díeu t'alegra! 79 | Els flücs füvent satisfiats. 80 | Va façiun lor tignhova comvimçada. 81 | Téu 19 ars. 82 | Téu 18 ars. 83 | Téu 24 ars. 84 | Téu 12 ars. 85 | Téu 20 ars. 86 | Téu 30 ars. 87 | C'e v'afar. 88 | Faméu. 89 | Tent zirat. 90 | ¡Tent zirat! 91 | Téu 25 ars. 92 | Non cumprenchéu. 93 | Téu vaes duvitaziuns. 94 | ¡Felicia nadaliça! 95 | ¡T'améu! 96 | Te haßéu. 97 | Haßéu travalarh. 98 | ¡Te haßéu! 99 | Haßéu estudiarh. 100 | Téu 45 ars. 101 | Me piaça. 102 | Me piaça mült. 103 | Me piaça voiatxarh. 104 | Améu voiatxarh. 105 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.xho-eng.eng: -------------------------------------------------------------------------------- 1 | I'm your father. 2 | The store closes at eleven. 3 | They have wine. 4 | I don't want to play. 5 | I don't want to. 6 | There's no reason to be afraid. 7 | I don't want to stay. 8 | "What time is it?" "It is ten-thirty." 9 | I don't want to go. 10 | Tom doesn't drink coffee. 11 | "What time is it now?" "It's ten o'clock." 12 | I don't want to talk. 13 | He loves him. 14 | He's kicking me! 15 | Let me go with you. 16 | I don't want to work. 17 | I won't participate in speculation. 18 | Open the door. 19 | You speak Xhosa well. 20 | He spent a little time on his lessons. 21 | He kicked the ball. 22 | I'm taking a book. 23 | No, I don't want to. 24 | How do you feel today? 
25 | It wasn't my fault. 26 | I don't want to look. 27 | Tom lives in Boston. 28 | I don't want to cook. 29 | Hello. May I speak to Mr Johnson, please? 30 | My pen is new. 31 | Hello. This is Joe Carlton. May I speak to Michael? 32 | May I speak to Mike, please? 33 | May I speak to Bill? 34 | I don't want to sleep. 35 | May I speak with you? 36 | I still love you. 37 | The mouse sure knows where the cheese is located. 38 | Please close the window. 39 | We will bury you. 40 | Who will pay? 41 | She is brave for a girl. 42 | Hey, I'll be right back. 43 | It snowed a lot last year. 44 | I don't want to cry. 45 | I don't want to go to school. 46 | He speaks a little English. 47 | I don't want to lie. 48 | I love you! 49 | I'm 12 years old and what is this? 50 | I've attempted suicide twice. 51 | This is not a sentence. 52 | I don't need anything. 53 | He is running. 54 | Where is it now? 55 | How big is your house? 56 | My house is small. 57 | How big is your dog? Mine is small. 58 | How many bedrooms does your house have? 59 | My house has two bedrooms. 60 | I love you. 61 | Where is Jim? 62 | Where are you? 63 | I really like working with people. 64 | We may refuse to accept the proposal. 65 | He is a hero. 66 | You must try to understand me. 67 | Hi, my name is Pekka. What is your name? 68 | What is your name? 69 | Hey, what are you guys talking about? 70 | "May I speak to Mr Smith?" "Will you hold the line?" 71 | Can I help you? 72 | The kids are asleep. 73 | I must go now. 74 | He dressed up as a woman. 75 | Will you be going to the party tonight? If not, let's go watch a movie. 76 | He is looking for a job. 77 | I wanted to watch a horror movie, but my girlfriend is scared and rented comedy instead. 78 | Thank you very much for your letter. 79 | We suffered a devastating loss. 80 | Where do you come from? 81 | The rain has stopped. 82 | When was this church built? 83 | If you want pudding, you must eat your meat. 84 | He was sentenced to three years in jail. 85 | I'm afraid to go alone. 86 | I was expecting this. 87 | I know you're going to say no. 88 | You killed my father. 89 | I'm pleased to meet you. 90 | I'm looking for work. 91 | I work in a hospital. 92 | He found his parents. 93 | I am looking for my brother. 94 | Jobs are scarce. 95 | She is looking for her car keys. 96 | She has wine. 97 | Do you remember her name? 98 | Hey, put that back. 99 | How do you like your coffee? 100 | We have wine. 101 | Excuse me, what time is it? 102 | You must leave. 103 | Without my well being, I can't have a good job. 104 | She sometimes helps her mother. 105 | Tom is looking for his glasses. 106 | Tom doesn't like his work. 107 | I understand now. 108 | Open your mouth! 109 | Turkey produces a lot of minerals. 110 | Does your friend like tea? 111 | Tom works at home. 112 | Why does Tom work at home? 113 | Where are you from? 114 | I'm going home. 115 | How do you travel to work? 116 | You are my friend. 117 | Tom was very scared. 118 | You can speak English. 119 | Tom is looking for Mary. 120 | Excuse me! May I open the window? 121 | It's a pretty house. 122 | I brought a book. 123 | A person is a person through other people. 124 | Turn off the light. 125 | Water is important. 126 | I want a blue cake. 127 | I'd like to see you, please. 128 | I ran into him yesterday at the airport. 129 | I am a vegetarian. 130 | I work at my friend's shop. 131 | More coffee, please. 132 | My wife is a vegetarian. 133 | Why did you leave your job on the farm? 134 | You have a big nose. 135 | What time is it? 
136 | Have you ever been to Paris? 137 | Sometimes you just want to spend the whole day doing nothing. 138 | It is raining. 139 | Where can I get a vuvuzela? 140 | But I don't want to. 141 | Where does your grandfather live? 142 | A cat came out from under the desk. 143 | -------------------------------------------------------------------------------- /data/tatoeba/v1/tatoeba.xho-eng.xho: -------------------------------------------------------------------------------- 1 | Ndingutata wakho. 2 | Ivenkile ivala ngo11. 3 | Banewayini. 4 | Andifuni ukudlala. 5 | Andifuni. 6 | Asikho isizatho bawoyike. 7 | Andifuni ukuhlala. 8 | "Ngubani ixesha?" "Ngu 10.30." 9 | Andifuni ukuhamba. 10 | uTom akaphungi kofu. 11 | "Ngubani ixesha ngoku?" "Ngu 10." 12 | Andifuni ukuthetha. 13 | Uyamthanda. 14 | Uyandikhaba! 15 | Mandiye nawe. 16 | Andifuni ukusebenza. 17 | Andizukuthatha nxaxheba kwintekelelo. 18 | Vula ucango. 19 | Usithetha kakuhle isiXhosa. 20 | Uchithe ixesha ekincinci kwizifundo zakhe. 21 | Ukhabe ibhola. 22 | Ndithatha incwadi. 23 | Hayi, andifuni. 24 | Uziva njani namhlanje? 25 | Ayonxaki yam. 26 | Andifuni ukujonga. 27 | uTom uhlala eBoston. 28 | Andifuni ukupheka. 29 | Molo. Ndingathetha noMnumzana Johnson, nceda? 30 | Lutsha usiba lwam. 31 | Molo. NguJoe Carlton lo. Ndingathetha noMichael? 32 | Ndingathetha noMike, nceda. 33 | Ndingathetha noBill? 34 | Andifuni ukulela. 35 | Ndingathetha nawe? 36 | Ndisakuthanda. 37 | Impuku ngenene iyasazi apho sikhoyo isonka samanzi. 38 | Nceda uvale ifestile. 39 | Siya kukufihla. 40 | Ngubani oza kuhlawula? 41 | Uyintombi, kodwa unesibindi. 42 | Hey, ndizakubuya. 43 | Kukhithike kakhulu kulo nyaka uphelileyo. 44 | Andifuni ukulila. 45 | Andifuni ukuya esikolweni. 46 | Usithetha isiNgesi kancinci. 47 | Andifuni ukuxoka. 48 | Ndiyakuthanda. 49 | Ndineminyaka engu-12, yintoni lento? 50 | Ndizame ukuzibulala kabini. 51 | Ayingomqholo lo. 52 | Andazi nto. 53 | Uyabaleka. 54 | Uphi ngoku? 55 | Ingakanani indlu yakho? 56 | Indlu yam incinci. 57 | Ingakanani inja yakho? Eyam incinci. 58 | Indlu yakho inamagumbi okulala amangaphi? 59 | Indlu yam inamagumbi okulala amabini. 60 | Ndiyakuthanda! 61 | Uphi uJim? 62 | Uphi? 63 | Ndiyakuthanda kakhulu ukusebenza nabantu. 64 | Sizakwala ukwamkela isindululo. 65 | Uligorha. 66 | Funeke uzame ukundiqonda. 67 | Molo, igama lam nguPekka. Ngubani igama lakho? 68 | Ngubani igama lakho? 69 | Hey, yintoni enithetha ngayo? 70 | "Ndingathetha noMnumzana Smith?" "Khawubambe njalo." 71 | Ndingakunceda? 72 | Abantwana balele. 73 | Kufuneka ndiye ngoku. 74 | Unxibe njengomfazi. 75 | Uzakuya kwitheko ngokuhlwa nje? Ukuba awuyi, masambe siyokubukela umboniso bhanya bhanya. 76 | Ukhangela umsebenzi. 77 | Bendifuna ukubukela umboniso bhanyabhanya owoyikisayo, kodwa intombi yam iyoyika,saqesha ohlekisayo. 78 | Enkosi kakhulu ngeleta yakho. 79 | Sohlelwe yintlungu enzulu. 80 | Nivela phi? 81 | Imvula iyekile. 82 | Icawa yakhiwa nini? 83 | Ukuba ufuna ipudini,kufuneka utye inyama yakho. 84 | Ugwetyiwe iminyaka emithathu entolongweni. 85 | Ndiyoyika ukuhamba ndedwa. 86 | Bendiyilindele le nto. 87 | Ndiyayaz uzothi hayi. 88 | Wambulala utata wam. 89 | Ndiyavuya ukukwazi. 90 | Ndifuna umsebenzi. 91 | Ndisebenza esibhedlele. 92 | Ufumene abazali bakho. 93 | Ndikhangela ubhuti wam. 94 | Imisebenzi inqabile. 95 | Uzikhangela izitshixo zemoto yakhe. 96 | Unewayini. 97 | Uyalikhumbula igama lakhe? 98 | Hey,beka lo nto apho ubuyithatha khona. 99 | Uyithanda njani ikofu yakho? 100 | Sinewayini. 101 | Uxolo, ngubani ixesha? 102 | Kufuneka uhambe. 
103 | Ngaphandle kokuba ngumqabaqaba,andikwazi ukuba nomsebenzi olungileyo. 104 | Ngamanye amaxesha ubceda umama wakhe. 105 | uTom uzikhangela iindondo zakhe. 106 | uTom akawuthandi umsebenzi wakhe. 107 | Ndiyaqonda ngoku. 108 | Vula umlomo wakho! 109 | Ilizwe laseTurkey livelisa izambiwa phantsi komhlaba ezininzi. 110 | Umhlobo wakho uyithanda iti? 111 | uTom usebenza ekhaya. 112 | Kutheni uTom esebenza ekhaya? 113 | Uvela phi? 114 | Ndiyagoduka. 115 | Uhamba ngantoni emsebenzini? 116 | Ungumhlobo wam. 117 | UTom ebesoyika kakhulu. 118 | Ungathetha isiNgesi. 119 | uTom ukhangela uMary. 120 | Uxolo! Ndingavula ifestile na? 121 | Yindlu entle. 122 | Ndizise incwadi. 123 | Umntu ngumntu ngabantu. 124 | Cima isibane. 125 | Amanzi abalulekile. 126 | Ndifuna ikeyiki eluhlaza. 127 | Nceda, ndifuna ukukubona. 128 | Ndidibene naye izolo kwisitishi senqwelo moya. 129 | Nditya imifuno kuphela. 130 | Ndisebenza evenkileni yomhlobo wam. 131 | Ndicela enye ikofu. 132 | Umfazi wam utya imifuno kuphela. 133 | Kutheni uwuyekile umsebenzi wakho efama? 134 | Unempumlo enkulu. 135 | Ngubani ixesha? 136 | Wakhe waya eParis? 137 | Ngelinye ixesha ufuna ukuchitha imini yonke ungenzi nto. 138 | Kuyanetha. 139 | Ndingayifumana phi ivuvuzela? 140 | Kodwa andifuni. 141 | Utatomkhulu wakho uhlala phi? 142 | Ikati iphume phantsi kwedesika. 143 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | 3 | MAINTAINER Gilles Bodart 4 | 5 | # Install build-essential (compiler and development tools) 6 | RUN apt-get update && \ 7 | apt-get install -y build-essential && \ 8 | rm -rf /var/lib/apt/lists/* 9 | 10 | RUN conda create -n env python=3.8 11 | RUN echo "source activate env" > ~/.bashrc 12 | ENV PATH /opt/conda/envs/env/bin:$PATH 13 | 14 | # Set the working directory to /app 15 | WORKDIR /app 16 | 17 | # Copy the local laser-encoders repository 18 | COPY laser_encoders /app/laser_encoders 19 | COPY pyproject.toml /app/pyproject.toml 20 | 21 | RUN pip install --upgrade pip 22 | RUN pip install -e . 23 | RUN pip install Flask==2.3.3 Requests==2.31.0 24 | 25 | # Define the argument for language 26 | ARG langs="eng_Latn" 27 | 28 | # Download language models for each specified language 29 | RUN for lang in $langs; do \ 30 | python -m laser_encoders.download_models --lang=$lang; \ 31 | done 32 | 33 | # Open the port 80 34 | EXPOSE 80 35 | 36 | COPY docker/app.py /app/app.py 37 | 38 | CMD ["/bin/bash"] 39 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | ## LASER Docker Image 2 | 3 | This image provides a convenient way to run LASER in a Docker container. 4 | 5 | ### Building the image 6 | To build the image, run the following command from the root of the LASER directory: 7 | 8 | ``` 9 | docker build --tag laser -f docker/Dockerfile . 10 | ``` 11 | ### Specifying Languages with `langs` Argument 12 | 13 | You can pre-download the encoders and tokenizers for specific languages by using the `langs` build argument. This argument accepts a space-separated list of language codes. For example, to build an image with models for English and French, use the following command: 14 | ``` 15 | docker build --build-arg langs="eng_Latn fra_Latn" -t laser -f docker/Dockerfile . 
16 | ``` 17 | If the `langs` argument is not specified during the build process, the image will default to building with English (`eng_Latn`). It's important to note that in this default case where English is selected, the LASER2 model, which supports 92 languages, is used. For a comprehensive list of LASER2 supported languages, refer to `LASER2_LANGUAGES_LIST` in [`language_list.py`](https://github.com/facebookresearch/LASER/blob/main/laser_encoders/language_list.py). 18 | 19 | 20 | ### Running the Image 21 | Once the image is built, you can run it with the following command: 22 | 23 | ``` 24 | docker run -it laser 25 | ``` 26 | **Note:** If you want to expose a local port to the REST server on top of the embed task, you can do so by executing the following command instead of the last command: 27 | 28 | ``` 29 | docker run -it -p [CHANGEME_LOCAL_PORT]:80 laser python app.py 30 | ``` 31 | This will override the command line entrypoint of the Docker container. 32 | 33 | Example: 34 | 35 | ``` 36 | docker run -it -p 8081:80 laser python app.py 37 | ``` 38 | 39 | This Flask server will serve a REST API that can be used by calling your server with this URL: 40 | 41 | ``` 42 | http://127.0.0.1:[CHANGEME_LOCAL_PORT]/vectorize?q=[YOUR_SENTENCE_URL_ENCODED]&lang=[LANGUAGE] 43 | ``` 44 | 45 | Example: 46 | 47 | ``` 48 | http://127.0.0.1:8081/vectorize?q=ki%20lo%20'orukọ%20ẹ&lang=yor 49 | ``` 50 | 51 | Sample response: 52 | ``` 53 | { 54 | "content": "ki lo 'orukọ ẹ", 55 | "embedding": [ 56 | [ 57 | -0.10241681337356567, 58 | 0.11120740324258804, 59 | -0.26641348004341125, 60 | -0.055699944496154785, 61 | .... 62 | .... 63 | .... 64 | -0.034048307687044144, 65 | 0.11005636304616928, 66 | -0.3238321840763092, 67 | -0.060631975531578064, 68 | -0.19269055128097534, 69 | ] 70 | } 71 | ``` 72 | 73 | Here is an example of how you can send requests to it with Python: 74 | 75 | ```python 76 | import requests 77 | import numpy as np 78 | url = "http://127.0.0.1:[CHANGEME_LOCAL_PORT]/vectorize" 79 | params = {"q": "Hey, how are you?\nI'm OK and you?", "lang": "en"} 80 | resp = requests.get(url=url, params=params).json() 81 | print(resp["embedding"]) 82 | ``` -------------------------------------------------------------------------------- /docker/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import socket 5 | 6 | from flask import Flask, jsonify, request 7 | 8 | from laser_encoders import LaserEncoderPipeline 9 | from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE 10 | 11 | app = Flask(__name__) 12 | 13 | # Global cache for encoders 14 | encoder_cache = {} 15 | 16 | laser2_encoder = None 17 | 18 | 19 | @app.route("/") 20 | def root(): 21 | print("/") 22 | html = "
<h3>Hello {name}!</h3>" "<b>Hostname:</b> {hostname}<br/>
" 23 | return html.format(name=os.getenv("LASER", "world"), hostname=socket.gethostname()) 24 | 25 | 26 | @app.route("/vectorize", methods=["GET"]) 27 | def vectorize(): 28 | content = request.args.get("q") 29 | lang = request.args.get( 30 | "lang", "eng" 31 | ) # Default to English if 'lang' is not provided 32 | 33 | if content is None: 34 | return jsonify({"error": "Missing input content"}), 400 35 | 36 | try: 37 | global laser2_encoder 38 | if lang in LASER2_LANGUAGE: # Checks for both 3-letter code or 8-letter code 39 | if not laser2_encoder: 40 | laser2_encoder = LaserEncoderPipeline(lang=lang) 41 | encoder = laser2_encoder 42 | else: 43 | lang_code = LASER3_LANGUAGE.get( 44 | lang, lang 45 | ) # Use language code as key to prevent multiple entries for same language 46 | if lang_code not in encoder_cache: 47 | encoder_cache[lang_code] = LaserEncoderPipeline(lang=lang_code) 48 | encoder = encoder_cache[lang_code] 49 | 50 | embeddings = encoder.encode_sentences([content]) 51 | embeddings_list = embeddings.tolist() 52 | body = {"content": content, "embedding": embeddings_list} 53 | return jsonify(body), 200 54 | 55 | except ValueError as e: 56 | # Check if the exception is due to an unsupported language 57 | if "unsupported language" in str(e).lower(): 58 | return jsonify({"error": f"Language '{lang}' is not supported."}), 400 59 | else: 60 | return jsonify({"error": str(e)}), 400 61 | 62 | 63 | if __name__ == "__main__": 64 | app.run(debug=True, port=80, host="0.0.0.0") 65 | -------------------------------------------------------------------------------- /docker/decode.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | 4 | dim = 1024 5 | X = np.fromfile(sys.argv[1], dtype=np.float32, count=-1) 6 | X.resize(X.shape[0] // dim, dim) 7 | print(X) 8 | -------------------------------------------------------------------------------- /install_external_tools.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | #------------------------------------------------------- 14 | # 15 | # This bash script installs third party software 16 | # 17 | 18 | if [ -z ${LASER} ] ; then 19 | echo "Please set the environment variable 'LASER'" 20 | exit 21 | fi 22 | 23 | ################################################################### 24 | # 25 | # Generic helper functions 26 | # 27 | ################################################################### 28 | 29 | MKDIR () { 30 | dname=$1 31 | if [ ! -d ${dname} ] ; then 32 | echo " - creating directory ${dname}" 33 | mkdir -p ${dname} 34 | fi 35 | } 36 | 37 | 38 | bdir="${LASER}" 39 | tools_ext="${bdir}/tools-external" 40 | MKDIR $tools_ext 41 | 42 | ################################################################### 43 | # 44 | # Tokenization tools from Moses 45 | # It is important to use the official release V4 and not the current one 46 | # to obtain the same results than the published ones. 
47 | # (the behavior of the tokenizer for end-of-sentence abbreviations has changed) 48 | # 49 | ################################################################### 50 | 51 | InstallMosesTools () { 52 | moses_git="https://raw.githubusercontent.com/moses-smt/mosesdecoder/RELEASE-4.0/scripts" 53 | moses_files=("tokenizer/tokenizer.perl" "tokenizer/detokenizer.perl" \ 54 | "tokenizer/normalize-punctuation.perl" \ 55 | "tokenizer/remove-non-printing-char.perl" \ 56 | "tokenizer/deescape-special-chars.perl" \ 57 | "tokenizer/lowercase.perl" \ 58 | "tokenizer/basic-protected-patterns" \ 59 | ) 60 | 61 | wdir="${tools_ext}/moses-tokenizer/tokenizer" 62 | MKDIR ${wdir} 63 | cd ${wdir} 64 | 65 | for f in ${moses_files[@]} ; do 66 | if [ ! -f `basename ${f}` ] ; then 67 | echo " - download ${f}" 68 | wget -q ${moses_git}/${f} 69 | fi 70 | done 71 | chmod 755 *perl 72 | 73 | # download non-breaking prefixes per language 74 | moses_non_breakings="share/nonbreaking_prefixes/nonbreaking_prefix" 75 | moses_non_breaking_langs=( \ 76 | "ca" "cs" "de" "el" "en" "es" "fi" "fr" "ga" "hu" "is" \ 77 | "it" "lt" "lv" "nl" "pl" "pt" "ro" "ru" "sk" "sl" "sv" \ 78 | "ta" "yue" "zh" ) 79 | wdir="${tools_ext}/moses-tokenizer/share/nonbreaking_prefixes" 80 | MKDIR ${wdir} 81 | cd ${wdir} 82 | 83 | for l in ${moses_non_breaking_langs[@]} ; do 84 | f="${moses_non_breakings}.${l}" 85 | if [ ! -f `basename ${f}` ] ; then 86 | echo " - download ${f}" 87 | wget -q ${moses_git}/${f} 88 | fi 89 | done 90 | } 91 | 92 | 93 | ################################################################### 94 | # 95 | # FAST BPE 96 | # 97 | ################################################################### 98 | 99 | InstallFastBPE () { 100 | cd ${tools_ext} 101 | if [ ! -x fastBPE/fast ] ; then 102 | echo " - download fastBPE software from github" 103 | wget https://github.com/glample/fastBPE/archive/master.zip 104 | unzip master.zip 105 | /bin/rm master.zip 106 | mv fastBPE-master fastBPE 107 | cd fastBPE 108 | echo " - compiling" 109 | g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast 110 | if [ $? -eq 1 ] ; then 111 | echo "ERROR: compilation failed, please install manually"; exit 112 | fi 113 | python setup.py install 114 | fi 115 | } 116 | 117 | ################################################################### 118 | # 119 | # SENTENCEPIECE 120 | # 121 | ################################################################### 122 | 123 | InstallSentencePiece () { 124 | cd ${tools_ext} 125 | if [ ! -d sentencepiece-master ] ; then 126 | echo " - download sentencepiece from github" 127 | wget https://github.com/google/sentencepiece/archive/master.zip 128 | unzip master.zip 129 | /bin/rm master.zip 130 | if [ ! -s /usr/local/bin/spm_encode ] ; then 131 | echo " - building code " 132 | cd sentencepiece-master 133 | mkdir build 134 | cd build 135 | cmake .. 136 | make -j 10 137 | fi 138 | fi 139 | } 140 | 141 | 142 | ################################################################### 143 | # 144 | # Install Japanese tokenizer Mecab 145 | # We do not use automatic installation with "pip" but directly add the soruce directory 146 | # 147 | ################################################################### 148 | 149 | InstallMecab () { 150 | cd ${tools_ext} 151 | if [ ! -x mecab/mecab/bin/mecab ] ; then 152 | echo " - download mecab from github" 153 | wget https://github.com/taku910/mecab/archive/master.zip 154 | unzip master.zip 155 | #/bin/rm master.zip 156 | if [ ! 
-s mecab/bin/mecab ] ; then 157 | mkdir mecab 158 | cd mecab-master/mecab 159 | echo " - installing code" 160 | ./configure --prefix ${tools_ext}/mecab && make && make install 161 | if [ $? -q 1 ] ; then 162 | echo "ERROR: installation failed, please install manually"; exit 163 | fi 164 | fi 165 | if [ ! -d mecab/lib/mecab/dic/ipadic ] ; then 166 | cd ${tools_ext}/mecab-master/mecab-ipadic 167 | echo " - installing dictionaries" 168 | ./configure --prefix ${tools_ext}/mecab --with-mecab-config=${tools_ext}/mecab/bin/mecab-config \ 169 | && make && make install 170 | if [ $? -eq 1 ] ; then 171 | echo "ERROR: compilation failed, please install manually"; exit 172 | fi 173 | fi 174 | fi 175 | } 176 | 177 | 178 | ################################################################### 179 | # 180 | # main 181 | # 182 | ################################################################### 183 | 184 | echo "Installing the laser_encoders package in editable mode" 185 | 186 | pip install -e . 187 | 188 | echo "Installing external tools" 189 | 190 | InstallMosesTools 191 | InstallFastBPE 192 | InstallSentencePiece 193 | 194 | #InstallMecab 195 | echo "" 196 | echo "automatic installation of the Japanese tokenizer mecab may be tricky" 197 | echo "Please install it manually from https://github.com/taku910/mecab" 198 | echo "" 199 | echo "The installation directory should be ${LASER}/tools-external/mecab" 200 | echo "" 201 | -------------------------------------------------------------------------------- /install_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | #------------------------------------------------------- 14 | # 15 | # This bash script installs sentence encoders from Amazon s3 16 | # 17 | 18 | if [ -z ${LASER} ] ; then 19 | echo "Please set the environment variable 'LASER'" 20 | exit 21 | fi 22 | 23 | mdir="${LASER}/models" 24 | 25 | # available encoders 26 | s3="https://dl.fbaipublicfiles.com/laser/models" 27 | networks=("bilstm.eparl21.2018-11-19.pt" \ 28 | "eparl21.fcodes" "eparl21.fvocab" \ 29 | "bilstm.93langs.2018-12-26.pt" \ 30 | "93langs.fcodes" "93langs.fvocab") 31 | 32 | 33 | echo "Downloading networks" 34 | 35 | if [ ! -d ${mdir} ] ; then 36 | echo " - creating directory ${mdir}" 37 | mkdir -p ${mdir} 38 | fi 39 | 40 | cd ${mdir} 41 | for f in ${networks[@]} ; do 42 | if [ -f ${f} ] ; then 43 | echo " - ${mdir}/${f} already downloaded" 44 | else 45 | echo " - ${f}" 46 | wget -q ${s3}/${f} 47 | fi 48 | done 49 | -------------------------------------------------------------------------------- /laser_encoders/__init__.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | # ------------------------------------------------------- 14 | 15 | from laser_encoders.laser_tokenizer import initialize_tokenizer 16 | from laser_encoders.models import LaserEncoderPipeline, initialize_encoder 17 | -------------------------------------------------------------------------------- /laser_encoders/download_models.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | # ------------------------------------------------------- 14 | # 15 | # This python script installs NLLB LASER2 and LASER3 sentence encoders from Amazon s3 16 | 17 | import argparse 18 | import logging 19 | import os 20 | import shutil 21 | import sys 22 | import tempfile 23 | from pathlib import Path 24 | 25 | import requests 26 | from tqdm import tqdm 27 | 28 | from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE, SPM_LANGUAGE 29 | 30 | logging.basicConfig( 31 | stream=sys.stdout, 32 | level=logging.INFO, 33 | format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 34 | ) 35 | logger = logging.getLogger(__name__) 36 | 37 | 38 | class LaserModelDownloader: 39 | def __init__(self, model_dir: str = None): 40 | if model_dir is None: 41 | model_dir = os.path.expanduser("~/.cache/laser_encoders") 42 | os.makedirs(model_dir, exist_ok=True) 43 | 44 | self.model_dir = Path(model_dir) 45 | self.base_url = "https://dl.fbaipublicfiles.com/nllb/laser" 46 | 47 | def download(self, filename: str): 48 | # Because on windows os.path.join will use "\" insted of "/", so link would be: 49 | # https://dl.fbaipublicfiles.com/nllb/laser\laser2.pt instead of https://dl.fbaipublicfiles.com/nllb/laser/laser2.pt 50 | # which results in a failed download. 51 | url = f"{self.base_url}/{filename}" 52 | local_file_path = os.path.join(self.model_dir, filename) 53 | 54 | if os.path.exists(local_file_path): 55 | logger.info(f" - {filename} already downloaded") 56 | else: 57 | logger.info(f" - Downloading {filename}") 58 | 59 | tf = tempfile.NamedTemporaryFile(delete=False) 60 | temp_file_path = tf.name 61 | 62 | with tf: 63 | response = requests.get(url, stream=True) 64 | total_size = int(response.headers.get("Content-Length", 0)) 65 | progress_bar = tqdm(total=total_size, unit_scale=True, unit="B") 66 | 67 | for chunk in response.iter_content(chunk_size=1024): 68 | tf.write(chunk) 69 | progress_bar.update(len(chunk)) 70 | progress_bar.close() 71 | 72 | shutil.move(temp_file_path, local_file_path) 73 | 74 | def get_language_code(self, language_list: dict, lang: str) -> str: 75 | try: 76 | lang_3_4 = language_list[lang] 77 | if isinstance(lang_3_4, list): 78 | options = ", ".join(f"'{opt}'" for opt in lang_3_4) 79 | raise ValueError( 80 | f"Language '{lang}' has multiple options: {options}. Please specify using the 'lang' argument." 
81 | ) 82 | return lang_3_4 83 | except KeyError: 84 | raise ValueError( 85 | f"language name: {lang} not found in language list. Specify a supported language name" 86 | ) 87 | 88 | def download_laser2(self): 89 | self.download("laser2.pt") 90 | self.download("laser2.spm") 91 | self.download("laser2.cvocab") 92 | 93 | def download_laser3(self, lang: str, spm: bool = False): 94 | result = self.get_language_code(LASER3_LANGUAGE, lang) 95 | 96 | if isinstance(result, list): 97 | raise ValueError( 98 | f"There are script-specific models available for {lang}. Please choose one from the following: {result}" 99 | ) 100 | 101 | lang = result 102 | self.download(f"laser3-{lang}.v1.pt") 103 | if spm: 104 | if lang in SPM_LANGUAGE: 105 | self.download(f"laser3-{lang}.v1.spm") 106 | self.download(f"laser3-{lang}.v1.cvocab") 107 | else: 108 | self.download(f"laser2.spm") 109 | self.download(f"laser2.cvocab") 110 | 111 | def main(self, args): 112 | if args.laser: 113 | if args.laser == "laser2": 114 | self.download_laser2() 115 | elif args.laser == "laser3": 116 | self.download_laser3(lang=args.lang, spm=args.spm) 117 | else: 118 | raise ValueError( 119 | f"Unsupported laser model: {args.laser}. Choose either laser2 or laser3." 120 | ) 121 | else: 122 | if args.lang in LASER3_LANGUAGE: 123 | self.download_laser3(lang=args.lang, spm=args.spm) 124 | elif args.lang in LASER2_LANGUAGE: 125 | self.download_laser2() 126 | else: 127 | raise ValueError( 128 | f"Unsupported language name: {args.lang}. Please specify a supported language name using --lang." 129 | ) 130 | 131 | 132 | if __name__ == "__main__": 133 | parser = argparse.ArgumentParser(description="LASER: Download Laser models") 134 | parser.add_argument( 135 | "--laser", 136 | type=str, 137 | help="Laser model to download", 138 | ) 139 | parser.add_argument( 140 | "--lang", 141 | type=str, 142 | help="The language name in FLORES200 format", 143 | ) 144 | parser.add_argument( 145 | "--spm", 146 | action="store_false", 147 | help="Do not download the SPM model?", 148 | ) 149 | parser.add_argument( 150 | "--model-dir", type=str, help="The directory to download the models to" 151 | ) 152 | args = parser.parse_args() 153 | downloader = LaserModelDownloader(args.model_dir) 154 | downloader.main(args) 155 | -------------------------------------------------------------------------------- /laser_encoders/test_models_initialization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import pytest 5 | 6 | from laser_encoders.download_models import LaserModelDownloader 7 | from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE 8 | from laser_encoders.laser_tokenizer import initialize_tokenizer 9 | from laser_encoders.models import initialize_encoder 10 | 11 | 12 | def test_validate_achnese_models_and_tokenize_laser3(lang="acehnese"): 13 | with tempfile.TemporaryDirectory() as tmp_dir: 14 | print(f"Created temporary directory for {lang}", tmp_dir) 15 | 16 | downloader = LaserModelDownloader(model_dir=tmp_dir) 17 | downloader.download_laser3(lang) 18 | encoder = initialize_encoder(lang, model_dir=tmp_dir) 19 | tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir) 20 | 21 | # Test tokenization with a sample sentence 22 | tokenized = tokenizer.tokenize("This is a sample sentence.") 23 | 24 | print(f"{lang} model validated successfully") 25 | 26 | 27 | def test_validate_english_models_and_tokenize_laser2(lang="english"): 28 | with tempfile.TemporaryDirectory() as 
tmp_dir: 29 | print(f"Created temporary directory for {lang}", tmp_dir) 30 | 31 | downloader = LaserModelDownloader(model_dir=tmp_dir) 32 | downloader.download_laser2() 33 | 34 | encoder = initialize_encoder(lang, model_dir=tmp_dir) 35 | tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir) 36 | 37 | # Test tokenization with a sample sentence 38 | tokenized = tokenizer.tokenize("This is a sample sentence.") 39 | 40 | print(f"{lang} model validated successfully") 41 | 42 | 43 | def test_validate_kashmiri_models_and_tokenize_laser3(lang="kas"): 44 | with tempfile.TemporaryDirectory() as tmp_dir: 45 | print(f"Created temporary directory for {lang}", tmp_dir) 46 | 47 | downloader = LaserModelDownloader(model_dir=tmp_dir) 48 | with pytest.raises(ValueError): 49 | downloader.download_laser3(lang) 50 | 51 | encoder = initialize_encoder(lang, model_dir=tmp_dir) 52 | tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir) 53 | 54 | # Test tokenization with a sample sentence 55 | tokenized = tokenizer.tokenize("This is a sample sentence.") 56 | 57 | print(f"{lang} model validated successfully") 58 | -------------------------------------------------------------------------------- /laser_encoders/validate_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import pytest 5 | 6 | from laser_encoders.download_models import LaserModelDownloader 7 | from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE 8 | from laser_encoders.laser_tokenizer import initialize_tokenizer 9 | from laser_encoders.models import initialize_encoder 10 | 11 | 12 | @pytest.mark.slow 13 | @pytest.mark.parametrize("lang", LASER3_LANGUAGE) 14 | def test_validate_language_models_and_tokenize_laser3(lang): 15 | with tempfile.TemporaryDirectory() as tmp_dir: 16 | print(f"Created temporary directory for {lang}", tmp_dir) 17 | 18 | downloader = LaserModelDownloader(model_dir=tmp_dir) 19 | if lang in ["kashmiri", "kas", "central kanuri", "knc"]: 20 | with pytest.raises(ValueError) as excinfo: 21 | downloader.download_laser3(lang) 22 | assert "ValueError" in str(excinfo.value) 23 | print(f"{lang} language model raised a ValueError as expected.") 24 | else: 25 | downloader.download_laser3(lang) 26 | encoder = initialize_encoder(lang, model_dir=tmp_dir) 27 | tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir) 28 | 29 | # Test tokenization with a sample sentence 30 | tokenized = tokenizer.tokenize("This is a sample sentence.") 31 | 32 | print(f"{lang} model validated successfully") 33 | 34 | 35 | @pytest.mark.slow 36 | @pytest.mark.parametrize("lang", LASER2_LANGUAGE) 37 | def test_validate_language_models_and_tokenize_laser2(lang): 38 | with tempfile.TemporaryDirectory() as tmp_dir: 39 | print(f"Created temporary directory for {lang}", tmp_dir) 40 | 41 | downloader = LaserModelDownloader(model_dir=tmp_dir) 42 | downloader.download_laser2() 43 | 44 | encoder = initialize_encoder(lang, model_dir=tmp_dir) 45 | tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir) 46 | 47 | # Test tokenization with a sample sentence 48 | tokenized = tokenizer.tokenize("This is a sample sentence.") 49 | 50 | print(f"{lang} model validated successfully") 51 | 52 | 53 | class MockLaserModelDownloader(LaserModelDownloader): 54 | def __init__(self, model_dir): 55 | self.model_dir = model_dir 56 | 57 | def download_laser3(self, lang): 58 | lang = self.get_language_code(LASER3_LANGUAGE, lang) 59 | file_path = os.path.join(self.model_dir, 
f"laser3-{lang}.v1.pt") 60 | if not os.path.exists(file_path): 61 | raise FileNotFoundError(f"Could not find {file_path}.") 62 | 63 | def download_laser2(self): 64 | files = ["laser2.pt", "laser2.spm", "laser2.cvocab"] 65 | for file_name in files: 66 | file_path = os.path.join(self.model_dir, file_name) 67 | if not os.path.exists(file_path): 68 | raise FileNotFoundError(f"Could not find {file_path}.") 69 | 70 | 71 | CACHE_DIR = "/home/user/.cache/models" # Change this to the desired cache directory 72 | 73 | # This uses the mock downloader 74 | @pytest.mark.slow 75 | @pytest.mark.parametrize("lang", LASER3_LANGUAGE) 76 | def test_validate_language_models_and_tokenize_mock_laser3(lang): 77 | downloader = MockLaserModelDownloader(model_dir=CACHE_DIR) 78 | 79 | try: 80 | downloader.download_laser3(lang) 81 | except FileNotFoundError as e: 82 | raise pytest.error(str(e)) 83 | 84 | encoder = initialize_encoder(lang, model_dir=CACHE_DIR) 85 | tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR) 86 | 87 | tokenized = tokenizer.tokenize("This is a sample sentence.") 88 | 89 | print(f"{lang} model validated successfully") 90 | 91 | 92 | # This uses the mock downloader 93 | @pytest.mark.slow 94 | @pytest.mark.parametrize("lang", LASER2_LANGUAGE) 95 | def test_validate_language_models_and_tokenize_mock_laser2(lang): 96 | downloader = MockLaserModelDownloader(model_dir=CACHE_DIR) 97 | 98 | try: 99 | downloader.download_laser2() 100 | except FileNotFoundError as e: 101 | raise pytest.error(str(e)) 102 | 103 | encoder = initialize_encoder(lang, model_dir=CACHE_DIR) 104 | tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR) 105 | 106 | tokenized = tokenizer.tokenize("This is a sample sentence.") 107 | 108 | print(f"{lang} model validated successfully") 109 | -------------------------------------------------------------------------------- /nllb/download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | #------------------------------------------------------- 14 | # 15 | # This bash script installs NLLB LASER2 and LASER3 sentence encoders from Amazon s3 16 | 17 | # default to download to current directory 18 | mdir=$(pwd) 19 | 20 | echo "Directory for model download: ${mdir}" 21 | 22 | version=1 # model version 23 | 24 | echo "Downloading networks..." 25 | 26 | if [ ! -d ${mdir} ] ; then 27 | echo " - creating model directory: ${mdir}" 28 | mkdir -p ${mdir} 29 | fi 30 | 31 | function download { 32 | file=$1 33 | if [ -f ${mdir}/${file} ] ; then 34 | echo " - ${mdir}/$file already downloaded"; 35 | else 36 | echo " - $s3/${file}"; 37 | wget -q $s3/${file}; 38 | fi 39 | } 40 | 41 | cd ${mdir} # move to model directory 42 | 43 | # available encoders 44 | s3="https://dl.fbaipublicfiles.com/nllb/laser" 45 | 46 | # LASER2 (download by default) 47 | if [ ! 
-f ${mdir}/laser2.pt ] ; then 48 | echo " - $s3/laser2.pt" 49 | wget --trust-server-names -q https://tinyurl.com/nllblaser2 50 | else 51 | echo " - ${mdir}/laser2.pt already downloaded" 52 | fi 53 | download "laser2.spm" 54 | download "laser2.cvocab" 55 | 56 | # LASER3 models 57 | if [ ! $# -eq 0 ]; then 58 | # chosen model subset from command line 59 | langs=$@ 60 | else 61 | # all available LASER3 models 62 | langs=(ace_Latn aka_Latn als_Latn amh_Ethi asm_Beng awa_Deva ayr_Latn azb_Arab azj_Latn bak_Cyrl bam_Latn ban_Latn bel_Cyrl \ 63 | bem_Latn ben_Beng bho_Deva bjn_Latn bod_Tibt bug_Latn ceb_Latn cjk_Latn ckb_Arab crh_Latn cym_Latn dik_Latn diq_Latn \ 64 | dyu_Latn dzo_Tibt ewe_Latn fao_Latn fij_Latn fon_Latn fur_Latn fuv_Latn gaz_Latn gla_Latn gle_Latn grn_Latn guj_Gujr \ 65 | hat_Latn hau_Latn hin_Deva hne_Deva hye_Armn ibo_Latn ilo_Latn ind_Latn jav_Latn kab_Latn kac_Latn kam_Latn kan_Knda \ 66 | kas_Arab kas_Deva kat_Geor kaz_Cyrl kbp_Latn kea_Latn khk_Cyrl khm_Khmr kik_Latn kin_Latn kir_Cyrl kmb_Latn kmr_Latn \ 67 | knc_Arab knc_Latn kon_Latn lao_Laoo lij_Latn lim_Latn lin_Latn lmo_Latn ltg_Latn ltz_Latn lua_Latn lug_Latn luo_Latn \ 68 | lus_Latn mag_Deva mai_Deva mal_Mlym mar_Deva min_Latn mlt_Latn mni_Beng mos_Latn mri_Latn mya_Mymr npi_Deva nso_Latn \ 69 | nus_Latn nya_Latn ory_Orya pag_Latn pan_Guru pap_Latn pbt_Arab pes_Arab plt_Latn prs_Arab quy_Latn run_Latn sag_Latn \ 70 | san_Deva sat_Beng scn_Latn shn_Mymr sin_Sinh smo_Latn sna_Latn snd_Arab som_Latn sot_Latn srd_Latn ssw_Latn sun_Latn \ 71 | swh_Latn szl_Latn tam_Taml taq_Latn tat_Cyrl tel_Telu tgk_Cyrl tgl_Latn tha_Thai tir_Ethi tpi_Latn tsn_Latn tso_Latn \ 72 | tuk_Latn tum_Latn tur_Latn twi_Latn tzm_Tfng uig_Arab umb_Latn urd_Arab uzn_Latn vec_Latn war_Latn wol_Latn xho_Latn \ 73 | ydd_Hebr yor_Latn zsm_Latn zul_Latn) 74 | fi 75 | 76 | spm_langs=(amh_Ethi ayr_Latn azj_Latn bak_Cyrl bel_Cyrl bod_Tibt ckb_Arab crh_Latn dik_Latn dzo_Tibt fur_Latn \ 77 | fuv_Latn grn_Latn kab_Latn kac_Latn kaz_Cyrl kir_Cyrl kmr_Latn lij_Latn lim_Latn lmo_Latn ltg_Latn \ 78 | mya_Mymr pbt_Arab pes_Arab prs_Arab sat_Beng scn_Latn srd_Latn szl_Latn taq_Latn tgk_Cyrl tir_Ethi \ 79 | tzm_Tfng vec_Latn) 80 | 81 | for lang in ${langs[@]}; do 82 | download "laser3-$lang.v$version.pt"; 83 | for spm_lang in ${spm_langs[@]}; do 84 | if [[ $lang == $spm_lang ]] ; then 85 | download "laser3-$lang.v$version.spm"; 86 | download "laser3-$lang.v$version.cvocab"; 87 | fi 88 | done 89 | done -------------------------------------------------------------------------------- /nllb/nllb_laser3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/LASER/d7e2544234c1d2a7076280944bdc2637f98ef3c2/nllb/nllb_laser3.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4", "setuptools"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [project] 6 | name = "laser_encoders" 7 | version = "0.0.2" 8 | authors = [{name = "Facebook AI Research"}] 9 | description = "LASER Language-Agnostic SEntence Representations is a toolkit to calculate multilingual sentence embeddings and to use them for document classification, bitext filtering and mining" 10 | readme = "laser_encoders/README.md" 11 | requires-python = ">=3.8" 12 | 13 | dependencies = [ 14 | 'sacremoses==0.1.0', 15 | 'unicategories>=0.1.2', 16 | 
'sentencepiece>=0.1.99', 17 | 'numpy>=1.21.3', 18 | 'torch>=1.10.0', 19 | 'fairseq>=0.12.2', 20 | ] 21 | 22 | classifiers=[ 23 | "License :: OSI Approved :: BSD License", 24 | "Topic :: Scientific/Engineering", 25 | "Development Status :: 4 - Beta", 26 | ] 27 | 28 | [project.urls] 29 | "Homepage" = "https://github.com/facebookresearch/LASER" 30 | "Bug Tracker" = "https://github.com/facebookresearch/LASER/issues" 31 | 32 | [project.optional-dependencies] 33 | dev = [ 34 | # Test 35 | "pytest>=4.3.0", 36 | # Format 37 | "black==22.3.0", 38 | "isort>=5.10.1", 39 | # Linters 40 | "mypy>=0.782", 41 | "pylint>=2.8.0", 42 | # Release 43 | "flit>=3.5.1" 44 | ] 45 | 46 | [tool.black] 47 | # Black defaults are great ! 48 | 49 | [tool.isort] 50 | profile = "black" 51 | skip_gitignore = true 52 | skip_glob = ["website/*", "*.pyx"] 53 | 54 | [tool.mypy] 55 | python_version = "3.8" 56 | show_error_codes = true 57 | check_untyped_defs = true 58 | 59 | ignore_missing_imports = true 60 | 61 | files = [ 62 | "laser_encoders/" 63 | ] 64 | 65 | [tool.pytest.ini_options] 66 | testpaths = ["laser_encoders"] 67 | python_files = [ 68 | "test_*.py", 69 | ] -------------------------------------------------------------------------------- /remove_external_tools.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | #------------------------------------------------------- 14 | # 15 | # This bash script removes all installed third party software 16 | # 17 | 18 | if [ -z ${LASER+x} ] ; then 19 | echo "Please set the environment variable 'LASER'" 20 | exit 21 | fi 22 | 23 | bdir="${LASER}" 24 | tools_ext="${bdir}/tools-external" 25 | 26 | /bin/rm -rf ${tools_ext} 27 | -------------------------------------------------------------------------------- /source/lib/romanize_lc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | # -------------------------------------------------------- 14 | # 15 | # Romanize and lower case text 16 | 17 | import os 18 | import sys 19 | import argparse 20 | from transliterate import translit, get_available_language_codes 21 | 22 | parser = argparse.ArgumentParser( 23 | formatter_class=argparse.RawDescriptionHelpFormatter, 24 | description="Calculate multilingual sentence encodings") 25 | parser.add_argument( 26 | '--input', '-i', type=argparse.FileType('r', encoding='UTF-8'), 27 | default=sys.stdin, 28 | metavar='PATH', 29 | help="Input text file (default: standard input).") 30 | parser.add_argument( 31 | '--output', '-o', type=argparse.FileType('w', encoding='UTF-8'), 32 | default=sys.stdout, 33 | metavar='PATH', 34 | help="Output text file (default: standard output).") 35 | parser.add_argument( 36 | '--language', '-l', type=str, 37 | metavar='STR', default="none", 38 | help="perform transliteration into Roman characters" 39 | " from the specified language (default none)") 40 | parser.add_argument( 41 | '--preserve-case', '-C', action='store_true', 42 | help="Preserve case of input texts (default is all lower case)") 43 | 44 | args = parser.parse_args() 45 | 46 | for line in args.input: 47 | if args.language != "none": 48 | line = translit(line, args.language, reversed=True) 49 | if not args.preserve_case: 50 | line = line.lower() 51 | args.output.write(line) 52 | -------------------------------------------------------------------------------- /source/similarity_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | # -------------------------------------------------------- 14 | # 15 | # Quora Q&A paraphrase detection 16 | 17 | import os 18 | import sys 19 | import argparse 20 | import faiss 21 | import numpy as np 22 | 23 | # get environment 24 | assert os.environ.get('LASER'), 'Please set the enviornment variable LASER' 25 | LASER = os.environ['LASER'] 26 | 27 | sys.path.append(LASER + '/source') 28 | sys.path.append(LASER + '/source/lib') 29 | from embed import SentenceEncoder, EncodeLoad, EncodeFile 30 | from text_processing import Token, BPEfastApply 31 | from indexing import IndexCreate, IndexSearchMultiple, IndexPrintConfusionMatrix 32 | 33 | ############################################################################### 34 | 35 | parser = argparse.ArgumentParser('LASER: similarity search') 36 | parser.add_argument('--base-dir', type=str, default='.', 37 | help='Base directory for all data files') 38 | parser.add_argument('--data', type=str, required=True, 39 | help='Direcory and basename of input data (language name will be added)') 40 | parser.add_argument('--output', type=str, required=True, 41 | help='Directory and basename of created data (language name will be added)') 42 | parser.add_argument('--textual', action='store_true', 43 | help='Use textual comparison instead of indicies') 44 | parser.add_argument( 45 | '--lang', '-l', nargs='+', required=True, 46 | help="List of languages to test on") 47 | 48 | # preprocessing 49 | parser.add_argument('--bpe-codes', type=str, required=True, 50 | help='Fast BPPE codes and vocabulary') 51 | parser.add_argument('--verbose', action='store_true', 52 | help='Detailed output') 53 | 54 | # options for encoder 55 | parser.add_argument('--encoder', type=str, required=True, 56 | help='encoder to be used') 57 | parser.add_argument('--buffer-size', type=int, default=100, 58 | help='Buffer size (sentences)') 59 | parser.add_argument('--max-tokens', type=int, default=12000, 60 | help='Maximum number of tokens to process in a batch') 61 | parser.add_argument('--max-sentences', type=int, default=None, 62 | help='Maximum number of sentences to process in a batch') 63 | parser.add_argument('--cpu', action='store_true', 64 | help='Use CPU instead of GPU') 65 | 66 | args = parser.parse_args() 67 | 68 | print('LASER: similarity search') 69 | 70 | print('\nProcessing:') 71 | all_texts = [] 72 | if args.textual: 73 | print(' - using textual comparision') 74 | for l in args.lang: 75 | with open(os.path.join(args.base_dir, args.data + '.' + l), 76 | encoding='utf-8', errors='surrogateescape') as f: 77 | texts = f.readlines() 78 | print(' - {:s}: {:d} lines'.format(args.data + '.' + l, len(texts))) 79 | all_texts.append(texts) 80 | 81 | enc = EncodeLoad(args) 82 | 83 | out_dir = os.path.dirname(args.output) 84 | if not os.path.exists(out_dir): 85 | print(' - creating directory {}'.format(out_dir)) 86 | os.mkdir(out_dir) 87 | 88 | all_data = [] 89 | all_index = [] 90 | for l in args.lang: 91 | Token(os.path.join(args.base_dir, args.data + '.' + l), 92 | os.path.join(args.base_dir, args.output + '.tok.' + l), 93 | lang=l, 94 | romanize=True if l == 'el' else False, 95 | lower_case=True, 96 | verbose=args.verbose, over_write=False) 97 | BPEfastApply(os.path.join(args.base_dir, args.output + '.tok.' 
+ l), 98 | os.path.join(args.base_dir, args.output + '.bpe.' + l), 99 | args.bpe_codes, 100 | verbose=args.verbose, over_write=False) 101 | EncodeFile(enc, 102 | os.path.join(args.base_dir, args.output + '.bpe.' + l), 103 | os.path.join(args.base_dir, args.output + '.enc.' + l), 104 | verbose=args.verbose, over_write=False) 105 | d, idx = IndexCreate(os.path.join(args.base_dir, args.output + '.enc.' + l), 106 | 'FlatL2', 107 | verbose=args.verbose, save_index=False) 108 | all_data.append(d) 109 | all_index.append(idx) 110 | 111 | err = IndexSearchMultiple(all_data, all_index, args.lang, texts=all_texts, 112 | verbose=False, print_errors=False) 113 | IndexPrintConfusionMatrix(err, args.lang) 114 | -------------------------------------------------------------------------------- /source/xsim.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | # LASER Language-Agnostic SEntence Representations 8 | # is a toolkit to calculate multilingual sentence embeddings 9 | # and to use them for document classification, bitext filtering 10 | # and mining 11 | # 12 | # -------------------------------------------------------- 13 | # 14 | # Tool to calculate multilingual similarity error rate (xSIM) 15 | 16 | import faiss 17 | import numpy as np 18 | import typing as tp 19 | import os 20 | import json 21 | from enum import Enum 22 | 23 | 24 | class Margin(Enum): 25 | RATIO = "ratio" 26 | DISTANCE = "distance" 27 | ABSOLUTE = "absolute" 28 | 29 | @classmethod 30 | def has_value(cls, value): 31 | return value in cls._value2member_map_ 32 | 33 | 34 | def xSIM( 35 | x: tp.Union[str, np.ndarray], 36 | y: tp.Union[str, np.ndarray], 37 | margin: str = Margin.RATIO.value, 38 | k: int = 4, 39 | dim: int = 1024, 40 | fp16: bool = False, 41 | eval_text: str = None, 42 | augmented_json: str = None, 43 | ) -> tp.Tuple[int, int, tp.Dict[str, int]]: 44 | assert Margin.has_value(margin), f"Margin type: {margin}, is not supported." 45 | if not isinstance(x, np.ndarray): 46 | x = _load_embeddings(x, dim, fp16) 47 | if not isinstance(y, np.ndarray): 48 | y = _load_embeddings(y, dim, fp16) 49 | # calculate xSIM error 50 | return calculate_error(x, y, margin, k, eval_text, augmented_json) 51 | 52 | 53 | def _load_embeddings(infile: str, dim: int, fp16: bool = False) -> np.ndarray: 54 | assert os.path.isfile(infile), f"file: {infile} does not exist." 
55 | emb = np.fromfile(infile, dtype=np.float16 if fp16 else np.float32) 56 | num_examples = emb.shape[0] // dim 57 | emb.resize(num_examples, dim) 58 | if fp16: 59 | emb = emb.astype(np.float32) # faiss currently only supports fp32 60 | return emb 61 | 62 | 63 | def score_margin( 64 | Dxy: np.ndarray, 65 | Ixy: np.ndarray, 66 | Ax: np.ndarray, 67 | Ay: np.ndarray, 68 | margin: str, 69 | k: int, 70 | ) -> np.ndarray: 71 | nbex = Dxy.shape[0] 72 | scores = np.zeros((nbex, k)) 73 | for i in range(nbex): 74 | for j in range(k): 75 | jj = Ixy[i, j] 76 | a = Dxy[i, j] 77 | b = (Ax[i] + Ay[jj]) / 2 78 | if margin == Margin.RATIO.value: 79 | scores[i, j] = a / b 80 | else: # distance margin 81 | scores[i, j] = a - b 82 | return scores 83 | 84 | 85 | def _score_knn(x: np.ndarray, y: np.ndarray, k: int, margin: str) -> np.ndarray: 86 | nbex, dim = x.shape 87 | # create index 88 | idx_x = faiss.IndexFlatIP(dim) 89 | idx_y = faiss.IndexFlatIP(dim) 90 | # L2 normalization needed for cosine distance 91 | faiss.normalize_L2(x) 92 | faiss.normalize_L2(y) 93 | idx_x.add(x) 94 | idx_y.add(y) 95 | if margin == Margin.ABSOLUTE.value: 96 | scores, indices = idx_y.search(x, 1) 97 | else: 98 | # return cosine similarity and indices of k closest neighbors 99 | Cos_xy, Idx_xy = idx_y.search(x, k) 100 | Cos_yx, Idx_yx = idx_x.search(y, k) 101 | 102 | # average cosines 103 | Avg_xy = Cos_xy.mean(axis=1) 104 | Avg_yx = Cos_yx.mean(axis=1) 105 | 106 | scores = score_margin(Cos_xy, Idx_xy, Avg_xy, Avg_yx, margin, k) 107 | 108 | # find best 109 | best = scores.argmax(axis=1) 110 | indices = np.zeros((nbex, 1), dtype=np.int32) 111 | for i in range(nbex): 112 | indices[i] = Idx_xy[i, best[i]] 113 | return indices 114 | 115 | 116 | def get_transform(augmented_json, closest_neighbor, src): 117 | if ( 118 | closest_neighbor in augmented_json 119 | and augmented_json[closest_neighbor]["src"] == src 120 | ): 121 | return augmented_json[closest_neighbor]["errtype"] 122 | return "Misaligned" 123 | 124 | 125 | def calculate_error( 126 | x: np.ndarray, 127 | y: np.ndarray, 128 | margin: str = None, 129 | k: int = 4, 130 | eval_text: str = None, 131 | augmented_json: str = None, 132 | ) -> tp.Tuple[int, int, tp.Dict[str, int]]: 133 | if augmented_json: 134 | with open(augmented_json) as f: 135 | augmented_json = json.load(f) 136 | assert ( 137 | x.shape[0] < y.shape[0] 138 | ), f"Shape mismatch: {x.shape[0]} >= target {y.shape[0]}" 139 | else: 140 | assert ( 141 | x.shape == y.shape 142 | ), f"number of source {x.shape} / target {y.shape} shapes mismatch, " 143 | nbex = x.shape[0] 144 | augmented_report = {} 145 | 146 | # for each x calculate the highest scoring neighbor from y 147 | closest_neighbor = _score_knn(x, y, k, margin) 148 | 149 | if eval_text: # calc textual error 150 | lines = open(eval_text, encoding="utf-8", errors="surrogateescape").readlines() 151 | err = 0 152 | for ex in range(nbex): 153 | if lines[ex] != lines[closest_neighbor[ex, 0]]: 154 | err += 1 155 | if augmented_json: 156 | transform = get_transform( 157 | augmented_json, 158 | lines[closest_neighbor[ex, 0]].strip(), 159 | lines[ex].strip(), 160 | ) 161 | augmented_report[transform] = augmented_report.get(transform, 0) + 1 162 | else: # calc index error 163 | ref = np.linspace(0, nbex - 1, nbex).astype(int) # [0, nbex) 164 | err = nbex - np.equal(closest_neighbor.reshape(nbex), ref).astype(int).sum() 165 | return err, nbex, augmented_report 166 | -------------------------------------------------------------------------------- 
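As a quick orientation for readers of `xsim.py`, here is a minimal usage sketch. The file names are hypothetical, and it assumes `${LASER}/source` is on `sys.path` so the module can be imported directly; the embedding files are raw float32 matrices with `dim=1024`, as produced by `embed.py`.

```python
# Minimal sketch: compute the xSIM error rate between two embedding files.
# Hypothetical file names; assumes ${LASER}/source is on sys.path.
import os
import sys

sys.path.append(os.environ["LASER"] + "/source")

import numpy as np
from xsim import xSIM, Margin

src_emb = "flores_dev.eng_Latn.enc"  # hypothetical paths
tgt_emb = "flores_dev.fra_Latn.enc"

# Paths are accepted directly; xSIM loads the raw float32 embeddings itself.
err, nbex, _ = xSIM(src_emb, tgt_emb, margin=Margin.RATIO.value, k=4, dim=1024)
print(f"xSIM error: {err}/{nbex} = {100 * err / nbex:.2f}%")

# Pre-loaded (N, 1024) numpy arrays work as well, e.g. with the distance margin.
x = np.fromfile(src_emb, dtype=np.float32).reshape(-1, 1024)
y = np.fromfile(tgt_emb, dtype=np.float32).reshape(-1, 1024)
err, nbex, _ = xSIM(x, y, margin=Margin.DISTANCE.value)
```

Without `eval_text`, the error is computed on indices (row *i* of the source is expected to align with row *i* of the target); passing `eval_text` switches to the textual comparison implemented in `calculate_error`.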
/tasks/CCMatrix/MatrixMine.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/LASER/d7e2544234c1d2a7076280944bdc2637f98ef3c2/tasks/CCMatrix/MatrixMine.pdf -------------------------------------------------------------------------------- /tasks/CCMatrix/README.md: -------------------------------------------------------------------------------- 1 | # CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB 2 | 3 | ## Parallel data 4 | 5 | We show that margin-based bitext mining in LASER's multilingual sentence space can be applied to monolingual corpora of billions of sentences to produce high quality aligned translation data. We use thirty-two snapshots of a curated common crawl corpus [1] totaling 69 billion unique sentences. Using one unified approach for 80 languages, we were able to mine 10.8 billion parallel sentences, out of which only 2.9 billion are aligned with English. 6 | 7 | ## Download 8 | 9 | We open-source our scripts in this directory so that others may reproduce the data, evaluation and results reported in the CCMatrix paper. 10 | ``` 11 | pip3 install cc_net 12 | python3 dl_cc_matrix.py 13 | ``` 14 | 15 | Please cite reference [2][3] if you use this data. 16 | 17 | 18 | ## Evaluation 19 | 20 | Evaluation 21 | We have assessed the quality of our mined data with bilingual models and multilingual models. 22 | 23 | * Bilingual models [2]: To evaluate the quality of the mined bitexts, we train NMT systems for most of the language pairs and evaluate them on TED, WMT and WAT test sets. Using our mined bitexts only and no human translated parallel data, we achieve a new state-of-the-art for a single system on the WMT'19 test set for translation between English and German, Russian and Chinese, as well as German/French. In particular, our English/German system outperforms the best single one by close to 4 BLEU points and is almost on pair with best WMT'19 evaluation system which uses system combination and back-translation. We also achieve excellent results for distant languages pairs like Russian/Japanese, outperforming the best submission at the 2019 workshop on Asian Translation (WAT). 24 | 25 | * Multilingual models [3]: CCMatrix data is used to train M2M-100, a large-scale Many-to-Many multilingual translation model. The thousands of directions we mine produce training data for direct translations without relying solely on English data. We mine using novel strategy which exploits language groupings and bridge languages to avoid mining every possible direction while maintaining good accuracy. By training on this data and scaling model capacity through model parallelism and language-specific parameters, M2M-100 outperforms English-Centric multilingual models trained on data where either the source or target language is English. The system improves over 10 BLEU on average compared to an English-Centric baseline when translating directly between non-English directions. M2M-100 is competitive to bilingual models from WMT and improves over existing publicly available multilingual translation systems. To download the data, follow our instructions above. To download the models and reproduce the training, click [*here*](https://github.com/pytorch/fairseq/tree/master/examples/m2m_100) 26 | 27 | Please note that additional data filtering was applied before training the M2M-100 model, see [3] for details. 
28 | Also, we have improved mining against English which leads to more bitexts, in particular for mid- and low-resources languages. 29 | This new data was not used for M2M-100. 30 | 31 | ## References 32 | 33 | [1] Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Jouli and Edouard Grave, 34 | [*CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data*](https://arxiv.org/abs/1911.00359) 35 | 36 | [2] Holger Schwenk, Guillaume Wenzek, Sergey Edunov, Edouard Grave and Armand Joulin, 37 | [*CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB*](https://arxiv.org/abs/1911.04944) 38 | 39 | [3] Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, and Armand Joulin. Beyond English-Centric Multilingual Machine Translation 40 | -------------------------------------------------------------------------------- /tasks/SentimentAnalysis/README.md: -------------------------------------------------------------------------------- 1 | # Laser Encoder: Sentiment Analysis 2 | 3 | ## Overview 4 | 5 | This project demonstrates the application of the Laser Encoder tool for creating sentence embeddings in the context of sentiment analysis. The Laser Encoder is used to encode text data, and a sentiment analysis model is trained to predict the sentiment of the text. 6 | 7 | ## Getting Started 8 | 9 | To run the notebook in Google Colab, click the "Open in Colab" button below: 10 | 11 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NIXBLACK11/LASER-fork/blob/Sentiment-analysis-laser/tasks/SentimentAnalysis/SentimentAnalysis.ipynb) 12 | 13 | Also, check out the hugging face space with the button below: 14 | 15 | [![Open In Hugging Face Space](https://img.shields.io/badge/Open%20In-Hugging%20Face%20Space-blue?logo=huggingface)](https://huggingface.co/spaces/NIXBLACK/SentimentAnalysis_LASER_) 16 | 17 | 18 | ## Example Usage 19 | 20 | Run the Example Notebook: 21 | Execute the provided Jupyter Notebook SentimentAnalysis.ipynb 22 | 23 | jupyter notebook SentimentAnalysis.ipynb 24 | 25 | 26 | ## Customization 27 | 28 | - Modify the model architecture, hyperparameters, and training settings in the neural network model section based on your requirements. 29 | - Customize the sentiment mapping and handling of unknown sentiments in the data preparation section. 30 | 31 | ## Additional Notes 32 | - Feel free to experiment with different models, embeddings, and hyperparameters to optimize performance. 33 | - Ensure that the dimensions of embeddings and model inputs are compatible. 34 | Adapt the code based on your specific dataset and use case. 35 | -------------------------------------------------------------------------------- /tasks/WikiMatrix/README.md: -------------------------------------------------------------------------------- 1 | # WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia 2 | 3 | The goal of this project is to mine for parallel sentences in the textual content of Wikipedia for all possible language pairs. 
4 | 5 | 6 | ## Mined data 7 | * 85 different languages, 1620 language pairs 8 | * 134M parallel sentences, out of which 34M are aligned with English 9 | * this [*table shows the amount of mined parallel sentences for most of the language pairs*](WikiMatrix-sizes.pdf) 10 | * the mined bitexts are stored on AWS and can be downloaded with the following command: 11 | ```bash 12 | wget https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.en-fr.tsv.gz 13 | ``` 14 | Replace "en-fr" with the ISO codes of the desired language pair. 15 | The language pair must be in alphabetical order, e.g. "de-en" and not "en-de". 16 | The list of available bitexts and their sizes is given in the file [*list_of_bitexts.txt*](list_of_bitexts.txt). 17 | Please do **not loop over all files** since AWS implements some [*limitations*](https://dl.fbaipublicfiles.com/README) to avoid abuse. 18 | 19 | Use this command if you want to download all 1620 language pairs in one tar file (but this is 65GB!): 20 | ```bash 21 | wget https://dl.fbaipublicfiles.com/laser/WikiMatrix/WikiMatrix.v1.1620_language_pairs.tar 22 | ``` 23 | 24 | ## Approach 25 | 26 | We use LASER's bitext mining approach and encoder for 93 languages [2,3]. 27 | We do not use the inter-language links provided by Wikipedia, 28 | but search over all Wikipedia articles of each language. We approach the 29 | computational challenge of mining almost 600 million sentences by using fast 30 | indexing and similarity search with [*FAISS*](https://github.com/facebookresearch/faiss). 31 | Prior to mining parallel sentences, we perform 32 | sentence segmentation, deduplication and language identification. 33 | Please see reference [1] for details. 34 | 35 | 36 | ## Data extraction and threshold optimization 37 | We provide a tool to extract parallel texts from the TSV files: 38 | ```bash 39 | python3 extract.py \ 40 | --tsv WikiMatrix.en-fr.tsv.gz \ 41 | --bitext WikiMatrix.en-fr.txt \ 42 | --src-lang en --trg-lang fr \ 43 | --threshold 1.04 44 | ``` 45 | One can specify the threshold on the margin score. 46 | The higher the value, the more likely the sentences are mutual translations, but the less data one will get. 47 | **A value of 1.04 seems to be a good choice for most language pairs.** Please see the analysis in the paper for 48 | more information [1]. 49 | 50 | ## Evaluation 51 | To assess the quality of the mined bitexts, we trained neural MT systems on all language pairs 52 | for which we were able to mine at least 25k parallel sentences (with a margin threshold of 1.04). 53 | We trained systems in both directions, source to target and target to source, and report BLEU scores 54 | on the [*TED test*](https://github.com/neulab/word-embeddings-for-nmt) set proposed in [4]. 55 | This totals 1886 different NMT systems. 56 | This [*table shows the BLEU scores for the most frequent language pairs*](WikiMatrix-bleu.pdf). 57 | We achieve BLEU scores over 30 for several language pairs. 58 | 59 | The goal is not to build state-of-the-art systems for each language pair, but 60 | to get an indication of the quality of the automatically mined data. These 61 | BLEU scores should of course be appreciated in the context of the sizes of the 62 | mined corpora. 63 | 64 | Obviously, we cannot exclude that the 65 | provided data contains some wrong alignments even though the margin is large.
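If higher precision is needed, the TSV files can simply be re-filtered with a stricter margin threshold. The sketch below mirrors the logic of `extract.py` (the full script is included later in this repository) and uses an example file name; like `extract.py`, it assumes that rows are sorted by decreasing margin score.

```python
# Illustrative re-filtering of a mined TSV with a stricter margin threshold.
# Example file name; each row is: margin_score <TAB> source <TAB> target.
import gzip

threshold = 1.06  # stricter than the recommended 1.04
kept = 0
with gzip.open("WikiMatrix.en-fr.tsv.gz", "rt", encoding="utf-8") as tsv, \
     open("WikiMatrix.en-fr.filtered.tsv", "w", encoding="utf-8") as out:
    for line in tsv:
        fields = line.split("\t")
        if float(fields[0]) < threshold:
            break  # rows are sorted by decreasing score, so we can stop here
        out.write(line)
        kept += 1
print(f"kept {kept:d} sentence pairs with margin >= {threshold}")
```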
66 | Finally, we would like to point out that we run our approach on all available 67 | languages in Wikipedia, independently of the quality of LASER's sentence 68 | embeddings for each one. 69 | 70 | 71 | ## License 72 | 73 | The mined data is distributed under the Creative Commons Attribution-ShareAlike license. 74 | 75 | Please cite reference [1] if you use this data. 76 | 77 | ## References 78 | 79 | [1] Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, 80 | [*WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia*](https://arxiv.org/abs/1907.05791) 81 | arXiv, July 11 2019. 82 | 83 | [2] Mikel Artetxe and Holger Schwenk, 84 | [*Margin-based Parallel Corpus Mining with Multilingual Sentence Embeddings*](https://arxiv.org/abs/1811.01136) 85 | arXiv, Nov 3 2018. 86 | 87 | [3] Mikel Artetxe and Holger Schwenk, 88 | [*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464) 89 | arXiv, Dec 26 2018. 90 | 91 | [4] Ye Qi, Devendra Sachan, Matthieu Felix, Sarguna Padmanabhan and Graham Neubig, 92 | [*When and Why Are Pre-Trained Word Embeddings Useful for Neural Machine Translation?*](https://www.aclweb.org/anthology/papers/N/N18/N18-2084/) 93 | NAACL, pages 529-535, 2018. 94 | -------------------------------------------------------------------------------- /tasks/WikiMatrix/WikiMatrix-bleu.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/LASER/d7e2544234c1d2a7076280944bdc2637f98ef3c2/tasks/WikiMatrix/WikiMatrix-bleu.pdf -------------------------------------------------------------------------------- /tasks/WikiMatrix/WikiMatrix-sizes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/LASER/d7e2544234c1d2a7076280944bdc2637f98ef3c2/tasks/WikiMatrix/WikiMatrix-sizes.pdf -------------------------------------------------------------------------------- /tasks/WikiMatrix/extract.py: -------------------------------------------------------------------------------- 1 | #!/bin/python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | # -------------------------------------------------------- 14 | # 15 | # Tool to extract subset of mined bitexts in a tsv.gz file 16 | 17 | import os 18 | import sys 19 | import gzip 20 | import argparse 21 | 22 | ############################################################################### 23 | # 24 | # Main 25 | # 26 | ############################################################################### 27 | 28 | parser = argparse.ArgumentParser(description='Tool to extract bitext from the WikiMatrix') 29 | parser.add_argument('--encoding', default='utf-8', 30 | help='character encoding for input/output') 31 | parser.add_argument('--tsv', type=str, required=True, 32 | help='File with mined bitexts') 33 | parser.add_argument('--bitext', type=str, required=True, 34 | help='Text file after sentence splitting') 35 | parser.add_argument('--src-lang', type=str, required=True, 36 | help='Source language') 37 | parser.add_argument('--trg-lang', type=str, required=True, 38 | help='Traget language') 39 | parser.add_argument('--threshold', type=float, default=1.05, 40 | help='Threshold on margin score') 41 | parser.add_argument('--nb-sents', type=int, default=999999999, 42 | help='Maximal number of sentences') 43 | parser.add_argument('--nb-words-src', type=int, default=999999999, 44 | help='Maxmimal numer of total words in the source language') 45 | parser.add_argument('--nb-words-trg', type=int, default=999999999, 46 | help='Maxmimal numer of total words in the target language') 47 | args = parser.parse_args() 48 | 49 | print('Tool to extract bitext from the WikiMatrix') 50 | 51 | nl = 0 52 | nw_src = 0 53 | nw_trg = 0 54 | print('Processing {}'.format(args.tsv)) 55 | with gzip.open(args.tsv, 'rt', encoding=args.encoding) as tsv: 56 | with open(args.bitext + '.' + args.src_lang, 'wt', encoding=args.encoding) as fsrc: 57 | with open(args.bitext + '.' + args.trg_lang, 'wt', encoding=args.encoding) as ftrg: 58 | while nl < args.nb_sents: 59 | line = tsv.readline() 60 | if not line: 61 | break 62 | fields = line.split('\t') 63 | cur_src = len(fields[1].split()) 64 | cur_trg = len(fields[2].split()) 65 | if float(fields[0]) < args.threshold: 66 | break 67 | if nw_src + cur_src > args.nb_words_src: 68 | break 69 | if nw_trg + cur_trg > args.nb_words_trg: 70 | break 71 | fsrc.write(fields[1].strip() + '\n') 72 | ftrg.write(fields[2].strip() + '\n') 73 | nw_src += cur_src 74 | nw_trg += cur_trg 75 | nl += 1 76 | if nl % 100000 == 0: 77 | print('\r - {:d} lines read'.format(nl), end='') 78 | 79 | print('\r - wrote {:d} lines'.format(nl)) 80 | print(' - with {:d} source and {:d} target words'.format(nw_src, nw_trg)) 81 | print(' - last threshold is {:.4f}'.format(float(fields[0]))) 82 | -------------------------------------------------------------------------------- /tasks/bucc/README.md: -------------------------------------------------------------------------------- 1 | # LASER: application to bitext mining 2 | 3 | This codes shows how to use the multilingual sentence embeddings to mine 4 | for parallel data in (huge) collections of monolingual data. 5 | 6 | The underlying idea is pretty simple: 7 | * embed the sentences in the two languages into the joint sentence space 8 | * calculate all pairwise distances between the sentences. 
9 | This is of complexity O(N\*M) and can be done very efficiently with 10 | the FAISS library [2] 11 | * all sentence pairs which have a distance below a threshold 12 | are considered as parallel 13 | * this approach can be further improved using a margin criterion [3] 14 | 15 | Here, we apply this idea to the data provided by the shared task of the BUCC 16 | [Workshop on Building and Using Comparable Corpora](https://comparable.limsi.fr/bucc2018/bucc2018-task.html). 17 | 18 | The same approach can be scaled up to huge collections of monolingual texts 19 | (several billions) using more advanced features of the FAISS toolkit. 20 | 21 | ## Installation 22 | 23 | * Please first download the BUCC shared task data 24 | [here](https://comparable.limsi.fr/bucc2017/cgi-bin/download-data-2018.cgi) 25 | and install it the directory "downloaded" 26 | * running the script 27 | ```bash 28 | ./bucc.sh 29 | ``` 30 | 31 | ## Results 32 | 33 | Optimized on the F-scores on the training corpus. 34 | These results differ slighty from those published in [4] due to the switch from PyTorch 0.4 to 1.0. 35 | 36 | | Languages | Threshold | precision | Recall | F-score | 37 | |-----------|-----------|-----------|--------|---------| 38 | | fr-en | 1.088131 | 91.52 | 93.32 | 92.41 | 39 | | de-en | 1.092056 | 95.65 | 95.19 | 95.42 | 40 | | ru-en | 1.093404 | 90.60 | 94.04 | 92.29 | 41 | | zh-en | 1.085999 | 91.99 | 91.31 | 91.65 | 42 | 43 | Results on the official test set are scored by the organizers of the BUCC workshop. 44 | 45 | 46 | Below, we compare our approach to the [official results of the 2018 edition 47 | of the BUCC workshop](http://lrec-conf.org/workshops/lrec2018/W8/pdf/12_W8.pdf) [1]. 48 | More details on our approach are provided in [2,3,4] 49 | 50 | | System | fr-en | de-en | ru-en | zh-en | 51 | |----------------------|-------|-------|-------|-------| 52 | | Azpeitia et al '17 | 79.5 | 83.7 | - | - | 53 | | Azpeitia et al '18 | 81.5 | 85.5 | 81.3 | 77.5 | 54 | |Bouamor and Sajjad '18| 76.0 | - | - | - | 55 | | Chongman et al '18 | - | - | - | 56 | 56 | | LASER [3] | 75.8 | 76.9 | - | - | 57 | | LASER [4] | 93.1 | 96.2 | 92.3 | 92.7 | 58 | 59 | All numbers are F1-scores on the test set. 60 | 61 | ## Bonus 62 | 63 | To show case the highly multilingual aspect of LASER's sentence embeddings, 64 | we also mine for bitexts for language pairs which do not include English, e.g. 65 | French-German, Russian-French or Chinese-Russian. 66 | This is also performed by the script bucc.sh 67 | 68 | Below the number of extracted parallel sentences for each language pair. 69 | 70 | | src/trg | French | German | Russian | Chinese | 71 | |---------|--------|--------|---------|---------| 72 | | French | n/a | 2795 | 3327 | 387 | 73 | | German | 2795 | n/a | 3661 | 466 | 74 | | Russian | 3327 | 3661 | n/a | 664 | 75 | | Chinese | 387 | 466 | 664 | n/a | 76 | 77 | 78 | ## References 79 | 80 | [1] Pierre Zweigenbaum, Serge Sharoff and Reinhard Rapp,` 81 | [*Overview of the Third BUCC Shared Task: Spotting Parallel Sentences in Comparable Corpora*](http://lrec-conf.org/workshops/lrec2018/W8/pdf/12_W8.pdf), 82 | LREC, 2018. 83 | 84 | [2] Holger Schwenk, 85 | [*Filtering and Mining Parallel Data in a Joint Multilingual Space*](https://arxiv.org/abs/1805.09822), 86 | ACL, July 2018 87 | 88 | [3] Mikel Artetxe and Holger Schwenk, 89 | [*Margin-based Parallel Corpus Mining with Multilingual Sentence Embeddings*](https://arxiv.org/abs/1811.01136) 90 | arXiv, 3 Nov 2018. 
91 | 92 | [3] Mikel Artetxe and Holger Schwenk, 93 | [*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464) 94 | arXiv, 26 Dec 2018. 95 | -------------------------------------------------------------------------------- /tasks/bucc/bucc.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | # -------------------------------------------------------- 14 | # 15 | # Python tools for BUCC bitext mining 16 | 17 | import argparse 18 | 19 | ############################################################################### 20 | # 21 | # Find te optimal threshold given gold alignments 22 | # 23 | ############################################################################### 24 | 25 | def BuccOptimize(candidate2score, gold): 26 | items = sorted(candidate2score.items(), key=lambda x: -x[1]) 27 | ngold = len(gold) 28 | nextract = ncorrect = 0 29 | threshold = 0 30 | best_f1 = 0 31 | for i in range(len(items)): 32 | nextract += 1 33 | if '\t'.join(items[i][0]) in gold: 34 | ncorrect += 1 35 | if ncorrect > 0: 36 | precision = ncorrect / nextract 37 | recall = ncorrect / ngold 38 | f1 = 2 * precision * recall / (precision + recall) 39 | if f1 > best_f1: 40 | best_f1 = f1 41 | threshold = (items[i][1] + items[i + 1][1]) / 2 42 | return threshold 43 | 44 | 45 | ############################################################################### 46 | # 47 | # Main 48 | # 49 | ############################################################################### 50 | 51 | parser = argparse.ArgumentParser(description='LASER: tools for BUCC bitext mining') 52 | parser.add_argument('--encoding', default='utf-8', 53 | help='character encoding for input/output') 54 | parser.add_argument('--src-lang', required=True, 55 | help='the source language id') 56 | parser.add_argument('--trg-lang', required=True, 57 | help='the target language id') 58 | parser.add_argument('--bucc-texts', required=True, 59 | help='Base name of the text files (language added)') 60 | parser.add_argument('--bucc-ids', required=True, 61 | help='Base name of the ID files (language added)') 62 | parser.add_argument('--candidates', required=True, 63 | help='File name of candidate alignments') 64 | parser.add_argument('--gold', default=None, 65 | help='File name of gold alignments') 66 | parser.add_argument('--threshold', type=float, default=-1, 67 | help='Threshold (used with --output)') 68 | parser.add_argument('--output', default=None, 69 | help='File name of output alignments which are below threshold') 70 | parser.add_argument('--verbose', action='store_true', 71 | help='Detailed output') 72 | args = parser.parse_args() 73 | 74 | print('LASER: tools for BUCC bitext mining') 75 | 76 | assert (args.gold or args.threshold > 0) \ 77 | and not (args.gold and args.threshold > 0), \ 78 | 'Either "--gold" or "--threshold" must be specified' 79 | if args.verbose: 80 | print(' - reading sentences and IDs') 81 | 82 | src_sent2id, trg_sent2id = {}, {} 83 | for lang, sent2id in (args.src_lang, src_sent2id), 
(args.trg_lang, trg_sent2id): 84 | repeated = set() 85 | with open(args.bucc_texts + '.' + lang, encoding=args.encoding, errors='surrogateescape') as f: 86 | sentences = [line.strip() for line in f] 87 | with open(args.bucc_ids + '.' + lang, encoding=args.encoding, errors='surrogateescape') as f: 88 | ids = [line.strip() for line in f] 89 | for id, sent in zip(ids, sentences): 90 | if sent in sent2id: 91 | repeated.add(sent) 92 | else: 93 | sent2id[sent] = id 94 | for sent in repeated: 95 | del sent2id[sent] 96 | 97 | if args.verbose: 98 | print(' - reading candidates {}'.format(args.candidates)) 99 | candidate2score = {} 100 | # id2txt = {} 101 | with open(args.candidates, encoding=args.encoding, errors='surrogateescape') as f: 102 | for line in f: 103 | score, src, trg = line.split('\t') 104 | score = float(score) 105 | src = src.strip() 106 | trg = trg.strip() 107 | if src in src_sent2id and trg in trg_sent2id: 108 | src_id = src_sent2id[src] 109 | trg_id = trg_sent2id[trg] 110 | score = max(score, candidate2score.get((src_id, trg_id), score)) 111 | candidate2score[(src_id, trg_id)] = score 112 | # id2txt[src_id + '\t' + trg_id] = src + '\t' + trg 113 | 114 | def BuccExtract(cand2score, th, fname): 115 | if fname: 116 | of = open(fname, 'w', encoding=args.encoding) 117 | bitexts = [] 118 | for (src, trg), score in cand2score.items(): 119 | if score >= th: 120 | bitexts.append(src + '\t' + trg) 121 | if fname: 122 | of.write(src + '\t' + trg + '\n') 123 | if fname: 124 | of.close() 125 | return bitexts 126 | 127 | if args.gold: 128 | if args.verbose: 129 | print(' - optimizing threshold on gold alignments {}'.format(args.gold)) 130 | if args.output: 131 | print(' - extracted bitext are written into {:s}'.format(args.output)) 132 | gold = {line.strip() for line in open(args.gold)} 133 | threshold = BuccOptimize(candidate2score, gold) 134 | 135 | bitexts = BuccExtract(candidate2score, threshold, args.output) 136 | ncorrect = len(gold.intersection(bitexts)) 137 | if ncorrect > 0: 138 | precision = ncorrect / len(bitexts) 139 | recall = ncorrect / len(gold) 140 | f1 = 2*precision*recall / (precision + recall) 141 | else: 142 | precision = recall = f1 = 0 143 | 144 | print(' - best threshold={:f}: precision={:.2f}, recall={:.2f}, F1={:.2f}' 145 | .format(threshold, 100*precision, 100*recall, 100*f1)) 146 | 147 | 148 | if args.threshold > 0: 149 | if args.verbose: 150 | print(' - extracting bitexts for threshold {:f} into {:s}'.format(args.threshold, args.output)) 151 | BuccExtract(candidate2score, args.threshold, args.output) 152 | -------------------------------------------------------------------------------- /tasks/clustering/README.md: -------------------------------------------------------------------------------- 1 | # Laser Encoder: Sentiment Analysis 2 | 3 | ## Overview 4 | 5 | In this tutorial, we'll explore the power of Language-Agnostic SEntence Representations ([LASER](https://github.com/facebookresearch/LASER)) for generating multilingual embeddings. We'll then use these embeddings to perform clustering on the [MASSIVE](https://github.com/alexa/massive) dataset. Our goal was to show that LASER embeddings can effectively group texts not only by their semantic content but also across different languages. LASER can encode sentences from multiple languages into a shared embedding space, allowing for cross-lingual understanding and comparison. We'll see how this capability is useful for tasks like multilingual embeddings clustering. 
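In essence, the notebook boils down to two steps: embed the sentences with LASER, then cluster the resulting vectors. The toy sketch below illustrates this with made-up sentences; it assumes the `laser_encoders` package and scikit-learn are installed, that `LaserEncoderPipeline` accepts `laser="laser2"`, and that its `encode_sentences` method returns one 1024-dimensional vector per input sentence.

```python
# Toy sketch of language-agnostic clustering with LASER embeddings.
# Assumes laser_encoders and scikit-learn are installed; sentences are made up.
from laser_encoders import LaserEncoderPipeline
from sklearn.cluster import KMeans

sentences = [
    "set an alarm for seven in the morning",      # English, alarm intent
    "mets un réveil pour sept heures du matin",   # French, alarm intent
    "what is the weather like today",             # English, weather intent
    "quel temps fait-il aujourd'hui",             # French, weather intent
]

encoder = LaserEncoderPipeline(laser="laser2")     # multilingual LASER2 encoder
embeddings = encoder.encode_sentences(sentences)   # shape: (4, 1024)

labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(embeddings)
print(labels)  # sentences with the same intent should share a cluster id,
               # regardless of whether they are in English or French
```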
6 | 7 | ## Getting Started 8 | 9 | To run the notebook in Google Colab, simply click the "Open in Colab" button below: 10 | 11 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Paulooh007/LASER/blob/laser-clustering/tasks/clustering/LaserClusteringExample.ipynb) 12 | 13 | ## Simple interactive Demo 14 | To better understand this tutorial, you can checkout this interactive demo hosted on huggingface spaces. 15 | 16 | [![Open in Spaces](https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-lg.svg)](https://huggingface.co/spaces/paulokewunmi/laser_multilingual_embeddings_viz) 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /tasks/embed/README.md: -------------------------------------------------------------------------------- 1 | # LASER: calculation of sentence embeddings 2 | 3 | Tool to calculate sentence embeddings for an arbitrary text file: 4 | ``` 5 | bash ./embed.sh INPUT-FILE OUTPUT-FILE [LANGUAGE] 6 | ``` 7 | 8 | The input will first be tokenized, and then sentence embeddings will be generated. If a `language` is specified, 9 | then `embed.sh` will look for a language-specific LASER3 encoder using the format: `{model_dir}/laser3-{language}.{version}.pt`. 10 | Otherwise it will default to LASER2 which covers the same 93 languages as [the original LASER encoder](https://arxiv.org/pdf/1812.10464.pdf). 11 | 12 | **NOTE:** please set the model location (`model_dir` in `embed.sh`) before running. We recommend to download the models from the NLLB 13 | release (see [here](/nllb/README.md)). Optionally you can also select the model version number for downloaded LASER3 models. This currently defaults to: `1` (initial release). 14 | 15 | ## Output format 16 | 17 | The embeddings are stored in float32 matrices in raw binary format. 18 | They can be read in Python by: 19 | ``` 20 | import numpy as np 21 | dim = 1024 22 | X = np.fromfile("my_embeddings.bin", dtype=np.float32, count=-1) 23 | X.resize(X.shape[0] // dim, dim) 24 | ``` 25 | X is a N x 1024 matrix where N is the number of lines in the text file. 26 | 27 | ## Examples 28 | 29 | In order to encode an input text in any of the 93 languages supported by LASER2 (e.g. Afrikaans, English, French): 30 | ``` 31 | ./embed.sh input_file output_file 32 | ``` 33 | 34 | To use a language-specific encoder (if available), such as for example: Wolof, Hausa, or Irish: 35 | ``` 36 | ./embed.sh input_file output_file wol_Latn 37 | ``` 38 | ``` 39 | ./embed.sh input_file output_file hau_Latn 40 | ``` 41 | ``` 42 | ./embed.sh input_file output_file gle_Latn 43 | ``` 44 | 45 | -------------------------------------------------------------------------------- /tasks/embed/embed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | # -------------------------------------------------------- 14 | # 15 | # bash script to calculate sentence embeddings for arbitrary 16 | # text file 17 | 18 | ############################# 19 | # BEGIN PARAMETERS TO SET 20 | ############################# 21 | # location of models (e.g. /path/to/models); no trailing slash 22 | model_dir="" 23 | 24 | # version number for LASER3 models 25 | version=1 26 | ############################# 27 | # END PARAMETERS TO SET 28 | ############################# 29 | 30 | if [ -z ${model_dir} ]; then 31 | echo "Please set model directory within script" 32 | exit 1 33 | elif [ ! -d ${model_dir} ]; then 34 | echo "Can't find model directory: $model_dir" 35 | exit 1 36 | fi 37 | 38 | if [ -z ${LASER} ] ; then 39 | echo "Please set the environment variable 'LASER'" 40 | exit 1 41 | fi 42 | 43 | if [ $# -lt 2 ] ; then 44 | echo "usage: embed.sh input-file output-file [language]" 45 | exit 1 46 | fi 47 | 48 | infile=$1 49 | outfile=$2 50 | language=$3 51 | 52 | # default to laser2 53 | model_file=${model_dir}/laser2.pt 54 | spm=${model_dir}/laser2.spm 55 | 56 | if [ ! -z ${language} ]; then 57 | model_file=${model_dir}/laser3-$language.v$version.pt 58 | lang_specific_spm=${model_dir}/laser3-$language.v$version.spm 59 | if [[ -s $lang_specific_spm ]]; then 60 | spm=$lang_specific_spm 61 | fi 62 | fi 63 | 64 | if [[ ! -s $model_file ]]; then 65 | echo "couldn't find model file: $model_file" 66 | exit 1 67 | fi 68 | 69 | if [[ ! -s $spm ]]; then 70 | echo "couldn't find spm: $spm" 71 | exit 1 72 | fi 73 | 74 | python3 ${LASER}/source/embed.py \ 75 | --input ${infile} \ 76 | --encoder ${model_file} \ 77 | --spm-model $spm \ 78 | --output ${outfile} \ 79 | --verbose 80 | -------------------------------------------------------------------------------- /tasks/librivox-s2s/README.md: -------------------------------------------------------------------------------- 1 | # Librivox S2S: Automatically mined Speech-to-Speech translations 2 | 3 | ## Abstract 4 | 5 | We present an approach to encode a speech signal into a fixed-size representation which minimizes the cosine loss with the existing massively multilingual LASER text embedding space. Sentences are close in this embedding space, independently of their language and modality, either text or audio. Using a similarity metric in that multimodal embedding space, we perform mining of audio in German, French, Spanish and English from [*Librivox*](https://librivox.org/) against billions of sentences from Common Crawl. This yielded more than twenty thousand hours of aligned speech translations. To evaluate the automatically mined speech/text corpora, we train neural speech translation systems for several languages pairs. Adding the mined data, achieves significant improvements in the BLEU score on the CoVoST2 and the MUST-C test sets with respect to a very competitive baseline. Our approach can also be used to directly perform speech-to-speech mining, without the need to first transcribe or translate the data. We obtain more than one thousand three hundred hours of aligned speech in French, German, Spanish and English. This speech corpus has the potential to boost research in speech-to-speech translation which suffers from scarcity of natural end-to-end training data. 
6 | 7 | ## Download 8 | 9 | Manifest files for all language directions are available [*here*](https://dl.fbaipublicfiles.com/librivox_s2s/manifests.zip). 10 | S2S alignments are sorted by decreasing mining scores (first column). Audio files for each language direction can be downloaded separately. For each language direction, we give the amount of aligned hours in the source and target language. 11 | 12 | - [*English-French*](https://dl.fbaipublicfiles.com/librivox_s2s/ena-fra.zip) (470h / 447h) 13 | - [*English-German*](https://dl.fbaipublicfiles.com/librivox_s2s/dea-ena.zip) (363h / 324h) 14 | - [*English-Spanish*](https://dl.fbaipublicfiles.com/librivox_s2s/ena-esa.zip) (425h / 442h) 15 | - [*French-German*](https://dl.fbaipublicfiles.com/librivox_s2s/dea-fra.zip) (33h / 38h) 16 | - [*French-Spanish*](https://dl.fbaipublicfiles.com/librivox_s2s/esa-fra.zip) (101h / 111h) 17 | - [*German-Spanish*](https://dl.fbaipublicfiles.com/librivox_s2s/dea-esa.zip) (41h / 40h) 18 | 19 | The aligned Speech-to-Speech segments are distributed under the same copyright as [*Librivox*](https://librivox.org/). 20 | 21 | Please cite reference [1] if you use this data. 22 | The mined speech-to-speech data was successfully used to train Speech-to-Speech translation systems [2]. 23 | 24 | 25 | ## References 26 | 27 | [1] Paul-Ambroise Duquenne, Hongyu Gong, Holger Schwenk, 28 | [*Multimodal and Multilingual Embeddings for Large-Scale Speech Mining*](https://papers.nips.cc/paper/2021/hash/8466f9ace6a9acbe71f75762ffc890f1-Abstract.html), NeurIPS 2021, pages 15748-15761. 29 | 30 | [2] Ann Lee, Hongyu Gong, Paul-Ambroise Duquenne, Holger Schwenk, Peng-Jen Chen, Changhan Wang, Sravya Popuri, Juan Pino, Jiatao Gu, Wei-Ning Hsu, 31 | [*Textless Speech-to-Speech Translation on Real Data*](https://arxiv.org/abs/2112.08352), arXiv Dec 15 2021, to appear at NAACL'22. 32 | 33 | -------------------------------------------------------------------------------- /tasks/mldoc/README.md: -------------------------------------------------------------------------------- 1 | # LASER: application to cross-lingual document classification 2 | 3 | This code shows how to use the multilingual sentence embeddings for 4 | cross-lingual document classification, using the MLDoc corpus [1]. 5 | 6 | We train a document classifier on one language (e.g. English) and then apply it 7 | to several other languages without using any resources of those languages 8 | (e.g. German, Spanish, French, Italian, Japanese, Russian and Chinese). 9 | 10 | ## Installation 11 | 12 | * Please first download the MLDoc corpus from 13 | [here](https://github.com/facebookresearch/MLDoc) 14 | and install it in the directory MLDoc 15 | * Calculate the multilingual sentence embeddings for all languages 16 | and train the classifier with `bash ./mldoc.sh` 17 | 18 | ## Results 19 | 20 | We use an MLP classifier with two hidden layers and Adam optimization. 21 | 22 | You should get the following results for zero-shot cross-lingual transfer. 23 | These results are on average better than those reported in [2] since the system has 24 | been improved since publication.
25 | 26 | | Train language | En | De | Es | Fr | It | Ja | Ru | Zh | 27 | |----------------|--------|--------|--------|--------|--------|--------|--------|-------| 28 | | English (en) | 90.73 | 86.25 | 79.30 | 78.03 | 70.20 | 60.95 | 67.25 | 70.98 | 29 | | German (de) | 80.75 | 92.70 | 79.60 | 82.83 | 73.25 | 56.80 | 68.18 | 72.90 | 30 | | Spanish (es) | 69.58 | 79.73 | 88.75 | 75.30 | 71.10 | 59.65 | 59.83 | 61.70 | 31 | | French (fr) | 80.08 | 87.03 | 78.40 | 90.80 | 71.08 | 53.60 | 67.55 | 66.12 | 32 | | Italian (it) | 74.15 | 80.73 | 82.60 | 78.35 | 85.93 | 55.15 | 68.83 | 56.10 | 33 | | Japanese (ja) | 68.45 | 81.90 | 67.95 | 67.95 | 57.98 | 85.15 | 53.70 | 66.12 | 34 | | Russian (ru) | 72.60 | 79.62 | 68.18 | 71.28 | 67.00 | 59.23 | 84.65 | 65.62 | 35 | | Chinese (zh) | 77.95 | 83.38 | 78.38 | 75.83 | 70.33 | 55.25 | 66.62 | 88.98 | 36 | 37 | All numbers are accuracies on the test set. 38 | 39 | ## References 40 | 41 | Details on the corpus are described in this paper: 42 | 43 | [1] Holger Schwenk and Xian Li, 44 | [*A Corpus for Multilingual Document Classification in Eight Languages*](http://www.lrec-conf.org/proceedings/lrec2018/pdf/658.pdf), 45 | LREC, pages 3548-3551, 2018. 46 | 47 | Detailed system description: 48 | 49 | [2] Mikel Artetxe and Holger Schwenk, 50 | [*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464), 51 | arXiv, Dec 26 2018. 52 | -------------------------------------------------------------------------------- /tasks/mldoc/mldoc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | # -------------------------------------------------------- 14 | # 15 | # Calculate embeddings of MLDoc corpus 16 | 17 | 18 | import os 19 | import sys 20 | import argparse 21 | 22 | # get environment 23 | assert os.environ.get('LASER'), 'Please set the enviornment variable LASER' 24 | LASER = os.environ['LASER'] 25 | 26 | sys.path.append(LASER + '/source') 27 | sys.path.append(LASER + '/source/tools') 28 | from embed import SentenceEncoder, EncodeLoad, EncodeFile 29 | from text_processing import Token, BPEfastApply, SplitLines, JoinEmbed 30 | 31 | 32 | ############################################################################### 33 | 34 | parser = argparse.ArgumentParser('LASER: calculate embeddings for MLDoc') 35 | parser.add_argument( 36 | '--mldoc', type=str, default='MLDoc', 37 | help='Directory of the MLDoc corpus') 38 | parser.add_argument( 39 | '--data_dir', type=str, default='embed', 40 | help='Base directory for created files') 41 | 42 | # options for encoder 43 | parser.add_argument( 44 | '--encoder', type=str, required=True, 45 | help='Encoder to be used') 46 | parser.add_argument( 47 | '--bpe_codes', type=str, required=True, 48 | help='Directory of the tokenized data') 49 | parser.add_argument( 50 | '--lang', '-L', nargs='+', default=None, 51 | help="List of languages to test on") 52 | parser.add_argument( 53 | '--buffer-size', type=int, default=10000, 54 | help='Buffer size (sentences)') 55 | parser.add_argument( 56 | '--max-tokens', type=int, default=12000, 57 | help='Maximum number of tokens to process in a batch') 58 | parser.add_argument( 59 | '--max-sentences', type=int, default=None, 60 | help='Maximum number of sentences to process in a batch') 61 | parser.add_argument( 62 | '--cpu', action='store_true', 63 | help='Use CPU instead of GPU') 64 | parser.add_argument( 65 | '--verbose', action='store_true', 66 | help='Detailed output') 67 | args = parser.parse_args() 68 | 69 | print('LASER: calculate embeddings for MLDoc') 70 | 71 | if not os.path.exists(args.data_dir): 72 | os.mkdir(args.data_dir) 73 | 74 | enc = EncodeLoad(args) 75 | 76 | print('\nProcessing:') 77 | for part in ('train1000', 'dev', 'test'): 78 | # for lang in "en" if part == 'train1000' else args.lang: 79 | for lang in args.lang: 80 | cfname = os.path.join(args.data_dir, 'mldoc.' + part) 81 | Token(cfname + '.txt.' + lang, 82 | cfname + '.tok.' + lang, 83 | lang=lang, 84 | romanize=(True if lang == 'el' else False), 85 | lower_case=True, gzip=False, 86 | verbose=args.verbose, over_write=False) 87 | SplitLines(cfname + '.tok.' + lang, 88 | cfname + '.split.' + lang, 89 | cfname + '.sid.' + lang) 90 | BPEfastApply(cfname + '.split.' + lang, 91 | cfname + '.split.bpe.' + lang, 92 | args.bpe_codes, 93 | verbose=args.verbose, over_write=False) 94 | EncodeFile(enc, 95 | cfname + '.split.bpe.' + lang, 96 | cfname + '.split.enc.' + lang, 97 | verbose=args.verbose, over_write=False, 98 | buffer_size=args.buffer_size) 99 | JoinEmbed(cfname + '.split.enc.' + lang, 100 | cfname + '.sid.' + lang, 101 | cfname + '.enc.' + lang) 102 | -------------------------------------------------------------------------------- /tasks/mldoc/mldoc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | # -------------------------------------------------------- 14 | # 15 | # bash script to calculate sentence embeddings for the MLDoc corpus, 16 | # train and evaluate the classifier 17 | 18 | if [ -z ${LASER+x} ] ; then 19 | echo "Please set the environment variable 'LASER'" 20 | exit 21 | fi 22 | 23 | # general config 24 | mldir="MLDoc" # raw texts of MLdoc 25 | edir="embed" # normalized texts and embeddings 26 | languages=('en' 'de' 'es' 'fr' 'it' 'ja' 'ru' 'zh') 27 | 28 | # encoder 29 | model_dir="${LASER}/models" 30 | encoder="${model_dir}/bilstm.93langs.2018-12-26.pt" 31 | bpe_codes="${model_dir}/93langs.fcodes" 32 | 33 | edir="embed" 34 | 35 | ################################################################### 36 | # 37 | # Extract files with labels and texts from the MLdoc corpus 38 | # 39 | ################################################################### 40 | 41 | ExtractMLdoc () { 42 | ifname=$1 43 | ofname=$2 44 | lang=$3 45 | if [ ! -f ${ifname}.${lang} ] ; then 46 | echo "Please install the MLDoc corpus first" 47 | exit 48 | fi 49 | 50 | if [ ! -f ${ofname}.lbl.${lang} ] ; then 51 | echo " - extract labels from ${ifname}.${lang}" 52 | cut -d' ' -f1 ${ifname}.${lang} \ 53 | | sed -e 's/C/0/' -e 's/E/1/' -e 's/G/2/' -e 's/M/3/' \ 54 | > ${ofname}.lbl.${lang} 55 | fi 56 | if [ ! -f ${ofname}.txt.${lang} ] ; then 57 | echo " - extract texts from ${ifname}.${lang}" 58 | # remove text which is not useful for classification 59 | cut -d' ' -f2 ${ifname}.${lang} \ 60 | | sed -e 's/ Co \./ Co./g' -e s'/ Inc \. / Inc. /g' \ 61 | -e 's/([cC]) Reuters Limited 199[0-9]\.//g' \ 62 | > ${ofname}.txt.${lang} 63 | fi 64 | } 65 | 66 | 67 | ################################################################### 68 | # 69 | # Create all files 70 | # 71 | ################################################################### 72 | 73 | # create output directories 74 | for d in ${edir} ; do 75 | mkdir -p ${d} 76 | done 77 | 78 | # Embed all data 79 | echo -e "\nExtracting MLDoc data" 80 | #ExtractMLdoc ${mldir}/mldoc.train1000 ${edir}/mldoc.train1000 "en" 81 | for part in "mldoc.train1000" "mldoc.dev" "mldoc.test" ; do 82 | for l in ${languages[@]} ; do 83 | ExtractMLdoc ${mldir}/${part} ${edir}/${part} ${l} 84 | done 85 | done 86 | 87 | MECAB="${LASER}/tools-external/mecab" 88 | export LD_LIBRARY_PATH="${MECAB}/lib:${LD_LIBRARY_PATH}" 89 | python3 mldoc.py --data_dir ${edir} --lang ${languages[@]} --bpe_codes ${bpe_codes} --encoder ${encoder} 90 | 91 | # MLDoc classifier parameters 92 | nb_cl=4 93 | N=500 94 | lr=0.001 95 | wd=0.0 96 | nhid="10 8" 97 | drop=0.2 98 | seed=1 99 | bsize=12 100 | 101 | echo -e "\nTraining MLDoc classifier (log files in ${edir})" 102 | #for ltrn in "en" ; do 103 | for ltrn in ${languages[@]} ; do 104 | ldev=${ltrn} 105 | lf="${edir}/mldoc.${ltrn}-${ldev}.log" 106 | echo " - train on ${ltrn}, dev on ${ldev}" 107 | if [ ! 
-f ${lf} ] ; then 108 | python3 ${LASER}/source/sent_classif.py \ 109 | --gpu 0 --base-dir ${edir} \ 110 | --train mldoc.train1000.enc.${ltrn} \ 111 | --train-labels mldoc.train1000.lbl.${ltrn} \ 112 | --dev mldoc.dev.enc.${ldev} \ 113 | --dev-labels mldoc.dev.lbl.${ldev} \ 114 | --test mldoc.test.enc \ 115 | --test-labels mldoc.test.lbl \ 116 | --nb-classes ${nb_cl} \ 117 | --nhid ${nhid[@]} --dropout ${drop} --bsize ${bsize} \ 118 | --seed ${seed} --lr ${lr} --wdecay ${wd} --nepoch ${N} \ 119 | --lang ${languages[@]} \ 120 | > ${lf} 121 | fi 122 | done 123 | 124 | # display results 125 | echo -e "\nAccuracy matrix:" 126 | echo -n "Train " 127 | for l1 in ${languages[@]} ; do 128 | printf " %2s " ${l1} 129 | done 130 | echo "" 131 | for l1 in ${languages[@]} ; do 132 | lf="${edir}/mldoc.${l1}-${l1}.log" 133 | echo -n " ${l1}: " 134 | for l2 in ${languages[@]} ; do 135 | grep "Test lang ${l2}" $lf | sed -e 's/%//' | awk '{printf(" %5.2f", $10)}' 136 | done 137 | echo "" 138 | done 139 | -------------------------------------------------------------------------------- /tasks/pxsim/README.md: -------------------------------------------------------------------------------- 1 | # LASER: P-xSIM (dual approach multilingual similarity error rate) 2 | 3 | This README shows how to calculate the P-xSIM error rate (Seamless Communication et al., 2023) for a given language pair. 4 | 5 | P-xSIM returns the error rate for recreating gold alignments using a blended combination of two different approaches. 6 | It works by performing a k-nearest-neighbor search and margin calculation (i.e. margin-based parallel alignment) using the 7 | first approach, followed by the scoring of each candidate neighbor using an auxiliary model (the second approach). Finally, 8 | the scores of both the margin-based alignment and the auxiliary model are combined together using a blended score defined as: 9 | 10 | $$ \text{blended-score}(x, y) = \alpha \cdot \text{margin} + (1 - \alpha) \cdot \text{auxiliary-score} $$ 11 | 12 | where the parameter $\alpha$ controls the combination of both the margin-based and auxiliary scores. By default, the auxiliary-score will be calculated as the cosine between the source and candidate neighbors using the auxiliary embeddings. However, there is also an option to perform inference using a comparator model (Seamless Communication et al., 2023). In this instance, the auxiliary-score will be the AutoPCP outputs. 13 | 14 | P-xSIM offers three margin-based scoring options (discussed in detail [here](https://arxiv.org/pdf/1811.01136.pdf)): 15 | - distance 16 | - ratio 17 | - absolute 18 | 19 | ## Example usage 20 | 21 | Simply run the example script `bash ./eval.sh` to download a sample dataset (flores200), sample encoders (laser2 and LaBSE), 22 | and then perform P-xSIM. In this toy example, we use laser2 to provide the k-nearest-neighbors, followed by applying LaBSE as an 23 | auxiliary model on each candidate neighbor, before then applying the blended scoring function defined above. Dependending on 24 | your data sources, you may want to alter the approach used for either margin-based parallel alignment, or the scoring of each candidate neighbor 25 | (i.e. the auxiliary model). 26 | 27 | In addition to LaBSE in the example above, you can also calculate P-xSIM using any model hosted on [HuggingFace sentence-transformers](https://huggingface.co/sentence-transformers). 
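To make the blending step more concrete, here is a small numpy sketch of how the final score could be computed for one source sentence and its k candidate neighbors. It assumes the margin scores from the first approach are already available and uses random vectors in place of real auxiliary embeddings; the variable names and the value of `alpha` are illustrative only and do not mirror the actual interface of `source/pxsim.py`:

```python
# Illustrative sketch of the blended P-xSIM score (not the pxsim.py API).
import numpy as np

alpha = 0.1                                    # weight given to the margin term
margin_scores = np.array([1.12, 1.05, 0.97])   # margin scores of k=3 candidates (first approach)

rng = np.random.default_rng(0)
src_aux = rng.random(768, dtype=np.float32)        # auxiliary embedding of the source sentence
cand_aux = rng.random((3, 768), dtype=np.float32)  # auxiliary embeddings of the k candidates

# Auxiliary score: cosine similarity between the source and each candidate.
src_aux /= np.linalg.norm(src_aux)
cand_aux /= np.linalg.norm(cand_aux, axis=1, keepdims=True)
aux_scores = cand_aux @ src_aux

# Blended score as defined above; the highest-scoring candidate is kept.
blended = alpha * margin_scores + (1 - alpha) * aux_scores
print(int(np.argmax(blended)))
```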
28 | -------------------------------------------------------------------------------- /tasks/pxsim/eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for various tasks such as document classification, 11 | # and bitext filtering 12 | # 13 | #------------------------------------------------------- 14 | # 15 | # This bash script downloads the flores200 dataset, laser2, and then 16 | # performs pxsim evaluation 17 | 18 | if [ -z ${LASER} ] ; then 19 | echo "Please set the environment variable 'LASER'" 20 | exit 21 | fi 22 | 23 | ddir="${LASER}/data" 24 | cd $ddir # move to data directory 25 | 26 | if [ ! -d $ddir/flores200 ] ; then 27 | echo " - Downloading flores200..." 28 | wget --trust-server-names -q https://tinyurl.com/flores200dataset 29 | tar -xf flores200_dataset.tar.gz 30 | /bin/mv flores200_dataset flores200 31 | /bin/rm flores200_dataset.tar.gz 32 | else 33 | echo " - flores200 already downloaded" 34 | fi 35 | 36 | cd - 37 | 38 | mdir="${LASER}/models" 39 | if [ ! -d ${mdir} ] ; then 40 | echo " - creating model directory: ${mdir}" 41 | mkdir -p ${mdir} 42 | fi 43 | 44 | function download { 45 | file=$1 46 | save_dir=$2 47 | if [ -f ${save_dir}/${file} ] ; then 48 | echo " - ${save_dir}/$file already downloaded"; 49 | else 50 | cd $save_dir 51 | echo " - Downloading $s3/${file}"; 52 | wget -q $s3/${file}; 53 | cd - 54 | fi 55 | } 56 | 57 | # available encoders 58 | s3="https://dl.fbaipublicfiles.com/nllb/laser" 59 | 60 | if [ ! -f ${mdir}/laser2.pt ] ; then 61 | cd $mdir 62 | echo " - Downloading $s3/laser2.pt" 63 | wget --trust-server-names -q https://tinyurl.com/nllblaser2 64 | cd - 65 | else 66 | echo " - ${mdir}/laser2.pt already downloaded" 67 | fi 68 | download "laser2.spm" $mdir 69 | download "laser2.cvocab" $mdir 70 | 71 | # encode FLORES200 texts using both LASER2 and LaBSE 72 | for lang in eng_Latn wol_Latn; do 73 | infile=$LASER/data/flores200/devtest/$lang.devtest 74 | python3 ${LASER}/source/embed.py \ 75 | --input $infile \ 76 | --encoder $mdir/laser2.pt \ 77 | --spm-model $mdir/laser2.spm \ 78 | --output $lang.devtest.laser2 \ 79 | --verbose 80 | 81 | python3 ${LASER}/source/embed.py \ 82 | --input $infile \ 83 | --encoder LaBSE \ 84 | --use-hugging-face \ 85 | --output $lang.devtest.labse \ 86 | --verbose 87 | done 88 | 89 | # run pxsim using LaBSE as an auxiliary scoring model 90 | echo " - calculating p-xsim" 91 | python3 $LASER/source/pxsim.py run \ 92 | --src_emb wol_Latn.devtest.laser2 \ 93 | --tgt_emb eng_Latn.devtest.laser2 \ 94 | --src_aux_emb wol_Latn.devtest.labse \ 95 | --tgt_aux_emb eng_Latn.devtest.labse \ 96 | --alpha 0.1 \ 97 | --k 32 \ 98 | --aux_emb_dim 768 99 | -------------------------------------------------------------------------------- /tasks/similarity/README.md: -------------------------------------------------------------------------------- 1 | # LASER: application to multilingual similarity search 2 | 3 | This codes shows how to embed an N-way parallel corpus (we 4 | use the publicly available newstest2012 from WMT 2012), and 5 | how to calculate the similarity search error rate for each language pair. 
6 | 7 | For each sentence in the source language, we calculate the closest sentence in 8 | the joint embedding space in the target language. If this sentence has the same 9 | index in the file, it is counted as correct, and otherwise as an error. 10 | Therefore, the N-way parallel corpus **should not contain duplicates.** 11 | 12 | ## Installation 13 | 14 | * Simply run the script `bash ./wmt.sh` 15 | to download the data, calculate the sentence embeddings 16 | and compute the similarity search error rate for each language pair. 17 | 18 | ## Results 19 | 20 | You should get the following similarity search errors: 21 | 22 | | | cs | de | en | es | fr | avg | 23 | |-----|-------|-------|-------|--------|-------|-------| 24 | | cs | 0.00% | 0.70% | 0.90% | 0.67% | 0.77% | 0.76% | 25 | | de | 0.83% | 0.00% | 1.17% | 0.90% | 1.03% | 0.98% | 26 | | en | 0.93% | 1.27% | 0.00% | 0.83% | 1.07% | 1.02% | 27 | | es | 0.53% | 0.77% | 0.97% | 0.00% | 0.57% | 0.71% | 28 | | fr | 0.50% | 0.90% | 1.13% | 0.60% | 0.00% | 0.78% | 29 | | avg | 0.70% | 0.91% | 1.04% | 0.75% | 0.86% | 1.06% | 30 | -------------------------------------------------------------------------------- /tasks/similarity/wmt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | # -------------------------------------------------------- 14 | # 15 | # evaluate similarity search on WMT newstest2012 16 | 17 | if [ -z ${LASER+x} ] ; then 18 | echo "Please set the environment variable 'LASER'" 19 | exit 20 | fi 21 | 22 | # encoder 23 | model_dir="${LASER}/models" 24 | encoder="${model_dir}/bilstm.93langs.2018-12-26.pt" 25 | bpe_codes="${model_dir}/93langs.fcodes" 26 | 27 | edir="embed" 28 | 29 | 30 | if [ ! -d dev ] ; then 31 | echo " - Download WMT data" 32 | wget -q http://www.statmt.org/wmt13/dev.tgz 33 | tar --wildcards -xf dev.tgz "dev/newstest2012.??" 34 | /bin/rm dev.tgz 35 | fi 36 | 37 | python3 ${LASER}/source/similarity_search.py \ 38 | --bpe-codes ${bpe_codes} --encoder ${encoder} \ 39 | --base-dir . \ 40 | --data dev/newstest2012 --output ${edir}/newstest2012 \ 41 | --lang cs de en es fr --verbose 42 | -------------------------------------------------------------------------------- /tasks/wmt22/README.md: -------------------------------------------------------------------------------- 1 | # LASER: sentence encoders for WMT '22 shared task - data track 2 | 3 | More information on the shared task can be found here: 4 | https://statmt.org/wmt22/large-scale-multilingual-translation-task.html 5 | 6 | ## Downloading encoders 7 | 8 | To download encoders for all 24 supported languages, 9 | please run the `download_models.sh` script within this directory: 10 | ``` 11 | bash ./download_models.sh 12 | ``` 13 | This will place all supported models within the directory: `$LASER/models/wmt22` 14 | 15 | **Note**: encoders for each focus language are in the format: `laser3-xxx`, except for 16 | Afrikaans (afr), English (eng), and French (fra) which are all supported by the laser2 model.
17 | 18 | Available languages are: amh, fuv, hau, ibo, kam, kin, lin, lug, luo, nso, nya, orm, sna, som, ssw, swh, tsn, tso, umb, wol, xho, yor and zul 19 | 20 | ## Embedding texts 21 | 22 | Once all encoders are downloaded, you can then begin embedding texts by following the 23 | instructions under: `LASER/tasks/embed/README.md` 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /tasks/wmt22/download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | #------------------------------------------------------- 14 | # 15 | # This bash script installs WMT'22 sentence encoders from Amazon s3 16 | 17 | if [ -z ${LASER} ] ; then 18 | echo "Please set the environment variable 'LASER'" 19 | exit 20 | fi 21 | 22 | mdir="${LASER}/models/wmt22" 23 | version=1 # model version 24 | 25 | echo "Downloading networks..." 26 | 27 | if [ ! -d ${mdir} ] ; then 28 | echo " - creating model directory: ${mdir}" 29 | mkdir -p ${mdir} 30 | fi 31 | 32 | function download { 33 | file=$1 34 | if [ -f ${mdir}/${file} ] ; then 35 | echo " - ${mdir}/$file already downloaded"; 36 | else 37 | echo " - $s3/${file}"; 38 | wget -q $s3/${file}; 39 | fi 40 | } 41 | 42 | cd ${mdir} # move to model directory 43 | 44 | # available encoders 45 | s3="https://dl.fbaipublicfiles.com/laser/models" 46 | 47 | # [afr, eng, and fra] are supported by the same LASER2 model (93 langs total) 48 | download "laser2.pt" 49 | download "laser2.spm" 50 | download "laser2.cvocab" 51 | 52 | # other WMT '22 supported languages (-afr,eng,fra) 53 | langs=(amh fuv hau ibo kam \ 54 | kin lin lug luo nso \ 55 | nya orm sna som ssw \ 56 | swh tsn tso umb wol \ 57 | xho yor zul) 58 | 59 | for lang in ${langs[@]}; do 60 | download "laser3-$lang.v$version.pt"; 61 | if [ $lang == "fuv" ] || [ $lang == "amh" ] ; then 62 | download "laser3-$lang.v$version.spm"; 63 | download "laser3-$lang.v$version.cvocab"; 64 | fi 65 | done -------------------------------------------------------------------------------- /tasks/xnli/README.md: -------------------------------------------------------------------------------- 1 | # LASER: application to cross-lingual natural language inference 2 | 3 | This code shows how to use the multilingual sentence embeddings for 4 | cross-lingual NLI, using the XNLI corpus. 5 | 6 | We train an NLI classifier on the English MultiNLI corpus, optimizing 7 | the meta-parameters on the English XNLI development corpus. 8 | We then apply that classifier to the test set for all 14 transfer languages. 9 | The development sets of the foreign languages are not used. 10 | 11 | ## Installation 12 | 13 | Just run `bash ./xnli.sh`, 14 | which installs the XNLI and MultiNLI corpora, 15 | calculates the multilingual sentence embeddings, 16 | trains the classifier and displays the results. 17 | 18 | The XNLI corpus is available [here](https://www.nyu.edu/projects/bowman/xnli/). 19 | 20 | ## Results 21 | 22 | You should get the following results for zero-shot cross-lingual transfer.
23 | They slightly differ from those published in the initial version of the paper [2] 24 | due to the change to PyTorch 1.0 and variations in random number generation, new optimization of meta-parameters, etc. 25 | 26 | | en | fr | es | de | el | bg | ru | tr | ar | vi | th | zh | hi | sw | ur | 27 | |-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------| 28 | | 74.65 | 72.26 | 73.15 | 72.48 | 72.73 | 73.35 | 71.08 | 69.84 | 70.48 | 71.94 | 69.20 | 71.38 | 65.95 | 62.14 | 61.82 | 29 | 30 | All numbers are accuracies on the test set 31 | 32 | ## References 33 | 34 | Details on the corpus are described in this paper: 35 | 36 | [1] Alexis Conneau, Guillaume Lample, Ruty Rinott, Adina Williams, Samuel R. Bowman, Holger Schwenk and Veselin Stoyanov, 37 | [*XNLI: Cross-lingual Sentence Understanding through Inference*](https://aclweb.org/anthology/D18-1269), 38 | EMNLP, 2018. 39 | 40 | Detailed system description: 41 | 42 | [2] Mikel Artetxe and Holger Schwenk, 43 | [*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/pdf/1812.10464), 44 | arXiv, Dec 26 2018. 45 | -------------------------------------------------------------------------------- /tasks/xnli/xnli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | # -------------------------------------------------------- 14 | # 15 | # XNLI 16 | 17 | import os 18 | import sys 19 | import argparse 20 | import pdb 21 | import faiss 22 | import numpy as np 23 | 24 | # get environment 25 | assert os.environ.get('LASER'), 'Please set the enviornment variable LASER' 26 | LASER = os.environ['LASER'] 27 | 28 | sys.path.append(LASER + '/source') 29 | sys.path.append(LASER + '/source/tools') 30 | from embed import SentenceEncoder, EncodeLoad, EncodeFile 31 | from text_processing import Token, BPEfastApply 32 | 33 | 34 | ################################################################################ 35 | 36 | parser = argparse.ArgumentParser('LASER: training and evaluation for XNLI') 37 | parser.add_argument('--tsv', type=str, default='tsv', 38 | help='Directory of the TSV file') 39 | parser.add_argument('--data_dir', type=str, default='.', 40 | help='Base directory for created files') 41 | parser.add_argument('--bpe_codes', type=str, required=True, 42 | help='Directory of the tokenized data') 43 | parser.add_argument('--verbose', action='store_true', 44 | help='Detailed output') 45 | 46 | # options for encoder 47 | parser.add_argument('--encoder', type=str, required=True, 48 | help='encoder to be used') 49 | parser.add_argument( 50 | '--lang', '-L', nargs='+', default=None, 51 | help="List of languages to test on") 52 | parser.add_argument('--buffer-size', type=int, default=10000, 53 | help='Buffer size (sentences)') 54 | parser.add_argument('--max-tokens', type=int, default=12000, 55 | help='Maximum number of tokens to process in a batch') 56 | parser.add_argument('--max-sentences', type=int, default=None, 57 | help='Maximum number of 
sentences to process in a batch') 58 | parser.add_argument('--cpu', action='store_true', 59 | help='Use CPU instead of GPU') 60 | 61 | args = parser.parse_args() 62 | 63 | print('LASER: training and evaluation for XNLI') 64 | 65 | if not os.path.exists(args.data_dir): 66 | os.mkdir(args.data_dir) 67 | 68 | enc = EncodeLoad(args) 69 | 70 | languages_train = ('en',) 71 | languages = ('en', 'ar', 'bg', 'de', 'el', 'es', 'fr', 'hi', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh') 72 | 73 | print('\nProcessing train:') 74 | for lang in languages_train: 75 | for part in ('prem', 'hyp'): 76 | cfname = os.path.join(args.data_dir, 'xnli.train.' + part + '.') 77 | Token(cfname + lang, 78 | cfname + 'tok.' + lang, 79 | lang=lang, 80 | romanize=True if lang=='el' else False, 81 | lower_case=True, gzip=True, 82 | verbose=args.verbose, over_write=False) 83 | BPEfastApply(cfname + 'tok.' + lang, 84 | cfname + 'bpe.' + lang, 85 | args.bpe_codes, 86 | verbose=args.verbose, over_write=False) 87 | EncodeFile(enc, 88 | cfname + 'bpe.' + lang, 89 | cfname + 'enc.' + lang, 90 | verbose=args.verbose, over_write=False, 91 | buffer_size=args.buffer_size) 92 | 93 | for corpus in ('xnli.dev', 'xnli.test'): 94 | print('\nProcessing {}:'.format(corpus)) 95 | for part in ('prem', 'hyp'): 96 | cfname = os.path.join(args.data_dir, corpus + '.' + part + '.') 97 | for lang in languages: 98 | Token(cfname + lang, 99 | cfname + 'tok.' + lang, 100 | lang=lang, 101 | romanize=True if lang=='el' else False, 102 | lower_case=True, gzip=False, 103 | verbose=args.verbose, over_write=False) 104 | BPEfastApply(cfname + 'tok.' + lang, 105 | cfname + 'bpe.' + lang, 106 | args.bpe_codes, 107 | verbose=args.verbose, over_write=False) 108 | EncodeFile(enc, 109 | cfname + 'bpe.' + lang, 110 | cfname + 'enc.' + lang, 111 | verbose=args.verbose, over_write=False, 112 | buffer_size=args.buffer_size) 113 | 114 | -------------------------------------------------------------------------------- /tasks/xnli/xnli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | # -------------------------------------------------------- 14 | # 15 | # bash script to downlaod and extract XNLI and multiNLI corpus 16 | 17 | if [ -z ${LASER+x} ] ; then 18 | echo "Please set the environment variable 'LASER'" 19 | exit 20 | fi 21 | 22 | xnli="XNLI-1.0" 23 | xnli_mt="XNLI-MT-1.0" 24 | xnli_http="https://dl.fbaipublicfiles.com/XNLI" 25 | mnli_http="https://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip" 26 | 27 | languages=("en" "fr" "es" "de" "el" "bg" "ru" "tr" "ar" "vi" "th" "zh" "hi" "sw" "ur") 28 | 29 | edir="embed" 30 | 31 | # encoder 32 | model_dir="${LASER}/models" 33 | encoder="${model_dir}/bilstm.93langs.2018-12-26.pt" 34 | bpe_codes="${model_dir}/93langs.fcodes" 35 | 36 | # NLI classifier params 37 | N=200 38 | nhid="512 384" 39 | drop=0.3 40 | seed=159753 41 | bsize=128 42 | lr=0.001 43 | 44 | ############################################################################################## 45 | # get the XNLI dev and test corpus in 15 languages 46 | 47 | ExtractXNLI () { 48 | echo "Installing XNLI" 49 | if [ ! -s ${xnli}/xnli.test.tsv ] ; then 50 | echo " - Downloading " 51 | wget -q ${xnli_http}/${xnli}.zip 52 | echo " - unzip " 53 | unzip -q ${xnli}.zip 54 | /bin/rm -rf __MACOS ${xnli}.zip 55 | fi 56 | 57 | for lang in ${languages[@]} ; do 58 | for part in "dev" "test" ; do 59 | if [ ! -f ${edir}/xnli.${part}.prem.${lang} ] ; then 60 | echo " - extracting xnli.${part}.${lang}" 61 | tail -n +2 ${xnli}/xnli.${part}.tsv \ 62 | | grep "^${lang}" | cut -f7 \ 63 | > ${edir}/xnli.${part}.prem.${lang} 64 | tail -n +2 ${xnli}/xnli.${part}.tsv \ 65 | | grep "^${lang}" | cut -f8 \ 66 | > ${edir}/xnli.${part}.hyp.${lang} 67 | tail -n +2 ${xnli}/xnli.${part}.tsv \ 68 | | grep "^${lang}" | cut -f2 \ 69 | | sed -e 's/entailment/0/' -e 's/neutral/1/' -e 's/contradiction/2/' \ 70 | > ${edir}/xnli.${part}.cl.${lang} 71 | fi 72 | done 73 | done 74 | } 75 | 76 | ############################################################################################## 77 | # https://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip 78 | # MT translated data is already tokenized ! 79 | 80 | ExtractXNLI_MT () { 81 | echo "Installing XNLI MT" 82 | if [ ! -d ${xnli_mt}/multinli ] ; then 83 | echo " - Downloading " 84 | wget -q ${xnli_http}/${xnli_mt}.zip 85 | echo " - unzip " 86 | unzip -q ${xnli_mt}.zip 87 | /bin/rm -rf __MACOS ${xnli_mt}.zip 88 | fi 89 | 90 | part="train" 91 | for lang in "en" ; do 92 | if [ ! -f ${edir}/multinli.${part}.prem.${lang}.gz ] ; then 93 | echo " - extracting ${part}.${lang}" 94 | tail -n +2 ${xnli_mt}/multinli/multinli.${part}.${lang}.tsv \ 95 | | cut -f1 > ${edir}/multinli.${part}.prem.${lang} 96 | tail -n +2 ${xnli_mt}/multinli/multinli.${part}.${lang}.tsv \ 97 | | cut -f2 > ${edir}/multinli.${part}.hyp.${lang} 98 | tail -n +2 ${xnli_mt}/multinli/multinli.${part}.${lang}.tsv \ 99 | | cut -f3 \ 100 | | sed -e 's/entailment/0/' -e 's/neutral/1/' -e 's/contradictory/2/' \ 101 | > ${edir}/multinli.${part}.cl.${lang} 102 | fi 103 | done 104 | } 105 | 106 | ############################################################################################## 107 | # https://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip 108 | # MT translated data is already tokenized ! 
109 | 110 | ExtractMNLI () { 111 | echo "Installing MultiNLI" 112 | train_txt="multinli_1.0/multinli_1.0_train.txt" 113 | if [ ! -d ${edir} ] ; then mkdir -p ${edir}; fi 114 | 115 | if [ ! -f ${edir}/xnli.train.cl.en ] ; then 116 | echo " - Downloading" 117 | wget -q ${mnli_http} 118 | echo " - unzip" 119 | unzip -q multinli_1.0.zip ${train_txt} 120 | 121 | echo " - extracting" 122 | tail -n +2 ${train_txt} | cut -f6 | gzip > ${edir}/xnli.train.prem.en.gz 123 | tail -n +2 ${train_txt} | cut -f7 | gzip > ${edir}/xnli.train.hyp.en.gz 124 | tail -n +2 ${train_txt} | cut -f1 \ 125 | | sed -e 's/entailment/0/' -e 's/neutral/1/' -e 's/contradiction/2/' \ 126 | > ${edir}/xnli.train.cl.en 127 | fi 128 | } 129 | 130 | ############################################################################################## 131 | 132 | if [ ! -d ${edir} ] ; then mkdir -p ${edir}; fi 133 | 134 | ExtractXNLI 135 | ExtractMNLI 136 | 137 | # calculate embeddings 138 | export PYTHONPATH="$PYTHONPATH:$LASER/tools-external/jieba" 139 | python3 xnli.py --data_dir ${edir} --lang ${languages[@]} --bpe_codes ${bpe_codes} --encoder ${encoder} --verbose 140 | 141 | #for fr in 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 ; do 142 | for fr in 0.6 0.7 0.8 0.9 ; do 143 | echo -e "\nTraining the classifier (see ${edir}/xnli.fract${fr}.log)" 144 | python3 ${LASER}/source/nli.py -b ${edir} \ 145 | --train xnli.train.%s.enc.en --train-labels xnli.train.cl.en \ 146 | --dev xnli.dev.%s.enc.en --dev-labels xnli.dev.cl.en \ 147 | --test xnli.test.%s.enc --test-labels xnli.test.cl --lang ${languages[@]} \ 148 | --nhid ${nhid[@]} --dropout ${drop} --bsize ${bsize} \ 149 | --seed ${seed} --lr ${lr} --nepoch ${N} \ 150 | --cross-lingual \ 151 | --fraction $fr \ 152 | --save-outputs ${edir}/xnli.fract${fr}.outputs \ 153 | --gpu 1 > ${edir}/xnli.fract${fr}.log 154 | done 155 | -------------------------------------------------------------------------------- /tasks/xsim/README.md: -------------------------------------------------------------------------------- 1 | # LASER: xSIM (multilingual similarity search) 2 | 3 | This README shows how to calculate the xsim (multilingual similarity) error rate for a given language pair. 4 | 5 | xSIM returns the error rate for encoding bitexts into the same embedding space i.e., given a bitext 6 | with source language embeddings X, and target language embeddings Y, xSIM aligns the embeddings from 7 | X and Y based on a margin-based similarity, and then returns the percentage of incorrect alignments. 8 | 9 | xSIM offers three margin-based scoring options (discussed in detail [here](https://arxiv.org/pdf/1811.01136.pdf)): 10 | - distance 11 | - ratio 12 | - absolute 13 | 14 | ## Example usage 15 | 16 | ### Sample script 17 | 18 | Simply run the example script `bash ./eval.sh` to download a sample dataset (flores200), a sample encoder (laser2), 19 | and calculate the sentence embeddings and the xSIM error rate for a set of (comma separated) languages. 20 | 21 | You can also calculate xsim for encoders hosted on [HuggingFace sentence-transformers](https://huggingface.co/sentence-transformers). For example, to use LaBSE you can modify/add the following arguments in the sample script: 22 | ``` 23 | --src-encoder LaBSE 24 | --use-hugging-face 25 | --embedding-dimension 768 26 | ``` 27 | Note: for HuggingFace encoders there is no need to specify `--src-spm-model`. 28 | 29 | ### Python 30 | 31 | Import xsim 32 | 33 | ``` 34 | from xsim import xSIM 35 | ``` 36 | Calculate xsim from either numpy float arrays (e.g. 
np.float32) or binary embedding files 37 | ``` 38 | # A: numpy arrays x and y 39 | 40 | err, nbex = xSIM(x, y) 41 | 42 | # B: binary embedding files x and y 43 | 44 | fp16_flag = False # set true if embeddings are saved in 16 bit 45 | embedding_dim = 1024 # set dimension of saved embeddings 46 | err, nbex = xSIM( 47 | x, 48 | y, 49 | dim=embedding_dim, 50 | fp16=fp16_flag 51 | ) 52 | ``` 53 | Error type 54 | ``` 55 | # A: textual-based error (allows for duplicates) 56 | 57 | tgt_text = "/path/to/target-text-file" 58 | err, nbex = xSIM(x, y, eval_text=tgt_text) 59 | 60 | # B: index-based error (default) 61 | 62 | err, nbex = xSIM(x, y) 63 | ``` 64 | Margin selection 65 | ``` 66 | # A: ratio (default) 67 | err, nbex = xSIM(x, y) 68 | 69 | # B: distance 70 | err, nbex = xSIM(x, y, margin='distance') 71 | 72 | # C: absolute 73 | err, nbex = xSIM(x, y, margin='absolute') 74 | ``` 75 | Finally, to calculate the error rate simply return: `100 * err / nbex` (number of errors over total examples). 76 | -------------------------------------------------------------------------------- /tasks/xsim/eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | #------------------------------------------------------- 14 | # 15 | # This bash script installs the flores200 dataset, downloads laser2, and then 16 | # performs xsim (multilingual similarity) evaluation with ratio margin 17 | 18 | if [ -z ${LASER} ] ; then 19 | echo "Please set the environment variable 'LASER'" 20 | exit 21 | fi 22 | 23 | ddir="${LASER}/data" 24 | cd $ddir # move to data directory 25 | 26 | if [ ! -d $ddir/flores200 ] ; then 27 | echo " - Downloading flores200..." 28 | wget --trust-server-names -q https://tinyurl.com/flores200dataset 29 | tar -xf flores200_dataset.tar.gz 30 | /bin/mv flores200_dataset flores200 31 | /bin/rm flores200_dataset.tar.gz 32 | else 33 | echo " - flores200 already downloaded" 34 | fi 35 | 36 | mdir="${LASER}/models" 37 | if [ ! -d ${mdir} ] ; then 38 | echo " - creating model directory: ${mdir}" 39 | mkdir -p ${mdir} 40 | fi 41 | 42 | function download { 43 | file=$1 44 | if [ -f ${mdir}/${file} ] ; then 45 | echo " - ${mdir}/$file already downloaded"; 46 | else 47 | echo " - Downloading $s3/${file}"; 48 | wget -q $s3/${file}; 49 | fi 50 | } 51 | 52 | cd $mdir # move to model directory 53 | 54 | # available encoders 55 | s3="https://dl.fbaipublicfiles.com/nllb/laser" 56 | 57 | if [ ! 
-f ${mdir}/laser2.pt ] ; then 58 | echo " - Downloading $s3/laser2.pt" 59 | wget --trust-server-names -q https://tinyurl.com/nllblaser2 60 | else 61 | echo " - ${mdir}/laser2.pt already downloaded" 62 | fi 63 | download "laser2.spm" 64 | download "laser2.cvocab" 65 | 66 | corpus_part="devtest" 67 | corpus="flores200" 68 | 69 | # note: example evaluation script expects format: basedir/corpus/corpus_part/lang.corpus_part 70 | 71 | echo " - calculating xsim" 72 | python3 $LASER/source/eval.py \ 73 | --base-dir $ddir \ 74 | --corpus $corpus \ 75 | --corpus-part $corpus_part \ 76 | --margin ratio \ 77 | --src-encoder $LASER/models/laser2.pt \ 78 | --src-spm-model $LASER/models/laser2.spm \ 79 | --src-langs afr_Latn,fin_Latn,fra_Latn,hin_Deva,tha_Thai,eng_Latn \ 80 | --nway --verbose 81 | -------------------------------------------------------------------------------- /tasks/xsimplusplus/README.md: -------------------------------------------------------------------------------- 1 | # LASER: xSIM++ 2 | 3 | This README shows how to calculate the xSIM++ error rate for a given language pair. 4 | 5 | xSIM++ is an extension of [xSIM](https://github.com/facebookresearch/LASER/tree/main/tasks/xsim). In comparison to xSIM, this evaluates using target-side data with additional synthetic, hard-to-distinguish examples. You can find more details about it in the publication: [xSIM++: An Improved Proxy to Bitext Mining Performance for Low-Resource Languages](https://arxiv.org/abs/2306.12907). 6 | 7 | ## Example usage 8 | 9 | Simply run the example script `bash ./eval.sh` to download a sample dataset (flores200), download synthetically augmented English evaluation data from Flores, a sample encoder (laser2), and calculate both the sentence embeddings and the xSIM++ error rate for a set of (comma separated) languages. 10 | 11 | The evaluation command is similar to xSIM, however there is an additional option to provide the comma-separated list of augmented languages: `--tgt-aug-langs`. These refer 12 | to languages in the chosen evaluation set which also have a separate augmented data file. In addition to the error rate, the script also provides a breakdown of the number of errors by type (e.g. incorrect entity/number etc.). 13 | 14 | You can also calculate xsim++ for encoders hosted on [HuggingFace sentence-transformers](https://huggingface.co/sentence-transformers). For example, to use LaBSE you can modify/add the following arguments in the sample script: 15 | ``` 16 | --src-encoder LaBSE 17 | --use-hugging-face 18 | --embedding-dimension 768 19 | ``` 20 | Note: for HuggingFace encoders there is no need to specify `--src-spm-model`. 21 | -------------------------------------------------------------------------------- /tasks/xsimplusplus/eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | # 8 | # LASER Language-Agnostic SEntence Representations 9 | # is a toolkit to calculate multilingual sentence embeddings 10 | # and to use them for document classification, bitext filtering 11 | # and mining 12 | # 13 | #------------------------------------------------------- 14 | # 15 | # This bash script installs the flores200 dataset, downloads laser2, and then 16 | # performs xsim++ evaluation with ratio margin 17 | 18 | if [ -z ${LASER} ] ; then 19 | echo "Please set the environment variable 'LASER'" 20 | exit 21 | fi 22 | 23 | ddir="${LASER}/data" 24 | cd $ddir # move to data directory 25 | 26 | if [ ! -d $ddir/flores200 ] ; then 27 | echo " - Downloading flores200..." 28 | wget --trust-server-names -q https://tinyurl.com/flores200dataset 29 | tar -xf flores200_dataset.tar.gz 30 | /bin/mv flores200_dataset flores200 31 | /bin/rm flores200_dataset.tar.gz 32 | else 33 | echo " - flores200 already downloaded" 34 | fi 35 | 36 | mdir="${LASER}/models" 37 | if [ ! -d ${mdir} ] ; then 38 | echo " - creating model directory: ${mdir}" 39 | mkdir -p ${mdir} 40 | fi 41 | 42 | function download { 43 | file=$1 44 | save_dir=$2 45 | if [ -f ${save_dir}/${file} ] ; then 46 | echo " - ${save_dir}/$file already downloaded"; 47 | else 48 | cd $save_dir 49 | echo " - Downloading $s3/${file}"; 50 | wget -q $s3/${file}; 51 | cd - 52 | fi 53 | } 54 | 55 | # available encoders 56 | s3="https://dl.fbaipublicfiles.com/nllb/laser" 57 | 58 | if [ ! -f ${mdir}/laser2.pt ] ; then 59 | cd $mdir 60 | echo " - Downloading $s3/laser2.pt" 61 | wget --trust-server-names -q https://tinyurl.com/nllblaser2 62 | cd - 63 | else 64 | echo " - ${mdir}/laser2.pt already downloaded" 65 | fi 66 | download "laser2.spm" $mdir 67 | download "laser2.cvocab" $mdir 68 | 69 | corpus_part="devtest" 70 | corpus="flores200" 71 | 72 | # download flores200 augmented data (eng_Latn) 73 | s3="https://dl.fbaipublicfiles.com/nllb/laser/xsimplusplus" 74 | augmented_dir=$ddir/$corpus/${corpus_part}_augmented 75 | if [ ! 
-d $augmented_dir ]; then mkdir $augmented_dir; fi 76 | download "eng_Latn_augmented.$corpus_part" $augmented_dir 77 | download "eng_Latn_errtype.$corpus_part.json" $augmented_dir 78 | 79 | # note: example evaluation script expects format: basedir/corpus/corpus_part/lang.corpus_part 80 | 81 | echo " - calculating xsim++" 82 | python3 $LASER/source/eval.py \ 83 | --base-dir $ddir \ 84 | --corpus $corpus \ 85 | --corpus-part $corpus_part \ 86 | --margin ratio \ 87 | --src-encoder $LASER/models/laser2.pt \ 88 | --src-spm-model $LASER/models/laser2.spm \ 89 | --src-langs afr_Latn,fin_Latn,fra_Latn,hin_Deva,tha_Thai \ 90 | --tgt-langs eng_Latn \ 91 | --tgt-aug-langs eng_Latn \ 92 | --verbose 93 | -------------------------------------------------------------------------------- /utils/requirements.txt: -------------------------------------------------------------------------------- 1 | indic-nlp-library==0.81 2 | sentence-splitter==1.4 3 | botok==0.8.8 4 | khmer-nltk==1.5 5 | LaoNLP==0.6 6 | sacremoses==0.1.0 7 | xxhash==3.0.0 8 | emoji==1.7.0 -------------------------------------------------------------------------------- /utils/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="sentence_cleaner_splitter", 5 | version="1.0.1", 6 | url="https://github.com/facebookresearch/LASER/", 7 | author="NLLB Data Team", 8 | author_email="nllb_data@fb.com", 9 | description="Clean and split sentences", 10 | packages=["sentence_cleaner_splitter"], 11 | package_dir={"sentence_cleaner_splitter": "src"}, 12 | install_requires=[ 13 | "indic-nlp-library==0.81", 14 | "sentence-splitter==1.4", 15 | "botok==0.8.8", 16 | "khmer-nltk==1.5", 17 | "LaoNLP==0.6", 18 | "sacremoses==0.1.0", 19 | "xxhash==3.0.0", 20 | "emoji==1.7.0", 21 | ], 22 | ) 23 | -------------------------------------------------------------------------------- /utils/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/LASER/d7e2544234c1d2a7076280944bdc2637f98ef3c2/utils/src/__init__.py -------------------------------------------------------------------------------- /utils/src/cleaner_splitter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import typing as tp 4 | import unicodedata 5 | 6 | import xxhash 7 | from sacremoses import MosesPunctNormalizer 8 | 9 | from .demojizer import Demojizer, legacy_demojizer 10 | from .remove_non_printing_char import \ 11 | get_replacer as non_printing_char_replacer 12 | from .sentence_split import get_split_algo 13 | 14 | demojizer = Demojizer() 15 | 16 | 17 | class SentenceSplitClean: 18 | def __init__(self, splitter_lang: str, split_algo: str): 19 | # setup sentence splitter 20 | self.splitter = get_split_algo(splitter_lang, split_algo=split_algo) 21 | 22 | # setup "moses" normalization 23 | self.mpn = MosesPunctNormalizer(lang="en", perl_parity=True) # TODO 24 | self.replace_nonprint = non_printing_char_replacer(" ") 25 | 26 | def __call__(self, line): 27 | sentence_splits = self.splitter(line) 28 | line_hash = xxhash.xxh3_64_intdigest(line) 29 | 30 | for sent in sentence_splits: 31 | # normalize -- moses equivalent 32 | clean = self.mpn.normalize(sent) 33 | clean = self.replace_nonprint(clean) 34 | # replace 𝓕𝔯𝔞𝔫𝔠𝔢𝔰𝔠𝔞 by Francesca 35 | clean = unicodedata.normalize("NFKC", clean) 36 | 37 | yield (line_hash, sent, clean) 38 | 39 | 40 | 
def remove_on_unicode_category(x: str) -> str: 41 | return "".join(filter(lambda ch: not unicodedata.category(ch) in {"So"}, x)) 42 | 43 | 44 | def get_replacer_unicode_category( 45 | skip_min: int, max_num: int, replace_by: str = " " 46 | ) -> str: 47 | def replace_by_unicode_category(x: str) -> str: 48 | total_counter = 0 49 | skip_counter = 0 50 | 51 | def flt(ch): 52 | nonlocal total_counter 53 | nonlocal skip_counter 54 | if max_num == 0 or total_counter < max_num: 55 | if unicodedata.category(ch) in {"So"}: 56 | if skip_counter < skip_min: 57 | skip_counter += 1 58 | return ch 59 | total_counter += 1 60 | return replace_by 61 | return ch 62 | 63 | return "".join(map(flt, x)) 64 | 65 | return replace_by_unicode_category 66 | 67 | 68 | # to map with previous versions of the pipeline 69 | def get_sentence_candidate_modifiers() -> tp.List[tp.Callable]: 70 | return [ 71 | lambda x: x, 72 | lambda x: x + " ", 73 | lambda x: " " + x, 74 | lambda x: " " + x + " ", 75 | lambda x: " " + x, 76 | lambda x: x.rstrip(), 77 | lambda x: x.lstrip(), 78 | lambda x: " " + x.rstrip(), 79 | lambda x: x.strip(), 80 | lambda x: demojizer(x, ""), 81 | lambda x: demojizer(x, "").strip(), 82 | lambda x: " " + demojizer(x, ""), 83 | legacy_demojizer, 84 | remove_on_unicode_category, 85 | get_replacer_unicode_category(1, 1), 86 | get_replacer_unicode_category(0, 0), 87 | ] 88 | 89 | 90 | def reach_sentence_from_paragraph( 91 | paragraph: str, 92 | expected_paragraph_digest: int, 93 | expected_sentence_digest: int, 94 | lang: str, 95 | sentence_splitters: tp.Dict[str, "SentenceSplitClean"], 96 | debug_candidates: bool, 97 | ): 98 | if lang not in sentence_splitters: 99 | sentence_splitters[lang] = SentenceSplitClean(lang, "default") 100 | 101 | def no_splitter(paragraph): 102 | line_h = xxhash.xxh3_64_intdigest(paragraph) 103 | return [(line_h, paragraph, paragraph)] 104 | 105 | sentence_splitter = sentence_splitters[lang] 106 | splitter_candidates = [sentence_splitter, no_splitter] 107 | for duct_candidate in get_sentence_candidate_modifiers(): 108 | for split_cand in splitter_candidates: 109 | for line_hash, sent, clean in split_cand(paragraph): 110 | assert line_hash == expected_paragraph_digest 111 | clean_cand = duct_candidate(clean) 112 | reached_sentence_digest = xxhash.xxh3_64_intdigest(clean_cand) 113 | if debug_candidates: 114 | print(f"{reached_sentence_digest}::\t::{clean_cand}::") 115 | if reached_sentence_digest == expected_sentence_digest: 116 | return clean_cand 117 | 118 | return None 119 | 120 | 121 | def split_clean(): 122 | split_algo = "default" 123 | sentence_splitters = {} 124 | 125 | for line in sys.stdin: 126 | line_stripped = line.rstrip("\n") 127 | metadata, paragraph = line_stripped.split("\t") 128 | ( 129 | _, 130 | _, 131 | _, 132 | _, 133 | paragraph_digest, 134 | sentence_digest, 135 | _, 136 | _, 137 | _, 138 | lang, 139 | _, 140 | ) = metadata.split() 141 | paragraph_digest = int(paragraph_digest) 142 | sentence_digest = int(sentence_digest) 143 | 144 | sentence = reach_sentence_from_paragraph( 145 | paragraph, 146 | paragraph_digest, 147 | sentence_digest, 148 | lang, 149 | sentence_splitters, 150 | False, 151 | ) 152 | 153 | if sentence is not None: 154 | print(f"{line_stripped}\t{sentence}") 155 | else: 156 | print( 157 | f"Couldn't match sentence for paragraph: {paragraph_digest} sentence: {sentence_digest} lang: {lang}", 158 | file=sys.stderr, 159 | ) 160 | 161 | 162 | def main(): 163 | split_clean() 164 | 165 | 166 | if __name__ == "__main__": 167 | main() 168 | 
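# ---------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original module):
# a hypothetical way to call SentenceSplitClean directly, assuming the package
# is installed under the "sentence_cleaner_splitter" name declared in
# utils/setup.py and that "eng" is a valid language code for the splitter.
#
#   from sentence_cleaner_splitter.cleaner_splitter import SentenceSplitClean
#
#   split_clean_en = SentenceSplitClean("eng", "default")
#   for paragraph_hash, raw_sentence, cleaned in split_clean_en("Hello world! This is a test."):
#       print(paragraph_hash, cleaned)
# ---------------------------------------------------------------------------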
-------------------------------------------------------------------------------- /utils/src/demojizer.py: -------------------------------------------------------------------------------- 1 | import emoji 2 | 3 | 4 | def legacy_demojizer(x: str) -> str: 5 | return "".join(filter(lambda ch: not emoji.is_emoji(ch), x)) 6 | 7 | 8 | class Demojizer: 9 | """ 10 | based on: 11 | https://github.com/carpedm20/emoji/blob/d8bbfe455c6fcd12b96ed1dce6e0978fe7a47431/emoji/core.py#L141 12 | """ 13 | 14 | def _get_search_tree(self): 15 | _SEARCH_TREE = {} 16 | for emj in emoji.unicode_codes.EMOJI_DATA: 17 | sub_tree = _SEARCH_TREE 18 | lastidx = len(emj) - 1 19 | for i, char in enumerate(emj): 20 | if char not in sub_tree: 21 | sub_tree[char] = {} 22 | sub_tree = sub_tree[char] 23 | if i == lastidx: 24 | sub_tree["data"] = emoji.unicode_codes.EMOJI_DATA[emj] 25 | return _SEARCH_TREE 26 | 27 | def __init__(self) -> None: 28 | self.search_tree = self._get_search_tree() 29 | 30 | def __call__(self, string: str, replace_str: str): 31 | result = [] 32 | i = 0 33 | length = len(string) 34 | state = 0 35 | while i < length: 36 | consumed = False 37 | char = string[i] 38 | if char in self.search_tree: 39 | j = i + 1 40 | sub_tree = self.search_tree[char] 41 | while j < length and string[j] in sub_tree: 42 | sub_tree = sub_tree[string[j]] 43 | j += 1 44 | if "data" in sub_tree: 45 | state = 1 46 | consumed = True 47 | result.append(replace_str) 48 | i = j - 1 49 | else: 50 | state = 0 51 | elif state == 1: 52 | if char.isspace(): 53 | consumed = True 54 | else: 55 | state = 0 56 | 57 | if not consumed and char != "\ufe0e" and char != "\ufe0f": 58 | result.append(char) 59 | i += 1 60 | 61 | return "".join(result) 62 | -------------------------------------------------------------------------------- /utils/src/remove_non_printing_char.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Remove non printable char as per: 8 | # https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python 9 | # 10 | # This is supposed to be a drop in replacement to moses strip-non-printing-char.perl 11 | 12 | import sys 13 | import unicodedata 14 | 15 | 16 | def get_replacer(replace_by: str = " ") -> str: 17 | non_printable_map = { 18 | ord(c): replace_by 19 | for c in (chr(i) for i in range(sys.maxunicode + 1)) 20 | # same as \p{C} in perl 21 | # see https://www.unicode.org/reports/tr44/#General_Category_Values 22 | if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"} 23 | } 24 | 25 | def replace_non_printing_char(line) -> str: 26 | return line.translate(non_printable_map) 27 | 28 | return replace_non_printing_char 29 | 30 | 31 | def test_remove(): 32 | replaceby_ = get_replacer("_") 33 | 34 | assert ( 35 | replaceby_("See what's hidden in your string… or be​hind") 36 | == "See what's hidden in your string…_or be_hind_" 37 | ) 38 | 39 | replacebyspace = get_replacer(" ") 40 | 41 | assert replacebyspace("\x00\x11Hello\u200bWorld") == " Hello World" 42 | --------------------------------------------------------------------------------