├── .github └── workflows │ ├── codeql-analysis.yml │ └── tests.yml ├── .gitignore ├── FEATURES ├── HISTORY.rst ├── LICENSE ├── README.rst ├── py3langid ├── __init__.py ├── data │ └── model.plzma ├── examples │ ├── _twokenize.py │ └── process_twitter.py ├── langid.py ├── tools │ ├── __init__.py │ ├── featWeights.py │ └── printfeats.py └── train │ ├── BLweight.py │ ├── DFfeatureselect.py │ ├── IGweight.py │ ├── LDfeatureselect.py │ ├── NBtrain.py │ ├── README │ ├── __init__.py │ ├── common.py │ ├── index.py │ ├── scanner.py │ ├── tokenize.py │ └── train.py ├── pyproject.toml └── tests ├── __init__.py ├── test_langid.py └── test_server.py /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '23 1 * * 1' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v2 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v1 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v1 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 
59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v1 71 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Tests 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | os: [ubuntu-latest] 20 | # https://github.com/actions/python-versions/blob/main/versions-manifest.json 21 | python-version: [3.8, 3.9, "3.10", "3.11", "3.12", "3.13-dev"] 22 | include: 23 | # other OS version necessary 24 | - os: macos-latest 25 | python-version: "3.10" 26 | - os: windows-latest 27 | python-version: "3.10" 28 | steps: 29 | # Python and pip setup 30 | - name: Set up Python ${{ matrix.python-version }} 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | 35 | - name: Upgrade pip 36 | run: python -m pip install --upgrade pip 37 | 38 | - name: Get pip cache dir 39 | id: pip-cache 40 | run: | 41 | echo "::set-output name=dir::$(pip cache dir)" 42 | 43 | - name: pip cache 44 | uses: actions/cache@v4 45 | with: 46 | path: ${{ steps.pip-cache.outputs.dir }} 47 | key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} 48 | restore-keys: | 49 | ${{ runner.os }}-pip- 50 | 51 | # package setup 52 | - uses: actions/checkout@v4 53 | 54 | - name: Install dependencies 55 | run: python -m pip install -e "." 56 | 57 | # tests 58 | - name: Test with pytest 59 | run: | 60 | python -m pip install pytest pytest-cov 61 | pytest --cov=./ --cov-report=xml 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # IDE settings 105 | .vscode/ 106 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | 0.3.0 6 | ----- 7 | 8 | * Modernized setup, dropped support for Python 3.6 & 3.7 9 | * Simplified inference code 10 | * Support for Numpy 2.0 11 | 12 | 13 | 0.2.2 14 | ----- 15 | 16 | * Fixed bug in probability normalization (#6) 17 | * Fully implemented data type argument in ``classify()`` 18 | * Adapted training scripts to Python3 (untested) 19 | 20 | 21 | 0.2.1 22 | ----- 23 | 24 | * Maintenance: update and simplify code 25 | 26 | 27 | 0.2.0 28 | ----- 29 | 30 | * Change Numpy data type for features (``uint32`` → ``uint16``) 31 | * Code cleaning 32 | 33 | 34 | 0.1.2 35 | ----- 36 | 37 | * Include data in non-wheel package versions 38 | 39 | 40 | 0.1.1 41 | ----- 42 | 43 | * Faster module loading 44 | * Extended tests and readme 45 | 46 | 47 | 0.1.0 48 | ----- 49 | 50 | * Fork re-packaged 51 | * Efficiency improvements in ``langid.py`` 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | py3langid - Language Identifier 2 | BSD 3-Clause License 3 | 4 | Modifications (fork): Copyright (c) 2021, Adrien Barbaresi. 5 | 6 | Original code: Copyright (c) 2011 Marco Lui . 7 | Based on research by Marco Lui and Tim Baldwin. 8 | 9 | All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without modification, are 12 | permitted provided that the following conditions are met: 13 | 14 | 1. Redistributions of source code must retain the above copyright notice, this 15 | list of conditions and the following disclaimer. 16 | 17 | 2. Redistributions in binary form must reproduce the above copyright notice, 18 | this list of conditions and the following disclaimer in the documentation 19 | and/or other materials provided with the distribution. 20 | 21 | 3. Neither the name of the copyright holder nor the names of its 22 | contributors may be used to endorse or promote products derived from 23 | this software without specific prior written permission. 
24 | 25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 26 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 27 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 28 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 29 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 30 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 31 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 32 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 33 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | ``py3langid`` 3 | ============= 4 | 5 | 6 | ``py3langid`` is a fork of the standalone language identification tool ``langid.py`` by Marco Lui. 7 | 8 | Original license: BSD-2-Clause. Fork license: BSD-3-Clause. 9 | 10 | 11 | 12 | Changes in this fork 13 | -------------------- 14 | 15 | Execution speed has been improved and the code base has been optimized for Python 3.6+: 16 | 17 | - Import: Loading the package (``import py3langid``) is about 30% faster 18 | - Startup: Loading the default classification model is 25-30x faster 19 | - Execution: Language detection with ``langid.classify`` is 5-6x faster on paragraphs (less on longer texts) 20 | 21 | For implementation details see this blog post: `How to make language detection with langid.py faster `_. 22 | 23 | For more information and older Python versions see `changelog `_. 24 | 25 | 26 | Usage 27 | ----- 28 | 29 | Drop-in replacement 30 | ~~~~~~~~~~~~~~~~~~~ 31 | 32 | 33 | 1. Install the package: 34 | 35 | * ``pip3 install py3langid`` (or ``pip`` where applicable) 36 | 37 | 2. Use it: 38 | 39 | * with Python: ``import py3langid as langid`` 40 | * on the command-line: ``langid`` 41 | 42 | 43 | With Python 44 | ~~~~~~~~~~~ 45 | 46 | Basics: 47 | 48 | .. code-block:: python 49 | 50 | >>> import py3langid as langid 51 | 52 | >>> text = 'This text is in English.' 53 | # identified language and probability 54 | >>> langid.classify(text) 55 | ('en', -56.77429) 56 | # unpack the result tuple in variables 57 | >>> lang, prob = langid.classify(text) 58 | # all potential languages 59 | >>> langid.rank(text) 60 | 61 | 62 | More options: 63 | 64 | .. code-block:: python 65 | 66 | >>> from py3langid.langid import LanguageIdentifier, MODEL_FILE 67 | 68 | # subset of target languages 69 | >>> identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE) 70 | >>> identifier.set_languages(['de', 'en', 'fr']) 71 | # this won't work well... 72 | >>> identifier.classify('这样不好') 73 | ('en', -81.831665) 74 | 75 | # normalization of probabilities to an interval between 0 and 1 76 | >>> identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True) 77 | >>> identifier.classify('This should be enough text.') 78 | ('en', 1.0) 79 | 80 | 81 | Note: the Numpy data type for the feature vector has been changed to optimize for speed. If results are inconsistent, try restoring the original setting: 82 | 83 | .. code-block:: python 84 | 85 | >>> langid.classify(text, datatype='uint32') 86 | 87 | 88 | On the command-line 89 | ~~~~~~~~~~~~~~~~~~~ 90 | 91 | .. 
code-block:: bash 92 | 93 | # basic usage with probability normalization 94 | $ echo "This should be enough text." | langid -n 95 | ('en', 1.0) 96 | 97 | # define a subset of target languages 98 | $ echo "This won't be recognized properly." | langid -n -l fr,it,tr 99 | ('it', 0.97038305) 100 | 101 | 102 | Legacy documentation 103 | -------------------- 104 | 105 | 106 | **The docs below are provided for reference, only part of the functions are currently tested and maintained.** 107 | 108 | 109 | Introduction 110 | ------------ 111 | 112 | ``langid.py`` is a standalone Language Identification (LangID) tool. 113 | 114 | The design principles are as follows: 115 | 116 | 1. Fast 117 | 2. Pre-trained over a large number of languages (currently 97) 118 | 3. Not sensitive to domain-specific features (e.g. HTML/XML markup) 119 | 4. Single .py file with minimal dependencies 120 | 5. Deployable as a web service 121 | 122 | All that is required to run ``langid.py`` is Python >= 3.6 and numpy. 123 | 124 | The accompanying training tools are still Python2-only. 125 | 126 | ``langid.py`` is WSGI-compliant. ``langid.py`` will use ``fapws3`` as a web server if 127 | available, and default to ``wsgiref.simple_server`` otherwise. 128 | 129 | ``langid.py`` comes pre-trained on 97 languages (ISO 639-1 codes given): 130 | 131 | af, am, an, ar, as, az, be, bg, bn, br, 132 | bs, ca, cs, cy, da, de, dz, el, en, eo, 133 | es, et, eu, fa, fi, fo, fr, ga, gl, gu, 134 | he, hi, hr, ht, hu, hy, id, is, it, ja, 135 | jv, ka, kk, km, kn, ko, ku, ky, la, lb, 136 | lo, lt, lv, mg, mk, ml, mn, mr, ms, mt, 137 | nb, ne, nl, nn, no, oc, or, pa, pl, ps, 138 | pt, qu, ro, ru, rw, se, si, sk, sl, sq, 139 | sr, sv, sw, ta, te, th, tl, tr, ug, uk, 140 | ur, vi, vo, wa, xh, zh, zu 141 | 142 | The training data was drawn from 5 different sources: 143 | 144 | * JRC-Acquis 145 | * ClueWeb 09 146 | * Wikipedia 147 | * Reuters RCV2 148 | * Debian i18n 149 | 150 | 151 | Usage 152 | ----- 153 | 154 | langid [options] 155 | 156 | optional arguments: 157 | -h, --help show this help message and exit 158 | -s, --serve launch web service 159 | --host=HOST host/ip to bind to 160 | --port=PORT port to listen on 161 | -v increase verbosity (repeat for greater effect) 162 | -m MODEL load model from file 163 | -l LANGS, --langs=LANGS 164 | comma-separated set of target ISO639 language codes 165 | (e.g en,de) 166 | -r, --remote auto-detect IP address for remote access 167 | -b, --batch specify a list of files on the command line 168 | -d, --dist show full distribution over languages 169 | -u URL, --url=URL langid of URL 170 | --line process pipes line-by-line rather than as a document 171 | -n, --normalize normalize confidence scores to probability values 172 | 173 | 174 | The simplest way to use ``langid.py`` is as a command-line tool, and you can 175 | invoke using ``python langid.py``. If you installed ``langid.py`` as a Python 176 | module (e.g. via ``pip install langid``), you can invoke ``langid`` instead of 177 | ``python langid.py -n`` (the two are equivalent). This will cause a prompt to 178 | display. 
Enter text to identify, and hit enter:: 179 | 180 | >>> This is a test 181 | ('en', -54.41310358047485) 182 | >>> Questa e una prova 183 | ('it', -35.41771221160889) 184 | 185 | 186 | ``langid.py`` can also detect when the input is redirected (only tested under Linux), and in this 187 | case will process until EOF rather than until newline like in interactive mode:: 188 | 189 | python langid.py < README.rst 190 | ('en', -22552.496054649353) 191 | 192 | 193 | The value returned is the unnormalized probability estimate for the language. Calculating 194 | the exact probability estimate is disabled by default, but can be enabled through a flag:: 195 | 196 | python langid.py -n < README.rst 197 | ('en', 1.0) 198 | 199 | More details are provided in this README in the section on `Probability Normalization`. 200 | 201 | You can also use ``langid.py`` as a Python library:: 202 | 203 | # python 204 | Python 2.7.2+ (default, Oct 4 2011, 20:06:09) 205 | [GCC 4.6.1] on linux2 206 | Type "help", "copyright", "credits" or "license" for more information. 207 | >>> import langid 208 | >>> langid.classify("This is a test") 209 | ('en', -54.41310358047485) 210 | 211 | Finally, ``langid.py`` can use Python's built-in ``wsgiref.simple_server`` (or ``fapws3`` if available) to 212 | provide language identification as a web service. To do this, launch ``python langid.py -s``, and 213 | access http://localhost:9008/detect . The web service supports GET, POST and PUT. If GET is performed 214 | with no data, a simple HTML forms interface is displayed. 215 | 216 | The response is generated in JSON, here is an example:: 217 | 218 | {"responseData": {"confidence": -54.41310358047485, "language": "en"}, "responseDetails": null, "responseStatus": 200} 219 | 220 | A utility such as curl can be used to access the web service:: 221 | 222 | # curl -d "q=This is a test" localhost:9008/detect 223 | {"responseData": {"confidence": -54.41310358047485, "language": "en"}, "responseDetails": null, "responseStatus": 200} 224 | 225 | You can also use HTTP PUT:: 226 | 227 | # curl -T readme.rst localhost:9008/detect 228 | % Total % Received % Xferd Average Speed Time Time Time Current 229 | Dload Upload Total Spent Left Speed 230 | 100 2871 100 119 100 2752 117 2723 0:00:01 0:00:01 --:--:-- 2727 231 | {"responseData": {"confidence": -22552.496054649353, "language": "en"}, "responseDetails": null, "responseStatus": 200} 232 | 233 | If no "q=XXX" key-value pair is present in the HTTP POST payload, ``langid.py`` will interpret the entire 234 | file as a single query. This allows for redirection via curl:: 235 | 236 | # echo "This is a test" | curl -d @- localhost:9008/detect 237 | {"responseData": {"confidence": -54.41310358047485, "language": "en"}, "responseDetails": null, "responseStatus": 200} 238 | 239 | ``langid.py`` will attempt to discover the host IP address automatically. Often, this is set to localhost(127.0.1.1), even 240 | though the machine has a different external IP address. ``langid.py`` can attempt to automatically discover the external 241 | IP address. To enable this functionality, start ``langid.py`` with the ``-r`` flag. 
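
For scripted access, the same ``/detect`` endpoint can be queried directly from Python. The
snippet below is only a sketch: it assumes the service has been started locally on the default
port (9008) and reuses the ``q`` parameter shown in the curl examples above::

    import json
    from urllib.parse import urlencode
    from urllib.request import urlopen

    # pass the text to classify as the "q" parameter of a GET request
    url = "http://localhost:9008/detect?" + urlencode({"q": "This is a test"})
    with urlopen(url) as response:
        result = json.loads(response.read().decode("utf-8"))

    # the JSON layout is the same as in the curl examples
    print(result["responseData"]["language"], result["responseData"]["confidence"])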
242 | 243 | ``langid.py`` supports constraining of the output language set using the ``-l`` flag and a comma-separated list of ISO639-1 244 | language codes (the ``-n`` flag enables probability normalization):: 245 | 246 | # python langid.py -n -l it,fr 247 | >>> Io non parlo italiano 248 | ('it', 0.99999999988965627) 249 | >>> Je ne parle pas français 250 | ('fr', 1.0) 251 | >>> I don't speak english 252 | ('it', 0.92210605672341062) 253 | 254 | When using ``langid.py`` as a library, the set_languages method can be used to constrain the language set:: 255 | 256 | python 257 | Python 2.7.2+ (default, Oct 4 2011, 20:06:09) 258 | [GCC 4.6.1] on linux2 259 | Type "help", "copyright", "credits" or "license" for more information. 260 | >>> import langid 261 | >>> langid.classify("I do not speak english") 262 | ('en', 0.57133487679900674) 263 | >>> langid.set_languages(['de','fr','it']) 264 | >>> langid.classify("I do not speak english") 265 | ('it', 0.99999835791478453) 266 | >>> langid.set_languages(['en','it']) 267 | >>> langid.classify("I do not speak english") 268 | ('en', 0.99176190378750373) 269 | 270 | 271 | Batch Mode 272 | ---------- 273 | 274 | ``langid.py`` supports batch mode processing, which can be invoked with the ``-b`` flag. 275 | In this mode, ``langid.py`` reads a list of paths to files to classify as arguments. 276 | If no arguments are supplied, ``langid.py`` reads the list of paths from ``stdin``, 277 | this is useful for using ``langid.py`` with UNIX utilities such as ``find``. 278 | 279 | In batch mode, ``langid.py`` uses ``multiprocessing`` to invoke multiple instances of 280 | the classifier, utilizing all available CPUs to classify documents in parallel. 281 | 282 | 283 | Probability Normalization 284 | ------------------------- 285 | 286 | The probabilistic model implemented by ``langid.py`` involves the multiplication of a 287 | large number of probabilities. For computational reasons, the actual calculations are 288 | implemented in the log-probability space (a common numerical technique for dealing with 289 | vanishingly small probabilities). One side-effect of this is that it is not necessary to 290 | compute a full probability in order to determine the most probable language in a set 291 | of candidate languages. However, users sometimes find it helpful to have a "confidence" 292 | score for the probability prediction. Thus, ``langid.py`` implements a re-normalization 293 | that produces an output in the 0-1 range. 294 | 295 | ``langid.py`` disables probability normalization by default. For 296 | command-line usages of ``langid.py``, it can be enabled by passing the ``-n`` flag. For 297 | probability normalization in library use, the user must instantiate their own 298 | ``LanguageIdentifier``. An example of such usage is as follows:: 299 | 300 | >> from py3langid.langid import LanguageIdentifier, MODEL_FILE 301 | >> identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True) 302 | >> identifier.classify("This is a test") 303 | ('en', 0.9999999909903544) 304 | 305 | 306 | Training a model 307 | ---------------- 308 | 309 | So far Python 2.7 only, see the `original instructions `_. 310 | 311 | 312 | Read more 313 | --------- 314 | 315 | ``langid.py`` is based on published research. [1] describes the LD feature selection technique in detail, 316 | and [2] provides more detail about the module ``langid.py`` itself. 
317 | 318 | [1] Lui, Marco and Timothy Baldwin (2011) Cross-domain Feature Selection for Language Identification, 319 | In Proceedings of the Fifth International Joint Conference on Natural Language Processing (IJCNLP 2011), 320 | Chiang Mai, Thailand, pp. 553—561. Available from http://www.aclweb.org/anthology/I11-1062 321 | 322 | [2] Lui, Marco and Timothy Baldwin (2012) langid.py: An Off-the-shelf Language Identification Tool, 323 | In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL 2012), 324 | Demo Session, Jeju, Republic of Korea. Available from www.aclweb.org/anthology/P12-3005 325 | -------------------------------------------------------------------------------- /py3langid/__init__.py: -------------------------------------------------------------------------------- 1 | from .langid import classify, rank, set_languages 2 | 3 | __version__ = '0.3.0' 4 | -------------------------------------------------------------------------------- /py3langid/data/model.plzma: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbar/py3langid/812f2055f74c35dea298f30b434644062d9289be/py3langid/data/model.plzma -------------------------------------------------------------------------------- /py3langid/examples/_twokenize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Twokenize -- a tokenizer designed for Twitter text in English and some other European languages. 4 | This tokenizer code has gone through a long history: 5 | 6 | (1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif 7 | TweetMotif: Exploratory Search and Topic Summarization for Twitter. 8 | Brendan O'Connor, Michel Krieger, and David Ahn. 9 | ICWSM-2010 (demo track), http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf 10 | (2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger 11 | (2b) Jason Baldridge and David Snyder ported it to Scala 12 | (3) Brendan bugfixed the Scala port and merged with POS-specific changes 13 | for the CMU ARK Twitter POS Tagger 14 | (4) Tobi Owoputi ported it back to Java and added many improvements (2012-06) 15 | 16 | Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP 17 | 18 | There have been at least 2 other Java ports, but they are not in the lineage for the code here. 19 | 20 | Ported to Python by Myle Ott . 21 | """ 22 | 23 | from __future__ import print_function 24 | 25 | import operator 26 | import re 27 | import HTMLParser 28 | 29 | def regex_or(*items): 30 | return '(?:' + '|'.join(items) + ')' 31 | 32 | Contractions = re.compile(u"(?i)(\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$", re.UNICODE) 33 | Whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE) 34 | 35 | punctChars = r"['\"“”‘’.?!…,:;]" 36 | #punctSeq = punctChars+"+" #'anthem'. => ' anthem '. 37 | punctSeq = r"['\"“”‘’]+|[.?!,…]+|[:;]+" #'anthem'. => ' anthem ' . 38 | entity = r"&(?:amp|lt|gt|quot);" 39 | # URLs 40 | 41 | 42 | # BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong. 43 | # If you actually empirically test it the results are bad. 
44 | # Please see https://github.com/brendano/ark-tweet-nlp/pull/9 45 | 46 | urlStart1 = r"(?:https?://|\bwww\.)" 47 | commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)" 48 | ccTLDs = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \ 49 | r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \ 50 | r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \ 51 | r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \ 52 | r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \ 53 | r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \ 54 | r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \ 55 | r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)" #TODO: remove obscure country domains? 56 | urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)" 57 | urlBody = r"(?:[^\.\s<>][^\s<>]*?)?" 58 | urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?" 59 | urlEnd = r"(?:\.\.+|[<>]|\s|$)" 60 | url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")" 61 | 62 | 63 | # Numeric 64 | timeLike = r"\d+(?::\d+){1,2}" 65 | #numNum = r"\d+\.\d+" 66 | numberWithCommas = r"(?:(?|>)[\._-]+(?:<|<|>|>)" 102 | s5 = "(?:[.][_]+[.])" 103 | # myleott: in Python the (?i) flag affects the whole expression 104 | #basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5 105 | basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5 106 | 107 | eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+" 108 | eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+".encode('utf-8') 109 | eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]" 110 | eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight 111 | 112 | oOEmote = r"(?:[oO]" + bfCenter + r"[oO])" 113 | 114 | 115 | emoticon = regex_or( 116 | # Standard version :) :( :] :D :P 117 | "(?:>|>)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths), 118 | 119 | # reversed version (: D: use positive lookbehind to remove "(word):" 120 | # because eyes on the right side is more ambiguous with the standard usage of : ; 121 | regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|<)?", 122 | 123 | #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style 124 | eastEmote.replace("2", "1", 1), basicface, 125 | # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb] 126 | # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this 127 | 128 | # myleott: o.O and O.o are two of the biggest sources of differences 129 | # between this and the Java version. One little hack won't hurt... 
130 | oOEmote 131 | ) 132 | 133 | Hearts = "(?:<+/?3+)+" #the other hearts are in decorations 134 | 135 | Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8')) 136 | 137 | # BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes 138 | # "hello (#hashtag)" ==> "hello (#hashtag )" WRONG 139 | # "hello (#hashtag)" ==> "hello ( #hashtag )" RIGHT 140 | # "hello (@person)" ==> "hello (@person )" WRONG 141 | # "hello (@person)" ==> "hello ( @person )" RIGHT 142 | # ... Some sort of weird interaction with edgepunct I guess, because edgepunct 143 | # has poor content-symbol detection. 144 | 145 | # This also gets #1 #40 which probably aren't hashtags .. but good as tokens. 146 | # If you want good hashtag identification, use a different regex. 147 | Hashtag = "#[a-zA-Z0-9_]+" #optional: lookbehind for \b 148 | #optional: lookbehind for \b, max length 15 149 | AtMention = "[@@][a-zA-Z0-9_]+" 150 | 151 | # I was worried this would conflict with at-mentions 152 | # but seems ok in sample of 5800: 7 changes all email fixes 153 | # http://www.regular-expressions.info/email.html 154 | Bound = r"(?:\W|^|$)" 155 | Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")" 156 | 157 | # We will be tokenizing using these regexps as delimiters 158 | # Additionally, these things are "protected", meaning they shouldn't be further split themselves. 159 | Protected = re.compile( 160 | unicode(regex_or( 161 | Hearts, 162 | url, 163 | Email, 164 | timeLike, 165 | #numNum, 166 | numberWithCommas, 167 | numComb, 168 | emoticon, 169 | Arrows, 170 | entity, 171 | punctSeq, 172 | arbitraryAbbrev, 173 | separators, 174 | decorations, 175 | embeddedApostrophe, 176 | Hashtag, 177 | AtMention 178 | ).decode('utf-8')), re.UNICODE) 179 | 180 | # Edge punctuation 181 | # Want: 'foo' => ' foo ' 182 | # While also: don't => don't 183 | # the first is considered "edge punctuation". 184 | # the second is word-internal punctuation -- don't want to mess with it. 185 | # BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days. 186 | # I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate. 187 | 188 | # Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes) 189 | #edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols) 190 | edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols) 191 | edgePunct = "[" + edgePunctChars + "]" 192 | notEdgePunct = "[a-zA-Z0-9]" # content characters 193 | offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):" 194 | EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE) 195 | EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE) 196 | 197 | def splitEdgePunct(input): 198 | input = EdgePunctLeft.sub(r"\1\2 \3", input) 199 | input = EdgePunctRight.sub(r"\1 \2\3", input) 200 | return input 201 | 202 | # The main work of tokenizing a tweet. 203 | def simpleTokenize(text): 204 | 205 | # Do the no-brainers first 206 | splitPunctText = splitEdgePunct(text) 207 | 208 | textLength = len(splitPunctText) 209 | 210 | # BTO: the logic here got quite convoluted via the Scala porting detour 211 | # It would be good to switch back to a nice simple procedural style like in the Python version 212 | # ... Scala is such a pain. Never again. 
213 | 214 | # Find the matches for subsequences that should be protected, 215 | # e.g. URLs, 1.0, U.N.K.L.E., 12:53 216 | bads = [] 217 | badSpans = [] 218 | for match in Protected.finditer(splitPunctText): 219 | # The spans of the "bads" should not be split. 220 | if (match.start() != match.end()): #unnecessary? 221 | bads.append( [splitPunctText[match.start():match.end()]] ) 222 | badSpans.append( (match.start(), match.end()) ) 223 | 224 | # Create a list of indices to create the "goods", which can be 225 | # split. We are taking "bad" spans like 226 | # List((2,5), (8,10)) 227 | # to create 228 | # List(0, 2, 5, 8, 10, 12) 229 | # where, e.g., "12" here would be the textLength 230 | # has an even length and no indices are the same 231 | indices = [0] 232 | for (first, second) in badSpans: 233 | indices.append(first) 234 | indices.append(second) 235 | indices.append(textLength) 236 | 237 | # Group the indices and map them to their respective portion of the string 238 | splitGoods = [] 239 | for i in range(0, len(indices), 2): 240 | goodstr = splitPunctText[indices[i]:indices[i+1]] 241 | splitstr = goodstr.strip().split(" ") 242 | splitGoods.append(splitstr) 243 | 244 | # Reinterpolate the 'good' and 'bad' Lists, ensuring that 245 | # additonal tokens from last good item get included 246 | zippedStr = [] 247 | for i in range(len(bads)): 248 | zippedStr = addAllnonempty(zippedStr, splitGoods[i]) 249 | zippedStr = addAllnonempty(zippedStr, bads[i]) 250 | zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)]) 251 | 252 | # BTO: our POS tagger wants "ur" and "you're" to both be one token. 253 | # Uncomment to get "you 're" 254 | #splitStr = [] 255 | #for tok in zippedStr: 256 | # splitStr.extend(splitToken(tok)) 257 | #zippedStr = splitStr 258 | 259 | return zippedStr 260 | 261 | def addAllnonempty(master, smaller): 262 | for s in smaller: 263 | strim = s.strip() 264 | if (len(strim) > 0): 265 | master.append(strim) 266 | return master 267 | 268 | # "foo bar " => "foo bar" 269 | def squeezeWhitespace(input): 270 | return Whitespace.sub(" ", input).strip() 271 | 272 | # Final pass tokenization based on special patterns 273 | def splitToken(token): 274 | m = Contractions.search(token) 275 | if m: 276 | return [m.group(1), m.group(2)] 277 | return [token] 278 | 279 | # Assume 'text' has no HTML escaping. 280 | def tokenize(text): 281 | return simpleTokenize(squeezeWhitespace(text)) 282 | 283 | 284 | # Twitter text comes HTML-escaped, so unescape it. 285 | # We also first unescape &'s, in case the text has been buggily double-escaped. 286 | def normalizeTextForTagger(text): 287 | text = text.replace("&", "&") 288 | text = HTMLParser.HTMLParser().unescape(text) 289 | return text 290 | 291 | # This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger. 292 | # 293 | # This function normalizes the input text BEFORE calling the tokenizer. 294 | # So the tokens you get back may not exactly correspond to 295 | # substrings of the original text. 296 | def tokenizeRawTweetText(text): 297 | return tokenize(normalizeTextForTagger(text)) 298 | -------------------------------------------------------------------------------- /py3langid/examples/process_twitter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example for using langid.py to identify the language of messages 3 | on a twitter livestream. Optionally, it can also filter messages 4 | and display only those in a target language(s). 
5 | 6 | Expects a Twitterstream on STDIN, such as the one provided by: 7 | 8 | # curl https://stream.twitter.com/1/statuses/sample.json -u -s 9 | 10 | Outputs lang:message one-per-line to STDOUT 11 | 12 | Marco Lui, June 2012 13 | """ 14 | 15 | import sys 16 | import langid 17 | import json 18 | import optparse 19 | import re 20 | 21 | import _twokenize 22 | 23 | 24 | to_clean = re.compile(_twokenize.regex_or( 25 | _twokenize.Hearts, 26 | _twokenize.url, 27 | _twokenize.Email, 28 | _twokenize.emoticon, 29 | _twokenize.Arrows, 30 | _twokenize.entity, 31 | _twokenize.decorations, 32 | _twokenize.Hashtag, 33 | _twokenize.AtMention, 34 | ).decode('utf8'), re.UNICODE) 35 | 36 | 37 | def clean_tweet(text): 38 | return to_clean.sub('', text) 39 | 40 | 41 | def squeeze_whitespace(text): 42 | return re.sub('\s+', ' ', text) 43 | 44 | 45 | if __name__ == "__main__": 46 | parser = optparse.OptionParser() 47 | parser.add_option('-l', '--langs', dest='langs', help='comma-separated set of target ISO639 language codes (e.g en,de)') 48 | opts, args = parser.parse_args() 49 | 50 | lang_set = set(opts.langs.split(",")) if opts.langs else None 51 | 52 | try: 53 | for line in sys.stdin: 54 | j = json.loads(line) 55 | if j.get('retweet_count') == 0: 56 | text = j.get('text') 57 | if text: 58 | lang, conf = langid.classify(clean_tweet(text)) 59 | if lang_set is None or lang in lang_set: 60 | print "{0}: {1}".format(lang, squeeze_whitespace(text).encode('utf8')) 61 | except (IOError, KeyboardInterrupt): 62 | # Terminate on broken pipe or ^C 63 | pass 64 | -------------------------------------------------------------------------------- /py3langid/langid.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file bundles language identification functions. 3 | 4 | Modifications (fork): Copyright (c) 2021, Adrien Barbaresi. 5 | 6 | Original code: Copyright (c) 2011 Marco Lui . 7 | Based on research by Marco Lui and Tim Baldwin. 8 | 9 | See LICENSE file for more info. 10 | """ 11 | 12 | import bz2 13 | import json 14 | import logging 15 | import lzma 16 | import pickle 17 | 18 | from base64 import b64decode 19 | from collections import Counter 20 | from operator import itemgetter 21 | from pathlib import Path 22 | from urllib.parse import parse_qs 23 | 24 | import numpy as np 25 | 26 | 27 | LOGGER = logging.getLogger(__name__) 28 | 29 | # model defaults 30 | IDENTIFIER = None 31 | MODEL_FILE = 'data/model.plzma' 32 | NORM_PROBS = False # Normalize output probabilities. 33 | # NORM_PROBS defaults to False for a small speed increase. It does not 34 | # affect the relative ordering of the predicted classes. It can be 35 | # re-enabled at runtime - see the readme. 36 | 37 | # quantization: faster but less precise 38 | DATATYPE = "uint16" 39 | 40 | 41 | def load_model(path=None): 42 | """ 43 | Convenience method to set the global identifier using a model at a 44 | specified path. 45 | 46 | @param path to model 47 | """ 48 | LOGGER.debug('initializing identifier') 49 | global IDENTIFIER 50 | if path is None: 51 | IDENTIFIER = LanguageIdentifier.from_pickled_model(MODEL_FILE) 52 | else: 53 | IDENTIFIER = LanguageIdentifier.from_modelpath(path) 54 | 55 | 56 | def set_languages(langs=None): 57 | """ 58 | Set the language set used by the global identifier. 
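    Passing None (the default) restores the full language set of the loaded model.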
59 | 60 | @param langs a list of language codes 61 | """ 62 | if IDENTIFIER is None: 63 | load_model() 64 | return IDENTIFIER.set_languages(langs) 65 | 66 | 67 | def classify(instance, datatype=DATATYPE): 68 | """ 69 | Convenience method using a global identifier instance with the default 70 | model included in langid.py. Identifies the language that a string is 71 | written in. 72 | 73 | @param instance a text string. Unicode strings will automatically be utf8-encoded 74 | @returns a tuple of the most likely language and the confidence score 75 | """ 76 | if IDENTIFIER is None: 77 | load_model() 78 | return IDENTIFIER.classify(instance, datatype=datatype) 79 | 80 | 81 | def rank(instance): 82 | """ 83 | Convenience method using a global identifier instance with the default 84 | model included in langid.py. Ranks all the languages in the model according 85 | to the likelihood that the string is written in each language. 86 | 87 | @param instance a text string. Unicode strings will automatically be utf8-encoded 88 | @returns a list of tuples language and the confidence score, in descending order 89 | """ 90 | if IDENTIFIER is None: 91 | load_model() 92 | return IDENTIFIER.rank(instance) 93 | 94 | 95 | def cl_path(path): 96 | """ 97 | Convenience method using a global identifier instance with the default 98 | model included in langid.py. Identifies the language that the file at `path` is 99 | written in. 100 | 101 | @param path path to file 102 | @returns a tuple of the most likely language and the confidence score 103 | """ 104 | if IDENTIFIER is None: 105 | load_model() 106 | return IDENTIFIER.cl_path(path) 107 | 108 | 109 | def rank_path(path): 110 | """ 111 | Convenience method using a global identifier instance with the default 112 | model included in langid.py. Ranks all the languages in the model according 113 | to the likelihood that the file at `path` is written in each language. 114 | 115 | @param path path to file 116 | @returns a list of tuples language and the confidence score, in descending order 117 | """ 118 | if IDENTIFIER is None: 119 | load_model() 120 | return IDENTIFIER.rank_path(path) 121 | 122 | 123 | class LanguageIdentifier: 124 | """ 125 | This class implements the actual language identifier. 
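
    Typical use, mirroring the examples in the readme (a sketch, scores omitted):

        identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True)
        identifier.set_languages(['de', 'en', 'fr'])
        lang, prob = identifier.classify('This is a test.')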
126 | """ 127 | __slots__ = ['nb_ptc', 'nb_pc', 'nb_numfeats', 'nb_classes', 'tk_nextmove', 'tk_output', 128 | 'norm_probs', '__full_model'] 129 | 130 | # new version: speed-up 131 | @classmethod 132 | def from_pickled_model(cls, pickled_file, *args, **kwargs): 133 | # load data 134 | filepath = str(Path(__file__).parent / pickled_file) 135 | with lzma.open(filepath) as filehandle: 136 | nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = pickle.load(filehandle) 137 | nb_numfeats = len(nb_ptc) // len(nb_pc) 138 | 139 | # reconstruct pc and ptc 140 | nb_pc = np.array(nb_pc) 141 | nb_ptc = np.array(nb_ptc).reshape(nb_numfeats, len(nb_pc)) 142 | 143 | return cls(nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output, *args, **kwargs) 144 | 145 | # legacy methods 146 | @classmethod 147 | def from_modelstring(cls, string, *args, **kwargs): 148 | # load data 149 | nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = pickle.loads(bz2.decompress(b64decode(string))) 150 | nb_numfeats = len(nb_ptc) // len(nb_pc) 151 | 152 | # reconstruct pc and ptc 153 | nb_pc = np.array(nb_pc) 154 | nb_ptc = np.array(nb_ptc).reshape(nb_numfeats, len(nb_pc)) 155 | 156 | return cls(nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output, *args, **kwargs) 157 | 158 | @classmethod 159 | def from_modelpath(cls, path, *args, **kwargs): 160 | with open(path, 'rb') as f: 161 | return cls.from_modelstring(f.read(), *args, **kwargs) 162 | 163 | def __init__(self, nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output, 164 | norm_probs=NORM_PROBS): 165 | self.nb_ptc = nb_ptc 166 | self.nb_pc = nb_pc 167 | self.nb_numfeats = nb_numfeats 168 | self.nb_classes = nb_classes 169 | self.tk_nextmove = tk_nextmove 170 | self.tk_output = tk_output 171 | 172 | def apply_norm_probs(pd): 173 | """ 174 | Renormalize log-probs into a proper distribution (sum 1) 175 | The technique for dealing with underflow is described in 176 | http://jblevins.org/log/log-sum-exp 177 | """ 178 | if norm_probs: 179 | # Ignore overflow when computing the exponential. Large values 180 | # in the exp produce a result of inf, which does not affect 181 | # the correctness of the calculation (as 1/x->0 as x->inf). 182 | # On Linux this does not actually trigger a warning, but on 183 | # Windows this causes a RuntimeWarning, so we explicitly 184 | # suppress it. 185 | with np.errstate(over='ignore'): 186 | # legacy formula, there are possibly better alternatives 187 | pd = 1/np.exp(pd[None,:] - pd[:,None]).sum(1) 188 | return pd 189 | 190 | self.norm_probs = apply_norm_probs 191 | 192 | # Maintain a reference to the full model, in case we change our language set 193 | # multiple times. 194 | self.__full_model = nb_ptc, nb_pc, nb_classes 195 | 196 | def set_languages(self, langs=None): 197 | LOGGER.debug("restricting languages to: %s", langs) 198 | 199 | # Unpack the full original model. This is needed in case the language set 200 | # has been previously trimmed, and the new set is not a subset of the current 201 | # set. 202 | nb_ptc, nb_pc, nb_classes = self.__full_model 203 | 204 | if langs is None: 205 | self.nb_classes, self.nb_ptc, self.nb_pc = nb_classes, nb_ptc, nb_pc 206 | 207 | else: 208 | # We were passed a restricted set of languages. Trim the arrays accordingly 209 | # to speed up processing. 
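            # The block below first validates the requested codes, then builds a boolean
            # mask over nb_classes and keeps only the matching columns of nb_ptc and
            # entries of nb_pc.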
210 | for lang in langs: 211 | if lang not in nb_classes: 212 | raise ValueError(f"Unknown language code {lang}") 213 | 214 | subset_mask = np.isin(nb_classes, langs) 215 | self.nb_classes = [c for c in nb_classes if c in langs] 216 | self.nb_ptc = nb_ptc[:, subset_mask] 217 | self.nb_pc = nb_pc[subset_mask] 218 | 219 | def instance2fv(self, text, datatype=DATATYPE): 220 | """ 221 | Map an instance into the feature space of the trained model. 222 | 223 | @param datatype NumPy data type (originally uint32) 224 | """ 225 | # convert to binary if it isn't already the case 226 | if isinstance(text, str): 227 | # fix for surrogates on Windows/NT platforms 228 | text = text.encode('utf8', errors='surrogatepass') 229 | 230 | # Convert the text to a sequence of ascii values and 231 | # Count the number of times we enter each state 232 | state, indexes = 0, [] 233 | extend = indexes.extend 234 | 235 | for letter in text: 236 | state = self.tk_nextmove[(state << 8) + letter] 237 | extend(self.tk_output.get(state, [])) 238 | 239 | # datatype: consider that less feature counts are going to be needed 240 | arr = np.zeros(self.nb_numfeats, dtype=datatype) 241 | # Update all the productions corresponding to the state 242 | for index, value in Counter(indexes).items(): 243 | arr[index] = value 244 | 245 | return arr 246 | 247 | def nb_classprobs(self, fv): 248 | # compute the partial log-probability of the document given each class 249 | pdc = np.dot(fv, self.nb_ptc) # fv @ self.nb_ptc 250 | # compute the partial log-probability of the document in each class 251 | return pdc + self.nb_pc 252 | 253 | def classify(self, text, datatype=DATATYPE): 254 | """ 255 | Classify an instance. 256 | """ 257 | fv = self.instance2fv(text, datatype=datatype) 258 | probs = self.norm_probs(self.nb_classprobs(fv)) 259 | cl = np.argmax(probs) 260 | return self.nb_classes[cl], probs[cl] 261 | 262 | def rank(self, text): 263 | """ 264 | Return a list of languages in order of likelihood. 265 | """ 266 | fv = self.instance2fv(text) 267 | probs = self.norm_probs(self.nb_classprobs(fv)) 268 | return sorted(zip(self.nb_classes, probs), key=itemgetter(1), reverse=True) 269 | 270 | def cl_path(self, path): 271 | """ 272 | Classify a file at a given path 273 | """ 274 | with open(path, 'rb') as f: 275 | retval = self.classify(f.read()) 276 | return path, retval 277 | 278 | def rank_path(self, path): 279 | """ 280 | Class ranking for a file at a given path 281 | """ 282 | with open(path, 'rb') as f: 283 | retval = self.rank(f.read()) 284 | return path, retval 285 | 286 | 287 | class NumpyEncoder(json.JSONEncoder): 288 | """ Custom encoder for numpy data types """ 289 | def default(self, o): 290 | if isinstance(o, np.float32): 291 | return float(o) # Convert float32 to native float 292 | if isinstance(o, np.ndarray): 293 | return o.tolist() # Convert arrays to list 294 | return json.JSONEncoder.default(self, o) 295 | 296 | 297 | METHODS = { 298 | 'detect': lambda data: {'language': classify(data)[0], 'confidence': classify(data)[1]}, 299 | 'rank': lambda data: rank(data) 300 | } 301 | 302 | 303 | def application(environ, start_response): 304 | """ 305 | WSGI-compatible langid web service. 
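
    Exposes two paths, /detect and /rank. The text to classify is read from the
    'q' parameter (GET query string or POST form data) and otherwise from the raw
    request body (PUT, or POST without a 'q' field); responses are returned as JSON.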
306 | """ 307 | from wsgiref.util import shift_path_info 308 | try: 309 | path = shift_path_info(environ) 310 | except IndexError: 311 | # Catch shift_path_info's failure to handle empty paths properly 312 | path = '' 313 | 314 | if path not in METHODS: 315 | return _return_response(start_response, 404, None, 'Not found') 316 | 317 | data = _get_data(environ) 318 | if data is None: 319 | if environ['REQUEST_METHOD'] == 'GET' and 'QUERY_STRING' not in environ: 320 | return _return_response(start_response, 400, None, 'Missing query string') 321 | return _return_response(start_response, 405, None, f"{environ['REQUEST_METHOD']} not allowed") 322 | 323 | response_data = METHODS[path](data) 324 | return _return_response(start_response, 200, response_data, None) 325 | 326 | 327 | def _get_data(environ): 328 | if environ['REQUEST_METHOD'] in ['PUT', 'POST']: 329 | data = environ['wsgi.input'].read(int(environ['CONTENT_LENGTH'])) 330 | if environ['REQUEST_METHOD'] == 'POST': 331 | try: 332 | data = parse_qs(data)['q'][0] 333 | except KeyError: 334 | pass 335 | return data 336 | if environ['REQUEST_METHOD'] == 'GET': 337 | try: 338 | return parse_qs(environ['QUERY_STRING'])['q'][0] 339 | except KeyError: 340 | pass 341 | return None 342 | 343 | 344 | STATUS_MESSAGES = { 345 | 200: "OK", 346 | 404: "Not Found", 347 | 405: "Method Not Allowed" 348 | } 349 | 350 | 351 | def _return_response(start_response, status_code, response_data, response_details): 352 | status = f"{status_code} {STATUS_MESSAGES.get(status_code, 'Unknown Status')}" 353 | response = { 354 | 'responseData': response_data, 355 | 'responseStatus': status_code, 356 | 'responseDetails': response_details, 357 | } 358 | headers = [('Content-type', 'text/javascript; charset=utf-8')] 359 | start_response(status, headers) 360 | return [json.dumps(response, cls=NumpyEncoder).encode('utf-8')] 361 | 362 | 363 | def main(): 364 | 365 | # lazy imports 366 | import argparse 367 | import sys 368 | 369 | # parse arguments 370 | parser = argparse.ArgumentParser() 371 | parser.add_argument('-s', '--serve', action='store_true', default=False, dest='serve', help='launch web service') 372 | parser.add_argument('--host', default=None, dest='host', help='host/ip to bind to') 373 | parser.add_argument('--port', default=9008, dest='port', help='port to listen on') 374 | parser.add_argument('-v', action='count', dest='verbosity', help='increase verbosity (repeat for greater effect)') 375 | parser.add_argument('-m', dest='model', help='load model from file') 376 | parser.add_argument('-l', '--langs', dest='langs', help='comma-separated set of target ISO639 language codes (e.g en,de)') 377 | parser.add_argument('-r', '--remote', action="store_true", default=False, help='auto-detect IP address for remote access') 378 | parser.add_argument('-b', '--batch', action="store_true", default=False, help='specify a list of files on the command line') 379 | parser.add_argument('-d', '--dist', action='store_true', default=False, help='show full distribution over languages') 380 | parser.add_argument('-u', '--url', help='langid of URL') 381 | parser.add_argument('--line', action="store_true", default=False, help='process pipes line-by-line rather than as a document') 382 | parser.add_argument('-n', '--normalize', action='store_true', default=False, help='normalize confidence scores to probability values') 383 | options = parser.parse_args() 384 | 385 | if options.verbosity: 386 | logging.basicConfig(level=max((5-options.verbosity)*10, 0)) 387 | else: 388 | 
logging.basicConfig() 389 | 390 | if options.batch and options.serve: 391 | parser.error("cannot specify both batch and serve at the same time") 392 | 393 | # unpack a model 394 | global IDENTIFIER 395 | 396 | if options.model: 397 | try: 398 | IDENTIFIER = LanguageIdentifier.from_modelpath(options.model, norm_probs=options.normalize) 399 | LOGGER.info("Using external model: %s", options.model) 400 | except IOError as e: 401 | LOGGER.warning("Failed to load %s: %s", options.model, e) 402 | 403 | if IDENTIFIER is None: 404 | IDENTIFIER = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=options.normalize) 405 | LOGGER.info("Using internal model") 406 | 407 | if options.langs: 408 | langs = options.langs.split(",") 409 | IDENTIFIER.set_languages(langs) 410 | 411 | def _process(text): 412 | """ 413 | Set up a local function to do output, configured according to our settings. 414 | """ 415 | return IDENTIFIER.rank(text) if options.dist else IDENTIFIER.classify(text) 416 | 417 | if options.url: 418 | from urllib.request import urlopen 419 | with urlopen(options.url) as url: 420 | text = url.read() 421 | output = _process(text) 422 | print(options.url, len(text), output) 423 | 424 | elif options.serve: 425 | import socket 426 | from wsgiref.simple_server import make_server 427 | 428 | # from http://stackoverflow.com/questions/166506/finding-local-ip-addresses-in-python 429 | if options.remote and options.host is None: 430 | # resolve the external ip address 431 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 432 | s.connect(("google.com", 80)) 433 | hostname = s.getsockname()[0] 434 | elif options.host is None: 435 | # resolve the local hostname 436 | hostname = socket.gethostbyname(socket.gethostname()) 437 | else: 438 | hostname = options.host 439 | 440 | print(f"Listening on {hostname}:%{options.port}") 441 | print("Press Ctrl+C to exit") 442 | httpd = make_server(hostname, int(options.port), application) 443 | try: 444 | httpd.serve_forever() 445 | except KeyboardInterrupt: 446 | pass 447 | 448 | elif options.batch: 449 | # Start in batch mode - interpret input as paths rather than content 450 | # to classify. 
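        # Paths are read from stdin (one per line, e.g. piped from `find`), classified
        # in parallel by a multiprocessing Pool, and the results are written to stdout
        # as CSV rows.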
451 | import csv 452 | from multiprocessing import Pool 453 | 454 | def generate_paths(): 455 | for line in sys.stdin: 456 | path = line.strip() 457 | if path and Path.is_file(path): 458 | yield path 459 | 460 | writer = csv.writer(sys.stdout) 461 | with Pool() as pool: 462 | if options.dist: 463 | writer.writerow(['path'] + IDENTIFIER.nb_classes) 464 | for path, ranking in pool.imap_unordered(rank_path, generate_paths()): 465 | ranking = dict(ranking) 466 | row = [path] + [ranking[c] for c in IDENTIFIER.nb_classes] 467 | writer.writerow(row) 468 | else: 469 | for path, (lang, conf) in pool.imap_unordered(cl_path, generate_paths()): 470 | writer.writerow((path, lang, conf)) 471 | else: 472 | if sys.stdin.isatty(): 473 | # Interactive mode 474 | while True: 475 | try: 476 | print(">>>", end=' ') 477 | text = input() 478 | except Exception as e: 479 | print(e) 480 | break 481 | print(_process(text)) 482 | else: 483 | # Redirected 484 | if options.line: 485 | for line in sys.stdin: 486 | print(_process(line)) 487 | else: 488 | print(_process(sys.stdin.read())) 489 | 490 | 491 | if __name__ == "__main__": 492 | main() 493 | -------------------------------------------------------------------------------- /py3langid/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbar/py3langid/812f2055f74c35dea298f30b434644062d9289be/py3langid/tools/__init__.py -------------------------------------------------------------------------------- /py3langid/tools/featWeights.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tabulate feature weight data into a single CSV for 3 | further analysis using other tools. This produces 4 | a CSV with header. The features themselves are not 5 | included. 
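
Typical invocation, following the argument parser defined below:

    python featWeights.py MODEL_DIR output.csv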
6 | 7 | Marco Lui, February 2013 8 | """ 9 | 10 | import argparse, os, csv, sys 11 | import numpy as np 12 | import bz2, base64 13 | from cPickle import loads 14 | 15 | from langid.train.common import read_weights, read_features 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('model', metavar="MODEL_DIR", help="path to langid.py training model dir") 20 | parser.add_argument('output', metavar="OUTPUT", help = "write to OUTPUT") 21 | parser.add_argument('-f','--features', metavar="FILE", help = 'only output features from FILE') 22 | parser.add_argument('--raw', action='store_true', help="include raw features") 23 | parser.add_argument('--bin', action='store_true', help="include ig for lang-bin") 24 | args = parser.parse_args() 25 | 26 | def model_file(name): 27 | return os.path.join(args.model, name) 28 | 29 | # Try to determine the set of features to consider 30 | if args.features: 31 | # Use a pre-determined feature list 32 | print >>sys.stderr, "using user-supplied feature list:", args.features 33 | feats = read_features(args.features) 34 | elif os.path.exists(model_file('LDfeats')): 35 | # Use LDfeats 36 | print >>sys.stderr, "using LDfeats" 37 | feats = read_features(model_file('LDfeats')) 38 | else: 39 | raise ValueError("no suitable feature list") 40 | 41 | print >>sys.stderr, "considering {0} features".format(len(feats)) 42 | 43 | records = dict( (k, {}) for k in feats ) 44 | headers = [] 45 | 46 | headers.append('len') 47 | for k in feats: 48 | records[k]['len'] = len(k) 49 | 50 | 51 | # Document Frequency 52 | if os.path.exists(model_file('DF_all')): 53 | print >>sys.stderr, "found weights for document frequency" 54 | w = read_weights(model_file('DF_all')) 55 | headers.append('DF') 56 | for k in feats: 57 | records[k]['DF'] = w[k][0] 58 | 59 | # IG weights for the all-languages event 60 | if os.path.exists(model_file('IGweights.lang')): 61 | print >>sys.stderr, "found weights for lang" 62 | w = read_weights(model_file('IGweights.lang')) 63 | headers.append('IGlang') 64 | for k in feats: 65 | records[k]['IGlang'] = w[k][0] 66 | 67 | # IG weights for the all-domains event 68 | if os.path.exists(model_file('IGweights.domain')): 69 | print >>sys.stderr, "found weights for domain" 70 | w = read_weights(model_file('IGweights.domain')) 71 | headers.append('IGdomain') 72 | for k in feats: 73 | records[k]['IGdomain'] = w[k][0] 74 | 75 | # IG weights for language-binarized 76 | if args.bin and os.path.exists(model_file('IGweights.lang.bin')) and os.path.exists(model_file('lang_index')): 77 | print >>sys.stderr, "found weights for lang.bin" 78 | w = read_weights(model_file('IGweights.lang.bin')) 79 | 80 | # find the list of langs in-order 81 | with open(os.path.join(args.model, "lang_index")) as f: 82 | reader = csv.reader(f) 83 | langs = zip(*reader)[0] 84 | 85 | r_h = ['IGlang.bin.{0}'.format(l) for l in langs] 86 | headers.extend( r_h ) 87 | for k in feats: 88 | records[k].update( dict(zip(r_h, w[k])) ) 89 | 90 | if os.path.exists(model_file('LDfeats.scanner')) and os.path.exists(model_file('model')): 91 | print >>sys.stderr, "found weights for P(t|c)" 92 | with open(model_file('model')) as f: 93 | model = loads(bz2.decompress(base64.b64decode(f.read()))) 94 | with open(model_file('LDfeats.scanner')) as f: 95 | _, _, nb_feats = loads(f.read()) 96 | nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = model 97 | nb_numfeats = len(nb_ptc) / len(nb_pc) 98 | nb_ptc = np.array(nb_ptc).reshape(len(nb_ptc)/len(nb_pc), len(nb_pc)) 99 | 100 | # 
Normalize to 1 on the term axis 101 | for i in range(nb_ptc.shape[1]): 102 | nb_ptc[:,i] = (1/np.exp(nb_ptc[:,i][None,:] - nb_ptc[:,i][:,None]).sum(1)) 103 | w = dict(zip(nb_feats, nb_ptc)) 104 | 105 | r_h = ['ptc.{0}'.format(l) for l in nb_classes] 106 | headers.extend( r_h ) 107 | for k in feats: 108 | records[k].update( dict(zip(r_h, w[k])) ) 109 | 110 | if args.raw: 111 | headers.append('feat') 112 | for k in feats: 113 | records[k]['feat'] = k 114 | 115 | 116 | 117 | print >>sys.stderr, "writing output" 118 | with open(args.output, 'w') as f: 119 | writer = csv.DictWriter(f,headers) 120 | writer.writeheader() 121 | writer.writerows(records.values()) 122 | 123 | print >>sys.stderr, "done" 124 | -------------------------------------------------------------------------------- /py3langid/tools/printfeats.py: -------------------------------------------------------------------------------- 1 | """ 2 | Print features out in order of their weights 3 | 4 | Marco Lui, November 2013 5 | """ 6 | 7 | import argparse, os, csv, sys 8 | 9 | from langid.train.common import read_weights 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('file', help="file to read") 14 | parser.add_argument('-c','--column',help="project a specific column", type=int) 15 | parser.add_argument('-n','--number',help="output top N features", type=int) 16 | parser.add_argument('-v','--value',help="output the value used for ranking", action="store_true") 17 | parser.add_argument('-p','--printfeat',help="print the actual feature (default is to print repr)", action="store_true") 18 | parser.add_argument('--output', "-o", default=sys.stdout, type=argparse.FileType('w'), help = "write to OUTPUT") 19 | args = parser.parse_args() 20 | 21 | w = read_weights(args.file) 22 | n = args.number if args.number is not None else len(w) 23 | 24 | def show(feat): 25 | if args.printfeat: 26 | return feat 27 | else: 28 | return repr(feat) 29 | 30 | if args.column is not None: 31 | for key in sorted(w, key=lambda x:w[x][args.column], reverse=True)[:n]: 32 | if args.value: 33 | args.output.write("{0},{1}\n".format(show(key),w[key][args.column])) 34 | else: 35 | args.output.write("{0}\n".format(show(key))) 36 | else: 37 | for key in sorted(w, key=w.get, reverse=True)[:n]: 38 | if args.value: 39 | args.output.write("{0},{1}\n".format(show(key),w[key])) 40 | else: 41 | args.output.write("{0}\n".format(show(key))) 42 | -------------------------------------------------------------------------------- /py3langid/train/BLweight.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementing the "blacklist" feature weighting metric proposed by 3 | Tiedemann & Ljubesic. 
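The column-wise loop in featWeights.py above (and the equivalent loop in BLweight.py below) turns log counts into probabilities using the identity p_i = 1 / sum_j exp(x_j - x_i), which avoids exponentiating large log values directly. A small self-contained check of that identity, with toy numbers:

import numpy as np

x = np.log(np.array([3.0, 5.0, 2.0]))                   # a column of log counts
naive = np.exp(x) / np.exp(x).sum()                     # direct normalisation
stable = 1.0 / np.exp(x[None, :] - x[:, None]).sum(1)   # form used in the loop above
assert np.allclose(naive, stable)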
4 | 5 | Marco Lui, February 2013 6 | """ 7 | 8 | NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation 9 | CHUNKSIZE = 50 # maximum size of chunk (number of files tokenized - less = less memory use) 10 | 11 | import argparse 12 | import os 13 | 14 | import numpy as np 15 | 16 | from .common import read_features, makedir, write_weights 17 | from .scanner import build_scanner 18 | from .index import CorpusIndexer 19 | from .NBtrain import generate_cm, learn_ptc 20 | 21 | 22 | if __name__ == "__main__": 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("-o","--output", metavar="DIR", help = "write weights to DIR") 26 | parser.add_argument('-f','--features', metavar="FILE", help = 'only output features from FILE') 27 | parser.add_argument("-t", "--temp", metavar='TEMP_DIR', help="store buckets in TEMP_DIR instead of in MODEL_DIR/buckets") 28 | parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no paralleization)") 29 | parser.add_argument("-m","--model", help="save output to MODEL_DIR", metavar="MODEL_DIR") 30 | parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS) 31 | parser.add_argument("--chunksize", type=int, help="max chunk size (number of files to tokenize at a time - smaller should reduce memory use)", default=CHUNKSIZE) 32 | parser.add_argument("--no_norm", default=False, action="store_true", help="do not normalize difference in p(t|C) by sum p(t|C)") 33 | parser.add_argument("corpus", help="read corpus from CORPUS_DIR", metavar="CORPUS_DIR") 34 | parser.add_argument("pairs", metavar='LANG_PAIR', nargs="*", help="language pairs to compute BL weights for") 35 | args = parser.parse_args() 36 | 37 | # Work out where our model directory is 38 | corpus_name = os.path.basename(args.corpus) 39 | if args.model: 40 | model_dir = args.model 41 | else: 42 | model_dir = os.path.join('.', corpus_name+'.model') 43 | 44 | def m_path(name): 45 | return os.path.join(model_dir, name) 46 | 47 | # Try to determine the set of features to consider 48 | if args.features: 49 | # Use a pre-determined feature list 50 | feat_path = args.features 51 | elif os.path.exists(m_path('DFfeats')): 52 | # Use LDfeats 53 | feat_path = m_path('DFfeats') 54 | else: 55 | raise ValueError("no suitable feature list") 56 | 57 | # Where temp files go 58 | if args.temp: 59 | buckets_dir = args.temp 60 | else: 61 | buckets_dir = m_path('buckets') 62 | makedir(buckets_dir) 63 | 64 | all_langs = set() 65 | pairs = [] 66 | for p in args.pairs: 67 | try: 68 | lang1, lang2 = p.split(',') 69 | except ValueError: 70 | # Did not unpack to two values 71 | parser.error("{0} is not a lang-pair".format(p)) 72 | all_langs.add(lang1) 73 | all_langs.add(lang2) 74 | pairs.append((lang1, lang2)) 75 | 76 | if args.output: 77 | makedir(args.output) 78 | out_dir = args.output 79 | else: 80 | out_dir = model_dir 81 | 82 | langs = sorted(all_langs) 83 | 84 | # display paths 85 | print("languages({1}): {0}".format(langs, len(langs))) 86 | print("model path:", model_dir) 87 | print("feature path:", feat_path) 88 | print("output path:", out_dir) 89 | print("temp (buckets) path:", buckets_dir) 90 | 91 | feats = read_features(feat_path) 92 | 93 | indexer = CorpusIndexer(args.corpus, langs = langs) 94 | items = [ (d,l,p) for (d,l,n,p) in indexer.items ] 95 | if len(items) == 0: 96 | raise ValueError("found no files!") 97 | 98 | print("will process {0} features across {1} paths".format(len(feats), len(items))) 99 | 
print("will process {0} features across {1} paths".format(len(feats), len(items))) 100 | 101 | # produce a scanner over all the features 102 | tk_nextmove, tk_output = build_scanner(feats) 103 | 104 | # Generate a class map over all the languages we are dealing with 105 | cm = generate_cm([ (l,p) for d,l,p in items], len(langs)) 106 | 107 | # Compute P(t|C) 108 | print("learning P(t|C)") 109 | paths = zip(*items)[2] 110 | nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, buckets_dir, args) 111 | nb_ptc = np.array(nb_ptc).reshape(len(feats), len(langs)) 112 | 113 | # Normalize to 1 on the term axis 114 | print("renormalizing P(t|C)") 115 | for i in range(nb_ptc.shape[1]): 116 | # had to de-vectorize this due to memory consumption 117 | newval = np.empty_like(nb_ptc[:,i]) 118 | for j in range(newval.shape[0]): 119 | newval[j] = (1/np.exp(nb_ptc[:,i] - nb_ptc[j,i]).sum()) 120 | nb_ptc[:,i] = newval 121 | assert (1.0 - newval.sum()) < 0.0001 122 | 123 | print("doing per-pair output") 124 | for lang1, lang2 in pairs: 125 | # Where to do output 126 | if args.no_norm: 127 | weights_path = os.path.join(out_dir, ('BLfeats.no_norm.{0}.{1}'.format(lang1, lang2))) 128 | else: 129 | weights_path = os.path.join(out_dir, ('BLfeats.{0}.{1}'.format(lang1, lang2))) 130 | 131 | i1 = indexer.lang_index[lang1] 132 | i2 = indexer.lang_index[lang2] 133 | 134 | w = dict(zip(feats, np.abs((nb_ptc[:,i1] - nb_ptc[:,i2]) / (nb_ptc.sum(1) if not args.no_norm else 1)))) 135 | write_weights(w, weights_path) 136 | print("wrote weights to {0}".format(weights_path)) 137 | -------------------------------------------------------------------------------- /py3langid/train/DFfeatureselect.py: -------------------------------------------------------------------------------- 1 | """ 2 | DFfeatureselect.py - 3 | First step in the LD feature selection process, select features based on document 4 | frequency. 5 | 6 | Marco Lui January 2013 7 | 8 | Copyright 2013 Marco Lui . All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without modification, are 11 | permitted provided that the following conditions are met: 12 | 13 | 1. Redistributions of source code must retain the above copyright notice, this list of 14 | conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 17 | of conditions and the following disclaimer in the documentation and/or other materials 18 | provided with the distribution. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 21 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 22 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 28 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | The views and conclusions contained in the software and documentation are those of the 31 | authors and should not be interpreted as representing official policies, either expressed 32 | or implied, of the copyright holder. 
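For each requested language pair, BLweight.py above reduces the P(t|C) matrix to a single "blacklist" weight per feature: the absolute difference between the two language columns, by default divided by the feature's total mass across all languages. A toy version with invented numbers:

import numpy as np

# rows = features, columns = languages; each column sums to 1
nb_ptc = np.array([[0.7, 0.1, 0.3],
                   [0.2, 0.6, 0.3],
                   [0.1, 0.3, 0.4]])
i1, i2 = 0, 1                                  # the language pair
raw = np.abs(nb_ptc[:, i1] - nb_ptc[:, i2])    # as with --no_norm
normed = raw / nb_ptc.sum(1)                   # default behaviour
print(raw, normed)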
33 | """ 34 | 35 | ###### 36 | # Default values 37 | # Can be overriden with command-line options 38 | ###### 39 | MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider 40 | TOKENS_PER_ORDER = 15000 # number of tokens to consider for each order 41 | 42 | import argparse 43 | import os 44 | import marshal 45 | 46 | from collections import defaultdict 47 | 48 | from .common import unmarshal_iter, MapPool, write_features, write_weights 49 | 50 | 51 | def pass_sum_df(bucket): 52 | """ 53 | Compute document frequency (df) by summing up (key,domain,count) triplets 54 | over all domains. 55 | """ 56 | doc_count = defaultdict(int) 57 | count = 0 58 | with open(os.path.join(bucket, "docfreq"),'wb') as docfreq: 59 | for path in os.listdir(bucket): 60 | # We use the domain buckets as there are usually less domains 61 | if path.endswith('.domain'): 62 | for key, _, value in unmarshal_iter(os.path.join(bucket,path)): 63 | doc_count[key] += value 64 | count += 1 65 | 66 | for item in doc_count.iteritems(): 67 | docfreq.write(marshal.dumps(item)) 68 | return count 69 | 70 | def tally(bucketlist, jobs=None): 71 | """ 72 | Sum up the counts for each feature across all buckets. This 73 | builds a full mapping of feature->count. This is stored in-memory 74 | and thus could be an issue for large feature sets. 75 | """ 76 | 77 | with MapPool(jobs) as f: 78 | pass_sum_df_out = f(pass_sum_df, bucketlist) 79 | 80 | for i, keycount in enumerate(pass_sum_df_out): 81 | print("processed bucket (%d/%d) [%d keys]" % (i+1, len(bucketlist), keycount)) 82 | 83 | # build the global term->df mapping 84 | doc_count = {} 85 | for bucket in bucketlist: 86 | for key, value in unmarshal_iter(os.path.join(bucket, 'docfreq')): 87 | doc_count[key] = value 88 | 89 | return doc_count 90 | 91 | 92 | 93 | def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_PER_ORDER): 94 | """ 95 | DF feature selection for byte-ngram tokenization 96 | """ 97 | # Work out the set of features to compute IG 98 | features = set() 99 | for i in range(1, max_order+1): 100 | d = dict( (k, doc_count[k]) for k in doc_count if len(k) == i) 101 | features |= set(sorted(d, key=d.get, reverse=True)[:tokens_per_order]) 102 | features = sorted(features) 103 | 104 | return features 105 | 106 | 107 | 108 | if __name__ == "__main__": 109 | parser = argparse.ArgumentParser() 110 | parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no paralleization)") 111 | parser.add_argument("-f","--features", metavar='FEATURE_FILE', help="output features to FEATURE_FILE") 112 | parser.add_argument("--tokens_per_order", metavar='N', type=int, help="consider top N tokens per ngram order") 113 | parser.add_argument("--tokens", metavar='N', type=int, help="consider top N tokens") 114 | parser.add_argument("--max_order", type=int, help="highest n-gram order to use", default=MAX_NGRAM_ORDER) 115 | parser.add_argument("--doc_count", nargs='?', const=True, metavar='DOC_COUNT_PATH', help="output full mapping of feature->frequency to DOC_COUNT_PATH") 116 | parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR") 117 | 118 | args = parser.parse_args() 119 | 120 | if args.tokens and args.tokens_per_order: 121 | parser.error("--tokens and --tokens_per_order are mutually exclusive") 122 | 123 | # if neither --tokens nor --tokens_per_order is given, default behaviour is tokens_per_order 124 | if not(args.tokens) and not(args.tokens_per_order): 125 | args.tokens_per_order = 
TOKENS_PER_ORDER 126 | 127 | if args.features: 128 | feature_path = args.features 129 | else: 130 | feature_path = os.path.join(args.model, 'DFfeats') 131 | 132 | bucketlist_path = os.path.join(args.model, 'bucketlist') 133 | 134 | # display paths 135 | print("buckets path:", bucketlist_path) 136 | print("features output path:", feature_path) 137 | if args.tokens_per_order: 138 | print("max ngram order:", args.max_order) 139 | print("tokens per order:", args.tokens_per_order) 140 | else: 141 | print("tokens:", args.tokens) 142 | 143 | with open(bucketlist_path) as f: 144 | bucketlist = map(str.strip, f) 145 | 146 | doc_count = tally(bucketlist, args.jobs) 147 | print("unique features:", len(doc_count)) 148 | if args.doc_count: 149 | # The constant true is used to indicate output to default location 150 | doc_count_path = os.path.join(args.model, 'DF_all') if args.doc_count == True else args.doc_count 151 | write_weights(doc_count, doc_count_path) 152 | print("wrote DF counts for all features to:", doc_count_path) 153 | 154 | if args.tokens_per_order: 155 | # Choose a number of features for each length of token 156 | feats = ngram_select(doc_count, args.max_order, args.tokens_per_order) 157 | else: 158 | # Choose a number of features overall 159 | feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] ) 160 | 161 | print("selected features: ", len(feats)) 162 | 163 | write_features(feats, feature_path) 164 | print('wrote features to "%s"' % feature_path) 165 | -------------------------------------------------------------------------------- /py3langid/train/IGweight.py: -------------------------------------------------------------------------------- 1 | """ 2 | IGWeight.py - 3 | Compute IG Weights given a set of tokenized buckets and a feature set 4 | 5 | Marco Lui, January 2013 6 | 7 | Based on research by Marco Lui and Tim Baldwin. 8 | 9 | Copyright 2013 Marco Lui . All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without modification, are 12 | permitted provided that the following conditions are met: 13 | 14 | 1. Redistributions of source code must retain the above copyright notice, this list of 15 | conditions and the following disclaimer. 16 | 17 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 18 | of conditions and the following disclaimer in the documentation and/or other materials 19 | provided with the distribution. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 22 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 23 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 24 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 27 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 29 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | The views and conclusions contained in the software and documentation are those of the 32 | authors and should not be interpreted as representing official policies, either expressed 33 | or implied, of the copyright holder. 
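ngram_select() in DFfeatureselect.py above keeps the top-N most document-frequent features separately for each n-gram length, so longer byte n-grams are not crowded out by shorter, inherently more frequent ones. A toy run of the same selection rule, with invented counts:

doc_count = {b'a': 50, b'b': 40, b'ab': 30, b'ba': 10, b'abc': 5}
max_order, tokens_per_order = 3, 1

features = set()
for i in range(1, max_order + 1):
    by_len = {k: v for k, v in doc_count.items() if len(k) == i}
    features |= set(sorted(by_len, key=by_len.get, reverse=True)[:tokens_per_order])

print(sorted(features))   # [b'a', b'ab', b'abc'] -- one feature kept per order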
34 | """ 35 | 36 | import argparse 37 | import csv 38 | import os 39 | 40 | from collections import defaultdict 41 | 42 | import numpy 43 | 44 | from .common import unmarshal_iter, MapPool, Enumerator, write_weights, read_features 45 | 46 | 47 | def entropy(v, axis=0): 48 | """ 49 | Optimized implementation of entropy. This version is faster than that in 50 | scipy.stats.distributions, particularly over long vectors. 51 | """ 52 | v = numpy.array(v, dtype='float') 53 | s = numpy.sum(v, axis=axis) 54 | with numpy.errstate(divide='ignore', invalid='ignore'): 55 | rhs = numpy.nansum(v * numpy.log(v), axis=axis) / s 56 | r = numpy.log(s) - rhs 57 | # Where dealing with binarized events, it is possible that an event always 58 | # occurs and thus has 0 information. In this case, the negative class 59 | # will have frequency 0, resulting in log(0) being computed as nan. 60 | # We replace these nans with 0 61 | nan_index = numpy.isnan(rhs) 62 | if nan_index.any(): 63 | r[nan_index] = 0 64 | return r 65 | 66 | def setup_pass_IG(features, dist, binarize, suffix): 67 | """ 68 | @param features the list of features to compute IG for 69 | @param dist the background distribution 70 | @param binarize (boolean) compute IG binarized per-class if True 71 | @param suffix of files in bucketdir to process 72 | """ 73 | global __features, __dist, __binarize, __suffix 74 | __features = features 75 | __dist = dist 76 | __binarize = binarize 77 | __suffix = suffix 78 | 79 | def pass_IG(bucket): 80 | """ 81 | In this pass we compute the information gain for each feature, binarized 82 | with respect to each language as well as unified over the set of all 83 | classes. 84 | 85 | @global __features the list of features to compute IG for 86 | @global __dist the background distribution 87 | @global __binarize (boolean) compute IG binarized per-class if True 88 | @global __suffix of files in bucketdir to process 89 | @param bucket the bucket file to process. It is assumed to contain marshalled (term, event_id, count) triplets. 90 | """ 91 | global __features, __dist, __binarize, __suffix 92 | 93 | # We first tally the per-event frequency of each 94 | # term in our selected feature set. 
95 | term_freq = defaultdict(lambda: defaultdict(int)) 96 | term_index = defaultdict(Enumerator()) 97 | 98 | for path in os.listdir(bucket): 99 | if path.endswith(__suffix): 100 | for key, event_id, count in unmarshal_iter(os.path.join(bucket,path)): 101 | # Select only our listed features 102 | if key in __features: 103 | term_index[key] 104 | term_freq[key][event_id] += count 105 | 106 | num_term = len(term_index) 107 | num_event = len(__dist) 108 | 109 | cm_pos = numpy.zeros((num_term, num_event), dtype='int') 110 | 111 | for term,term_id in term_index.iteritems(): 112 | # update event matrix 113 | freq = term_freq[term] 114 | for event_id, count in freq.iteritems(): 115 | cm_pos[term_id, event_id] = count 116 | cm_neg = __dist - cm_pos 117 | cm = numpy.dstack((cm_neg, cm_pos)) 118 | 119 | if not __binarize: 120 | # non-binarized event space 121 | x = cm.sum(axis=1) 122 | term_w = x / x.sum(axis=1)[:, None].astype(float) 123 | 124 | # Entropy of the term-present/term-absent events 125 | e = entropy(cm, axis=1) 126 | 127 | # Information Gain with respect to the set of events 128 | ig = entropy(__dist) - (term_w * e).sum(axis=1) 129 | 130 | else: 131 | # binarized event space 132 | # Compute IG binarized with respect to each event 133 | ig = list() 134 | for event_id in range(num_event): 135 | num_doc = __dist.sum() 136 | prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]), dtype=float) / num_doc 137 | 138 | cm_bin = numpy.zeros((num_term, 2, 2), dtype=int) # (term, p(term), p(lang|term)) 139 | cm_bin[:,0,:] = cm.sum(axis=1) - cm[:,event_id,:] 140 | cm_bin[:,1,:] = cm[:,event_id,:] 141 | 142 | e = entropy(cm_bin, axis=1) 143 | x = cm_bin.sum(axis=1) 144 | term_w = x / x.sum(axis=1)[:, None].astype(float) 145 | 146 | ig.append( entropy(prior) - (term_w * e).sum(axis=1) ) 147 | ig = numpy.vstack(ig) 148 | 149 | terms = sorted(term_index, key=term_index.get) 150 | return terms, ig 151 | 152 | 153 | def compute_IG(bucketlist, features, dist, binarize, suffix, job_count=None): 154 | pass_IG_args = (features, dist, binarize, suffix) 155 | 156 | num_chunk = len(bucketlist) 157 | weights = [] 158 | terms = [] 159 | 160 | with MapPool(job_count, setup_pass_IG, pass_IG_args) as f: 161 | pass_IG_out = f(pass_IG, bucketlist) 162 | 163 | for i, (t, w) in enumerate(pass_IG_out): 164 | weights.append(w) 165 | terms.extend(t) 166 | print("processed chunk (%d/%d) [%d terms]" % (i+1, num_chunk, len(t))) 167 | 168 | if binarize: 169 | weights = numpy.hstack(weights).transpose() 170 | else: 171 | weights = numpy.concatenate(weights) 172 | terms = ["".join(t) for t in terms] 173 | 174 | return zip(terms, weights) 175 | 176 | def read_dist(path): 177 | """ 178 | Read the distribution from a file containing item, count pairs. 
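The quantity computed in pass_IG() above is ordinary information gain: the entropy of the background class distribution minus the expected entropy after splitting documents on feature presence. A worked single-feature example using the same entropy() formulation (counts invented):

import numpy as np

def entropy(v, axis=0):
    # same formulation as above: entropy (in nats) of a vector of counts
    v = np.asarray(v, dtype=float)
    s = v.sum(axis=axis)
    with np.errstate(divide='ignore', invalid='ignore'):
        rhs = np.nansum(v * np.log(v), axis=axis) / s
    return np.log(s) - rhs

dist = np.array([40, 60])                  # documents per language (background)
present = np.array([30, 10])               # documents per language containing the feature
cm = np.stack([dist - present, present])   # rows: feature absent / feature present
weights = cm.sum(axis=1) / cm.sum()        # P(absent), P(present)
ig = entropy(dist) - (weights * entropy(cm, axis=1)).sum()
print(round(float(ig), 3))                 # roughly 0.18 nats for these counts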
179 | @param path path to read form 180 | """ 181 | with open(path) as f: 182 | reader = csv.reader(f) 183 | return numpy.array(zip(*reader)[1], dtype=int) 184 | 185 | if __name__ == "__main__": 186 | parser = argparse.ArgumentParser() 187 | parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no paralleization)") 188 | parser.add_argument("-f","--features", metavar='FEATURE_FILE', help="read features from FEATURE_FILE") 189 | parser.add_argument("-w","--weights", metavar='WEIGHTS', help="output weights to WEIGHTS") 190 | parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR") 191 | parser.add_argument("-d","--domain", action="store_true", default=False, help="compute IG with respect to domain") 192 | parser.add_argument("-b","--binarize", action="store_true", default=False, help="binarize the event space in the IG computation") 193 | parser.add_argument("-l","--lang", action="store_true", default=False, help="compute IG with respect to language") 194 | 195 | args = parser.parse_args() 196 | if not(args.domain or args.lang) or (args.domain and args.lang): 197 | parser.error("exactly one of domain(-d) or language (-l) must be specified") 198 | 199 | if args.features: 200 | feature_path = args.features 201 | else: 202 | feature_path = os.path.join(args.model, 'DFfeats') 203 | 204 | bucketlist_path = os.path.join(args.model, 'bucketlist') 205 | 206 | if not os.path.exists(feature_path): 207 | parser.error('{0} does not exist'.format(feature_path)) 208 | 209 | bucketlist = map(str.strip, open(bucketlist_path)) 210 | features = read_features(feature_path) 211 | 212 | if args.domain: 213 | index_path = os.path.join(args.model,'domain_index') 214 | suffix = '.domain' 215 | elif args.lang: 216 | index_path = os.path.join(args.model,'lang_index') 217 | suffix = '.lang' 218 | else: 219 | raise ValueError("no event specified") 220 | 221 | if args.weights: 222 | weights_path = args.weights 223 | else: 224 | weights_path = os.path.join(args.model, 'IGweights' + suffix + ('.bin' if args.binarize else '')) 225 | 226 | # display paths 227 | print("model path:", args.model ) 228 | print("buckets path:", bucketlist_path) 229 | print("features path:", feature_path) 230 | print("weights path:", weights_path) 231 | print("index path:", index_path) 232 | print("suffix:", suffix) 233 | 234 | print("computing information gain") 235 | 236 | dist = read_dist(index_path) 237 | ig = compute_IG(bucketlist, features, dist, args.binarize, suffix, args.jobs) 238 | 239 | write_weights(ig, weights_path) 240 | -------------------------------------------------------------------------------- /py3langid/train/LDfeatureselect.py: -------------------------------------------------------------------------------- 1 | """ 2 | LDfeatureselect.py - 3 | LD (Lang-Domain) feature extractor 4 | Marco Lui November 2011 5 | 6 | Based on research by Marco Lui and Tim Baldwin. 7 | 8 | Copyright 2011 Marco Lui . All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without modification, are 11 | permitted provided that the following conditions are met: 12 | 13 | 1. Redistributions of source code must retain the above copyright notice, this list of 14 | conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 17 | of conditions and the following disclaimer in the documentation and/or other materials 18 | provided with the distribution. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 21 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 22 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 28 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | The views and conclusions contained in the software and documentation are those of the 31 | authors and should not be interpreted as representing official policies, either expressed 32 | or implied, of the copyright holder. 33 | """ 34 | 35 | ###### 36 | # Default values 37 | # Can be overriden with command-line options 38 | ###### 39 | FEATURES_PER_LANG = 300 # number of features to select for each language 40 | 41 | import argparse 42 | import csv 43 | import os 44 | 45 | from collections import defaultdict 46 | 47 | import numpy 48 | 49 | from common import read_weights, Enumerator, write_features 50 | 51 | def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False): 52 | """ 53 | @param ignore_domain boolean to indicate whether to use domain weights 54 | """ 55 | assert (ig_domain is None) or (len(ig_lang) == len(ig_domain)) 56 | num_lang = len(ig_lang.values()[0]) 57 | num_term = len(ig_lang) 58 | 59 | term_index = defaultdict(Enumerator()) 60 | 61 | 62 | ld = numpy.empty((num_lang, num_term), dtype=float) 63 | 64 | for term in ig_lang: 65 | term_id = term_index[term] 66 | if ignore_domain: 67 | ld[:, term_id] = ig_lang[term] 68 | else: 69 | ld[:, term_id] = ig_lang[term] - ig_domain[term] 70 | 71 | terms = sorted(term_index, key=term_index.get) 72 | # compile the final feature set 73 | selected_features = {} 74 | for lang_id, lang_w in enumerate(ld): 75 | term_inds = numpy.argsort(lang_w)[-feats_per_lang:] 76 | selected_features[lang_id] = [terms[t] for t in term_inds] 77 | 78 | return selected_features 79 | 80 | if __name__ == "__main__": 81 | parser = argparse.ArgumentParser() 82 | parser.add_argument("-o","--output", metavar="OUTPUT_PATH", help = "write selected features to OUTPUT_PATH") 83 | parser.add_argument("--feats_per_lang", type=int, metavar='N', help="select top N features for each language", default=FEATURES_PER_LANG) 84 | parser.add_argument("--per_lang", action="store_true", default=False, help="produce a list of features selecter per-language") 85 | parser.add_argument("--no_domain_ig", action="store_true", default=False, help="use only per-langugage IG in LD calculation") 86 | parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR") 87 | args = parser.parse_args() 88 | 89 | lang_w_path = os.path.join(args.model, 'IGweights.lang.bin') 90 | domain_w_path = os.path.join(args.model, 'IGweights.domain') 91 | feature_path = args.output if args.output else os.path.join(args.model, 'LDfeats') 92 | 93 | # display paths 94 | print("model path:", args.model) 95 | print("lang weights path:", lang_w_path) 96 | print("domain weights path:", domain_w_path) 97 | print("feature output path:", feature_path) 98 | 99 | lang_w = 
read_weights(lang_w_path) 100 | domain_w = read_weights(domain_w_path) if not args.no_domain_ig else None 101 | 102 | features_per_lang = select_LD_features(lang_w, domain_w, args.feats_per_lang, ignore_domain=args.no_domain_ig) 103 | if args.per_lang: 104 | with open(feature_path + '.perlang', 'w') as f: 105 | writer = csv.writer(f) 106 | for i in range(len(features_per_lang)): 107 | writer.writerow(map(repr,features_per_lang[i])) 108 | 109 | 110 | final_feature_set = reduce(set.union, map(set, features_per_lang.values())) 111 | print('selected %d features' % len(final_feature_set)) 112 | 113 | write_features(sorted(final_feature_set), feature_path) 114 | print('wrote features to "%s"' % feature_path) 115 | -------------------------------------------------------------------------------- /py3langid/train/NBtrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | NBtrain.py - 3 | Model generator for langid.py 4 | 5 | Marco Lui, January 2013 6 | 7 | Based on research by Marco Lui and Tim Baldwin. 8 | 9 | Copyright 2013 Marco Lui . All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without modification, are 12 | permitted provided that the following conditions are met: 13 | 14 | 1. Redistributions of source code must retain the above copyright notice, this list of 15 | conditions and the following disclaimer. 16 | 17 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 18 | of conditions and the following disclaimer in the documentation and/or other materials 19 | provided with the distribution. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 22 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 23 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 24 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 27 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 29 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | The views and conclusions contained in the software and documentation are those of the 32 | authors and should not be interpreted as representing official policies, either expressed 33 | or implied, of the copyright holder. 
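select_LD_features() in LDfeatureselect.py above scores each term per language as IG(lang) - IG(domain) and keeps the top N per language, so terms that mainly signal domain rather than language drop out. A toy version of that selection, with invented weights:

ig_lang = {'aa': (0.9, 0.1),   # per-term IG for two languages
           'bb': (0.2, 0.8),
           'cc': (0.5, 0.5),
           'dd': (0.1, 0.1)}
ig_domain = {'aa': 0.1, 'bb': 0.1, 'cc': 0.6, 'dd': 0.0}

selected = set()
for lang_id in range(2):
    ld = {t: ig_lang[t][lang_id] - ig_domain[t] for t in ig_lang}
    selected |= set(sorted(ld, key=ld.get, reverse=True)[:1])   # feats_per_lang = 1

print(sorted(selected))   # ['aa', 'bb'] -- 'cc' is mostly a domain signal and is dropped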
34 | """ 35 | MAX_CHUNK_SIZE = 100 # maximum number of files to tokenize at once 36 | NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation 37 | 38 | import array 39 | import argparse 40 | import atexit 41 | import base64 42 | import bz2 43 | import csv 44 | import marshal 45 | import multiprocessing as mp 46 | import os 47 | import pickle 48 | import shutil 49 | import tempfile 50 | 51 | from collections import defaultdict 52 | 53 | import numpy as np 54 | 55 | from .common import chunk, unmarshal_iter, MapPool 56 | 57 | 58 | def offsets(chunks): 59 | # Work out the path chunk start offsets 60 | chunk_offsets = [0] 61 | for c in chunks: 62 | chunk_offsets.append(chunk_offsets[-1] + len(c)) 63 | return chunk_offsets 64 | 65 | def state_trace(path): 66 | """ 67 | Returns counts of how often each state was entered 68 | """ 69 | global __nm_arr 70 | c = defaultdict(int) 71 | state = 0 72 | 73 | with open(path) as f: 74 | text = f.read() 75 | for letter in map(ord,text): 76 | state = __nm_arr[(state << 8) + letter] 77 | c[state] += 1 78 | return c 79 | 80 | def setup_pass_tokenize(nm_arr, output_states, tk_output, b_dirs): 81 | """ 82 | Set the global next-move array used by the aho-corasick scanner 83 | """ 84 | global __nm_arr, __output_states, __tk_output, __b_dirs 85 | __nm_arr = nm_arr 86 | __output_states = output_states 87 | __tk_output = tk_output 88 | __b_dirs = b_dirs 89 | 90 | def pass_tokenize(arg): 91 | """ 92 | Tokenize documents and do counts for each feature 93 | Split this into buckets chunked over features rather than documents 94 | """ 95 | global __output_states, __tk_output, __b_dirs 96 | chunk_offset, chunk_paths = arg 97 | term_freq = defaultdict(int) 98 | __procname = mp.current_process().name 99 | __buckets = [tempfile.mkstemp(prefix=__procname, suffix='.index', dir=p)[0] for p in __b_dirs] 100 | 101 | # Tokenize each document and add to a count of (doc_id, f_id) frequencies 102 | for doc_count, path in enumerate(chunk_paths): 103 | doc_id = doc_count + chunk_offset 104 | count = state_trace(path) 105 | for state in (set(count) & __output_states): 106 | for f_id in __tk_output[state]: 107 | term_freq[doc_id, f_id] += count[state] 108 | 109 | # Distribute the aggregated counts into buckets 110 | bucket_count = len(__buckets) 111 | for doc_id, f_id in term_freq: 112 | bucket_index = hash(f_id) % bucket_count 113 | count = term_freq[doc_id, f_id] 114 | item = ( f_id, doc_id, count ) 115 | os.write(__buckets[bucket_index], marshal.dumps(item)) 116 | 117 | for f in __buckets: 118 | os.close(f) 119 | 120 | return len(term_freq) 121 | 122 | def setup_pass_ptc(cm, num_instances): 123 | global __cm, __num_instances 124 | __cm = cm 125 | __num_instances = num_instances 126 | 127 | def pass_ptc(b_dir): 128 | """ 129 | Take a bucket, form a feature map, compute the count of 130 | each feature in each class. 
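state_trace() above walks the flattened next-move table: the transition for (state, byte) lives at index state * 256 + byte, which is exactly what (state << 8) + letter computes. A tiny illustration with a hand-built two-state table (values invented):

import array
from collections import defaultdict

nm_arr = array.array('H', [0] * (2 * 256))   # 2 states x 256 byte values
nm_arr[(0 << 8) + ord('a')] = 1              # state 0 --'a'--> state 1
nm_arr[(1 << 8) + ord('a')] = 1              # state 1 --'a'--> state 1

counts, state = defaultdict(int), 0
for byte in b'aaxa':
    state = nm_arr[(state << 8) + byte]
    counts[state] += 1
print(dict(counts))    # {1: 3, 0: 1} -- state 1 entered once per 'a'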
131 | @param b_dir path to the bucket directory 132 | @returns (read_count, f_ids, prod) 133 | """ 134 | global __cm, __num_instances 135 | 136 | terms = defaultdict(lambda : np.zeros((__num_instances,), dtype='int')) 137 | 138 | read_count = 0 139 | for path in os.listdir(b_dir): 140 | if path.endswith('.index'): 141 | for f_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path)): 142 | terms[f_id][doc_id] = count 143 | read_count += 1 144 | 145 | f_ids, f_vs = zip(*terms.items()) 146 | fm = np.vstack(f_vs) 147 | prod = np.dot(fm, __cm) 148 | return read_count, f_ids, prod 149 | 150 | 151 | def learn_pc(cm): 152 | """ 153 | @param cm class map 154 | @returns nb_pc: log(P(C)) 155 | """ 156 | pc = np.log(cm.sum(0)) 157 | nb_pc = array.array('d', pc) 158 | return nb_pc 159 | 160 | def generate_cm(items, num_classes): 161 | """ 162 | @param items (class id, path) pairs 163 | @param num_classes The number of classes present 164 | """ 165 | num_instances = len(items) 166 | 167 | # Generate the class map 168 | cm = np.zeros((num_instances, num_classes), dtype='bool') 169 | for docid, (lang_id, path) in enumerate(items): 170 | cm[docid, lang_id] = True 171 | 172 | return cm 173 | 174 | def learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args): 175 | global b_dirs 176 | num_instances = len(paths) 177 | num_features = max( i for v in tk_output.values() for i in v) + 1 178 | 179 | # Generate the feature map 180 | nm_arr = mp.Array('i', tk_nextmove, lock=False) 181 | 182 | if args.jobs: 183 | chunksize = min(len(paths) / (args.jobs*2), args.chunksize) 184 | else: 185 | chunksize = min(len(paths) / (mp.cpu_count()*2), args.chunksize) 186 | 187 | # TODO: Set the output dir 188 | b_dirs = [ tempfile.mkdtemp(prefix="train-",suffix='-bucket', dir=temp_path) for i in range(args.buckets) ] 189 | 190 | output_states = set(tk_output) 191 | 192 | path_chunks = list(chunk(paths, chunksize)) 193 | pass_tokenize_arg = zip(offsets(path_chunks), path_chunks) 194 | 195 | pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs) 196 | with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f: 197 | pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg) 198 | 199 | write_count = sum(pass_tokenize_out) 200 | print("wrote a total of %d keys" % write_count) 201 | 202 | pass_ptc_params = (cm, num_instances) 203 | with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f: 204 | pass_ptc_out = f(pass_ptc, b_dirs) 205 | 206 | reads, ids, prods = zip(*pass_ptc_out) 207 | read_count = sum(reads) 208 | print("read a total of %d keys (%d short)" % (read_count, write_count - read_count)) 209 | 210 | prod = np.zeros((num_features, cm.shape[1]), dtype=int) 211 | prod[np.concatenate(ids)] = np.vstack(prods) 212 | 213 | ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0)) 214 | 215 | nb_ptc = array.array('d') 216 | for term_dist in ptc.tolist(): 217 | nb_ptc.extend(term_dist) 218 | 219 | return nb_ptc 220 | 221 | @atexit.register 222 | def cleanup(): 223 | global b_dirs 224 | try: 225 | for d in b_dirs: 226 | shutil.rmtree(d) 227 | except NameError: 228 | # Failed before b_dirs is defined, nothing to clean 229 | pass 230 | 231 | if __name__ == "__main__": 232 | parser = argparse.ArgumentParser() 233 | parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no paralleization)") 234 | parser.add_argument("-t", "--temp", metavar='TEMP_DIR', help="store buckets in TEMP_DIR instead of in MODEL_DIR/buckets") 235 | parser.add_argument("-s", "--scanner", 
metavar='SCANNER', help="use SCANNER for feature counting") 236 | parser.add_argument("-o", "--output", metavar='OUTPUT', help="output langid.py-compatible model to OUTPUT") 237 | #parser.add_argument("-i","--index",metavar='INDEX',help="read list of training document paths from INDEX") 238 | parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR") 239 | parser.add_argument("--chunksize", type=int, help='maximum chunk size (number of files)', default=MAX_CHUNK_SIZE) 240 | parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS) 241 | args = parser.parse_args() 242 | 243 | if args.temp: 244 | temp_path = args.temp 245 | else: 246 | temp_path = os.path.join(args.model, 'buckets') 247 | 248 | if args.scanner: 249 | scanner_path = args.scanner 250 | else: 251 | scanner_path = os.path.join(args.model, 'LDfeats.scanner') 252 | 253 | if args.output: 254 | output_path = args.output 255 | else: 256 | output_path = os.path.join(args.model, 'model') 257 | 258 | index_path = os.path.join(args.model, 'paths') 259 | lang_path = os.path.join(args.model, 'lang_index') 260 | 261 | # display paths 262 | print("model path:", args.model) 263 | print("temp path:", temp_path) 264 | print("scanner path:", scanner_path) 265 | #print "index path:", index_path 266 | print("output path:", output_path) 267 | 268 | # read list of training files 269 | with open(index_path) as f: 270 | reader = csv.reader(f) 271 | items = [ (l,p) for _,l,p in reader ] 272 | 273 | # read scanner 274 | with open(scanner_path) as f: 275 | tk_nextmove, tk_output, _ = pickle.load(f) 276 | 277 | # read list of languages in order 278 | with open(lang_path) as f: 279 | reader = csv.reader(f) 280 | langs = zip(*reader)[0] 281 | 282 | cm = generate_cm(items, len(langs)) 283 | paths = zip(*items)[1] 284 | 285 | nb_classes = langs 286 | nb_pc = learn_pc(cm) 287 | nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args) 288 | 289 | # output the model 290 | model = nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output 291 | string = base64.b64encode(bz2.compress(pickle.dumps(model))) 292 | with open(output_path, 'w') as f: 293 | f.write(string) 294 | 295 | print("wrote model to %s (%d bytes)" % (output_path, len(string))) 296 | -------------------------------------------------------------------------------- /py3langid/train/README: -------------------------------------------------------------------------------- 1 | Refactoring of the langid.py training tools, to allow for 2 | more flexibility and easier experimentation. 3 | 4 | Planned tools: 5 | 1) index.py - index a corpus. Produce a list of file, corpus, language pairs. 6 | 2) tokenize.py - take an index and tokenize the corresponding files 7 | 3) DFfeatureselect.py - choose features by document frequency 8 | 3) IGweight.py - compute the IG weights for language and for domain 9 | 4) LDfeatureselect.py - take the IG weights and use them to select a feature set 10 | 5) scanner.py - build a scanner on the basis of a feature set 11 | 6) NBtrain.py - learn NB parameters using an indexed corpus and a scanner 12 | 13 | Optional: 14 | A single tool that integrates all steps, calling on each submodule as required. 
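The model emitted at the end of NBtrain.py above is a base64-encoded, bz2-compressed pickle of the (nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output) tuple, and featWeights.py undoes exactly those three steps when it loads it back. The round trip in isolation, with a toy payload:

import base64, bz2, pickle

model = ([0.1, 0.2], [0.3], ('en', 'de'), [0, 1], {1: (0,)})   # toy stand-in for the real tuple
blob = base64.b64encode(bz2.compress(pickle.dumps(model)))
assert pickle.loads(bz2.decompress(base64.b64decode(blob))) == model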
15 | 16 | Marco Lui, January 2013 17 | -------------------------------------------------------------------------------- /py3langid/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbar/py3langid/812f2055f74c35dea298f30b434644062d9289be/py3langid/train/__init__.py -------------------------------------------------------------------------------- /py3langid/train/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common functions 3 | 4 | Marco Lui, January 2013 5 | """ 6 | 7 | import csv 8 | import errno 9 | import marshal 10 | import multiprocessing as mp 11 | import os 12 | 13 | from contextlib import contextmanager, closing 14 | from itertools import imap, islice 15 | 16 | import numpy 17 | 18 | 19 | class Enumerator(object): 20 | """ 21 | Enumerator object. Returns a larger number each call. 22 | Can be used with defaultdict to enumerate a sequence of items. 23 | """ 24 | def __init__(self, start=0): 25 | self.n = start 26 | 27 | def __call__(self): 28 | retval = self.n 29 | self.n += 1 30 | return retval 31 | 32 | def chunk(seq, chunksize): 33 | """ 34 | Break a sequence into chunks not exceeeding a predetermined size 35 | """ 36 | seq_iter = iter(seq) 37 | while True: 38 | chunk = tuple(islice(seq_iter, chunksize)) 39 | if not chunk: break 40 | yield chunk 41 | 42 | def unmarshal_iter(path): 43 | """ 44 | Open a given path and yield an iterator over items unmarshalled from it. 45 | """ 46 | with open(path, 'rb') as f: 47 | while True: 48 | try: 49 | yield marshal.load(f) 50 | except EOFError: 51 | break 52 | 53 | def makedir(path): 54 | try: 55 | os.makedirs(path) 56 | except OSError as e: 57 | if e.errno != errno.EEXIST: 58 | raise 59 | 60 | 61 | def write_weights(weights, path): 62 | w = dict(weights) 63 | with open(path, 'w') as f: 64 | writer = csv.writer(f) 65 | try: 66 | key_order = sorted(w, key=w.get, reverse=True) 67 | except ValueError: 68 | # Could not order keys by value, value is probably a vector. 69 | # Order keys alphabetically in this case. 70 | key_order = sorted(w) 71 | 72 | for k in key_order: 73 | row = [repr(k)] 74 | try: 75 | row.extend(w[k]) 76 | except TypeError: 77 | row.append(w[k]) 78 | writer.writerow(row) 79 | 80 | 81 | def read_weights(path): 82 | with open(path) as f: 83 | reader = csv.reader(f) 84 | retval = {} 85 | for row in reader: 86 | key = eval(row[0]) 87 | #val = numpy.array( map(float,row[1:]) ) 88 | val = numpy.array( [float(v) if v != 'nan' else 0. for v in row[1:]] ) 89 | retval[key] = val 90 | return retval 91 | 92 | def read_features(path): 93 | """ 94 | Read a list of features in feature-per-line format, where each 95 | feature is a repr and needs to be evaled. 96 | @param path path to read from 97 | """ 98 | with open(path) as f: 99 | return map(eval, f) 100 | 101 | def write_features(features, path): 102 | """ 103 | Write a list of features to a file at `path`. The repr of each 104 | feature is written on a new line. 105 | @param features list of features to write 106 | @param path path to write to 107 | """ 108 | with open(path,'w') as f: 109 | for feat in features: 110 | print(repr(feat),file=f) 111 | 112 | 113 | def index(seq): 114 | """ 115 | Build an index for a sequence of items. Assumes 116 | that the items in the sequence are unique. 
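The Enumerator class above is used as the default factory of a defaultdict so that unseen keys receive consecutive integer ids on first access; the same idiom with itertools.count, for illustration:

from collections import defaultdict
from itertools import count

lang_index = defaultdict(count().__next__)   # behaves like defaultdict(Enumerator())
ids = [lang_index[lang] for lang in ('en', 'de', 'en', 'fr')]
print(ids)                # [0, 1, 0, 2]
print(dict(lang_index))   # {'en': 0, 'de': 1, 'fr': 2}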
117 | @param seq the sequence to index 118 | @returns a dictionary from item to position in the sequence 119 | """ 120 | return {(k,v) for (v,k) in enumerate(seq)} 121 | 122 | 123 | @contextmanager 124 | def MapPool(processes=None, initializer=None, initargs=None, maxtasksperchild=None, chunksize=1): 125 | """ 126 | Contextmanager to express the common pattern of not using multiprocessing if 127 | only 1 job is allocated (for example for debugging reasons) 128 | """ 129 | if processes is None: 130 | processes = mp.cpu_count() + 4 131 | 132 | if processes > 1: 133 | with closing( mp.Pool(processes, initializer, initargs, maxtasksperchild)) as pool: 134 | f = lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize) 135 | yield f 136 | else: 137 | if initializer is not None: 138 | initializer(*initargs) 139 | f = imap 140 | yield f 141 | 142 | if processes > 1: 143 | pool.join() 144 | -------------------------------------------------------------------------------- /py3langid/train/index.py: -------------------------------------------------------------------------------- 1 | """ 2 | index.py - 3 | Index a corpus that is stored in a directory hierarchy as follows: 4 | 5 | - corpus 6 | - domain1 7 | - language1 8 | - file1 9 | - file2 10 | - ... 11 | - language2 12 | - ... 13 | - domain2 14 | - language1 15 | - file1 16 | - file2 17 | - ... 18 | - language2 19 | - ... 20 | - ... 21 | 22 | This produces 3 files: 23 | * index: a list of paths, together with the langid and domainid as integers 24 | * lang_index: a list of languages in ascending order of id, with the count for each 25 | * domain_index: a list of domains in ascending order of id, with the count for each 26 | 27 | Marco Lui, January 2013 28 | 29 | Copyright 2013 Marco Lui . All rights reserved. 30 | 31 | Redistribution and use in source and binary forms, with or without modification, are 32 | permitted provided that the following conditions are met: 33 | 34 | 1. Redistributions of source code must retain the above copyright notice, this list of 35 | conditions and the following disclaimer. 36 | 37 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 38 | of conditions and the following disclaimer in the documentation and/or other materials 39 | provided with the distribution. 40 | 41 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 42 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 43 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 44 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 45 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 46 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 47 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 48 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 49 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 50 | 51 | The views and conclusions contained in the software and documentation are those of the 52 | authors and should not be interpreted as representing official policies, either expressed 53 | or implied, of the copyright holder. 
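CorpusIndexer.index() below recovers the domain and language of each document purely from its position in the corpus/<domain>/<language>/<file> hierarchy described above, by splitting the directory path twice. The path handling in isolation (the path itself is illustrative):

import os

path = os.path.join('corpus', 'wikipedia', 'de', 'doc0001.txt')
dirpath, docname = os.path.split(path)
d, lang = os.path.split(dirpath)
d, domain = os.path.split(d)
print(domain, lang, docname)   # wikipedia de doc0001.txt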
54 | """ 55 | 56 | ###### 57 | # Default values 58 | # Can be overriden with command-line options 59 | ###### 60 | TRAIN_PROP = 1.0 # probability than any given document is selected 61 | MIN_DOMAIN = 1 # minimum number of domains a language must be present in to be included 62 | 63 | import argparse 64 | import csv 65 | import os 66 | import random 67 | 68 | from collections import defaultdict 69 | 70 | import numpy 71 | 72 | from .common import Enumerator, makedir 73 | 74 | 75 | class CorpusIndexer(object): 76 | """ 77 | Class to index the contents of a corpus 78 | """ 79 | def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP, langs=None, domains=None): 80 | self.root = root 81 | self.min_domain = min_domain 82 | self.proportion = proportion 83 | 84 | if langs is None: 85 | self.lang_index = defaultdict(Enumerator()) 86 | else: 87 | # pre-specified lang set 88 | self.lang_index = {(k,v) for v,k in enumerate(langs)} 89 | 90 | if domains is None: 91 | self.domain_index = defaultdict(Enumerator()) 92 | else: 93 | # pre-specified domain set 94 | self.domain_index = dict((k,v) for v,k in enumerate(domains)) 95 | 96 | self.coverage_index = defaultdict(set) 97 | self.items = list() 98 | 99 | self.index(root) 100 | self.prune_min_domain(self.min_domain) 101 | 102 | def index(self, root): 103 | # build a list of paths 104 | paths = [] 105 | for dirpath, dirnames, filenames in os.walk(root, followlinks=True): 106 | for docname in filenames: 107 | if random.random() < self.proportion: 108 | # Each file has 'proportion' chance of being selected. 109 | path = os.path.join(dirpath, docname) 110 | 111 | # split the dirpath into identifying components 112 | d, lang = os.path.split(dirpath) 113 | d, domain = os.path.split(d) 114 | 115 | # index the language and the domain 116 | try: 117 | # TODO: If lang is pre-specified but not domain, we can end up 118 | # enumerating empty domains. 119 | domain_id = self.domain_index[domain] 120 | lang_id = self.lang_index[lang] 121 | except KeyError: 122 | # lang or domain outside a pre-specified set so 123 | # skip this document. 
124 | continue 125 | 126 | # add the domain-lang relation to the coverage index 127 | self.coverage_index[domain].add(lang) 128 | 129 | # add the item to our list 130 | self.items.append((domain_id,lang_id,docname,path)) 131 | 132 | 133 | def prune_min_domain(self, min_domain): 134 | # prune files for all languages that do not occur in at least min_domain 135 | 136 | # Work out which languages to reject as they are not present in at least 137 | # the required number of domains 138 | lang_domain_count = defaultdict(int) 139 | for langs in self.coverage_index.values(): 140 | for lang in langs: 141 | lang_domain_count[lang] += 1 142 | reject_langs = set( l for l in lang_domain_count if lang_domain_count[l] < min_domain) 143 | 144 | # Remove the languages from the indexer 145 | if reject_langs: 146 | #print "reject (<{0} domains): {1}".format(min_domain, sorted(reject_langs)) 147 | reject_ids = set(self.lang_index[l] for l in reject_langs) 148 | 149 | new_lang_index = defaultdict(Enumerator()) 150 | lm = dict() 151 | for k,v in self.lang_index.items(): 152 | if v not in reject_ids: 153 | new_id = new_lang_index[k] 154 | lm[v] = new_id 155 | 156 | # Eliminate all entries for the languages 157 | self.items = [ (d, lm[l], n, p) for (d, l, n, p) in self.items if l in lm] 158 | 159 | self.lang_index = new_lang_index 160 | 161 | 162 | @property 163 | def dist_lang(self): 164 | """ 165 | @returns A vector over frequency counts for each language 166 | """ 167 | retval = numpy.zeros((len(self.lang_index),), dtype='int') 168 | for d, l, n, p in self.items: 169 | retval[l] += 1 170 | return retval 171 | 172 | @property 173 | def dist_domain(self): 174 | """ 175 | @returns A vector over frequency counts for each domain 176 | """ 177 | retval = numpy.zeros((len(self.domain_index),), dtype='int') 178 | for d, l, n, p in self.items: 179 | retval[d] += 1 180 | return retval 181 | 182 | # TODO: Remove this as it should no longer be needed 183 | @property 184 | def classmaps(self): 185 | num_instances = len(self.items) 186 | if num_instances == 0: 187 | raise ValueError("no items indexed!") 188 | cm_domain = numpy.zeros((num_instances, len(self.domain_index)), dtype='bool') 189 | cm_lang = numpy.zeros((num_instances, len(self.lang_index)), dtype='bool') 190 | 191 | # Populate the class maps 192 | for docid, (domain_id, lang_id, docname, path) in enumerate(self.items): 193 | cm_domain[docid, domain_id] = True 194 | cm_lang[docid, lang_id] = True 195 | return cm_domain, cm_lang 196 | 197 | @property 198 | def paths(self): 199 | return [ p for (d,l,n,p) in self.items ] 200 | 201 | 202 | if __name__ == "__main__": 203 | parser = argparse.ArgumentParser() 204 | parser.add_argument("-p","--proportion", type=float, default=TRAIN_PROP, 205 | help="proportion of training data to use" ) 206 | parser.add_argument("-m","--model", help="save output to MODEL_DIR", metavar="MODEL_DIR") 207 | parser.add_argument("-d","--domain", metavar="DOMAIN", action='append', 208 | help="use DOMAIN - can be specified multiple times (uses all domains found if not specified)") 209 | parser.add_argument("-l","--lang", metavar="LANG", action='append', 210 | help="use LANG - can be specified multiple times (uses all langs found if not specified)") 211 | parser.add_argument("--min_domain", type=int, default=MIN_DOMAIN, 212 | help="minimum number of domains a language must be present in" ) 213 | parser.add_argument("corpus", help="read corpus from CORPUS_DIR", metavar="CORPUS_DIR") 214 | 215 | args = parser.parse_args() 216 | 217 | corpus_name = 
os.path.basename(args.corpus) 218 | if args.model: 219 | model_dir = args.model 220 | else: 221 | model_dir = os.path.join('.', corpus_name+'.model') 222 | 223 | makedir(model_dir) 224 | 225 | langs_path = os.path.join(model_dir, 'lang_index') 226 | domains_path = os.path.join(model_dir, 'domain_index') 227 | index_path = os.path.join(model_dir, 'paths') 228 | 229 | # display paths 230 | print("corpus path:", args.corpus) 231 | print("model path:", model_dir) 232 | print("writing langs to:", langs_path) 233 | print("writing domains to:", domains_path) 234 | print("writing index to:", index_path) 235 | 236 | indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain, proportion=args.proportion, 237 | langs = args.lang, domains = args.domain) 238 | 239 | # Compute mappings between files, languages and domains 240 | lang_dist = indexer.dist_lang 241 | lang_index = indexer.lang_index 242 | lang_info = ' '.join(("{0}({1})".format(k, lang_dist[v]) for k,v in lang_index.items())) 243 | print("langs({0}): {1}".format(len(lang_dist), lang_info)) 244 | 245 | domain_dist = indexer.dist_domain 246 | domain_index = indexer.domain_index 247 | domain_info = ' '.join(("{0}({1})".format(k, domain_dist[v]) for k,v in domain_index.items())) 248 | print("domains({0}): {1}".format(len(domain_dist), domain_info)) 249 | 250 | print("identified {0} files".format(len(indexer.items))) 251 | 252 | # output the language index 253 | with open(langs_path,'w') as f: 254 | writer = csv.writer(f) 255 | writer.writerows((l, lang_dist[lang_index[l]]) 256 | for l in sorted(lang_index.keys(), key=lang_index.get)) 257 | 258 | # output the domain index 259 | with open(domains_path,'w') as f: 260 | writer = csv.writer(f) 261 | writer.writerows((d, domain_dist[domain_index[d]]) 262 | for d in sorted(domain_index.keys(), key=domain_index.get)) 263 | 264 | # output items found 265 | with open(index_path,'w') as f: 266 | writer = csv.writer(f) 267 | writer.writerows( (d,l,p) for (d,l,n,p) in indexer.items ) 268 | -------------------------------------------------------------------------------- /py3langid/train/scanner.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanner.py - 3 | Assemble a "feature scanner" using Aho-Corasick string matching. 4 | This takes a list of features (byte sequences) and builds a DFA 5 | that when run on a byte stream can identify how often each of 6 | the features is present in a single pass over the stream. 7 | 8 | Marco Lui, January 2013 9 | 10 | Copyright 2013 Marco Lui . All rights reserved. 11 | 12 | Redistribution and use in source and binary forms, with or without modification, are 13 | permitted provided that the following conditions are met: 14 | 15 | 1. Redistributions of source code must retain the above copyright notice, this list of 16 | conditions and the following disclaimer. 17 | 18 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 19 | of conditions and the following disclaimer in the documentation and/or other materials 20 | provided with the distribution. 21 | 22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 23 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 24 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 25 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 30 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | The views and conclusions contained in the software and documentation are those of the 33 | authors and should not be interpreted as representing official policies, either expressed 34 | or implied, of the copyright holder. 35 | """ 36 | 37 | import argparse 38 | import array 39 | import os 40 | import pickle 41 | from collections import deque, defaultdict 42 | from .common import read_features 43 | 44 | class Scanner(object): 45 | alphabet = [chr(n) for n in range(1<<8)] # materialized as a list: it is iterated repeatedly in build() 46 | """ 47 | Implementation of Aho-Corasick string matching. 48 | This class should be instantiated with a set of keywords, which 49 | will then be the only tokens generated by the class's search method. 50 | """ 51 | @classmethod 52 | def from_file(cls, path): 53 | with open(path, 'rb') as f: 54 | tk_nextmove, tk_output, feats = pickle.load(f) 55 | if isinstance(feats, dict): 56 | # The old scanner format had two identical dictionaries as the last 57 | # two items in the tuple. This format can still be used by langid.py, 58 | # but it does not carry the feature list, and so cannot be unpacked 59 | # back into a Scanner object. 60 | raise ValueError("old format scanner - please retrain. see code for details.") 61 | # tk_output is a mapping from state to a list of feature indices. 62 | # because of the way the scanner class is written, it needs a mapping 63 | # from state to the feature itself. We rebuild this here. 
64 | tk_output_f = dict( (k,[feats[i] for i in v]) for k,v in tk_output.items() ) 65 | scanner = cls.__new__(cls) 66 | scanner.__setstate__((tk_nextmove, tk_output_f)) 67 | return scanner 68 | 69 | def __init__(self, keywords): 70 | self.build(keywords) 71 | 72 | def __call__(self, value): 73 | return self.search(value) 74 | 75 | def build(self, keywords): 76 | goto = dict() 77 | fail = dict() 78 | output = defaultdict(set) 79 | 80 | # Algorithm 2 81 | newstate = 0 82 | for a in keywords: 83 | state = 0 84 | j = 0 85 | while (j < len(a)) and (state, a[j]) in goto: 86 | state = goto[(state, a[j])] 87 | j += 1 88 | for p in range(j, len(a)): 89 | newstate += 1 90 | goto[(state, a[p])] = newstate 91 | #print "(%d, %s) -> %d" % (state, a[p], newstate) 92 | state = newstate 93 | output[state].add(a) 94 | for a in self.alphabet: 95 | if (0,a) not in goto: 96 | goto[(0,a)] = 0 97 | 98 | # Algorithm 3 99 | queue = deque() 100 | for a in self.alphabet: 101 | if goto[(0,a)] != 0: 102 | s = goto[(0,a)] 103 | queue.append(s) 104 | fail[s] = 0 105 | while queue: 106 | r = queue.popleft() 107 | for a in self.alphabet: 108 | if (r,a) in goto: 109 | s = goto[(r,a)] 110 | queue.append(s) 111 | state = fail[r] 112 | while (state,a) not in goto: 113 | state = fail[state] 114 | fail[s] = goto[(state,a)] 115 | #print "f(%d) -> %d" % (s, goto[(state,a)]), output[fail[s]] 116 | if output[fail[s]]: 117 | output[s].update(output[fail[s]]) 118 | 119 | # Algorithm 4 120 | self.nextmove = {} 121 | for a in self.alphabet: 122 | self.nextmove[(0,a)] = goto[(0,a)] 123 | if goto[(0,a)] != 0: 124 | queue.append(goto[(0,a)]) 125 | while queue: 126 | r = queue.popleft() 127 | for a in self.alphabet: 128 | if (r,a) in goto: 129 | s = goto[(r,a)] 130 | queue.append(s) 131 | self.nextmove[(r,a)] = s 132 | else: 133 | self.nextmove[(r,a)] = self.nextmove[(fail[r],a)] 134 | 135 | # convert the output to tuples, as tuple iteration is faster 136 | # than set iteration 137 | self.output = dict((k, tuple(output[k])) for k in output) 138 | 139 | # Next move encoded as a single array. The index of the next state 140 | # is located at current state * alphabet size + ord(c). 141 | # The choice of 'H' array typecode limits us to 64k states. 142 | def generate_nm_arr(typecode): 143 | def nextstate_iter(): 144 | # State count starts at 0, so the number of states is the number of 145 | # the last state (newstate) + 1 146 | for state in range(newstate+1): 147 | for letter in self.alphabet: 148 | yield self.nextmove[(state, letter)] 149 | return array.array(typecode, nextstate_iter()) 150 | try: 151 | self.nm_arr = generate_nm_arr('H') 152 | except OverflowError: 153 | # Could not fit in an unsigned short array, let's try an unsigned long array. 154 | self.nm_arr = generate_nm_arr('L') 155 | 156 | def __getstate__(self): 157 | """ 158 | Compiled nextmove and output. 
159 | """ 160 | return (self.nm_arr, self.output) 161 | 162 | def __setstate__(self, value): 163 | nm_array, output = value 164 | self.nm_arr = nm_array 165 | self.output = output 166 | self.nextmove = {} 167 | for i, next_state in enumerate(nm_array): 168 | state = i // 256 169 | letter = chr(i % 256) 170 | self.nextmove[(state, letter)] = next_state 171 | 172 | def search(self, string): 173 | state = 0 174 | for letter in map(ord,string): 175 | state = self.nm_arr[(state << 8) + letter] 176 | for key in self.output.get(state, []): 177 | yield key 178 | 179 | def build_scanner(features): 180 | """ 181 | In contrast to the Scanner class, this function unwraps a layer of indirection in 182 | the detection of features. It translates the string output of the scanner's output 183 | mapping into the index values (positions in the list) of the features in the supplied 184 | feature set. This is very useful where we are only interested in the relative frequencies 185 | of features. 186 | 187 | @param features a list of features (byte sequences) 188 | @returns a compiled scanner model 189 | """ 190 | feat_index = index(features) 191 | 192 | # Build the actual scanner 193 | print("building scanner") 194 | scanner = Scanner(features) 195 | tk_nextmove, raw_output = scanner.__getstate__() 196 | 197 | # tk_output is the output function of the scanner. It should generate indices into 198 | # the feature space directly, as this saves a lookup 199 | tk_output = {} 200 | for k,v in raw_output.items(): 201 | tk_output[k] = tuple(feat_index[f] for f in v) 202 | return tk_nextmove, tk_output 203 | 204 | 205 | def index(seq): 206 | """ 207 | Build an index for a sequence of items. Assumes 208 | that the items in the sequence are unique. 209 | @param seq the sequence to index 210 | @returns a dictionary from item to position in the sequence 211 | """ 212 | return dict((k,v) for (v,k) in enumerate(seq)) 213 | 214 | if __name__ == "__main__": 215 | parser = argparse.ArgumentParser() 216 | parser.add_argument("input", metavar="INPUT", help="build a scanner for INPUT. If input is a directory, read INPUT/LDfeats") 217 | parser.add_argument("-o","--output", help="output scanner to OUTFILE", metavar="OUTFILE") 218 | args = parser.parse_args() 219 | 220 | if os.path.isdir(args.input): 221 | input_path = os.path.join(args.input, 'LDfeats') 222 | else: 223 | input_path = args.input 224 | 225 | if args.output: 226 | output_path = args.output 227 | else: 228 | output_path = input_path + '.scanner' 229 | 230 | # display paths 231 | print("input path:", input_path) 232 | print("output path:", output_path) 233 | 234 | nb_features = read_features(input_path) 235 | tk_nextmove, tk_output = build_scanner(nb_features) 236 | scanner = tk_nextmove, tk_output, nb_features 237 | 238 | with open(output_path, 'wb') as f: 239 | pickle.dump(scanner, f) 240 | print("wrote scanner to {0}".format(output_path)) 241 | -------------------------------------------------------------------------------- /py3langid/train/tokenize.py: -------------------------------------------------------------------------------- 1 | """ 2 | tokenize.py - 3 | Tokenizer for langid.py training system. This takes a list of files and tokenizes them 4 | in parallel. 5 | 6 | Marco Lui, January 2013 7 | 8 | Copyright 2013 Marco Lui . All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without modification, are 11 | permitted provided that the following conditions are met: 12 | 13 | 1. 
Redistributions of source code must retain the above copyright notice, this list of 14 | conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 17 | of conditions and the following disclaimer in the documentation and/or other materials 18 | provided with the distribution. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 21 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 22 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 28 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | The views and conclusions contained in the software and documentation are those of the 31 | authors and should not be interpreted as representing official policies, either expressed 32 | or implied, of the copyright holder. 33 | """ 34 | 35 | ###### 36 | # Default values 37 | # Can be overridden with command-line options 38 | ###### 39 | 40 | MIN_NGRAM_ORDER = 1 # smallest order of n-grams to consider 41 | MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider 42 | TOP_DOC_FREQ = 15000 # number of tokens to consider for each order 43 | NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation 44 | CHUNKSIZE = 50 # maximum size of chunk (number of files tokenized - less = less memory use) 45 | 46 | import argparse 47 | import atexit 48 | import csv 49 | import marshal 50 | import multiprocessing as mp 51 | import os 52 | import random 53 | import shutil 54 | import tempfile 55 | 56 | from itertools import tee 57 | from collections import defaultdict 58 | 59 | from .common import makedir, chunk, MapPool 60 | 61 | class NGramTokenizer(object): 62 | def __init__(self, min_order=1, max_order=3): 63 | self.min_order = min_order 64 | self.max_order = max_order 65 | 66 | def __call__(self, seq): 67 | min_order = self.min_order 68 | max_order = self.max_order 69 | t = tee(seq, max_order) 70 | for i in range(max_order): 71 | for _ in range(i): 72 | # advance iterators, ignoring result 73 | next(t[i], None) 74 | while True: 75 | token = ''.join(next(tn, '') for tn in t) # exhausted iterators contribute '', ending the loop below 76 | if len(token) < max_order: break 77 | for n in range(min_order-1, max_order): 78 | yield token[:n+1] 79 | for a in range(max_order-1): 80 | for b in range(min_order, max_order-a): 81 | yield token[a:a+b] 82 | 83 | @atexit.register 84 | def cleanup(): 85 | global b_dirs, complete 86 | try: 87 | if not complete: 88 | for d in b_dirs: 89 | shutil.rmtree(d) 90 | except NameError: 91 | # Failed before globals defined, nothing to clean 92 | pass 93 | 94 | def setup_pass_tokenize(tokenizer, b_dirs, sample_count, sample_size): 95 | global __tokenizer, __b_dirs, __sample_count, __sample_size 96 | __tokenizer = tokenizer 97 | __b_dirs = b_dirs 98 | __sample_count = sample_count 99 | __sample_size = sample_size 100 | 101 | def pass_tokenize(chunk_items): 102 | """ 103 | Chunk files into a doc->term mapping, 104 | and simultaneously build a term->df count. 
105 | The term->df counts are redistributed to 106 | buckets via python's in-built hash function. 107 | This is basically an inversion step, so that 108 | now we are chunked on the term axis rather 109 | than the document axis. 110 | """ 111 | global __maxorder, __b_dirs, __extractor, __sample_count, __sample_size 112 | __procname = mp.current_process().name 113 | b_freq_lang = [tempfile.mkstemp(prefix=__procname+'-', suffix='.lang', dir=p)[0] for p in __b_dirs] 114 | b_freq_domain = [tempfile.mkstemp(prefix=__procname+'-', suffix='.domain', dir=p)[0] for p in __b_dirs] 115 | 116 | extractor = __tokenizer 117 | term_lng_freq = defaultdict(lambda: defaultdict(int)) 118 | term_dom_freq = defaultdict(lambda: defaultdict(int)) 119 | 120 | for domain_id, lang_id, path in chunk_items: 121 | with open(path) as f: 122 | if __sample_count: 123 | # sampling tokenization 124 | text = f.read() 125 | poss = max(1,len(text) - __sample_size) # possible start locations 126 | count = min(poss, __sample_count) # reduce number of samples if document is too short 127 | offsets = random.sample(range(poss), count) 128 | for offset in offsets: 129 | tokenset = set(extractor(text[offset: offset+__sample_size])) 130 | for token in tokenset: 131 | term_lng_freq[token][lang_id] += 1 132 | term_dom_freq[token][domain_id] += 1 133 | 134 | else: 135 | # whole-document tokenization 136 | tokenset = set(extractor(f.read())) 137 | for token in tokenset: 138 | term_lng_freq[token][lang_id] += 1 139 | term_dom_freq[token][domain_id] += 1 140 | 141 | for term in term_lng_freq: 142 | bucket_index = hash(term) % len(b_freq_lang) 143 | for lang, count in term_lng_freq[term].items(): 144 | os.write(b_freq_lang[bucket_index], marshal.dumps((term, lang, count))) 145 | for domain, count in term_dom_freq[term].items(): 146 | os.write(b_freq_domain[bucket_index], marshal.dumps((term, domain, count))) 147 | 148 | # Close all the open files 149 | for f in b_freq_lang + b_freq_domain: 150 | os.close(f) 151 | 152 | return len(term_lng_freq) 153 | 154 | def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None, chunksize=CHUNKSIZE, sample_count=None, sample_size=None): 155 | """ 156 | @param items a list of (domain, language, path) tuples 157 | """ 158 | global b_dirs, complete 159 | 160 | # Our exitfunc uses this to know whether to delete the tokenized files 161 | complete = False 162 | 163 | if jobs is None: 164 | jobs = mp.cpu_count() + 4 165 | 166 | b_dirs = [ tempfile.mkdtemp(prefix="tokenize-",suffix='-{0}'.format(tokenizer.__class__.__name__), dir=outdir) for i in range(buckets) ] 167 | 168 | # PASS 1: Tokenize documents into sets of terms 169 | 170 | # If there are few items, make the chunk size such that each job 171 | # will have 2 chunks 172 | chunk_size = max(1,min(len(items) // (jobs * 2), chunksize)) 173 | item_chunks = list(chunk(items, chunk_size)) 174 | pass_tokenize_globals = (tokenizer, b_dirs, sample_count, sample_size) 175 | 176 | with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f: 177 | pass_tokenize_out = f(pass_tokenize, item_chunks) 178 | 179 | 180 | doc_count = defaultdict(int) 181 | chunk_count = len(item_chunks) 182 | print("chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count)) 183 | print("job count: {0}".format(jobs)) 184 | 185 | if sample_count: 186 | print("sampling-based tokenization: size {0} count {1}".format(sample_size, sample_count)) 187 | else: 188 | print("whole-document tokenization") 189 | 190 | for i, keycount in enumerate(pass_tokenize_out): 191 | 
print("tokenized chunk (%d/%d) [%d keys]" % (i+1,chunk_count, keycount)) 192 | 193 | complete = True 194 | 195 | return b_dirs 196 | 197 | if __name__ == "__main__": 198 | parser = argparse.ArgumentParser() 199 | parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no paralleization)") 200 | parser.add_argument("-s", "--scanner", metavar='SCANNER', help="use SCANNER for tokenizing") 201 | parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS) 202 | parser.add_argument("--max_order", type=int, help="highest n-gram order to use") 203 | parser.add_argument("--word", action='store_true', default=False, help="use 'word' tokenization (currently str.split)") 204 | parser.add_argument("--chunksize", type=int, help="max chunk size (number of files to tokenize at a time - smaller should reduce memory use)", default=CHUNKSIZE) 205 | parser.add_argument("-t", "--temp", metavar='TEMP_DIR', help="store buckets in TEMP_DIR instead of in MODEL_DIR/buckets") 206 | parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR") 207 | 208 | group = parser.add_argument_group('sampling') 209 | group.add_argument("--sample_size", type=int, help="size of sample for sampling-based tokenization", default=140) 210 | group.add_argument("--sample_count", type=int, help="number of samples for sampling-based tokenization", default=None) 211 | 212 | args = parser.parse_args() 213 | 214 | if args.temp: 215 | buckets_dir = args.temp 216 | else: 217 | buckets_dir = os.path.join(args.model, 'buckets') 218 | makedir(buckets_dir) 219 | 220 | bucketlist_path = os.path.join(args.model, 'bucketlist') 221 | index_path = os.path.join(args.model, 'paths') 222 | 223 | # display paths 224 | print("index path:", index_path) 225 | print("bucketlist path:", bucketlist_path) 226 | print("buckets path:", buckets_dir) 227 | 228 | with open(index_path) as f: 229 | reader = csv.reader(f) 230 | items = list(reader) 231 | 232 | if sum(map(bool,(args.scanner, args.max_order, args.word))) > 1: 233 | parser.error('can only specify one of --word, --scanner and --max_order') 234 | 235 | # Tokenize 236 | print("will tokenize %d files" % len(items)) 237 | if args.scanner: 238 | from .scanner import Scanner 239 | tokenizer = Scanner.from_file(args.scanner) 240 | print("using provided scanner: ", args.scanner) 241 | elif args.word: 242 | tokenizer = str.split 243 | print("using str.split to tokenize") 244 | else: 245 | max_order = args.max_order if args.max_order else MAX_NGRAM_ORDER 246 | tokenizer = NGramTokenizer(1,max_order) 247 | print("using n-gram tokenizer: max_order({0})".format(max_order)) 248 | b_dirs = build_index(items, tokenizer, buckets_dir, args.buckets, args.jobs, args.chunksize, args.sample_count, args.sample_size) 249 | 250 | # output the paths to the buckets 251 | with open(bucketlist_path,'w') as f: 252 | for d in b_dirs: 253 | f.write(d+'\n') 254 | -------------------------------------------------------------------------------- /py3langid/train/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | train.py - 3 | All-in-one tool for easy training of a model for langid.py. This depends on the 4 | training tools for individual steps, which can be run separately. 5 | 6 | Marco Lui, January 2013 7 | 8 | Copyright 2013 Marco Lui . All rights reserved. 
9 | 10 | Redistribution and use in source and binary forms, with or without modification, are 11 | permitted provided that the following conditions are met: 12 | 13 | 1. Redistributions of source code must retain the above copyright notice, this list of 14 | conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 17 | of conditions and the following disclaimer in the documentation and/or other materials 18 | provided with the distribution. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 21 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 22 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 28 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | The views and conclusions contained in the software and documentation are those of the 31 | authors and should not be interpreted as representing official policies, either expressed 32 | or implied, of the copyright holder. 33 | """ 34 | 35 | TRAIN_PROP = 1.0 # probability than any given document is selected 36 | MIN_DOMAIN = 1 # minimum number of domains a language must be present in to be included 37 | MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider 38 | TOP_DOC_FREQ = 15000 # number of tokens to consider for each order 39 | NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation 40 | CHUNKSIZE = 50 # maximum size of chunk (number of files tokenized - less = less memory use) 41 | FEATURES_PER_LANG = 300 # number of features to select for each language 42 | 43 | import argparse 44 | import base64 45 | import bz2 46 | import csv 47 | import os 48 | import pickle 49 | import shutil 50 | 51 | import numpy 52 | 53 | from .common import makedir, write_weights, write_features, read_features 54 | from .index import CorpusIndexer 55 | from .tokenize import build_index, NGramTokenizer 56 | from .DFfeatureselect import tally, ngram_select 57 | from .IGweight import compute_IG 58 | from .LDfeatureselect import select_LD_features 59 | from .scanner import build_scanner, Scanner 60 | 61 | from .NBtrain import generate_cm, learn_pc, learn_ptc 62 | 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("-p","--proportion", type=float, help="proportion of training data to use", default=TRAIN_PROP) 67 | parser.add_argument("-m","--model", help="save output to MODEL_DIR", metavar="MODEL_DIR") 68 | parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no paralleization)") 69 | parser.add_argument("-t", "--temp", metavar='TEMP_DIR', help="store buckets in TEMP_DIR instead of in MODEL_DIR/buckets") 70 | parser.add_argument("-d","--domain", metavar="DOMAIN", action='append', 71 | help="use DOMAIN - can be specified multiple times (uses all domains found if not specified)") 72 | parser.add_argument("-l","--lang", metavar="LANG", action='append', 73 | help="use LANG - can be specified multiple times 
(uses all langs found if not specified)") 74 | parser.add_argument("--min_domain", type=int, help="minimum number of domains a language must be present in", default=MIN_DOMAIN) 75 | parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS) 76 | parser.add_argument("--max_order", type=int, help="highest n-gram order to use", default=MAX_NGRAM_ORDER) 77 | parser.add_argument("--chunksize", type=int, help="max chunk size (number of files to tokenize at a time - smaller should reduce memory use)", default=CHUNKSIZE) 78 | parser.add_argument("--df_tokens", type=int, help="number of tokens to consider for each n-gram order", default=TOP_DOC_FREQ) 79 | parser.add_argument("--word", action='store_true', default=False, help="use 'word' tokenization (currently str.split)") 80 | parser.add_argument("--df_feats", metavar="FEATS", help="Instead of DF feature selection, use a list of features from FEATS") 81 | parser.add_argument("--ld_feats", metavar="FEATS", help="Instead of LD feature selection, use a list of features from FEATS") 82 | parser.add_argument("--feats_per_lang", type=int, metavar='N', help="select top N features for each language", default=FEATURES_PER_LANG) 83 | parser.add_argument("--no_domain_ig", action="store_true", default=False, help="use only per-langugage IG in LD calculation") 84 | parser.add_argument("--debug", action="store_true", default=False, help="produce debug output (all intermediates)") 85 | 86 | group = parser.add_argument_group('sampling') 87 | group.add_argument("--sample_size", type=int, help="size of sample for sampling-based tokenization", default=140) 88 | group.add_argument("--sample_count", type=int, help="number of samples for sampling-based tokenization", default=None) 89 | 90 | parser.add_argument("corpus", help="read corpus from CORPUS_DIR", metavar="CORPUS_DIR") 91 | 92 | args = parser.parse_args() 93 | 94 | if args.df_feats and args.ld_feats: 95 | parser.error("--df_feats and --ld_feats are mutually exclusive") 96 | 97 | corpus_name = os.path.basename(args.corpus) 98 | if args.model: 99 | model_dir = args.model 100 | else: 101 | model_dir = os.path.join('.', corpus_name+'.model') 102 | 103 | makedir(model_dir) 104 | 105 | langs_path = os.path.join(model_dir, 'lang_index') 106 | domains_path = os.path.join(model_dir, 'domain_index') 107 | index_path = os.path.join(model_dir, 'paths') 108 | 109 | # display paths 110 | print("corpus path:", args.corpus) 111 | print("model path:", model_dir) 112 | 113 | indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain, proportion=args.proportion, 114 | langs = args.lang, domains = args.domain) 115 | 116 | # Compute mappings between files, languages and domains 117 | lang_dist = indexer.dist_lang 118 | lang_index = indexer.lang_index 119 | lang_info = ' '.join(("{0}({1})".format(k, lang_dist[v]) for k,v in lang_index.items())) 120 | print("langs({0}): {1}".format(len(lang_dist), lang_info)) 121 | 122 | domain_dist = indexer.dist_domain 123 | domain_index = indexer.domain_index 124 | domain_info = ' '.join(("{0}({1})".format(k, domain_dist[v]) for k,v in domain_index.items())) 125 | print("domains({0}): {1}".format(len(domain_dist), domain_info)) 126 | 127 | print("identified {0} files".format(len(indexer.items))) 128 | 129 | items = [ (d,l,p) for (d,l,n,p) in indexer.items ] 130 | if args.debug: 131 | # output the language index 132 | with open(langs_path,'w') as f: 133 | writer = csv.writer(f) 134 | writer.writerows((l, lang_dist[lang_index[l]]) 135 
| for l in sorted(lang_index, key=lang_index.get)) 136 | 137 | # output the domain index 138 | with open(domains_path,'w') as f: 139 | writer = csv.writer(f) 140 | writer.writerows((d, domain_dist[domain_index[d]]) 141 | for d in sorted(domain_index, key=domain_index.get)) 142 | 143 | # output items found 144 | with open(index_path,'w') as f: 145 | writer = csv.writer(f) 146 | writer.writerows(items) 147 | 148 | if args.temp: 149 | buckets_dir = args.temp 150 | else: 151 | buckets_dir = os.path.join(model_dir, 'buckets') 152 | makedir(buckets_dir) 153 | 154 | bucketlist_path = os.path.join(model_dir, 'bucketlist') 155 | index_path = os.path.join(model_dir, 'paths') 156 | 157 | if args.ld_feats: 158 | # LD features are pre-specified. We are basically just building the NB model. 159 | LDfeats = read_features(args.ld_feats) 160 | 161 | else: 162 | # LD features not pre-specified, so we compute them. 163 | 164 | # Tokenize 165 | DFfeats = None 166 | print("will tokenize %d files" % len(items)) 167 | # TODO: Custom tokenizer if doing custom first-pass features 168 | if args.df_feats: 169 | print("reading custom features from:", args.df_feats) 170 | DFfeats = read_features(args.df_feats) 171 | print("building tokenizer for custom list of {0} features".format(len(DFfeats))) 172 | tk = Scanner(DFfeats) 173 | elif args.word: 174 | print("using word tokenizer") 175 | tk = str.split 176 | else: 177 | print("using byte NGram tokenizer, max_order: {0}".format(args.max_order)) 178 | tk = NGramTokenizer(1, args.max_order) 179 | 180 | # First-pass tokenization, used to determine DF of features 181 | b_dirs = build_index(items, tk, buckets_dir, args.buckets, args.jobs, args.chunksize, args.sample_count, args.sample_size) 182 | 183 | if args.debug: 184 | # output the paths to the buckets 185 | with open(bucketlist_path,'w') as f: 186 | for d in b_dirs: 187 | f.write(d+'\n') 188 | 189 | # We need to compute a tally if we are selecting features by DF, but also if 190 | # we want full debug output. 191 | if DFfeats is None or args.debug: 192 | # Compute DF per-feature 193 | doc_count = tally(b_dirs, args.jobs) 194 | if args.debug: 195 | doc_count_path = os.path.join(model_dir, 'DF_all') 196 | write_weights(doc_count, doc_count_path) 197 | print("wrote DF counts for all features to:", doc_count_path) 198 | 199 | if DFfeats is None: 200 | # Choose the first-stage features 201 | DFfeats = ngram_select(doc_count, args.max_order, args.df_tokens) 202 | doc_count = None 203 | 204 | if args.debug: 205 | feature_path = os.path.join(model_dir, 'DFfeats') 206 | write_features(DFfeats, feature_path) 207 | print('wrote features to "%s"' % feature_path ) 208 | 209 | # Dispose of the first-pass tokenize output as it is no longer 210 | # needed. 211 | if not args.debug: 212 | for b in b_dirs: 213 | shutil.rmtree(b) 214 | 215 | # Second-pass tokenization to only obtain counts for the selected features. 216 | # As the first-pass set is typically much larger than the second pass, it often 217 | # works out to be faster to retokenize the raw documents rather than iterate 218 | # over the first-pass counts. 
219 | DF_scanner = Scanner(DFfeats) 220 | b_dirs = build_index(items, DF_scanner, buckets_dir, args.buckets, args.jobs, args.chunksize) 221 | DF_scanner = None 222 | 223 | # Build vectors of domain and language distributions for use in IG calculation 224 | domain_dist_vec = numpy.array([ domain_dist[domain_index[d]] 225 | for d in sorted(domain_index, key=domain_index.get)], dtype=int) 226 | domain_dist = None 227 | lang_dist_vec = numpy.array([ lang_dist[lang_index[l]] 228 | for l in sorted(lang_index.keys(), key=lang_index.get)], dtype=int) 229 | lang_dist = None 230 | 231 | # Compute IG 232 | ig_params = [ 233 | ('lang', lang_dist_vec, '.lang', True), 234 | ] 235 | if not args.no_domain_ig: 236 | ig_params.append( ('domain', domain_dist_vec, '.domain', False) ) 237 | 238 | ig_vals = {} 239 | for label, dist, suffix, binarize in ig_params: 240 | print("Computing information gain for {0}".format(label)) 241 | ig = compute_IG(b_dirs, DFfeats, dist, binarize, suffix, args.jobs) 242 | if args.debug: 243 | weights_path = os.path.join(model_dir, 'IGweights' + suffix + ('.bin' if binarize else '')) 244 | write_weights(ig, weights_path) 245 | ig_vals[label] = dict((row[0], numpy.array(row[1].flat)) for row in ig) 246 | 247 | ig = None 248 | DFfeats = None 249 | # Select features according to the LD criteria 250 | features_per_lang = select_LD_features(ig_vals['lang'], ig_vals.get('domain'), args.feats_per_lang, ignore_domain = args.no_domain_ig) 251 | ig_vals = None 252 | LDfeats = set().union(*features_per_lang.values()) 253 | print('selected %d features' % len(LDfeats)) 254 | 255 | if args.debug: 256 | feature_path = os.path.join(model_dir, 'LDfeats') 257 | write_features(sorted(LDfeats), feature_path) 258 | print('wrote LD features to "%s"' % feature_path ) 259 | 260 | with open(feature_path + '.perlang', 'w') as f: 261 | writer = csv.writer(f) 262 | for i in range(len(features_per_lang)): 263 | writer.writerow(map(repr,features_per_lang[i])) 264 | 265 | print('wrote LD.perlang features to "%s"' % (feature_path + '.perlang')) 266 | features_per_lang = None 267 | 268 | # Compile a scanner for the LDfeats 269 | tk_nextmove, tk_output = build_scanner(LDfeats) 270 | if args.debug: 271 | scanner_path = feature_path + '.scanner' 272 | with open(scanner_path, 'wb') as f: 273 | pickle.dump((tk_nextmove, tk_output, LDfeats), f) 274 | 275 | print("wrote scanner to {0}".format(scanner_path)) 276 | 277 | LDfeats = None 278 | 279 | # Assemble the NB model 280 | langs = sorted(lang_index, key=lang_index.get) 281 | lang_index = None 282 | 283 | cm = generate_cm([ (l,p) for d,l,p in items], len(langs)) 284 | paths = list(zip(*items))[2] 285 | 286 | nb_classes = langs 287 | nb_pc = learn_pc(cm) 288 | nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, buckets_dir, args) 289 | 290 | # output the model 291 | output_path = os.path.join(model_dir, 'model') 292 | model = nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output 293 | string = base64.b64encode(bz2.compress(pickle.dumps(model))) 294 | with open(output_path, 'wb') as f: 295 | f.write(string) 296 | 297 | print("wrote model to %s (%d bytes)" % (output_path, len(string))) 298 | 299 | # remove buckets if debug is off. We don't generate buckets if ldfeats is supplied. 300 | if not args.debug and not args.ld_feats: 301 | for b in b_dirs: 302 | shutil.rmtree(b) 303 | if not args.temp: 304 | # Do not remove the buckets dir if temp was supplied as we don't know 305 | # if we created it. 
306 | shutil.rmtree(buckets_dir) 307 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/ 2 | [build-system] 3 | requires = ["setuptools>=61.0"] 4 | build-backend = "setuptools.build_meta" 5 | 6 | [project] 7 | name = "py3langid" 8 | description = "Fork of the language identification tool langid.py, featuring a modernized codebase and faster execution times." 9 | readme = "README.rst" 10 | license = { text = "BSD" } 11 | dynamic = ["version"] 12 | requires-python = ">=3.8" 13 | authors = [ 14 | {name = "Marco Lui"}, 15 | {name = "Adrien Barbaresi", email = "barbaresi@bbaw.de"} 16 | ] 17 | keywords=[ 18 | "language detection", 19 | "language identification", 20 | "langid", 21 | "langid.py" 22 | ] 23 | classifiers = [ 24 | # As from http://pypi.python.org/pypi?%3Aaction=list_classifiers 25 | 'Development Status :: 5 - Production/Stable', 26 | #'Development Status :: 6 - Mature', 27 | "Environment :: Console", 28 | "Intended Audience :: Developers", 29 | "Intended Audience :: Information Technology", 30 | "Intended Audience :: Science/Research", 31 | "License :: OSI Approved :: BSD License", 32 | "Operating System :: MacOS :: MacOS X", 33 | "Operating System :: Microsoft :: Windows", 34 | "Operating System :: POSIX :: Linux", 35 | "Programming Language :: Python", 36 | "Programming Language :: Python :: 3", 37 | "Programming Language :: Python :: 3.8", 38 | "Programming Language :: Python :: 3.9", 39 | "Programming Language :: Python :: 3.10", 40 | "Programming Language :: Python :: 3.11", 41 | "Programming Language :: Python :: 3.12", 42 | "Programming Language :: Python :: 3.13", 43 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 44 | "Topic :: Text Processing :: Linguistic", 45 | ] 46 | dependencies = [ 47 | "numpy >= 2.0.0 ; python_version >= '3.9'", 48 | "numpy >= 1.24.3 ; python_version == '3.8'", 49 | ] 50 | 51 | # https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html 52 | [tool.setuptools] 53 | packages = ["py3langid"] 54 | 55 | # https://packaging.python.org/en/latest/guides/single-sourcing-package-version/ 56 | [tool.setuptools.dynamic] 57 | version = {attr = "py3langid.__version__"} 58 | 59 | [tool.setuptools.package-data] 60 | py3langid = ["data/model.plzma"] 61 | 62 | [project.scripts] 63 | langid = "py3langid.langid:main" 64 | 65 | [project.urls] 66 | "Homepage" = "https://github.com/adbar/py3langid" 67 | "Blog" = "https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html" 68 | "Tracker" = "https://github.com/adbar/py3langid/issues" 69 | 70 | # Development extras 71 | [project.optional-dependencies] 72 | dev = [ 73 | "pytest", 74 | "pytest-cov", 75 | ] 76 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | -------------------------------------------------------------------------------- /tests/test_langid.py: -------------------------------------------------------------------------------- 1 | 2 | import subprocess 3 | import sys 4 | 5 | from io import StringIO 6 | from pathlib import Path 7 | 8 | import py3langid as langid 9 | from py3langid.langid import LanguageIdentifier, MODEL_FILE 10 | 11 | 12 | def test_langid(): 13 | '''Test if the language detection functions work''' 14 | 
# basic classification 15 | text = b'This text is in English.' 16 | assert langid.classify(text)[0] == 'en' 17 | assert langid.rank(text)[0][0] == 'en' 18 | text = 'This text is in English.' 19 | assert langid.classify(text)[0] == 'en' 20 | assert langid.rank(text)[0][0] == 'en' 21 | text = 'Test Unicode sur du texte en français' 22 | assert langid.classify(text)[0] == 'fr' 23 | assert langid.rank(text)[0][0] == 'fr' 24 | # other datatype 25 | assert langid.classify(text)[1] != langid.classify(text, datatype='uint32')[1] 26 | # normalization of probabilities 27 | identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True) 28 | _, normed_prob = identifier.classify(text) 29 | assert 0 <= normed_prob <= 1 30 | # probability not equal to 1 31 | _, normed_prob = identifier.classify('This potrebbe essere a test.') 32 | normed_prob == 0.8942321 33 | # not normalized 34 | identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=False) 35 | _, prob = identifier.classify(text) 36 | assert prob < 0 37 | # subset of target languages 38 | identifier.set_languages(['de', 'en', 'fr']) 39 | assert identifier.classify('这样不好')[0] != 'zh' 40 | 41 | 42 | 43 | def test_redirection(): 44 | '''Test if STDIN redirection works''' 45 | thisdir = Path(__file__).parent 46 | langid_path = str(thisdir.parent / 'py3langid' / 'langid.py') 47 | readme_path = str(thisdir.parent / 'README.rst') 48 | with open(readme_path, 'rb') as f: 49 | readme = f.read() 50 | result = subprocess.check_output(['python3', langid_path, '-n'], input=readme) 51 | assert b'en' in result and b'1.0' in result 52 | 53 | 54 | 55 | def test_cli(): 56 | '''Test console scripts entry point''' 57 | result = subprocess.check_output(['langid', '-n'], input=b'This should be enough text.') 58 | assert b'en' in result and b'1.0' in result 59 | result = subprocess.check_output(['langid', '-n', '-l', 'bg,en,uk'], input=b'This should be enough text.') 60 | assert b'en' in result and b'1.0' in result 61 | -------------------------------------------------------------------------------- /tests/test_server.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from unittest.mock import MagicMock 4 | 5 | import pytest 6 | 7 | from py3langid.langid import application 8 | 9 | 10 | @pytest.fixture 11 | def mock_start_response(): 12 | return MagicMock() 13 | 14 | def test_detect_put(mock_start_response): 15 | environ = { 16 | 'REQUEST_METHOD': 'PUT', 17 | 'CONTENT_LENGTH': 10, 18 | 'wsgi.input': MagicMock(read=lambda x: b'This is a test'), 19 | 'PATH_INFO': '/detect' 20 | } 21 | response = application(environ, mock_start_response) 22 | assert mock_start_response.call_args[0][0] == '200 OK' 23 | assert json.loads(response[0].decode('utf-8'))['responseData']['language'] == 'en' 24 | 25 | def test_detect_get(mock_start_response): 26 | environ = { 27 | 'REQUEST_METHOD': 'GET', 28 | 'QUERY_STRING': 'q=This+is+a+test', 29 | 'PATH_INFO': '/detect' 30 | } 31 | response = application(environ, mock_start_response) 32 | assert mock_start_response.call_args[0][0] == '200 OK' 33 | assert json.loads(response[0].decode('utf-8'))['responseData']['language'] == 'en' 34 | 35 | def test_detect_post(mock_start_response): 36 | environ = { 37 | 'REQUEST_METHOD': 'POST', 38 | 'CONTENT_LENGTH': 10, 39 | 'wsgi.input': MagicMock(read=lambda x: b'q=Hello+World'), 40 | 'PATH_INFO': '/detect' 41 | } 42 | response = application(environ, mock_start_response) 43 | assert mock_start_response.call_args[0][0] == 
'200 OK' 44 | assert json.loads(response[0].decode('utf-8'))['responseData']['language'] == 'en' 45 | 46 | def test_rank_put(mock_start_response): 47 | environ = { 48 | 'REQUEST_METHOD': 'PUT', 49 | 'CONTENT_LENGTH': 10, 50 | 'wsgi.input': MagicMock(read=lambda x: b'Hello World'), 51 | 'PATH_INFO': '/rank' 52 | } 53 | response = application(environ, mock_start_response) 54 | assert mock_start_response.call_args[0][0] == '200 OK' 55 | assert json.loads(response[0].decode('utf-8'))['responseData'] is not None 56 | 57 | def test_rank_get(mock_start_response): 58 | environ = { 59 | 'REQUEST_METHOD': 'GET', 60 | 'QUERY_STRING': 'q=Hello+World', 61 | 'PATH_INFO': '/rank' 62 | } 63 | response = application(environ, mock_start_response) 64 | assert mock_start_response.call_args[0][0] == '200 OK' 65 | assert json.loads(response[0].decode('utf-8'))['responseData'] is not None 66 | 67 | def test_rank_post(mock_start_response): 68 | environ = { 69 | 'REQUEST_METHOD': 'POST', 70 | 'CONTENT_LENGTH': 10, 71 | 'wsgi.input': MagicMock(read=lambda x: b'q=Hello+World'), 72 | 'PATH_INFO': '/rank' 73 | } 74 | response = application(environ, mock_start_response) 75 | assert mock_start_response.call_args[0][0] == '200 OK' 76 | assert json.loads(response[0].decode('utf-8'))['responseData'] is not None 77 | 78 | def test_invalid_method(mock_start_response): 79 | environ = { 80 | 'REQUEST_METHOD': 'DELETE', 81 | 'PATH_INFO': '/detect' 82 | } 83 | response = application(environ, mock_start_response) 84 | assert mock_start_response.call_args[0][0] == '405 Method Not Allowed' 85 | 86 | def test_invalid_path(mock_start_response): 87 | environ = { 88 | 'REQUEST_METHOD': 'GET', 89 | 'PATH_INFO': '/invalid' 90 | } 91 | response = application(environ, mock_start_response) 92 | assert mock_start_response.call_args[0][0] == '404 Not Found' 93 | 94 | def test_empty_path(mock_start_response): 95 | environ = { 96 | 'REQUEST_METHOD': 'GET', 97 | 'PATH_INFO': '' 98 | } 99 | response = application(environ, mock_start_response) 100 | assert mock_start_response.call_args[0][0] == '404 Not Found' 101 | 102 | def test_no_query_string(mock_start_response): 103 | environ = { 104 | 'REQUEST_METHOD': 'GET', 105 | 'PATH_INFO': '/detect' 106 | } 107 | response = application(environ, mock_start_response) 108 | assert mock_start_response.call_args[0][0] == '400 Unknown Status' 109 | assert json.loads(response[0].decode('utf-8'))['responseData'] is None 110 | --------------------------------------------------------------------------------
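
Note on the WSGI tests above: tests/test_server.py drives the application callable from py3langid.langid directly with hand-built environ dictionaries. As an illustration only, the following sketch serves that same callable over HTTP with the standard library and queries the /detect endpoint the tests exercise. It assumes py3langid is installed with its bundled model; the host, port and variable names are arbitrary choices for this example, not part of the repository.

    # Illustrative sketch: serve py3langid's WSGI app locally and query /detect.
    import json
    import threading
    from urllib.parse import quote
    from urllib.request import urlopen
    from wsgiref.simple_server import make_server

    from py3langid.langid import application

    server = make_server('localhost', 9008, application)  # port chosen arbitrarily
    threading.Thread(target=server.serve_forever, daemon=True).start()

    # GET /detect?q=... mirrors test_detect_get and returns JSON with responseData.language
    with urlopen('http://localhost:9008/detect?q=' + quote('This is a test')) as response:
        payload = json.loads(response.read().decode('utf-8'))

    print(payload['responseData']['language'])  # expected: 'en'
    server.shutdown()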