├── .github └── workflows │ ├── codeql-analysis.yml │ └── tests.yml ├── .gitignore ├── FEATURES ├── HISTORY.rst ├── LICENSE ├── README.rst ├── py3langid ├── __init__.py ├── data │ └── model.plzma ├── examples │ ├── _twokenize.py │ └── process_twitter.py ├── langid.py ├── tools │ ├── __init__.py │ ├── featWeights.py │ └── printfeats.py └── train │ ├── BLweight.py │ ├── DFfeatureselect.py │ ├── IGweight.py │ ├── LDfeatureselect.py │ ├── NBtrain.py │ ├── README │ ├── __init__.py │ ├── common.py │ ├── index.py │ ├── scanner.py │ ├── tokenize.py │ └── train.py ├── pyproject.toml └── tests ├── __init__.py ├── test_langid.py └── test_server.py /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '23 1 * * 1' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v2 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v1 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v1 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 
59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v1 71 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Tests 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | os: [ubuntu-latest] 20 | # https://github.com/actions/python-versions/blob/main/versions-manifest.json 21 | python-version: [3.8, 3.9, "3.10", "3.11", "3.12", "3.13-dev"] 22 | include: 23 | # other OS version necessary 24 | - os: macos-latest 25 | python-version: "3.10" 26 | - os: windows-latest 27 | python-version: "3.10" 28 | steps: 29 | # Python and pip setup 30 | - name: Set up Python ${{ matrix.python-version }} 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | 35 | - name: Upgrade pip 36 | run: python -m pip install --upgrade pip 37 | 38 | - name: Get pip cache dir 39 | id: pip-cache 40 | run: | 41 | echo "::set-output name=dir::$(pip cache dir)" 42 | 43 | - name: pip cache 44 | uses: actions/cache@v4 45 | with: 46 | path: ${{ steps.pip-cache.outputs.dir }} 47 | key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} 48 | restore-keys: | 49 | ${{ runner.os }}-pip- 50 | 51 | # package setup 52 | - uses: actions/checkout@v4 53 | 54 | - name: Install dependencies 55 | run: python -m pip install -e "." 56 | 57 | # tests 58 | - name: Test with pytest 59 | run: | 60 | python -m pip install pytest pytest-cov 61 | pytest --cov=./ --cov-report=xml 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # IDE settings 105 | .vscode/ 106 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | 0.3.0 6 | ----- 7 | 8 | * Modernized setup, dropped support for Python 3.6 & 3.7 9 | * Simplified inference code 10 | * Support for Numpy 2.0 11 | 12 | 13 | 0.2.2 14 | ----- 15 | 16 | * Fixed bug in probability normalization (#6) 17 | * Fully implemented data type argument in ``classify()`` 18 | * Adapted training scripts to Python3 (untested) 19 | 20 | 21 | 0.2.1 22 | ----- 23 | 24 | * Maintenance: update and simplify code 25 | 26 | 27 | 0.2.0 28 | ----- 29 | 30 | * Change Numpy data type for features (``uint32`` → ``uint16``) 31 | * Code cleaning 32 | 33 | 34 | 0.1.2 35 | ----- 36 | 37 | * Include data in non-wheel package versions 38 | 39 | 40 | 0.1.1 41 | ----- 42 | 43 | * Faster module loading 44 | * Extended tests and readme 45 | 46 | 47 | 0.1.0 48 | ----- 49 | 50 | * Fork re-packaged 51 | * Efficiency improvements in ``langid.py`` 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | py3langid - Language Identifier 2 | BSD 3-Clause License 3 | 4 | Modifications (fork): Copyright (c) 2021, Adrien Barbaresi. 5 | 6 | Original code: Copyright (c) 2011 Marco Lui . 7 | Based on research by Marco Lui and Tim Baldwin. 8 | 9 | All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without modification, are 12 | permitted provided that the following conditions are met: 13 | 14 | 1. Redistributions of source code must retain the above copyright notice, this 15 | list of conditions and the following disclaimer. 16 | 17 | 2. Redistributions in binary form must reproduce the above copyright notice, 18 | this list of conditions and the following disclaimer in the documentation 19 | and/or other materials provided with the distribution. 20 | 21 | 3. Neither the name of the copyright holder nor the names of its 22 | contributors may be used to endorse or promote products derived from 23 | this software without specific prior written permission. 
24 | 25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 26 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 27 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 28 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 29 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 30 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 31 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 32 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 33 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | ``py3langid`` 3 | ============= 4 | 5 | 6 | ``py3langid`` is a fork of the standalone language identification tool ``langid.py`` by Marco Lui. 7 | 8 | Original license: BSD-2-Clause. Fork license: BSD-3-Clause. 9 | 10 | 11 | 12 | Changes in this fork 13 | -------------------- 14 | 15 | Execution speed has been improved and the code base has been optimized for Python 3.6+: 16 | 17 | - Import: Loading the package (``import py3langid``) is about 30% faster 18 | - Startup: Loading the default classification model is 25-30x faster 19 | - Execution: Language detection with ``langid.classify`` is 5-6x faster on paragraphs (less on longer texts) 20 | 21 | For implementation details see this blog post: `How to make language detection with langid.py faster `_. 22 | 23 | For more information and older Python versions see `changelog `_. 24 | 25 | 26 | Usage 27 | ----- 28 | 29 | Drop-in replacement 30 | ~~~~~~~~~~~~~~~~~~~ 31 | 32 | 33 | 1. Install the package: 34 | 35 | * ``pip3 install py3langid`` (or ``pip`` where applicable) 36 | 37 | 2. Use it: 38 | 39 | * with Python: ``import py3langid as langid`` 40 | * on the command-line: ``langid`` 41 | 42 | 43 | With Python 44 | ~~~~~~~~~~~ 45 | 46 | Basics: 47 | 48 | .. code-block:: python 49 | 50 | >>> import py3langid as langid 51 | 52 | >>> text = 'This text is in English.' 53 | # identified language and probability 54 | >>> langid.classify(text) 55 | ('en', -56.77429) 56 | # unpack the result tuple in variables 57 | >>> lang, prob = langid.classify(text) 58 | # all potential languages 59 | >>> langid.rank(text) 60 | 61 | 62 | More options: 63 | 64 | .. code-block:: python 65 | 66 | >>> from py3langid.langid import LanguageIdentifier, MODEL_FILE 67 | 68 | # subset of target languages 69 | >>> identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE) 70 | >>> identifier.set_languages(['de', 'en', 'fr']) 71 | # this won't work well... 72 | >>> identifier.classify('这样不好') 73 | ('en', -81.831665) 74 | 75 | # normalization of probabilities to an interval between 0 and 1 76 | >>> identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True) 77 | >>> identifier.classify('This should be enough text.') 78 | ('en', 1.0) 79 | 80 | 81 | Note: the Numpy data type for the feature vector has been changed to optimize for speed. If results are inconsistent, try restoring the original setting: 82 | 83 | .. code-block:: python 84 | 85 | >>> langid.classify(text, datatype='uint32') 86 | 87 | 88 | On the command-line 89 | ~~~~~~~~~~~~~~~~~~~ 90 | 91 | .. 
code-block:: bash 92 | 93 | # basic usage with probability normalization 94 | $ echo "This should be enough text." | langid -n 95 | ('en', 1.0) 96 | 97 | # define a subset of target languages 98 | $ echo "This won't be recognized properly." | langid -n -l fr,it,tr 99 | ('it', 0.97038305) 100 | 101 | 102 | Legacy documentation 103 | -------------------- 104 | 105 | 106 | **The docs below are provided for reference, only part of the functions are currently tested and maintained.** 107 | 108 | 109 | Introduction 110 | ------------ 111 | 112 | ``langid.py`` is a standalone Language Identification (LangID) tool. 113 | 114 | The design principles are as follows: 115 | 116 | 1. Fast 117 | 2. Pre-trained over a large number of languages (currently 97) 118 | 3. Not sensitive to domain-specific features (e.g. HTML/XML markup) 119 | 4. Single .py file with minimal dependencies 120 | 5. Deployable as a web service 121 | 122 | All that is required to run ``langid.py`` is Python >= 3.6 and numpy. 123 | 124 | The accompanying training tools are still Python2-only. 125 | 126 | ``langid.py`` is WSGI-compliant. ``langid.py`` will use ``fapws3`` as a web server if 127 | available, and default to ``wsgiref.simple_server`` otherwise. 128 | 129 | ``langid.py`` comes pre-trained on 97 languages (ISO 639-1 codes given): 130 | 131 | af, am, an, ar, as, az, be, bg, bn, br, 132 | bs, ca, cs, cy, da, de, dz, el, en, eo, 133 | es, et, eu, fa, fi, fo, fr, ga, gl, gu, 134 | he, hi, hr, ht, hu, hy, id, is, it, ja, 135 | jv, ka, kk, km, kn, ko, ku, ky, la, lb, 136 | lo, lt, lv, mg, mk, ml, mn, mr, ms, mt, 137 | nb, ne, nl, nn, no, oc, or, pa, pl, ps, 138 | pt, qu, ro, ru, rw, se, si, sk, sl, sq, 139 | sr, sv, sw, ta, te, th, tl, tr, ug, uk, 140 | ur, vi, vo, wa, xh, zh, zu 141 | 142 | The training data was drawn from 5 different sources: 143 | 144 | * JRC-Acquis 145 | * ClueWeb 09 146 | * Wikipedia 147 | * Reuters RCV2 148 | * Debian i18n 149 | 150 | 151 | Usage 152 | ----- 153 | 154 | langid [options] 155 | 156 | optional arguments: 157 | -h, --help show this help message and exit 158 | -s, --serve launch web service 159 | --host=HOST host/ip to bind to 160 | --port=PORT port to listen on 161 | -v increase verbosity (repeat for greater effect) 162 | -m MODEL load model from file 163 | -l LANGS, --langs=LANGS 164 | comma-separated set of target ISO639 language codes 165 | (e.g en,de) 166 | -r, --remote auto-detect IP address for remote access 167 | -b, --batch specify a list of files on the command line 168 | -d, --dist show full distribution over languages 169 | -u URL, --url=URL langid of URL 170 | --line process pipes line-by-line rather than as a document 171 | -n, --normalize normalize confidence scores to probability values 172 | 173 | 174 | The simplest way to use ``langid.py`` is as a command-line tool, and you can 175 | invoke using ``python langid.py``. If you installed ``langid.py`` as a Python 176 | module (e.g. via ``pip install langid``), you can invoke ``langid`` instead of 177 | ``python langid.py -n`` (the two are equivalent). This will cause a prompt to 178 | display. 
Enter text to identify, and hit enter:: 179 | 180 | >>> This is a test 181 | ('en', -54.41310358047485) 182 | >>> Questa e una prova 183 | ('it', -35.41771221160889) 184 | 185 | 186 | ``langid.py`` can also detect when the input is redirected (only tested under Linux), and in this 187 | case will process until EOF rather than until newline like in interactive mode:: 188 | 189 | python langid.py < README.rst 190 | ('en', -22552.496054649353) 191 | 192 | 193 | The value returned is the unnormalized probability estimate for the language. Calculating 194 | the exact probability estimate is disabled by default, but can be enabled through a flag:: 195 | 196 | python langid.py -n < README.rst 197 | ('en', 1.0) 198 | 199 | More details are provided in this README in the section on `Probability Normalization`. 200 | 201 | You can also use ``langid.py`` as a Python library:: 202 | 203 | # python 204 | Python 2.7.2+ (default, Oct 4 2011, 20:06:09) 205 | [GCC 4.6.1] on linux2 206 | Type "help", "copyright", "credits" or "license" for more information. 207 | >>> import langid 208 | >>> langid.classify("This is a test") 209 | ('en', -54.41310358047485) 210 | 211 | Finally, ``langid.py`` can use Python's built-in ``wsgiref.simple_server`` (or ``fapws3`` if available) to 212 | provide language identification as a web service. To do this, launch ``python langid.py -s``, and 213 | access http://localhost:9008/detect . The web service supports GET, POST and PUT. If GET is performed 214 | with no data, a simple HTML forms interface is displayed. 215 | 216 | The response is generated in JSON, here is an example:: 217 | 218 | {"responseData": {"confidence": -54.41310358047485, "language": "en"}, "responseDetails": null, "responseStatus": 200} 219 | 220 | A utility such as curl can be used to access the web service:: 221 | 222 | # curl -d "q=This is a test" localhost:9008/detect 223 | {"responseData": {"confidence": -54.41310358047485, "language": "en"}, "responseDetails": null, "responseStatus": 200} 224 | 225 | You can also use HTTP PUT:: 226 | 227 | # curl -T readme.rst localhost:9008/detect 228 | % Total % Received % Xferd Average Speed Time Time Time Current 229 | Dload Upload Total Spent Left Speed 230 | 100 2871 100 119 100 2752 117 2723 0:00:01 0:00:01 --:--:-- 2727 231 | {"responseData": {"confidence": -22552.496054649353, "language": "en"}, "responseDetails": null, "responseStatus": 200} 232 | 233 | If no "q=XXX" key-value pair is present in the HTTP POST payload, ``langid.py`` will interpret the entire 234 | file as a single query. This allows for redirection via curl:: 235 | 236 | # echo "This is a test" | curl -d @- localhost:9008/detect 237 | {"responseData": {"confidence": -54.41310358047485, "language": "en"}, "responseDetails": null, "responseStatus": 200} 238 | 239 | ``langid.py`` will attempt to discover the host IP address automatically. Often, this is set to localhost(127.0.1.1), even 240 | though the machine has a different external IP address. ``langid.py`` can attempt to automatically discover the external 241 | IP address. To enable this functionality, start ``langid.py`` with the ``-r`` flag. 
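
For scripted access, the same ``/detect`` endpoint can be queried directly from Python. The
snippet below is only a sketch: it assumes the service has been started locally on the default
port (9008) and reuses the ``q`` parameter shown in the curl examples above::

    import json
    from urllib.parse import urlencode
    from urllib.request import urlopen

    # pass the text to classify as the "q" parameter of a GET request
    url = "http://localhost:9008/detect?" + urlencode({"q": "This is a test"})
    with urlopen(url) as response:
        result = json.loads(response.read().decode("utf-8"))

    # the JSON layout is the same as in the curl examples
    print(result["responseData"]["language"], result["responseData"]["confidence"])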
242 | 243 | ``langid.py`` supports constraining of the output language set using the ``-l`` flag and a comma-separated list of ISO639-1 244 | language codes (the ``-n`` flag enables probability normalization):: 245 | 246 | # python langid.py -n -l it,fr 247 | >>> Io non parlo italiano 248 | ('it', 0.99999999988965627) 249 | >>> Je ne parle pas français 250 | ('fr', 1.0) 251 | >>> I don't speak english 252 | ('it', 0.92210605672341062) 253 | 254 | When using ``langid.py`` as a library, the set_languages method can be used to constrain the language set:: 255 | 256 | python 257 | Python 2.7.2+ (default, Oct 4 2011, 20:06:09) 258 | [GCC 4.6.1] on linux2 259 | Type "help", "copyright", "credits" or "license" for more information. 260 | >>> import langid 261 | >>> langid.classify("I do not speak english") 262 | ('en', 0.57133487679900674) 263 | >>> langid.set_languages(['de','fr','it']) 264 | >>> langid.classify("I do not speak english") 265 | ('it', 0.99999835791478453) 266 | >>> langid.set_languages(['en','it']) 267 | >>> langid.classify("I do not speak english") 268 | ('en', 0.99176190378750373) 269 | 270 | 271 | Batch Mode 272 | ---------- 273 | 274 | ``langid.py`` supports batch mode processing, which can be invoked with the ``-b`` flag. 275 | In this mode, ``langid.py`` reads a list of paths to files to classify as arguments. 276 | If no arguments are supplied, ``langid.py`` reads the list of paths from ``stdin``, 277 | this is useful for using ``langid.py`` with UNIX utilities such as ``find``. 278 | 279 | In batch mode, ``langid.py`` uses ``multiprocessing`` to invoke multiple instances of 280 | the classifier, utilizing all available CPUs to classify documents in parallel. 281 | 282 | 283 | Probability Normalization 284 | ------------------------- 285 | 286 | The probabilistic model implemented by ``langid.py`` involves the multiplication of a 287 | large number of probabilities. For computational reasons, the actual calculations are 288 | implemented in the log-probability space (a common numerical technique for dealing with 289 | vanishingly small probabilities). One side-effect of this is that it is not necessary to 290 | compute a full probability in order to determine the most probable language in a set 291 | of candidate languages. However, users sometimes find it helpful to have a "confidence" 292 | score for the probability prediction. Thus, ``langid.py`` implements a re-normalization 293 | that produces an output in the 0-1 range. 294 | 295 | ``langid.py`` disables probability normalization by default. For 296 | command-line usages of ``langid.py``, it can be enabled by passing the ``-n`` flag. For 297 | probability normalization in library use, the user must instantiate their own 298 | ``LanguageIdentifier``. An example of such usage is as follows:: 299 | 300 | >> from py3langid.langid import LanguageIdentifier, MODEL_FILE 301 | >> identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True) 302 | >> identifier.classify("This is a test") 303 | ('en', 0.9999999909903544) 304 | 305 | 306 | Training a model 307 | ---------------- 308 | 309 | So far Python 2.7 only, see the `original instructions `_. 310 | 311 | 312 | Read more 313 | --------- 314 | 315 | ``langid.py`` is based on published research. [1] describes the LD feature selection technique in detail, 316 | and [2] provides more detail about the module ``langid.py`` itself. 
317 | 318 | [1] Lui, Marco and Timothy Baldwin (2011) Cross-domain Feature Selection for Language Identification, 319 | In Proceedings of the Fifth International Joint Conference on Natural Language Processing (IJCNLP 2011), 320 | Chiang Mai, Thailand, pp. 553—561. Available from http://www.aclweb.org/anthology/I11-1062 321 | 322 | [2] Lui, Marco and Timothy Baldwin (2012) langid.py: An Off-the-shelf Language Identification Tool, 323 | In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL 2012), 324 | Demo Session, Jeju, Republic of Korea. Available from www.aclweb.org/anthology/P12-3005 325 | -------------------------------------------------------------------------------- /py3langid/__init__.py: -------------------------------------------------------------------------------- 1 | from .langid import classify, rank, set_languages 2 | 3 | __version__ = '0.3.0' 4 | -------------------------------------------------------------------------------- /py3langid/data/model.plzma: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbar/py3langid/812f2055f74c35dea298f30b434644062d9289be/py3langid/data/model.plzma -------------------------------------------------------------------------------- /py3langid/examples/_twokenize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Twokenize -- a tokenizer designed for Twitter text in English and some other European languages. 4 | This tokenizer code has gone through a long history: 5 | 6 | (1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif 7 | TweetMotif: Exploratory Search and Topic Summarization for Twitter. 8 | Brendan O'Connor, Michel Krieger, and David Ahn. 9 | ICWSM-2010 (demo track), http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf 10 | (2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger 11 | (2b) Jason Baldridge and David Snyder ported it to Scala 12 | (3) Brendan bugfixed the Scala port and merged with POS-specific changes 13 | for the CMU ARK Twitter POS Tagger 14 | (4) Tobi Owoputi ported it back to Java and added many improvements (2012-06) 15 | 16 | Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP 17 | 18 | There have been at least 2 other Java ports, but they are not in the lineage for the code here. 19 | 20 | Ported to Python by Myle Ott . 21 | """ 22 | 23 | from __future__ import print_function 24 | 25 | import operator 26 | import re 27 | import HTMLParser 28 | 29 | def regex_or(*items): 30 | return '(?:' + '|'.join(items) + ')' 31 | 32 | Contractions = re.compile(u"(?i)(\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$", re.UNICODE) 33 | Whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE) 34 | 35 | punctChars = r"['\"“”‘’.?!…,:;]" 36 | #punctSeq = punctChars+"+" #'anthem'. => ' anthem '. 37 | punctSeq = r"['\"“”‘’]+|[.?!,…]+|[:;]+" #'anthem'. => ' anthem ' . 38 | entity = r"&(?:amp|lt|gt|quot);" 39 | # URLs 40 | 41 | 42 | # BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong. 43 | # If you actually empirically test it the results are bad. 
44 | # Please see https://github.com/brendano/ark-tweet-nlp/pull/9 45 | 46 | urlStart1 = r"(?:https?://|\bwww\.)" 47 | commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)" 48 | ccTLDs = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \ 49 | r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \ 50 | r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \ 51 | r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \ 52 | r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \ 53 | r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \ 54 | r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \ 55 | r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)" #TODO: remove obscure country domains? 56 | urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)" 57 | urlBody = r"(?:[^\.\s<>][^\s<>]*?)?" 58 | urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?" 59 | urlEnd = r"(?:\.\.+|[<>]|\s|$)" 60 | url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")" 61 | 62 | 63 | # Numeric 64 | timeLike = r"\d+(?::\d+){1,2}" 65 | #numNum = r"\d+\.\d+" 66 | numberWithCommas = r"(?:(?|>)[\._-]+(?:<|<|>|>)" 102 | s5 = "(?:[.][_]+[.])" 103 | # myleott: in Python the (?i) flag affects the whole expression 104 | #basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5 105 | basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5 106 | 107 | eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+" 108 | eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+".encode('utf-8') 109 | eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]" 110 | eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight 111 | 112 | oOEmote = r"(?:[oO]" + bfCenter + r"[oO])" 113 | 114 | 115 | emoticon = regex_or( 116 | # Standard version :) :( :] :D :P 117 | "(?:>|>)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths), 118 | 119 | # reversed version (: D: use positive lookbehind to remove "(word):" 120 | # because eyes on the right side is more ambiguous with the standard usage of : ; 121 | regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|<)?", 122 | 123 | #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style 124 | eastEmote.replace("2", "1", 1), basicface, 125 | # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb] 126 | # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this 127 | 128 | # myleott: o.O and O.o are two of the biggest sources of differences 129 | # between this and the Java version. One little hack won't hurt... 
130 | oOEmote 131 | ) 132 | 133 | Hearts = "(?:<+/?3+)+" #the other hearts are in decorations 134 | 135 | Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8')) 136 | 137 | # BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes 138 | # "hello (#hashtag)" ==> "hello (#hashtag )" WRONG 139 | # "hello (#hashtag)" ==> "hello ( #hashtag )" RIGHT 140 | # "hello (@person)" ==> "hello (@person )" WRONG 141 | # "hello (@person)" ==> "hello ( @person )" RIGHT 142 | # ... Some sort of weird interaction with edgepunct I guess, because edgepunct 143 | # has poor content-symbol detection. 144 | 145 | # This also gets #1 #40 which probably aren't hashtags .. but good as tokens. 146 | # If you want good hashtag identification, use a different regex. 147 | Hashtag = "#[a-zA-Z0-9_]+" #optional: lookbehind for \b 148 | #optional: lookbehind for \b, max length 15 149 | AtMention = "[@@][a-zA-Z0-9_]+" 150 | 151 | # I was worried this would conflict with at-mentions 152 | # but seems ok in sample of 5800: 7 changes all email fixes 153 | # http://www.regular-expressions.info/email.html 154 | Bound = r"(?:\W|^|$)" 155 | Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")" 156 | 157 | # We will be tokenizing using these regexps as delimiters 158 | # Additionally, these things are "protected", meaning they shouldn't be further split themselves. 159 | Protected = re.compile( 160 | unicode(regex_or( 161 | Hearts, 162 | url, 163 | Email, 164 | timeLike, 165 | #numNum, 166 | numberWithCommas, 167 | numComb, 168 | emoticon, 169 | Arrows, 170 | entity, 171 | punctSeq, 172 | arbitraryAbbrev, 173 | separators, 174 | decorations, 175 | embeddedApostrophe, 176 | Hashtag, 177 | AtMention 178 | ).decode('utf-8')), re.UNICODE) 179 | 180 | # Edge punctuation 181 | # Want: 'foo' => ' foo ' 182 | # While also: don't => don't 183 | # the first is considered "edge punctuation". 184 | # the second is word-internal punctuation -- don't want to mess with it. 185 | # BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days. 186 | # I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate. 187 | 188 | # Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes) 189 | #edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols) 190 | edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols) 191 | edgePunct = "[" + edgePunctChars + "]" 192 | notEdgePunct = "[a-zA-Z0-9]" # content characters 193 | offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):" 194 | EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE) 195 | EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE) 196 | 197 | def splitEdgePunct(input): 198 | input = EdgePunctLeft.sub(r"\1\2 \3", input) 199 | input = EdgePunctRight.sub(r"\1 \2\3", input) 200 | return input 201 | 202 | # The main work of tokenizing a tweet. 203 | def simpleTokenize(text): 204 | 205 | # Do the no-brainers first 206 | splitPunctText = splitEdgePunct(text) 207 | 208 | textLength = len(splitPunctText) 209 | 210 | # BTO: the logic here got quite convoluted via the Scala porting detour 211 | # It would be good to switch back to a nice simple procedural style like in the Python version 212 | # ... Scala is such a pain. Never again. 
213 | 214 | # Find the matches for subsequences that should be protected, 215 | # e.g. URLs, 1.0, U.N.K.L.E., 12:53 216 | bads = [] 217 | badSpans = [] 218 | for match in Protected.finditer(splitPunctText): 219 | # The spans of the "bads" should not be split. 220 | if (match.start() != match.end()): #unnecessary? 221 | bads.append( [splitPunctText[match.start():match.end()]] ) 222 | badSpans.append( (match.start(), match.end()) ) 223 | 224 | # Create a list of indices to create the "goods", which can be 225 | # split. We are taking "bad" spans like 226 | # List((2,5), (8,10)) 227 | # to create 228 | # List(0, 2, 5, 8, 10, 12) 229 | # where, e.g., "12" here would be the textLength 230 | # has an even length and no indices are the same 231 | indices = [0] 232 | for (first, second) in badSpans: 233 | indices.append(first) 234 | indices.append(second) 235 | indices.append(textLength) 236 | 237 | # Group the indices and map them to their respective portion of the string 238 | splitGoods = [] 239 | for i in range(0, len(indices), 2): 240 | goodstr = splitPunctText[indices[i]:indices[i+1]] 241 | splitstr = goodstr.strip().split(" ") 242 | splitGoods.append(splitstr) 243 | 244 | # Reinterpolate the 'good' and 'bad' Lists, ensuring that 245 | # additonal tokens from last good item get included 246 | zippedStr = [] 247 | for i in range(len(bads)): 248 | zippedStr = addAllnonempty(zippedStr, splitGoods[i]) 249 | zippedStr = addAllnonempty(zippedStr, bads[i]) 250 | zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)]) 251 | 252 | # BTO: our POS tagger wants "ur" and "you're" to both be one token. 253 | # Uncomment to get "you 're" 254 | #splitStr = [] 255 | #for tok in zippedStr: 256 | # splitStr.extend(splitToken(tok)) 257 | #zippedStr = splitStr 258 | 259 | return zippedStr 260 | 261 | def addAllnonempty(master, smaller): 262 | for s in smaller: 263 | strim = s.strip() 264 | if (len(strim) > 0): 265 | master.append(strim) 266 | return master 267 | 268 | # "foo bar " => "foo bar" 269 | def squeezeWhitespace(input): 270 | return Whitespace.sub(" ", input).strip() 271 | 272 | # Final pass tokenization based on special patterns 273 | def splitToken(token): 274 | m = Contractions.search(token) 275 | if m: 276 | return [m.group(1), m.group(2)] 277 | return [token] 278 | 279 | # Assume 'text' has no HTML escaping. 280 | def tokenize(text): 281 | return simpleTokenize(squeezeWhitespace(text)) 282 | 283 | 284 | # Twitter text comes HTML-escaped, so unescape it. 285 | # We also first unescape &'s, in case the text has been buggily double-escaped. 286 | def normalizeTextForTagger(text): 287 | text = text.replace("&", "&") 288 | text = HTMLParser.HTMLParser().unescape(text) 289 | return text 290 | 291 | # This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger. 292 | # 293 | # This function normalizes the input text BEFORE calling the tokenizer. 294 | # So the tokens you get back may not exactly correspond to 295 | # substrings of the original text. 296 | def tokenizeRawTweetText(text): 297 | return tokenize(normalizeTextForTagger(text)) 298 | -------------------------------------------------------------------------------- /py3langid/examples/process_twitter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example for using langid.py to identify the language of messages 3 | on a twitter livestream. Optionally, it can also filter messages 4 | and display only those in a target language(s). 
5 | 6 | Expects a Twitterstream on STDIN, such as the one provided by: 7 | 8 | # curl https://stream.twitter.com/1/statuses/sample.json -u -s 9 | 10 | Outputs lang:message one-per-line to STDOUT 11 | 12 | Marco Lui, June 2012 13 | """ 14 | 15 | import sys 16 | import langid 17 | import json 18 | import optparse 19 | import re 20 | 21 | import _twokenize 22 | 23 | 24 | to_clean = re.compile(_twokenize.regex_or( 25 | _twokenize.Hearts, 26 | _twokenize.url, 27 | _twokenize.Email, 28 | _twokenize.emoticon, 29 | _twokenize.Arrows, 30 | _twokenize.entity, 31 | _twokenize.decorations, 32 | _twokenize.Hashtag, 33 | _twokenize.AtMention, 34 | ).decode('utf8'), re.UNICODE) 35 | 36 | 37 | def clean_tweet(text): 38 | return to_clean.sub('', text) 39 | 40 | 41 | def squeeze_whitespace(text): 42 | return re.sub('\s+', ' ', text) 43 | 44 | 45 | if __name__ == "__main__": 46 | parser = optparse.OptionParser() 47 | parser.add_option('-l', '--langs', dest='langs', help='comma-separated set of target ISO639 language codes (e.g en,de)') 48 | opts, args = parser.parse_args() 49 | 50 | lang_set = set(opts.langs.split(",")) if opts.langs else None 51 | 52 | try: 53 | for line in sys.stdin: 54 | j = json.loads(line) 55 | if j.get('retweet_count') == 0: 56 | text = j.get('text') 57 | if text: 58 | lang, conf = langid.classify(clean_tweet(text)) 59 | if lang_set is None or lang in lang_set: 60 | print "{0}: {1}".format(lang, squeeze_whitespace(text).encode('utf8')) 61 | except (IOError, KeyboardInterrupt): 62 | # Terminate on broken pipe or ^C 63 | pass 64 | -------------------------------------------------------------------------------- /py3langid/langid.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file bundles language identification functions. 3 | 4 | Modifications (fork): Copyright (c) 2021, Adrien Barbaresi. 5 | 6 | Original code: Copyright (c) 2011 Marco Lui . 7 | Based on research by Marco Lui and Tim Baldwin. 8 | 9 | See LICENSE file for more info. 10 | """ 11 | 12 | import bz2 13 | import json 14 | import logging 15 | import lzma 16 | import pickle 17 | 18 | from base64 import b64decode 19 | from collections import Counter 20 | from operator import itemgetter 21 | from pathlib import Path 22 | from urllib.parse import parse_qs 23 | 24 | import numpy as np 25 | 26 | 27 | LOGGER = logging.getLogger(__name__) 28 | 29 | # model defaults 30 | IDENTIFIER = None 31 | MODEL_FILE = 'data/model.plzma' 32 | NORM_PROBS = False # Normalize output probabilities. 33 | # NORM_PROBS defaults to False for a small speed increase. It does not 34 | # affect the relative ordering of the predicted classes. It can be 35 | # re-enabled at runtime - see the readme. 36 | 37 | # quantization: faster but less precise 38 | DATATYPE = "uint16" 39 | 40 | 41 | def load_model(path=None): 42 | """ 43 | Convenience method to set the global identifier using a model at a 44 | specified path. 45 | 46 | @param path to model 47 | """ 48 | LOGGER.debug('initializing identifier') 49 | global IDENTIFIER 50 | if path is None: 51 | IDENTIFIER = LanguageIdentifier.from_pickled_model(MODEL_FILE) 52 | else: 53 | IDENTIFIER = LanguageIdentifier.from_modelpath(path) 54 | 55 | 56 | def set_languages(langs=None): 57 | """ 58 | Set the language set used by the global identifier. 
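    Passing None (the default) restores the full language set of the loaded model.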
59 | 60 | @param langs a list of language codes 61 | """ 62 | if IDENTIFIER is None: 63 | load_model() 64 | return IDENTIFIER.set_languages(langs) 65 | 66 | 67 | def classify(instance, datatype=DATATYPE): 68 | """ 69 | Convenience method using a global identifier instance with the default 70 | model included in langid.py. Identifies the language that a string is 71 | written in. 72 | 73 | @param instance a text string. Unicode strings will automatically be utf8-encoded 74 | @returns a tuple of the most likely language and the confidence score 75 | """ 76 | if IDENTIFIER is None: 77 | load_model() 78 | return IDENTIFIER.classify(instance, datatype=datatype) 79 | 80 | 81 | def rank(instance): 82 | """ 83 | Convenience method using a global identifier instance with the default 84 | model included in langid.py. Ranks all the languages in the model according 85 | to the likelihood that the string is written in each language. 86 | 87 | @param instance a text string. Unicode strings will automatically be utf8-encoded 88 | @returns a list of tuples language and the confidence score, in descending order 89 | """ 90 | if IDENTIFIER is None: 91 | load_model() 92 | return IDENTIFIER.rank(instance) 93 | 94 | 95 | def cl_path(path): 96 | """ 97 | Convenience method using a global identifier instance with the default 98 | model included in langid.py. Identifies the language that the file at `path` is 99 | written in. 100 | 101 | @param path path to file 102 | @returns a tuple of the most likely language and the confidence score 103 | """ 104 | if IDENTIFIER is None: 105 | load_model() 106 | return IDENTIFIER.cl_path(path) 107 | 108 | 109 | def rank_path(path): 110 | """ 111 | Convenience method using a global identifier instance with the default 112 | model included in langid.py. Ranks all the languages in the model according 113 | to the likelihood that the file at `path` is written in each language. 114 | 115 | @param path path to file 116 | @returns a list of tuples language and the confidence score, in descending order 117 | """ 118 | if IDENTIFIER is None: 119 | load_model() 120 | return IDENTIFIER.rank_path(path) 121 | 122 | 123 | class LanguageIdentifier: 124 | """ 125 | This class implements the actual language identifier. 
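
    Typical use, mirroring the examples in the readme (a sketch, scores omitted):

        identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True)
        identifier.set_languages(['de', 'en', 'fr'])
        lang, prob = identifier.classify('This is a test.')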
126 | """ 127 | __slots__ = ['nb_ptc', 'nb_pc', 'nb_numfeats', 'nb_classes', 'tk_nextmove', 'tk_output', 128 | 'norm_probs', '__full_model'] 129 | 130 | # new version: speed-up 131 | @classmethod 132 | def from_pickled_model(cls, pickled_file, *args, **kwargs): 133 | # load data 134 | filepath = str(Path(__file__).parent / pickled_file) 135 | with lzma.open(filepath) as filehandle: 136 | nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = pickle.load(filehandle) 137 | nb_numfeats = len(nb_ptc) // len(nb_pc) 138 | 139 | # reconstruct pc and ptc 140 | nb_pc = np.array(nb_pc) 141 | nb_ptc = np.array(nb_ptc).reshape(nb_numfeats, len(nb_pc)) 142 | 143 | return cls(nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output, *args, **kwargs) 144 | 145 | # legacy methods 146 | @classmethod 147 | def from_modelstring(cls, string, *args, **kwargs): 148 | # load data 149 | nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = pickle.loads(bz2.decompress(b64decode(string))) 150 | nb_numfeats = len(nb_ptc) // len(nb_pc) 151 | 152 | # reconstruct pc and ptc 153 | nb_pc = np.array(nb_pc) 154 | nb_ptc = np.array(nb_ptc).reshape(nb_numfeats, len(nb_pc)) 155 | 156 | return cls(nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output, *args, **kwargs) 157 | 158 | @classmethod 159 | def from_modelpath(cls, path, *args, **kwargs): 160 | with open(path, 'rb') as f: 161 | return cls.from_modelstring(f.read(), *args, **kwargs) 162 | 163 | def __init__(self, nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output, 164 | norm_probs=NORM_PROBS): 165 | self.nb_ptc = nb_ptc 166 | self.nb_pc = nb_pc 167 | self.nb_numfeats = nb_numfeats 168 | self.nb_classes = nb_classes 169 | self.tk_nextmove = tk_nextmove 170 | self.tk_output = tk_output 171 | 172 | def apply_norm_probs(pd): 173 | """ 174 | Renormalize log-probs into a proper distribution (sum 1) 175 | The technique for dealing with underflow is described in 176 | http://jblevins.org/log/log-sum-exp 177 | """ 178 | if norm_probs: 179 | # Ignore overflow when computing the exponential. Large values 180 | # in the exp produce a result of inf, which does not affect 181 | # the correctness of the calculation (as 1/x->0 as x->inf). 182 | # On Linux this does not actually trigger a warning, but on 183 | # Windows this causes a RuntimeWarning, so we explicitly 184 | # suppress it. 185 | with np.errstate(over='ignore'): 186 | # legacy formula, there are possibly better alternatives 187 | pd = 1/np.exp(pd[None,:] - pd[:,None]).sum(1) 188 | return pd 189 | 190 | self.norm_probs = apply_norm_probs 191 | 192 | # Maintain a reference to the full model, in case we change our language set 193 | # multiple times. 194 | self.__full_model = nb_ptc, nb_pc, nb_classes 195 | 196 | def set_languages(self, langs=None): 197 | LOGGER.debug("restricting languages to: %s", langs) 198 | 199 | # Unpack the full original model. This is needed in case the language set 200 | # has been previously trimmed, and the new set is not a subset of the current 201 | # set. 202 | nb_ptc, nb_pc, nb_classes = self.__full_model 203 | 204 | if langs is None: 205 | self.nb_classes, self.nb_ptc, self.nb_pc = nb_classes, nb_ptc, nb_pc 206 | 207 | else: 208 | # We were passed a restricted set of languages. Trim the arrays accordingly 209 | # to speed up processing. 
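            # The block below first validates the requested codes, then builds a boolean
            # mask over nb_classes and keeps only the matching columns of nb_ptc and
            # entries of nb_pc.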
210 | for lang in langs: 211 | if lang not in nb_classes: 212 | raise ValueError(f"Unknown language code {lang}") 213 | 214 | subset_mask = np.isin(nb_classes, langs) 215 | self.nb_classes = [c for c in nb_classes if c in langs] 216 | self.nb_ptc = nb_ptc[:, subset_mask] 217 | self.nb_pc = nb_pc[subset_mask] 218 | 219 | def instance2fv(self, text, datatype=DATATYPE): 220 | """ 221 | Map an instance into the feature space of the trained model. 222 | 223 | @param datatype NumPy data type (originally uint32) 224 | """ 225 | # convert to binary if it isn't already the case 226 | if isinstance(text, str): 227 | # fix for surrogates on Windows/NT platforms 228 | text = text.encode('utf8', errors='surrogatepass') 229 | 230 | # Convert the text to a sequence of ascii values and 231 | # Count the number of times we enter each state 232 | state, indexes = 0, [] 233 | extend = indexes.extend 234 | 235 | for letter in text: 236 | state = self.tk_nextmove[(state << 8) + letter] 237 | extend(self.tk_output.get(state, [])) 238 | 239 | # datatype: consider that less feature counts are going to be needed 240 | arr = np.zeros(self.nb_numfeats, dtype=datatype) 241 | # Update all the productions corresponding to the state 242 | for index, value in Counter(indexes).items(): 243 | arr[index] = value 244 | 245 | return arr 246 | 247 | def nb_classprobs(self, fv): 248 | # compute the partial log-probability of the document given each class 249 | pdc = np.dot(fv, self.nb_ptc) # fv @ self.nb_ptc 250 | # compute the partial log-probability of the document in each class 251 | return pdc + self.nb_pc 252 | 253 | def classify(self, text, datatype=DATATYPE): 254 | """ 255 | Classify an instance. 256 | """ 257 | fv = self.instance2fv(text, datatype=datatype) 258 | probs = self.norm_probs(self.nb_classprobs(fv)) 259 | cl = np.argmax(probs) 260 | return self.nb_classes[cl], probs[cl] 261 | 262 | def rank(self, text): 263 | """ 264 | Return a list of languages in order of likelihood. 265 | """ 266 | fv = self.instance2fv(text) 267 | probs = self.norm_probs(self.nb_classprobs(fv)) 268 | return sorted(zip(self.nb_classes, probs), key=itemgetter(1), reverse=True) 269 | 270 | def cl_path(self, path): 271 | """ 272 | Classify a file at a given path 273 | """ 274 | with open(path, 'rb') as f: 275 | retval = self.classify(f.read()) 276 | return path, retval 277 | 278 | def rank_path(self, path): 279 | """ 280 | Class ranking for a file at a given path 281 | """ 282 | with open(path, 'rb') as f: 283 | retval = self.rank(f.read()) 284 | return path, retval 285 | 286 | 287 | class NumpyEncoder(json.JSONEncoder): 288 | """ Custom encoder for numpy data types """ 289 | def default(self, o): 290 | if isinstance(o, np.float32): 291 | return float(o) # Convert float32 to native float 292 | if isinstance(o, np.ndarray): 293 | return o.tolist() # Convert arrays to list 294 | return json.JSONEncoder.default(self, o) 295 | 296 | 297 | METHODS = { 298 | 'detect': lambda data: {'language': classify(data)[0], 'confidence': classify(data)[1]}, 299 | 'rank': lambda data: rank(data) 300 | } 301 | 302 | 303 | def application(environ, start_response): 304 | """ 305 | WSGI-compatible langid web service. 
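
    Exposes two paths, /detect and /rank. The text to classify is read from the
    'q' parameter (GET query string or POST form data) and otherwise from the raw
    request body (PUT, or POST without a 'q' field); responses are returned as JSON.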
306 | """ 307 | from wsgiref.util import shift_path_info 308 | try: 309 | path = shift_path_info(environ) 310 | except IndexError: 311 | # Catch shift_path_info's failure to handle empty paths properly 312 | path = '' 313 | 314 | if path not in METHODS: 315 | return _return_response(start_response, 404, None, 'Not found') 316 | 317 | data = _get_data(environ) 318 | if data is None: 319 | if environ['REQUEST_METHOD'] == 'GET' and 'QUERY_STRING' not in environ: 320 | return _return_response(start_response, 400, None, 'Missing query string') 321 | return _return_response(start_response, 405, None, f"{environ['REQUEST_METHOD']} not allowed") 322 | 323 | response_data = METHODS[path](data) 324 | return _return_response(start_response, 200, response_data, None) 325 | 326 | 327 | def _get_data(environ): 328 | if environ['REQUEST_METHOD'] in ['PUT', 'POST']: 329 | data = environ['wsgi.input'].read(int(environ['CONTENT_LENGTH'])) 330 | if environ['REQUEST_METHOD'] == 'POST': 331 | try: 332 | data = parse_qs(data)['q'][0] 333 | except KeyError: 334 | pass 335 | return data 336 | if environ['REQUEST_METHOD'] == 'GET': 337 | try: 338 | return parse_qs(environ['QUERY_STRING'])['q'][0] 339 | except KeyError: 340 | pass 341 | return None 342 | 343 | 344 | STATUS_MESSAGES = { 345 | 200: "OK", 346 | 404: "Not Found", 347 | 405: "Method Not Allowed" 348 | } 349 | 350 | 351 | def _return_response(start_response, status_code, response_data, response_details): 352 | status = f"{status_code} {STATUS_MESSAGES.get(status_code, 'Unknown Status')}" 353 | response = { 354 | 'responseData': response_data, 355 | 'responseStatus': status_code, 356 | 'responseDetails': response_details, 357 | } 358 | headers = [('Content-type', 'text/javascript; charset=utf-8')] 359 | start_response(status, headers) 360 | return [json.dumps(response, cls=NumpyEncoder).encode('utf-8')] 361 | 362 | 363 | def main(): 364 | 365 | # lazy imports 366 | import argparse 367 | import sys 368 | 369 | # parse arguments 370 | parser = argparse.ArgumentParser() 371 | parser.add_argument('-s', '--serve', action='store_true', default=False, dest='serve', help='launch web service') 372 | parser.add_argument('--host', default=None, dest='host', help='host/ip to bind to') 373 | parser.add_argument('--port', default=9008, dest='port', help='port to listen on') 374 | parser.add_argument('-v', action='count', dest='verbosity', help='increase verbosity (repeat for greater effect)') 375 | parser.add_argument('-m', dest='model', help='load model from file') 376 | parser.add_argument('-l', '--langs', dest='langs', help='comma-separated set of target ISO639 language codes (e.g en,de)') 377 | parser.add_argument('-r', '--remote', action="store_true", default=False, help='auto-detect IP address for remote access') 378 | parser.add_argument('-b', '--batch', action="store_true", default=False, help='specify a list of files on the command line') 379 | parser.add_argument('-d', '--dist', action='store_true', default=False, help='show full distribution over languages') 380 | parser.add_argument('-u', '--url', help='langid of URL') 381 | parser.add_argument('--line', action="store_true", default=False, help='process pipes line-by-line rather than as a document') 382 | parser.add_argument('-n', '--normalize', action='store_true', default=False, help='normalize confidence scores to probability values') 383 | options = parser.parse_args() 384 | 385 | if options.verbosity: 386 | logging.basicConfig(level=max((5-options.verbosity)*10, 0)) 387 | else: 388 | 
logging.basicConfig() 389 | 390 | if options.batch and options.serve: 391 | parser.error("cannot specify both batch and serve at the same time") 392 | 393 | # unpack a model 394 | global IDENTIFIER 395 | 396 | if options.model: 397 | try: 398 | IDENTIFIER = LanguageIdentifier.from_modelpath(options.model, norm_probs=options.normalize) 399 | LOGGER.info("Using external model: %s", options.model) 400 | except IOError as e: 401 | LOGGER.warning("Failed to load %s: %s", options.model, e) 402 | 403 | if IDENTIFIER is None: 404 | IDENTIFIER = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=options.normalize) 405 | LOGGER.info("Using internal model") 406 | 407 | if options.langs: 408 | langs = options.langs.split(",") 409 | IDENTIFIER.set_languages(langs) 410 | 411 | def _process(text): 412 | """ 413 | Set up a local function to do output, configured according to our settings. 414 | """ 415 | return IDENTIFIER.rank(text) if options.dist else IDENTIFIER.classify(text) 416 | 417 | if options.url: 418 | from urllib.request import urlopen 419 | with urlopen(options.url) as url: 420 | text = url.read() 421 | output = _process(text) 422 | print(options.url, len(text), output) 423 | 424 | elif options.serve: 425 | import socket 426 | from wsgiref.simple_server import make_server 427 | 428 | # from http://stackoverflow.com/questions/166506/finding-local-ip-addresses-in-python 429 | if options.remote and options.host is None: 430 | # resolve the external ip address 431 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 432 | s.connect(("google.com", 80)) 433 | hostname = s.getsockname()[0] 434 | elif options.host is None: 435 | # resolve the local hostname 436 | hostname = socket.gethostbyname(socket.gethostname()) 437 | else: 438 | hostname = options.host 439 | 440 | print(f"Listening on {hostname}:%{options.port}") 441 | print("Press Ctrl+C to exit") 442 | httpd = make_server(hostname, int(options.port), application) 443 | try: 444 | httpd.serve_forever() 445 | except KeyboardInterrupt: 446 | pass 447 | 448 | elif options.batch: 449 | # Start in batch mode - interpret input as paths rather than content 450 | # to classify. 
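        # Paths are read from stdin (one per line, e.g. piped from `find`), classified
        # in parallel by a multiprocessing Pool, and the results are written to stdout
        # as CSV rows.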
451 | import csv 452 | from multiprocessing import Pool 453 | 454 | def generate_paths(): 455 | for line in sys.stdin: 456 | path = line.strip() 457 | if path and Path.is_file(path): 458 | yield path 459 | 460 | writer = csv.writer(sys.stdout) 461 | with Pool() as pool: 462 | if options.dist: 463 | writer.writerow(['path'] + IDENTIFIER.nb_classes) 464 | for path, ranking in pool.imap_unordered(rank_path, generate_paths()): 465 | ranking = dict(ranking) 466 | row = [path] + [ranking[c] for c in IDENTIFIER.nb_classes] 467 | writer.writerow(row) 468 | else: 469 | for path, (lang, conf) in pool.imap_unordered(cl_path, generate_paths()): 470 | writer.writerow((path, lang, conf)) 471 | else: 472 | if sys.stdin.isatty(): 473 | # Interactive mode 474 | while True: 475 | try: 476 | print(">>>", end=' ') 477 | text = input() 478 | except Exception as e: 479 | print(e) 480 | break 481 | print(_process(text)) 482 | else: 483 | # Redirected 484 | if options.line: 485 | for line in sys.stdin: 486 | print(_process(line)) 487 | else: 488 | print(_process(sys.stdin.read())) 489 | 490 | 491 | if __name__ == "__main__": 492 | main() 493 | -------------------------------------------------------------------------------- /py3langid/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbar/py3langid/812f2055f74c35dea298f30b434644062d9289be/py3langid/tools/__init__.py -------------------------------------------------------------------------------- /py3langid/tools/featWeights.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tabulate feature weight data into a single CSV for 3 | further analysis using other tools. This produces 4 | a CSV with header. The features themselves are not 5 | included. 
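
Typical invocation, following the argument parser defined below:

    python featWeights.py MODEL_DIR output.csv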
6 | 7 | Marco Lui, February 2013 8 | """ 9 | 10 | import argparse, os, csv, sys 11 | import numpy as np 12 | import bz2, base64 13 | from cPickle import loads 14 | 15 | from langid.train.common import read_weights, read_features 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('model', metavar="MODEL_DIR", help="path to langid.py training model dir") 20 | parser.add_argument('output', metavar="OUTPUT", help = "write to OUTPUT") 21 | parser.add_argument('-f','--features', metavar="FILE", help = 'only output features from FILE') 22 | parser.add_argument('--raw', action='store_true', help="include raw features") 23 | parser.add_argument('--bin', action='store_true', help="include ig for lang-bin") 24 | args = parser.parse_args() 25 | 26 | def model_file(name): 27 | return os.path.join(args.model, name) 28 | 29 | # Try to determine the set of features to consider 30 | if args.features: 31 | # Use a pre-determined feature list 32 | print >>sys.stderr, "using user-supplied feature list:", args.features 33 | feats = read_features(args.features) 34 | elif os.path.exists(model_file('LDfeats')): 35 | # Use LDfeats 36 | print >>sys.stderr, "using LDfeats" 37 | feats = read_features(model_file('LDfeats')) 38 | else: 39 | raise ValueError("no suitable feature list") 40 | 41 | print >>sys.stderr, "considering {0} features".format(len(feats)) 42 | 43 | records = dict( (k, {}) for k in feats ) 44 | headers = [] 45 | 46 | headers.append('len') 47 | for k in feats: 48 | records[k]['len'] = len(k) 49 | 50 | 51 | # Document Frequency 52 | if os.path.exists(model_file('DF_all')): 53 | print >>sys.stderr, "found weights for document frequency" 54 | w = read_weights(model_file('DF_all')) 55 | headers.append('DF') 56 | for k in feats: 57 | records[k]['DF'] = w[k][0] 58 | 59 | # IG weights for the all-languages event 60 | if os.path.exists(model_file('IGweights.lang')): 61 | print >>sys.stderr, "found weights for lang" 62 | w = read_weights(model_file('IGweights.lang')) 63 | headers.append('IGlang') 64 | for k in feats: 65 | records[k]['IGlang'] = w[k][0] 66 | 67 | # IG weights for the all-domains event 68 | if os.path.exists(model_file('IGweights.domain')): 69 | print >>sys.stderr, "found weights for domain" 70 | w = read_weights(model_file('IGweights.domain')) 71 | headers.append('IGdomain') 72 | for k in feats: 73 | records[k]['IGdomain'] = w[k][0] 74 | 75 | # IG weights for language-binarized 76 | if args.bin and os.path.exists(model_file('IGweights.lang.bin')) and os.path.exists(model_file('lang_index')): 77 | print >>sys.stderr, "found weights for lang.bin" 78 | w = read_weights(model_file('IGweights.lang.bin')) 79 | 80 | # find the list of langs in-order 81 | with open(os.path.join(args.model, "lang_index")) as f: 82 | reader = csv.reader(f) 83 | langs = zip(*reader)[0] 84 | 85 | r_h = ['IGlang.bin.{0}'.format(l) for l in langs] 86 | headers.extend( r_h ) 87 | for k in feats: 88 | records[k].update( dict(zip(r_h, w[k])) ) 89 | 90 | if os.path.exists(model_file('LDfeats.scanner')) and os.path.exists(model_file('model')): 91 | print >>sys.stderr, "found weights for P(t|c)" 92 | with open(model_file('model')) as f: 93 | model = loads(bz2.decompress(base64.b64decode(f.read()))) 94 | with open(model_file('LDfeats.scanner')) as f: 95 | _, _, nb_feats = loads(f.read()) 96 | nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = model 97 | nb_numfeats = len(nb_ptc) / len(nb_pc) 98 | nb_ptc = np.array(nb_ptc).reshape(len(nb_ptc)/len(nb_pc), len(nb_pc)) 99 | 100 | # 
Normalize to 1 on the term axis 101 | for i in range(nb_ptc.shape[1]): 102 | nb_ptc[:,i] = (1/np.exp(nb_ptc[:,i][None,:] - nb_ptc[:,i][:,None]).sum(1)) 103 | w = dict(zip(nb_feats, nb_ptc)) 104 | 105 | r_h = ['ptc.{0}'.format(l) for l in nb_classes] 106 | headers.extend( r_h ) 107 | for k in feats: 108 | records[k].update( dict(zip(r_h, w[k])) ) 109 | 110 | if args.raw: 111 | headers.append('feat') 112 | for k in feats: 113 | records[k]['feat'] = k 114 | 115 | 116 | 117 | print >>sys.stderr, "writing output" 118 | with open(args.output, 'w') as f: 119 | writer = csv.DictWriter(f,headers) 120 | writer.writeheader() 121 | writer.writerows(records.values()) 122 | 123 | print >>sys.stderr, "done" 124 | -------------------------------------------------------------------------------- /py3langid/tools/printfeats.py: -------------------------------------------------------------------------------- 1 | """ 2 | Print features out in order of their weights 3 | 4 | Marco Lui, November 2013 5 | """ 6 | 7 | import argparse, os, csv, sys 8 | 9 | from langid.train.common import read_weights 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('file', help="file to read") 14 | parser.add_argument('-c','--column',help="project a specific column", type=int) 15 | parser.add_argument('-n','--number',help="output top N features", type=int) 16 | parser.add_argument('-v','--value',help="output the value used for ranking", action="store_true") 17 | parser.add_argument('-p','--printfeat',help="print the actual feature (default is to print repr)", action="store_true") 18 | parser.add_argument('--output', "-o", default=sys.stdout, type=argparse.FileType('w'), help = "write to OUTPUT") 19 | args = parser.parse_args() 20 | 21 | w = read_weights(args.file) 22 | n = args.number if args.number is not None else len(w) 23 | 24 | def show(feat): 25 | if args.printfeat: 26 | return feat 27 | else: 28 | return repr(feat) 29 | 30 | if args.column is not None: 31 | for key in sorted(w, key=lambda x:w[x][args.column], reverse=True)[:n]: 32 | if args.value: 33 | args.output.write("{0},{1}\n".format(show(key),w[key][args.column])) 34 | else: 35 | args.output.write("{0}\n".format(show(key))) 36 | else: 37 | for key in sorted(w, key=w.get, reverse=True)[:n]: 38 | if args.value: 39 | args.output.write("{0},{1}\n".format(show(key),w[key])) 40 | else: 41 | args.output.write("{0}\n".format(show(key))) 42 | -------------------------------------------------------------------------------- /py3langid/train/BLweight.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementing the "blacklist" feature weighting metric proposed by 3 | Tiedemann & Ljubesic. 
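The column-wise loop in featWeights.py above (and the equivalent loop in BLweight.py below) turns log counts into probabilities using the identity p_i = 1 / sum_j exp(x_j - x_i), which avoids exponentiating large log values directly. A small self-contained check of that identity, with toy numbers:

import numpy as np

x = np.log(np.array([3.0, 5.0, 2.0]))                   # a column of log counts
naive = np.exp(x) / np.exp(x).sum()                     # direct normalisation
stable = 1.0 / np.exp(x[None, :] - x[:, None]).sum(1)   # form used in the loop above
assert np.allclose(naive, stable)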
4 | 5 | Marco Lui, February 2013 6 | """ 7 | 8 | NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation 9 | CHUNKSIZE = 50 # maximum size of chunk (number of files tokenized - less = less memory use) 10 | 11 | import argparse 12 | import os 13 | 14 | import numpy as np 15 | 16 | from .common import read_features, makedir, write_weights 17 | from .scanner import build_scanner 18 | from .index import CorpusIndexer 19 | from .NBtrain import generate_cm, learn_ptc 20 | 21 | 22 | if __name__ == "__main__": 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("-o","--output", metavar="DIR", help = "write weights to DIR") 26 | parser.add_argument('-f','--features', metavar="FILE", help = 'only output features from FILE') 27 | parser.add_argument("-t", "--temp", metavar='TEMP_DIR', help="store buckets in TEMP_DIR instead of in MODEL_DIR/buckets") 28 | parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no paralleization)") 29 | parser.add_argument("-m","--model", help="save output to MODEL_DIR", metavar="MODEL_DIR") 30 | parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS) 31 | parser.add_argument("--chunksize", type=int, help="max chunk size (number of files to tokenize at a time - smaller should reduce memory use)", default=CHUNKSIZE) 32 | parser.add_argument("--no_norm", default=False, action="store_true", help="do not normalize difference in p(t|C) by sum p(t|C)") 33 | parser.add_argument("corpus", help="read corpus from CORPUS_DIR", metavar="CORPUS_DIR") 34 | parser.add_argument("pairs", metavar='LANG_PAIR', nargs="*", help="language pairs to compute BL weights for") 35 | args = parser.parse_args() 36 | 37 | # Work out where our model directory is 38 | corpus_name = os.path.basename(args.corpus) 39 | if args.model: 40 | model_dir = args.model 41 | else: 42 | model_dir = os.path.join('.', corpus_name+'.model') 43 | 44 | def m_path(name): 45 | return os.path.join(model_dir, name) 46 | 47 | # Try to determine the set of features to consider 48 | if args.features: 49 | # Use a pre-determined feature list 50 | feat_path = args.features 51 | elif os.path.exists(m_path('DFfeats')): 52 | # Use LDfeats 53 | feat_path = m_path('DFfeats') 54 | else: 55 | raise ValueError("no suitable feature list") 56 | 57 | # Where temp files go 58 | if args.temp: 59 | buckets_dir = args.temp 60 | else: 61 | buckets_dir = m_path('buckets') 62 | makedir(buckets_dir) 63 | 64 | all_langs = set() 65 | pairs = [] 66 | for p in args.pairs: 67 | try: 68 | lang1, lang2 = p.split(',') 69 | except ValueError: 70 | # Did not unpack to two values 71 | parser.error("{0} is not a lang-pair".format(p)) 72 | all_langs.add(lang1) 73 | all_langs.add(lang2) 74 | pairs.append((lang1, lang2)) 75 | 76 | if args.output: 77 | makedir(args.output) 78 | out_dir = args.output 79 | else: 80 | out_dir = model_dir 81 | 82 | langs = sorted(all_langs) 83 | 84 | # display paths 85 | print("languages({1}): {0}".format(langs, len(langs))) 86 | print("model path:", model_dir) 87 | print("feature path:", feat_path) 88 | print("output path:", out_dir) 89 | print("temp (buckets) path:", buckets_dir) 90 | 91 | feats = read_features(feat_path) 92 | 93 | indexer = CorpusIndexer(args.corpus, langs = langs) 94 | items = [ (d,l,p) for (d,l,n,p) in indexer.items ] 95 | if len(items) == 0: 96 | raise ValueError("found no files!") 97 | 98 | print("will process {0} features across {1} paths".format(len(feats), len(items))) 99 | 
print("will process {0} features across {1} paths".format(len(feats), len(items))) 100 | 101 | # produce a scanner over all the features 102 | tk_nextmove, tk_output = build_scanner(feats) 103 | 104 | # Generate a class map over all the languages we are dealing with 105 | cm = generate_cm([ (l,p) for d,l,p in items], len(langs)) 106 | 107 | # Compute P(t|C) 108 | print("learning P(t|C)") 109 | paths = zip(*items)[2] 110 | nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, buckets_dir, args) 111 | nb_ptc = np.array(nb_ptc).reshape(len(feats), len(langs)) 112 | 113 | # Normalize to 1 on the term axis 114 | print("renormalizing P(t|C)") 115 | for i in range(nb_ptc.shape[1]): 116 | # had to de-vectorize this due to memory consumption 117 | newval = np.empty_like(nb_ptc[:,i]) 118 | for j in range(newval.shape[0]): 119 | newval[j] = (1/np.exp(nb_ptc[:,i] - nb_ptc[j,i]).sum()) 120 | nb_ptc[:,i] = newval 121 | assert (1.0 - newval.sum()) < 0.0001 122 | 123 | print("doing per-pair output") 124 | for lang1, lang2 in pairs: 125 | # Where to do output 126 | if args.no_norm: 127 | weights_path = os.path.join(out_dir, ('BLfeats.no_norm.{0}.{1}'.format(lang1, lang2))) 128 | else: 129 | weights_path = os.path.join(out_dir, ('BLfeats.{0}.{1}'.format(lang1, lang2))) 130 | 131 | i1 = indexer.lang_index[lang1] 132 | i2 = indexer.lang_index[lang2] 133 | 134 | w = dict(zip(feats, np.abs((nb_ptc[:,i1] - nb_ptc[:,i2]) / (nb_ptc.sum(1) if not args.no_norm else 1)))) 135 | write_weights(w, weights_path) 136 | print("wrote weights to {0}".format(weights_path)) 137 | -------------------------------------------------------------------------------- /py3langid/train/DFfeatureselect.py: -------------------------------------------------------------------------------- 1 | """ 2 | DFfeatureselect.py - 3 | First step in the LD feature selection process, select features based on document 4 | frequency. 5 | 6 | Marco Lui January 2013 7 | 8 | Copyright 2013 Marco Lui . All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without modification, are 11 | permitted provided that the following conditions are met: 12 | 13 | 1. Redistributions of source code must retain the above copyright notice, this list of 14 | conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 17 | of conditions and the following disclaimer in the documentation and/or other materials 18 | provided with the distribution. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 21 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 22 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 28 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | The views and conclusions contained in the software and documentation are those of the 31 | authors and should not be interpreted as representing official policies, either expressed 32 | or implied, of the copyright holder. 
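For each requested language pair, BLweight.py above reduces the P(t|C) matrix to a single "blacklist" weight per feature: the absolute difference between the two language columns, by default divided by the feature's total mass across all languages. A toy version with invented numbers:

import numpy as np

# rows = features, columns = languages; each column sums to 1
nb_ptc = np.array([[0.7, 0.1, 0.3],
                   [0.2, 0.6, 0.3],
                   [0.1, 0.3, 0.4]])
i1, i2 = 0, 1                                  # the language pair
raw = np.abs(nb_ptc[:, i1] - nb_ptc[:, i2])    # as with --no_norm
normed = raw / nb_ptc.sum(1)                   # default behaviour
print(raw, normed)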
33 | """ 34 | 35 | ###### 36 | # Default values 37 | # Can be overriden with command-line options 38 | ###### 39 | MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider 40 | TOKENS_PER_ORDER = 15000 # number of tokens to consider for each order 41 | 42 | import argparse 43 | import os 44 | import marshal 45 | 46 | from collections import defaultdict 47 | 48 | from .common import unmarshal_iter, MapPool, write_features, write_weights 49 | 50 | 51 | def pass_sum_df(bucket): 52 | """ 53 | Compute document frequency (df) by summing up (key,domain,count) triplets 54 | over all domains. 55 | """ 56 | doc_count = defaultdict(int) 57 | count = 0 58 | with open(os.path.join(bucket, "docfreq"),'wb') as docfreq: 59 | for path in os.listdir(bucket): 60 | # We use the domain buckets as there are usually less domains 61 | if path.endswith('.domain'): 62 | for key, _, value in unmarshal_iter(os.path.join(bucket,path)): 63 | doc_count[key] += value 64 | count += 1 65 | 66 | for item in doc_count.iteritems(): 67 | docfreq.write(marshal.dumps(item)) 68 | return count 69 | 70 | def tally(bucketlist, jobs=None): 71 | """ 72 | Sum up the counts for each feature across all buckets. This 73 | builds a full mapping of feature->count. This is stored in-memory 74 | and thus could be an issue for large feature sets. 75 | """ 76 | 77 | with MapPool(jobs) as f: 78 | pass_sum_df_out = f(pass_sum_df, bucketlist) 79 | 80 | for i, keycount in enumerate(pass_sum_df_out): 81 | print("processed bucket (%d/%d) [%d keys]" % (i+1, len(bucketlist), keycount)) 82 | 83 | # build the global term->df mapping 84 | doc_count = {} 85 | for bucket in bucketlist: 86 | for key, value in unmarshal_iter(os.path.join(bucket, 'docfreq')): 87 | doc_count[key] = value 88 | 89 | return doc_count 90 | 91 | 92 | 93 | def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_PER_ORDER): 94 | """ 95 | DF feature selection for byte-ngram tokenization 96 | """ 97 | # Work out the set of features to compute IG 98 | features = set() 99 | for i in range(1, max_order+1): 100 | d = dict( (k, doc_count[k]) for k in doc_count if len(k) == i) 101 | features |= set(sorted(d, key=d.get, reverse=True)[:tokens_per_order]) 102 | features = sorted(features) 103 | 104 | return features 105 | 106 | 107 | 108 | if __name__ == "__main__": 109 | parser = argparse.ArgumentParser() 110 | parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no paralleization)") 111 | parser.add_argument("-f","--features", metavar='FEATURE_FILE', help="output features to FEATURE_FILE") 112 | parser.add_argument("--tokens_per_order", metavar='N', type=int, help="consider top N tokens per ngram order") 113 | parser.add_argument("--tokens", metavar='N', type=int, help="consider top N tokens") 114 | parser.add_argument("--max_order", type=int, help="highest n-gram order to use", default=MAX_NGRAM_ORDER) 115 | parser.add_argument("--doc_count", nargs='?', const=True, metavar='DOC_COUNT_PATH', help="output full mapping of feature->frequency to DOC_COUNT_PATH") 116 | parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR") 117 | 118 | args = parser.parse_args() 119 | 120 | if args.tokens and args.tokens_per_order: 121 | parser.error("--tokens and --tokens_per_order are mutually exclusive") 122 | 123 | # if neither --tokens nor --tokens_per_order is given, default behaviour is tokens_per_order 124 | if not(args.tokens) and not(args.tokens_per_order): 125 | args.tokens_per_order = 
TOKENS_PER_ORDER 126 | 127 | if args.features: 128 | feature_path = args.features 129 | else: 130 | feature_path = os.path.join(args.model, 'DFfeats') 131 | 132 | bucketlist_path = os.path.join(args.model, 'bucketlist') 133 | 134 | # display paths 135 | print("buckets path:", bucketlist_path) 136 | print("features output path:", feature_path) 137 | if args.tokens_per_order: 138 | print("max ngram order:", args.max_order) 139 | print("tokens per order:", args.tokens_per_order) 140 | else: 141 | print("tokens:", args.tokens) 142 | 143 | with open(bucketlist_path) as f: 144 | bucketlist = map(str.strip, f) 145 | 146 | doc_count = tally(bucketlist, args.jobs) 147 | print("unique features:", len(doc_count)) 148 | if args.doc_count: 149 | # The constant true is used to indicate output to default location 150 | doc_count_path = os.path.join(args.model, 'DF_all') if args.doc_count == True else args.doc_count 151 | write_weights(doc_count, doc_count_path) 152 | print("wrote DF counts for all features to:", doc_count_path) 153 | 154 | if args.tokens_per_order: 155 | # Choose a number of features for each length of token 156 | feats = ngram_select(doc_count, args.max_order, args.tokens_per_order) 157 | else: 158 | # Choose a number of features overall 159 | feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] ) 160 | 161 | print("selected features: ", len(feats)) 162 | 163 | write_features(feats, feature_path) 164 | print('wrote features to "%s"' % feature_path) 165 | -------------------------------------------------------------------------------- /py3langid/train/IGweight.py: -------------------------------------------------------------------------------- 1 | """ 2 | IGWeight.py - 3 | Compute IG Weights given a set of tokenized buckets and a feature set 4 | 5 | Marco Lui, January 2013 6 | 7 | Based on research by Marco Lui and Tim Baldwin. 8 | 9 | Copyright 2013 Marco Lui . All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without modification, are 12 | permitted provided that the following conditions are met: 13 | 14 | 1. Redistributions of source code must retain the above copyright notice, this list of 15 | conditions and the following disclaimer. 16 | 17 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 18 | of conditions and the following disclaimer in the documentation and/or other materials 19 | provided with the distribution. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 22 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 23 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 24 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 27 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 29 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | The views and conclusions contained in the software and documentation are those of the 32 | authors and should not be interpreted as representing official policies, either expressed 33 | or implied, of the copyright holder. 
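ngram_select() in DFfeatureselect.py above keeps the top-N most document-frequent features separately for each n-gram length, so longer byte n-grams are not crowded out by shorter, inherently more frequent ones. A toy run of the same selection rule, with invented counts:

doc_count = {b'a': 50, b'b': 40, b'ab': 30, b'ba': 10, b'abc': 5}
max_order, tokens_per_order = 3, 1

features = set()
for i in range(1, max_order + 1):
    by_len = {k: v for k, v in doc_count.items() if len(k) == i}
    features |= set(sorted(by_len, key=by_len.get, reverse=True)[:tokens_per_order])

print(sorted(features))   # [b'a', b'ab', b'abc'] -- one feature kept per order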
34 | """ 35 | 36 | import argparse 37 | import csv 38 | import os 39 | 40 | from collections import defaultdict 41 | 42 | import numpy 43 | 44 | from .common import unmarshal_iter, MapPool, Enumerator, write_weights, read_features 45 | 46 | 47 | def entropy(v, axis=0): 48 | """ 49 | Optimized implementation of entropy. This version is faster than that in 50 | scipy.stats.distributions, particularly over long vectors. 51 | """ 52 | v = numpy.array(v, dtype='float') 53 | s = numpy.sum(v, axis=axis) 54 | with numpy.errstate(divide='ignore', invalid='ignore'): 55 | rhs = numpy.nansum(v * numpy.log(v), axis=axis) / s 56 | r = numpy.log(s) - rhs 57 | # Where dealing with binarized events, it is possible that an event always 58 | # occurs and thus has 0 information. In this case, the negative class 59 | # will have frequency 0, resulting in log(0) being computed as nan. 60 | # We replace these nans with 0 61 | nan_index = numpy.isnan(rhs) 62 | if nan_index.any(): 63 | r[nan_index] = 0 64 | return r 65 | 66 | def setup_pass_IG(features, dist, binarize, suffix): 67 | """ 68 | @param features the list of features to compute IG for 69 | @param dist the background distribution 70 | @param binarize (boolean) compute IG binarized per-class if True 71 | @param suffix of files in bucketdir to process 72 | """ 73 | global __features, __dist, __binarize, __suffix 74 | __features = features 75 | __dist = dist 76 | __binarize = binarize 77 | __suffix = suffix 78 | 79 | def pass_IG(bucket): 80 | """ 81 | In this pass we compute the information gain for each feature, binarized 82 | with respect to each language as well as unified over the set of all 83 | classes. 84 | 85 | @global __features the list of features to compute IG for 86 | @global __dist the background distribution 87 | @global __binarize (boolean) compute IG binarized per-class if True 88 | @global __suffix of files in bucketdir to process 89 | @param bucket the bucket file to process. It is assumed to contain marshalled (term, event_id, count) triplets. 90 | """ 91 | global __features, __dist, __binarize, __suffix 92 | 93 | # We first tally the per-event frequency of each 94 | # term in our selected feature set. 
95 | term_freq = defaultdict(lambda: defaultdict(int)) 96 | term_index = defaultdict(Enumerator()) 97 | 98 | for path in os.listdir(bucket): 99 | if path.endswith(__suffix): 100 | for key, event_id, count in unmarshal_iter(os.path.join(bucket,path)): 101 | # Select only our listed features 102 | if key in __features: 103 | term_index[key] 104 | term_freq[key][event_id] += count 105 | 106 | num_term = len(term_index) 107 | num_event = len(__dist) 108 | 109 | cm_pos = numpy.zeros((num_term, num_event), dtype='int') 110 | 111 | for term,term_id in term_index.iteritems(): 112 | # update event matrix 113 | freq = term_freq[term] 114 | for event_id, count in freq.iteritems(): 115 | cm_pos[term_id, event_id] = count 116 | cm_neg = __dist - cm_pos 117 | cm = numpy.dstack((cm_neg, cm_pos)) 118 | 119 | if not __binarize: 120 | # non-binarized event space 121 | x = cm.sum(axis=1) 122 | term_w = x / x.sum(axis=1)[:, None].astype(float) 123 | 124 | # Entropy of the term-present/term-absent events 125 | e = entropy(cm, axis=1) 126 | 127 | # Information Gain with respect to the set of events 128 | ig = entropy(__dist) - (term_w * e).sum(axis=1) 129 | 130 | else: 131 | # binarized event space 132 | # Compute IG binarized with respect to each event 133 | ig = list() 134 | for event_id in range(num_event): 135 | num_doc = __dist.sum() 136 | prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]), dtype=float) / num_doc 137 | 138 | cm_bin = numpy.zeros((num_term, 2, 2), dtype=int) # (term, p(term), p(lang|term)) 139 | cm_bin[:,0,:] = cm.sum(axis=1) - cm[:,event_id,:] 140 | cm_bin[:,1,:] = cm[:,event_id,:] 141 | 142 | e = entropy(cm_bin, axis=1) 143 | x = cm_bin.sum(axis=1) 144 | term_w = x / x.sum(axis=1)[:, None].astype(float) 145 | 146 | ig.append( entropy(prior) - (term_w * e).sum(axis=1) ) 147 | ig = numpy.vstack(ig) 148 | 149 | terms = sorted(term_index, key=term_index.get) 150 | return terms, ig 151 | 152 | 153 | def compute_IG(bucketlist, features, dist, binarize, suffix, job_count=None): 154 | pass_IG_args = (features, dist, binarize, suffix) 155 | 156 | num_chunk = len(bucketlist) 157 | weights = [] 158 | terms = [] 159 | 160 | with MapPool(job_count, setup_pass_IG, pass_IG_args) as f: 161 | pass_IG_out = f(pass_IG, bucketlist) 162 | 163 | for i, (t, w) in enumerate(pass_IG_out): 164 | weights.append(w) 165 | terms.extend(t) 166 | print("processed chunk (%d/%d) [%d terms]" % (i+1, num_chunk, len(t))) 167 | 168 | if binarize: 169 | weights = numpy.hstack(weights).transpose() 170 | else: 171 | weights = numpy.concatenate(weights) 172 | terms = ["".join(t) for t in terms] 173 | 174 | return zip(terms, weights) 175 | 176 | def read_dist(path): 177 | """ 178 | Read the distribution from a file containing item, count pairs. 
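The quantity computed in pass_IG() above is ordinary information gain: the entropy of the background class distribution minus the expected entropy after splitting documents on feature presence. A worked single-feature example using the same entropy() formulation (counts invented):

import numpy as np

def entropy(v, axis=0):
    # same formulation as above: entropy (in nats) of a vector of counts
    v = np.asarray(v, dtype=float)
    s = v.sum(axis=axis)
    with np.errstate(divide='ignore', invalid='ignore'):
        rhs = np.nansum(v * np.log(v), axis=axis) / s
    return np.log(s) - rhs

dist = np.array([40, 60])                  # documents per language (background)
present = np.array([30, 10])               # documents per language containing the feature
cm = np.stack([dist - present, present])   # rows: feature absent / feature present
weights = cm.sum(axis=1) / cm.sum()        # P(absent), P(present)
ig = entropy(dist) - (weights * entropy(cm, axis=1)).sum()
print(round(float(ig), 3))                 # roughly 0.18 nats for these counts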
179 | @param path path to read form 180 | """ 181 | with open(path) as f: 182 | reader = csv.reader(f) 183 | return numpy.array(zip(*reader)[1], dtype=int) 184 | 185 | if __name__ == "__main__": 186 | parser = argparse.ArgumentParser() 187 | parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no paralleization)") 188 | parser.add_argument("-f","--features", metavar='FEATURE_FILE', help="read features from FEATURE_FILE") 189 | parser.add_argument("-w","--weights", metavar='WEIGHTS', help="output weights to WEIGHTS") 190 | parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR") 191 | parser.add_argument("-d","--domain", action="store_true", default=False, help="compute IG with respect to domain") 192 | parser.add_argument("-b","--binarize", action="store_true", default=False, help="binarize the event space in the IG computation") 193 | parser.add_argument("-l","--lang", action="store_true", default=False, help="compute IG with respect to language") 194 | 195 | args = parser.parse_args() 196 | if not(args.domain or args.lang) or (args.domain and args.lang): 197 | parser.error("exactly one of domain(-d) or language (-l) must be specified") 198 | 199 | if args.features: 200 | feature_path = args.features 201 | else: 202 | feature_path = os.path.join(args.model, 'DFfeats') 203 | 204 | bucketlist_path = os.path.join(args.model, 'bucketlist') 205 | 206 | if not os.path.exists(feature_path): 207 | parser.error('{0} does not exist'.format(feature_path)) 208 | 209 | bucketlist = map(str.strip, open(bucketlist_path)) 210 | features = read_features(feature_path) 211 | 212 | if args.domain: 213 | index_path = os.path.join(args.model,'domain_index') 214 | suffix = '.domain' 215 | elif args.lang: 216 | index_path = os.path.join(args.model,'lang_index') 217 | suffix = '.lang' 218 | else: 219 | raise ValueError("no event specified") 220 | 221 | if args.weights: 222 | weights_path = args.weights 223 | else: 224 | weights_path = os.path.join(args.model, 'IGweights' + suffix + ('.bin' if args.binarize else '')) 225 | 226 | # display paths 227 | print("model path:", args.model ) 228 | print("buckets path:", bucketlist_path) 229 | print("features path:", feature_path) 230 | print("weights path:", weights_path) 231 | print("index path:", index_path) 232 | print("suffix:", suffix) 233 | 234 | print("computing information gain") 235 | 236 | dist = read_dist(index_path) 237 | ig = compute_IG(bucketlist, features, dist, args.binarize, suffix, args.jobs) 238 | 239 | write_weights(ig, weights_path) 240 | -------------------------------------------------------------------------------- /py3langid/train/LDfeatureselect.py: -------------------------------------------------------------------------------- 1 | """ 2 | LDfeatureselect.py - 3 | LD (Lang-Domain) feature extractor 4 | Marco Lui November 2011 5 | 6 | Based on research by Marco Lui and Tim Baldwin. 7 | 8 | Copyright 2011 Marco Lui . All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without modification, are 11 | permitted provided that the following conditions are met: 12 | 13 | 1. Redistributions of source code must retain the above copyright notice, this list of 14 | conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 17 | of conditions and the following disclaimer in the documentation and/or other materials 18 | provided with the distribution. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 21 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 22 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 28 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | The views and conclusions contained in the software and documentation are those of the 31 | authors and should not be interpreted as representing official policies, either expressed 32 | or implied, of the copyright holder. 33 | """ 34 | 35 | ###### 36 | # Default values 37 | # Can be overriden with command-line options 38 | ###### 39 | FEATURES_PER_LANG = 300 # number of features to select for each language 40 | 41 | import argparse 42 | import csv 43 | import os 44 | 45 | from collections import defaultdict 46 | 47 | import numpy 48 | 49 | from common import read_weights, Enumerator, write_features 50 | 51 | def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False): 52 | """ 53 | @param ignore_domain boolean to indicate whether to use domain weights 54 | """ 55 | assert (ig_domain is None) or (len(ig_lang) == len(ig_domain)) 56 | num_lang = len(ig_lang.values()[0]) 57 | num_term = len(ig_lang) 58 | 59 | term_index = defaultdict(Enumerator()) 60 | 61 | 62 | ld = numpy.empty((num_lang, num_term), dtype=float) 63 | 64 | for term in ig_lang: 65 | term_id = term_index[term] 66 | if ignore_domain: 67 | ld[:, term_id] = ig_lang[term] 68 | else: 69 | ld[:, term_id] = ig_lang[term] - ig_domain[term] 70 | 71 | terms = sorted(term_index, key=term_index.get) 72 | # compile the final feature set 73 | selected_features = {} 74 | for lang_id, lang_w in enumerate(ld): 75 | term_inds = numpy.argsort(lang_w)[-feats_per_lang:] 76 | selected_features[lang_id] = [terms[t] for t in term_inds] 77 | 78 | return selected_features 79 | 80 | if __name__ == "__main__": 81 | parser = argparse.ArgumentParser() 82 | parser.add_argument("-o","--output", metavar="OUTPUT_PATH", help = "write selected features to OUTPUT_PATH") 83 | parser.add_argument("--feats_per_lang", type=int, metavar='N', help="select top N features for each language", default=FEATURES_PER_LANG) 84 | parser.add_argument("--per_lang", action="store_true", default=False, help="produce a list of features selecter per-language") 85 | parser.add_argument("--no_domain_ig", action="store_true", default=False, help="use only per-langugage IG in LD calculation") 86 | parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR") 87 | args = parser.parse_args() 88 | 89 | lang_w_path = os.path.join(args.model, 'IGweights.lang.bin') 90 | domain_w_path = os.path.join(args.model, 'IGweights.domain') 91 | feature_path = args.output if args.output else os.path.join(args.model, 'LDfeats') 92 | 93 | # display paths 94 | print("model path:", args.model) 95 | print("lang weights path:", lang_w_path) 96 | print("domain weights path:", domain_w_path) 97 | print("feature output path:", feature_path) 98 | 99 | lang_w = 
read_weights(lang_w_path) 100 | domain_w = read_weights(domain_w_path) if not args.no_domain_ig else None 101 | 102 | features_per_lang = select_LD_features(lang_w, domain_w, args.feats_per_lang, ignore_domain=args.no_domain_ig) 103 | if args.per_lang: 104 | with open(feature_path + '.perlang', 'w') as f: 105 | writer = csv.writer(f) 106 | for i in range(len(features_per_lang)): 107 | writer.writerow(map(repr,features_per_lang[i])) 108 | 109 | 110 | final_feature_set = reduce(set.union, map(set, features_per_lang.values())) 111 | print('selected %d features' % len(final_feature_set)) 112 | 113 | write_features(sorted(final_feature_set), feature_path) 114 | print('wrote features to "%s"' % feature_path) 115 | -------------------------------------------------------------------------------- /py3langid/train/NBtrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | NBtrain.py - 3 | Model generator for langid.py 4 | 5 | Marco Lui, January 2013 6 | 7 | Based on research by Marco Lui and Tim Baldwin. 8 | 9 | Copyright 2013 Marco Lui . All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without modification, are 12 | permitted provided that the following conditions are met: 13 | 14 | 1. Redistributions of source code must retain the above copyright notice, this list of 15 | conditions and the following disclaimer. 16 | 17 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 18 | of conditions and the following disclaimer in the documentation and/or other materials 19 | provided with the distribution. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 22 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 23 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 24 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 27 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 29 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | The views and conclusions contained in the software and documentation are those of the 32 | authors and should not be interpreted as representing official policies, either expressed 33 | or implied, of the copyright holder. 
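select_LD_features() in LDfeatureselect.py above scores each term per language as IG(lang) - IG(domain) and keeps the top N per language, so terms that mainly signal domain rather than language drop out. A toy version of that selection, with invented weights:

ig_lang = {'aa': (0.9, 0.1),   # per-term IG for two languages
           'bb': (0.2, 0.8),
           'cc': (0.5, 0.5),
           'dd': (0.1, 0.1)}
ig_domain = {'aa': 0.1, 'bb': 0.1, 'cc': 0.6, 'dd': 0.0}

selected = set()
for lang_id in range(2):
    ld = {t: ig_lang[t][lang_id] - ig_domain[t] for t in ig_lang}
    selected |= set(sorted(ld, key=ld.get, reverse=True)[:1])   # feats_per_lang = 1

print(sorted(selected))   # ['aa', 'bb'] -- 'cc' is mostly a domain signal and is dropped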
34 | """ 35 | MAX_CHUNK_SIZE = 100 # maximum number of files to tokenize at once 36 | NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation 37 | 38 | import array 39 | import argparse 40 | import atexit 41 | import base64 42 | import bz2 43 | import csv 44 | import marshal 45 | import multiprocessing as mp 46 | import os 47 | import pickle 48 | import shutil 49 | import tempfile 50 | 51 | from collections import defaultdict 52 | 53 | import numpy as np 54 | 55 | from .common import chunk, unmarshal_iter, MapPool 56 | 57 | 58 | def offsets(chunks): 59 | # Work out the path chunk start offsets 60 | chunk_offsets = [0] 61 | for c in chunks: 62 | chunk_offsets.append(chunk_offsets[-1] + len(c)) 63 | return chunk_offsets 64 | 65 | def state_trace(path): 66 | """ 67 | Returns counts of how often each state was entered 68 | """ 69 | global __nm_arr 70 | c = defaultdict(int) 71 | state = 0 72 | 73 | with open(path) as f: 74 | text = f.read() 75 | for letter in map(ord,text): 76 | state = __nm_arr[(state << 8) + letter] 77 | c[state] += 1 78 | return c 79 | 80 | def setup_pass_tokenize(nm_arr, output_states, tk_output, b_dirs): 81 | """ 82 | Set the global next-move array used by the aho-corasick scanner 83 | """ 84 | global __nm_arr, __output_states, __tk_output, __b_dirs 85 | __nm_arr = nm_arr 86 | __output_states = output_states 87 | __tk_output = tk_output 88 | __b_dirs = b_dirs 89 | 90 | def pass_tokenize(arg): 91 | """ 92 | Tokenize documents and do counts for each feature 93 | Split this into buckets chunked over features rather than documents 94 | """ 95 | global __output_states, __tk_output, __b_dirs 96 | chunk_offset, chunk_paths = arg 97 | term_freq = defaultdict(int) 98 | __procname = mp.current_process().name 99 | __buckets = [tempfile.mkstemp(prefix=__procname, suffix='.index', dir=p)[0] for p in __b_dirs] 100 | 101 | # Tokenize each document and add to a count of (doc_id, f_id) frequencies 102 | for doc_count, path in enumerate(chunk_paths): 103 | doc_id = doc_count + chunk_offset 104 | count = state_trace(path) 105 | for state in (set(count) & __output_states): 106 | for f_id in __tk_output[state]: 107 | term_freq[doc_id, f_id] += count[state] 108 | 109 | # Distribute the aggregated counts into buckets 110 | bucket_count = len(__buckets) 111 | for doc_id, f_id in term_freq: 112 | bucket_index = hash(f_id) % bucket_count 113 | count = term_freq[doc_id, f_id] 114 | item = ( f_id, doc_id, count ) 115 | os.write(__buckets[bucket_index], marshal.dumps(item)) 116 | 117 | for f in __buckets: 118 | os.close(f) 119 | 120 | return len(term_freq) 121 | 122 | def setup_pass_ptc(cm, num_instances): 123 | global __cm, __num_instances 124 | __cm = cm 125 | __num_instances = num_instances 126 | 127 | def pass_ptc(b_dir): 128 | """ 129 | Take a bucket, form a feature map, compute the count of 130 | each feature in each class. 
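state_trace() above walks the flattened next-move table: the transition for (state, byte) lives at index state * 256 + byte, which is exactly what (state << 8) + letter computes. A tiny illustration with a hand-built two-state table (values invented):

import array
from collections import defaultdict

nm_arr = array.array('H', [0] * (2 * 256))   # 2 states x 256 byte values
nm_arr[(0 << 8) + ord('a')] = 1              # state 0 --'a'--> state 1
nm_arr[(1 << 8) + ord('a')] = 1              # state 1 --'a'--> state 1

counts, state = defaultdict(int), 0
for byte in b'aaxa':
    state = nm_arr[(state << 8) + byte]
    counts[state] += 1
print(dict(counts))    # {1: 3, 0: 1} -- state 1 entered once per 'a'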
131 | @param b_dir path to the bucket directory 132 | @returns (read_count, f_ids, prod) 133 | """ 134 | global __cm, __num_instances 135 | 136 | terms = defaultdict(lambda : np.zeros((__num_instances,), dtype='int')) 137 | 138 | read_count = 0 139 | for path in os.listdir(b_dir): 140 | if path.endswith('.index'): 141 | for f_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path)): 142 | terms[f_id][doc_id] = count 143 | read_count += 1 144 | 145 | f_ids, f_vs = zip(*terms.items()) 146 | fm = np.vstack(f_vs) 147 | prod = np.dot(fm, __cm) 148 | return read_count, f_ids, prod 149 | 150 | 151 | def learn_pc(cm): 152 | """ 153 | @param cm class map 154 | @returns nb_pc: log(P(C)) 155 | """ 156 | pc = np.log(cm.sum(0)) 157 | nb_pc = array.array('d', pc) 158 | return nb_pc 159 | 160 | def generate_cm(items, num_classes): 161 | """ 162 | @param items (class id, path) pairs 163 | @param num_classes The number of classes present 164 | """ 165 | num_instances = len(items) 166 | 167 | # Generate the class map 168 | cm = np.zeros((num_instances, num_classes), dtype='bool') 169 | for docid, (lang_id, path) in enumerate(items): 170 | cm[docid, lang_id] = True 171 | 172 | return cm 173 | 174 | def learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args): 175 | global b_dirs 176 | num_instances = len(paths) 177 | num_features = max( i for v in tk_output.values() for i in v) + 1 178 | 179 | # Generate the feature map 180 | nm_arr = mp.Array('i', tk_nextmove, lock=False) 181 | 182 | if args.jobs: 183 | chunksize = min(len(paths) / (args.jobs*2), args.chunksize) 184 | else: 185 | chunksize = min(len(paths) / (mp.cpu_count()*2), args.chunksize) 186 | 187 | # TODO: Set the output dir 188 | b_dirs = [ tempfile.mkdtemp(prefix="train-",suffix='-bucket', dir=temp_path) for i in range(args.buckets) ] 189 | 190 | output_states = set(tk_output) 191 | 192 | path_chunks = list(chunk(paths, chunksize)) 193 | pass_tokenize_arg = zip(offsets(path_chunks), path_chunks) 194 | 195 | pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs) 196 | with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f: 197 | pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg) 198 | 199 | write_count = sum(pass_tokenize_out) 200 | print("wrote a total of %d keys" % write_count) 201 | 202 | pass_ptc_params = (cm, num_instances) 203 | with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f: 204 | pass_ptc_out = f(pass_ptc, b_dirs) 205 | 206 | reads, ids, prods = zip(*pass_ptc_out) 207 | read_count = sum(reads) 208 | print("read a total of %d keys (%d short)" % (read_count, write_count - read_count)) 209 | 210 | prod = np.zeros((num_features, cm.shape[1]), dtype=int) 211 | prod[np.concatenate(ids)] = np.vstack(prods) 212 | 213 | ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0)) 214 | 215 | nb_ptc = array.array('d') 216 | for term_dist in ptc.tolist(): 217 | nb_ptc.extend(term_dist) 218 | 219 | return nb_ptc 220 | 221 | @atexit.register 222 | def cleanup(): 223 | global b_dirs 224 | try: 225 | for d in b_dirs: 226 | shutil.rmtree(d) 227 | except NameError: 228 | # Failed before b_dirs is defined, nothing to clean 229 | pass 230 | 231 | if __name__ == "__main__": 232 | parser = argparse.ArgumentParser() 233 | parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no paralleization)") 234 | parser.add_argument("-t", "--temp", metavar='TEMP_DIR', help="store buckets in TEMP_DIR instead of in MODEL_DIR/buckets") 235 | parser.add_argument("-s", "--scanner", 
metavar='SCANNER', help="use SCANNER for feature counting") 236 | parser.add_argument("-o", "--output", metavar='OUTPUT', help="output langid.py-compatible model to OUTPUT") 237 | #parser.add_argument("-i","--index",metavar='INDEX',help="read list of training document paths from INDEX") 238 | parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR") 239 | parser.add_argument("--chunksize", type=int, help='maximum chunk size (number of files)', default=MAX_CHUNK_SIZE) 240 | parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS) 241 | args = parser.parse_args() 242 | 243 | if args.temp: 244 | temp_path = args.temp 245 | else: 246 | temp_path = os.path.join(args.model, 'buckets') 247 | 248 | if args.scanner: 249 | scanner_path = args.scanner 250 | else: 251 | scanner_path = os.path.join(args.model, 'LDfeats.scanner') 252 | 253 | if args.output: 254 | output_path = args.output 255 | else: 256 | output_path = os.path.join(args.model, 'model') 257 | 258 | index_path = os.path.join(args.model, 'paths') 259 | lang_path = os.path.join(args.model, 'lang_index') 260 | 261 | # display paths 262 | print("model path:", args.model) 263 | print("temp path:", temp_path) 264 | print("scanner path:", scanner_path) 265 | #print "index path:", index_path 266 | print("output path:", output_path) 267 | 268 | # read list of training files 269 | with open(index_path) as f: 270 | reader = csv.reader(f) 271 | items = [ (l,p) for _,l,p in reader ] 272 | 273 | # read scanner 274 | with open(scanner_path) as f: 275 | tk_nextmove, tk_output, _ = pickle.load(f) 276 | 277 | # read list of languages in order 278 | with open(lang_path) as f: 279 | reader = csv.reader(f) 280 | langs = zip(*reader)[0] 281 | 282 | cm = generate_cm(items, len(langs)) 283 | paths = zip(*items)[1] 284 | 285 | nb_classes = langs 286 | nb_pc = learn_pc(cm) 287 | nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args) 288 | 289 | # output the model 290 | model = nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output 291 | string = base64.b64encode(bz2.compress(pickle.dumps(model))) 292 | with open(output_path, 'w') as f: 293 | f.write(string) 294 | 295 | print("wrote model to %s (%d bytes)" % (output_path, len(string))) 296 | -------------------------------------------------------------------------------- /py3langid/train/README: -------------------------------------------------------------------------------- 1 | Refactoring of the langid.py training tools, to allow for 2 | more flexibility and easier experimentation. 3 | 4 | Planned tools: 5 | 1) index.py - index a corpus. Produce a list of file, corpus, language pairs. 6 | 2) tokenize.py - take an index and tokenize the corresponding files 7 | 3) DFfeatureselect.py - choose features by document frequency 8 | 3) IGweight.py - compute the IG weights for language and for domain 9 | 4) LDfeatureselect.py - take the IG weights and use them to select a feature set 10 | 5) scanner.py - build a scanner on the basis of a feature set 11 | 6) NBtrain.py - learn NB parameters using an indexed corpus and a scanner 12 | 13 | Optional: 14 | A single tool that integrates all steps, calling on each submodule as required. 
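The model emitted at the end of NBtrain.py above is a base64-encoded, bz2-compressed pickle of the (nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output) tuple, and featWeights.py undoes exactly those three steps when it loads it back. The round trip in isolation, with a toy payload:

import base64, bz2, pickle

model = ([0.1, 0.2], [0.3], ('en', 'de'), [0, 1], {1: (0,)})   # toy stand-in for the real tuple
blob = base64.b64encode(bz2.compress(pickle.dumps(model)))
assert pickle.loads(bz2.decompress(base64.b64decode(blob))) == model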
15 | 16 | Marco Lui, January 2013 17 | -------------------------------------------------------------------------------- /py3langid/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adbar/py3langid/812f2055f74c35dea298f30b434644062d9289be/py3langid/train/__init__.py -------------------------------------------------------------------------------- /py3langid/train/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common functions 3 | 4 | Marco Lui, January 2013 5 | """ 6 | 7 | import csv 8 | import errno 9 | import marshal 10 | import multiprocessing as mp 11 | import os 12 | 13 | from contextlib import contextmanager, closing 14 | from itertools import imap, islice 15 | 16 | import numpy 17 | 18 | 19 | class Enumerator(object): 20 | """ 21 | Enumerator object. Returns a larger number each call. 22 | Can be used with defaultdict to enumerate a sequence of items. 23 | """ 24 | def __init__(self, start=0): 25 | self.n = start 26 | 27 | def __call__(self): 28 | retval = self.n 29 | self.n += 1 30 | return retval 31 | 32 | def chunk(seq, chunksize): 33 | """ 34 | Break a sequence into chunks not exceeeding a predetermined size 35 | """ 36 | seq_iter = iter(seq) 37 | while True: 38 | chunk = tuple(islice(seq_iter, chunksize)) 39 | if not chunk: break 40 | yield chunk 41 | 42 | def unmarshal_iter(path): 43 | """ 44 | Open a given path and yield an iterator over items unmarshalled from it. 45 | """ 46 | with open(path, 'rb') as f: 47 | while True: 48 | try: 49 | yield marshal.load(f) 50 | except EOFError: 51 | break 52 | 53 | def makedir(path): 54 | try: 55 | os.makedirs(path) 56 | except OSError as e: 57 | if e.errno != errno.EEXIST: 58 | raise 59 | 60 | 61 | def write_weights(weights, path): 62 | w = dict(weights) 63 | with open(path, 'w') as f: 64 | writer = csv.writer(f) 65 | try: 66 | key_order = sorted(w, key=w.get, reverse=True) 67 | except ValueError: 68 | # Could not order keys by value, value is probably a vector. 69 | # Order keys alphabetically in this case. 70 | key_order = sorted(w) 71 | 72 | for k in key_order: 73 | row = [repr(k)] 74 | try: 75 | row.extend(w[k]) 76 | except TypeError: 77 | row.append(w[k]) 78 | writer.writerow(row) 79 | 80 | 81 | def read_weights(path): 82 | with open(path) as f: 83 | reader = csv.reader(f) 84 | retval = {} 85 | for row in reader: 86 | key = eval(row[0]) 87 | #val = numpy.array( map(float,row[1:]) ) 88 | val = numpy.array( [float(v) if v != 'nan' else 0. for v in row[1:]] ) 89 | retval[key] = val 90 | return retval 91 | 92 | def read_features(path): 93 | """ 94 | Read a list of features in feature-per-line format, where each 95 | feature is a repr and needs to be evaled. 96 | @param path path to read from 97 | """ 98 | with open(path) as f: 99 | return map(eval, f) 100 | 101 | def write_features(features, path): 102 | """ 103 | Write a list of features to a file at `path`. The repr of each 104 | feature is written on a new line. 105 | @param features list of features to write 106 | @param path path to write to 107 | """ 108 | with open(path,'w') as f: 109 | for feat in features: 110 | print(repr(feat),file=f) 111 | 112 | 113 | def index(seq): 114 | """ 115 | Build an index for a sequence of items. Assumes 116 | that the items in the sequence are unique. 
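The Enumerator class above is used as the default factory of a defaultdict so that unseen keys receive consecutive integer ids on first access; the same idiom with itertools.count, for illustration:

from collections import defaultdict
from itertools import count

lang_index = defaultdict(count().__next__)   # behaves like defaultdict(Enumerator())
ids = [lang_index[lang] for lang in ('en', 'de', 'en', 'fr')]
print(ids)                # [0, 1, 0, 2]
print(dict(lang_index))   # {'en': 0, 'de': 1, 'fr': 2}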
117 | @param seq the sequence to index 118 | @returns a dictionary from item to position in the sequence 119 | """ 120 | return {(k,v) for (v,k) in enumerate(seq)} 121 | 122 | 123 | @contextmanager 124 | def MapPool(processes=None, initializer=None, initargs=None, maxtasksperchild=None, chunksize=1): 125 | """ 126 | Contextmanager to express the common pattern of not using multiprocessing if 127 | only 1 job is allocated (for example for debugging reasons) 128 | """ 129 | if processes is None: 130 | processes = mp.cpu_count() + 4 131 | 132 | if processes > 1: 133 | with closing( mp.Pool(processes, initializer, initargs, maxtasksperchild)) as pool: 134 | f = lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize) 135 | yield f 136 | else: 137 | if initializer is not None: 138 | initializer(*initargs) 139 | f = imap 140 | yield f 141 | 142 | if processes > 1: 143 | pool.join() 144 | -------------------------------------------------------------------------------- /py3langid/train/index.py: -------------------------------------------------------------------------------- 1 | """ 2 | index.py - 3 | Index a corpus that is stored in a directory hierarchy as follows: 4 | 5 | - corpus 6 | - domain1 7 | - language1 8 | - file1 9 | - file2 10 | - ... 11 | - language2 12 | - ... 13 | - domain2 14 | - language1 15 | - file1 16 | - file2 17 | - ... 18 | - language2 19 | - ... 20 | - ... 21 | 22 | This produces 3 files: 23 | * index: a list of paths, together with the langid and domainid as integers 24 | * lang_index: a list of languages in ascending order of id, with the count for each 25 | * domain_index: a list of domains in ascending order of id, with the count for each 26 | 27 | Marco Lui, January 2013 28 | 29 | Copyright 2013 Marco Lui . All rights reserved. 30 | 31 | Redistribution and use in source and binary forms, with or without modification, are 32 | permitted provided that the following conditions are met: 33 | 34 | 1. Redistributions of source code must retain the above copyright notice, this list of 35 | conditions and the following disclaimer. 36 | 37 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 38 | of conditions and the following disclaimer in the documentation and/or other materials 39 | provided with the distribution. 40 | 41 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 42 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 43 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 44 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 45 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 46 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 47 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 48 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 49 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 50 | 51 | The views and conclusions contained in the software and documentation are those of the 52 | authors and should not be interpreted as representing official policies, either expressed 53 | or implied, of the copyright holder. 
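CorpusIndexer.index() below recovers the domain and language of each document purely from its position in the corpus/<domain>/<language>/<file> hierarchy described above, by splitting the directory path twice. The path handling in isolation (the path itself is illustrative):

import os

path = os.path.join('corpus', 'wikipedia', 'de', 'doc0001.txt')
dirpath, docname = os.path.split(path)
d, lang = os.path.split(dirpath)
d, domain = os.path.split(d)
print(domain, lang, docname)   # wikipedia de doc0001.txt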
54 | """ 55 | 56 | ###### 57 | # Default values 58 | # Can be overriden with command-line options 59 | ###### 60 | TRAIN_PROP = 1.0 # probability than any given document is selected 61 | MIN_DOMAIN = 1 # minimum number of domains a language must be present in to be included 62 | 63 | import argparse 64 | import csv 65 | import os 66 | import random 67 | 68 | from collections import defaultdict 69 | 70 | import numpy 71 | 72 | from .common import Enumerator, makedir 73 | 74 | 75 | class CorpusIndexer(object): 76 | """ 77 | Class to index the contents of a corpus 78 | """ 79 | def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP, langs=None, domains=None): 80 | self.root = root 81 | self.min_domain = min_domain 82 | self.proportion = proportion 83 | 84 | if langs is None: 85 | self.lang_index = defaultdict(Enumerator()) 86 | else: 87 | # pre-specified lang set 88 | self.lang_index = {(k,v) for v,k in enumerate(langs)} 89 | 90 | if domains is None: 91 | self.domain_index = defaultdict(Enumerator()) 92 | else: 93 | # pre-specified domain set 94 | self.domain_index = dict((k,v) for v,k in enumerate(domains)) 95 | 96 | self.coverage_index = defaultdict(set) 97 | self.items = list() 98 | 99 | self.index(root) 100 | self.prune_min_domain(self.min_domain) 101 | 102 | def index(self, root): 103 | # build a list of paths 104 | paths = [] 105 | for dirpath, dirnames, filenames in os.walk(root, followlinks=True): 106 | for docname in filenames: 107 | if random.random() < self.proportion: 108 | # Each file has 'proportion' chance of being selected. 109 | path = os.path.join(dirpath, docname) 110 | 111 | # split the dirpath into identifying components 112 | d, lang = os.path.split(dirpath) 113 | d, domain = os.path.split(d) 114 | 115 | # index the language and the domain 116 | try: 117 | # TODO: If lang is pre-specified but not domain, we can end up 118 | # enumerating empty domains. 119 | domain_id = self.domain_index[domain] 120 | lang_id = self.lang_index[lang] 121 | except KeyError: 122 | # lang or domain outside a pre-specified set so 123 | # skip this document. 
124 | continue 125 | 126 | # add the domain-lang relation to the coverage index 127 | self.coverage_index[domain].add(lang) 128 | 129 | # add the item to our list 130 | self.items.append((domain_id,lang_id,docname,path)) 131 | 132 | 133 | def prune_min_domain(self, min_domain): 134 | # prune files for all languages that do not occur in at least min_domain 135 | 136 | # Work out which languages to reject as they are not present in at least 137 | # the required number of domains 138 | lang_domain_count = defaultdict(int) 139 | for langs in self.coverage_index.values(): 140 | for lang in langs: 141 | lang_domain_count[lang] += 1 142 | reject_langs = set( l for l in lang_domain_count if lang_domain_count[l] < min_domain) 143 | 144 | # Remove the languages from the indexer 145 | if reject_langs: 146 | #print "reject (<{0} domains): {1}".format(min_domain, sorted(reject_langs)) 147 | reject_ids = set(self.lang_index[l] for l in reject_langs) 148 | 149 | new_lang_index = defaultdict(Enumerator()) 150 | lm = dict() 151 | for k,v in self.lang_index.items(): 152 | if v not in reject_ids: 153 | new_id = new_lang_index[k] 154 | lm[v] = new_id 155 | 156 | # Eliminate all entries for the languages 157 | self.items = [ (d, lm[l], n, p) for (d, l, n, p) in self.items if l in lm] 158 | 159 | self.lang_index = new_lang_index 160 | 161 | 162 | @property 163 | def dist_lang(self): 164 | """ 165 | @returns A vector over frequency counts for each language 166 | """ 167 | retval = numpy.zeros((len(self.lang_index),), dtype='int') 168 | for d, l, n, p in self.items: 169 | retval[l] += 1 170 | return retval 171 | 172 | @property 173 | def dist_domain(self): 174 | """ 175 | @returns A vector over frequency counts for each domain 176 | """ 177 | retval = numpy.zeros((len(self.domain_index),), dtype='int') 178 | for d, l, n, p in self.items: 179 | retval[d] += 1 180 | return retval 181 | 182 | # TODO: Remove this as it should no longer be needed 183 | @property 184 | def classmaps(self): 185 | num_instances = len(self.items) 186 | if num_instances == 0: 187 | raise ValueError("no items indexed!") 188 | cm_domain = numpy.zeros((num_instances, len(self.domain_index)), dtype='bool') 189 | cm_lang = numpy.zeros((num_instances, len(self.lang_index)), dtype='bool') 190 | 191 | # Populate the class maps 192 | for docid, (domain_id, lang_id, docname, path) in enumerate(self.items): 193 | cm_domain[docid, domain_id] = True 194 | cm_lang[docid, lang_id] = True 195 | return cm_domain, cm_lang 196 | 197 | @property 198 | def paths(self): 199 | return [ p for (d,l,n,p) in self.items ] 200 | 201 | 202 | if __name__ == "__main__": 203 | parser = argparse.ArgumentParser() 204 | parser.add_argument("-p","--proportion", type=float, default=TRAIN_PROP, 205 | help="proportion of training data to use" ) 206 | parser.add_argument("-m","--model", help="save output to MODEL_DIR", metavar="MODEL_DIR") 207 | parser.add_argument("-d","--domain", metavar="DOMAIN", action='append', 208 | help="use DOMAIN - can be specified multiple times (uses all domains found if not specified)") 209 | parser.add_argument("-l","--lang", metavar="LANG", action='append', 210 | help="use LANG - can be specified multiple times (uses all langs found if not specified)") 211 | parser.add_argument("--min_domain", type=int, default=MIN_DOMAIN, 212 | help="minimum number of domains a language must be present in" ) 213 | parser.add_argument("corpus", help="read corpus from CORPUS_DIR", metavar="CORPUS_DIR") 214 | 215 | args = parser.parse_args() 216 | 217 | corpus_name = 
os.path.basename(args.corpus) 218 | if args.model: 219 | model_dir = args.model 220 | else: 221 | model_dir = os.path.join('.', corpus_name+'.model') 222 | 223 | makedir(model_dir) 224 | 225 | langs_path = os.path.join(model_dir, 'lang_index') 226 | domains_path = os.path.join(model_dir, 'domain_index') 227 | index_path = os.path.join(model_dir, 'paths') 228 | 229 | # display paths 230 | print("corpus path:", args.corpus) 231 | print("model path:", model_dir) 232 | print("writing langs to:", langs_path) 233 | print("writing domains to:", domains_path) 234 | print("writing index to:", index_path) 235 | 236 | indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain, proportion=args.proportion, 237 | langs = args.lang, domains = args.domain) 238 | 239 | # Compute mappings between files, languages and domains 240 | lang_dist = indexer.dist_lang 241 | lang_index = indexer.lang_index 242 | lang_info = ' '.join(("{0}({1})".format(k, lang_dist[v]) for k,v in lang_index.items())) 243 | print("langs({0}): {1}".format(len(lang_dist), lang_info)) 244 | 245 | domain_dist = indexer.dist_domain 246 | domain_index = indexer.domain_index 247 | domain_info = ' '.join(("{0}({1})".format(k, domain_dist[v]) for k,v in domain_index.items())) 248 | print("domains({0}): {1}".format(len(domain_dist), domain_info)) 249 | 250 | print("identified {0} files".format(len(indexer.items))) 251 | 252 | # output the language index 253 | with open(langs_path,'w') as f: 254 | writer = csv.writer(f) 255 | writer.writerows((l, lang_dist[lang_index[l]]) 256 | for l in sorted(lang_index.keys(), key=lang_index.get)) 257 | 258 | # output the domain index 259 | with open(domains_path,'w') as f: 260 | writer = csv.writer(f) 261 | writer.writerows((d, domain_dist[domain_index[d]]) 262 | for d in sorted(domain_index.keys(), key=domain_index.get)) 263 | 264 | # output items found 265 | with open(index_path,'w') as f: 266 | writer = csv.writer(f) 267 | writer.writerows( (d,l,p) for (d,l,n,p) in indexer.items ) 268 | -------------------------------------------------------------------------------- /py3langid/train/scanner.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanner.py - 3 | Assemble a "feature scanner" using Aho-Corasick string matching. 4 | This takes a list of features (byte sequences) and builds a DFA 5 | that when run on a byte stream can identify how often each of 6 | the features is present in a single pass over the stream. 7 | 8 | Marco Lui, January 2013 9 | 10 | Copyright 2013 Marco Lui . All rights reserved. 11 | 12 | Redistribution and use in source and binary forms, with or without modification, are 13 | permitted provided that the following conditions are met: 14 | 15 | 1. Redistributions of source code must retain the above copyright notice, this list of 16 | conditions and the following disclaimer. 17 | 18 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 19 | of conditions and the following disclaimer in the documentation and/or other materials 20 | provided with the distribution. 21 | 22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 23 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 24 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 25 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 30 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | The views and conclusions contained in the software and documentation are those of the 33 | authors and should not be interpreted as representing official policies, either expressed 34 | or implied, of the copyright holder. 35 | """ 36 | 37 | import argparse 38 | import array 39 | import os 40 | import pickle 41 | from collections import deque, defaultdict 42 | from .common import read_features 43 | 44 | class Scanner(object): 45 | alphabet = [chr(n) for n in range(1<<8)] # materialized as a list: it is iterated repeatedly in build() 46 | """ 47 | Implementation of Aho-Corasick string matching. 48 | This class should be instantiated with a set of keywords, which 49 | will then be the only tokens generated by the class's search method. 50 | """ 51 | @classmethod 52 | def from_file(cls, path): 53 | with open(path, 'rb') as f: 54 | tk_nextmove, tk_output, feats = pickle.load(f) 55 | if isinstance(feats, dict): 56 | # The old scanner format had two identical dictionaries as the last 57 | # two items in the tuple. This format can still be used by langid.py, 58 | # but it does not carry the feature list, and so cannot be unpacked 59 | # back into a Scanner object. 60 | raise ValueError("old format scanner - please retrain. see code for details.") 61 | # tk_output is a mapping from state to a list of feature indices. 62 | # because of the way the scanner class is written, it needs a mapping 63 | # from state to the feature itself. We rebuild this here. 
64 | tk_output_f = dict( (k,[feats[i] for i in v]) for k,v in tk_output.items() ) 65 | scanner = cls.__new__(cls) 66 | scanner.__setstate__((tk_nextmove, tk_output_f)) 67 | return scanner 68 | 69 | def __init__(self, keywords): 70 | self.build(keywords) 71 | 72 | def __call__(self, value): 73 | return self.search(value) 74 | 75 | def build(self, keywords): 76 | goto = dict() 77 | fail = dict() 78 | output = defaultdict(set) 79 | 80 | # Algorithm 2 81 | newstate = 0 82 | for a in keywords: 83 | state = 0 84 | j = 0 85 | while (j < len(a)) and (state, a[j]) in goto: 86 | state = goto[(state, a[j])] 87 | j += 1 88 | for p in range(j, len(a)): 89 | newstate += 1 90 | goto[(state, a[p])] = newstate 91 | #print "(%d, %s) -> %d" % (state, a[p], newstate) 92 | state = newstate 93 | output[state].add(a) 94 | for a in self.alphabet: 95 | if (0,a) not in goto: 96 | goto[(0,a)] = 0 97 | 98 | # Algorithm 3 99 | queue = deque() 100 | for a in self.alphabet: 101 | if goto[(0,a)] != 0: 102 | s = goto[(0,a)] 103 | queue.append(s) 104 | fail[s] = 0 105 | while queue: 106 | r = queue.popleft() 107 | for a in self.alphabet: 108 | if (r,a) in goto: 109 | s = goto[(r,a)] 110 | queue.append(s) 111 | state = fail[r] 112 | while (state,a) not in goto: 113 | state = fail[state] 114 | fail[s] = goto[(state,a)] 115 | #print "f(%d) -> %d" % (s, goto[(state,a)]), output[fail[s]] 116 | if output[fail[s]]: 117 | output[s].update(output[fail[s]]) 118 | 119 | # Algorithm 4 120 | self.nextmove = {} 121 | for a in self.alphabet: 122 | self.nextmove[(0,a)] = goto[(0,a)] 123 | if goto[(0,a)] != 0: 124 | queue.append(goto[(0,a)]) 125 | while queue: 126 | r = queue.popleft() 127 | for a in self.alphabet: 128 | if (r,a) in goto: 129 | s = goto[(r,a)] 130 | queue.append(s) 131 | self.nextmove[(r,a)] = s 132 | else: 133 | self.nextmove[(r,a)] = self.nextmove[(fail[r],a)] 134 | 135 | # convert the output to tuples, as tuple iteration is faster 136 | # than set iteration 137 | self.output = dict((k, tuple(output[k])) for k in output) 138 | 139 | # Next move encoded as a single array. The index of the next state 140 | # is located at current state * alphabet size + ord(c). 141 | # The choice of 'H' array typecode limits us to 64k states. 142 | def generate_nm_arr(typecode): 143 | def nextstate_iter(): 144 | # State count starts at 0, so the number of states is the number of 145 | # the last state (newstate) + 1 146 | for state in range(newstate+1): 147 | for letter in self.alphabet: 148 | yield self.nextmove[(state, letter)] 149 | return array.array(typecode, nextstate_iter()) 150 | try: 151 | self.nm_arr = generate_nm_arr('H') 152 | except OverflowError: 153 | # Could not fit in an unsigned short array, let's try an unsigned long array. 154 | self.nm_arr = generate_nm_arr('L') 155 | 156 | def __getstate__(self): 157 | """ 158 | Compiled nextmove and output. 
159 | """ 160 | return (self.nm_arr, self.output) 161 | 162 | def __setstate__(self, value): 163 | nm_array, output = value 164 | self.nm_arr = nm_array 165 | self.output = output 166 | self.nextmove = {} 167 | for i, next_state in enumerate(nm_array): 168 | state = i // 256 169 | letter = chr(i % 256) 170 | self.nextmove[(state, letter)] = next_state 171 | 172 | def search(self, string): 173 | state = 0 174 | for letter in map(ord,string): 175 | state = self.nm_arr[(state << 8) + letter] 176 | for key in self.output.get(state, []): 177 | yield key 178 | 179 | def build_scanner(features): 180 | """ 181 | In contrast to the Scanner class, this function unwraps a layer of indirection in 182 | the detection of features. It translates the string output of the scanner's output 183 | mapping into the index values (positions in the list) of the features in the supplied 184 | feature set. This is very useful where we are only interested in the relative frequencies 185 | of features. 186 | 187 | @param features a list of features (byte sequences) 188 | @returns a compiled scanner model 189 | """ 190 | feat_index = index(features) 191 | 192 | # Build the actual scanner 193 | print("building scanner") 194 | scanner = Scanner(features) 195 | tk_nextmove, raw_output = scanner.__getstate__() 196 | 197 | # tk_output is the output function of the scanner. It should generate indices into 198 | # the feature space directly, as this saves a lookup 199 | tk_output = {} 200 | for k,v in raw_output.items(): 201 | tk_output[k] = tuple(feat_index[f] for f in v) 202 | return tk_nextmove, tk_output 203 | 204 | 205 | def index(seq): 206 | """ 207 | Build an index for a sequence of items. Assumes 208 | that the items in the sequence are unique. 209 | @param seq the sequence to index 210 | @returns a dictionary from item to position in the sequence 211 | """ 212 | return dict((k,v) for (v,k) in enumerate(seq)) 213 | 214 | if __name__ == "__main__": 215 | parser = argparse.ArgumentParser() 216 | parser.add_argument("input", metavar="INPUT", help="build a scanner for INPUT. If input is a directory, read INPUT/LDfeats") 217 | parser.add_argument("-o","--output", help="output scanner to OUTFILE", metavar="OUTFILE") 218 | args = parser.parse_args() 219 | 220 | if os.path.isdir(args.input): 221 | input_path = os.path.join(args.input, 'LDfeats') 222 | else: 223 | input_path = args.input 224 | 225 | if args.output: 226 | output_path = args.output 227 | else: 228 | output_path = input_path + '.scanner' 229 | 230 | # display paths 231 | print("input path:", input_path) 232 | print("output path:", output_path) 233 | 234 | nb_features = read_features(input_path) 235 | tk_nextmove, tk_output = build_scanner(nb_features) 236 | scanner = tk_nextmove, tk_output, nb_features 237 | 238 | with open(output_path, 'wb') as f: 239 | pickle.dump(scanner, f) 240 | print("wrote scanner to {0}".format(output_path)) 241 | -------------------------------------------------------------------------------- /py3langid/train/tokenize.py: -------------------------------------------------------------------------------- 1 | """ 2 | tokenize.py - 3 | Tokenizer for langid.py training system. This takes a list of files and tokenizes them 4 | in parallel. 5 | 6 | Marco Lui, January 2013 7 | 8 | Copyright 2013 Marco Lui . All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without modification, are 11 | permitted provided that the following conditions are met: 12 | 13 | 1. 
Redistributions of source code must retain the above copyright notice, this list of 14 | conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 17 | of conditions and the following disclaimer in the documentation and/or other materials 18 | provided with the distribution. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 21 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 22 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 28 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | The views and conclusions contained in the software and documentation are those of the 31 | authors and should not be interpreted as representing official policies, either expressed 32 | or implied, of the copyright holder. 33 | """ 34 | 35 | ###### 36 | # Default values 37 | # Can be overridden with command-line options 38 | ###### 39 | 40 | MIN_NGRAM_ORDER = 1 # smallest order of n-grams to consider 41 | MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider 42 | TOP_DOC_FREQ = 15000 # number of tokens to consider for each order 43 | NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation 44 | CHUNKSIZE = 50 # maximum size of chunk (number of files tokenized - less = less memory use) 45 | 46 | import argparse 47 | import atexit 48 | import csv 49 | import marshal 50 | import multiprocessing as mp 51 | import os 52 | import random 53 | import shutil 54 | import tempfile 55 | 56 | from itertools import tee 57 | from collections import defaultdict 58 | 59 | from .common import makedir, chunk, MapPool 60 | 61 | class NGramTokenizer(object): 62 | def __init__(self, min_order=1, max_order=3): 63 | self.min_order = min_order 64 | self.max_order = max_order 65 | 66 | def __call__(self, seq): 67 | min_order = self.min_order 68 | max_order = self.max_order 69 | t = tee(seq, max_order) 70 | for i in range(max_order): 71 | for _ in range(i): 72 | # advance iterators, ignoring result 73 | next(t[i], None) 74 | while True: 75 | token = ''.join(next(tn, '') for tn in t) # exhausted iterators contribute '', ending the loop below 76 | if len(token) < max_order: break 77 | for n in range(min_order-1, max_order): 78 | yield token[:n+1] 79 | for a in range(max_order-1): 80 | for b in range(min_order, max_order-a): 81 | yield token[a:a+b] 82 | 83 | @atexit.register 84 | def cleanup(): 85 | global b_dirs, complete 86 | try: 87 | if not complete: 88 | for d in b_dirs: 89 | shutil.rmtree(d) 90 | except NameError: 91 | # Failed before globals defined, nothing to clean 92 | pass 93 | 94 | def setup_pass_tokenize(tokenizer, b_dirs, sample_count, sample_size): 95 | global __tokenizer, __b_dirs, __sample_count, __sample_size 96 | __tokenizer = tokenizer 97 | __b_dirs = b_dirs 98 | __sample_count = sample_count 99 | __sample_size = sample_size 100 | 101 | def pass_tokenize(chunk_items): 102 | """ 103 | Chunk files into a doc->term mapping, 104 | and simultaneously build a term->df count. 
105 | The term->df counts are redistributed to 106 | buckets via python's in-built hash function. 107 | This is basically an inversion step, so that 108 | now we are chunked on the term axis rather 109 | than the document axis. 110 | """ 111 | global __maxorder, __b_dirs, __extractor, __sample_count, __sample_size 112 | __procname = mp.current_process().name 113 | b_freq_lang = [tempfile.mkstemp(prefix=__procname+'-', suffix='.lang', dir=p)[0] for p in __b_dirs] 114 | b_freq_domain = [tempfile.mkstemp(prefix=__procname+'-', suffix='.domain', dir=p)[0] for p in __b_dirs] 115 | 116 | extractor = __tokenizer 117 | term_lng_freq = defaultdict(lambda: defaultdict(int)) 118 | term_dom_freq = defaultdict(lambda: defaultdict(int)) 119 | 120 | for domain_id, lang_id, path in chunk_items: 121 | with open(path) as f: 122 | if __sample_count: 123 | # sampling tokenization 124 | text = f.read() 125 | poss = max(1,len(text) - __sample_size) # possible start locations 126 | count = min(poss, __sample_count) # reduce number of samples if document is too short 127 | offsets = random.sample(range(poss), count) 128 | for offset in offsets: 129 | tokenset = set(extractor(text[offset: offset+__sample_size])) 130 | for token in tokenset: 131 | term_lng_freq[token][lang_id] += 1 132 | term_dom_freq[token][domain_id] += 1 133 | 134 | else: 135 | # whole-document tokenization 136 | tokenset = set(extractor(f.read())) 137 | for token in tokenset: 138 | term_lng_freq[token][lang_id] += 1 139 | term_dom_freq[token][domain_id] += 1 140 | 141 | for term in term_lng_freq: 142 | bucket_index = hash(term) % len(b_freq_lang) 143 | for lang, count in term_lng_freq[term].items(): 144 | os.write(b_freq_lang[bucket_index], marshal.dumps((term, lang, count))) 145 | for domain, count in term_dom_freq[term].items(): 146 | os.write(b_freq_domain[bucket_index], marshal.dumps((term, domain, count))) 147 | 148 | # Close all the open files 149 | for f in b_freq_lang + b_freq_domain: 150 | os.close(f) 151 | 152 | return len(term_lng_freq) 153 | 154 | def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None, chunksize=CHUNKSIZE, sample_count=None, sample_size=None): 155 | """ 156 | @param items a list of (domain, language, path) tuples 157 | """ 158 | global b_dirs, complete 159 | 160 | # Our exitfunc uses this to know whether to delete the tokenized files 161 | complete = False 162 | 163 | if jobs is None: 164 | jobs = mp.cpu_count() + 4 165 | 166 | b_dirs = [ tempfile.mkdtemp(prefix="tokenize-",suffix='-{0}'.format(tokenizer.__class__.__name__), dir=outdir) for i in range(buckets) ] 167 | 168 | # PASS 1: Tokenize documents into sets of terms 169 | 170 | # If there are few items, make the chunk size such that each job 171 | # will have 2 chunks 172 | chunk_size = max(1,min(len(items) // (jobs * 2), chunksize)) 173 | item_chunks = list(chunk(items, chunk_size)) 174 | pass_tokenize_globals = (tokenizer, b_dirs, sample_count, sample_size) 175 | 176 | with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f: 177 | pass_tokenize_out = f(pass_tokenize, item_chunks) 178 | 179 | 180 | doc_count = defaultdict(int) 181 | chunk_count = len(item_chunks) 182 | print("chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count)) 183 | print("job count: {0}".format(jobs)) 184 | 185 | if sample_count: 186 | print("sampling-based tokenization: size {0} count {1}".format(sample_size, sample_count)) 187 | else: 188 | print("whole-document tokenization") 189 | 190 | for i, keycount in enumerate(pass_tokenize_out): 191 | 
print("tokenized chunk (%d/%d) [%d keys]" % (i+1,chunk_count, keycount)) 192 | 193 | complete = True 194 | 195 | return b_dirs 196 | 197 | if __name__ == "__main__": 198 | parser = argparse.ArgumentParser() 199 | parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no paralleization)") 200 | parser.add_argument("-s", "--scanner", metavar='SCANNER', help="use SCANNER for tokenizing") 201 | parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS) 202 | parser.add_argument("--max_order", type=int, help="highest n-gram order to use") 203 | parser.add_argument("--word", action='store_true', default=False, help="use 'word' tokenization (currently str.split)") 204 | parser.add_argument("--chunksize", type=int, help="max chunk size (number of files to tokenize at a time - smaller should reduce memory use)", default=CHUNKSIZE) 205 | parser.add_argument("-t", "--temp", metavar='TEMP_DIR', help="store buckets in TEMP_DIR instead of in MODEL_DIR/buckets") 206 | parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR") 207 | 208 | group = parser.add_argument_group('sampling') 209 | group.add_argument("--sample_size", type=int, help="size of sample for sampling-based tokenization", default=140) 210 | group.add_argument("--sample_count", type=int, help="number of samples for sampling-based tokenization", default=None) 211 | 212 | args = parser.parse_args() 213 | 214 | if args.temp: 215 | buckets_dir = args.temp 216 | else: 217 | buckets_dir = os.path.join(args.model, 'buckets') 218 | makedir(buckets_dir) 219 | 220 | bucketlist_path = os.path.join(args.model, 'bucketlist') 221 | index_path = os.path.join(args.model, 'paths') 222 | 223 | # display paths 224 | print("index path:", index_path) 225 | print("bucketlist path:", bucketlist_path) 226 | print("buckets path:", buckets_dir) 227 | 228 | with open(index_path) as f: 229 | reader = csv.reader(f) 230 | items = list(reader) 231 | 232 | if sum(map(bool,(args.scanner, args.max_order, args.word))) > 1: 233 | parser.error('can only specify one of --word, --scanner and --max_order') 234 | 235 | # Tokenize 236 | print("will tokenize %d files" % len(items)) 237 | if args.scanner: 238 | from .scanner import Scanner 239 | tokenizer = Scanner.from_file(args.scanner) 240 | print("using provided scanner: ", args.scanner) 241 | elif args.word: 242 | tokenizer = str.split 243 | print("using str.split to tokenize") 244 | else: 245 | max_order = args.max_order if args.max_order else MAX_NGRAM_ORDER 246 | tokenizer = NGramTokenizer(1,max_order) 247 | print("using n-gram tokenizer: max_order({0})".format(max_order)) 248 | b_dirs = build_index(items, tokenizer, buckets_dir, args.buckets, args.jobs, args.chunksize, args.sample_count, args.sample_size) 249 | 250 | # output the paths to the buckets 251 | with open(bucketlist_path,'w') as f: 252 | for d in b_dirs: 253 | f.write(d+'\n') 254 | -------------------------------------------------------------------------------- /py3langid/train/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | train.py - 3 | All-in-one tool for easy training of a model for langid.py. This depends on the 4 | training tools for individual steps, which can be run separately. 5 | 6 | Marco Lui, January 2013 7 | 8 | Copyright 2013 Marco Lui . All rights reserved. 
9 | 10 | Redistribution and use in source and binary forms, with or without modification, are 11 | permitted provided that the following conditions are met: 12 | 13 | 1. Redistributions of source code must retain the above copyright notice, this list of 14 | conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 17 | of conditions and the following disclaimer in the documentation and/or other materials 18 | provided with the distribution. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 21 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 22 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 28 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | The views and conclusions contained in the software and documentation are those of the 31 | authors and should not be interpreted as representing official policies, either expressed 32 | or implied, of the copyright holder. 33 | """ 34 | 35 | TRAIN_PROP = 1.0 # probability than any given document is selected 36 | MIN_DOMAIN = 1 # minimum number of domains a language must be present in to be included 37 | MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider 38 | TOP_DOC_FREQ = 15000 # number of tokens to consider for each order 39 | NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation 40 | CHUNKSIZE = 50 # maximum size of chunk (number of files tokenized - less = less memory use) 41 | FEATURES_PER_LANG = 300 # number of features to select for each language 42 | 43 | import argparse 44 | import base64 45 | import bz2 46 | import csv 47 | import os 48 | import pickle 49 | import shutil 50 | 51 | import numpy 52 | 53 | from .common import makedir, write_weights, write_features, read_features 54 | from .index import CorpusIndexer 55 | from .tokenize import build_index, NGramTokenizer 56 | from .DFfeatureselect import tally, ngram_select 57 | from .IGweight import compute_IG 58 | from .LDfeatureselect import select_LD_features 59 | from .scanner import build_scanner, Scanner 60 | 61 | from .NBtrain import generate_cm, learn_pc, learn_ptc 62 | 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("-p","--proportion", type=float, help="proportion of training data to use", default=TRAIN_PROP) 67 | parser.add_argument("-m","--model", help="save output to MODEL_DIR", metavar="MODEL_DIR") 68 | parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no paralleization)") 69 | parser.add_argument("-t", "--temp", metavar='TEMP_DIR', help="store buckets in TEMP_DIR instead of in MODEL_DIR/buckets") 70 | parser.add_argument("-d","--domain", metavar="DOMAIN", action='append', 71 | help="use DOMAIN - can be specified multiple times (uses all domains found if not specified)") 72 | parser.add_argument("-l","--lang", metavar="LANG", action='append', 73 | help="use LANG - can be specified multiple times 
(uses all langs found if not specified)") 74 | parser.add_argument("--min_domain", type=int, help="minimum number of domains a language must be present in", default=MIN_DOMAIN) 75 | parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS) 76 | parser.add_argument("--max_order", type=int, help="highest n-gram order to use", default=MAX_NGRAM_ORDER) 77 | parser.add_argument("--chunksize", type=int, help="max chunk size (number of files to tokenize at a time - smaller should reduce memory use)", default=CHUNKSIZE) 78 | parser.add_argument("--df_tokens", type=int, help="number of tokens to consider for each n-gram order", default=TOP_DOC_FREQ) 79 | parser.add_argument("--word", action='store_true', default=False, help="use 'word' tokenization (currently str.split)") 80 | parser.add_argument("--df_feats", metavar="FEATS", help="Instead of DF feature selection, use a list of features from FEATS") 81 | parser.add_argument("--ld_feats", metavar="FEATS", help="Instead of LD feature selection, use a list of features from FEATS") 82 | parser.add_argument("--feats_per_lang", type=int, metavar='N', help="select top N features for each language", default=FEATURES_PER_LANG) 83 | parser.add_argument("--no_domain_ig", action="store_true", default=False, help="use only per-langugage IG in LD calculation") 84 | parser.add_argument("--debug", action="store_true", default=False, help="produce debug output (all intermediates)") 85 | 86 | group = parser.add_argument_group('sampling') 87 | group.add_argument("--sample_size", type=int, help="size of sample for sampling-based tokenization", default=140) 88 | group.add_argument("--sample_count", type=int, help="number of samples for sampling-based tokenization", default=None) 89 | 90 | parser.add_argument("corpus", help="read corpus from CORPUS_DIR", metavar="CORPUS_DIR") 91 | 92 | args = parser.parse_args() 93 | 94 | if args.df_feats and args.ld_feats: 95 | parser.error("--df_feats and --ld_feats are mutually exclusive") 96 | 97 | corpus_name = os.path.basename(args.corpus) 98 | if args.model: 99 | model_dir = args.model 100 | else: 101 | model_dir = os.path.join('.', corpus_name+'.model') 102 | 103 | makedir(model_dir) 104 | 105 | langs_path = os.path.join(model_dir, 'lang_index') 106 | domains_path = os.path.join(model_dir, 'domain_index') 107 | index_path = os.path.join(model_dir, 'paths') 108 | 109 | # display paths 110 | print("corpus path:", args.corpus) 111 | print("model path:", model_dir) 112 | 113 | indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain, proportion=args.proportion, 114 | langs = args.lang, domains = args.domain) 115 | 116 | # Compute mappings between files, languages and domains 117 | lang_dist = indexer.dist_lang 118 | lang_index = indexer.lang_index 119 | lang_info = ' '.join(("{0}({1})".format(k, lang_dist[v]) for k,v in lang_index.items())) 120 | print("langs({0}): {1}".format(len(lang_dist), lang_info)) 121 | 122 | domain_dist = indexer.dist_domain 123 | domain_index = indexer.domain_index 124 | domain_info = ' '.join(("{0}({1})".format(k, domain_dist[v]) for k,v in domain_index.items())) 125 | print("domains({0}): {1}".format(len(domain_dist), domain_info)) 126 | 127 | print("identified {0} files".format(len(indexer.items))) 128 | 129 | items = [ (d,l,p) for (d,l,n,p) in indexer.items ] 130 | if args.debug: 131 | # output the language index 132 | with open(langs_path,'w') as f: 133 | writer = csv.writer(f) 134 | writer.writerows((l, lang_dist[lang_index[l]]) 135 
| for l in sorted(lang_index, key=lang_index.get)) 136 | 137 | # output the domain index 138 | with open(domains_path,'w') as f: 139 | writer = csv.writer(f) 140 | writer.writerows((d, domain_dist[domain_index[d]]) 141 | for d in sorted(domain_index, key=domain_index.get)) 142 | 143 | # output items found 144 | with open(index_path,'w') as f: 145 | writer = csv.writer(f) 146 | writer.writerows(items) 147 | 148 | if args.temp: 149 | buckets_dir = args.temp 150 | else: 151 | buckets_dir = os.path.join(model_dir, 'buckets') 152 | makedir(buckets_dir) 153 | 154 | bucketlist_path = os.path.join(model_dir, 'bucketlist') 155 | index_path = os.path.join(model_dir, 'paths') 156 | 157 | if args.ld_feats: 158 | # LD features are pre-specified. We are basically just building the NB model. 159 | LDfeats = read_features(args.ld_feats) 160 | 161 | else: 162 | # LD features not pre-specified, so we compute them. 163 | 164 | # Tokenize 165 | DFfeats = None 166 | print("will tokenize %d files" % len(items)) 167 | # TODO: Custom tokenizer if doing custom first-pass features 168 | if args.df_feats: 169 | print("reading custom features from:", args.df_feats) 170 | DFfeats = read_features(args.df_feats) 171 | print("building tokenizer for custom list of {0} features".format(len(DFfeats))) 172 | tk = Scanner(DFfeats) 173 | elif args.word: 174 | print("using word tokenizer") 175 | tk = str.split 176 | else: 177 | print("using byte NGram tokenizer, max_order: {0}".format(args.max_order)) 178 | tk = NGramTokenizer(1, args.max_order) 179 | 180 | # First-pass tokenization, used to determine DF of features 181 | b_dirs = build_index(items, tk, buckets_dir, args.buckets, args.jobs, args.chunksize, args.sample_count, args.sample_size) 182 | 183 | if args.debug: 184 | # output the paths to the buckets 185 | with open(bucketlist_path,'w') as f: 186 | for d in b_dirs: 187 | f.write(d+'\n') 188 | 189 | # We need to compute a tally if we are selecting features by DF, but also if 190 | # we want full debug output. 191 | if DFfeats is None or args.debug: 192 | # Compute DF per-feature 193 | doc_count = tally(b_dirs, args.jobs) 194 | if args.debug: 195 | doc_count_path = os.path.join(model_dir, 'DF_all') 196 | write_weights(doc_count, doc_count_path) 197 | print("wrote DF counts for all features to:", doc_count_path) 198 | 199 | if DFfeats is None: 200 | # Choose the first-stage features 201 | DFfeats = ngram_select(doc_count, args.max_order, args.df_tokens) 202 | doc_count = None 203 | 204 | if args.debug: 205 | feature_path = os.path.join(model_dir, 'DFfeats') 206 | write_features(DFfeats, feature_path) 207 | print('wrote features to "%s"' % feature_path ) 208 | 209 | # Dispose of the first-pass tokenize output as it is no longer 210 | # needed. 211 | if not args.debug: 212 | for b in b_dirs: 213 | shutil.rmtree(b) 214 | 215 | # Second-pass tokenization to only obtain counts for the selected features. 216 | # As the first-pass set is typically much larger than the second pass, it often 217 | # works out to be faster to retokenize the raw documents rather than iterate 218 | # over the first-pass counts. 
219 | DF_scanner = Scanner(DFfeats) 220 | b_dirs = build_index(items, DF_scanner, buckets_dir, args.buckets, args.jobs, args.chunksize) 221 | DF_scanner = None 222 | 223 | # Build vectors of domain and language distributions for use in IG calculation 224 | domain_dist_vec = numpy.array([ domain_dist[domain_index[d]] 225 | for d in sorted(domain_index, key=domain_index.get)], dtype=int) 226 | domain_dist = None 227 | lang_dist_vec = numpy.array([ lang_dist[lang_index[l]] 228 | for l in sorted(lang_index.keys(), key=lang_index.get)], dtype=int) 229 | lang_dist = None 230 | 231 | # Compute IG 232 | ig_params = [ 233 | ('lang', lang_dist_vec, '.lang', True), 234 | ] 235 | if not args.no_domain_ig: 236 | ig_params.append( ('domain', domain_dist_vec, '.domain', False) ) 237 | 238 | ig_vals = {} 239 | for label, dist, suffix, binarize in ig_params: 240 | print("Computing information gain for {0}".format(label)) 241 | ig = compute_IG(b_dirs, DFfeats, dist, binarize, suffix, args.jobs) 242 | if args.debug: 243 | weights_path = os.path.join(model_dir, 'IGweights' + suffix + ('.bin' if binarize else '')) 244 | write_weights(ig, weights_path) 245 | ig_vals[label] = dict((row[0], numpy.array(row[1].flat)) for row in ig) 246 | 247 | ig = None 248 | DFfeats = None 249 | # Select features according to the LD criteria 250 | features_per_lang = select_LD_features(ig_vals['lang'], ig_vals.get('domain'), args.feats_per_lang, ignore_domain = args.no_domain_ig) 251 | ig_vals = None 252 | LDfeats = set().union(*features_per_lang.values()) 253 | print('selected %d features' % len(LDfeats)) 254 | 255 | if args.debug: 256 | feature_path = os.path.join(model_dir, 'LDfeats') 257 | write_features(sorted(LDfeats), feature_path) 258 | print('wrote LD features to "%s"' % feature_path ) 259 | 260 | with open(feature_path + '.perlang', 'w') as f: 261 | writer = csv.writer(f) 262 | for i in range(len(features_per_lang)): 263 | writer.writerow(map(repr,features_per_lang[i])) 264 | 265 | print('wrote LD.perlang features to "%s"' % (feature_path + '.perlang')) 266 | features_per_lang = None 267 | 268 | # Compile a scanner for the LDfeats 269 | tk_nextmove, tk_output = build_scanner(LDfeats) 270 | if args.debug: 271 | scanner_path = feature_path + '.scanner' 272 | with open(scanner_path, 'wb') as f: 273 | pickle.dump((tk_nextmove, tk_output, LDfeats), f) 274 | 275 | print("wrote scanner to {0}".format(scanner_path)) 276 | 277 | LDfeats = None 278 | 279 | # Assemble the NB model 280 | langs = sorted(lang_index, key=lang_index.get) 281 | lang_index = None 282 | 283 | cm = generate_cm([ (l,p) for d,l,p in items], len(langs)) 284 | paths = list(zip(*items))[2] 285 | 286 | nb_classes = langs 287 | nb_pc = learn_pc(cm) 288 | nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, buckets_dir, args) 289 | 290 | # output the model 291 | output_path = os.path.join(model_dir, 'model') 292 | model = nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output 293 | string = base64.b64encode(bz2.compress(pickle.dumps(model))) 294 | with open(output_path, 'wb') as f: 295 | f.write(string) 296 | 297 | print("wrote model to %s (%d bytes)" % (output_path, len(string))) 298 | 299 | # remove buckets if debug is off. We don't generate buckets if ldfeats is supplied. 300 | if not args.debug and not args.ld_feats: 301 | for b in b_dirs: 302 | shutil.rmtree(b) 303 | if not args.temp: 304 | # Do not remove the buckets dir if temp was supplied as we don't know 305 | # if we created it. 
306 | shutil.rmtree(buckets_dir) 307 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/ 2 | [build-system] 3 | requires = ["setuptools>=61.0"] 4 | build-backend = "setuptools.build_meta" 5 | 6 | [project] 7 | name = "py3langid" 8 | description = "Fork of the language identification tool langid.py, featuring a modernized codebase and faster execution times." 9 | readme = "README.rst" 10 | license = { text = "BSD" } 11 | dynamic = ["version"] 12 | requires-python = ">=3.8" 13 | authors = [ 14 | {name = "Marco Lui"}, 15 | {name = "Adrien Barbaresi", email = "barbaresi@bbaw.de"} 16 | ] 17 | keywords=[ 18 | "language detection", 19 | "language identification", 20 | "langid", 21 | "langid.py" 22 | ] 23 | classifiers = [ 24 | # As from http://pypi.python.org/pypi?%3Aaction=list_classifiers 25 | 'Development Status :: 5 - Production/Stable', 26 | #'Development Status :: 6 - Mature', 27 | "Environment :: Console", 28 | "Intended Audience :: Developers", 29 | "Intended Audience :: Information Technology", 30 | "Intended Audience :: Science/Research", 31 | "License :: OSI Approved :: BSD License", 32 | "Operating System :: MacOS :: MacOS X", 33 | "Operating System :: Microsoft :: Windows", 34 | "Operating System :: POSIX :: Linux", 35 | "Programming Language :: Python", 36 | "Programming Language :: Python :: 3", 37 | "Programming Language :: Python :: 3.8", 38 | "Programming Language :: Python :: 3.9", 39 | "Programming Language :: Python :: 3.10", 40 | "Programming Language :: Python :: 3.11", 41 | "Programming Language :: Python :: 3.12", 42 | "Programming Language :: Python :: 3.13", 43 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 44 | "Topic :: Text Processing :: Linguistic", 45 | ] 46 | dependencies = [ 47 | "numpy >= 2.0.0 ; python_version >= '3.9'", 48 | "numpy >= 1.24.3 ; python_version == '3.8'", 49 | ] 50 | 51 | # https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html 52 | [tool.setuptools] 53 | packages = ["py3langid"] 54 | 55 | # https://packaging.python.org/en/latest/guides/single-sourcing-package-version/ 56 | [tool.setuptools.dynamic] 57 | version = {attr = "py3langid.__version__"} 58 | 59 | [tool.setuptools.package-data] 60 | py3langid = ["data/model.plzma"] 61 | 62 | [project.scripts] 63 | langid = "py3langid.langid:main" 64 | 65 | [project.urls] 66 | "Homepage" = "https://github.com/adbar/py3langid" 67 | "Blog" = "https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html" 68 | "Tracker" = "https://github.com/adbar/py3langid/issues" 69 | 70 | # Development extras 71 | [project.optional-dependencies] 72 | dev = [ 73 | "pytest", 74 | "pytest-cov", 75 | ] 76 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | -------------------------------------------------------------------------------- /tests/test_langid.py: -------------------------------------------------------------------------------- 1 | 2 | import subprocess 3 | import sys 4 | 5 | from io import StringIO 6 | from pathlib import Path 7 | 8 | import py3langid as langid 9 | from py3langid.langid import LanguageIdentifier, MODEL_FILE 10 | 11 | 12 | def test_langid(): 13 | '''Test if the language detection functions work''' 14 | 
# basic classification 15 | text = b'This text is in English.' 16 | assert langid.classify(text)[0] == 'en' 17 | assert langid.rank(text)[0][0] == 'en' 18 | text = 'This text is in English.' 19 | assert langid.classify(text)[0] == 'en' 20 | assert langid.rank(text)[0][0] == 'en' 21 | text = 'Test Unicode sur du texte en français' 22 | assert langid.classify(text)[0] == 'fr' 23 | assert langid.rank(text)[0][0] == 'fr' 24 | # other datatype 25 | assert langid.classify(text)[1] != langid.classify(text, datatype='uint32')[1] 26 | # normalization of probabilities 27 | identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True) 28 | _, normed_prob = identifier.classify(text) 29 | assert 0 <= normed_prob <= 1 30 | # probability not equal to 1 31 | _, normed_prob = identifier.classify('This potrebbe essere a test.') 32 | normed_prob == 0.8942321 33 | # not normalized 34 | identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=False) 35 | _, prob = identifier.classify(text) 36 | assert prob < 0 37 | # subset of target languages 38 | identifier.set_languages(['de', 'en', 'fr']) 39 | assert identifier.classify('这样不好')[0] != 'zh' 40 | 41 | 42 | 43 | def test_redirection(): 44 | '''Test if STDIN redirection works''' 45 | thisdir = Path(__file__).parent 46 | langid_path = str(thisdir.parent / 'py3langid' / 'langid.py') 47 | readme_path = str(thisdir.parent / 'README.rst') 48 | with open(readme_path, 'rb') as f: 49 | readme = f.read() 50 | result = subprocess.check_output(['python3', langid_path, '-n'], input=readme) 51 | assert b'en' in result and b'1.0' in result 52 | 53 | 54 | 55 | def test_cli(): 56 | '''Test console scripts entry point''' 57 | result = subprocess.check_output(['langid', '-n'], input=b'This should be enough text.') 58 | assert b'en' in result and b'1.0' in result 59 | result = subprocess.check_output(['langid', '-n', '-l', 'bg,en,uk'], input=b'This should be enough text.') 60 | assert b'en' in result and b'1.0' in result 61 | -------------------------------------------------------------------------------- /tests/test_server.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from unittest.mock import MagicMock 4 | 5 | import pytest 6 | 7 | from py3langid.langid import application 8 | 9 | 10 | @pytest.fixture 11 | def mock_start_response(): 12 | return MagicMock() 13 | 14 | def test_detect_put(mock_start_response): 15 | environ = { 16 | 'REQUEST_METHOD': 'PUT', 17 | 'CONTENT_LENGTH': 10, 18 | 'wsgi.input': MagicMock(read=lambda x: b'This is a test'), 19 | 'PATH_INFO': '/detect' 20 | } 21 | response = application(environ, mock_start_response) 22 | assert mock_start_response.call_args[0][0] == '200 OK' 23 | assert json.loads(response[0].decode('utf-8'))['responseData']['language'] == 'en' 24 | 25 | def test_detect_get(mock_start_response): 26 | environ = { 27 | 'REQUEST_METHOD': 'GET', 28 | 'QUERY_STRING': 'q=This+is+a+test', 29 | 'PATH_INFO': '/detect' 30 | } 31 | response = application(environ, mock_start_response) 32 | assert mock_start_response.call_args[0][0] == '200 OK' 33 | assert json.loads(response[0].decode('utf-8'))['responseData']['language'] == 'en' 34 | 35 | def test_detect_post(mock_start_response): 36 | environ = { 37 | 'REQUEST_METHOD': 'POST', 38 | 'CONTENT_LENGTH': 10, 39 | 'wsgi.input': MagicMock(read=lambda x: b'q=Hello+World'), 40 | 'PATH_INFO': '/detect' 41 | } 42 | response = application(environ, mock_start_response) 43 | assert mock_start_response.call_args[0][0] == 
'200 OK' 44 | assert json.loads(response[0].decode('utf-8'))['responseData']['language'] == 'en' 45 | 46 | def test_rank_put(mock_start_response): 47 | environ = { 48 | 'REQUEST_METHOD': 'PUT', 49 | 'CONTENT_LENGTH': 10, 50 | 'wsgi.input': MagicMock(read=lambda x: b'Hello World'), 51 | 'PATH_INFO': '/rank' 52 | } 53 | response = application(environ, mock_start_response) 54 | assert mock_start_response.call_args[0][0] == '200 OK' 55 | assert json.loads(response[0].decode('utf-8'))['responseData'] is not None 56 | 57 | def test_rank_get(mock_start_response): 58 | environ = { 59 | 'REQUEST_METHOD': 'GET', 60 | 'QUERY_STRING': 'q=Hello+World', 61 | 'PATH_INFO': '/rank' 62 | } 63 | response = application(environ, mock_start_response) 64 | assert mock_start_response.call_args[0][0] == '200 OK' 65 | assert json.loads(response[0].decode('utf-8'))['responseData'] is not None 66 | 67 | def test_rank_post(mock_start_response): 68 | environ = { 69 | 'REQUEST_METHOD': 'POST', 70 | 'CONTENT_LENGTH': 10, 71 | 'wsgi.input': MagicMock(read=lambda x: b'q=Hello+World'), 72 | 'PATH_INFO': '/rank' 73 | } 74 | response = application(environ, mock_start_response) 75 | assert mock_start_response.call_args[0][0] == '200 OK' 76 | assert json.loads(response[0].decode('utf-8'))['responseData'] is not None 77 | 78 | def test_invalid_method(mock_start_response): 79 | environ = { 80 | 'REQUEST_METHOD': 'DELETE', 81 | 'PATH_INFO': '/detect' 82 | } 83 | response = application(environ, mock_start_response) 84 | assert mock_start_response.call_args[0][0] == '405 Method Not Allowed' 85 | 86 | def test_invalid_path(mock_start_response): 87 | environ = { 88 | 'REQUEST_METHOD': 'GET', 89 | 'PATH_INFO': '/invalid' 90 | } 91 | response = application(environ, mock_start_response) 92 | assert mock_start_response.call_args[0][0] == '404 Not Found' 93 | 94 | def test_empty_path(mock_start_response): 95 | environ = { 96 | 'REQUEST_METHOD': 'GET', 97 | 'PATH_INFO': '' 98 | } 99 | response = application(environ, mock_start_response) 100 | assert mock_start_response.call_args[0][0] == '404 Not Found' 101 | 102 | def test_no_query_string(mock_start_response): 103 | environ = { 104 | 'REQUEST_METHOD': 'GET', 105 | 'PATH_INFO': '/detect' 106 | } 107 | response = application(environ, mock_start_response) 108 | assert mock_start_response.call_args[0][0] == '400 Unknown Status' 109 | assert json.loads(response[0].decode('utf-8'))['responseData'] is None 110 | --------------------------------------------------------------------------------
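
Note on the WSGI tests above: tests/test_server.py drives the application callable from py3langid.langid directly with hand-built environ dictionaries. As an illustration only, the following sketch serves that same callable over HTTP with the standard library and queries the /detect endpoint the tests exercise. It assumes py3langid is installed with its bundled model; the host, port and variable names are arbitrary choices for this example, not part of the repository.

    # Illustrative sketch: serve py3langid's WSGI app locally and query /detect.
    import json
    import threading
    from urllib.parse import quote
    from urllib.request import urlopen
    from wsgiref.simple_server import make_server

    from py3langid.langid import application

    server = make_server('localhost', 9008, application)  # port chosen arbitrarily
    threading.Thread(target=server.serve_forever, daemon=True).start()

    # GET /detect?q=... mirrors test_detect_get and returns JSON with responseData.language
    with urlopen('http://localhost:9008/detect?q=' + quote('This is a test')) as response:
        payload = json.loads(response.read().decode('utf-8'))

    print(payload['responseData']['language'])  # expected: 'en'
    server.shutdown()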