├── .github └── workflows │ ├── cd.yml │ └── ci.yml ├── .gitignore ├── .mailmap ├── LICENSE.txt ├── README.md ├── SECURITY.md ├── docs ├── Makefile ├── conf.py ├── index.rst ├── make.bat ├── requirements.txt └── source │ ├── langcodes.rst │ └── modules.rst ├── example.py ├── langcodes ├── __init__.py ├── build_data.py ├── data │ └── language-subtag-registry.txt ├── data_dicts.py ├── language_distance.py ├── language_lists.py ├── py.typed ├── registry_parser.py ├── tag_parser.py ├── tests │ ├── README.md │ ├── test_alpha3.py │ ├── test_issue_59.py │ ├── test_language.py │ ├── test_language_data.py │ └── test_wikt_languages.py └── util.py ├── pyproject.toml ├── renovate.json ├── tox.ini └── uv.lock /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Delivery 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | environment: 11 | name: pypi 12 | url: https://pypi.org/p/langcodes 13 | permissions: 14 | id-token: write 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: '3.13' 21 | - name: Install dependencies 22 | run: pip install .[build] 23 | - name: Build package 24 | run: python -m build 25 | - name: Publish package 26 | uses: pypa/gh-action-pypi-publish@release/v1 27 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Set up Python ${{ matrix.python-version }} 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | - name: Install 19 | run: pip install .[test,data] 20 | - name: Run tests 21 | run: pytest --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov=language_data --cov-report=xml:junit/coverage-${{ matrix.python-version }}.xml 22 | - name: Upload 23 | uses: actions/upload-artifact@v4 24 | with: 25 | name: junit-${{ matrix.python-version }} 26 | path: junit/* 27 | 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .coverage 2 | *.egg-info 3 | *~ 4 | *.swp 5 | __pycache__ 6 | build 7 | dist 8 | langcodes/data/cldr 9 | langcodes/data/cldr-json 10 | .ipynb_checkpoints 11 | vendor 12 | .tox 13 | docs/_build/ 14 | .hypothesis 15 | poetry.lock 16 | .venv 17 | -------------------------------------------------------------------------------- /.mailmap: -------------------------------------------------------------------------------- 1 | # Robyn has used different names and e-mail addresses in the course of this project. Map them all to her current name and e-mail. 
2 | Robyn Speer 3 | Robyn Speer 4 | Robyn Speer 5 | Robyn Speer 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2021 Robyn Speer (rspeer@arborelia.net) 2 | MIT License 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | this software and associated documentation files (the "Software"), to deal in 6 | the Software without restriction, including without limitation the rights to 7 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 8 | of the Software, and to permit persons to whom the Software is furnished to do 9 | so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Langcodes: a library for language codes 2 | 3 | **langcodes** knows what languages are. It knows the standardized codes that 4 | refer to them, such as `en` for English, `es` for Spanish and `hi` for Hindi. 5 | 6 | These are [IETF language tags][]. You may know them by their old name, ISO 639 7 | language codes. IETF has done some important things for backward compatibility 8 | and supporting language variations that you won't find in the ISO standard. 9 | 10 | [IETF language tags]: https://www.w3.org/International/articles/language-tags/ 11 | 12 | It may sound to you like langcodes solves a pretty boring problem. At one 13 | level, that's right. Sometimes you have a boring problem, and it's great when a 14 | library solves it for you. 15 | 16 | But there's an interesting problem hiding in here. How do you work with 17 | language codes? How do you know when two different codes represent the same 18 | thing? How should your code represent relationships between codes, like the 19 | following? 20 | 21 | * `eng` is equivalent to `en`. 22 | * `fra` and `fre` are both equivalent to `fr`. 23 | * `en-GB` might be written as `en-gb` or `en_GB`. Or as 'en-UK', which is 24 | erroneous, but should be treated as the same. 25 | * `en-CA` is not exactly equivalent to `en-US`, but it's really, really close. 26 | * `en-Latn-US` is equivalent to `en-US`, because written English must be written 27 | in the Latin alphabet to be understood. 28 | * The difference between `ar` and `arb` is the difference between "Arabic" and 29 | "Modern Standard Arabic", a difference that may not be relevant to you. 30 | * You'll find Mandarin Chinese tagged as `cmn` on Wiktionary, but many other 31 | resources would call the same language `zh`. 32 | * Chinese is written in different scripts in different territories. Some 33 | software distinguishes the script. Other software distinguishes the territory. 
34 | The result is that `zh-CN` and `zh-Hans` are used interchangeably, as are 35 | `zh-TW` and `zh-Hant`, even though occasionally you'll need something 36 | different such as `zh-HK` or `zh-Latn-pinyin`. 37 | * The Indonesian (`id`) and Malaysian (`ms` or `zsm`) languages are mutually 38 | intelligible. 39 | * `jp` is not a language code. (The language code for Japanese is `ja`, but 40 | people confuse it with the country code for Japan.) 41 | 42 | One way to know is to read IETF standards and Unicode technical reports. 43 | Another way is to use a library that implements those standards and guidelines 44 | for you, which langcodes does. 45 | 46 | When you're working with these short language codes, you may want to see the 47 | name that the language is called _in_ a language: `fr` is called "French" in 48 | English. That language doesn't have to be English: `fr` is called "français" in 49 | French. A supplement to langcodes, [`language_data`][language-data], provides 50 | this information. 51 | 52 | [language-data]: https://github.com/rspeer/language_data 53 | 54 | langcodes is maintained by Elia Robyn Lake a.k.a. Robyn Speer, and is released 55 | as free software under the MIT license. 56 | 57 | 58 | ## Standards implemented 59 | 60 | Although this is not the only reason to use it, langcodes will make you more 61 | acronym-compliant. 62 | 63 | langcodes implements [BCP 47](http://tools.ietf.org/html/bcp47), the IETF Best 64 | Current Practices on Tags for Identifying Languages. BCP 47 is also known as 65 | RFC 5646. It subsumes ISO 639 and is backward compatible with it, and it also 66 | implements recommendations from the [Unicode CLDR](http://cldr.unicode.org). 67 | 68 | langcodes can also refer to a database of language properties and names, built 69 | from Unicode CLDR and the IANA subtag registry, if you install `language_data`. 70 | 71 | In summary, langcodes takes language codes and does the Right Thing with them, 72 | and if you want to know exactly what the Right Thing is, there are some 73 | documents you can go read. 74 | 75 | 76 | # Documentation 77 | 78 | ## Standardizing language tags 79 | 80 | This function standardizes tags, as strings, in several ways. 81 | 82 | It replaces overlong tags with their shortest version, and also formats them 83 | according to the conventions of BCP 47: 84 | 85 | >>> from langcodes import * 86 | >>> standardize_tag('eng_US') 87 | 'en-US' 88 | 89 | It removes script subtags that are redundant with the language: 90 | 91 | >>> standardize_tag('en-Latn') 92 | 'en' 93 | 94 | It replaces deprecated values with their correct versions, if possible: 95 | 96 | >>> standardize_tag('en-uk') 97 | 'en-GB' 98 | 99 | Sometimes this involves complex substitutions, such as replacing Serbo-Croatian 100 | (`sh`) with Serbian in Latin script (`sr-Latn`), or the entire tag `sgn-US` 101 | with `ase` (American Sign Language). 102 | 103 | >>> standardize_tag('sh-QU') 104 | 'sr-Latn-EU' 105 | 106 | >>> standardize_tag('sgn-US') 107 | 'ase' 108 | 109 | If *macro* is True, it uses macrolanguage codes as a replacement for the most 110 | common standardized language within that macrolanguage. 
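For a sense of how this looks in application code, here is a minimal, non-authoritative sketch that normalizes a batch of tags gathered from different sources, using `macro=True` to fold codes like `arb` into their macrolanguage. The input list is invented for illustration, and unparseable tags are reported using `LanguageTagError`, which is described further below:

    from langcodes import LanguageTagError, standardize_tag

    raw_tags = ['eng_US', 'en-Latn', 'arb-Arab', 'not a tag!']
    normalized = {}
    for raw in raw_tags:
        try:
            # Group the original spellings under their standardized form,
            # e.g. 'eng_US' -> 'en-US', 'en-Latn' -> 'en', 'arb-Arab' -> 'ar'
            normalized.setdefault(standardize_tag(raw, macro=True), []).append(raw)
        except LanguageTagError:
            print(f'Could not parse {raw!r}')

The example that follows shows the `macro=True` behavior on its own: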
111 | 112 | >>> standardize_tag('arb-Arab', macro=True) 113 | 'ar' 114 | 115 | Even when *macro* is False, it shortens tags that contain both the 116 | macrolanguage and the language: 117 | 118 | >>> standardize_tag('zh-cmn-hans-cn') 119 | 'zh-Hans-CN' 120 | 121 | If the tag can't be parsed according to BCP 47, this will raise a 122 | LanguageTagError (a subclass of ValueError): 123 | 124 | >>> standardize_tag('spa-latn-mx') 125 | 'es-MX' 126 | 127 | >>> standardize_tag('spa-mx-latn') 128 | Traceback (most recent call last): 129 | ... 130 | langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string. 131 | 132 | 133 | ## Language objects 134 | 135 | This package defines one class, named Language, which contains the results 136 | of parsing a language tag. Language objects have the following fields, 137 | any of which may be unspecified: 138 | 139 | - *language*: the code for the language itself. 140 | - *script*: the 4-letter code for the writing system being used. 141 | - *territory*: the 2-letter or 3-digit code for the country or similar region 142 | whose usage of the language appears in this text. 143 | - *extlangs*: a list of more specific language codes that follow the language 144 | code. (This is allowed by the language code syntax, but deprecated.) 145 | - *variants*: codes for specific variations of language usage that aren't 146 | covered by the *script* or *territory* codes. 147 | - *extensions*: information that's attached to the language code for use in 148 | some specific system, such as Unicode collation orders. 149 | - *private*: a code starting with `x-` that has no defined meaning. 150 | 151 | The `Language.get` method converts a string to a Language instance, and the 152 | `Language.make` method makes a Language instance from its fields. These values 153 | are cached so that calling `Language.get` or `Language.make` again with the 154 | same values returns the same object, for efficiency. 155 | 156 | By default, it will replace non-standard and overlong tags as it interprets 157 | them. To disable this feature and get the codes that literally appear in the 158 | language tag, use the *normalize=False* option. 159 | 160 | >>> Language.get('en-Latn-US') 161 | Language.make(language='en', script='Latn', territory='US') 162 | 163 | >>> Language.get('sgn-US', normalize=False) 164 | Language.make(language='sgn', territory='US') 165 | 166 | >>> Language.get('und') 167 | Language.make() 168 | 169 | Here are some examples of replacing non-standard tags: 170 | 171 | >>> Language.get('sh-QU') 172 | Language.make(language='sr', script='Latn', territory='EU') 173 | 174 | >>> Language.get('sgn-US') 175 | Language.make(language='ase') 176 | 177 | >>> Language.get('zh-cmn-Hant') 178 | Language.make(language='zh', script='Hant') 179 | 180 | Use the `str()` function on a Language object to convert it back to its 181 | standard string form: 182 | 183 | >>> str(Language.get('sh-QU')) 184 | 'sr-Latn-EU' 185 | 186 | >>> str(Language.make(territory='IN')) 187 | 'und-IN' 188 | 189 | 190 | ### Checking validity 191 | 192 | A language code is _valid_ when every part of it is assigned a meaning by IANA. 193 | That meaning could be "private use". 194 | 195 | In langcodes, we check the language subtag, script, territory, and variants for 196 | validity. We don't check other parts such as extlangs or Unicode extensions. 
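As a quick sketch of how validity checking fits into application code (the helper function and its sample inputs are invented for illustration), you could screen user-supplied tags with the top-level `tag_is_valid` function that the examples below demonstrate:

    from langcodes import tag_is_valid

    def keep_valid_tags(candidates):
        # Keep only tags whose parts are all assigned a meaning by IANA.
        return [tag for tag in candidates if tag_is_valid(tag)]

    # 'jp' and 'en-000' are well-formed but not valid, so they are dropped:
    print(keep_valid_tags(['ja', 'jp', 'en-000', 'x-other']))  # ['ja', 'x-other']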
197 | 198 | For example, `ja` is a valid language code, and `jp` is not: 199 | 200 | >>> Language.get('ja').is_valid() 201 | True 202 | 203 | >>> Language.get('jp').is_valid() 204 | False 205 | 206 | The top-level function `tag_is_valid(tag)` is possibly more convenient to use, 207 | because it can return False even for tags that don't parse: 208 | 209 | >>> tag_is_valid('C') 210 | False 211 | 212 | If one subtag is invalid, the entire code is invalid: 213 | 214 | >>> tag_is_valid('en-000') 215 | False 216 | 217 | `iw` is valid, though it's a deprecated alias for `he`: 218 | 219 | >>> tag_is_valid('iw') 220 | True 221 | 222 | The empty language tag (`und`) is valid: 223 | 224 | >>> tag_is_valid('und') 225 | True 226 | 227 | Private use codes are valid: 228 | 229 | >>> tag_is_valid('x-other') 230 | True 231 | 232 | >>> tag_is_valid('qaa-Qaai-AA-x-what-even-is-this') 233 | True 234 | 235 | Language tags that are very unlikely are still valid: 236 | 237 | >>> tag_is_valid('fr-Cyrl') 238 | True 239 | 240 | Tags with non-ASCII characters are invalid, because they don't parse: 241 | 242 | >>> tag_is_valid('zh-普通话') 243 | False 244 | 245 | 246 | ### Getting alpha3 codes 247 | 248 | Before there was BCP 47, there was ISO 639-2. The ISO tried to make room for the 249 | variety of human languages by assigning every language a 3-letter code, 250 | including the ones that already had 2-letter codes. 251 | 252 | Unfortunately, this just led to more confusion. Some languages ended up with two 253 | different 3-letter codes for legacy reasons, such as French, which is `fra` as a 254 | "terminology" code, and `fre` as a "bibliographic" code. And meanwhile, `fr` was 255 | still a code that you'd be using if you followed ISO 639-1. 256 | 257 | In BCP 47, you should use 2-letter codes whenever they're available, and that's 258 | what langcodes does. Fortunately, all the languages that have two different 259 | 3-letter codes also have a 2-letter code, so if you prefer the 2-letter code, 260 | you don't have to worry about the distinction. 261 | 262 | But some applications want the 3-letter code in particular, so langcodes 263 | provides a method for getting those, `Language.to_alpha3()`. It returns the 264 | 'terminology' code by default, and passing `variant='B'` returns the 265 | bibliographic code. 266 | 267 | This method always returns a 3-letter string; if there is no known alpha3 code for the language, it raises a `LookupError`. 268 | 269 | >>> Language.get('fr').to_alpha3() 270 | 'fra' 271 | >>> Language.get('fr-CA').to_alpha3() 272 | 'fra' 273 | >>> Language.get('fr-CA').to_alpha3(variant='B') 274 | 'fre' 275 | >>> Language.get('de').to_alpha3() 276 | 'deu' 277 | >>> Language.get('no').to_alpha3() 278 | 'nor' 279 | >>> Language.get('un').to_alpha3() 280 | Traceback (most recent call last): 281 | ... 282 | LookupError: 'un' is not a known language code, and has no alpha3 code. 283 | 284 | For many languages, the terminology and bibliographic alpha3 codes are the same. 285 | 286 | >>> Language.get('en').to_alpha3(variant='T') 287 | 'eng' 288 | >>> Language.get('en').to_alpha3(variant='B') 289 | 'eng' 290 | 291 | When you use any of these "overlong" alpha3 codes in langcodes, they normalize 292 | back to the alpha2 code: 293 | 294 | >>> Language.get('zho') 295 | Language.make(language='zh') 296 | 297 | 298 | ## Working with language names 299 | 300 | The methods in this section require an optional package called `language_data`.
301 | You can install it with `pip install language_data`, or request the optional 302 | "data" feature of langcodes with `pip install langcodes[data]`. 303 | 304 | The dependency that you put in setup.py should be `langcodes[data]`. 305 | 306 | ### Describing Language objects in natural language 307 | 308 | It's often helpful to be able to describe a language code in a way that a user 309 | (or you) can understand, instead of in inscrutable short codes. The 310 | `display_name` method lets you describe a Language object *in a language*. 311 | 312 | The `.display_name(language, min_score)` method will look up the name of the 313 | language. The names come from the IANA language tag registry, which is only in 314 | English, plus CLDR, which names languages in many commonly-used languages. 315 | 316 | The default language for naming things is English: 317 | 318 | >>> Language.make(language='fr').display_name() 319 | 'French' 320 | 321 | >>> Language.make().display_name() 322 | 'Unknown language' 323 | 324 | >>> Language.get('zh-Hans').display_name() 325 | 'Chinese (Simplified)' 326 | 327 | >>> Language.get('en-US').display_name() 328 | 'English (United States)' 329 | 330 | But you can ask for language names in numerous other languages: 331 | 332 | >>> Language.get('fr').display_name('fr') 333 | 'français' 334 | 335 | >>> Language.get('fr').display_name('es') 336 | 'francés' 337 | 338 | >>> Language.make().display_name('es') 339 | 'lengua desconocida' 340 | 341 | >>> Language.get('zh-Hans').display_name('de') 342 | 'Chinesisch (Vereinfacht)' 343 | 344 | >>> Language.get('en-US').display_name('zh-Hans') 345 | '英语(美国)' 346 | 347 | Why does everyone get Slovak and Slovenian confused? Let's ask them. 348 | 349 | >>> Language.get('sl').display_name('sl') 350 | 'slovenščina' 351 | >>> Language.get('sk').display_name('sk') 352 | 'slovenčina' 353 | >>> Language.get('sl').display_name('sk') 354 | 'slovinčina' 355 | >>> Language.get('sk').display_name('sl') 356 | 'slovaščina' 357 | 358 | If the language has a script or territory code attached to it, these will be 359 | described in parentheses: 360 | 361 | >>> Language.get('en-US').display_name() 362 | 'English (United States)' 363 | 364 | Sometimes these can be the result of tag normalization, such as in this case 365 | where the legacy tag 'sh' becomes 'sr-Latn': 366 | 367 | >>> Language.get('sh').display_name() 368 | 'Serbian (Latin)' 369 | 370 | >>> Language.get('sh', normalize=False).display_name() 371 | 'Serbo-Croatian' 372 | 373 | Naming a language in itself is sometimes a useful thing to do, so the 374 | `.autonym()` method makes this easy, providing the display name of a language 375 | in the language itself: 376 | 377 | >>> Language.get('fr').autonym() 378 | 'français' 379 | >>> Language.get('es').autonym() 380 | 'español' 381 | >>> Language.get('ja').autonym() 382 | '日本語' 383 | >>> Language.get('en-AU').autonym() 384 | 'English (Australia)' 385 | >>> Language.get('sr-Latn').autonym() 386 | 'srpski (latinica)' 387 | >>> Language.get('sr-Cyrl').autonym() 388 | 'српски (ћирилица)' 389 | 390 | The names come from the Unicode CLDR data files, and in English they can 391 | also come from the IANA language subtag registry. Together, they can give 392 | you language names in the 196 languages that CLDR supports. 
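Putting these methods together, here is a minimal sketch of the kind of language picker you could build with `display_name` and `autonym`. It requires `language_data` to be installed, and the list of supported tags is invented for illustration:

    from langcodes import Language

    SUPPORTED_TAGS = ['en-US', 'fr', 'es', 'ja']  # whatever your application supports

    for tag in SUPPORTED_TAGS:
        lang = Language.get(tag)
        # Prints lines such as "fr: French / français"
        print(f'{tag}: {lang.display_name()} / {lang.autonym()}')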
393 | 394 | 395 | ### Describing components of language codes 396 | 397 | You can get the parts of the name separately with the methods `.language_name()`, 398 | `.script_name()`, and `.territory_name()`, or get a dictionary of all the parts 399 | that are present using the `.describe()` method. These methods also accept a 400 | language code for what language they should be described in. 401 | 402 | >>> shaw = Language.get('en-Shaw-GB') 403 | >>> shaw.describe('en') 404 | {'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'} 405 | 406 | >>> shaw.describe('es') 407 | {'language': 'inglés', 'script': 'shaviano', 'territory': 'Reino Unido'} 408 | 409 | 410 | ### Recognizing language names in natural language 411 | 412 | As the reverse of the above operations, you may want to look up a language by 413 | its name, converting a natural language name such as "French" to a code such as 414 | 'fr'. 415 | 416 | The name can be in any language that CLDR supports (see "Ambiguity" below). 417 | 418 | >>> import langcodes 419 | >>> langcodes.find('french') 420 | Language.make(language='fr') 421 | 422 | >>> langcodes.find('francés') 423 | Language.make(language='fr') 424 | 425 | However, this method currently ignores the parenthetical expressions that come from 426 | `.display_name()`: 427 | 428 | >>> langcodes.find('English (Canada)') 429 | Language.make(language='en') 430 | 431 | There is still room to improve the way that language names are matched, because 432 | some languages are not consistently named the same way. The method currently 433 | works with hundreds of language names that are used on Wiktionary. 434 | 435 | #### Ambiguity 436 | 437 | For the sake of usability, `langcodes.find()` doesn't require you to specify what 438 | language you're looking up a language in by name. This could potentially lead to 439 | a conflict: what if name "X" is language A's name for language B, and language C's 440 | name for language D? 441 | 442 | We can collect the language codes from CLDR and see how many times this 443 | happens. In the majority of cases like that, B and D are codes whose names are 444 | also overlapping in the _same_ language and can be resolved by some general 445 | principle. 446 | 447 | For example, no matter whether you decide "Tagalog" refers to the language code 448 | `tl` or the largely overlapping code `fil`, that distinction doesn't depend on 449 | the language you're saying "Tagalog" in. We can just return `tl` consistently. 450 | 451 | >>> langcodes.find('tagalog') 452 | Language.make(language='tl') 453 | 454 | In the few cases of actual interlingual ambiguity, langcodes won't match a result. 455 | You can pass in a `language=` parameter to say what language the name is in. 456 | 457 | For example, there are two distinct languages called "Tonga" in various languages. 458 | They are `to`, the language of Tonga which is called "Tongan" in English; and `tog`, 459 | a language of Malawi that can be called "Nyasa Tonga" in English. 460 | 461 | >>> langcodes.find('tongan') 462 | Language.make(language='to') 463 | 464 | >>> langcodes.find('nyasa tonga') 465 | Language.make(language='tog') 466 | 467 | >>> langcodes.find('tonga') 468 | Traceback (most recent call last): 469 | ... 
470 | LookupError: Can't find any language named 'tonga' 471 | 472 | >>> langcodes.find('tonga', language='id') 473 | Language.make(language='to') 474 | 475 | >>> langcodes.find('tonga', language='ca') 476 | Language.make(language='tog') 477 | 478 | Other ambiguous names written in Latin letters are "Kiga", "Mbundu", "Roman", and "Ruanda". 479 | 480 | 481 | ## Demographic language data 482 | 483 | The `Language.speaking_population()` and `Language.writing_population()` 484 | methods get Unicode's estimates of how many people in the world use a 485 | language. 486 | 487 | As with the language name data, this requires the optional `language_data` 488 | package to be installed. 489 | 490 | `.speaking_population()` estimates how many people speak a language. It can 491 | be limited to a particular territory with a territory code (such as a country 492 | code). 493 | 494 | >>> Language.get('es').speaking_population() 495 | 493528077 496 | 497 | >>> Language.get('pt').speaking_population() 498 | 237496885 499 | 500 | >>> Language.get('es-BR').speaking_population() 501 | 76218 502 | 503 | >>> Language.get('pt-BR').speaking_population() 504 | 192661560 505 | 506 | >>> Language.get('vo').speaking_population() 507 | 0 508 | 509 | Script codes will be ignored, because the script is not involved in speaking: 510 | 511 | >>> Language.get('es-Hant').speaking_population() ==\ 512 | ... Language.get('es').speaking_population() 513 | True 514 | 515 | `.writing_population()` estimates how many people write a language. 516 | 517 | >>> all = Language.get('zh').writing_population() 518 | >>> all 519 | 1240841517 520 | 521 | >>> traditional = Language.get('zh-Hant').writing_population() 522 | >>> traditional 523 | 36863340 524 | 525 | >>> simplified = Language.get('zh-Hans').writing_population() 526 | >>> all == traditional + simplified 527 | True 528 | 529 | The estimates for "writing population" are often overestimates, as described 530 | in the [CLDR documentation on territory data][overestimates]. In most cases, 531 | they are derived from published data about literacy rates in the places where 532 | those languages are spoken. This doesn't take into account that many literate 533 | people around the world speak a language that isn't typically written, and 534 | write in a _different_ language. 535 | 536 | [overestimates]: https://unicode-org.github.io/cldr-staging/charts/39/supplemental/territory_language_information.html 537 | 538 | Like `.speaking_population()`, this can be limited to a particular territory: 539 | 540 | >>> Language.get('zh-Hant-HK').writing_population() 541 | 6439733 542 | >>> Language.get('zh-Hans-HK').writing_population() 543 | 338933 544 | 545 | 546 | ## Comparing and matching languages 547 | 548 | The `tag_distance` function returns a number from 0 to 134 indicating the 549 | distance between the language the user desires and a supported language. 550 | 551 | The distance data comes from CLDR v38.1 and involves a lot of judgment calls 552 | made by the Unicode consortium. 553 | 554 | 555 | ### Distance values 556 | 557 | This table summarizes the language distance values: 558 | 559 | | Value | Meaning | Example 560 | | ----: | :------ | :------ 561 | | 0 | These codes represent the same language, possibly after filling in values and normalizing. | Norwegian Bokmål → Norwegian 562 | | 1-3 | These codes indicate a minor regional difference. | Australian English → British English 563 | | 4-9 | These codes indicate a significant but unproblematic regional difference. 
| American English → British English 564 | | 10-24 | A gray area that depends on your use case. There may be problems with understanding or usability. | Afrikaans → Dutch, Wu Chinese → Mandarin Chinese 565 | | 25-50 | These languages aren't similar, but there are demographic reasons to expect some intelligibility. | Tamil → English, Marathi → Hindi 566 | | 51-79 | There are large barriers to understanding. | Japanese → Japanese in Hepburn romanization 567 | | 80-99 | These are different languages written in the same script. | English → French, Arabic → Urdu 568 | | 100+ | These languages have nothing particularly in common. | English → Japanese, English → Tamil 569 | 570 | See the docstring of `tag_distance` for more explanation and examples. 571 | 572 | 573 | ### Finding the best matching language 574 | 575 | Suppose you have software that supports any of the `supported_languages`. The 576 | user wants to use `desired_language`. 577 | 578 | The function `closest_supported_match(desired_language, supported_languages)` 579 | lets you choose the right language, even if there isn't an exact match. 580 | It returns the language tag of the best-supported language, even if there 581 | isn't an exact match. 582 | 583 | The `max_distance` parameter lets you set a cutoff on what counts as language 584 | support. It has a default of 25, a value that is probably okay for simple 585 | cases of i18n, but you might want to set it lower to require more precision. 586 | 587 | >>> closest_supported_match('fr', ['de', 'en', 'fr']) 588 | 'fr' 589 | 590 | >>> closest_supported_match('pt', ['pt-BR', 'pt-PT']) 591 | 'pt-BR' 592 | 593 | >>> closest_supported_match('en-AU', ['en-GB', 'en-US']) 594 | 'en-GB' 595 | 596 | >>> closest_supported_match('af', ['en', 'nl', 'zu']) 597 | 'nl' 598 | 599 | >>> closest_supported_match('und', ['en', 'und']) 600 | 'und' 601 | 602 | >>> print(closest_supported_match('af', ['en', 'nl', 'zu'], max_distance=10)) 603 | None 604 | 605 | A similar function is `closest_match(desired_language, supported_language)`, 606 | which returns both the best matching language tag and the distance. If there is 607 | no match, it returns ('und', 1000). 608 | 609 | >>> closest_match('fr', ['de', 'en', 'fr']) 610 | ('fr', 0) 611 | 612 | >>> closest_match('sh', ['hr', 'bs', 'sr-Latn', 'sr-Cyrl']) 613 | ('sr-Latn', 0) 614 | 615 | >>> closest_match('id', ['zsm', 'mhp']) 616 | ('zsm', 14) 617 | 618 | >>> closest_match('ja', ['ja-Latn-hepburn', 'en']) 619 | ('und', 1000) 620 | 621 | >>> closest_match('ja', ['ja-Latn-hepburn', 'en'], max_distance=60) 622 | ('ja-Latn-hepburn', 50) 623 | 624 | ## Further API documentation 625 | 626 | There are many more methods for manipulating and comparing language codes, 627 | and you will find them documented thoroughly in [the code itself][code]. 628 | 629 | The interesting functions all live in this one file, with extensive docstrings 630 | and annotations. Making a separate Sphinx page out of the docstrings would be 631 | the traditional thing to do, but here it just seems redundant. You can go read 632 | the docstrings in context, in their native habitat, and they'll always be up to 633 | date. 634 | 635 | [Code with documentation][code] 636 | 637 | [code]: https://github.com/rspeer/langcodes/blob/master/langcodes/__init__.py 638 | 639 | # Changelog 640 | 641 | ## Version 3.3 (November 2021) 642 | 643 | - Updated to CLDR v40. 644 | 645 | - Updated the IANA subtag registry to version 2021-08-06. 
646 | 647 | - Bug fix: recognize script codes that appear in the IANA registry even if 648 | they're missing from CLDR for some reason. 'cu-Cyrs' is valid, for example. 649 | 650 | - Switched the build system from `setuptools` to `poetry`. 651 | 652 | To install the package in editable mode before PEP 660 is better supported, use 653 | `poetry install` instead of `pip install -e .`. 654 | 655 | ## Version 3.2 (October 2021) 656 | 657 | - Supports Python 3.6 through 3.10. 658 | 659 | - Added the top-level function `tag_is_valid(tag)`, for determining if a string 660 | is a valid language tag without having to parse it first. 661 | 662 | - Added the top-level function `closest_supported_match(desired, supported)`, 663 | which is similar to `closest_match` but with a simpler return value. It 664 | returns the language tag of the closest match, or None if no match is close 665 | enough. 666 | 667 | - Bug fix: a lot of well-formed but invalid language codes appeared to be 668 | valid, such as 'aaj' or 'en-Latnx', because the regex could match a prefix of 669 | a subtag. The validity regex is now required to match completely. 670 | 671 | - Bug fixes that address some edge cases of validity: 672 | 673 | - A language tag that is entirely private use, like 'x-private', is valid 674 | - A language tag that uses the same extension twice, like 'en-a-bbb-a-ccc', 675 | is invalid 676 | - A language tag that uses the same variant twice, like 'de-1901-1901', is 677 | invalid 678 | - A language tag with two extlangs, like 'sgn-ase-bfi', is invalid 679 | 680 | - Updated dependencies so they are compatible with Python 3.10, including 681 | switching back from `marisa-trie-m` to `marisa-trie` in `language_data`. 682 | 683 | - In bugfix release 3.2.1, corrected cases where the parser accepted 684 | ill-formed language tags: 685 | 686 | - All subtags must be made of between 1 and 8 alphanumeric ASCII characters 687 | - Tags with two extension 'singletons' in a row (`en-a-b-ccc`) should be 688 | rejected 689 | 690 | ## Version 3.1 (February 2021) 691 | 692 | - Added the `Language.to_alpha3()` method, for getting a three-letter code for a 693 | language according to ISO 639-2. 694 | 695 | - Updated the type annotations from obiwan-style to mypy-style. 696 | 697 | 698 | ## Version 3.0 (February 2021) 699 | 700 | - Moved bulky data, particularly language names, into a separate 701 | `language_data` package. In situations where the data isn't needed, 702 | `langcodes` becomes a smaller, pure-Python package with no dependencies. 703 | 704 | - Language codes where the language segment is more than 4 letters no longer 705 | parse: Language.get('nonsense') now returns an error. 706 | 707 | (This is technically stricter than the parse rules of BCP 47, but there are 708 | no valid language codes of this form and there should never be any. An 709 | attempt to parse a language code with 5-8 letters is most likely a mistake or 710 | an attempt to make up a code.) 711 | 712 | - Added a method for checking the validity of a language code. 713 | 714 | - Added methods for estimating language population. 715 | 716 | - Updated to CLDR 38.1, which includes differences in language matching. 717 | 718 | - Tested on Python 3.6 through 3.9; no longer tested on Python 3.5. 719 | 720 | 721 | ## Version 2.2 (February 2021) 722 | 723 | - Replaced `marisa-trie` dependency with `marisa-trie-m`, to achieve 724 | compatibility with Python 3.9. 
725 | 726 | 727 | ## Version 2.1 (June 2020) 728 | 729 | - Added the `display_name` method to be a more intuitive way to get a string 730 | describing a language code, and made the `autonym` method use it instead of 731 | `language_name`. 732 | 733 | - Updated to CLDR v37. 734 | 735 | - Previously, some attempts to get the name of a language would return its 736 | language code instead, perhaps because the name was being requested in a 737 | language for which CLDR doesn't have name data. This is unfortunate because 738 | names and codes should not be interchangeable. 739 | 740 | Now we fall back on English names instead, which exists for all IANA codes. 741 | If the code is unknown, we return a string such as "Unknown language [xx]". 742 | 743 | 744 | ## Version 2.0 (April 2020) 745 | 746 | Version 2.0 involves some significant changes that may break compatibility with 1.4, 747 | in addition to updating to version 36.1 of the Unicode CLDR data and the April 2020 748 | version of the IANA subtag registry. 749 | 750 | This version requires Python 3.5 or later. 751 | 752 | ### Match scores replaced with distances 753 | 754 | Originally, the goodness of a match between two different language codes was defined 755 | in terms of a "match score" with a maximum of 100. Around 2016, Unicode started 756 | replacing this with a different measure, the "match distance", which was defined 757 | much more clearly, but we had to keep using the "match score". 758 | 759 | As of langcodes version 2.0, the "score" functions (such as 760 | `Language.match_score`, `tag_match_score`, and `best_match`) are deprecated. 761 | They'll keep using the deprecated language match tables from around CLDR 27. 762 | 763 | For a better measure of the closeness of two language codes, use `Language.distance`, 764 | `tag_distance`, and `closest_match`. 765 | 766 | ### 'region' renamed to 'territory' 767 | 768 | We were always out of step with CLDR here. Following the example of the IANA 769 | database, we referred to things like the 'US' in 'en-US' as a "region code", 770 | but the Unicode standards consistently call it a "territory code". 771 | 772 | In langcodes 2.0, parameters, dictionary keys, and attributes named `region` 773 | have been renamed to `territory`. We try to support a few common cases with 774 | deprecation warnings, such as looking up the `region` property of a Language 775 | object. 776 | 777 | A nice benefit of this is that when a dictionary is displayed with 'language', 778 | 'script', and 'territory' keys in alphabetical order, they are in the same 779 | order as they are in a language code. 780 | 781 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | The only supported version is the latest minor version released. As soon as 6 | a new minor version is released, support for the older one drops. 7 | 8 | ## Reporting a Vulnerability 9 | 10 | In order to report a security vulnerability, please contact me at 11 | [mail@georg-krause.net](mailto:mail@georg-krause.net). 12 | [Use GPG if possible](https://www.georg-krause.net/statics/public.key). 13 | 14 | If the vulnerability is confirmed, I will work on a fix and a new version as 15 | soon as possible. Since maintaining this package isn't my day job, this could 16 | take a few days. 
17 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'langcodes' 21 | copyright = '2021, Robyn Speer at Luminoso' 22 | author = 'Robyn Speer at Luminoso' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | 'sphinx.ext.autodoc', 32 | # https://docs.readthedocs.io/en/stable/intro/getting-started-with-sphinx.html#using-markdown-with-sphinx 33 | 'myst_parser', 34 | ] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # List of patterns, relative to source directory, that match files and 40 | # directories to ignore when looking for source files. 41 | # This pattern also affects html_static_path and html_extra_path. 42 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 43 | 44 | 45 | # -- Options for HTML output ------------------------------------------------- 46 | 47 | # The theme to use for HTML and HTML Help pages. See the documentation for 48 | # a list of builtin themes. 49 | # 50 | html_theme = 'alabaster' 51 | 52 | # Add any paths that contain custom static files (such as style sheets) here, 53 | # relative to this directory. They are copied after the builtin static files, 54 | # so a file named "default.css" will overwrite the builtin "default.css". 
55 | html_static_path = ['_static'] 56 | 57 | 58 | # -- Extension configuration ------------------------------------------------- 59 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. langcodes documentation master file, created by 2 | sphinx-quickstart on Fri Apr 16 21:32:52 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to langcodes's documentation! 7 | ===================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | source/modules 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | myst_parser 2 | 3 | -------------------------------------------------------------------------------- /docs/source/langcodes.rst: -------------------------------------------------------------------------------- 1 | langcodes package 2 | ================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | langcodes.build\_data module 8 | ---------------------------- 9 | 10 | .. automodule:: langcodes.build_data 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | langcodes.data\_dicts module 16 | ---------------------------- 17 | 18 | .. automodule:: langcodes.data_dicts 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | langcodes.language\_distance module 24 | ----------------------------------- 25 | 26 | .. automodule:: langcodes.language_distance 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | langcodes.language\_lists module 32 | -------------------------------- 33 | 34 | .. automodule:: langcodes.language_lists 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | langcodes.registry\_parser module 40 | --------------------------------- 41 | 42 | .. automodule:: langcodes.registry_parser 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | langcodes.tag\_parser module 48 | ---------------------------- 49 | 50 | .. 
automodule:: langcodes.tag_parser 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | langcodes.util module 56 | --------------------- 57 | 58 | .. automodule:: langcodes.util 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | Module contents 64 | --------------- 65 | 66 | .. automodule:: langcodes 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | langcodes 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | langcodes 8 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | import string 2 | import langcodes 3 | 4 | # Iterate through all 2- and 3-letter language codes, and for all languages 5 | # that have enough data to represent their own name, show: 6 | # 7 | # - The original code 8 | # - The code after normalization 9 | # - The language's name in English 10 | # - The language's name in that language (its autonym) 11 | 12 | en = langcodes.get('en') 13 | 14 | for let1 in string.ascii_lowercase: 15 | for let2 in string.ascii_lowercase: 16 | for let3 in [''] + list(string.ascii_lowercase): 17 | code = let1 + let2 + let3 18 | lcode = langcodes.get(code) 19 | if lcode.has_name_data(): 20 | autonym = lcode.autonym() 21 | name = lcode.language_name() 22 | print('%-3s %-3s %-30s %s' % (code, lcode.language, name, autonym)) 23 | -------------------------------------------------------------------------------- /langcodes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | langcodes knows what languages are. It knows the standardized codes that 3 | refer to them, such as `en` for English, `es` for Spanish and `hi` for Hindi. 4 | Often, it knows what these languages are called *in* a language, and that 5 | language doesn't have to be English. 6 | 7 | See README.md for the main documentation, or read it on GitHub at 8 | https://github.com/LuminosoInsight/langcodes/ . For more specific documentation 9 | on the functions in langcodes, scroll down and read the docstrings. 10 | 11 | Some of these functions, particularly those that work with the names of 12 | languages, require the `language_data` module to be installed. 13 | """ 14 | from operator import itemgetter 15 | from typing import Any, List, Tuple, Dict, Sequence, Iterable, Optional, Mapping, Union 16 | import warnings 17 | import sys 18 | 19 | from langcodes.tag_parser import LanguageTagError, parse_tag, normalize_characters 20 | from langcodes.language_distance import tuple_distance_cached 21 | from langcodes.data_dicts import ( 22 | ALL_SCRIPTS, 23 | DEFAULT_SCRIPTS, 24 | LANGUAGE_REPLACEMENTS, 25 | LANGUAGE_ALPHA3, 26 | LANGUAGE_ALPHA3_BIBLIOGRAPHIC, 27 | TERRITORY_REPLACEMENTS, 28 | NORMALIZED_MACROLANGUAGES, 29 | LIKELY_SUBTAGS, 30 | VALIDITY, 31 | ) 32 | 33 | # When we're getting natural language information *about* languages, it's in 34 | # English if you don't specify the language. 35 | DEFAULT_LANGUAGE = 'en' 36 | 37 | 38 | LANGUAGE_NAME_IMPORT_MESSAGE = """ 39 | Looking up language names now requires the `language_data` package. 
40 | 41 | Install it with: 42 | pip install language_data 43 | Or as an optional feature of langcodes: 44 | pip install langcodes[data] 45 | """ 46 | 47 | 48 | class Language: 49 | """ 50 | The Language class defines the results of parsing a language tag. 51 | Language objects have the following attributes, any of which may be 52 | unspecified (in which case their value is None): 53 | 54 | - *language*: the code for the language itself. 55 | - *script*: the 4-letter code for the writing system being used. 56 | - *territory*: the 2-letter or 3-digit code for the country or similar territory 57 | of the world whose usage of the language appears in this text. 58 | - *extlangs*: a list of more specific language codes that follow the language 59 | code. (This is allowed by the language code syntax, but deprecated.) 60 | - *variants*: codes for specific variations of language usage that aren't 61 | covered by the *script* or *territory* codes. 62 | - *extensions*: information that's attached to the language code for use in 63 | some specific system, such as Unicode collation orders. 64 | - *private*: a code starting with `x-` that has no defined meaning. 65 | 66 | The `Language.get` method converts a string to a Language instance. 67 | It's also available at the top level of this module as the `get` function. 68 | """ 69 | 70 | ATTRIBUTES = [ 71 | 'language', 72 | 'extlangs', 73 | 'script', 74 | 'territory', 75 | 'variants', 76 | 'extensions', 77 | 'private', 78 | ] 79 | 80 | # When looking up "likely subtags" data, we try looking up the data for 81 | # increasingly less specific versions of the language code. 82 | BROADER_KEYSETS = [ 83 | {'language', 'script', 'territory'}, 84 | {'language', 'territory'}, 85 | {'language', 'script'}, 86 | {'language'}, 87 | {'script'}, 88 | {}, 89 | ] 90 | 91 | MATCHABLE_KEYSETS = [ 92 | {'language', 'script', 'territory'}, 93 | {'language', 'script'}, 94 | {'language'}, 95 | ] 96 | 97 | # Values cached at the class level 98 | _INSTANCES: Dict[tuple, 'Language'] = {} 99 | _PARSE_CACHE: Dict[Tuple[str, bool], 'Language'] = {} 100 | 101 | def __init__( 102 | self, 103 | language: Optional[str] = None, 104 | extlangs: Optional[Sequence[str]] = None, 105 | script: Optional[str] = None, 106 | territory: Optional[str] = None, 107 | variants: Optional[Sequence[str]] = None, 108 | extensions: Optional[Sequence[str]] = None, 109 | private: Optional[str] = None, 110 | ): 111 | """ 112 | The constructor for Language objects. 113 | 114 | It's inefficient to call this directly, because it can't return 115 | an existing instance. Instead, call Language.make(), which 116 | has the same signature. 
117 | """ 118 | self.language = language 119 | self.extlangs = extlangs 120 | self.script = script 121 | self.territory = territory 122 | self.variants = variants 123 | self.extensions = extensions 124 | self.private = private 125 | 126 | # Cached values 127 | self._simplified: 'Language' = None 128 | self._searchable: 'Language' = None 129 | self._broader: List[str] = None 130 | self._assumed: 'Language' = None 131 | self._filled: 'Language' = None 132 | self._macrolanguage: Optional['Language'] = None 133 | self._str_tag: str = None 134 | self._dict: dict = None 135 | self._disp_separator: str = None 136 | self._disp_pattern: str = None 137 | 138 | # Make sure the str_tag value is cached 139 | self.to_tag() 140 | 141 | @classmethod 142 | def make( 143 | cls, 144 | language: Optional[str] = None, 145 | extlangs: Optional[Sequence[str]] = None, 146 | script: Optional[str] = None, 147 | territory: Optional[str] = None, 148 | variants: Optional[Sequence[str]] = None, 149 | extensions: Optional[Sequence[str]] = None, 150 | private: Optional[str] = None, 151 | ) -> 'Language': 152 | """ 153 | Create a Language object by giving any subset of its attributes. 154 | 155 | If this value has been created before, return the existing value. 156 | """ 157 | values = ( 158 | language, 159 | tuple(extlangs or ()), 160 | script, 161 | territory, 162 | tuple(variants or ()), 163 | tuple(extensions or ()), 164 | private, 165 | ) 166 | if values in cls._INSTANCES: 167 | return cls._INSTANCES[values] 168 | 169 | instance = cls( 170 | language=language, 171 | extlangs=extlangs, 172 | script=script, 173 | territory=territory, 174 | variants=variants, 175 | extensions=extensions, 176 | private=private, 177 | ) 178 | cls._INSTANCES[values] = instance 179 | return instance 180 | 181 | @staticmethod 182 | def get(tag: Union[str, 'Language'], normalize=True) -> 'Language': 183 | """ 184 | Create a Language object from a language tag string. 185 | 186 | If normalize=True, non-standard or overlong tags will be replaced as 187 | they're interpreted. This is recommended. 188 | 189 | Here are several examples of language codes, which are also test cases. 190 | Most language codes are straightforward, but these examples will get 191 | pretty obscure toward the end. 192 | 193 | >>> Language.get('en-US') 194 | Language.make(language='en', territory='US') 195 | 196 | >>> Language.get('zh-Hant') 197 | Language.make(language='zh', script='Hant') 198 | 199 | >>> Language.get('und') 200 | Language.make() 201 | 202 | This function is idempotent, in case you already have a Language object: 203 | 204 | >>> Language.get(Language.get('en-us')) 205 | Language.make(language='en', territory='US') 206 | 207 | The non-code 'root' is sometimes used to represent the lack of any 208 | language information, similar to 'und'. 209 | 210 | >>> Language.get('root') 211 | Language.make() 212 | 213 | By default, getting a Language object will automatically convert 214 | deprecated tags: 215 | 216 | >>> Language.get('iw') 217 | Language.make(language='he') 218 | 219 | >>> Language.get('in') 220 | Language.make(language='id') 221 | 222 | One type of deprecated tag that should be replaced is for sign 223 | languages, which used to all be coded as regional variants of a 224 | fictitious global sign language called 'sgn'. Of course, there is no 225 | global sign language, so sign languages now have their own language 226 | codes. 
227 | 228 | >>> Language.get('sgn-US') 229 | Language.make(language='ase') 230 | 231 | >>> Language.get('sgn-US', normalize=False) 232 | Language.make(language='sgn', territory='US') 233 | 234 | 'en-gb-oed' is a tag that's grandfathered into the standard because it 235 | has been used to mean "spell-check this with Oxford English Dictionary 236 | spelling", but that tag has the wrong shape. We interpret this as the 237 | new standardized tag 'en-gb-oxendict', unless asked not to normalize. 238 | 239 | >>> Language.get('en-gb-oed') 240 | Language.make(language='en', territory='GB', variants=['oxendict']) 241 | 242 | >>> Language.get('en-gb-oed', normalize=False) 243 | Language.make(language='en-gb-oed') 244 | 245 | 'zh-min-nan' is another oddly-formed tag, used to represent the 246 | Southern Min language, which includes Taiwanese as a regional form. It 247 | now has its own language code. 248 | 249 | >>> Language.get('zh-min-nan') 250 | Language.make(language='nan') 251 | 252 | The vague tag 'zh-min' is now also interpreted as 'nan', with a private 253 | extension indicating that it had a different form: 254 | 255 | >>> Language.get('zh-min') 256 | Language.make(language='nan', private='x-zh-min') 257 | 258 | Occasionally Wiktionary will use 'extlang' tags in strange ways, such 259 | as using the tag 'und-ibe' for some unspecified Iberian language. 260 | 261 | >>> Language.get('und-ibe') 262 | Language.make(extlangs=['ibe']) 263 | 264 | Here's an example of replacing multiple deprecated tags. 265 | 266 | The language tag 'sh' (Serbo-Croatian) ended up being politically 267 | problematic, and different standards took different steps to address 268 | this. The IANA made it into a macrolanguage that contains 'sr', 'hr', 269 | and 'bs'. Unicode further decided that it's a legacy tag that should 270 | be interpreted as 'sr-Latn', which the language matching rules say 271 | is mutually intelligible with all those languages. 272 | 273 | We complicate the example by adding on the territory tag 'QU', an old 274 | provisional tag for the European Union, which is now standardized as 275 | 'EU'. 276 | 277 | >>> Language.get('sh-QU') 278 | Language.make(language='sr', script='Latn', territory='EU') 279 | """ 280 | if isinstance(tag, Language): 281 | if not normalize: 282 | # shortcut: we have the tag already 283 | return tag 284 | 285 | # We might need to normalize this tag. Convert it back into a 286 | # string tag, to cover all the edge cases of normalization in a 287 | # way that we've already solved. 288 | tag = tag.to_tag() 289 | 290 | if (tag, normalize) in Language._PARSE_CACHE: 291 | return Language._PARSE_CACHE[tag, normalize] 292 | 293 | data: Dict[str, Any] = {} 294 | 295 | # If the complete tag appears as something to normalize, do the 296 | # normalization right away. Smash case and convert underscores to 297 | # hyphens when checking, because the case normalization that comes from 298 | # parse_tag() hasn't been applied yet. 
299 | 300 | tag_lower = normalize_characters(tag) 301 | if normalize and tag_lower in LANGUAGE_REPLACEMENTS: 302 | tag = LANGUAGE_REPLACEMENTS[tag_lower] 303 | 304 | components = parse_tag(tag) 305 | 306 | for typ, value in components: 307 | if typ == 'extlang' and normalize and 'language' in data: 308 | # smash extlangs when possible 309 | minitag = f"{data['language']}-{value}" 310 | norm = LANGUAGE_REPLACEMENTS.get(normalize_characters(minitag)) 311 | if norm is not None: 312 | data.update(Language.get(norm, normalize).to_dict()) 313 | else: 314 | data.setdefault('extlangs', []).append(value) 315 | elif typ in {'extlang', 'variant', 'extension'}: 316 | data.setdefault(typ + 's', []).append(value) 317 | elif typ == 'language': 318 | if value == 'und': 319 | pass 320 | elif normalize: 321 | replacement = LANGUAGE_REPLACEMENTS.get(value.lower()) 322 | if replacement is not None: 323 | # parse the replacement if necessary -- this helps with 324 | # Serbian and Moldovan 325 | data.update(Language.get(replacement, normalize).to_dict()) 326 | else: 327 | data['language'] = value 328 | else: 329 | data['language'] = value 330 | elif typ == 'territory': 331 | if normalize: 332 | data['territory'] = TERRITORY_REPLACEMENTS.get(value.lower(), value) 333 | else: 334 | data['territory'] = value 335 | elif typ == 'grandfathered': 336 | # If we got here, we got a grandfathered tag but we were asked 337 | # not to normalize it, or the CLDR data doesn't know how to 338 | # normalize it. The best we can do is set the entire tag as the 339 | # language. 340 | data['language'] = value 341 | else: 342 | data[typ] = value 343 | 344 | result = Language.make(**data) 345 | Language._PARSE_CACHE[tag, normalize] = result 346 | return result 347 | 348 | def to_tag(self) -> str: 349 | """ 350 | Convert a Language back to a standard language tag, as a string. 351 | This is also the str() representation of a Language object. 352 | 353 | >>> Language.make(language='en', territory='GB').to_tag() 354 | 'en-GB' 355 | 356 | >>> Language.make(language='yue', script='Hant', territory='HK').to_tag() 357 | 'yue-Hant-HK' 358 | 359 | >>> Language.make(script='Arab').to_tag() 360 | 'und-Arab' 361 | 362 | >>> str(Language.make(territory='IN')) 363 | 'und-IN' 364 | """ 365 | if self._str_tag is not None: 366 | return self._str_tag 367 | subtags = ['und'] 368 | if self.language: 369 | subtags[0] = self.language 370 | if self.extlangs: 371 | for extlang in sorted(self.extlangs): 372 | subtags.append(extlang) 373 | if self.script: 374 | subtags.append(self.script) 375 | if self.territory: 376 | subtags.append(self.territory) 377 | if self.variants: 378 | for variant in sorted(self.variants): 379 | subtags.append(variant) 380 | if self.extensions: 381 | for ext in self.extensions: 382 | subtags.append(ext) 383 | if self.private: 384 | subtags.append(self.private) 385 | self._str_tag = '-'.join(subtags) 386 | return self._str_tag 387 | 388 | def simplify_script(self) -> 'Language': 389 | """ 390 | Remove the script from some parsed language data, if the script is 391 | redundant with the language. 
392 | 393 | >>> Language.make(language='en', script='Latn').simplify_script() 394 | Language.make(language='en') 395 | 396 | >>> Language.make(language='yi', script='Latn').simplify_script() 397 | Language.make(language='yi', script='Latn') 398 | 399 | >>> Language.make(language='yi', script='Hebr').simplify_script() 400 | Language.make(language='yi') 401 | """ 402 | if self._simplified is not None: 403 | return self._simplified 404 | 405 | if self.language and self.script: 406 | if DEFAULT_SCRIPTS.get(self.language) == self.script: 407 | result = self.update_dict({'script': None}) 408 | self._simplified = result 409 | return self._simplified 410 | 411 | self._simplified = self 412 | return self._simplified 413 | 414 | def assume_script(self) -> 'Language': 415 | """ 416 | Fill in the script if it's missing, and if it can be assumed from the 417 | language subtag. This is the opposite of `simplify_script`. 418 | 419 | >>> Language.make(language='en').assume_script() 420 | Language.make(language='en', script='Latn') 421 | 422 | >>> Language.make(language='yi').assume_script() 423 | Language.make(language='yi', script='Hebr') 424 | 425 | >>> Language.make(language='yi', script='Latn').assume_script() 426 | Language.make(language='yi', script='Latn') 427 | 428 | This fills in nothing when the script cannot be assumed -- such as when 429 | the language has multiple scripts, or it has no standard orthography: 430 | 431 | >>> Language.make(language='sr').assume_script() 432 | Language.make(language='sr') 433 | 434 | >>> Language.make(language='eee').assume_script() 435 | Language.make(language='eee') 436 | 437 | It also doesn't fill anything in when the language is unspecified. 438 | 439 | >>> Language.make(territory='US').assume_script() 440 | Language.make(territory='US') 441 | """ 442 | if self._assumed is not None: 443 | return self._assumed 444 | if self.language and not self.script: 445 | try: 446 | self._assumed = self.update_dict( 447 | {'script': DEFAULT_SCRIPTS[self.language]} 448 | ) 449 | except KeyError: 450 | self._assumed = self 451 | else: 452 | self._assumed = self 453 | return self._assumed 454 | 455 | def prefer_macrolanguage(self) -> 'Language': 456 | """ 457 | BCP 47 doesn't specify what to do with macrolanguages and the languages 458 | they contain. The Unicode CLDR, on the other hand, says that when a 459 | macrolanguage has a dominant standardized language, the macrolanguage 460 | code should be used for that language. For example, Mandarin Chinese 461 | is 'zh', not 'cmn', according to Unicode, and Malay is 'ms', not 'zsm'. 462 | 463 | This isn't a rule you'd want to follow in all cases -- for example, you may 464 | want to be able to specifically say that 'ms' (the Malay macrolanguage) 465 | contains both 'zsm' (Standard Malay) and 'id' (Indonesian). But applying 466 | this rule helps when interoperating with the Unicode CLDR. 467 | 468 | So, applying `prefer_macrolanguage` to a Language object will 469 | return a new object, replacing the language with the macrolanguage if 470 | it is the dominant language within that macrolanguage. It will leave 471 | non-dominant languages that have macrolanguages alone. 
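
        For example (an informal sketch; the values come from the doctests in
        this module), this method combines naturally with `simplify_script`
        and `assume_script` when canonicalizing a tag by hand:

            Language.get('cmn-Hant').prefer_macrolanguage()  # -> zh-Hant
            Language.get('yi').assume_script()               # -> yi-Hebr
            Language.get('en-Latn').simplify_script()        # -> en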
472 | 473 | >>> Language.get('arb').prefer_macrolanguage() 474 | Language.make(language='ar') 475 | 476 | >>> Language.get('cmn-Hant').prefer_macrolanguage() 477 | Language.make(language='zh', script='Hant') 478 | 479 | >>> Language.get('yue-Hant').prefer_macrolanguage() 480 | Language.make(language='yue', script='Hant') 481 | """ 482 | if self._macrolanguage is not None: 483 | return self._macrolanguage 484 | language = self.language or 'und' 485 | if language in NORMALIZED_MACROLANGUAGES: 486 | self._macrolanguage = self.update_dict( 487 | {'language': NORMALIZED_MACROLANGUAGES[language]} 488 | ) 489 | else: 490 | self._macrolanguage = self 491 | return self._macrolanguage 492 | 493 | def to_alpha3(self, variant: str = 'T') -> str: 494 | """ 495 | Get the three-letter language code for this language, even if it's 496 | canonically written with a two-letter code. 497 | 498 | These codes are the 'alpha3' codes defined by ISO 639-2. 499 | 500 | When this function returns, it always returns a 3-letter string. If 501 | there is no known alpha3 code for the language, it raises a LookupError. 502 | 503 | In cases where the distinction matters, we default to the 'terminology' 504 | code. You can pass `variant='B'` to get the 'bibliographic' code instead. 505 | For example, the terminology code for German is 'deu', while the 506 | bibliographic code is 'ger'. 507 | 508 | (The confusion between these two sets of codes is a good reason to avoid 509 | using alpha3 codes. Every language that has two different alpha3 codes 510 | also has an alpha2 code that's preferred, such as 'de' for German.) 511 | 512 | >>> Language.get('fr').to_alpha3() 513 | 'fra' 514 | >>> Language.get('fr-CA').to_alpha3() 515 | 'fra' 516 | >>> Language.get('fr').to_alpha3(variant='B') 517 | 'fre' 518 | >>> Language.get('de').to_alpha3(variant='T') 519 | 'deu' 520 | >>> Language.get('ja').to_alpha3() 521 | 'jpn' 522 | >>> Language.get('un').to_alpha3() 523 | Traceback (most recent call last): 524 | ... 525 | LookupError: 'un' is not a known language code, and has no alpha3 code. 526 | 527 | 528 | All valid two-letter language codes have corresponding alpha3 codes, 529 | even the un-normalized ones. If they were assigned an alpha3 code by ISO 530 | before they were assigned a normalized code by CLDR, these codes may be 531 | different: 532 | 533 | >>> Language.get('tl', normalize=False).to_alpha3() 534 | 'tgl' 535 | >>> Language.get('tl').to_alpha3() 536 | 'fil' 537 | >>> Language.get('sh', normalize=False).to_alpha3() 538 | 'hbs' 539 | 540 | 541 | Three-letter codes are preserved, even if they're unknown: 542 | 543 | >>> Language.get('qqq').to_alpha3() 544 | 'qqq' 545 | >>> Language.get('und').to_alpha3() 546 | 'und' 547 | """ 548 | variant = variant.upper() 549 | if variant not in 'BT': 550 | raise ValueError("Variant must be 'B' or 'T'") 551 | 552 | language = self.language 553 | if language is None: 554 | return 'und' 555 | elif len(language) == 3: 556 | return language 557 | else: 558 | if variant == 'B' and language in LANGUAGE_ALPHA3_BIBLIOGRAPHIC: 559 | return LANGUAGE_ALPHA3_BIBLIOGRAPHIC[language] 560 | elif language in LANGUAGE_ALPHA3: 561 | return LANGUAGE_ALPHA3[language] 562 | else: 563 | raise LookupError( 564 | f"{language!r} is not a known language code, " 565 | "and has no alpha3 code." 566 | ) 567 | 568 | def broader_tags(self) -> List[str]: 569 | """ 570 | Iterate through increasingly general tags for this language. 
571 | 572 | This isn't actually that useful for matching two arbitrary language tags 573 | against each other, but it is useful for matching them against a known 574 | standardized form, such as in the CLDR data. 575 | 576 | The list of broader versions to try appears in UTR 35, section 4.3, 577 | "Likely Subtags". 578 | 579 | >>> Language.get('nn-Latn-NO-x-thingy').broader_tags() 580 | ['nn-Latn-NO-x-thingy', 'nn-Latn-NO', 'nn-NO', 'nn-Latn', 'nn', 'und-Latn', 'und'] 581 | 582 | >>> Language.get('arb-Arab').broader_tags() 583 | ['arb-Arab', 'ar-Arab', 'arb', 'ar', 'und-Arab', 'und'] 584 | """ 585 | if self._broader is not None: 586 | return self._broader 587 | self._broader = [self.to_tag()] 588 | seen = set([self.to_tag()]) 589 | for keyset in self.BROADER_KEYSETS: 590 | for start_language in (self, self.prefer_macrolanguage()): 591 | filtered = start_language._filter_attributes(keyset) 592 | tag = filtered.to_tag() 593 | if tag not in seen: 594 | self._broader.append(tag) 595 | seen.add(tag) 596 | return self._broader 597 | 598 | def broaden(self) -> 'List[Language]': 599 | """ 600 | Like `broader_tags`, but returrns Language objects instead of strings. 601 | """ 602 | return [Language.get(tag) for tag in self.broader_tags()] 603 | 604 | def maximize(self) -> 'Language': 605 | """ 606 | The Unicode CLDR contains a "likelySubtags" data file, which can guess 607 | reasonable values for fields that are missing from a language tag. 608 | 609 | This is particularly useful for comparing, for example, "zh-Hant" and 610 | "zh-TW", two common language tags that say approximately the same thing 611 | via rather different information. (Using traditional Han characters is 612 | not the same as being in Taiwan, but each implies that the other is 613 | likely.) 614 | 615 | These implications are provided in the CLDR supplemental data, and are 616 | based on the likelihood of people using the language to transmit text 617 | on the Internet. (This is why the overall default is English, not 618 | Chinese.) 619 | 620 | It's important to recognize that these tags amplify majorities, and 621 | that not all language support fits into a "likely" language tag. 622 | 623 | >>> str(Language.get('zh-Hant').maximize()) 624 | 'zh-Hant-TW' 625 | >>> str(Language.get('zh-TW').maximize()) 626 | 'zh-Hant-TW' 627 | >>> str(Language.get('ja').maximize()) 628 | 'ja-Jpan-JP' 629 | >>> str(Language.get('pt').maximize()) 630 | 'pt-Latn-BR' 631 | >>> str(Language.get('und-Arab').maximize()) 632 | 'ar-Arab-EG' 633 | >>> str(Language.get('und-CH').maximize()) 634 | 'de-Latn-CH' 635 | 636 | As many standards are, this is US-centric: 637 | 638 | >>> str(Language.make().maximize()) 639 | 'en-Latn-US' 640 | 641 | "Extlangs" have no likely-subtags information, so they will give 642 | maximized results that make no sense: 643 | 644 | >>> str(Language.get('und-ibe').maximize()) 645 | 'en-ibe-Latn-US' 646 | """ 647 | if self._filled is not None: 648 | return self._filled 649 | 650 | for tag in self.broader_tags(): 651 | if tag in LIKELY_SUBTAGS: 652 | result = Language.get(LIKELY_SUBTAGS[tag], normalize=False) 653 | result = result.update(self) 654 | self._filled = result 655 | return result 656 | 657 | raise RuntimeError( 658 | "Couldn't fill in likely values. This represents a problem with " 659 | "the LIKELY_SUBTAGS data." 
660 | ) 661 | 662 | # Support an old, wordier name for the method 663 | fill_likely_values = maximize 664 | 665 | def match_score(self, supported: 'Language') -> int: 666 | """ 667 | DEPRECATED: use .distance() instead, which uses newer data and is _lower_ 668 | for better matching languages. 669 | """ 670 | warnings.warn( 671 | "`match_score` is deprecated because it's based on deprecated CLDR info. " 672 | "Use `distance` instead, which is _lower_ for better matching languages. ", 673 | DeprecationWarning, 674 | ) 675 | return 100 - min(self.distance(supported), 100) 676 | 677 | def distance(self, supported: 'Language', ignore_script: bool = False) -> int: 678 | """ 679 | Suppose that `self` is the language that the user desires, and 680 | `supported` is a language that is actually supported. 681 | 682 | This method returns a number from 0 to 134 measuring the 'distance' 683 | between the languages (lower numbers are better). This is not a 684 | symmetric relation. If `ignore_script` is `True`, the script will 685 | not be used in the comparison, possibly resulting in a smaller 686 | 'distance'. 687 | 688 | The language distance is not really about the linguistic similarity or 689 | history of the languages; instead, it's based largely on sociopolitical 690 | factors, indicating which language speakers are likely to know which 691 | other languages in the present world. Much of the heuristic is about 692 | finding a widespread 'world language' like English, Chinese, French, or 693 | Russian that speakers of a more localized language will accept. 694 | 695 | A version that works on language tags, as strings, is in the function 696 | `tag_distance`. See that function for copious examples. 697 | """ 698 | if supported == self: 699 | return 0 700 | 701 | # CLDR has realized that these matching rules are undermined when the 702 | # unspecified language 'und' gets maximized to 'en-Latn-US', so this case 703 | # is specifically not maximized: 704 | if self.language is None and self.script is None and self.territory is None: 705 | desired_triple = ('und', 'Zzzz', 'ZZ') 706 | else: 707 | desired_complete = self.prefer_macrolanguage().maximize() 708 | 709 | desired_triple = ( 710 | desired_complete.language, 711 | None if ignore_script else desired_complete.script, 712 | desired_complete.territory, 713 | ) 714 | 715 | if ( 716 | supported.language is None 717 | and supported.script is None 718 | and supported.territory is None 719 | ): 720 | supported_triple = ('und', 'Zzzz', 'ZZ') 721 | else: 722 | supported_complete = supported.prefer_macrolanguage().maximize() 723 | 724 | supported_triple = ( 725 | supported_complete.language, 726 | None if ignore_script else supported_complete.script, 727 | supported_complete.territory, 728 | ) 729 | 730 | return tuple_distance_cached(desired_triple, supported_triple) 731 | 732 | def is_valid(self) -> bool: 733 | """ 734 | Checks whether the language, script, territory, and variants 735 | (if present) are all tags that have meanings assigned by IANA. 736 | For example, 'ja' (Japanese) is a valid tag, and 'jp' is not. 737 | 738 | The data is current as of CLDR 40. 
739 | 740 | >>> Language.get('ja').is_valid() 741 | True 742 | >>> Language.get('jp').is_valid() 743 | False 744 | >>> Language.get('en-001').is_valid() 745 | True 746 | >>> Language.get('en-000').is_valid() 747 | False 748 | >>> Language.get('en-Latn').is_valid() 749 | True 750 | >>> Language.get('en-Latnx').is_valid() 751 | False 752 | >>> Language.get('und').is_valid() 753 | True 754 | >>> Language.get('en-GB-oxendict').is_valid() 755 | True 756 | >>> Language.get('en-GB-oxenfree').is_valid() 757 | False 758 | >>> Language.get('x-heptapod').is_valid() 759 | True 760 | 761 | Some scripts are, confusingly, not included in CLDR's 'validity' pattern. 762 | If a script appears in the IANA registry, we consider it valid. 763 | 764 | >>> Language.get('ur-Aran').is_valid() 765 | True 766 | >>> Language.get('cu-Cyrs').is_valid() 767 | True 768 | 769 | A language tag with multiple extlangs will parse, but is not valid. 770 | The only allowed example is 'zh-min-nan', which normalizes to the 771 | language 'nan'. 772 | 773 | >>> Language.get('zh-min-nan').is_valid() 774 | True 775 | >>> Language.get('sgn-ase-bfi').is_valid() 776 | False 777 | 778 | These examples check that duplicate tags are not valid: 779 | 780 | >>> Language.get('de-1901').is_valid() 781 | True 782 | >>> Language.get('de-1901-1901').is_valid() 783 | False 784 | >>> Language.get('en-a-bbb-c-ddd').is_valid() 785 | True 786 | >>> Language.get('en-a-bbb-a-ddd').is_valid() 787 | False 788 | 789 | Of course, you should be prepared to catch a failure to parse the 790 | language code at all: 791 | 792 | >>> Language.get('C').is_valid() 793 | Traceback (most recent call last): 794 | ... 795 | langcodes.tag_parser.LanguageTagError: Expected a language code, got 'c' 796 | """ 797 | if self.extlangs is not None: 798 | # An erratum to BCP 47 says that tags with more than one extlang are 799 | # invalid. 800 | if len(self.extlangs) > 1: 801 | return False 802 | 803 | subtags = [self.language, self.script, self.territory] 804 | checked_subtags = [] 805 | if self.variants is not None: 806 | subtags.extend(self.variants) 807 | for subtag in subtags: 808 | if subtag is not None: 809 | checked_subtags.append(subtag) 810 | if not subtag.startswith('x-') and not VALIDITY.match(subtag): 811 | if subtag not in ALL_SCRIPTS: 812 | return False 813 | 814 | # We check extensions for validity by ensuring that there aren't 815 | # two extensions introduced by the same letter. For example, you can't 816 | # have two 'u-' extensions. 817 | if self.extensions: 818 | checked_subtags.extend([extension[:2] for extension in self.extensions]) 819 | if len(set(checked_subtags)) != len(checked_subtags): 820 | return False 821 | return True 822 | 823 | def has_name_data(self) -> bool: 824 | """ 825 | Return True when we can name languages in this language. Requires 826 | `language_data` to be installed. 827 | 828 | This is true when the language, or one of its 'broader' versions, is in 829 | the list of CLDR target languages. 
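
        For example, a tag like 'en-AU' has name data because its broader tags
        include plain 'en', which is a CLDR target language (an illustrative
        sketch; see `broader_tags` above):

            Language.get('en-AU').has_name_data()   # True, via the broader tag 'en'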
830 | 831 | >>> Language.get('fr').has_name_data() 832 | True 833 | >>> Language.get('so').has_name_data() 834 | True 835 | >>> Language.get('enc').has_name_data() 836 | False 837 | >>> Language.get('und').has_name_data() 838 | False 839 | """ 840 | try: 841 | from language_data.name_data import LANGUAGES_WITH_NAME_DATA 842 | except ImportError: 843 | print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) 844 | raise 845 | 846 | matches = set(self.broader_tags()) & LANGUAGES_WITH_NAME_DATA 847 | return bool(matches) 848 | 849 | # These methods help to show what the language tag means in natural 850 | # language. They actually apply the language-matching algorithm to find 851 | # the right language to name things in. 852 | 853 | def _get_name( 854 | self, attribute: str, language: Union[str, 'Language'], max_distance: int 855 | ) -> str: 856 | try: 857 | from language_data.names import code_to_names 858 | except ImportError: 859 | print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) 860 | raise 861 | 862 | assert attribute in self.ATTRIBUTES 863 | if isinstance(language, str): 864 | language = Language.get(language) 865 | 866 | attr_value = getattr(self, attribute) 867 | if attr_value is None: 868 | if attribute == 'language': 869 | attr_value = 'und' 870 | else: 871 | return None 872 | names = code_to_names(attr_value) 873 | 874 | result = self._best_name(names, language, max_distance) 875 | if result is not None: 876 | return result 877 | else: 878 | # Construct a string like "Unknown language [zzz]" 879 | placeholder = None 880 | if attribute == 'language': 881 | placeholder = 'und' 882 | elif attribute == 'script': 883 | placeholder = 'Zzzz' 884 | elif attribute == 'territory': 885 | placeholder = 'ZZ' 886 | 887 | unknown_name = None 888 | if placeholder is not None: 889 | names = code_to_names(placeholder) 890 | unknown_name = self._best_name(names, language, max_distance) 891 | if unknown_name is None: 892 | unknown_name = 'Unknown language subtag' 893 | return f'{unknown_name} [{attr_value}]' 894 | 895 | def _best_name( 896 | self, names: Mapping[str, str], language: 'Language', max_distance: int 897 | ): 898 | matchable_languages = set(language.broader_tags()) 899 | possible_languages = [ 900 | key for key in sorted(names.keys()) if key in matchable_languages 901 | ] 902 | 903 | target_language, score = closest_match( 904 | language, possible_languages, max_distance 905 | ) 906 | if target_language in names: 907 | return names[target_language] 908 | else: 909 | return names.get(DEFAULT_LANGUAGE) 910 | 911 | def language_name( 912 | self, 913 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 914 | max_distance: int = 25, 915 | ) -> str: 916 | """ 917 | Give the name of the language (not the entire tag, just the language part) 918 | in a natural language. The target language can be given as a string or 919 | another Language object. 920 | 921 | By default, things are named in English: 922 | 923 | >>> Language.get('fr').language_name() 924 | 'French' 925 | >>> Language.get('el').language_name() 926 | 'Greek' 927 | 928 | But you can ask for language names in numerous other languages: 929 | 930 | >>> Language.get('fr').language_name('fr') 931 | 'français' 932 | >>> Language.get('el').language_name('fr') 933 | 'grec' 934 | 935 | Why does everyone get Slovak and Slovenian confused? Let's ask them. 
936 | 937 | >>> Language.get('sl').language_name('sl') 938 | 'slovenščina' 939 | >>> Language.get('sk').language_name('sk') 940 | 'slovenčina' 941 | >>> Language.get('sl').language_name('sk') 942 | 'slovinčina' 943 | >>> Language.get('sk').language_name('sl') 944 | 'slovaščina' 945 | """ 946 | return self._get_name('language', language, max_distance) 947 | 948 | def display_name( 949 | self, 950 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 951 | max_distance: int = 25, 952 | ) -> str: 953 | """ 954 | It's often helpful to be able to describe a language code in a way that a user 955 | (or you) can understand, instead of in inscrutable short codes. The 956 | `display_name` method lets you describe a Language object *in a language*. 957 | 958 | The `.display_name(language, min_score)` method will look up the name of the 959 | language. The names come from the IANA language tag registry, which is only in 960 | English, plus CLDR, which names languages in many commonly-used languages. 961 | 962 | The default language for naming things is English: 963 | 964 | >>> Language.make(language='fr').display_name() 965 | 'French' 966 | 967 | >>> Language.make().display_name() 968 | 'Unknown language' 969 | 970 | >>> Language.get('zh-Hans').display_name() 971 | 'Chinese (Simplified)' 972 | 973 | >>> Language.get('en-US').display_name() 974 | 'English (United States)' 975 | 976 | But you can ask for language names in numerous other languages: 977 | 978 | >>> Language.get('fr').display_name('fr') 979 | 'français' 980 | 981 | >>> Language.get('fr').display_name('es') 982 | 'francés' 983 | 984 | >>> Language.make().display_name('es') 985 | 'lengua desconocida' 986 | 987 | >>> Language.get('zh-Hans').display_name('de') 988 | 'Chinesisch (Vereinfacht)' 989 | 990 | >>> Language.get('en-US').display_name('zh-Hans') 991 | '英语(美国)' 992 | """ 993 | reduced = self.simplify_script() 994 | language = Language.get(language) 995 | language_name = reduced.language_name(language, max_distance) 996 | extra_parts = [] 997 | 998 | if reduced.script is not None: 999 | extra_parts.append(reduced.script_name(language, max_distance)) 1000 | if reduced.territory is not None: 1001 | extra_parts.append(reduced.territory_name(language, max_distance)) 1002 | 1003 | if extra_parts: 1004 | clarification = language._display_separator().join(extra_parts) 1005 | pattern = language._display_pattern() 1006 | return pattern.format(language_name, clarification) 1007 | else: 1008 | return language_name 1009 | 1010 | def _display_pattern(self) -> str: 1011 | """ 1012 | Get the pattern, according to CLDR, that should be used for clarifying 1013 | details of a language code. 1014 | """ 1015 | # Technically we are supposed to look up this pattern in each language. 1016 | # Practically, it's the same in every language except Chinese, where the 1017 | # parentheses are full-width. 1018 | if self._disp_pattern is not None: 1019 | return self._disp_pattern 1020 | if self.distance(Language.get('zh')) <= 25 or self.distance(Language.get('zh-Hant')) <= 25: 1021 | self._disp_pattern = "{0}({1})" 1022 | else: 1023 | self._disp_pattern = "{0} ({1})" 1024 | return self._disp_pattern 1025 | 1026 | def _display_separator(self) -> str: 1027 | """ 1028 | Get the symbol that should be used to separate multiple clarifying 1029 | details -- such as a comma in English, or an ideographic comma in 1030 | Japanese. 1031 | 1032 | Requires that `language_data` is installed. 
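
        Together with `_display_pattern`, this separator is what assembles
        display names that mention both a script and a territory. As an
        informal sketch of the composition done in `display_name` above:

            clarification = separator.join([script_name, territory_name])
            pattern.format(language_name, clarification)
            # e.g. a name of the general form 'Language (Script, Territory)'
            # with the English separator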
1033 | """ 1034 | try: 1035 | from language_data.names import DISPLAY_SEPARATORS 1036 | except ImportError: 1037 | print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) 1038 | raise 1039 | 1040 | if self._disp_separator is not None: 1041 | return self._disp_separator 1042 | matched, _dist = closest_match(self, DISPLAY_SEPARATORS.keys()) 1043 | self._disp_separator = DISPLAY_SEPARATORS[matched] 1044 | return self._disp_separator 1045 | 1046 | def autonym(self, max_distance: int = 9) -> str: 1047 | """ 1048 | Give the display name of this language *in* this language. 1049 | Requires that `language_data` is installed. 1050 | 1051 | >>> Language.get('fr').autonym() 1052 | 'français' 1053 | >>> Language.get('es').autonym() 1054 | 'español' 1055 | >>> Language.get('ja').autonym() 1056 | '日本語' 1057 | 1058 | This uses the `display_name()` method, so it can include the name of a 1059 | script or territory when appropriate. 1060 | 1061 | >>> Language.get('en-AU').autonym() 1062 | 'English (Australia)' 1063 | >>> Language.get('sr-Latn').autonym() 1064 | 'srpski (latinica)' 1065 | >>> Language.get('sr-Cyrl').autonym() 1066 | 'српски (ћирилица)' 1067 | >>> Language.get('pa').autonym() 1068 | 'ਪੰਜਾਬੀ' 1069 | >>> Language.get('pa-Arab').autonym() 1070 | 'پنجابی (عربی)' 1071 | 1072 | This only works for language codes that CLDR has locale data for. You 1073 | can't ask for the autonym of 'ja-Latn' and get 'nihongo (rōmaji)'. 1074 | """ 1075 | lang = self.prefer_macrolanguage() 1076 | return lang.display_name(language=lang, max_distance=max_distance) 1077 | 1078 | def script_name( 1079 | self, 1080 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 1081 | max_distance: int = 25, 1082 | ) -> str: 1083 | """ 1084 | Describe the script part of the language tag in a natural language. 1085 | Requires that `language_data` is installed. 1086 | """ 1087 | return self._get_name('script', language, max_distance) 1088 | 1089 | def territory_name( 1090 | self, 1091 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 1092 | max_distance: int = 25, 1093 | ) -> str: 1094 | """ 1095 | Describe the territory part of the language tag in a natural language. 1096 | Requires that `language_data` is installed. 1097 | """ 1098 | return self._get_name('territory', language, max_distance) 1099 | 1100 | def region_name( 1101 | self, 1102 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 1103 | max_distance: int = 25, 1104 | ) -> str: 1105 | warnings.warn( 1106 | "`region_name` has been renamed to `territory_name` for consistency", 1107 | DeprecationWarning, 1108 | ) 1109 | return self.territory_name(language, max_distance) 1110 | 1111 | @property 1112 | def region(self): 1113 | warnings.warn( 1114 | "The `region` property has been renamed to `territory` for consistency", 1115 | DeprecationWarning, 1116 | ) 1117 | return self.territory 1118 | 1119 | def variant_names( 1120 | self, 1121 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 1122 | max_distance: int = 25, 1123 | ) -> Sequence[str]: 1124 | """ 1125 | Deprecated in version 3.0. 1126 | 1127 | We don't store names for variants anymore, so this just returns the list 1128 | of variant codes, such as ['oxendict'] for en-GB-oxendict. 
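
        For example (illustrative only):

            Language.get('en-GB-oxendict').variant_names()   # -> ['oxendict']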
1129 | """ 1130 | warnings.warn( 1131 | "variant_names is deprecated and just returns the variant codes", 1132 | DeprecationWarning, 1133 | ) 1134 | return self.variants or [] 1135 | 1136 | def describe( 1137 | self, 1138 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 1139 | max_distance: int = 25, 1140 | ) -> dict: 1141 | """ 1142 | Return a dictionary that describes a given language tag in a specified 1143 | natural language. Requires that `language_data` is installed. 1144 | 1145 | See `language_name` and related methods for more specific versions of this. 1146 | 1147 | The desired `language` will in fact be matched against the available 1148 | options using the matching technique that this module provides. We can 1149 | illustrate many aspects of this by asking for a description of Shavian 1150 | script (a phonetic script for English devised by author George Bernard 1151 | Shaw), and where you might find it, in various languages. 1152 | 1153 | >>> shaw = Language.make(script='Shaw').maximize() 1154 | >>> shaw.describe('en') 1155 | {'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'} 1156 | 1157 | >>> shaw.describe('fr') 1158 | {'language': 'anglais', 'script': 'shavien', 'territory': 'Royaume-Uni'} 1159 | 1160 | >>> shaw.describe('es') 1161 | {'language': 'inglés', 'script': 'shaviano', 'territory': 'Reino Unido'} 1162 | 1163 | >>> shaw.describe('pt') 1164 | {'language': 'inglês', 'script': 'shaviano', 'territory': 'Reino Unido'} 1165 | 1166 | >>> shaw.describe('uk') 1167 | {'language': 'англійська', 'script': 'шоу', 'territory': 'Велика Британія'} 1168 | 1169 | >>> shaw.describe('arb') 1170 | {'language': 'الإنجليزية', 'script': 'الشواني', 'territory': 'المملكة المتحدة'} 1171 | 1172 | >>> shaw.describe('th') 1173 | {'language': 'อังกฤษ', 'script': 'ซอเวียน', 'territory': 'สหราชอาณาจักร'} 1174 | 1175 | >>> shaw.describe('zh-Hans') 1176 | {'language': '英语', 'script': '萧伯纳式文', 'territory': '英国'} 1177 | 1178 | >>> shaw.describe('zh-Hant') 1179 | {'language': '英文', 'script': '簫柏納字符', 'territory': '英國'} 1180 | 1181 | >>> shaw.describe('ja') 1182 | {'language': '英語', 'script': 'ショー文字', 'territory': 'イギリス'} 1183 | 1184 | When we don't have a localization for the language, we fall back on English, 1185 | because the IANA provides names for all known codes in English. 1186 | 1187 | >>> shaw.describe('lol') 1188 | {'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'} 1189 | 1190 | When the language tag itself is a valid tag but with no known meaning, we 1191 | say so in the appropriate language. 1192 | 1193 | >>> Language.get('xyz-ZY').display_name() 1194 | 'Unknown language [xyz] (Unknown Region [ZY])' 1195 | 1196 | >>> Language.get('xyz-ZY').display_name('es') 1197 | 'lengua desconocida [xyz] (Región desconocida [ZY])' 1198 | """ 1199 | names = {} 1200 | if self.language: 1201 | names['language'] = self.language_name(language, max_distance) 1202 | if self.script: 1203 | names['script'] = self.script_name(language, max_distance) 1204 | if self.territory: 1205 | names['territory'] = self.territory_name(language, max_distance) 1206 | return names 1207 | 1208 | def speaking_population(self) -> int: 1209 | """ 1210 | Get an estimate of how many people in the world speak this language, 1211 | derived from CLDR data. Requires that `language_data` is installed. 1212 | 1213 | Only the language and territory codes will be considered. 
If a 1214 | territory code is included, the population will count only the 1215 | speakers of the language in that territory. 1216 | 1217 | Script subtags are disregarded, because it doesn't make sense to ask 1218 | how many people speak in a particular writing script. 1219 | 1220 | >>> Language.get('es').speaking_population() 1221 | 493528077 1222 | >>> Language.get('pt').speaking_population() 1223 | 237496885 1224 | >>> Language.get('es-BR').speaking_population() 1225 | 76218 1226 | >>> Language.get('pt-BR').speaking_population() 1227 | 192661560 1228 | >>> Language.get('vo').speaking_population() 1229 | 0 1230 | """ 1231 | try: 1232 | from language_data.population_data import LANGUAGE_SPEAKING_POPULATION 1233 | except ImportError: 1234 | print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) 1235 | raise 1236 | 1237 | lang = self._filter_attributes(['language', 'territory']) 1238 | return LANGUAGE_SPEAKING_POPULATION.get(str(lang), 0) 1239 | 1240 | def writing_population(self) -> int: 1241 | """ 1242 | Get an estimate of how many people in the world read and write 1243 | this language, derived from CLDR data. Requires that `language_data` 1244 | is installed. 1245 | 1246 | For many languages that aren't typically written, this is an 1247 | overestimate, according to CLDR -- the data often includes people who 1248 | speak that language but write in a different language. 1249 | 1250 | Only the language, script, and territory codes will be considered. 1251 | If a territory code is included, the population will count only the 1252 | speakers of the language in that territory. 1253 | 1254 | >>> all = Language.get('zh').writing_population() 1255 | >>> all 1256 | 1240841517 1257 | 1258 | >>> traditional = Language.get('zh-Hant').writing_population() 1259 | >>> traditional 1260 | 36863340 1261 | 1262 | >>> simplified = Language.get('zh-Hans').writing_population() 1263 | >>> all == traditional + simplified 1264 | True 1265 | 1266 | >>> Language.get('zh-Hant-HK').writing_population() 1267 | 6439733 1268 | >>> Language.get('zh-Hans-HK').writing_population() 1269 | 338933 1270 | 1271 | Note that if you want to get the total Chinese writing population 1272 | of Hong Kong, you need to avoid normalization that would interpret 1273 | 'zh-HK' as 'zh-Hant-HK'. 1274 | 1275 | >>> Language.get('zh-HK', normalize=False).writing_population() 1276 | 6778666 1277 | 1278 | Unknown or unspecified language codes get a population of 0. 1279 | 1280 | >>> Language.get('xyz').writing_population() 1281 | 0 1282 | 1283 | >>> Language.get('und').writing_population() 1284 | 0 1285 | """ 1286 | try: 1287 | from language_data.population_data import LANGUAGE_WRITING_POPULATION 1288 | except ImportError: 1289 | print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) 1290 | raise 1291 | 1292 | lang = self._filter_attributes(['language', 'script', 'territory']) 1293 | if str(lang) in LANGUAGE_WRITING_POPULATION: 1294 | return LANGUAGE_WRITING_POPULATION[str(lang)] 1295 | else: 1296 | lang = lang.simplify_script() 1297 | return LANGUAGE_WRITING_POPULATION.get(str(lang), 0) 1298 | 1299 | @staticmethod 1300 | def find_name( 1301 | tagtype: str, name: str, language: Optional[Union[str, 'Language']] = None 1302 | ) -> 'Language': 1303 | """ 1304 | Find the subtag of a particular `tagtype` that has the given `name`. 1305 | Requires that `language_data` is installed. 
1306 | 1307 | The default language, "und", will allow matching names in any language, 1308 | so you can get the code 'fr' by looking up "French", "Français", or 1309 | "francés". 1310 | 1311 | Occasionally, names are ambiguous in a way that can be resolved by 1312 | specifying what name the language is supposed to be in. For example, 1313 | there is a language named 'Malayo' in English, but it's different from 1314 | the language named 'Malayo' in Spanish (which is Malay). Specifying the 1315 | language will look up the name in a trie that is only in that language. 1316 | 1317 | In a previous version, we thought we were going to deprecate the 1318 | `language` parameter, as there weren't significant cases of conflicts 1319 | in names of things between languages. Well, we got more data, and 1320 | conflicts in names are everywhere. 1321 | 1322 | Specifying the language that the name should be in is still not 1323 | required, but it will help to make sure that names can be 1324 | round-tripped. 1325 | 1326 | >>> Language.find_name('language', 'francés') 1327 | Language.make(language='fr') 1328 | 1329 | >>> Language.find_name('territory', 'United Kingdom') 1330 | Language.make(territory='GB') 1331 | 1332 | >>> Language.find_name('script', 'Arabic') 1333 | Language.make(script='Arab') 1334 | 1335 | >>> Language.find_name('language', 'norsk bokmål') 1336 | Language.make(language='nb') 1337 | 1338 | >>> Language.find_name('language', 'norsk') 1339 | Language.make(language='no') 1340 | 1341 | >>> Language.find_name('language', 'norsk', 'en') 1342 | Traceback (most recent call last): 1343 | ... 1344 | LookupError: Can't find any language named 'norsk' 1345 | 1346 | >>> Language.find_name('language', 'norsk', 'no') 1347 | Language.make(language='no') 1348 | 1349 | >>> Language.find_name('language', 'malayo', 'en') 1350 | Language.make(language='mbp') 1351 | 1352 | >>> Language.find_name('language', 'malayo', 'es') 1353 | Language.make(language='ms') 1354 | 1355 | Some language names resolve to more than a language. For example, 1356 | the name 'Brazilian Portuguese' resolves to a language and a territory, 1357 | and 'Simplified Chinese' resolves to a language and a script. In these 1358 | cases, a Language object with multiple subtags will be returned. 1359 | 1360 | >>> Language.find_name('language', 'Brazilian Portuguese', 'en') 1361 | Language.make(language='pt', territory='BR') 1362 | 1363 | >>> Language.find_name('language', 'Simplified Chinese', 'en') 1364 | Language.make(language='zh', script='Hans') 1365 | 1366 | A small amount of fuzzy matching is supported: if the name can be 1367 | shortened to match a single language name, you get that language. 1368 | This allows, for example, "Hakka dialect" to match "Hakka". 
1369 | 1370 | >>> Language.find_name('language', 'Hakka dialect') 1371 | Language.make(language='hak') 1372 | """ 1373 | try: 1374 | from language_data.names import name_to_code 1375 | except ImportError: 1376 | print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) 1377 | raise 1378 | 1379 | # No matter what form of language we got, normalize it to a single 1380 | # language subtag 1381 | if isinstance(language, Language): 1382 | language = language.language 1383 | elif isinstance(language, str): 1384 | language = get(language).language 1385 | if language is None: 1386 | language = 'und' 1387 | 1388 | code = name_to_code(tagtype, name, language) 1389 | if code is None: 1390 | raise LookupError(f"Can't find any {tagtype} named {name!r}") 1391 | if '-' in code: 1392 | return Language.get(code) 1393 | else: 1394 | data = {tagtype: code} 1395 | return Language.make(**data) 1396 | 1397 | @staticmethod 1398 | def find( 1399 | name: str, language: Optional[Union[str, 'Language']] = None 1400 | ) -> 'Language': 1401 | """ 1402 | A concise version of `find_name`, used to get a language tag by its 1403 | name in a natural language. The language can be omitted in the large 1404 | majority of cases, where the language name is not ambiguous. 1405 | 1406 | >>> Language.find('Türkçe') 1407 | Language.make(language='tr') 1408 | >>> Language.find('brazilian portuguese') 1409 | Language.make(language='pt', territory='BR') 1410 | >>> Language.find('simplified chinese') 1411 | Language.make(language='zh', script='Hans') 1412 | 1413 | Some language names are ambiguous: for example, there is a language 1414 | named 'Fala' in English (with code 'fax'), but 'Fala' is also the 1415 | Kwasio word for French. In this case, specifying the language that 1416 | the name is in is necessary for disambiguation. 1417 | 1418 | >>> Language.find('fala') 1419 | Language.make(language='fr') 1420 | >>> Language.find('fala', 'nmg') 1421 | Language.make(language='fr') 1422 | >>> Language.find('fala', 'en') 1423 | Language.make(language='fax') 1424 | """ 1425 | return Language.find_name('language', name, language) 1426 | 1427 | def to_dict(self) -> dict: 1428 | """ 1429 | Get a dictionary of the attributes of this Language object, which 1430 | can be useful for constructing a similar object. 1431 | """ 1432 | if self._dict is not None: 1433 | return self._dict 1434 | 1435 | result = {} 1436 | for key in self.ATTRIBUTES: 1437 | value = getattr(self, key) 1438 | if value: 1439 | result[key] = value 1440 | self._dict = result 1441 | return result 1442 | 1443 | def update(self, other: 'Language') -> 'Language': 1444 | """ 1445 | Update this Language with the fields of another Language. 1446 | """ 1447 | return Language.make( 1448 | language=other.language or self.language, 1449 | extlangs=other.extlangs or self.extlangs, 1450 | script=other.script or self.script, 1451 | territory=other.territory or self.territory, 1452 | variants=other.variants or self.variants, 1453 | extensions=other.extensions or self.extensions, 1454 | private=other.private or self.private, 1455 | ) 1456 | 1457 | def update_dict(self, newdata: dict) -> 'Language': 1458 | """ 1459 | Update the attributes of this Language from a dictionary. 
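
        Only the keys present in `newdata` are changed. For example (an
        illustrative sketch):

            Language.get('en-US').update_dict({'territory': 'GB'})
            # -> Language.make(language='en', territory='GB')

        This is how methods such as `simplify_script` produce a modified copy,
        e.g. `self.update_dict({'script': None})`.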
1460 | """ 1461 | return Language.make( 1462 | language=newdata.get('language', self.language), 1463 | extlangs=newdata.get('extlangs', self.extlangs), 1464 | script=newdata.get('script', self.script), 1465 | territory=newdata.get('territory', self.territory), 1466 | variants=newdata.get('variants', self.variants), 1467 | extensions=newdata.get('extensions', self.extensions), 1468 | private=newdata.get('private', self.private), 1469 | ) 1470 | 1471 | @staticmethod 1472 | def _filter_keys(d: dict, keys: Iterable[str]) -> dict: 1473 | """ 1474 | Select a subset of keys from a dictionary. 1475 | """ 1476 | return {key: d[key] for key in keys if key in d} 1477 | 1478 | def _filter_attributes(self, keyset: Iterable[str]) -> 'Language': 1479 | """ 1480 | Return a copy of this object with a subset of its attributes set. 1481 | """ 1482 | filtered = self._filter_keys(self.to_dict(), keyset) 1483 | return Language.make(**filtered) 1484 | 1485 | def _searchable_form(self) -> 'Language': 1486 | """ 1487 | Convert a parsed language tag so that the information it contains is in 1488 | the best form for looking up information in the CLDR. 1489 | """ 1490 | if self._searchable is not None: 1491 | return self._searchable 1492 | 1493 | self._searchable = ( 1494 | self._filter_attributes({'language', 'script', 'territory'}) 1495 | .simplify_script() 1496 | .prefer_macrolanguage() 1497 | ) 1498 | return self._searchable 1499 | 1500 | def __eq__(self, other): 1501 | if self is other: 1502 | return True 1503 | if not isinstance(other, Language): 1504 | return False 1505 | return self._str_tag == other._str_tag 1506 | 1507 | def __hash__(self) -> int: 1508 | return hash(self._str_tag) 1509 | 1510 | def __getitem__(self, key: str) -> Optional[Union[str, List[str]]]: 1511 | if key in self.ATTRIBUTES: 1512 | return getattr(self, key) 1513 | else: 1514 | raise KeyError(key) 1515 | 1516 | def __contains__(self, key: str) -> bool: 1517 | return key in self.ATTRIBUTES and getattr(self, key) 1518 | 1519 | def __repr__(self) -> str: 1520 | items = [] 1521 | for attr in self.ATTRIBUTES: 1522 | if getattr(self, attr): 1523 | value = getattr(self, attr) 1524 | items.append(f'{attr}={value!r}') 1525 | joined = ', '.join(items) 1526 | return f"Language.make({joined})" 1527 | 1528 | def __str__(self) -> str: 1529 | return self.to_tag() 1530 | 1531 | 1532 | # Make the get(), find(), and find_name() functions available at the top level 1533 | get = Language.get 1534 | find = Language.find 1535 | find_name = Language.find_name 1536 | 1537 | # Make the Language object available under the old name LanguageData 1538 | LanguageData = Language 1539 | 1540 | 1541 | def standardize_tag(tag: Union[str, Language], macro: bool = False) -> str: 1542 | """ 1543 | Standardize a language tag: 1544 | 1545 | - Replace deprecated values with their updated versions (if those exist) 1546 | - Remove script tags that are redundant with the language 1547 | - If *macro* is True, use a macrolanguage to represent the most common 1548 | standardized language within that macrolanguage. For example, 'cmn' 1549 | (Mandarin) becomes 'zh' (Chinese), and 'arb' (Modern Standard Arabic) 1550 | becomes 'ar' (Arabic). 1551 | - Format the result according to the conventions of BCP 47 1552 | 1553 | Macrolanguage replacement is not required by BCP 47, but it is required 1554 | by the Unicode CLDR. 
1555 | 1556 | >>> standardize_tag('en_US') 1557 | 'en-US' 1558 | 1559 | >>> standardize_tag('en-Latn') 1560 | 'en' 1561 | 1562 | >>> standardize_tag('en-uk') 1563 | 'en-GB' 1564 | 1565 | >>> standardize_tag('eng') 1566 | 'en' 1567 | 1568 | >>> standardize_tag('arb-Arab', macro=True) 1569 | 'ar' 1570 | 1571 | >>> standardize_tag('sh-QU') 1572 | 'sr-Latn-EU' 1573 | 1574 | >>> standardize_tag('sgn-US') 1575 | 'ase' 1576 | 1577 | >>> standardize_tag('zh-cmn-hans-cn') 1578 | 'zh-Hans-CN' 1579 | 1580 | >>> standardize_tag('zsm', macro=True) 1581 | 'ms' 1582 | 1583 | >>> standardize_tag('ja-latn-hepburn') 1584 | 'ja-Latn-hepburn' 1585 | 1586 | >>> standardize_tag('spa-latn-mx') 1587 | 'es-MX' 1588 | 1589 | If the tag can't be parsed according to BCP 47, this will raise a 1590 | LanguageTagError (a subclass of ValueError): 1591 | 1592 | >>> standardize_tag('spa-mx-latn') 1593 | Traceback (most recent call last): 1594 | ... 1595 | langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string. 1596 | """ 1597 | langdata = Language.get(tag, normalize=True) 1598 | if macro: 1599 | langdata = langdata.prefer_macrolanguage() 1600 | 1601 | return langdata.simplify_script().to_tag() 1602 | 1603 | 1604 | def tag_is_valid(tag: Union[str, Language]) -> bool: 1605 | """ 1606 | Determines whether a string is a valid language tag. This is similar to 1607 | Language.get(tag).is_valid(), but can return False in the case where 1608 | the tag doesn't parse. 1609 | 1610 | >>> tag_is_valid('ja') 1611 | True 1612 | >>> tag_is_valid('jp') 1613 | False 1614 | >>> tag_is_valid('spa-Latn-MX') 1615 | True 1616 | >>> tag_is_valid('spa-MX-Latn') 1617 | False 1618 | >>> tag_is_valid('') 1619 | False 1620 | >>> tag_is_valid('C.UTF-8') 1621 | False 1622 | """ 1623 | try: 1624 | langdata = Language.get(tag) 1625 | return langdata.is_valid() 1626 | except LanguageTagError: 1627 | return False 1628 | 1629 | 1630 | def tag_match_score( 1631 | desired: Union[str, Language], supported: Union[str, Language] 1632 | ) -> int: 1633 | """ 1634 | DEPRECATED: use .distance() instead, which uses newer data and is _lower_ 1635 | for better matching languages. 1636 | 1637 | Return a number from 0 to 100 indicating the strength of match between the 1638 | language the user desires, D, and a supported language, S. Higher numbers 1639 | are better. A reasonable cutoff for not messing with your users is to 1640 | only accept scores of 75 or more. 1641 | 1642 | A score of 100 means the languages are the same, possibly after normalizing 1643 | and filling in likely values. 1644 | """ 1645 | warnings.warn( 1646 | "tag_match_score is deprecated because it's based on deprecated CLDR info. " 1647 | "Use tag_distance instead, which is _lower_ for better matching languages. ", 1648 | DeprecationWarning, 1649 | ) 1650 | desired_ld = Language.get(desired) 1651 | supported_ld = Language.get(supported) 1652 | return desired_ld.match_score(supported_ld) 1653 | 1654 | 1655 | def tag_distance(desired: Union[str, Language], supported: Union[str, Language], ignore_script: bool = False) -> int: 1656 | """ 1657 | Tags that expand to the same thing when likely values are filled in get a 1658 | distance of 0. 
1659 | 1660 | >>> tag_distance('en', 'en') 1661 | 0 1662 | >>> tag_distance('en', 'en-US') 1663 | 0 1664 | >>> tag_distance('zh-Hant', 'zh-TW') 1665 | 0 1666 | >>> tag_distance('ru-Cyrl', 'ru') 1667 | 0 1668 | 1669 | As a specific example, Serbo-Croatian is a politically contentious idea, 1670 | but in CLDR, it's considered equivalent to Serbian in Latin characters. 1671 | 1672 | >>> tag_distance('sh', 'sr-Latn') 1673 | 0 1674 | 1675 | ... which is very similar to Croatian but sociopolitically not the same. 1676 | 1677 | >>> tag_distance('sh', 'hr') 1678 | 9 1679 | 1680 | Unicode reorganized its distinction between 'no' (Norwegian) and 'nb' 1681 | (Norwegian Bokmål) in 2021. 'no' is preferred in most contexts, and the more 1682 | specific 'nb' is a distance of 1 from it: 1683 | 1684 | >>> tag_distance('nb', 'no') 1685 | 1 1686 | 1687 | These distances can be asymmetrical: this data includes the fact that speakers 1688 | of Swiss German (gsw) know High German (de), but not at all the other way around. 1689 | 1690 | The difference seems a little bit extreme, but the asymmetry is certainly 1691 | there. And if your text is tagged as 'gsw', it must be that way for a 1692 | reason. 1693 | 1694 | >>> tag_distance('gsw', 'de') 1695 | 8 1696 | >>> tag_distance('de', 'gsw') 1697 | 84 1698 | 1699 | Unconnected languages get a distance of 80 to 134. 1700 | 1701 | >>> tag_distance('en', 'zh') 1702 | 134 1703 | >>> tag_distance('es', 'fr') 1704 | 84 1705 | >>> tag_distance('fr-CH', 'de-CH') 1706 | 80 1707 | 1708 | Different local variants of the same language get a distance from 3 to 5. 1709 | >>> tag_distance('zh-HK', 'zh-MO') # Chinese is similar in Hong Kong and Macao 1710 | 4 1711 | >>> tag_distance('en-AU', 'en-GB') # Australian English is similar to British English 1712 | 3 1713 | >>> tag_distance('en-IN', 'en-GB') # Indian English is also similar to British English 1714 | 3 1715 | >>> tag_distance('es-PE', 'es-419') # Peruvian Spanish is Latin American Spanish 1716 | 1 1717 | >>> tag_distance('es-419', 'es-PE') # but Latin American Spanish is not necessarily Peruvian 1718 | 4 1719 | >>> tag_distance('es-ES', 'es-419') # Spanish in Spain is further from Latin American Spanish 1720 | 5 1721 | >>> tag_distance('en-US', 'en-GB') # American and British English are somewhat different 1722 | 5 1723 | >>> tag_distance('es-MX', 'es-ES') # Mexican Spanish is different from Spanish Spanish 1724 | 5 1725 | >>> # European Portuguese is different from the most common form (Brazilian Portuguese) 1726 | >>> tag_distance('pt', 'pt-PT') 1727 | 5 1728 | 1729 | >>> # Serbian has two scripts, and people might prefer one but understand both 1730 | >>> tag_distance('sr-Latn', 'sr-Cyrl') 1731 | 5 1732 | 1733 | A distance of 10 is used for matching a specific language to its 1734 | more-commonly-used macrolanguage tag. 1735 | 1736 | >>> tag_distance('arz', 'ar') # Egyptian Arabic to Modern Standard Arabic 1737 | 10 1738 | >>> tag_distance('wuu', 'zh') # Wu Chinese to (Mandarin) Chinese 1739 | 10 1740 | 1741 | Higher distances can arrive due to particularly contentious differences in 1742 | the script for writing the language, where people who understand one script 1743 | can learn the other but may not be happy with it. This specifically applies 1744 | to Chinese. 
1745 | 1746 | >>> tag_distance('zh-TW', 'zh-CN') 1747 | 54 1748 | >>> tag_distance('zh-Hans', 'zh-Hant') 1749 | 54 1750 | >>> tag_distance('zh-CN', 'zh-HK') 1751 | 54 1752 | >>> tag_distance('zh-CN', 'zh-TW') 1753 | 54 1754 | >>> tag_distance('zh-Hant', 'zh-Hans') 1755 | 54 1756 | 1757 | This distance range also applies to the differences between Norwegian 1758 | Bokmål, Nynorsk, and Danish. 1759 | 1760 | >>> tag_distance('no', 'da') 1761 | 12 1762 | >>> tag_distance('no', 'nn') 1763 | 20 1764 | 1765 | Differences of 20 to 50 can represent substantially different languages, 1766 | in cases where speakers of the first may understand the second for demographic 1767 | reasons. 1768 | 1769 | >>> tag_distance('eu', 'es') # Basque to Spanish 1770 | 20 1771 | >>> tag_distance('af', 'nl') # Afrikaans to Dutch 1772 | 24 1773 | >>> tag_distance('mr', 'hi') # Marathi to Hindi 1774 | 30 1775 | >>> tag_distance('ms', 'id') # Malay to Indonesian 1776 | 34 1777 | >>> tag_distance('mg', 'fr') # Malagasy to French 1778 | 34 1779 | >>> tag_distance('ta', 'en') # Tamil to English 1780 | 44 1781 | 1782 | A complex example is the tag 'yue' for Cantonese. Written Chinese is usually 1783 | presumed to be Mandarin Chinese, but colloquial Cantonese can be written as 1784 | well. (Some things could not be written any other way, such as Cantonese 1785 | song lyrics.) 1786 | 1787 | The difference between Cantonese and Mandarin also implies script and 1788 | territory differences by default, adding to the distance. 1789 | 1790 | >>> tag_distance('yue', 'zh') 1791 | 64 1792 | 1793 | When the supported script is a different one than desired, this is usually 1794 | a major difference with score of 50 or more. 1795 | 1796 | >>> tag_distance('ja', 'ja-Latn-US-hepburn') 1797 | 54 1798 | 1799 | If `ignore_script` is used, the script difference is ignored and a smaller 1800 | difference with lower score will be found. 1801 | 1802 | >>> tag_distance('ja', 'ja-Latn-hepburn', ignore_script=True) 1803 | 0 1804 | 1805 | >>> # You can read the Shavian script, right? 1806 | >>> tag_distance('en', 'en-Shaw') 1807 | 54 1808 | """ 1809 | desired_obj = Language.get(desired) 1810 | supported_obj = Language.get(supported) 1811 | return desired_obj.distance(supported_obj, ignore_script) 1812 | 1813 | 1814 | def best_match( 1815 | desired_language: Union[str, Language], 1816 | supported_languages: Sequence[str], 1817 | min_score: int = 75, 1818 | ) -> Tuple[str, int]: 1819 | """ 1820 | DEPRECATED: use .closest_match() instead. This function emulates the old 1821 | matching behavior by subtracting the language distance from 100. 1822 | 1823 | You have software that supports any of the `supported_languages`. You want 1824 | to use `desired_language`. This function lets you choose the right language, 1825 | even if there isn't an exact match. 1826 | 1827 | Returns: 1828 | 1829 | - The best-matching language code, which will be one of the 1830 | `supported_languages` or 'und' 1831 | - The score of the match, from 0 to 100; higher is better. 1832 | 1833 | `min_score` sets the minimum match score. If all languages match with a lower 1834 | score than that, the result will be 'und' with a score of 0. 
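
        As an informal sketch (the distance comes from the `closest_match`
        doctests below), the score is simply 100 minus the distance:

            best_match('af', ['en', 'nl', 'zu'])   # -> ('nl', 76), i.e. distance 24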
1835 | """ 1836 | max_distance = 100 - min_score 1837 | supported, distance = closest_match( 1838 | desired_language, supported_languages, max_distance 1839 | ) 1840 | score = max(0, 100 - distance) 1841 | return supported, score 1842 | 1843 | 1844 | def closest_match( 1845 | desired_language: Union[str, Language], 1846 | supported_languages: Sequence[str], 1847 | max_distance: int = 25, 1848 | ignore_script: bool = False, 1849 | ) -> Tuple[str, int]: 1850 | """ 1851 | You have software that supports any of the `supported_languages`. You want 1852 | to use `desired_language`. This function lets you choose the right language, 1853 | even if there isn't an exact match. 1854 | 1855 | Returns: 1856 | 1857 | - The best-matching language code, which will be one of the 1858 | `supported_languages` or 'und' for no match 1859 | - The distance of the match, which is 0 for a perfect match and increases 1860 | from there (see `tag_distance`) 1861 | 1862 | `max_distance` sets the maximum match distance. If all matches are farther 1863 | than that, the result will be 'und' with a distance of 1000. The default 1864 | value is 25, and raising it can cause data to be processed in significantly 1865 | the wrong language. The documentation for `tag_distance` describes the 1866 | distance values in more detail. 1867 | 1868 | `ignore_script` makes the matching ignore scripts, allowing matches to be 1869 | found when they wouldn't otherwise be due to different scripts. 1870 | 1871 | When there is a tie for the best matching language, the first one in the 1872 | tie will be used. 1873 | 1874 | >>> closest_match('fr', ['de', 'en', 'fr']) 1875 | ('fr', 0) 1876 | 1877 | >>> closest_match('pt', ['pt-BR', 'pt-PT']) 1878 | ('pt-BR', 0) 1879 | 1880 | >>> closest_match('en-AU', ['en-GB', 'en-US']) 1881 | ('en-GB', 3) 1882 | 1883 | >>> closest_match('af', ['en', 'nl', 'zu']) 1884 | ('nl', 24) 1885 | 1886 | >>> closest_match('ja', ['ja-Latn-hepburn', 'en']) 1887 | ('und', 1000) 1888 | 1889 | >>> closest_match('ja', ['ja-Latn-hepburn', 'en'], ignore_script=True) 1890 | ('ja-Latn-hepburn', 0) 1891 | """ 1892 | desired_language = str(desired_language) 1893 | 1894 | # Quickly return if the desired language is directly supported 1895 | if desired_language in supported_languages: 1896 | return desired_language, 0 1897 | 1898 | # Reduce the desired language to a standard form that could also match 1899 | desired_language = standardize_tag(desired_language) 1900 | if desired_language in supported_languages: 1901 | return desired_language, 0 1902 | 1903 | match_distances = [ 1904 | (supported, tag_distance(desired_language, supported, ignore_script)) 1905 | for supported in supported_languages 1906 | ] 1907 | match_distances = [ 1908 | (supported, distance) 1909 | for (supported, distance) in match_distances 1910 | if distance <= max_distance 1911 | ] + [('und', 1000)] 1912 | 1913 | match_distances.sort(key=itemgetter(1)) 1914 | return match_distances[0] 1915 | 1916 | 1917 | def closest_supported_match( 1918 | desired_language: Union[str, Language], 1919 | supported_languages: Sequence[str], 1920 | max_distance: int = 25, 1921 | ) -> Optional[str]: 1922 | """ 1923 | Wraps `closest_match` with a simpler return type. Returns the language 1924 | tag of the closest match if there is one, or None if there is not. 
1925 | 1926 | >>> closest_supported_match('fr', ['de', 'en', 'fr']) 1927 | 'fr' 1928 | 1929 | >>> closest_supported_match('pt', ['pt-BR', 'pt-PT']) 1930 | 'pt-BR' 1931 | 1932 | >>> closest_supported_match('en-AU', ['en-GB', 'en-US']) 1933 | 'en-GB' 1934 | 1935 | >>> closest_supported_match('und', ['en', 'und']) 1936 | 'und' 1937 | 1938 | >>> closest_supported_match('af', ['en', 'nl', 'zu']) 1939 | 'nl' 1940 | 1941 | >>> print(closest_supported_match('af', ['en', 'nl', 'zu'], max_distance=10)) 1942 | None 1943 | """ 1944 | code, distance = closest_match(desired_language, supported_languages, max_distance) 1945 | if distance == 1000: 1946 | return None 1947 | else: 1948 | return code 1949 | -------------------------------------------------------------------------------- /langcodes/build_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import xml.etree.ElementTree as ET 3 | from langcodes.util import data_filename 4 | from langcodes.registry_parser import parse_registry 5 | 6 | 7 | def read_cldr_supplemental(dataname): 8 | cldr_supp_path = data_filename('cldr-json/cldr-json/cldr-core/supplemental') 9 | filename = data_filename(f'{cldr_supp_path}/{dataname}.json') 10 | fulldata = json.load(open(filename, encoding='utf-8')) 11 | if dataname == 'aliases': 12 | data = fulldata['supplemental']['metadata']['alias'] 13 | else: 14 | data = fulldata['supplemental'][dataname] 15 | return data 16 | 17 | 18 | def read_iana_registry_suppress_scripts(): 19 | scripts = {} 20 | for entry in parse_registry(): 21 | if entry['Type'] == 'language' and 'Suppress-Script' in entry: 22 | scripts[entry['Subtag']] = entry['Suppress-Script'] 23 | return scripts 24 | 25 | 26 | def read_iana_registry_scripts(): 27 | scripts = set() 28 | for entry in parse_registry(): 29 | if entry['Type'] == 'script': 30 | scripts.add(entry['Subtag']) 31 | return scripts 32 | 33 | 34 | def read_iana_registry_macrolanguages(): 35 | macros = {} 36 | for entry in parse_registry(): 37 | if entry['Type'] == 'language' and 'Macrolanguage' in entry: 38 | macros[entry['Subtag']] = entry['Macrolanguage'] 39 | return macros 40 | 41 | 42 | def read_iana_registry_replacements(): 43 | replacements = {} 44 | for entry in parse_registry(): 45 | if entry['Type'] == 'language' and 'Preferred-Value' in entry: 46 | # Replacements for language codes 47 | replacements[entry['Subtag']] = entry['Preferred-Value'] 48 | elif 'Tag' in entry and 'Preferred-Value' in entry: 49 | # Replacements for entire tags 50 | replacements[entry['Tag'].lower()] = entry['Preferred-Value'] 51 | return replacements 52 | 53 | 54 | def write_python_dict(outfile, name, d): 55 | print(f"{name} = {{", file=outfile) 56 | for key in sorted(d): 57 | value = d[key] 58 | print(f" {key!r}: {value!r},", file=outfile) 59 | print("}", file=outfile) 60 | 61 | 62 | def write_python_set(outfile, name, s): 63 | print(f"{name} = {{", file=outfile) 64 | for key in sorted(set(s)): 65 | print(f" {key!r},", file=outfile) 66 | print("}", file=outfile) 67 | 68 | 69 | GENERATED_HEADER = "# This file is generated by build_data.py." 
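

# An informal illustration of what the two writer helpers above emit. For
# example, write_python_dict(outfile, 'DEFAULT_SCRIPTS', {'en': 'Latn'}) would
# produce:
#
#     DEFAULT_SCRIPTS = {
#         'en': 'Latn',
#     }
#
# and write_python_set(outfile, 'ALL_SCRIPTS', {'Latn'}) would produce:
#
#     ALL_SCRIPTS = {
#         'Latn',
#     }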
70 | 71 | 72 | def read_validity_regex(): 73 | validity_options = [] 74 | for codetype in ('language', 'region', 'script', 'variant'): 75 | validity_path = data_filename(f'cldr/common/validity/{codetype}.xml') 76 | root = ET.fromstring(open(validity_path).read()) 77 | matches = root.findall('./idValidity/id') 78 | for match in matches: 79 | for item in match.text.strip().split(): 80 | if '~' in item: 81 | assert item[-2] == '~' 82 | prefix = item[:-3] 83 | range_start = item[-3] 84 | range_end = item[-1] 85 | option = f"{prefix}[{range_start}-{range_end}]" 86 | validity_options.append(option) 87 | else: 88 | validity_options.append(item) 89 | options = '|'.join(validity_options) 90 | return f'^({options})$' 91 | 92 | 93 | def read_language_distances(): 94 | language_info_path = data_filename('cldr/common/supplemental/languageInfo.xml') 95 | root = ET.fromstring(open(language_info_path).read()) 96 | matches = root.findall( 97 | './languageMatching/languageMatches[@type="written_new"]/languageMatch' 98 | ) 99 | tag_distances = {} 100 | for match in matches: 101 | attribs = match.attrib 102 | n_parts = attribs['desired'].count('_') + 1 103 | if n_parts < 3: 104 | if attribs.get('oneway') == 'true': 105 | pairs = [(attribs['desired'], attribs['supported'])] 106 | else: 107 | pairs = [ 108 | (attribs['desired'], attribs['supported']), 109 | (attribs['supported'], attribs['desired']), 110 | ] 111 | for (desired, supported) in pairs: 112 | desired_distance = tag_distances.setdefault(desired, {}) 113 | desired_distance[supported] = int(attribs['distance']) 114 | 115 | # The 'languageInfo' data file contains distances for the unnormalized 116 | # tag 'sh', but we work mostly with normalized tags, and they don't 117 | # describe at all how to cope with this. 118 | # 119 | # 'sh' normalizes to 'sr-Latn', and when we're matching languages we 120 | # aren't matching scripts yet, so when 'sh' appears we'll add a 121 | # corresponding match for 'sr'. 122 | # 123 | # Then because we're kind of making this plan up, add 1 to the distance 124 | # so it's a worse match than ones that are actually clearly defined 125 | # in languageInfo. 126 | if desired == 'sh' or supported == 'sh': 127 | if desired == 'sh': 128 | desired = 'sr' 129 | if supported == 'sh': 130 | supported = 'sr' 131 | if desired != supported: 132 | # don't try to define a non-zero distance for sr <=> sr 133 | desired_distance = tag_distances.setdefault(desired, {}) 134 | desired_distance[supported] = int(attribs['distance']) + 1 135 | 136 | return tag_distances 137 | 138 | 139 | def build_data(): 140 | lang_scripts = read_iana_registry_suppress_scripts() 141 | all_scripts = read_iana_registry_scripts() 142 | macrolanguages = read_iana_registry_macrolanguages() 143 | iana_replacements = read_iana_registry_replacements() 144 | language_distances = read_language_distances() 145 | 146 | alias_data = read_cldr_supplemental('aliases') 147 | likely_subtags = read_cldr_supplemental('likelySubtags') 148 | replacements = {} 149 | 150 | # Aliased codes can still have alpha3 codes, and there's no unified source 151 | # about what they are. It depends on whether the alias predates or postdates 152 | # ISO 639-2, which nobody should have to care about. So let's set all the 153 | # alpha3 codes for aliased alpha2 codes here. 
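    # (Added note: each key in the dict below is an aliased two-letter code,
    # and each value is the three-letter (alpha3) code of the language that
    # alias refers to.)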
154 | alpha3_mapping = { 155 | 'tl': 'tgl', # even though it normalizes to 'fil' 156 | 'in': 'ind', 157 | 'iw': 'heb', 158 | 'ji': 'yid', 159 | 'jw': 'jav', 160 | 'sh': 'hbs', 161 | } 162 | alpha3_biblio = {} 163 | norm_macrolanguages = {} 164 | for alias_type in ['languageAlias', 'scriptAlias', 'territoryAlias']: 165 | aliases = alias_data[alias_type] 166 | # Initially populate 'languageAlias' with the aliases from the IANA file 167 | if alias_type == 'languageAlias': 168 | replacements[alias_type] = iana_replacements 169 | replacements[alias_type]['root'] = 'und' 170 | else: 171 | replacements[alias_type] = {} 172 | for code, value in aliases.items(): 173 | # Make all keys lowercase so they can be looked up 174 | # case-insensitively 175 | code = code.lower() 176 | 177 | # If there are multiple replacements, take the first one. For example, 178 | # we just replace the Soviet Union (SU) with Russia (RU), instead of 179 | # trying to do something context-sensitive and poorly standardized 180 | # that selects one of the successor countries to the Soviet Union. 181 | replacement = value['_replacement'].split()[0] 182 | if value['_reason'] == 'macrolanguage': 183 | norm_macrolanguages[code] = replacement 184 | else: 185 | # CLDR tries to oversimplify some codes as it assigns aliases. 186 | # For example, 'nor' is the ISO alpha3 code for 'no', but CLDR 187 | # would prefer you use 'nb' over 'no', so it makes 'nor' an 188 | # alias of 'nb'. But 'nb' already has an alpha3 code, 'nob'. 189 | # 190 | # We undo this oversimplification so that we can get a 191 | # canonical mapping between alpha2 and alpha3 codes. 192 | if code == 'nor': 193 | replacement = 'no' 194 | elif code == 'mol': 195 | replacement = 'mo' 196 | elif code == 'twi': 197 | replacement = 'tw' 198 | elif code == 'bih': 199 | replacement = 'bh' 200 | 201 | replacements[alias_type][code] = replacement 202 | if alias_type == 'languageAlias': 203 | if value['_reason'] == 'overlong': 204 | if replacement in alpha3_mapping: 205 | raise ValueError( 206 | "{code!r} is an alpha3 for {replacement!r}, which" 207 | " already has an alpha3: {orig!r}".format( 208 | code=code, 209 | replacement=replacement, 210 | orig=alpha3_mapping[replacement], 211 | ) 212 | ) 213 | alpha3_mapping[replacement] = code 214 | elif value['_reason'] == 'bibliographic': 215 | alpha3_biblio[replacement] = code 216 | 217 | validity_regex = read_validity_regex() 218 | 219 | # Write the contents of data_dicts.py. 
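    # (Added note: the output path below is relative, so this script is
    # presumably run from inside the langcodes/ package directory, where the
    # generated data_dicts.py lives next to build_data.py.)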
220 | with open('data_dicts.py', 'w', encoding='utf-8') as outfile: 221 | print(GENERATED_HEADER, file=outfile) 222 | print("import re\n", file=outfile) 223 | write_python_dict(outfile, 'DEFAULT_SCRIPTS', lang_scripts) 224 | write_python_dict( 225 | outfile, 'LANGUAGE_REPLACEMENTS', replacements['languageAlias'] 226 | ) 227 | write_python_dict(outfile, 'LANGUAGE_ALPHA3', alpha3_mapping) 228 | write_python_dict(outfile, 'LANGUAGE_ALPHA3_BIBLIOGRAPHIC', alpha3_biblio) 229 | write_python_dict(outfile, 'SCRIPT_REPLACEMENTS', replacements['scriptAlias']) 230 | write_python_set(outfile, 'ALL_SCRIPTS', all_scripts) 231 | write_python_dict( 232 | outfile, 'TERRITORY_REPLACEMENTS', replacements['territoryAlias'] 233 | ) 234 | write_python_dict(outfile, 'MACROLANGUAGES', macrolanguages) 235 | write_python_dict(outfile, 'NORMALIZED_MACROLANGUAGES', norm_macrolanguages) 236 | write_python_dict(outfile, 'LIKELY_SUBTAGS', likely_subtags) 237 | write_python_dict(outfile, 'LANGUAGE_DISTANCES', language_distances) 238 | print(f"VALIDITY = re.compile({validity_regex!r})", file=outfile) 239 | 240 | 241 | if __name__ == '__main__': 242 | build_data() 243 | -------------------------------------------------------------------------------- /langcodes/language_distance.py: -------------------------------------------------------------------------------- 1 | from .data_dicts import LANGUAGE_DISTANCES 2 | from typing import Dict, Tuple 3 | 4 | 5 | TagTriple = Tuple[str, str, str] 6 | _DISTANCE_CACHE: Dict[Tuple[TagTriple, TagTriple], int] = {} 7 | DEFAULT_LANGUAGE_DISTANCE = LANGUAGE_DISTANCES["*"]["*"] 8 | DEFAULT_SCRIPT_DISTANCE = LANGUAGE_DISTANCES["*_*"]["*_*"] 9 | DEFAULT_TERRITORY_DISTANCE = 4 10 | 11 | 12 | # Territory clusters used in territory matching: 13 | # Maghreb (the western Arab world) 14 | MAGHREB = {"MA", "DZ", "TN", "LY", "MR", "EH"} 15 | 16 | # United States and its territories 17 | US = {"AS", "GU", "MH", "MP", "PR", "UM", "US", "VI"} 18 | 19 | # Special Autonomous Regions of China 20 | CNSAR = {"HK", "MO"} 21 | 22 | LATIN_AMERICA = { 23 | "419", 24 | # Central America 25 | "013", 26 | "BZ", 27 | "CR", 28 | "SV", 29 | "GT", 30 | "HN", 31 | "MX", 32 | "NI", 33 | "PA", 34 | # South America 35 | "005", 36 | "AR", 37 | "BO", 38 | "BR", 39 | "CL", 40 | "CO", 41 | "EC", 42 | "FK", 43 | "GF", 44 | "GY", 45 | "PY", 46 | "PE", 47 | "SR", 48 | "UY", 49 | "VE", 50 | } 51 | 52 | # North and South America 53 | AMERICAS = { 54 | "019", 55 | # Caribbean 56 | "029", 57 | "AI", 58 | "AG", 59 | "AW", 60 | "BS", 61 | "BB", 62 | "VG", 63 | "BQ", 64 | "KY", 65 | "CU", 66 | "CW", 67 | "DM", 68 | "DO", 69 | "GD", 70 | "GP", 71 | "HT", 72 | "JM", 73 | "MQ", 74 | "MS", 75 | "PR", 76 | "SX", 77 | "BL", 78 | "KN", 79 | "LC", 80 | "MF", 81 | "VC", 82 | "TT", 83 | "TC", 84 | "VI", 85 | # Northern America 86 | "021", 87 | "BM", 88 | "CA", 89 | "GL", 90 | "PM", 91 | "US", 92 | # North America as a whole 93 | "003", 94 | } | LATIN_AMERICA 95 | 96 | 97 | def tuple_distance_cached(desired: TagTriple, supported: TagTriple) -> int: 98 | """ 99 | Takes in triples of (language, script, territory), which can be derived by 100 | 'maximizing' a language tag. Returns a number from 0 to 135 indicating the 101 | 'distance' between these for the purposes of language matching. 102 | """ 103 | # First of all, if these are identical, return quickly: 104 | if supported == desired: 105 | return 0 106 | 107 | # If we've already figured it out, return the cached distance. 
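    # (Added note: the cache checked below is a plain module-level dict keyed
    # by the (desired, supported) pair of triples; it is never evicted, which
    # is fine in practice because applications typically compare a small,
    # fixed set of tags.)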
108 | if (desired, supported) in _DISTANCE_CACHE: 109 | return _DISTANCE_CACHE[desired, supported] 110 | else: 111 | result = _tuple_distance(desired, supported) 112 | _DISTANCE_CACHE[desired, supported] = result 113 | return result 114 | 115 | 116 | def _get2(dictionary: dict, key1: str, key2: str, default): 117 | return dictionary.get(key1, {}).get(key2, default) 118 | 119 | 120 | def _tuple_distance(desired: TagTriple, supported: TagTriple) -> int: 121 | desired_language, desired_script, desired_territory = desired 122 | supported_language, supported_script, supported_territory = supported 123 | distance = 0 124 | 125 | if desired_language != supported_language: 126 | distance += _get2( 127 | LANGUAGE_DISTANCES, 128 | desired_language, 129 | supported_language, 130 | DEFAULT_LANGUAGE_DISTANCE, 131 | ) 132 | 133 | desired_script_pair = f"{desired_language}_{desired_script}" 134 | supported_script_pair = f"{supported_language}_{supported_script}" 135 | 136 | if desired_script != supported_script: 137 | # Scripts can match other scripts, but only when paired with a 138 | # language. For example, there is no reason to assume someone who can 139 | # read 'Latn' can read 'Cyrl', but there is plenty of reason to believe 140 | # someone who can read 'sr-Latn' can read 'sr-Cyrl' because Serbian is 141 | # a language written in two scripts. 142 | distance += _get2( 143 | LANGUAGE_DISTANCES, 144 | desired_script_pair, 145 | supported_script_pair, 146 | DEFAULT_SCRIPT_DISTANCE, 147 | ) 148 | 149 | if desired_territory != supported_territory: 150 | # The rules for matching territories are too weird to implement the 151 | # general case efficiently. Instead of implementing all the possible 152 | # match rules the XML could define, instead we just reimplement the 153 | # rules of CLDR 36.1 here in code. 154 | 155 | tdist = DEFAULT_TERRITORY_DISTANCE 156 | if desired_script_pair == supported_script_pair: 157 | if desired_language == "ar": 158 | if (desired_territory in MAGHREB) != (supported_territory in MAGHREB): 159 | tdist = 5 160 | elif desired_language == "en": 161 | if (desired_territory == "GB") and (supported_territory not in US): 162 | tdist = 3 163 | elif (desired_territory not in US) and (supported_territory == "GB"): 164 | tdist = 3 165 | elif (desired_territory in US) != (supported_territory in US): 166 | tdist = 5 167 | # This is not a rule that's spelled out in CLDR, but is implied by things 168 | # about territory containment mentioned in other standards. Numeric values 169 | # for territories, like '003', represent broad regions that contain more 170 | # specific territories. 171 | # 172 | # 419 is the numeric value most often seen in language codes, particularly 173 | # 'es-419' for Latin American Spanish. If you have a language code that 174 | # differs only in that its territory is more specific, like 'es-PY', it should 175 | # be closer to a supported 'es-419' than anything with a territory difference. 176 | # 177 | # We can implement this for 419 without becoming responsible for keeping up 178 | # with which countries/territories/regions contain others in the general case. 
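            # Worked example (hypothetical input, not a CLDR rule quote): a
            # desired ('es', 'Latn', 'PY') against a supported
            # ('es', 'Latn', '419') reaches the branch below and gets a
            # territory distance of 1 instead of the default 4.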
179 | elif desired_territory in LATIN_AMERICA and supported_territory == "419": 180 | tdist = 1 181 | elif desired_language == "es" or desired_language == "pt": 182 | if (desired_territory in AMERICAS) != (supported_territory in AMERICAS): 183 | tdist = 5 184 | elif desired_script_pair == "zh_Hant": 185 | if (desired_territory in CNSAR) != (supported_territory in CNSAR): 186 | tdist = 5 187 | distance += tdist 188 | return distance 189 | -------------------------------------------------------------------------------- /langcodes/language_lists.py: -------------------------------------------------------------------------------- 1 | # This is the list of language codes with the 'modern' level of support in CLDR 2 | # (compared to 'full', which contains many more languages). We use this as the 3 | # list of languages that we store specific name-to-code mappings for. 4 | 5 | CLDR_LANGUAGES = { 6 | 'af', 7 | 'am', 8 | 'ar', 9 | 'az', 10 | 'be', 11 | 'bg', 12 | 'bn', 13 | 'bs', 14 | 'ca', 15 | 'cs', 16 | 'cy', 17 | 'da', 18 | 'de', 19 | 'el', 20 | 'en', 21 | 'es', 22 | 'et', 23 | 'eu', 24 | 'fa', 25 | 'fi', 26 | 'fil', 27 | 'fo', 28 | 'fr', 29 | 'ga', 30 | 'gl', 31 | 'gu', 32 | 'he', 33 | 'hi', 34 | 'hr', 35 | 'hu', 36 | 'hy', 37 | 'id', 38 | 'is', 39 | 'it', 40 | 'ja', 41 | 'ka', 42 | 'kk', 43 | 'km', 44 | 'kn', 45 | 'ko', 46 | 'ky', 47 | 'lo', 48 | 'lt', 49 | 'lv', 50 | 'mk', 51 | 'ml', 52 | 'mn', 53 | 'mr', 54 | 'ms', 55 | 'my', 56 | 'nb', 57 | 'ne', 58 | 'nl', 59 | 'pa', 60 | 'pl', 61 | 'pt', 62 | 'ro', 63 | 'ru', 64 | 'si', 65 | 'sk', 66 | 'sl', 67 | 'sq', 68 | 'sr', 69 | 'sv', 70 | 'sw', 71 | 'ta', 72 | 'te', 73 | 'th', 74 | 'ti', 75 | 'to', 76 | 'tr', 77 | 'uk', 78 | 'und', 79 | 'ur', 80 | 'uz', 81 | 'vi', 82 | 'yue', 83 | 'zh', 84 | 'zu', 85 | } 86 | 87 | 88 | # These are the names languages that have the most entries on the English and 89 | # German Wiktionaries. Wiktionary only consistently identifies languages by their 90 | # name, making it important to be able to recognize the names. 91 | # 92 | # These lists of names are used in `tests/test_wikt_languages.py`. 
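#
# As a quick, hedged sketch of the lookup these names are meant to support
# (using the package's public name search; 'French' is just an arbitrary entry
# from the English list):
#
#     >>> import langcodes
#     >>> str(langcodes.find('French'))
#     'fr'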
93 | WIKT_LANGUAGE_NAMES = {} 94 | 95 | WIKT_LANGUAGE_NAMES['en'] = [ 96 | "Spanish", 97 | "French", 98 | "Latvian", 99 | "Latin", 100 | "English", 101 | "Mandarin", 102 | "Italian", 103 | "Portuguese", 104 | "Cantonese", 105 | "Japanese", 106 | "German", 107 | "Swedish", 108 | "Korean", 109 | "Serbo-Croatian", 110 | "Serbian", 111 | "Croatian", 112 | "Bosnian", 113 | "Finnish", 114 | "Vietnamese", 115 | "Dutch", 116 | "Galician", 117 | "Catalan", 118 | "Polish", 119 | "Danish", 120 | "Norwegian Nynorsk", 121 | "Turkish", 122 | "Romanian", 123 | "Lithuanian", 124 | "Ido", 125 | "Old French", 126 | "Czech", 127 | "Norwegian", 128 | # Jèrriais -- same as Norman 129 | "Esperanto", 130 | "Icelandic", 131 | # Old Armenian 132 | "Norwegian Bokmål", 133 | "Asturian", 134 | "Hungarian", 135 | "Proto-Germanic", 136 | "Russian", 137 | "Slovene", 138 | "Min Nan", 139 | "Scottish Gaelic", 140 | "Greek", 141 | "Irish", 142 | "Lojban", 143 | "Middle French", 144 | "Malay", 145 | "Luxembourgish", 146 | "Slovak", 147 | "Estonian", 148 | "Persian", 149 | "Venetian", 150 | "Old English", 151 | "Volapük", 152 | "Ladin", 153 | "Faroese", 154 | "Scots", 155 | "Interlingua", 156 | "Romansch", 157 | "Urdu", 158 | # Middle Chinese 159 | "Indonesian", 160 | "Swahili", 161 | "Middle English", 162 | "Occitan", 163 | "Welsh", 164 | "Old Norse", 165 | "Albanian", 166 | "Old Irish", 167 | "Old Saxon", 168 | "Lower Sorbian", 169 | "Afrikaans", 170 | "Ukrainian", 171 | "Proto-Slavic", 172 | "Ancient Greek", 173 | "Gothic", 174 | "Hawaiian", 175 | "Kurdish", 176 | "Tagalog", 177 | "Old High German", 178 | "Crimean Tatar", 179 | "Manx", 180 | "Sanskrit", 181 | "Hiligaynon", 182 | "West Frisian", 183 | "Hebrew", 184 | "Tok Pisin", 185 | "Proto-Indo-European", 186 | "Macedonian", 187 | "Novial", 188 | "Armenian", 189 | "Arabic", 190 | "Maltese", 191 | "Hakka", 192 | "Sicilian", 193 | # "Ladino", -- same as Ladin 194 | "Basque", 195 | "Breton", 196 | # Guernésiais -- same as Norman 197 | "Vai", 198 | "Navajo", 199 | "Azeri", 200 | "Vilamovian", 201 | # Tarantino 202 | "Maori", 203 | "Friulian", 204 | "Hausa", 205 | "Haitian Creole", 206 | "Yiddish", 207 | "Tatar", 208 | "Proto-Malayo-Polynesian", 209 | "Aromanian", 210 | "Ottoman Turkish", 211 | "Old Provençal", 212 | "Northern Sami", 213 | "Dalmatian", 214 | "Bulgarian", 215 | "Neapolitan", 216 | "Cornish", 217 | "Middle Dutch", 218 | "Rapa Nui", 219 | # Old Portuguese 220 | "Egyptian Arabic", 221 | "Romani", 222 | "Tahitian", 223 | "Thai", 224 | "Limburgish", 225 | "Karelian", 226 | "Tajik", 227 | "Turkmen", 228 | "Kabardian", 229 | "Uzbek", 230 | "Samoan", 231 | "Mongolian", 232 | "Zulu", 233 | "Upper Sorbian", 234 | "Walloon", 235 | # Proto-Finnic 236 | "Frankish", 237 | "Mapudungun", 238 | "Pashto", 239 | "Low German", 240 | "Bashkir", 241 | "Kashubian", 242 | "Sranan Tongo", 243 | "Proto-Sino-Tibetan", 244 | "Norman", 245 | "Proto-Austronesian", 246 | "Marathi", 247 | "Rohingya", 248 | "Classical Nahuatl", 249 | # Proto-Malayic 250 | # German Low German 251 | "Fijian", 252 | "Zazaki", 253 | "Proto-Italic", 254 | "Old Dutch", 255 | "Egyptian", 256 | "Old Frisian", 257 | "Greenlandic", 258 | "Burmese", 259 | "Votic", 260 | "Ewe", 261 | "Cherokee", 262 | "Old Church Slavonic", 263 | "Quechua", 264 | "Mirandese", 265 | "Livonian", 266 | "Bengali", 267 | "Skolt Sami", 268 | # Proto-Balto-Slavic 269 | "Pitjantjatjara", 270 | "Georgian", 271 | "North Frisian", 272 | "Tetum", 273 | "Tongan", 274 | # Mauritian Creole 275 | "Torres Strait Creole", 276 | "Papiamentu", 277 | "Lao", 
278 | "Malagasy", 279 | "Interlingue", 280 | "Aragonese", 281 | "Istriot", 282 | "Sumerian", 283 | "Proto-Celtic", 284 | "Võro", 285 | # Proto-Polynesian 286 | "Nepali", 287 | "Chickasaw", 288 | "Akkadian", 289 | "Middle Armenian", 290 | "Cimbrian", 291 | "Somali", 292 | "Sardinian", 293 | "Tocharian B", 294 | "Telugu", 295 | "Javanese", 296 | "Taos", 297 | "Proto-Semitic", 298 | # Old Prussian 299 | "Kyrgyz", 300 | "Corsican", 301 | "Veps", 302 | "Baluchi", 303 | "Middle Low German", 304 | "Middle High German", 305 | "Uyghur", 306 | # Dutch Low Saxon 307 | "Belarusian", 308 | "Guaraní", 309 | "Undetermined", 310 | "Inuktitut", 311 | "Tocharian A", 312 | "Nigerian Pidgin", 313 | # Gallo 314 | # Saterland Frisian 315 | "Punjabi", 316 | "Proto-Algonquian", 317 | # Istro-Romanian 318 | "Wiradhuri", 319 | "Sichuan Yi", 320 | "Wu", 321 | # White Hmong 322 | "Ugaritic", 323 | "Sundanese", 324 | # Old East Slavic 325 | # Fala 326 | # Elfdalian 327 | "Tamil", 328 | "Pijin", 329 | "Okinawan", 330 | "Kazakh", 331 | "Hindi", 332 | "Tuvan", 333 | "Polabian", 334 | "Aramaic", 335 | "Malayalam", 336 | "Kumyk", 337 | "Inari Sami", 338 | "Ilocano", 339 | "Tswana", 340 | "Libyan Arabic", 341 | "Latgalian", 342 | "Yakut", 343 | "Sindhi", 344 | "Khmer", 345 | "Gamilaraay", 346 | "Ojibwe", 347 | "Choctaw", 348 | "Chinese", 349 | "Chamorro", 350 | "Yucatec Maya", 351 | "Picard", 352 | "Ngarrindjeri", 353 | "Kott", 354 | "Ingrian", 355 | # Crimean Gothic 356 | "Chamicuro", 357 | "Rajasthani", 358 | # Old Tupi 359 | "Old Spanish", 360 | "Gagauz", 361 | "Extremaduran", 362 | "Chinook Jargon", 363 | "Cahuilla", 364 | "Kannada", 365 | "Iban", 366 | "American Sign Language", 367 | "Adyghe", 368 | "Warlpiri", 369 | "Tibetan", 370 | "Ossetian", 371 | "Meriam", 372 | "Marshallese", 373 | "Khakas", 374 | "Balinese", 375 | "Zhuang", 376 | "Tuvaluan", 377 | "Niuean", 378 | "Martuthunira", 379 | "Guugu Yimidhirr", 380 | "Chechen", 381 | "Campidanese Sardinian", 382 | "Tolai", 383 | # Old Javanese 384 | "Nahuatl", 385 | "Lombard", 386 | "West Coast Bajau", 387 | "Romagnol", 388 | "Middle Irish", 389 | "Yoruba", 390 | "Wangaaybuwan-Ngiyambaa", 391 | # Old Swedish 392 | "Lingala", 393 | "Fiji Hindi", 394 | "Shabo", 395 | "Sasak", 396 | "Judeo-Arabic", 397 | "Central Kurdish", 398 | "Bislama", 399 | ] 400 | 401 | WIKT_LANGUAGE_NAMES['de'] = [ 402 | "Deutsch", 403 | "Englisch", 404 | "Polnisch", 405 | "Italienisch", 406 | "Französisch", 407 | "Esperanto", 408 | "Schwedisch", 409 | "Lateinisch", 410 | "Tschechisch", 411 | "Katalanisch", 412 | "Spanisch", 413 | "Okzitanisch", 414 | "Ungarisch", 415 | "Litauisch", 416 | "Finnisch", 417 | "Russisch", 418 | "Altgriechisch", 419 | "Niederländisch", 420 | "Kurdisch", 421 | "Baskisch", 422 | "Armenisch", 423 | "Isländisch", 424 | "Bulgarisch", 425 | "Färöisch", 426 | "Dänisch", 427 | "Portugiesisch", 428 | "Slowakisch", 429 | "Türkisch", 430 | "Maori", 431 | "Albanisch", 432 | "Japanisch", 433 | "Norwegisch", 434 | "Irisch", 435 | "Koreanisch", 436 | "Chinesisch", 437 | "Venezianisch", 438 | "Friaulisch", 439 | "Serbisch", 440 | "Indonesisch", 441 | "Walisisch", 442 | "Arabisch", 443 | "Zentral-Nahuatl", 444 | "Neugriechisch", 445 | "Sumerisch", 446 | "Obersorbisch", 447 | "Sesotho", 448 | "Rumänisch", 449 | "Suaheli", 450 | "Persisch", 451 | "Krimtatarisch", 452 | "Plattdeutsch", 453 | "Prußisch", 454 | "Thai", 455 | "Bosnisch", 456 | "Sardisch", 457 | "Maltesisch", 458 | "Akkadisch", 459 | "Hawaiianisch", 460 | "Hebräisch", 461 | "Gotisch", 462 | "Afrikaans", 463 | "Rätoromanisch", 
464 | "Tamil", 465 | "Bretonisch", 466 | "Ukrainisch", 467 | "Hindi", 468 | "Georgisch", 469 | "Panjabi", 470 | "Papiamentu", 471 | "Slowenisch", 472 | "Nauruisch", 473 | "Schottisch-Gälisch", 474 | "Balinesisch", 475 | "Estnisch", 476 | "Manx", 477 | "Korsisch", 478 | # "Frühneuhochdeutsch", 479 | "Lettisch", 480 | "isiZulu", 481 | "Tagalog", 482 | "Tok Pisin", 483 | # "Südpikenisch", 484 | "Kroatisch", 485 | "Niedersorbisch", 486 | "Kannada", 487 | "Guanche", 488 | "Belarussisch", 489 | "Sanskrit", 490 | "Aserbaidschanisch", 491 | "Mittelhochdeutsch", 492 | "Laotisch", 493 | "Altnordisch", 494 | "Altenglisch", 495 | "Vietnamesisch", 496 | "Tadschikisch", 497 | "Samoanisch", 498 | "Mazedonisch", 499 | "Luxemburgisch", 500 | "Hethitisch", 501 | # "Yukatekisch", 502 | "Kaschubisch", 503 | "Wallonisch", 504 | # "Klassisches Nahuatl", 505 | "Telugu", 506 | "Rapanui", 507 | "Jiddisch", 508 | "Ido", 509 | # "Galicisch", 510 | "Volapük", 511 | "Bengalisch", 512 | "Mapudungun", 513 | "Lojban", 514 | "Tuvaluisch", 515 | "Gujarati", 516 | "Assamesisch", 517 | ] 518 | -------------------------------------------------------------------------------- /langcodes/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rspeer/langcodes/0aebfa862ed86d820d0c96ce311ef661cf0a798a/langcodes/py.typed -------------------------------------------------------------------------------- /langcodes/registry_parser.py: -------------------------------------------------------------------------------- 1 | from langcodes.util import data_filename 2 | 3 | LIST_KEYS = {'Description', 'Prefix'} 4 | 5 | 6 | def parse_file(file): 7 | """ 8 | Take an open file containing the IANA subtag registry, and yield a 9 | dictionary of information for each subtag it describes. 10 | """ 11 | lines = [] 12 | for line in file: 13 | line = line.rstrip('\n') 14 | if line == '%%': 15 | # This is a separator between items. Parse the data we've 16 | # collected and yield the result. 17 | yield from parse_item(lines) 18 | lines.clear() 19 | elif line.startswith(' '): 20 | # This is a continuation line. Concatenate it to the previous 21 | # line, including one of the spaces. 22 | lines[-1] += line[1:] 23 | else: 24 | lines.append(line) 25 | yield from parse_item(lines) 26 | 27 | 28 | def parse_item(lines): 29 | """ 30 | Given the lines that form a subtag entry (after joining wrapped lines 31 | back together), parse the data they contain. 32 | 33 | Returns a generator that yields once if there was any data there 34 | (and an empty generator if this was just the header). 35 | """ 36 | info = {} 37 | for line in lines: 38 | key, value = line.split(': ', 1) 39 | if key in LIST_KEYS: 40 | info.setdefault(key, []).append(value) 41 | else: 42 | assert key not in info 43 | info[key] = value 44 | 45 | if 'Subtag' in info or 'Tag' in info: 46 | yield info 47 | 48 | 49 | def parse_registry(): 50 | """ 51 | Yield a sequence of dictionaries, containing the info in the included 52 | IANA subtag registry file. 53 | """ 54 | with open( 55 | data_filename('language-subtag-registry.txt'), encoding='utf-8' 56 | ) as data_file: 57 | # 'yield from' instead of returning, so that we only close the file 58 | # when finished. 
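        # Each yielded item is a plain dict of the registry's fields. For a
        # language subtag it looks roughly like (abridged):
        #     {'Type': 'language', 'Subtag': 'fr', 'Description': ['French'],
        #      'Suppress-Script': 'Latn', ...}
        # with 'Description' (and 'Prefix') always collected into lists.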
59 | yield from parse_file(data_file) 60 | -------------------------------------------------------------------------------- /langcodes/tag_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements a parser for language tags, according to the RFC 5646 3 | (BCP 47) standard. 4 | 5 | Here, we're only concerned with the syntax of the language tag. Looking up 6 | what they actually mean in a data file is a separate step. 7 | 8 | For a full description of the syntax of a language tag, see page 3 of 9 | http://tools.ietf.org/html/bcp47 10 | 11 | >>> parse_tag('en') 12 | [('language', 'en')] 13 | 14 | >>> parse_tag('en_US') 15 | [('language', 'en'), ('territory', 'US')] 16 | 17 | >>> parse_tag('en-Latn') 18 | [('language', 'en'), ('script', 'Latn')] 19 | 20 | >>> parse_tag('es-419') 21 | [('language', 'es'), ('territory', '419')] 22 | 23 | >>> parse_tag('zh-hant-tw') 24 | [('language', 'zh'), ('script', 'Hant'), ('territory', 'TW')] 25 | 26 | >>> parse_tag('zh-tw-hant') 27 | Traceback (most recent call last): 28 | ... 29 | langcodes.tag_parser.LanguageTagError: This script subtag, 'hant', is out of place. Expected variant, extension, or end of string. 30 | 31 | >>> parse_tag('de-DE-1901') 32 | [('language', 'de'), ('territory', 'DE'), ('variant', '1901')] 33 | 34 | >>> parse_tag('ja-latn-hepburn') 35 | [('language', 'ja'), ('script', 'Latn'), ('variant', 'hepburn')] 36 | 37 | >>> parse_tag('ja-hepburn-latn') 38 | Traceback (most recent call last): 39 | ... 40 | langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string. 41 | 42 | >>> parse_tag('zh-yue') 43 | [('language', 'zh'), ('extlang', 'yue')] 44 | 45 | >>> parse_tag('zh-yue-Hant') 46 | [('language', 'zh'), ('extlang', 'yue'), ('script', 'Hant')] 47 | 48 | >>> parse_tag('zh-min-nan') 49 | [('grandfathered', 'zh-min-nan')] 50 | 51 | >>> parse_tag('x-dothraki') 52 | [('language', 'x-dothraki')] 53 | 54 | >>> parse_tag('en-u-co-backward-x-pig-latin') 55 | [('language', 'en'), ('extension', 'u-co-backward'), ('private', 'x-pig-latin')] 56 | 57 | >>> parse_tag('en-x-pig-latin-u-co-backward') 58 | [('language', 'en'), ('private', 'x-pig-latin-u-co-backward')] 59 | 60 | >>> parse_tag('u-co-backward') 61 | Traceback (most recent call last): 62 | ... 63 | langcodes.tag_parser.LanguageTagError: Expected a language code, got 'u' 64 | 65 | >>> parse_tag('x-') 66 | Traceback (most recent call last): 67 | ... 68 | langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '' 69 | 70 | >>> parse_tag('und-u-') 71 | Traceback (most recent call last): 72 | ... 73 | langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '' 74 | 75 | >>> parse_tag('und-0-foo') 76 | [('language', 'und'), ('extension', '0-foo')] 77 | 78 | >>> parse_tag('und-?-foo') 79 | Traceback (most recent call last): 80 | ... 81 | langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '?' 82 | 83 | >>> parse_tag('und-x-123456789') 84 | Traceback (most recent call last): 85 | ... 86 | langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '123456789' 87 | 88 | >>> parse_tag('en-a-b-foo') 89 | Traceback (most recent call last): 90 | ... 91 | langcodes.tag_parser.LanguageTagError: Tag extensions may not contain two singletons in a row 92 | 93 | >>> parse_tag('ar-٠٠١') 94 | Traceback (most recent call last): 95 | ... 
96 | langcodes.tag_parser.LanguageTagError: Language tags must be made of ASCII characters 97 | """ 98 | 99 | # These tags should not be parsed by the usual parser; they're grandfathered 100 | # in from RFC 3066. The 'irregular' ones don't fit the syntax at all; the 101 | # 'regular' ones do, but would give meaningless results when parsed. 102 | # 103 | # These are all lowercased so they can be matched case-insensitively, as the 104 | # standard requires. 105 | EXCEPTIONS = { 106 | # Irregular exceptions 107 | "en-gb-oed", 108 | "i-ami", 109 | "i-bnn", 110 | "i-default", 111 | "i-enochian", 112 | "i-hak", 113 | "i-klingon", 114 | "i-lux", 115 | "i-mingo", 116 | "i-navajo", 117 | "i-pwn", 118 | "i-tao", 119 | "i-tay", 120 | "i-tsu", 121 | "sgn-be-fr", 122 | "sgn-be-nl", 123 | "sgn-ch-de", 124 | # Regular exceptions 125 | "art-lojban", 126 | "cel-gaulish", 127 | "no-bok", 128 | "no-nyn", 129 | "zh-guoyu", 130 | "zh-hakka", 131 | "zh-min", 132 | "zh-min-nan", 133 | "zh-xiang", 134 | } 135 | 136 | # Define the order of subtags as integer constants, but also give them names 137 | # so we can describe them in error messages 138 | EXTLANG, SCRIPT, TERRITORY, VARIANT, EXTENSION = range(5) 139 | SUBTAG_TYPES = [ 140 | 'extlang', 141 | 'script', 142 | 'territory', 143 | 'variant', 144 | 'extension', 145 | 'end of string', 146 | ] 147 | 148 | 149 | def normalize_characters(tag): 150 | """ 151 | BCP 47 is case-insensitive, and CLDR's use of it considers underscores 152 | equivalent to hyphens. So here we smash tags into lowercase with hyphens, 153 | so we can make exact comparisons. 154 | 155 | >>> normalize_characters('en_US') 156 | 'en-us' 157 | >>> normalize_characters('zh-Hant_TW') 158 | 'zh-hant-tw' 159 | """ 160 | return tag.lower().replace('_', '-') 161 | 162 | 163 | def parse_tag(tag): 164 | """ 165 | Parse the syntax of a language tag, without looking up anything in the 166 | registry, yet. Returns a list of (type, value) tuples indicating what 167 | information will need to be looked up. 168 | """ 169 | if not tag.isascii(): 170 | raise LanguageTagError("Language tags must be made of ASCII characters") 171 | 172 | tag = normalize_characters(tag) 173 | if tag in EXCEPTIONS: 174 | return [('grandfathered', tag)] 175 | else: 176 | # The first subtag is always either the language code, or 'x' to mark 177 | # the entire tag as private-use. Other subtags are distinguished 178 | # by their length and format, but the language code is distinguished 179 | # by the fact that it is required to come first. 
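        # For example, for an input like 'zh-Hant_TW', normalize_characters
        # has already produced 'zh-hant-tw', so this split yields
        # ['zh', 'hant', 'tw'].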
180 | subtags = tag.split('-') 181 | 182 | # check all subtags for their shape: 1-8 alphanumeric characters 183 | for subtag in subtags: 184 | if len(subtag) < 1 or len(subtag) > 8 or not subtag.isalnum(): 185 | raise LanguageTagError( 186 | f"Expected 1-8 alphanumeric characters, got {subtag!r}" 187 | ) 188 | 189 | if subtags[0] == 'x': 190 | if len(subtags) == 1: 191 | raise LanguageTagError("'x' is not a language tag on its own") 192 | # the entire language tag is private use, but we know that, 193 | # whatever it is, it fills the "language" slot 194 | return [('language', tag)] 195 | elif 2 <= len(subtags[0]) <= 4: 196 | # Language codes should be 2 or 3 letters, but 4-letter codes 197 | # are allowed to parse for legacy Unicode reasons 198 | return [('language', subtags[0])] + parse_subtags(subtags[1:]) 199 | else: 200 | subtag_error(subtags[0], 'a language code') 201 | 202 | 203 | def parse_subtags(subtags, expect=EXTLANG): 204 | """ 205 | Parse everything that comes after the language tag: scripts, territories, 206 | variants, and assorted extensions. 207 | """ 208 | # We parse the parts of a language code recursively: each step of 209 | # language code parsing handles one component of the code, recurses 210 | # to handle the rest of the code, and adds what it found onto the 211 | # list of things that were in the rest of the code. 212 | # 213 | # This could just as well have been iterative, but the loops would have 214 | # been convoluted. 215 | # 216 | # So here's the base case. 217 | if not subtags: 218 | return [] 219 | 220 | # There's a subtag that comes next. We need to find out what it is. 221 | # 222 | # The primary thing that distinguishes different types of subtags is 223 | # length, but the subtags also come in a specified order. The 'expect' 224 | # parameter keeps track of where we are in that order. expect=TERRITORY, 225 | # for example, means we're expecting a territory code, or anything later 226 | # (because everything but the language is optional). 227 | subtag = subtags[0] 228 | tag_length = len(subtag) 229 | 230 | # In the usual case, our goal is to recognize what kind of tag this is, 231 | # and set it in 'tagtype' -- as an integer, so we can compare where it 232 | # should go in order. You can see the enumerated list of tagtypes above, 233 | # where the SUBTAG_TYPES global is defined. 234 | tagtype = None 235 | 236 | if tag_length == 1: 237 | # A one-letter subtag introduces an extension, which can itself have 238 | # sub-subtags, so we dispatch to a different function at this point. 239 | # 240 | # We don't need to check anything about the order, because extensions 241 | # necessarily come last. 242 | if subtag.isalnum(): 243 | return parse_extension(subtags) 244 | else: 245 | subtag_error(subtag) 246 | 247 | elif tag_length == 2: 248 | if subtag.isalpha(): 249 | # Two-letter alphabetic subtags are territories. These are the only 250 | # two-character subtags after the language. 251 | tagtype = TERRITORY 252 | 253 | elif tag_length == 3: 254 | if subtag.isalpha(): 255 | # Three-letter alphabetic subtags are 'extended languages'. 256 | # It's allowed for there to be up to three of them in a row, so we 257 | # need another function to enforce that. Before we dispatch to that 258 | # function, though, we need to check whether we're in the right 259 | # place in order. 
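            # (For example, in 'zh-yue' the three-letter subtag 'yue' arrives
            # here and is parsed as an extlang, as the doctests at the top of
            # this module show.)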
260 | if expect <= EXTLANG: 261 | return parse_extlang(subtags) 262 | else: 263 | order_error(subtag, EXTLANG, expect) 264 | elif subtag.isdigit(): 265 | # Three-digit subtags are territories representing broad regions, 266 | # such as Latin America (419). 267 | tagtype = TERRITORY 268 | 269 | elif tag_length == 4: 270 | if subtag.isalpha(): 271 | # Four-letter alphabetic subtags are scripts. 272 | tagtype = SCRIPT 273 | elif subtag[0].isdigit(): 274 | # Four-character subtags that start with a digit are variants. 275 | tagtype = VARIANT 276 | 277 | else: 278 | # Tags of length 5-8 are variants. 279 | tagtype = VARIANT 280 | 281 | # That's the end of the big elif block for figuring out what kind of 282 | # subtag we have based on its length. Now we should do something with that 283 | # kind of subtag. 284 | 285 | if tagtype is None: 286 | # We haven't recognized a type of tag. This subtag just doesn't fit the 287 | # standard. 288 | subtag_error(subtag) 289 | 290 | elif tagtype < expect: 291 | # We got a tag type that was supposed to appear earlier in the order. 292 | order_error(subtag, tagtype, expect) 293 | 294 | else: 295 | # We've recognized a subtag of a particular type. If it's a territory or 296 | # script, we expect the next subtag to be a strictly later type, because 297 | # there can be at most one territory and one script. Otherwise, we expect 298 | # the next subtag to be the type we got or later. 299 | 300 | if tagtype in (SCRIPT, TERRITORY): 301 | expect = tagtype + 1 302 | else: 303 | expect = tagtype 304 | 305 | # Get the name of this subtag type instead of its integer value. 306 | typename = SUBTAG_TYPES[tagtype] 307 | 308 | # Some subtags are conventionally written with capitalization. Apply 309 | # those conventions. 310 | if tagtype == SCRIPT: 311 | subtag = subtag.title() 312 | elif tagtype == TERRITORY: 313 | subtag = subtag.upper() 314 | 315 | # Recurse on the remaining subtags. 316 | return [(typename, subtag)] + parse_subtags(subtags[1:], expect) 317 | 318 | 319 | def parse_extlang(subtags): 320 | """ 321 | Parse an 'extended language' tag, which consists of 1 to 3 three-letter 322 | language codes. 323 | 324 | Extended languages are used for distinguishing dialects/sublanguages 325 | (depending on your view) of macrolanguages such as Arabic, Bahasa Malay, 326 | and Chinese. 327 | 328 | It's supposed to also be acceptable to just use the sublanguage as the 329 | primary language code, and your code should know what's a macrolanguage of 330 | what. For example, 'zh-yue' and 'yue' are the same language (Cantonese), 331 | and differ only in whether they explicitly spell out that Cantonese is a 332 | kind of Chinese. 333 | """ 334 | index = 0 335 | parsed = [] 336 | while index < len(subtags) and len(subtags[index]) == 3 and index < 3: 337 | parsed.append(('extlang', subtags[index])) 338 | index += 1 339 | return parsed + parse_subtags(subtags[index:], SCRIPT) 340 | 341 | 342 | def parse_extension(subtags): 343 | """ 344 | An extension tag consists of a 'singleton' -- a one-character subtag -- 345 | followed by other subtags. Extension tags are in the BCP 47 syntax, but 346 | their meaning is outside the scope of the standard. 347 | 348 | For example, there's the u- extension, which is used for setting Unicode 349 | properties in some context I'm not aware of. 350 | 351 | If the singleton is 'x', it's a private use extension, and consumes the 352 | rest of the tag. Otherwise, it stops at the next singleton. 
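
    Illustrative examples (inputs shown as parse_subtags would pass them):

    >>> parse_extension(['u', 'co', 'backward'])
    [('extension', 'u-co-backward')]

    >>> parse_extension(['x', 'dothraki'])
    [('private', 'x-dothraki')]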
353 | """ 354 | subtag = subtags[0] 355 | if len(subtags) == 1: 356 | raise LanguageTagError(f"The subtag {subtag!r} must be followed by something") 357 | 358 | if subtag == 'x': 359 | # Private use. Everything after this is arbitrary codes that we 360 | # can't look up. 361 | return [('private', '-'.join(subtags))] 362 | 363 | else: 364 | # Look for the next singleton, if there is one. 365 | boundary = 1 366 | while boundary < len(subtags) and len(subtags[boundary]) != 1: 367 | boundary += 1 368 | 369 | if boundary == 1: 370 | raise LanguageTagError( 371 | "Tag extensions may not contain two singletons in a row" 372 | ) 373 | # We've parsed a complete extension subtag. Return to the main 374 | # parse_subtags function, but expect to find nothing but more 375 | # extensions at this point. 376 | return [('extension', '-'.join(subtags[:boundary]))] + parse_subtags( 377 | subtags[boundary:], EXTENSION 378 | ) 379 | 380 | 381 | class LanguageTagError(ValueError): 382 | pass 383 | 384 | 385 | def order_error(subtag, got, expected): 386 | """ 387 | Output an error indicating that tags were out of order. 388 | """ 389 | options = SUBTAG_TYPES[expected:] 390 | if len(options) == 1: 391 | expect_str = options[0] 392 | elif len(options) == 2: 393 | expect_str = f'{options[0]} or {options[1]}' 394 | else: 395 | joined = ', '.join(options[:-1]) 396 | last = options[-1] 397 | expect_str = f'{joined}, or {last}' 398 | got_str = SUBTAG_TYPES[got] 399 | raise LanguageTagError( 400 | f"This {got_str} subtag, {subtag!r}, is out of place. Expected {expect_str}." 401 | ) 402 | 403 | 404 | def subtag_error(subtag, expected='a valid subtag'): 405 | """ 406 | Try to output a reasonably helpful error message based on our state of 407 | parsing. Most of this code is about how to list, in English, the kinds 408 | of things we were expecting to find. 409 | """ 410 | raise LanguageTagError(f"Expected {expected}, got {subtag!r}") 411 | -------------------------------------------------------------------------------- /langcodes/tests/README.md: -------------------------------------------------------------------------------- 1 | Most of the tests for langcodes are in doctests, intended to be run on Python 3. This directory contains additional tests that ensure langcodes can recognize language names as they are used on Wiktionary, the free multilingual dictionary. 2 | -------------------------------------------------------------------------------- /langcodes/tests/test_alpha3.py: -------------------------------------------------------------------------------- 1 | import string 2 | import langcodes 3 | 4 | def test_alpha2_to_alpha3(): 5 | """ 6 | Test that each valid alpha2 code has a corresponding, unique alpha3 code. 7 | """ 8 | seen = set() 9 | for letter1 in string.ascii_lowercase: 10 | for letter2 in string.ascii_lowercase: 11 | code = letter1 + letter2 12 | language = langcodes.get(code, normalize=False) 13 | if language.is_valid(): 14 | alpha3 = language.to_alpha3() 15 | 16 | # These four 2-letter codes exist only as aliases, and don't have 17 | # their own unique 3-letter codes. All other 2-letter codes should 18 | # uniquely map to 3-letter codes. 
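                # (For instance, 'iw' is the deprecated alias for Hebrew and
                # maps to the same alpha3, 'heb', as its replacement 'he'.)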
19 | if code not in {'in', 'iw', 'ji', 'jw'}: 20 | assert alpha3 not in seen 21 | seen.add(alpha3) 22 | -------------------------------------------------------------------------------- /langcodes/tests/test_issue_59.py: -------------------------------------------------------------------------------- 1 | from langcodes import closest_match 2 | 3 | 4 | def test_language_less_than(): 5 | spoken_language_1 = 'pa' 6 | spoken_language_2 = 'pa-PK' 7 | match = closest_match( 8 | spoken_language_1, [spoken_language_2], ignore_script=True 9 | ) 10 | print(match) 11 | assert match[0] != "und" 12 | 13 | 14 | def test_language_more_than(): 15 | spoken_language_1 = 'pa-PK' 16 | spoken_language_2 = 'pa' 17 | match = closest_match( 18 | spoken_language_1, [spoken_language_2], ignore_script=True 19 | ) 20 | print(match) 21 | assert match[0] != "und" -------------------------------------------------------------------------------- /langcodes/tests/test_language.py: -------------------------------------------------------------------------------- 1 | from langcodes import Language 2 | 3 | 4 | def test__hash__(): 5 | en1 = Language.get("en") 6 | # Disable caching 7 | Language._INSTANCES = {} 8 | Language._PARSE_CACHE = {} 9 | en2 = Language.get("en") 10 | assert hash(en1) == hash(en2) 11 | 12 | # Again, disable caching 13 | Language._INSTANCES = {} 14 | Language._PARSE_CACHE = {} 15 | en_us = Language.get("en-US") 16 | assert hash(en1) != hash(en_us) 17 | -------------------------------------------------------------------------------- /langcodes/tests/test_language_data.py: -------------------------------------------------------------------------------- 1 | import langcodes 2 | 3 | 4 | def test_updated_iana(): 5 | aqk = langcodes.get('aqk') 6 | assert aqk.language_name('en') == 'Aninka' 7 | 8 | 9 | def test_cldr_v40(): 10 | en = langcodes.get('en') 11 | assert en.language_name('dsb') == 'engelšćina' 12 | -------------------------------------------------------------------------------- /langcodes/tests/test_wikt_languages.py: -------------------------------------------------------------------------------- 1 | """ 2 | Here, we test that we can associate a language code with each language name 3 | that is commonly used on Wiktionary, that all the language codes are 4 | different, and that each language name matches only one code. 
5 | """ 6 | import pytest 7 | import langcodes 8 | from langcodes.language_lists import WIKT_LANGUAGE_NAMES 9 | 10 | LANGUAGES = ['en', 'de'] 11 | 12 | 13 | @pytest.mark.parametrize("target_lang", LANGUAGES) 14 | def test_check_wiktionary_language(target_lang): 15 | seen_codes = {} 16 | for lang_name in WIKT_LANGUAGE_NAMES[target_lang]: 17 | if lang_name.startswith('Proto-'): 18 | continue 19 | code = str(langcodes.find(lang_name)) 20 | assert code not in seen_codes, "%r and %r have the same code" % ( 21 | seen_codes[code], 22 | lang_name, 23 | ) 24 | seen_codes[code] = lang_name 25 | -------------------------------------------------------------------------------- /langcodes/util.py: -------------------------------------------------------------------------------- 1 | from importlib.resources import files 2 | 3 | DATA_ROOT = files('langcodes').joinpath('data') 4 | import os 5 | 6 | 7 | def data_filename(filename): 8 | return os.path.join(DATA_ROOT, filename) 9 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "langcodes" 3 | description = "Tools for labeling human languages with IETF language tags" 4 | authors = [{name = "Robyn Speer", email = "rspeer@arborelia.net"}] 5 | maintainers = [{name = "Georg Krause", email = "mail@georg-krause.net"}] 6 | readme = "README.md" 7 | classifiers = [ 8 | "Development Status :: 5 - Production/Stable", 9 | "License :: OSI Approved :: MIT License", 10 | "Programming Language :: Python :: 3", 11 | "Programming Language :: Python :: 3.9", 12 | "Programming Language :: Python :: 3.10", 13 | "Programming Language :: Python :: 3.11", 14 | "Programming Language :: Python :: 3.12", 15 | "Programming Language :: Python :: 3.13", 16 | ] 17 | dynamic = ["version"] 18 | 19 | requires-python = ">= 3.9" 20 | 21 | [project.urls] 22 | Homepage = "https://github.com/rspeer/langcodes" 23 | Repository = "https://github.com/rspeer/langcodes" 24 | Issues = "https://github.com/rspeer/langcodes/issues" 25 | 26 | [project.optional-dependencies] 27 | test = [ 28 | 'pytest', 29 | 'pytest-cov', 30 | 'language-data>=1.2' 31 | ] 32 | build = [ 33 | 'build', 34 | 'twine' 35 | ] 36 | data = [ 37 | "language-data>=1.2" 38 | ] 39 | 40 | [build-system] 41 | requires = ["setuptools>=60", "setuptools-scm>=8.0"] 42 | build-backend = "setuptools.build_meta" 43 | 44 | [tool.setuptools_scm] 45 | 46 | [tool.pytest.ini_options] 47 | addopts = "--doctest-modules --doctest-glob=README.md --ignore=setup.py --ignore=example.py --ignore=langcodes/data" 48 | norecursedirs = ".git ignore build __pycache__" 49 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:recommended" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36, py37, py38, py39, py310, py311, py312 3 | skipsdist = True 4 | 5 | [testenv] 6 | deps = 7 | pytest 8 | marisa_trie 9 | language_data 10 | commands = pip install . 11 | pytest 12 | --------------------------------------------------------------------------------