├── .github └── workflows │ ├── cd.yml │ └── ci.yml ├── .gitignore ├── .mailmap ├── LICENSE.txt ├── README.md ├── SECURITY.md ├── docs ├── Makefile ├── conf.py ├── index.rst ├── make.bat ├── requirements.txt └── source │ ├── langcodes.rst │ └── modules.rst ├── example.py ├── langcodes ├── __init__.py ├── build_data.py ├── data │ └── language-subtag-registry.txt ├── data_dicts.py ├── language_distance.py ├── language_lists.py ├── py.typed ├── registry_parser.py ├── tag_parser.py ├── tests │ ├── README.md │ ├── test_alpha3.py │ ├── test_issue_59.py │ ├── test_language.py │ ├── test_language_data.py │ └── test_wikt_languages.py └── util.py ├── pyproject.toml ├── renovate.json ├── tox.ini └── uv.lock /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Delivery 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | environment: 11 | name: pypi 12 | url: https://pypi.org/p/langcodes 13 | permissions: 14 | id-token: write 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: '3.13' 21 | - name: Install dependencies 22 | run: pip install .[build] 23 | - name: Build package 24 | run: python -m build 25 | - name: Publish package 26 | uses: pypa/gh-action-pypi-publish@release/v1 27 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Set up Python ${{ matrix.python-version }} 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | - name: Install 19 | run: pip install .[test,data] 20 | - name: Run tests 21 | run: pytest --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov=language_data --cov-report=xml:junit/coverage-${{ matrix.python-version }}.xml 22 | - name: Upload 23 | uses: actions/upload-artifact@v4 24 | with: 25 | name: junit-${{ matrix.python-version }} 26 | path: junit/* 27 | 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .coverage 2 | *.egg-info 3 | *~ 4 | *.swp 5 | __pycache__ 6 | build 7 | dist 8 | langcodes/data/cldr 9 | langcodes/data/cldr-json 10 | .ipynb_checkpoints 11 | vendor 12 | .tox 13 | docs/_build/ 14 | .hypothesis 15 | poetry.lock 16 | .venv 17 | -------------------------------------------------------------------------------- /.mailmap: -------------------------------------------------------------------------------- 1 | # Robyn has used different names and e-mail addresses in the course of this project. Map them all to her current name and e-mail. 
2 | Robyn Speer 3 | Robyn Speer 4 | Robyn Speer 5 | Robyn Speer 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2021 Robyn Speer (rspeer@arborelia.net) 2 | MIT License 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | this software and associated documentation files (the "Software"), to deal in 6 | the Software without restriction, including without limitation the rights to 7 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 8 | of the Software, and to permit persons to whom the Software is furnished to do 9 | so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Langcodes: a library for language codes 2 | 3 | **langcodes** knows what languages are. It knows the standardized codes that 4 | refer to them, such as `en` for English, `es` for Spanish and `hi` for Hindi. 5 | 6 | These are [IETF language tags][]. You may know them by their old name, ISO 639 7 | language codes. IETF has done some important things for backward compatibility 8 | and supporting language variations that you won't find in the ISO standard. 9 | 10 | [IETF language tags]: https://www.w3.org/International/articles/language-tags/ 11 | 12 | It may sound to you like langcodes solves a pretty boring problem. At one 13 | level, that's right. Sometimes you have a boring problem, and it's great when a 14 | library solves it for you. 15 | 16 | But there's an interesting problem hiding in here. How do you work with 17 | language codes? How do you know when two different codes represent the same 18 | thing? How should your code represent relationships between codes, like the 19 | following? 20 | 21 | * `eng` is equivalent to `en`. 22 | * `fra` and `fre` are both equivalent to `fr`. 23 | * `en-GB` might be written as `en-gb` or `en_GB`. Or as 'en-UK', which is 24 | erroneous, but should be treated as the same. 25 | * `en-CA` is not exactly equivalent to `en-US`, but it's really, really close. 26 | * `en-Latn-US` is equivalent to `en-US`, because written English must be written 27 | in the Latin alphabet to be understood. 28 | * The difference between `ar` and `arb` is the difference between "Arabic" and 29 | "Modern Standard Arabic", a difference that may not be relevant to you. 30 | * You'll find Mandarin Chinese tagged as `cmn` on Wiktionary, but many other 31 | resources would call the same language `zh`. 32 | * Chinese is written in different scripts in different territories. Some 33 | software distinguishes the script. Other software distinguishes the territory. 
34 | The result is that `zh-CN` and `zh-Hans` are used interchangeably, as are 35 | `zh-TW` and `zh-Hant`, even though occasionally you'll need something 36 | different such as `zh-HK` or `zh-Latn-pinyin`. 37 | * The Indonesian (`id`) and Malaysian (`ms` or `zsm`) languages are mutually 38 | intelligible. 39 | * `jp` is not a language code. (The language code for Japanese is `ja`, but 40 | people confuse it with the country code for Japan.) 41 | 42 | One way to know is to read IETF standards and Unicode technical reports. 43 | Another way is to use a library that implements those standards and guidelines 44 | for you, which langcodes does. 45 | 46 | When you're working with these short language codes, you may want to see the 47 | name that the language is called _in_ a language: `fr` is called "French" in 48 | English. That language doesn't have to be English: `fr` is called "français" in 49 | French. A supplement to langcodes, [`language_data`][language-data], provides 50 | this information. 51 | 52 | [language-data]: https://github.com/rspeer/language_data 53 | 54 | langcodes is maintained by Elia Robyn Lake a.k.a. Robyn Speer, and is released 55 | as free software under the MIT license. 56 | 57 | 58 | ## Standards implemented 59 | 60 | Although this is not the only reason to use it, langcodes will make you more 61 | acronym-compliant. 62 | 63 | langcodes implements [BCP 47](http://tools.ietf.org/html/bcp47), the IETF Best 64 | Current Practices on Tags for Identifying Languages. BCP 47 is also known as 65 | RFC 5646. It subsumes ISO 639 and is backward compatible with it, and it also 66 | implements recommendations from the [Unicode CLDR](http://cldr.unicode.org). 67 | 68 | langcodes can also refer to a database of language properties and names, built 69 | from Unicode CLDR and the IANA subtag registry, if you install `language_data`. 70 | 71 | In summary, langcodes takes language codes and does the Right Thing with them, 72 | and if you want to know exactly what the Right Thing is, there are some 73 | documents you can go read. 74 | 75 | 76 | # Documentation 77 | 78 | ## Standardizing language tags 79 | 80 | This function standardizes tags, as strings, in several ways. 81 | 82 | It replaces overlong tags with their shortest version, and also formats them 83 | according to the conventions of BCP 47: 84 | 85 | >>> from langcodes import * 86 | >>> standardize_tag('eng_US') 87 | 'en-US' 88 | 89 | It removes script subtags that are redundant with the language: 90 | 91 | >>> standardize_tag('en-Latn') 92 | 'en' 93 | 94 | It replaces deprecated values with their correct versions, if possible: 95 | 96 | >>> standardize_tag('en-uk') 97 | 'en-GB' 98 | 99 | Sometimes this involves complex substitutions, such as replacing Serbo-Croatian 100 | (`sh`) with Serbian in Latin script (`sr-Latn`), or the entire tag `sgn-US` 101 | with `ase` (American Sign Language). 102 | 103 | >>> standardize_tag('sh-QU') 104 | 'sr-Latn-EU' 105 | 106 | >>> standardize_tag('sgn-US') 107 | 'ase' 108 | 109 | If *macro* is True, it uses macrolanguage codes as a replacement for the most 110 | common standardized language within that macrolanguage. 
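For a sense of how this looks in application code, here is a minimal, non-authoritative sketch that normalizes a batch of tags gathered from different sources, using `macro=True` to fold codes like `arb` into their macrolanguage. The input list is invented for illustration, and unparseable tags are reported using `LanguageTagError`, which is described further below:

    from langcodes import LanguageTagError, standardize_tag

    raw_tags = ['eng_US', 'en-Latn', 'arb-Arab', 'not a tag!']
    normalized = {}
    for raw in raw_tags:
        try:
            # Group the original spellings under their standardized form,
            # e.g. 'eng_US' -> 'en-US', 'en-Latn' -> 'en', 'arb-Arab' -> 'ar'
            normalized.setdefault(standardize_tag(raw, macro=True), []).append(raw)
        except LanguageTagError:
            print(f'Could not parse {raw!r}')

The example that follows shows the `macro=True` behavior on its own: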
111 | 112 | >>> standardize_tag('arb-Arab', macro=True) 113 | 'ar' 114 | 115 | Even when *macro* is False, it shortens tags that contain both the 116 | macrolanguage and the language: 117 | 118 | >>> standardize_tag('zh-cmn-hans-cn') 119 | 'zh-Hans-CN' 120 | 121 | If the tag can't be parsed according to BCP 47, this will raise a 122 | LanguageTagError (a subclass of ValueError): 123 | 124 | >>> standardize_tag('spa-latn-mx') 125 | 'es-MX' 126 | 127 | >>> standardize_tag('spa-mx-latn') 128 | Traceback (most recent call last): 129 | ... 130 | langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string. 131 | 132 | 133 | ## Language objects 134 | 135 | This package defines one class, named Language, which contains the results 136 | of parsing a language tag. Language objects have the following fields, 137 | any of which may be unspecified: 138 | 139 | - *language*: the code for the language itself. 140 | - *script*: the 4-letter code for the writing system being used. 141 | - *territory*: the 2-letter or 3-digit code for the country or similar region 142 | whose usage of the language appears in this text. 143 | - *extlangs*: a list of more specific language codes that follow the language 144 | code. (This is allowed by the language code syntax, but deprecated.) 145 | - *variants*: codes for specific variations of language usage that aren't 146 | covered by the *script* or *territory* codes. 147 | - *extensions*: information that's attached to the language code for use in 148 | some specific system, such as Unicode collation orders. 149 | - *private*: a code starting with `x-` that has no defined meaning. 150 | 151 | The `Language.get` method converts a string to a Language instance, and the 152 | `Language.make` method makes a Language instance from its fields. These values 153 | are cached so that calling `Language.get` or `Language.make` again with the 154 | same values returns the same object, for efficiency. 155 | 156 | By default, it will replace non-standard and overlong tags as it interprets 157 | them. To disable this feature and get the codes that literally appear in the 158 | language tag, use the *normalize=False* option. 159 | 160 | >>> Language.get('en-Latn-US') 161 | Language.make(language='en', script='Latn', territory='US') 162 | 163 | >>> Language.get('sgn-US', normalize=False) 164 | Language.make(language='sgn', territory='US') 165 | 166 | >>> Language.get('und') 167 | Language.make() 168 | 169 | Here are some examples of replacing non-standard tags: 170 | 171 | >>> Language.get('sh-QU') 172 | Language.make(language='sr', script='Latn', territory='EU') 173 | 174 | >>> Language.get('sgn-US') 175 | Language.make(language='ase') 176 | 177 | >>> Language.get('zh-cmn-Hant') 178 | Language.make(language='zh', script='Hant') 179 | 180 | Use the `str()` function on a Language object to convert it back to its 181 | standard string form: 182 | 183 | >>> str(Language.get('sh-QU')) 184 | 'sr-Latn-EU' 185 | 186 | >>> str(Language.make(territory='IN')) 187 | 'und-IN' 188 | 189 | 190 | ### Checking validity 191 | 192 | A language code is _valid_ when every part of it is assigned a meaning by IANA. 193 | That meaning could be "private use". 194 | 195 | In langcodes, we check the language subtag, script, territory, and variants for 196 | validity. We don't check other parts such as extlangs or Unicode extensions. 
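As a quick sketch of how validity checking fits into application code (the helper function and its sample inputs are invented for illustration), you could screen user-supplied tags with the top-level `tag_is_valid` function that the examples below demonstrate:

    from langcodes import tag_is_valid

    def keep_valid_tags(candidates):
        # Keep only tags whose parts are all assigned a meaning by IANA.
        return [tag for tag in candidates if tag_is_valid(tag)]

    # 'jp' and 'en-000' are well-formed but not valid, so they are dropped:
    print(keep_valid_tags(['ja', 'jp', 'en-000', 'x-other']))  # ['ja', 'x-other']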
197 | 198 | For example, `ja` is a valid language code, and `jp` is not: 199 | 200 | >>> Language.get('ja').is_valid() 201 | True 202 | 203 | >>> Language.get('jp').is_valid() 204 | False 205 | 206 | The top-level function `tag_is_valid(tag)` is possibly more convenient to use, 207 | because it can return False even for tags that don't parse: 208 | 209 | >>> tag_is_valid('C') 210 | False 211 | 212 | If one subtag is invalid, the entire code is invalid: 213 | 214 | >>> tag_is_valid('en-000') 215 | False 216 | 217 | `iw` is valid, though it's a deprecated alias for `he`: 218 | 219 | >>> tag_is_valid('iw') 220 | True 221 | 222 | The empty language tag (`und`) is valid: 223 | 224 | >>> tag_is_valid('und') 225 | True 226 | 227 | Private use codes are valid: 228 | 229 | >>> tag_is_valid('x-other') 230 | True 231 | 232 | >>> tag_is_valid('qaa-Qaai-AA-x-what-even-is-this') 233 | True 234 | 235 | Language tags that are very unlikely are still valid: 236 | 237 | >>> tag_is_valid('fr-Cyrl') 238 | True 239 | 240 | Tags with non-ASCII characters are invalid, because they don't parse: 241 | 242 | >>> tag_is_valid('zh-普通话') 243 | False 244 | 245 | 246 | ### Getting alpha3 codes 247 | 248 | Before there was BCP 47, there was ISO 639-2. The ISO tried to make room for the 249 | variety of human languages by assigning every language a 3-letter code, 250 | including the ones that already had 2-letter codes. 251 | 252 | Unfortunately, this just led to more confusion. Some languages ended up with two 253 | different 3-letter codes for legacy reasons, such as French, which is `fra` as a 254 | "terminology" code, and `fre` as a "bibliographic" code. And meanwhile, `fr` was 255 | still a code that you'd be using if you followed ISO 639-1. 256 | 257 | In BCP 47, you should use 2-letter codes whenever they're available, and that's 258 | what langcodes does. Fortunately, all the languages that have two different 259 | 3-letter codes also have a 2-letter code, so if you prefer the 2-letter code, 260 | you don't have to worry about the distinction. 261 | 262 | But some applications want the 3-letter code in particular, so langcodes 263 | provides a method for getting those, `Language.to_alpha3()`. It returns the 264 | 'terminology' code by default, and passing `variant='B'` returns the 265 | bibliographic code. 266 | 267 | This method always returns a 3-letter string; if there is no known alpha3 code for the language, it raises a `LookupError`. 268 | 269 | >>> Language.get('fr').to_alpha3() 270 | 'fra' 271 | >>> Language.get('fr-CA').to_alpha3() 272 | 'fra' 273 | >>> Language.get('fr-CA').to_alpha3(variant='B') 274 | 'fre' 275 | >>> Language.get('de').to_alpha3() 276 | 'deu' 277 | >>> Language.get('no').to_alpha3() 278 | 'nor' 279 | >>> Language.get('un').to_alpha3() 280 | Traceback (most recent call last): 281 | ... 282 | LookupError: 'un' is not a known language code, and has no alpha3 code. 283 | 284 | For many languages, the terminology and bibliographic alpha3 codes are the same. 285 | 286 | >>> Language.get('en').to_alpha3(variant='T') 287 | 'eng' 288 | >>> Language.get('en').to_alpha3(variant='B') 289 | 'eng' 290 | 291 | When you use any of these "overlong" alpha3 codes in langcodes, they normalize 292 | back to the alpha2 code: 293 | 294 | >>> Language.get('zho') 295 | Language.make(language='zh') 296 | 297 | 298 | ## Working with language names 299 | 300 | The methods in this section require an optional package called `language_data`.
301 | You can install it with `pip install language_data`, or request the optional 302 | "data" feature of langcodes with `pip install langcodes[data]`. 303 | 304 | The dependency that you put in setup.py should be `langcodes[data]`. 305 | 306 | ### Describing Language objects in natural language 307 | 308 | It's often helpful to be able to describe a language code in a way that a user 309 | (or you) can understand, instead of in inscrutable short codes. The 310 | `display_name` method lets you describe a Language object *in a language*. 311 | 312 | The `.display_name(language, min_score)` method will look up the name of the 313 | language. The names come from the IANA language tag registry, which is only in 314 | English, plus CLDR, which names languages in many commonly-used languages. 315 | 316 | The default language for naming things is English: 317 | 318 | >>> Language.make(language='fr').display_name() 319 | 'French' 320 | 321 | >>> Language.make().display_name() 322 | 'Unknown language' 323 | 324 | >>> Language.get('zh-Hans').display_name() 325 | 'Chinese (Simplified)' 326 | 327 | >>> Language.get('en-US').display_name() 328 | 'English (United States)' 329 | 330 | But you can ask for language names in numerous other languages: 331 | 332 | >>> Language.get('fr').display_name('fr') 333 | 'français' 334 | 335 | >>> Language.get('fr').display_name('es') 336 | 'francés' 337 | 338 | >>> Language.make().display_name('es') 339 | 'lengua desconocida' 340 | 341 | >>> Language.get('zh-Hans').display_name('de') 342 | 'Chinesisch (Vereinfacht)' 343 | 344 | >>> Language.get('en-US').display_name('zh-Hans') 345 | '英语(美国)' 346 | 347 | Why does everyone get Slovak and Slovenian confused? Let's ask them. 348 | 349 | >>> Language.get('sl').display_name('sl') 350 | 'slovenščina' 351 | >>> Language.get('sk').display_name('sk') 352 | 'slovenčina' 353 | >>> Language.get('sl').display_name('sk') 354 | 'slovinčina' 355 | >>> Language.get('sk').display_name('sl') 356 | 'slovaščina' 357 | 358 | If the language has a script or territory code attached to it, these will be 359 | described in parentheses: 360 | 361 | >>> Language.get('en-US').display_name() 362 | 'English (United States)' 363 | 364 | Sometimes these can be the result of tag normalization, such as in this case 365 | where the legacy tag 'sh' becomes 'sr-Latn': 366 | 367 | >>> Language.get('sh').display_name() 368 | 'Serbian (Latin)' 369 | 370 | >>> Language.get('sh', normalize=False).display_name() 371 | 'Serbo-Croatian' 372 | 373 | Naming a language in itself is sometimes a useful thing to do, so the 374 | `.autonym()` method makes this easy, providing the display name of a language 375 | in the language itself: 376 | 377 | >>> Language.get('fr').autonym() 378 | 'français' 379 | >>> Language.get('es').autonym() 380 | 'español' 381 | >>> Language.get('ja').autonym() 382 | '日本語' 383 | >>> Language.get('en-AU').autonym() 384 | 'English (Australia)' 385 | >>> Language.get('sr-Latn').autonym() 386 | 'srpski (latinica)' 387 | >>> Language.get('sr-Cyrl').autonym() 388 | 'српски (ћирилица)' 389 | 390 | The names come from the Unicode CLDR data files, and in English they can 391 | also come from the IANA language subtag registry. Together, they can give 392 | you language names in the 196 languages that CLDR supports. 
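Putting these methods together, here is a minimal sketch of the kind of language picker you could build with `display_name` and `autonym`. It requires `language_data` to be installed, and the list of supported tags is invented for illustration:

    from langcodes import Language

    SUPPORTED_TAGS = ['en-US', 'fr', 'es', 'ja']  # whatever your application supports

    for tag in SUPPORTED_TAGS:
        lang = Language.get(tag)
        # Prints lines such as "fr: French / français"
        print(f'{tag}: {lang.display_name()} / {lang.autonym()}')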
393 | 394 | 395 | ### Describing components of language codes 396 | 397 | You can get the parts of the name separately with the methods `.language_name()`, 398 | `.script_name()`, and `.territory_name()`, or get a dictionary of all the parts 399 | that are present using the `.describe()` method. These methods also accept a 400 | language code for what language they should be described in. 401 | 402 | >>> shaw = Language.get('en-Shaw-GB') 403 | >>> shaw.describe('en') 404 | {'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'} 405 | 406 | >>> shaw.describe('es') 407 | {'language': 'inglés', 'script': 'shaviano', 'territory': 'Reino Unido'} 408 | 409 | 410 | ### Recognizing language names in natural language 411 | 412 | As the reverse of the above operations, you may want to look up a language by 413 | its name, converting a natural language name such as "French" to a code such as 414 | 'fr'. 415 | 416 | The name can be in any language that CLDR supports (see "Ambiguity" below). 417 | 418 | >>> import langcodes 419 | >>> langcodes.find('french') 420 | Language.make(language='fr') 421 | 422 | >>> langcodes.find('francés') 423 | Language.make(language='fr') 424 | 425 | However, this method currently ignores the parenthetical expressions that come from 426 | `.display_name()`: 427 | 428 | >>> langcodes.find('English (Canada)') 429 | Language.make(language='en') 430 | 431 | There is still room to improve the way that language names are matched, because 432 | some languages are not consistently named the same way. The method currently 433 | works with hundreds of language names that are used on Wiktionary. 434 | 435 | #### Ambiguity 436 | 437 | For the sake of usability, `langcodes.find()` doesn't require you to specify what 438 | language you're looking up a language in by name. This could potentially lead to 439 | a conflict: what if name "X" is language A's name for language B, and language C's 440 | name for language D? 441 | 442 | We can collect the language codes from CLDR and see how many times this 443 | happens. In the majority of cases like that, B and D are codes whose names are 444 | also overlapping in the _same_ language and can be resolved by some general 445 | principle. 446 | 447 | For example, no matter whether you decide "Tagalog" refers to the language code 448 | `tl` or the largely overlapping code `fil`, that distinction doesn't depend on 449 | the language you're saying "Tagalog" in. We can just return `tl` consistently. 450 | 451 | >>> langcodes.find('tagalog') 452 | Language.make(language='tl') 453 | 454 | In the few cases of actual interlingual ambiguity, langcodes won't match a result. 455 | You can pass in a `language=` parameter to say what language the name is in. 456 | 457 | For example, there are two distinct languages called "Tonga" in various languages. 458 | They are `to`, the language of Tonga which is called "Tongan" in English; and `tog`, 459 | a language of Malawi that can be called "Nyasa Tonga" in English. 460 | 461 | >>> langcodes.find('tongan') 462 | Language.make(language='to') 463 | 464 | >>> langcodes.find('nyasa tonga') 465 | Language.make(language='tog') 466 | 467 | >>> langcodes.find('tonga') 468 | Traceback (most recent call last): 469 | ... 
470 | LookupError: Can't find any language named 'tonga' 471 | 472 | >>> langcodes.find('tonga', language='id') 473 | Language.make(language='to') 474 | 475 | >>> langcodes.find('tonga', language='ca') 476 | Language.make(language='tog') 477 | 478 | Other ambiguous names written in Latin letters are "Kiga", "Mbundu", "Roman", and "Ruanda". 479 | 480 | 481 | ## Demographic language data 482 | 483 | The `Language.speaking_population()` and `Language.writing_population()` 484 | methods get Unicode's estimates of how many people in the world use a 485 | language. 486 | 487 | As with the language name data, this requires the optional `language_data` 488 | package to be installed. 489 | 490 | `.speaking_population()` estimates how many people speak a language. It can 491 | be limited to a particular territory with a territory code (such as a country 492 | code). 493 | 494 | >>> Language.get('es').speaking_population() 495 | 493528077 496 | 497 | >>> Language.get('pt').speaking_population() 498 | 237496885 499 | 500 | >>> Language.get('es-BR').speaking_population() 501 | 76218 502 | 503 | >>> Language.get('pt-BR').speaking_population() 504 | 192661560 505 | 506 | >>> Language.get('vo').speaking_population() 507 | 0 508 | 509 | Script codes will be ignored, because the script is not involved in speaking: 510 | 511 | >>> Language.get('es-Hant').speaking_population() ==\ 512 | ... Language.get('es').speaking_population() 513 | True 514 | 515 | `.writing_population()` estimates how many people write a language. 516 | 517 | >>> all = Language.get('zh').writing_population() 518 | >>> all 519 | 1240841517 520 | 521 | >>> traditional = Language.get('zh-Hant').writing_population() 522 | >>> traditional 523 | 36863340 524 | 525 | >>> simplified = Language.get('zh-Hans').writing_population() 526 | >>> all == traditional + simplified 527 | True 528 | 529 | The estimates for "writing population" are often overestimates, as described 530 | in the [CLDR documentation on territory data][overestimates]. In most cases, 531 | they are derived from published data about literacy rates in the places where 532 | those languages are spoken. This doesn't take into account that many literate 533 | people around the world speak a language that isn't typically written, and 534 | write in a _different_ language. 535 | 536 | [overestimates]: https://unicode-org.github.io/cldr-staging/charts/39/supplemental/territory_language_information.html 537 | 538 | Like `.speaking_population()`, this can be limited to a particular territory: 539 | 540 | >>> Language.get('zh-Hant-HK').writing_population() 541 | 6439733 542 | >>> Language.get('zh-Hans-HK').writing_population() 543 | 338933 544 | 545 | 546 | ## Comparing and matching languages 547 | 548 | The `tag_distance` function returns a number from 0 to 134 indicating the 549 | distance between the language the user desires and a supported language. 550 | 551 | The distance data comes from CLDR v38.1 and involves a lot of judgment calls 552 | made by the Unicode consortium. 553 | 554 | 555 | ### Distance values 556 | 557 | This table summarizes the language distance values: 558 | 559 | | Value | Meaning | Example 560 | | ----: | :------ | :------ 561 | | 0 | These codes represent the same language, possibly after filling in values and normalizing. | Norwegian Bokmål → Norwegian 562 | | 1-3 | These codes indicate a minor regional difference. | Australian English → British English 563 | | 4-9 | These codes indicate a significant but unproblematic regional difference. 
| American English → British English 564 | | 10-24 | A gray area that depends on your use case. There may be problems with understanding or usability. | Afrikaans → Dutch, Wu Chinese → Mandarin Chinese 565 | | 25-50 | These languages aren't similar, but there are demographic reasons to expect some intelligibility. | Tamil → English, Marathi → Hindi 566 | | 51-79 | There are large barriers to understanding. | Japanese → Japanese in Hepburn romanization 567 | | 80-99 | These are different languages written in the same script. | English → French, Arabic → Urdu 568 | | 100+ | These languages have nothing particularly in common. | English → Japanese, English → Tamil 569 | 570 | See the docstring of `tag_distance` for more explanation and examples. 571 | 572 | 573 | ### Finding the best matching language 574 | 575 | Suppose you have software that supports any of the `supported_languages`. The 576 | user wants to use `desired_language`. 577 | 578 | The function `closest_supported_match(desired_language, supported_languages)` 579 | lets you choose the right language, even if there isn't an exact match. 580 | It returns the language tag of the best-supported language, even if there 581 | isn't an exact match. 582 | 583 | The `max_distance` parameter lets you set a cutoff on what counts as language 584 | support. It has a default of 25, a value that is probably okay for simple 585 | cases of i18n, but you might want to set it lower to require more precision. 586 | 587 | >>> closest_supported_match('fr', ['de', 'en', 'fr']) 588 | 'fr' 589 | 590 | >>> closest_supported_match('pt', ['pt-BR', 'pt-PT']) 591 | 'pt-BR' 592 | 593 | >>> closest_supported_match('en-AU', ['en-GB', 'en-US']) 594 | 'en-GB' 595 | 596 | >>> closest_supported_match('af', ['en', 'nl', 'zu']) 597 | 'nl' 598 | 599 | >>> closest_supported_match('und', ['en', 'und']) 600 | 'und' 601 | 602 | >>> print(closest_supported_match('af', ['en', 'nl', 'zu'], max_distance=10)) 603 | None 604 | 605 | A similar function is `closest_match(desired_language, supported_language)`, 606 | which returns both the best matching language tag and the distance. If there is 607 | no match, it returns ('und', 1000). 608 | 609 | >>> closest_match('fr', ['de', 'en', 'fr']) 610 | ('fr', 0) 611 | 612 | >>> closest_match('sh', ['hr', 'bs', 'sr-Latn', 'sr-Cyrl']) 613 | ('sr-Latn', 0) 614 | 615 | >>> closest_match('id', ['zsm', 'mhp']) 616 | ('zsm', 14) 617 | 618 | >>> closest_match('ja', ['ja-Latn-hepburn', 'en']) 619 | ('und', 1000) 620 | 621 | >>> closest_match('ja', ['ja-Latn-hepburn', 'en'], max_distance=60) 622 | ('ja-Latn-hepburn', 50) 623 | 624 | ## Further API documentation 625 | 626 | There are many more methods for manipulating and comparing language codes, 627 | and you will find them documented thoroughly in [the code itself][code]. 628 | 629 | The interesting functions all live in this one file, with extensive docstrings 630 | and annotations. Making a separate Sphinx page out of the docstrings would be 631 | the traditional thing to do, but here it just seems redundant. You can go read 632 | the docstrings in context, in their native habitat, and they'll always be up to 633 | date. 634 | 635 | [Code with documentation][code] 636 | 637 | [code]: https://github.com/rspeer/langcodes/blob/master/langcodes/__init__.py 638 | 639 | # Changelog 640 | 641 | ## Version 3.3 (November 2021) 642 | 643 | - Updated to CLDR v40. 644 | 645 | - Updated the IANA subtag registry to version 2021-08-06. 
646 | 647 | - Bug fix: recognize script codes that appear in the IANA registry even if 648 | they're missing from CLDR for some reason. 'cu-Cyrs' is valid, for example. 649 | 650 | - Switched the build system from `setuptools` to `poetry`. 651 | 652 | To install the package in editable mode before PEP 660 is better supported, use 653 | `poetry install` instead of `pip install -e .`. 654 | 655 | ## Version 3.2 (October 2021) 656 | 657 | - Supports Python 3.6 through 3.10. 658 | 659 | - Added the top-level function `tag_is_valid(tag)`, for determining if a string 660 | is a valid language tag without having to parse it first. 661 | 662 | - Added the top-level function `closest_supported_match(desired, supported)`, 663 | which is similar to `closest_match` but with a simpler return value. It 664 | returns the language tag of the closest match, or None if no match is close 665 | enough. 666 | 667 | - Bug fix: a lot of well-formed but invalid language codes appeared to be 668 | valid, such as 'aaj' or 'en-Latnx', because the regex could match a prefix of 669 | a subtag. The validity regex is now required to match completely. 670 | 671 | - Bug fixes that address some edge cases of validity: 672 | 673 | - A language tag that is entirely private use, like 'x-private', is valid 674 | - A language tag that uses the same extension twice, like 'en-a-bbb-a-ccc', 675 | is invalid 676 | - A language tag that uses the same variant twice, like 'de-1901-1901', is 677 | invalid 678 | - A language tag with two extlangs, like 'sgn-ase-bfi', is invalid 679 | 680 | - Updated dependencies so they are compatible with Python 3.10, including 681 | switching back from `marisa-trie-m` to `marisa-trie` in `language_data`. 682 | 683 | - In bugfix release 3.2.1, corrected cases where the parser accepted 684 | ill-formed language tags: 685 | 686 | - All subtags must be made of between 1 and 8 alphanumeric ASCII characters 687 | - Tags with two extension 'singletons' in a row (`en-a-b-ccc`) should be 688 | rejected 689 | 690 | ## Version 3.1 (February 2021) 691 | 692 | - Added the `Language.to_alpha3()` method, for getting a three-letter code for a 693 | language according to ISO 639-2. 694 | 695 | - Updated the type annotations from obiwan-style to mypy-style. 696 | 697 | 698 | ## Version 3.0 (February 2021) 699 | 700 | - Moved bulky data, particularly language names, into a separate 701 | `language_data` package. In situations where the data isn't needed, 702 | `langcodes` becomes a smaller, pure-Python package with no dependencies. 703 | 704 | - Language codes where the language segment is more than 4 letters no longer 705 | parse: Language.get('nonsense') now returns an error. 706 | 707 | (This is technically stricter than the parse rules of BCP 47, but there are 708 | no valid language codes of this form and there should never be any. An 709 | attempt to parse a language code with 5-8 letters is most likely a mistake or 710 | an attempt to make up a code.) 711 | 712 | - Added a method for checking the validity of a language code. 713 | 714 | - Added methods for estimating language population. 715 | 716 | - Updated to CLDR 38.1, which includes differences in language matching. 717 | 718 | - Tested on Python 3.6 through 3.9; no longer tested on Python 3.5. 719 | 720 | 721 | ## Version 2.2 (February 2021) 722 | 723 | - Replaced `marisa-trie` dependency with `marisa-trie-m`, to achieve 724 | compatibility with Python 3.9. 
725 | 726 | 727 | ## Version 2.1 (June 2020) 728 | 729 | - Added the `display_name` method to be a more intuitive way to get a string 730 | describing a language code, and made the `autonym` method use it instead of 731 | `language_name`. 732 | 733 | - Updated to CLDR v37. 734 | 735 | - Previously, some attempts to get the name of a language would return its 736 | language code instead, perhaps because the name was being requested in a 737 | language for which CLDR doesn't have name data. This is unfortunate because 738 | names and codes should not be interchangeable. 739 | 740 | Now we fall back on English names instead, which exists for all IANA codes. 741 | If the code is unknown, we return a string such as "Unknown language [xx]". 742 | 743 | 744 | ## Version 2.0 (April 2020) 745 | 746 | Version 2.0 involves some significant changes that may break compatibility with 1.4, 747 | in addition to updating to version 36.1 of the Unicode CLDR data and the April 2020 748 | version of the IANA subtag registry. 749 | 750 | This version requires Python 3.5 or later. 751 | 752 | ### Match scores replaced with distances 753 | 754 | Originally, the goodness of a match between two different language codes was defined 755 | in terms of a "match score" with a maximum of 100. Around 2016, Unicode started 756 | replacing this with a different measure, the "match distance", which was defined 757 | much more clearly, but we had to keep using the "match score". 758 | 759 | As of langcodes version 2.0, the "score" functions (such as 760 | `Language.match_score`, `tag_match_score`, and `best_match`) are deprecated. 761 | They'll keep using the deprecated language match tables from around CLDR 27. 762 | 763 | For a better measure of the closeness of two language codes, use `Language.distance`, 764 | `tag_distance`, and `closest_match`. 765 | 766 | ### 'region' renamed to 'territory' 767 | 768 | We were always out of step with CLDR here. Following the example of the IANA 769 | database, we referred to things like the 'US' in 'en-US' as a "region code", 770 | but the Unicode standards consistently call it a "territory code". 771 | 772 | In langcodes 2.0, parameters, dictionary keys, and attributes named `region` 773 | have been renamed to `territory`. We try to support a few common cases with 774 | deprecation warnings, such as looking up the `region` property of a Language 775 | object. 776 | 777 | A nice benefit of this is that when a dictionary is displayed with 'language', 778 | 'script', and 'territory' keys in alphabetical order, they are in the same 779 | order as they are in a language code. 780 | 781 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | The only supported version is the latest minor version released. As soon as 6 | a new minor version is released, support for the older one drops. 7 | 8 | ## Reporting a Vulnerability 9 | 10 | In order to report a security vulnerability, please contact me at 11 | [mail@georg-krause.net](mailto:mail@georg-krause.net). 12 | [Use GPG if possible](https://www.georg-krause.net/statics/public.key). 13 | 14 | If the vulnerability is confirmed, I will work on a fix and a new version as 15 | soon as possible. Since maintaining this package isn't my day job, this could 16 | take a few days. 
17 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'langcodes' 21 | copyright = '2021, Robyn Speer at Luminoso' 22 | author = 'Robyn Speer at Luminoso' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | 'sphinx.ext.autodoc', 32 | # https://docs.readthedocs.io/en/stable/intro/getting-started-with-sphinx.html#using-markdown-with-sphinx 33 | 'myst_parser', 34 | ] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # List of patterns, relative to source directory, that match files and 40 | # directories to ignore when looking for source files. 41 | # This pattern also affects html_static_path and html_extra_path. 42 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 43 | 44 | 45 | # -- Options for HTML output ------------------------------------------------- 46 | 47 | # The theme to use for HTML and HTML Help pages. See the documentation for 48 | # a list of builtin themes. 49 | # 50 | html_theme = 'alabaster' 51 | 52 | # Add any paths that contain custom static files (such as style sheets) here, 53 | # relative to this directory. They are copied after the builtin static files, 54 | # so a file named "default.css" will overwrite the builtin "default.css". 
55 | html_static_path = ['_static'] 56 | 57 | 58 | # -- Extension configuration ------------------------------------------------- 59 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. langcodes documentation master file, created by 2 | sphinx-quickstart on Fri Apr 16 21:32:52 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to langcodes's documentation! 7 | ===================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | source/modules 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | myst_parser 2 | 3 | -------------------------------------------------------------------------------- /docs/source/langcodes.rst: -------------------------------------------------------------------------------- 1 | langcodes package 2 | ================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | langcodes.build\_data module 8 | ---------------------------- 9 | 10 | .. automodule:: langcodes.build_data 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | langcodes.data\_dicts module 16 | ---------------------------- 17 | 18 | .. automodule:: langcodes.data_dicts 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | langcodes.language\_distance module 24 | ----------------------------------- 25 | 26 | .. automodule:: langcodes.language_distance 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | langcodes.language\_lists module 32 | -------------------------------- 33 | 34 | .. automodule:: langcodes.language_lists 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | langcodes.registry\_parser module 40 | --------------------------------- 41 | 42 | .. automodule:: langcodes.registry_parser 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | langcodes.tag\_parser module 48 | ---------------------------- 49 | 50 | .. 
automodule:: langcodes.tag_parser 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | langcodes.util module 56 | --------------------- 57 | 58 | .. automodule:: langcodes.util 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | Module contents 64 | --------------- 65 | 66 | .. automodule:: langcodes 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | langcodes 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | langcodes 8 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | import string 2 | import langcodes 3 | 4 | # Iterate through all 2- and 3-letter language codes, and for all languages 5 | # that have enough data to represent their own name, show: 6 | # 7 | # - The original code 8 | # - The code after normalization 9 | # - The language's name in English 10 | # - The language's name in that language (its autonym) 11 | 12 | en = langcodes.get('en') 13 | 14 | for let1 in string.ascii_lowercase: 15 | for let2 in string.ascii_lowercase: 16 | for let3 in [''] + list(string.ascii_lowercase): 17 | code = let1 + let2 + let3 18 | lcode = langcodes.get(code) 19 | if lcode.has_name_data(): 20 | autonym = lcode.autonym() 21 | name = lcode.language_name() 22 | print('%-3s %-3s %-30s %s' % (code, lcode.language, name, autonym)) 23 | -------------------------------------------------------------------------------- /langcodes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | langcodes knows what languages are. It knows the standardized codes that 3 | refer to them, such as `en` for English, `es` for Spanish and `hi` for Hindi. 4 | Often, it knows what these languages are called *in* a language, and that 5 | language doesn't have to be English. 6 | 7 | See README.md for the main documentation, or read it on GitHub at 8 | https://github.com/LuminosoInsight/langcodes/ . For more specific documentation 9 | on the functions in langcodes, scroll down and read the docstrings. 10 | 11 | Some of these functions, particularly those that work with the names of 12 | languages, require the `language_data` module to be installed. 13 | """ 14 | from operator import itemgetter 15 | from typing import Any, List, Tuple, Dict, Sequence, Iterable, Optional, Mapping, Union 16 | import warnings 17 | import sys 18 | 19 | from langcodes.tag_parser import LanguageTagError, parse_tag, normalize_characters 20 | from langcodes.language_distance import tuple_distance_cached 21 | from langcodes.data_dicts import ( 22 | ALL_SCRIPTS, 23 | DEFAULT_SCRIPTS, 24 | LANGUAGE_REPLACEMENTS, 25 | LANGUAGE_ALPHA3, 26 | LANGUAGE_ALPHA3_BIBLIOGRAPHIC, 27 | TERRITORY_REPLACEMENTS, 28 | NORMALIZED_MACROLANGUAGES, 29 | LIKELY_SUBTAGS, 30 | VALIDITY, 31 | ) 32 | 33 | # When we're getting natural language information *about* languages, it's in 34 | # English if you don't specify the language. 35 | DEFAULT_LANGUAGE = 'en' 36 | 37 | 38 | LANGUAGE_NAME_IMPORT_MESSAGE = """ 39 | Looking up language names now requires the `language_data` package. 
40 | 41 | Install it with: 42 | pip install language_data 43 | Or as an optional feature of langcodes: 44 | pip install langcodes[data] 45 | """ 46 | 47 | 48 | class Language: 49 | """ 50 | The Language class defines the results of parsing a language tag. 51 | Language objects have the following attributes, any of which may be 52 | unspecified (in which case their value is None): 53 | 54 | - *language*: the code for the language itself. 55 | - *script*: the 4-letter code for the writing system being used. 56 | - *territory*: the 2-letter or 3-digit code for the country or similar territory 57 | of the world whose usage of the language appears in this text. 58 | - *extlangs*: a list of more specific language codes that follow the language 59 | code. (This is allowed by the language code syntax, but deprecated.) 60 | - *variants*: codes for specific variations of language usage that aren't 61 | covered by the *script* or *territory* codes. 62 | - *extensions*: information that's attached to the language code for use in 63 | some specific system, such as Unicode collation orders. 64 | - *private*: a code starting with `x-` that has no defined meaning. 65 | 66 | The `Language.get` method converts a string to a Language instance. 67 | It's also available at the top level of this module as the `get` function. 68 | """ 69 | 70 | ATTRIBUTES = [ 71 | 'language', 72 | 'extlangs', 73 | 'script', 74 | 'territory', 75 | 'variants', 76 | 'extensions', 77 | 'private', 78 | ] 79 | 80 | # When looking up "likely subtags" data, we try looking up the data for 81 | # increasingly less specific versions of the language code. 82 | BROADER_KEYSETS = [ 83 | {'language', 'script', 'territory'}, 84 | {'language', 'territory'}, 85 | {'language', 'script'}, 86 | {'language'}, 87 | {'script'}, 88 | {}, 89 | ] 90 | 91 | MATCHABLE_KEYSETS = [ 92 | {'language', 'script', 'territory'}, 93 | {'language', 'script'}, 94 | {'language'}, 95 | ] 96 | 97 | # Values cached at the class level 98 | _INSTANCES: Dict[tuple, 'Language'] = {} 99 | _PARSE_CACHE: Dict[Tuple[str, bool], 'Language'] = {} 100 | 101 | def __init__( 102 | self, 103 | language: Optional[str] = None, 104 | extlangs: Optional[Sequence[str]] = None, 105 | script: Optional[str] = None, 106 | territory: Optional[str] = None, 107 | variants: Optional[Sequence[str]] = None, 108 | extensions: Optional[Sequence[str]] = None, 109 | private: Optional[str] = None, 110 | ): 111 | """ 112 | The constructor for Language objects. 113 | 114 | It's inefficient to call this directly, because it can't return 115 | an existing instance. Instead, call Language.make(), which 116 | has the same signature. 
117 | """ 118 | self.language = language 119 | self.extlangs = extlangs 120 | self.script = script 121 | self.territory = territory 122 | self.variants = variants 123 | self.extensions = extensions 124 | self.private = private 125 | 126 | # Cached values 127 | self._simplified: 'Language' = None 128 | self._searchable: 'Language' = None 129 | self._broader: List[str] = None 130 | self._assumed: 'Language' = None 131 | self._filled: 'Language' = None 132 | self._macrolanguage: Optional['Language'] = None 133 | self._str_tag: str = None 134 | self._dict: dict = None 135 | self._disp_separator: str = None 136 | self._disp_pattern: str = None 137 | 138 | # Make sure the str_tag value is cached 139 | self.to_tag() 140 | 141 | @classmethod 142 | def make( 143 | cls, 144 | language: Optional[str] = None, 145 | extlangs: Optional[Sequence[str]] = None, 146 | script: Optional[str] = None, 147 | territory: Optional[str] = None, 148 | variants: Optional[Sequence[str]] = None, 149 | extensions: Optional[Sequence[str]] = None, 150 | private: Optional[str] = None, 151 | ) -> 'Language': 152 | """ 153 | Create a Language object by giving any subset of its attributes. 154 | 155 | If this value has been created before, return the existing value. 156 | """ 157 | values = ( 158 | language, 159 | tuple(extlangs or ()), 160 | script, 161 | territory, 162 | tuple(variants or ()), 163 | tuple(extensions or ()), 164 | private, 165 | ) 166 | if values in cls._INSTANCES: 167 | return cls._INSTANCES[values] 168 | 169 | instance = cls( 170 | language=language, 171 | extlangs=extlangs, 172 | script=script, 173 | territory=territory, 174 | variants=variants, 175 | extensions=extensions, 176 | private=private, 177 | ) 178 | cls._INSTANCES[values] = instance 179 | return instance 180 | 181 | @staticmethod 182 | def get(tag: Union[str, 'Language'], normalize=True) -> 'Language': 183 | """ 184 | Create a Language object from a language tag string. 185 | 186 | If normalize=True, non-standard or overlong tags will be replaced as 187 | they're interpreted. This is recommended. 188 | 189 | Here are several examples of language codes, which are also test cases. 190 | Most language codes are straightforward, but these examples will get 191 | pretty obscure toward the end. 192 | 193 | >>> Language.get('en-US') 194 | Language.make(language='en', territory='US') 195 | 196 | >>> Language.get('zh-Hant') 197 | Language.make(language='zh', script='Hant') 198 | 199 | >>> Language.get('und') 200 | Language.make() 201 | 202 | This function is idempotent, in case you already have a Language object: 203 | 204 | >>> Language.get(Language.get('en-us')) 205 | Language.make(language='en', territory='US') 206 | 207 | The non-code 'root' is sometimes used to represent the lack of any 208 | language information, similar to 'und'. 209 | 210 | >>> Language.get('root') 211 | Language.make() 212 | 213 | By default, getting a Language object will automatically convert 214 | deprecated tags: 215 | 216 | >>> Language.get('iw') 217 | Language.make(language='he') 218 | 219 | >>> Language.get('in') 220 | Language.make(language='id') 221 | 222 | One type of deprecated tag that should be replaced is for sign 223 | languages, which used to all be coded as regional variants of a 224 | fictitious global sign language called 'sgn'. Of course, there is no 225 | global sign language, so sign languages now have their own language 226 | codes. 
227 | 228 | >>> Language.get('sgn-US') 229 | Language.make(language='ase') 230 | 231 | >>> Language.get('sgn-US', normalize=False) 232 | Language.make(language='sgn', territory='US') 233 | 234 | 'en-gb-oed' is a tag that's grandfathered into the standard because it 235 | has been used to mean "spell-check this with Oxford English Dictionary 236 | spelling", but that tag has the wrong shape. We interpret this as the 237 | new standardized tag 'en-gb-oxendict', unless asked not to normalize. 238 | 239 | >>> Language.get('en-gb-oed') 240 | Language.make(language='en', territory='GB', variants=['oxendict']) 241 | 242 | >>> Language.get('en-gb-oed', normalize=False) 243 | Language.make(language='en-gb-oed') 244 | 245 | 'zh-min-nan' is another oddly-formed tag, used to represent the 246 | Southern Min language, which includes Taiwanese as a regional form. It 247 | now has its own language code. 248 | 249 | >>> Language.get('zh-min-nan') 250 | Language.make(language='nan') 251 | 252 | The vague tag 'zh-min' is now also interpreted as 'nan', with a private 253 | extension indicating that it had a different form: 254 | 255 | >>> Language.get('zh-min') 256 | Language.make(language='nan', private='x-zh-min') 257 | 258 | Occasionally Wiktionary will use 'extlang' tags in strange ways, such 259 | as using the tag 'und-ibe' for some unspecified Iberian language. 260 | 261 | >>> Language.get('und-ibe') 262 | Language.make(extlangs=['ibe']) 263 | 264 | Here's an example of replacing multiple deprecated tags. 265 | 266 | The language tag 'sh' (Serbo-Croatian) ended up being politically 267 | problematic, and different standards took different steps to address 268 | this. The IANA made it into a macrolanguage that contains 'sr', 'hr', 269 | and 'bs'. Unicode further decided that it's a legacy tag that should 270 | be interpreted as 'sr-Latn', which the language matching rules say 271 | is mutually intelligible with all those languages. 272 | 273 | We complicate the example by adding on the territory tag 'QU', an old 274 | provisional tag for the European Union, which is now standardized as 275 | 'EU'. 276 | 277 | >>> Language.get('sh-QU') 278 | Language.make(language='sr', script='Latn', territory='EU') 279 | """ 280 | if isinstance(tag, Language): 281 | if not normalize: 282 | # shortcut: we have the tag already 283 | return tag 284 | 285 | # We might need to normalize this tag. Convert it back into a 286 | # string tag, to cover all the edge cases of normalization in a 287 | # way that we've already solved. 288 | tag = tag.to_tag() 289 | 290 | if (tag, normalize) in Language._PARSE_CACHE: 291 | return Language._PARSE_CACHE[tag, normalize] 292 | 293 | data: Dict[str, Any] = {} 294 | 295 | # If the complete tag appears as something to normalize, do the 296 | # normalization right away. Smash case and convert underscores to 297 | # hyphens when checking, because the case normalization that comes from 298 | # parse_tag() hasn't been applied yet. 
299 | 300 | tag_lower = normalize_characters(tag) 301 | if normalize and tag_lower in LANGUAGE_REPLACEMENTS: 302 | tag = LANGUAGE_REPLACEMENTS[tag_lower] 303 | 304 | components = parse_tag(tag) 305 | 306 | for typ, value in components: 307 | if typ == 'extlang' and normalize and 'language' in data: 308 | # smash extlangs when possible 309 | minitag = f"{data['language']}-{value}" 310 | norm = LANGUAGE_REPLACEMENTS.get(normalize_characters(minitag)) 311 | if norm is not None: 312 | data.update(Language.get(norm, normalize).to_dict()) 313 | else: 314 | data.setdefault('extlangs', []).append(value) 315 | elif typ in {'extlang', 'variant', 'extension'}: 316 | data.setdefault(typ + 's', []).append(value) 317 | elif typ == 'language': 318 | if value == 'und': 319 | pass 320 | elif normalize: 321 | replacement = LANGUAGE_REPLACEMENTS.get(value.lower()) 322 | if replacement is not None: 323 | # parse the replacement if necessary -- this helps with 324 | # Serbian and Moldovan 325 | data.update(Language.get(replacement, normalize).to_dict()) 326 | else: 327 | data['language'] = value 328 | else: 329 | data['language'] = value 330 | elif typ == 'territory': 331 | if normalize: 332 | data['territory'] = TERRITORY_REPLACEMENTS.get(value.lower(), value) 333 | else: 334 | data['territory'] = value 335 | elif typ == 'grandfathered': 336 | # If we got here, we got a grandfathered tag but we were asked 337 | # not to normalize it, or the CLDR data doesn't know how to 338 | # normalize it. The best we can do is set the entire tag as the 339 | # language. 340 | data['language'] = value 341 | else: 342 | data[typ] = value 343 | 344 | result = Language.make(**data) 345 | Language._PARSE_CACHE[tag, normalize] = result 346 | return result 347 | 348 | def to_tag(self) -> str: 349 | """ 350 | Convert a Language back to a standard language tag, as a string. 351 | This is also the str() representation of a Language object. 352 | 353 | >>> Language.make(language='en', territory='GB').to_tag() 354 | 'en-GB' 355 | 356 | >>> Language.make(language='yue', script='Hant', territory='HK').to_tag() 357 | 'yue-Hant-HK' 358 | 359 | >>> Language.make(script='Arab').to_tag() 360 | 'und-Arab' 361 | 362 | >>> str(Language.make(territory='IN')) 363 | 'und-IN' 364 | """ 365 | if self._str_tag is not None: 366 | return self._str_tag 367 | subtags = ['und'] 368 | if self.language: 369 | subtags[0] = self.language 370 | if self.extlangs: 371 | for extlang in sorted(self.extlangs): 372 | subtags.append(extlang) 373 | if self.script: 374 | subtags.append(self.script) 375 | if self.territory: 376 | subtags.append(self.territory) 377 | if self.variants: 378 | for variant in sorted(self.variants): 379 | subtags.append(variant) 380 | if self.extensions: 381 | for ext in self.extensions: 382 | subtags.append(ext) 383 | if self.private: 384 | subtags.append(self.private) 385 | self._str_tag = '-'.join(subtags) 386 | return self._str_tag 387 | 388 | def simplify_script(self) -> 'Language': 389 | """ 390 | Remove the script from some parsed language data, if the script is 391 | redundant with the language. 
392 | 393 | >>> Language.make(language='en', script='Latn').simplify_script() 394 | Language.make(language='en') 395 | 396 | >>> Language.make(language='yi', script='Latn').simplify_script() 397 | Language.make(language='yi', script='Latn') 398 | 399 | >>> Language.make(language='yi', script='Hebr').simplify_script() 400 | Language.make(language='yi') 401 | """ 402 | if self._simplified is not None: 403 | return self._simplified 404 | 405 | if self.language and self.script: 406 | if DEFAULT_SCRIPTS.get(self.language) == self.script: 407 | result = self.update_dict({'script': None}) 408 | self._simplified = result 409 | return self._simplified 410 | 411 | self._simplified = self 412 | return self._simplified 413 | 414 | def assume_script(self) -> 'Language': 415 | """ 416 | Fill in the script if it's missing, and if it can be assumed from the 417 | language subtag. This is the opposite of `simplify_script`. 418 | 419 | >>> Language.make(language='en').assume_script() 420 | Language.make(language='en', script='Latn') 421 | 422 | >>> Language.make(language='yi').assume_script() 423 | Language.make(language='yi', script='Hebr') 424 | 425 | >>> Language.make(language='yi', script='Latn').assume_script() 426 | Language.make(language='yi', script='Latn') 427 | 428 | This fills in nothing when the script cannot be assumed -- such as when 429 | the language has multiple scripts, or it has no standard orthography: 430 | 431 | >>> Language.make(language='sr').assume_script() 432 | Language.make(language='sr') 433 | 434 | >>> Language.make(language='eee').assume_script() 435 | Language.make(language='eee') 436 | 437 | It also doesn't fill anything in when the language is unspecified. 438 | 439 | >>> Language.make(territory='US').assume_script() 440 | Language.make(territory='US') 441 | """ 442 | if self._assumed is not None: 443 | return self._assumed 444 | if self.language and not self.script: 445 | try: 446 | self._assumed = self.update_dict( 447 | {'script': DEFAULT_SCRIPTS[self.language]} 448 | ) 449 | except KeyError: 450 | self._assumed = self 451 | else: 452 | self._assumed = self 453 | return self._assumed 454 | 455 | def prefer_macrolanguage(self) -> 'Language': 456 | """ 457 | BCP 47 doesn't specify what to do with macrolanguages and the languages 458 | they contain. The Unicode CLDR, on the other hand, says that when a 459 | macrolanguage has a dominant standardized language, the macrolanguage 460 | code should be used for that language. For example, Mandarin Chinese 461 | is 'zh', not 'cmn', according to Unicode, and Malay is 'ms', not 'zsm'. 462 | 463 | This isn't a rule you'd want to follow in all cases -- for example, you may 464 | want to be able to specifically say that 'ms' (the Malay macrolanguage) 465 | contains both 'zsm' (Standard Malay) and 'id' (Indonesian). But applying 466 | this rule helps when interoperating with the Unicode CLDR. 467 | 468 | So, applying `prefer_macrolanguage` to a Language object will 469 | return a new object, replacing the language with the macrolanguage if 470 | it is the dominant language within that macrolanguage. It will leave 471 | non-dominant languages that have macrolanguages alone. 
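
        For example (an informal sketch; the values come from the doctests in
        this module), this method combines naturally with `simplify_script`
        and `assume_script` when canonicalizing a tag by hand:

            Language.get('cmn-Hant').prefer_macrolanguage()  # -> zh-Hant
            Language.get('yi').assume_script()               # -> yi-Hebr
            Language.get('en-Latn').simplify_script()        # -> en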
472 | 473 | >>> Language.get('arb').prefer_macrolanguage() 474 | Language.make(language='ar') 475 | 476 | >>> Language.get('cmn-Hant').prefer_macrolanguage() 477 | Language.make(language='zh', script='Hant') 478 | 479 | >>> Language.get('yue-Hant').prefer_macrolanguage() 480 | Language.make(language='yue', script='Hant') 481 | """ 482 | if self._macrolanguage is not None: 483 | return self._macrolanguage 484 | language = self.language or 'und' 485 | if language in NORMALIZED_MACROLANGUAGES: 486 | self._macrolanguage = self.update_dict( 487 | {'language': NORMALIZED_MACROLANGUAGES[language]} 488 | ) 489 | else: 490 | self._macrolanguage = self 491 | return self._macrolanguage 492 | 493 | def to_alpha3(self, variant: str = 'T') -> str: 494 | """ 495 | Get the three-letter language code for this language, even if it's 496 | canonically written with a two-letter code. 497 | 498 | These codes are the 'alpha3' codes defined by ISO 639-2. 499 | 500 | When this function returns, it always returns a 3-letter string. If 501 | there is no known alpha3 code for the language, it raises a LookupError. 502 | 503 | In cases where the distinction matters, we default to the 'terminology' 504 | code. You can pass `variant='B'` to get the 'bibliographic' code instead. 505 | For example, the terminology code for German is 'deu', while the 506 | bibliographic code is 'ger'. 507 | 508 | (The confusion between these two sets of codes is a good reason to avoid 509 | using alpha3 codes. Every language that has two different alpha3 codes 510 | also has an alpha2 code that's preferred, such as 'de' for German.) 511 | 512 | >>> Language.get('fr').to_alpha3() 513 | 'fra' 514 | >>> Language.get('fr-CA').to_alpha3() 515 | 'fra' 516 | >>> Language.get('fr').to_alpha3(variant='B') 517 | 'fre' 518 | >>> Language.get('de').to_alpha3(variant='T') 519 | 'deu' 520 | >>> Language.get('ja').to_alpha3() 521 | 'jpn' 522 | >>> Language.get('un').to_alpha3() 523 | Traceback (most recent call last): 524 | ... 525 | LookupError: 'un' is not a known language code, and has no alpha3 code. 526 | 527 | 528 | All valid two-letter language codes have corresponding alpha3 codes, 529 | even the un-normalized ones. If they were assigned an alpha3 code by ISO 530 | before they were assigned a normalized code by CLDR, these codes may be 531 | different: 532 | 533 | >>> Language.get('tl', normalize=False).to_alpha3() 534 | 'tgl' 535 | >>> Language.get('tl').to_alpha3() 536 | 'fil' 537 | >>> Language.get('sh', normalize=False).to_alpha3() 538 | 'hbs' 539 | 540 | 541 | Three-letter codes are preserved, even if they're unknown: 542 | 543 | >>> Language.get('qqq').to_alpha3() 544 | 'qqq' 545 | >>> Language.get('und').to_alpha3() 546 | 'und' 547 | """ 548 | variant = variant.upper() 549 | if variant not in 'BT': 550 | raise ValueError("Variant must be 'B' or 'T'") 551 | 552 | language = self.language 553 | if language is None: 554 | return 'und' 555 | elif len(language) == 3: 556 | return language 557 | else: 558 | if variant == 'B' and language in LANGUAGE_ALPHA3_BIBLIOGRAPHIC: 559 | return LANGUAGE_ALPHA3_BIBLIOGRAPHIC[language] 560 | elif language in LANGUAGE_ALPHA3: 561 | return LANGUAGE_ALPHA3[language] 562 | else: 563 | raise LookupError( 564 | f"{language!r} is not a known language code, " 565 | "and has no alpha3 code." 566 | ) 567 | 568 | def broader_tags(self) -> List[str]: 569 | """ 570 | Iterate through increasingly general tags for this language. 
571 | 572 | This isn't actually that useful for matching two arbitrary language tags 573 | against each other, but it is useful for matching them against a known 574 | standardized form, such as in the CLDR data. 575 | 576 | The list of broader versions to try appears in UTR 35, section 4.3, 577 | "Likely Subtags". 578 | 579 | >>> Language.get('nn-Latn-NO-x-thingy').broader_tags() 580 | ['nn-Latn-NO-x-thingy', 'nn-Latn-NO', 'nn-NO', 'nn-Latn', 'nn', 'und-Latn', 'und'] 581 | 582 | >>> Language.get('arb-Arab').broader_tags() 583 | ['arb-Arab', 'ar-Arab', 'arb', 'ar', 'und-Arab', 'und'] 584 | """ 585 | if self._broader is not None: 586 | return self._broader 587 | self._broader = [self.to_tag()] 588 | seen = set([self.to_tag()]) 589 | for keyset in self.BROADER_KEYSETS: 590 | for start_language in (self, self.prefer_macrolanguage()): 591 | filtered = start_language._filter_attributes(keyset) 592 | tag = filtered.to_tag() 593 | if tag not in seen: 594 | self._broader.append(tag) 595 | seen.add(tag) 596 | return self._broader 597 | 598 | def broaden(self) -> 'List[Language]': 599 | """ 600 | Like `broader_tags`, but returrns Language objects instead of strings. 601 | """ 602 | return [Language.get(tag) for tag in self.broader_tags()] 603 | 604 | def maximize(self) -> 'Language': 605 | """ 606 | The Unicode CLDR contains a "likelySubtags" data file, which can guess 607 | reasonable values for fields that are missing from a language tag. 608 | 609 | This is particularly useful for comparing, for example, "zh-Hant" and 610 | "zh-TW", two common language tags that say approximately the same thing 611 | via rather different information. (Using traditional Han characters is 612 | not the same as being in Taiwan, but each implies that the other is 613 | likely.) 614 | 615 | These implications are provided in the CLDR supplemental data, and are 616 | based on the likelihood of people using the language to transmit text 617 | on the Internet. (This is why the overall default is English, not 618 | Chinese.) 619 | 620 | It's important to recognize that these tags amplify majorities, and 621 | that not all language support fits into a "likely" language tag. 622 | 623 | >>> str(Language.get('zh-Hant').maximize()) 624 | 'zh-Hant-TW' 625 | >>> str(Language.get('zh-TW').maximize()) 626 | 'zh-Hant-TW' 627 | >>> str(Language.get('ja').maximize()) 628 | 'ja-Jpan-JP' 629 | >>> str(Language.get('pt').maximize()) 630 | 'pt-Latn-BR' 631 | >>> str(Language.get('und-Arab').maximize()) 632 | 'ar-Arab-EG' 633 | >>> str(Language.get('und-CH').maximize()) 634 | 'de-Latn-CH' 635 | 636 | As many standards are, this is US-centric: 637 | 638 | >>> str(Language.make().maximize()) 639 | 'en-Latn-US' 640 | 641 | "Extlangs" have no likely-subtags information, so they will give 642 | maximized results that make no sense: 643 | 644 | >>> str(Language.get('und-ibe').maximize()) 645 | 'en-ibe-Latn-US' 646 | """ 647 | if self._filled is not None: 648 | return self._filled 649 | 650 | for tag in self.broader_tags(): 651 | if tag in LIKELY_SUBTAGS: 652 | result = Language.get(LIKELY_SUBTAGS[tag], normalize=False) 653 | result = result.update(self) 654 | self._filled = result 655 | return result 656 | 657 | raise RuntimeError( 658 | "Couldn't fill in likely values. This represents a problem with " 659 | "the LIKELY_SUBTAGS data." 
660 | ) 661 | 662 | # Support an old, wordier name for the method 663 | fill_likely_values = maximize 664 | 665 | def match_score(self, supported: 'Language') -> int: 666 | """ 667 | DEPRECATED: use .distance() instead, which uses newer data and is _lower_ 668 | for better matching languages. 669 | """ 670 | warnings.warn( 671 | "`match_score` is deprecated because it's based on deprecated CLDR info. " 672 | "Use `distance` instead, which is _lower_ for better matching languages. ", 673 | DeprecationWarning, 674 | ) 675 | return 100 - min(self.distance(supported), 100) 676 | 677 | def distance(self, supported: 'Language', ignore_script: bool = False) -> int: 678 | """ 679 | Suppose that `self` is the language that the user desires, and 680 | `supported` is a language that is actually supported. 681 | 682 | This method returns a number from 0 to 134 measuring the 'distance' 683 | between the languages (lower numbers are better). This is not a 684 | symmetric relation. If `ignore_script` is `True`, the script will 685 | not be used in the comparison, possibly resulting in a smaller 686 | 'distance'. 687 | 688 | The language distance is not really about the linguistic similarity or 689 | history of the languages; instead, it's based largely on sociopolitical 690 | factors, indicating which language speakers are likely to know which 691 | other languages in the present world. Much of the heuristic is about 692 | finding a widespread 'world language' like English, Chinese, French, or 693 | Russian that speakers of a more localized language will accept. 694 | 695 | A version that works on language tags, as strings, is in the function 696 | `tag_distance`. See that function for copious examples. 697 | """ 698 | if supported == self: 699 | return 0 700 | 701 | # CLDR has realized that these matching rules are undermined when the 702 | # unspecified language 'und' gets maximized to 'en-Latn-US', so this case 703 | # is specifically not maximized: 704 | if self.language is None and self.script is None and self.territory is None: 705 | desired_triple = ('und', 'Zzzz', 'ZZ') 706 | else: 707 | desired_complete = self.prefer_macrolanguage().maximize() 708 | 709 | desired_triple = ( 710 | desired_complete.language, 711 | None if ignore_script else desired_complete.script, 712 | desired_complete.territory, 713 | ) 714 | 715 | if ( 716 | supported.language is None 717 | and supported.script is None 718 | and supported.territory is None 719 | ): 720 | supported_triple = ('und', 'Zzzz', 'ZZ') 721 | else: 722 | supported_complete = supported.prefer_macrolanguage().maximize() 723 | 724 | supported_triple = ( 725 | supported_complete.language, 726 | None if ignore_script else supported_complete.script, 727 | supported_complete.territory, 728 | ) 729 | 730 | return tuple_distance_cached(desired_triple, supported_triple) 731 | 732 | def is_valid(self) -> bool: 733 | """ 734 | Checks whether the language, script, territory, and variants 735 | (if present) are all tags that have meanings assigned by IANA. 736 | For example, 'ja' (Japanese) is a valid tag, and 'jp' is not. 737 | 738 | The data is current as of CLDR 40. 
739 | 740 | >>> Language.get('ja').is_valid() 741 | True 742 | >>> Language.get('jp').is_valid() 743 | False 744 | >>> Language.get('en-001').is_valid() 745 | True 746 | >>> Language.get('en-000').is_valid() 747 | False 748 | >>> Language.get('en-Latn').is_valid() 749 | True 750 | >>> Language.get('en-Latnx').is_valid() 751 | False 752 | >>> Language.get('und').is_valid() 753 | True 754 | >>> Language.get('en-GB-oxendict').is_valid() 755 | True 756 | >>> Language.get('en-GB-oxenfree').is_valid() 757 | False 758 | >>> Language.get('x-heptapod').is_valid() 759 | True 760 | 761 | Some scripts are, confusingly, not included in CLDR's 'validity' pattern. 762 | If a script appears in the IANA registry, we consider it valid. 763 | 764 | >>> Language.get('ur-Aran').is_valid() 765 | True 766 | >>> Language.get('cu-Cyrs').is_valid() 767 | True 768 | 769 | A language tag with multiple extlangs will parse, but is not valid. 770 | The only allowed example is 'zh-min-nan', which normalizes to the 771 | language 'nan'. 772 | 773 | >>> Language.get('zh-min-nan').is_valid() 774 | True 775 | >>> Language.get('sgn-ase-bfi').is_valid() 776 | False 777 | 778 | These examples check that duplicate tags are not valid: 779 | 780 | >>> Language.get('de-1901').is_valid() 781 | True 782 | >>> Language.get('de-1901-1901').is_valid() 783 | False 784 | >>> Language.get('en-a-bbb-c-ddd').is_valid() 785 | True 786 | >>> Language.get('en-a-bbb-a-ddd').is_valid() 787 | False 788 | 789 | Of course, you should be prepared to catch a failure to parse the 790 | language code at all: 791 | 792 | >>> Language.get('C').is_valid() 793 | Traceback (most recent call last): 794 | ... 795 | langcodes.tag_parser.LanguageTagError: Expected a language code, got 'c' 796 | """ 797 | if self.extlangs is not None: 798 | # An erratum to BCP 47 says that tags with more than one extlang are 799 | # invalid. 800 | if len(self.extlangs) > 1: 801 | return False 802 | 803 | subtags = [self.language, self.script, self.territory] 804 | checked_subtags = [] 805 | if self.variants is not None: 806 | subtags.extend(self.variants) 807 | for subtag in subtags: 808 | if subtag is not None: 809 | checked_subtags.append(subtag) 810 | if not subtag.startswith('x-') and not VALIDITY.match(subtag): 811 | if subtag not in ALL_SCRIPTS: 812 | return False 813 | 814 | # We check extensions for validity by ensuring that there aren't 815 | # two extensions introduced by the same letter. For example, you can't 816 | # have two 'u-' extensions. 817 | if self.extensions: 818 | checked_subtags.extend([extension[:2] for extension in self.extensions]) 819 | if len(set(checked_subtags)) != len(checked_subtags): 820 | return False 821 | return True 822 | 823 | def has_name_data(self) -> bool: 824 | """ 825 | Return True when we can name languages in this language. Requires 826 | `language_data` to be installed. 827 | 828 | This is true when the language, or one of its 'broader' versions, is in 829 | the list of CLDR target languages. 
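
        For example, a tag like 'en-AU' has name data because its broader tags
        include plain 'en', which is a CLDR target language (an illustrative
        sketch; see `broader_tags` above):

            Language.get('en-AU').has_name_data()   # True, via the broader tag 'en'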
830 | 831 | >>> Language.get('fr').has_name_data() 832 | True 833 | >>> Language.get('so').has_name_data() 834 | True 835 | >>> Language.get('enc').has_name_data() 836 | False 837 | >>> Language.get('und').has_name_data() 838 | False 839 | """ 840 | try: 841 | from language_data.name_data import LANGUAGES_WITH_NAME_DATA 842 | except ImportError: 843 | print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) 844 | raise 845 | 846 | matches = set(self.broader_tags()) & LANGUAGES_WITH_NAME_DATA 847 | return bool(matches) 848 | 849 | # These methods help to show what the language tag means in natural 850 | # language. They actually apply the language-matching algorithm to find 851 | # the right language to name things in. 852 | 853 | def _get_name( 854 | self, attribute: str, language: Union[str, 'Language'], max_distance: int 855 | ) -> str: 856 | try: 857 | from language_data.names import code_to_names 858 | except ImportError: 859 | print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) 860 | raise 861 | 862 | assert attribute in self.ATTRIBUTES 863 | if isinstance(language, str): 864 | language = Language.get(language) 865 | 866 | attr_value = getattr(self, attribute) 867 | if attr_value is None: 868 | if attribute == 'language': 869 | attr_value = 'und' 870 | else: 871 | return None 872 | names = code_to_names(attr_value) 873 | 874 | result = self._best_name(names, language, max_distance) 875 | if result is not None: 876 | return result 877 | else: 878 | # Construct a string like "Unknown language [zzz]" 879 | placeholder = None 880 | if attribute == 'language': 881 | placeholder = 'und' 882 | elif attribute == 'script': 883 | placeholder = 'Zzzz' 884 | elif attribute == 'territory': 885 | placeholder = 'ZZ' 886 | 887 | unknown_name = None 888 | if placeholder is not None: 889 | names = code_to_names(placeholder) 890 | unknown_name = self._best_name(names, language, max_distance) 891 | if unknown_name is None: 892 | unknown_name = 'Unknown language subtag' 893 | return f'{unknown_name} [{attr_value}]' 894 | 895 | def _best_name( 896 | self, names: Mapping[str, str], language: 'Language', max_distance: int 897 | ): 898 | matchable_languages = set(language.broader_tags()) 899 | possible_languages = [ 900 | key for key in sorted(names.keys()) if key in matchable_languages 901 | ] 902 | 903 | target_language, score = closest_match( 904 | language, possible_languages, max_distance 905 | ) 906 | if target_language in names: 907 | return names[target_language] 908 | else: 909 | return names.get(DEFAULT_LANGUAGE) 910 | 911 | def language_name( 912 | self, 913 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 914 | max_distance: int = 25, 915 | ) -> str: 916 | """ 917 | Give the name of the language (not the entire tag, just the language part) 918 | in a natural language. The target language can be given as a string or 919 | another Language object. 920 | 921 | By default, things are named in English: 922 | 923 | >>> Language.get('fr').language_name() 924 | 'French' 925 | >>> Language.get('el').language_name() 926 | 'Greek' 927 | 928 | But you can ask for language names in numerous other languages: 929 | 930 | >>> Language.get('fr').language_name('fr') 931 | 'français' 932 | >>> Language.get('el').language_name('fr') 933 | 'grec' 934 | 935 | Why does everyone get Slovak and Slovenian confused? Let's ask them. 
936 | 937 | >>> Language.get('sl').language_name('sl') 938 | 'slovenščina' 939 | >>> Language.get('sk').language_name('sk') 940 | 'slovenčina' 941 | >>> Language.get('sl').language_name('sk') 942 | 'slovinčina' 943 | >>> Language.get('sk').language_name('sl') 944 | 'slovaščina' 945 | """ 946 | return self._get_name('language', language, max_distance) 947 | 948 | def display_name( 949 | self, 950 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 951 | max_distance: int = 25, 952 | ) -> str: 953 | """ 954 | It's often helpful to be able to describe a language code in a way that a user 955 | (or you) can understand, instead of in inscrutable short codes. The 956 | `display_name` method lets you describe a Language object *in a language*. 957 | 958 | The `.display_name(language, min_score)` method will look up the name of the 959 | language. The names come from the IANA language tag registry, which is only in 960 | English, plus CLDR, which names languages in many commonly-used languages. 961 | 962 | The default language for naming things is English: 963 | 964 | >>> Language.make(language='fr').display_name() 965 | 'French' 966 | 967 | >>> Language.make().display_name() 968 | 'Unknown language' 969 | 970 | >>> Language.get('zh-Hans').display_name() 971 | 'Chinese (Simplified)' 972 | 973 | >>> Language.get('en-US').display_name() 974 | 'English (United States)' 975 | 976 | But you can ask for language names in numerous other languages: 977 | 978 | >>> Language.get('fr').display_name('fr') 979 | 'français' 980 | 981 | >>> Language.get('fr').display_name('es') 982 | 'francés' 983 | 984 | >>> Language.make().display_name('es') 985 | 'lengua desconocida' 986 | 987 | >>> Language.get('zh-Hans').display_name('de') 988 | 'Chinesisch (Vereinfacht)' 989 | 990 | >>> Language.get('en-US').display_name('zh-Hans') 991 | '英语(美国)' 992 | """ 993 | reduced = self.simplify_script() 994 | language = Language.get(language) 995 | language_name = reduced.language_name(language, max_distance) 996 | extra_parts = [] 997 | 998 | if reduced.script is not None: 999 | extra_parts.append(reduced.script_name(language, max_distance)) 1000 | if reduced.territory is not None: 1001 | extra_parts.append(reduced.territory_name(language, max_distance)) 1002 | 1003 | if extra_parts: 1004 | clarification = language._display_separator().join(extra_parts) 1005 | pattern = language._display_pattern() 1006 | return pattern.format(language_name, clarification) 1007 | else: 1008 | return language_name 1009 | 1010 | def _display_pattern(self) -> str: 1011 | """ 1012 | Get the pattern, according to CLDR, that should be used for clarifying 1013 | details of a language code. 1014 | """ 1015 | # Technically we are supposed to look up this pattern in each language. 1016 | # Practically, it's the same in every language except Chinese, where the 1017 | # parentheses are full-width. 1018 | if self._disp_pattern is not None: 1019 | return self._disp_pattern 1020 | if self.distance(Language.get('zh')) <= 25 or self.distance(Language.get('zh-Hant')) <= 25: 1021 | self._disp_pattern = "{0}({1})" 1022 | else: 1023 | self._disp_pattern = "{0} ({1})" 1024 | return self._disp_pattern 1025 | 1026 | def _display_separator(self) -> str: 1027 | """ 1028 | Get the symbol that should be used to separate multiple clarifying 1029 | details -- such as a comma in English, or an ideographic comma in 1030 | Japanese. 1031 | 1032 | Requires that `language_data` is installed. 
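
        Together with `_display_pattern`, this separator is what assembles
        display names that mention both a script and a territory. As an
        informal sketch of the composition done in `display_name` above:

            clarification = separator.join([script_name, territory_name])
            pattern.format(language_name, clarification)
            # e.g. a name of the general form 'Language (Script, Territory)'
            # with the English separator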
1033 | """ 1034 | try: 1035 | from language_data.names import DISPLAY_SEPARATORS 1036 | except ImportError: 1037 | print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) 1038 | raise 1039 | 1040 | if self._disp_separator is not None: 1041 | return self._disp_separator 1042 | matched, _dist = closest_match(self, DISPLAY_SEPARATORS.keys()) 1043 | self._disp_separator = DISPLAY_SEPARATORS[matched] 1044 | return self._disp_separator 1045 | 1046 | def autonym(self, max_distance: int = 9) -> str: 1047 | """ 1048 | Give the display name of this language *in* this language. 1049 | Requires that `language_data` is installed. 1050 | 1051 | >>> Language.get('fr').autonym() 1052 | 'français' 1053 | >>> Language.get('es').autonym() 1054 | 'español' 1055 | >>> Language.get('ja').autonym() 1056 | '日本語' 1057 | 1058 | This uses the `display_name()` method, so it can include the name of a 1059 | script or territory when appropriate. 1060 | 1061 | >>> Language.get('en-AU').autonym() 1062 | 'English (Australia)' 1063 | >>> Language.get('sr-Latn').autonym() 1064 | 'srpski (latinica)' 1065 | >>> Language.get('sr-Cyrl').autonym() 1066 | 'српски (ћирилица)' 1067 | >>> Language.get('pa').autonym() 1068 | 'ਪੰਜਾਬੀ' 1069 | >>> Language.get('pa-Arab').autonym() 1070 | 'پنجابی (عربی)' 1071 | 1072 | This only works for language codes that CLDR has locale data for. You 1073 | can't ask for the autonym of 'ja-Latn' and get 'nihongo (rōmaji)'. 1074 | """ 1075 | lang = self.prefer_macrolanguage() 1076 | return lang.display_name(language=lang, max_distance=max_distance) 1077 | 1078 | def script_name( 1079 | self, 1080 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 1081 | max_distance: int = 25, 1082 | ) -> str: 1083 | """ 1084 | Describe the script part of the language tag in a natural language. 1085 | Requires that `language_data` is installed. 1086 | """ 1087 | return self._get_name('script', language, max_distance) 1088 | 1089 | def territory_name( 1090 | self, 1091 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 1092 | max_distance: int = 25, 1093 | ) -> str: 1094 | """ 1095 | Describe the territory part of the language tag in a natural language. 1096 | Requires that `language_data` is installed. 1097 | """ 1098 | return self._get_name('territory', language, max_distance) 1099 | 1100 | def region_name( 1101 | self, 1102 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 1103 | max_distance: int = 25, 1104 | ) -> str: 1105 | warnings.warn( 1106 | "`region_name` has been renamed to `territory_name` for consistency", 1107 | DeprecationWarning, 1108 | ) 1109 | return self.territory_name(language, max_distance) 1110 | 1111 | @property 1112 | def region(self): 1113 | warnings.warn( 1114 | "The `region` property has been renamed to `territory` for consistency", 1115 | DeprecationWarning, 1116 | ) 1117 | return self.territory 1118 | 1119 | def variant_names( 1120 | self, 1121 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 1122 | max_distance: int = 25, 1123 | ) -> Sequence[str]: 1124 | """ 1125 | Deprecated in version 3.0. 1126 | 1127 | We don't store names for variants anymore, so this just returns the list 1128 | of variant codes, such as ['oxendict'] for en-GB-oxendict. 
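
        For example (illustrative only):

            Language.get('en-GB-oxendict').variant_names()   # -> ['oxendict']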
1129 | """ 1130 | warnings.warn( 1131 | "variant_names is deprecated and just returns the variant codes", 1132 | DeprecationWarning, 1133 | ) 1134 | return self.variants or [] 1135 | 1136 | def describe( 1137 | self, 1138 | language: Union[str, 'Language'] = DEFAULT_LANGUAGE, 1139 | max_distance: int = 25, 1140 | ) -> dict: 1141 | """ 1142 | Return a dictionary that describes a given language tag in a specified 1143 | natural language. Requires that `language_data` is installed. 1144 | 1145 | See `language_name` and related methods for more specific versions of this. 1146 | 1147 | The desired `language` will in fact be matched against the available 1148 | options using the matching technique that this module provides. We can 1149 | illustrate many aspects of this by asking for a description of Shavian 1150 | script (a phonetic script for English devised by author George Bernard 1151 | Shaw), and where you might find it, in various languages. 1152 | 1153 | >>> shaw = Language.make(script='Shaw').maximize() 1154 | >>> shaw.describe('en') 1155 | {'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'} 1156 | 1157 | >>> shaw.describe('fr') 1158 | {'language': 'anglais', 'script': 'shavien', 'territory': 'Royaume-Uni'} 1159 | 1160 | >>> shaw.describe('es') 1161 | {'language': 'inglés', 'script': 'shaviano', 'territory': 'Reino Unido'} 1162 | 1163 | >>> shaw.describe('pt') 1164 | {'language': 'inglês', 'script': 'shaviano', 'territory': 'Reino Unido'} 1165 | 1166 | >>> shaw.describe('uk') 1167 | {'language': 'англійська', 'script': 'шоу', 'territory': 'Велика Британія'} 1168 | 1169 | >>> shaw.describe('arb') 1170 | {'language': 'الإنجليزية', 'script': 'الشواني', 'territory': 'المملكة المتحدة'} 1171 | 1172 | >>> shaw.describe('th') 1173 | {'language': 'อังกฤษ', 'script': 'ซอเวียน', 'territory': 'สหราชอาณาจักร'} 1174 | 1175 | >>> shaw.describe('zh-Hans') 1176 | {'language': '英语', 'script': '萧伯纳式文', 'territory': '英国'} 1177 | 1178 | >>> shaw.describe('zh-Hant') 1179 | {'language': '英文', 'script': '簫柏納字符', 'territory': '英國'} 1180 | 1181 | >>> shaw.describe('ja') 1182 | {'language': '英語', 'script': 'ショー文字', 'territory': 'イギリス'} 1183 | 1184 | When we don't have a localization for the language, we fall back on English, 1185 | because the IANA provides names for all known codes in English. 1186 | 1187 | >>> shaw.describe('lol') 1188 | {'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'} 1189 | 1190 | When the language tag itself is a valid tag but with no known meaning, we 1191 | say so in the appropriate language. 1192 | 1193 | >>> Language.get('xyz-ZY').display_name() 1194 | 'Unknown language [xyz] (Unknown Region [ZY])' 1195 | 1196 | >>> Language.get('xyz-ZY').display_name('es') 1197 | 'lengua desconocida [xyz] (Región desconocida [ZY])' 1198 | """ 1199 | names = {} 1200 | if self.language: 1201 | names['language'] = self.language_name(language, max_distance) 1202 | if self.script: 1203 | names['script'] = self.script_name(language, max_distance) 1204 | if self.territory: 1205 | names['territory'] = self.territory_name(language, max_distance) 1206 | return names 1207 | 1208 | def speaking_population(self) -> int: 1209 | """ 1210 | Get an estimate of how many people in the world speak this language, 1211 | derived from CLDR data. Requires that `language_data` is installed. 1212 | 1213 | Only the language and territory codes will be considered. 
If a 1214 | territory code is included, the population will count only the 1215 | speakers of the language in that territory. 1216 | 1217 | Script subtags are disregarded, because it doesn't make sense to ask 1218 | how many people speak in a particular writing script. 1219 | 1220 | >>> Language.get('es').speaking_population() 1221 | 493528077 1222 | >>> Language.get('pt').speaking_population() 1223 | 237496885 1224 | >>> Language.get('es-BR').speaking_population() 1225 | 76218 1226 | >>> Language.get('pt-BR').speaking_population() 1227 | 192661560 1228 | >>> Language.get('vo').speaking_population() 1229 | 0 1230 | """ 1231 | try: 1232 | from language_data.population_data import LANGUAGE_SPEAKING_POPULATION 1233 | except ImportError: 1234 | print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) 1235 | raise 1236 | 1237 | lang = self._filter_attributes(['language', 'territory']) 1238 | return LANGUAGE_SPEAKING_POPULATION.get(str(lang), 0) 1239 | 1240 | def writing_population(self) -> int: 1241 | """ 1242 | Get an estimate of how many people in the world read and write 1243 | this language, derived from CLDR data. Requires that `language_data` 1244 | is installed. 1245 | 1246 | For many languages that aren't typically written, this is an 1247 | overestimate, according to CLDR -- the data often includes people who 1248 | speak that language but write in a different language. 1249 | 1250 | Only the language, script, and territory codes will be considered. 1251 | If a territory code is included, the population will count only the 1252 | speakers of the language in that territory. 1253 | 1254 | >>> all = Language.get('zh').writing_population() 1255 | >>> all 1256 | 1240841517 1257 | 1258 | >>> traditional = Language.get('zh-Hant').writing_population() 1259 | >>> traditional 1260 | 36863340 1261 | 1262 | >>> simplified = Language.get('zh-Hans').writing_population() 1263 | >>> all == traditional + simplified 1264 | True 1265 | 1266 | >>> Language.get('zh-Hant-HK').writing_population() 1267 | 6439733 1268 | >>> Language.get('zh-Hans-HK').writing_population() 1269 | 338933 1270 | 1271 | Note that if you want to get the total Chinese writing population 1272 | of Hong Kong, you need to avoid normalization that would interpret 1273 | 'zh-HK' as 'zh-Hant-HK'. 1274 | 1275 | >>> Language.get('zh-HK', normalize=False).writing_population() 1276 | 6778666 1277 | 1278 | Unknown or unspecified language codes get a population of 0. 1279 | 1280 | >>> Language.get('xyz').writing_population() 1281 | 0 1282 | 1283 | >>> Language.get('und').writing_population() 1284 | 0 1285 | """ 1286 | try: 1287 | from language_data.population_data import LANGUAGE_WRITING_POPULATION 1288 | except ImportError: 1289 | print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) 1290 | raise 1291 | 1292 | lang = self._filter_attributes(['language', 'script', 'territory']) 1293 | if str(lang) in LANGUAGE_WRITING_POPULATION: 1294 | return LANGUAGE_WRITING_POPULATION[str(lang)] 1295 | else: 1296 | lang = lang.simplify_script() 1297 | return LANGUAGE_WRITING_POPULATION.get(str(lang), 0) 1298 | 1299 | @staticmethod 1300 | def find_name( 1301 | tagtype: str, name: str, language: Optional[Union[str, 'Language']] = None 1302 | ) -> 'Language': 1303 | """ 1304 | Find the subtag of a particular `tagtype` that has the given `name`. 1305 | Requires that `language_data` is installed. 
1306 | 1307 | The default language, "und", will allow matching names in any language, 1308 | so you can get the code 'fr' by looking up "French", "Français", or 1309 | "francés". 1310 | 1311 | Occasionally, names are ambiguous in a way that can be resolved by 1312 | specifying what name the language is supposed to be in. For example, 1313 | there is a language named 'Malayo' in English, but it's different from 1314 | the language named 'Malayo' in Spanish (which is Malay). Specifying the 1315 | language will look up the name in a trie that is only in that language. 1316 | 1317 | In a previous version, we thought we were going to deprecate the 1318 | `language` parameter, as there weren't significant cases of conflicts 1319 | in names of things between languages. Well, we got more data, and 1320 | conflicts in names are everywhere. 1321 | 1322 | Specifying the language that the name should be in is still not 1323 | required, but it will help to make sure that names can be 1324 | round-tripped. 1325 | 1326 | >>> Language.find_name('language', 'francés') 1327 | Language.make(language='fr') 1328 | 1329 | >>> Language.find_name('territory', 'United Kingdom') 1330 | Language.make(territory='GB') 1331 | 1332 | >>> Language.find_name('script', 'Arabic') 1333 | Language.make(script='Arab') 1334 | 1335 | >>> Language.find_name('language', 'norsk bokmål') 1336 | Language.make(language='nb') 1337 | 1338 | >>> Language.find_name('language', 'norsk') 1339 | Language.make(language='no') 1340 | 1341 | >>> Language.find_name('language', 'norsk', 'en') 1342 | Traceback (most recent call last): 1343 | ... 1344 | LookupError: Can't find any language named 'norsk' 1345 | 1346 | >>> Language.find_name('language', 'norsk', 'no') 1347 | Language.make(language='no') 1348 | 1349 | >>> Language.find_name('language', 'malayo', 'en') 1350 | Language.make(language='mbp') 1351 | 1352 | >>> Language.find_name('language', 'malayo', 'es') 1353 | Language.make(language='ms') 1354 | 1355 | Some language names resolve to more than a language. For example, 1356 | the name 'Brazilian Portuguese' resolves to a language and a territory, 1357 | and 'Simplified Chinese' resolves to a language and a script. In these 1358 | cases, a Language object with multiple subtags will be returned. 1359 | 1360 | >>> Language.find_name('language', 'Brazilian Portuguese', 'en') 1361 | Language.make(language='pt', territory='BR') 1362 | 1363 | >>> Language.find_name('language', 'Simplified Chinese', 'en') 1364 | Language.make(language='zh', script='Hans') 1365 | 1366 | A small amount of fuzzy matching is supported: if the name can be 1367 | shortened to match a single language name, you get that language. 1368 | This allows, for example, "Hakka dialect" to match "Hakka". 
1369 | 1370 | >>> Language.find_name('language', 'Hakka dialect') 1371 | Language.make(language='hak') 1372 | """ 1373 | try: 1374 | from language_data.names import name_to_code 1375 | except ImportError: 1376 | print(LANGUAGE_NAME_IMPORT_MESSAGE, file=sys.stdout) 1377 | raise 1378 | 1379 | # No matter what form of language we got, normalize it to a single 1380 | # language subtag 1381 | if isinstance(language, Language): 1382 | language = language.language 1383 | elif isinstance(language, str): 1384 | language = get(language).language 1385 | if language is None: 1386 | language = 'und' 1387 | 1388 | code = name_to_code(tagtype, name, language) 1389 | if code is None: 1390 | raise LookupError(f"Can't find any {tagtype} named {name!r}") 1391 | if '-' in code: 1392 | return Language.get(code) 1393 | else: 1394 | data = {tagtype: code} 1395 | return Language.make(**data) 1396 | 1397 | @staticmethod 1398 | def find( 1399 | name: str, language: Optional[Union[str, 'Language']] = None 1400 | ) -> 'Language': 1401 | """ 1402 | A concise version of `find_name`, used to get a language tag by its 1403 | name in a natural language. The language can be omitted in the large 1404 | majority of cases, where the language name is not ambiguous. 1405 | 1406 | >>> Language.find('Türkçe') 1407 | Language.make(language='tr') 1408 | >>> Language.find('brazilian portuguese') 1409 | Language.make(language='pt', territory='BR') 1410 | >>> Language.find('simplified chinese') 1411 | Language.make(language='zh', script='Hans') 1412 | 1413 | Some language names are ambiguous: for example, there is a language 1414 | named 'Fala' in English (with code 'fax'), but 'Fala' is also the 1415 | Kwasio word for French. In this case, specifying the language that 1416 | the name is in is necessary for disambiguation. 1417 | 1418 | >>> Language.find('fala') 1419 | Language.make(language='fr') 1420 | >>> Language.find('fala', 'nmg') 1421 | Language.make(language='fr') 1422 | >>> Language.find('fala', 'en') 1423 | Language.make(language='fax') 1424 | """ 1425 | return Language.find_name('language', name, language) 1426 | 1427 | def to_dict(self) -> dict: 1428 | """ 1429 | Get a dictionary of the attributes of this Language object, which 1430 | can be useful for constructing a similar object. 1431 | """ 1432 | if self._dict is not None: 1433 | return self._dict 1434 | 1435 | result = {} 1436 | for key in self.ATTRIBUTES: 1437 | value = getattr(self, key) 1438 | if value: 1439 | result[key] = value 1440 | self._dict = result 1441 | return result 1442 | 1443 | def update(self, other: 'Language') -> 'Language': 1444 | """ 1445 | Update this Language with the fields of another Language. 1446 | """ 1447 | return Language.make( 1448 | language=other.language or self.language, 1449 | extlangs=other.extlangs or self.extlangs, 1450 | script=other.script or self.script, 1451 | territory=other.territory or self.territory, 1452 | variants=other.variants or self.variants, 1453 | extensions=other.extensions or self.extensions, 1454 | private=other.private or self.private, 1455 | ) 1456 | 1457 | def update_dict(self, newdata: dict) -> 'Language': 1458 | """ 1459 | Update the attributes of this Language from a dictionary. 
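
        Only the keys present in `newdata` are changed. For example (an
        illustrative sketch):

            Language.get('en-US').update_dict({'territory': 'GB'})
            # -> Language.make(language='en', territory='GB')

        This is how methods such as `simplify_script` produce a modified copy,
        e.g. `self.update_dict({'script': None})`.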
1460 | """ 1461 | return Language.make( 1462 | language=newdata.get('language', self.language), 1463 | extlangs=newdata.get('extlangs', self.extlangs), 1464 | script=newdata.get('script', self.script), 1465 | territory=newdata.get('territory', self.territory), 1466 | variants=newdata.get('variants', self.variants), 1467 | extensions=newdata.get('extensions', self.extensions), 1468 | private=newdata.get('private', self.private), 1469 | ) 1470 | 1471 | @staticmethod 1472 | def _filter_keys(d: dict, keys: Iterable[str]) -> dict: 1473 | """ 1474 | Select a subset of keys from a dictionary. 1475 | """ 1476 | return {key: d[key] for key in keys if key in d} 1477 | 1478 | def _filter_attributes(self, keyset: Iterable[str]) -> 'Language': 1479 | """ 1480 | Return a copy of this object with a subset of its attributes set. 1481 | """ 1482 | filtered = self._filter_keys(self.to_dict(), keyset) 1483 | return Language.make(**filtered) 1484 | 1485 | def _searchable_form(self) -> 'Language': 1486 | """ 1487 | Convert a parsed language tag so that the information it contains is in 1488 | the best form for looking up information in the CLDR. 1489 | """ 1490 | if self._searchable is not None: 1491 | return self._searchable 1492 | 1493 | self._searchable = ( 1494 | self._filter_attributes({'language', 'script', 'territory'}) 1495 | .simplify_script() 1496 | .prefer_macrolanguage() 1497 | ) 1498 | return self._searchable 1499 | 1500 | def __eq__(self, other): 1501 | if self is other: 1502 | return True 1503 | if not isinstance(other, Language): 1504 | return False 1505 | return self._str_tag == other._str_tag 1506 | 1507 | def __hash__(self) -> int: 1508 | return hash(self._str_tag) 1509 | 1510 | def __getitem__(self, key: str) -> Optional[Union[str, List[str]]]: 1511 | if key in self.ATTRIBUTES: 1512 | return getattr(self, key) 1513 | else: 1514 | raise KeyError(key) 1515 | 1516 | def __contains__(self, key: str) -> bool: 1517 | return key in self.ATTRIBUTES and getattr(self, key) 1518 | 1519 | def __repr__(self) -> str: 1520 | items = [] 1521 | for attr in self.ATTRIBUTES: 1522 | if getattr(self, attr): 1523 | value = getattr(self, attr) 1524 | items.append(f'{attr}={value!r}') 1525 | joined = ', '.join(items) 1526 | return f"Language.make({joined})" 1527 | 1528 | def __str__(self) -> str: 1529 | return self.to_tag() 1530 | 1531 | 1532 | # Make the get(), find(), and find_name() functions available at the top level 1533 | get = Language.get 1534 | find = Language.find 1535 | find_name = Language.find_name 1536 | 1537 | # Make the Language object available under the old name LanguageData 1538 | LanguageData = Language 1539 | 1540 | 1541 | def standardize_tag(tag: Union[str, Language], macro: bool = False) -> str: 1542 | """ 1543 | Standardize a language tag: 1544 | 1545 | - Replace deprecated values with their updated versions (if those exist) 1546 | - Remove script tags that are redundant with the language 1547 | - If *macro* is True, use a macrolanguage to represent the most common 1548 | standardized language within that macrolanguage. For example, 'cmn' 1549 | (Mandarin) becomes 'zh' (Chinese), and 'arb' (Modern Standard Arabic) 1550 | becomes 'ar' (Arabic). 1551 | - Format the result according to the conventions of BCP 47 1552 | 1553 | Macrolanguage replacement is not required by BCP 47, but it is required 1554 | by the Unicode CLDR. 
1555 | 1556 | >>> standardize_tag('en_US') 1557 | 'en-US' 1558 | 1559 | >>> standardize_tag('en-Latn') 1560 | 'en' 1561 | 1562 | >>> standardize_tag('en-uk') 1563 | 'en-GB' 1564 | 1565 | >>> standardize_tag('eng') 1566 | 'en' 1567 | 1568 | >>> standardize_tag('arb-Arab', macro=True) 1569 | 'ar' 1570 | 1571 | >>> standardize_tag('sh-QU') 1572 | 'sr-Latn-EU' 1573 | 1574 | >>> standardize_tag('sgn-US') 1575 | 'ase' 1576 | 1577 | >>> standardize_tag('zh-cmn-hans-cn') 1578 | 'zh-Hans-CN' 1579 | 1580 | >>> standardize_tag('zsm', macro=True) 1581 | 'ms' 1582 | 1583 | >>> standardize_tag('ja-latn-hepburn') 1584 | 'ja-Latn-hepburn' 1585 | 1586 | >>> standardize_tag('spa-latn-mx') 1587 | 'es-MX' 1588 | 1589 | If the tag can't be parsed according to BCP 47, this will raise a 1590 | LanguageTagError (a subclass of ValueError): 1591 | 1592 | >>> standardize_tag('spa-mx-latn') 1593 | Traceback (most recent call last): 1594 | ... 1595 | langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string. 1596 | """ 1597 | langdata = Language.get(tag, normalize=True) 1598 | if macro: 1599 | langdata = langdata.prefer_macrolanguage() 1600 | 1601 | return langdata.simplify_script().to_tag() 1602 | 1603 | 1604 | def tag_is_valid(tag: Union[str, Language]) -> bool: 1605 | """ 1606 | Determines whether a string is a valid language tag. This is similar to 1607 | Language.get(tag).is_valid(), but can return False in the case where 1608 | the tag doesn't parse. 1609 | 1610 | >>> tag_is_valid('ja') 1611 | True 1612 | >>> tag_is_valid('jp') 1613 | False 1614 | >>> tag_is_valid('spa-Latn-MX') 1615 | True 1616 | >>> tag_is_valid('spa-MX-Latn') 1617 | False 1618 | >>> tag_is_valid('') 1619 | False 1620 | >>> tag_is_valid('C.UTF-8') 1621 | False 1622 | """ 1623 | try: 1624 | langdata = Language.get(tag) 1625 | return langdata.is_valid() 1626 | except LanguageTagError: 1627 | return False 1628 | 1629 | 1630 | def tag_match_score( 1631 | desired: Union[str, Language], supported: Union[str, Language] 1632 | ) -> int: 1633 | """ 1634 | DEPRECATED: use .distance() instead, which uses newer data and is _lower_ 1635 | for better matching languages. 1636 | 1637 | Return a number from 0 to 100 indicating the strength of match between the 1638 | language the user desires, D, and a supported language, S. Higher numbers 1639 | are better. A reasonable cutoff for not messing with your users is to 1640 | only accept scores of 75 or more. 1641 | 1642 | A score of 100 means the languages are the same, possibly after normalizing 1643 | and filling in likely values. 1644 | """ 1645 | warnings.warn( 1646 | "tag_match_score is deprecated because it's based on deprecated CLDR info. " 1647 | "Use tag_distance instead, which is _lower_ for better matching languages. ", 1648 | DeprecationWarning, 1649 | ) 1650 | desired_ld = Language.get(desired) 1651 | supported_ld = Language.get(supported) 1652 | return desired_ld.match_score(supported_ld) 1653 | 1654 | 1655 | def tag_distance(desired: Union[str, Language], supported: Union[str, Language], ignore_script: bool = False) -> int: 1656 | """ 1657 | Tags that expand to the same thing when likely values are filled in get a 1658 | distance of 0. 
1659 | 1660 | >>> tag_distance('en', 'en') 1661 | 0 1662 | >>> tag_distance('en', 'en-US') 1663 | 0 1664 | >>> tag_distance('zh-Hant', 'zh-TW') 1665 | 0 1666 | >>> tag_distance('ru-Cyrl', 'ru') 1667 | 0 1668 | 1669 | As a specific example, Serbo-Croatian is a politically contentious idea, 1670 | but in CLDR, it's considered equivalent to Serbian in Latin characters. 1671 | 1672 | >>> tag_distance('sh', 'sr-Latn') 1673 | 0 1674 | 1675 | ... which is very similar to Croatian but sociopolitically not the same. 1676 | 1677 | >>> tag_distance('sh', 'hr') 1678 | 9 1679 | 1680 | Unicode reorganized its distinction between 'no' (Norwegian) and 'nb' 1681 | (Norwegian Bokmål) in 2021. 'no' is preferred in most contexts, and the more 1682 | specific 'nb' is a distance of 1 from it: 1683 | 1684 | >>> tag_distance('nb', 'no') 1685 | 1 1686 | 1687 | These distances can be asymmetrical: this data includes the fact that speakers 1688 | of Swiss German (gsw) know High German (de), but not at all the other way around. 1689 | 1690 | The difference seems a little bit extreme, but the asymmetry is certainly 1691 | there. And if your text is tagged as 'gsw', it must be that way for a 1692 | reason. 1693 | 1694 | >>> tag_distance('gsw', 'de') 1695 | 8 1696 | >>> tag_distance('de', 'gsw') 1697 | 84 1698 | 1699 | Unconnected languages get a distance of 80 to 134. 1700 | 1701 | >>> tag_distance('en', 'zh') 1702 | 134 1703 | >>> tag_distance('es', 'fr') 1704 | 84 1705 | >>> tag_distance('fr-CH', 'de-CH') 1706 | 80 1707 | 1708 | Different local variants of the same language get a distance from 3 to 5. 1709 | >>> tag_distance('zh-HK', 'zh-MO') # Chinese is similar in Hong Kong and Macao 1710 | 4 1711 | >>> tag_distance('en-AU', 'en-GB') # Australian English is similar to British English 1712 | 3 1713 | >>> tag_distance('en-IN', 'en-GB') # Indian English is also similar to British English 1714 | 3 1715 | >>> tag_distance('es-PE', 'es-419') # Peruvian Spanish is Latin American Spanish 1716 | 1 1717 | >>> tag_distance('es-419', 'es-PE') # but Latin American Spanish is not necessarily Peruvian 1718 | 4 1719 | >>> tag_distance('es-ES', 'es-419') # Spanish in Spain is further from Latin American Spanish 1720 | 5 1721 | >>> tag_distance('en-US', 'en-GB') # American and British English are somewhat different 1722 | 5 1723 | >>> tag_distance('es-MX', 'es-ES') # Mexican Spanish is different from Spanish Spanish 1724 | 5 1725 | >>> # European Portuguese is different from the most common form (Brazilian Portuguese) 1726 | >>> tag_distance('pt', 'pt-PT') 1727 | 5 1728 | 1729 | >>> # Serbian has two scripts, and people might prefer one but understand both 1730 | >>> tag_distance('sr-Latn', 'sr-Cyrl') 1731 | 5 1732 | 1733 | A distance of 10 is used for matching a specific language to its 1734 | more-commonly-used macrolanguage tag. 1735 | 1736 | >>> tag_distance('arz', 'ar') # Egyptian Arabic to Modern Standard Arabic 1737 | 10 1738 | >>> tag_distance('wuu', 'zh') # Wu Chinese to (Mandarin) Chinese 1739 | 10 1740 | 1741 | Higher distances can arrive due to particularly contentious differences in 1742 | the script for writing the language, where people who understand one script 1743 | can learn the other but may not be happy with it. This specifically applies 1744 | to Chinese. 
1745 | 1746 | >>> tag_distance('zh-TW', 'zh-CN') 1747 | 54 1748 | >>> tag_distance('zh-Hans', 'zh-Hant') 1749 | 54 1750 | >>> tag_distance('zh-CN', 'zh-HK') 1751 | 54 1752 | >>> tag_distance('zh-CN', 'zh-TW') 1753 | 54 1754 | >>> tag_distance('zh-Hant', 'zh-Hans') 1755 | 54 1756 | 1757 | This distance range also applies to the differences between Norwegian 1758 | Bokmål, Nynorsk, and Danish. 1759 | 1760 | >>> tag_distance('no', 'da') 1761 | 12 1762 | >>> tag_distance('no', 'nn') 1763 | 20 1764 | 1765 | Differences of 20 to 50 can represent substantially different languages, 1766 | in cases where speakers of the first may understand the second for demographic 1767 | reasons. 1768 | 1769 | >>> tag_distance('eu', 'es') # Basque to Spanish 1770 | 20 1771 | >>> tag_distance('af', 'nl') # Afrikaans to Dutch 1772 | 24 1773 | >>> tag_distance('mr', 'hi') # Marathi to Hindi 1774 | 30 1775 | >>> tag_distance('ms', 'id') # Malay to Indonesian 1776 | 34 1777 | >>> tag_distance('mg', 'fr') # Malagasy to French 1778 | 34 1779 | >>> tag_distance('ta', 'en') # Tamil to English 1780 | 44 1781 | 1782 | A complex example is the tag 'yue' for Cantonese. Written Chinese is usually 1783 | presumed to be Mandarin Chinese, but colloquial Cantonese can be written as 1784 | well. (Some things could not be written any other way, such as Cantonese 1785 | song lyrics.) 1786 | 1787 | The difference between Cantonese and Mandarin also implies script and 1788 | territory differences by default, adding to the distance. 1789 | 1790 | >>> tag_distance('yue', 'zh') 1791 | 64 1792 | 1793 | When the supported script is a different one than desired, this is usually 1794 | a major difference with score of 50 or more. 1795 | 1796 | >>> tag_distance('ja', 'ja-Latn-US-hepburn') 1797 | 54 1798 | 1799 | If `ignore_script` is used, the script difference is ignored and a smaller 1800 | difference with lower score will be found. 1801 | 1802 | >>> tag_distance('ja', 'ja-Latn-hepburn', ignore_script=True) 1803 | 0 1804 | 1805 | >>> # You can read the Shavian script, right? 1806 | >>> tag_distance('en', 'en-Shaw') 1807 | 54 1808 | """ 1809 | desired_obj = Language.get(desired) 1810 | supported_obj = Language.get(supported) 1811 | return desired_obj.distance(supported_obj, ignore_script) 1812 | 1813 | 1814 | def best_match( 1815 | desired_language: Union[str, Language], 1816 | supported_languages: Sequence[str], 1817 | min_score: int = 75, 1818 | ) -> Tuple[str, int]: 1819 | """ 1820 | DEPRECATED: use .closest_match() instead. This function emulates the old 1821 | matching behavior by subtracting the language distance from 100. 1822 | 1823 | You have software that supports any of the `supported_languages`. You want 1824 | to use `desired_language`. This function lets you choose the right language, 1825 | even if there isn't an exact match. 1826 | 1827 | Returns: 1828 | 1829 | - The best-matching language code, which will be one of the 1830 | `supported_languages` or 'und' 1831 | - The score of the match, from 0 to 100; higher is better. 1832 | 1833 | `min_score` sets the minimum match score. If all languages match with a lower 1834 | score than that, the result will be 'und' with a score of 0. 
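
        As an informal sketch (the distance comes from the `closest_match`
        doctests below), the score is simply 100 minus the distance:

            best_match('af', ['en', 'nl', 'zu'])   # -> ('nl', 76), i.e. distance 24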
1835 | """ 1836 | max_distance = 100 - min_score 1837 | supported, distance = closest_match( 1838 | desired_language, supported_languages, max_distance 1839 | ) 1840 | score = max(0, 100 - distance) 1841 | return supported, score 1842 | 1843 | 1844 | def closest_match( 1845 | desired_language: Union[str, Language], 1846 | supported_languages: Sequence[str], 1847 | max_distance: int = 25, 1848 | ignore_script: bool = False, 1849 | ) -> Tuple[str, int]: 1850 | """ 1851 | You have software that supports any of the `supported_languages`. You want 1852 | to use `desired_language`. This function lets you choose the right language, 1853 | even if there isn't an exact match. 1854 | 1855 | Returns: 1856 | 1857 | - The best-matching language code, which will be one of the 1858 | `supported_languages` or 'und' for no match 1859 | - The distance of the match, which is 0 for a perfect match and increases 1860 | from there (see `tag_distance`) 1861 | 1862 | `max_distance` sets the maximum match distance. If all matches are farther 1863 | than that, the result will be 'und' with a distance of 1000. The default 1864 | value is 25, and raising it can cause data to be processed in significantly 1865 | the wrong language. The documentation for `tag_distance` describes the 1866 | distance values in more detail. 1867 | 1868 | `ignore_script` makes the matching ignore scripts, allowing matches to be 1869 | found when they wouldn't otherwise be due to different scripts. 1870 | 1871 | When there is a tie for the best matching language, the first one in the 1872 | tie will be used. 1873 | 1874 | >>> closest_match('fr', ['de', 'en', 'fr']) 1875 | ('fr', 0) 1876 | 1877 | >>> closest_match('pt', ['pt-BR', 'pt-PT']) 1878 | ('pt-BR', 0) 1879 | 1880 | >>> closest_match('en-AU', ['en-GB', 'en-US']) 1881 | ('en-GB', 3) 1882 | 1883 | >>> closest_match('af', ['en', 'nl', 'zu']) 1884 | ('nl', 24) 1885 | 1886 | >>> closest_match('ja', ['ja-Latn-hepburn', 'en']) 1887 | ('und', 1000) 1888 | 1889 | >>> closest_match('ja', ['ja-Latn-hepburn', 'en'], ignore_script=True) 1890 | ('ja-Latn-hepburn', 0) 1891 | """ 1892 | desired_language = str(desired_language) 1893 | 1894 | # Quickly return if the desired language is directly supported 1895 | if desired_language in supported_languages: 1896 | return desired_language, 0 1897 | 1898 | # Reduce the desired language to a standard form that could also match 1899 | desired_language = standardize_tag(desired_language) 1900 | if desired_language in supported_languages: 1901 | return desired_language, 0 1902 | 1903 | match_distances = [ 1904 | (supported, tag_distance(desired_language, supported, ignore_script)) 1905 | for supported in supported_languages 1906 | ] 1907 | match_distances = [ 1908 | (supported, distance) 1909 | for (supported, distance) in match_distances 1910 | if distance <= max_distance 1911 | ] + [('und', 1000)] 1912 | 1913 | match_distances.sort(key=itemgetter(1)) 1914 | return match_distances[0] 1915 | 1916 | 1917 | def closest_supported_match( 1918 | desired_language: Union[str, Language], 1919 | supported_languages: Sequence[str], 1920 | max_distance: int = 25, 1921 | ) -> Optional[str]: 1922 | """ 1923 | Wraps `closest_match` with a simpler return type. Returns the language 1924 | tag of the closest match if there is one, or None if there is not. 
1925 | 1926 | >>> closest_supported_match('fr', ['de', 'en', 'fr']) 1927 | 'fr' 1928 | 1929 | >>> closest_supported_match('pt', ['pt-BR', 'pt-PT']) 1930 | 'pt-BR' 1931 | 1932 | >>> closest_supported_match('en-AU', ['en-GB', 'en-US']) 1933 | 'en-GB' 1934 | 1935 | >>> closest_supported_match('und', ['en', 'und']) 1936 | 'und' 1937 | 1938 | >>> closest_supported_match('af', ['en', 'nl', 'zu']) 1939 | 'nl' 1940 | 1941 | >>> print(closest_supported_match('af', ['en', 'nl', 'zu'], max_distance=10)) 1942 | None 1943 | """ 1944 | code, distance = closest_match(desired_language, supported_languages, max_distance) 1945 | if distance == 1000: 1946 | return None 1947 | else: 1948 | return code 1949 | -------------------------------------------------------------------------------- /langcodes/build_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import xml.etree.ElementTree as ET 3 | from langcodes.util import data_filename 4 | from langcodes.registry_parser import parse_registry 5 | 6 | 7 | def read_cldr_supplemental(dataname): 8 | cldr_supp_path = data_filename('cldr-json/cldr-json/cldr-core/supplemental') 9 | filename = data_filename(f'{cldr_supp_path}/{dataname}.json') 10 | fulldata = json.load(open(filename, encoding='utf-8')) 11 | if dataname == 'aliases': 12 | data = fulldata['supplemental']['metadata']['alias'] 13 | else: 14 | data = fulldata['supplemental'][dataname] 15 | return data 16 | 17 | 18 | def read_iana_registry_suppress_scripts(): 19 | scripts = {} 20 | for entry in parse_registry(): 21 | if entry['Type'] == 'language' and 'Suppress-Script' in entry: 22 | scripts[entry['Subtag']] = entry['Suppress-Script'] 23 | return scripts 24 | 25 | 26 | def read_iana_registry_scripts(): 27 | scripts = set() 28 | for entry in parse_registry(): 29 | if entry['Type'] == 'script': 30 | scripts.add(entry['Subtag']) 31 | return scripts 32 | 33 | 34 | def read_iana_registry_macrolanguages(): 35 | macros = {} 36 | for entry in parse_registry(): 37 | if entry['Type'] == 'language' and 'Macrolanguage' in entry: 38 | macros[entry['Subtag']] = entry['Macrolanguage'] 39 | return macros 40 | 41 | 42 | def read_iana_registry_replacements(): 43 | replacements = {} 44 | for entry in parse_registry(): 45 | if entry['Type'] == 'language' and 'Preferred-Value' in entry: 46 | # Replacements for language codes 47 | replacements[entry['Subtag']] = entry['Preferred-Value'] 48 | elif 'Tag' in entry and 'Preferred-Value' in entry: 49 | # Replacements for entire tags 50 | replacements[entry['Tag'].lower()] = entry['Preferred-Value'] 51 | return replacements 52 | 53 | 54 | def write_python_dict(outfile, name, d): 55 | print(f"{name} = {{", file=outfile) 56 | for key in sorted(d): 57 | value = d[key] 58 | print(f" {key!r}: {value!r},", file=outfile) 59 | print("}", file=outfile) 60 | 61 | 62 | def write_python_set(outfile, name, s): 63 | print(f"{name} = {{", file=outfile) 64 | for key in sorted(set(s)): 65 | print(f" {key!r},", file=outfile) 66 | print("}", file=outfile) 67 | 68 | 69 | GENERATED_HEADER = "# This file is generated by build_data.py." 
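

# An informal illustration of what the two writer helpers above emit. For
# example, write_python_dict(outfile, 'DEFAULT_SCRIPTS', {'en': 'Latn'}) would
# produce:
#
#     DEFAULT_SCRIPTS = {
#         'en': 'Latn',
#     }
#
# and write_python_set(outfile, 'ALL_SCRIPTS', {'Latn'}) would produce:
#
#     ALL_SCRIPTS = {
#         'Latn',
#     }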
70 | 71 | 72 | def read_validity_regex(): 73 | validity_options = [] 74 | for codetype in ('language', 'region', 'script', 'variant'): 75 | validity_path = data_filename(f'cldr/common/validity/{codetype}.xml') 76 | root = ET.fromstring(open(validity_path).read()) 77 | matches = root.findall('./idValidity/id') 78 | for match in matches: 79 | for item in match.text.strip().split(): 80 | if '~' in item: 81 | assert item[-2] == '~' 82 | prefix = item[:-3] 83 | range_start = item[-3] 84 | range_end = item[-1] 85 | option = f"{prefix}[{range_start}-{range_end}]" 86 | validity_options.append(option) 87 | else: 88 | validity_options.append(item) 89 | options = '|'.join(validity_options) 90 | return f'^({options})$' 91 | 92 | 93 | def read_language_distances(): 94 | language_info_path = data_filename('cldr/common/supplemental/languageInfo.xml') 95 | root = ET.fromstring(open(language_info_path).read()) 96 | matches = root.findall( 97 | './languageMatching/languageMatches[@type="written_new"]/languageMatch' 98 | ) 99 | tag_distances = {} 100 | for match in matches: 101 | attribs = match.attrib 102 | n_parts = attribs['desired'].count('_') + 1 103 | if n_parts < 3: 104 | if attribs.get('oneway') == 'true': 105 | pairs = [(attribs['desired'], attribs['supported'])] 106 | else: 107 | pairs = [ 108 | (attribs['desired'], attribs['supported']), 109 | (attribs['supported'], attribs['desired']), 110 | ] 111 | for (desired, supported) in pairs: 112 | desired_distance = tag_distances.setdefault(desired, {}) 113 | desired_distance[supported] = int(attribs['distance']) 114 | 115 | # The 'languageInfo' data file contains distances for the unnormalized 116 | # tag 'sh', but we work mostly with normalized tags, and they don't 117 | # describe at all how to cope with this. 118 | # 119 | # 'sh' normalizes to 'sr-Latn', and when we're matching languages we 120 | # aren't matching scripts yet, so when 'sh' appears we'll add a 121 | # corresponding match for 'sr'. 122 | # 123 | # Then because we're kind of making this plan up, add 1 to the distance 124 | # so it's a worse match than ones that are actually clearly defined 125 | # in languageInfo. 126 | if desired == 'sh' or supported == 'sh': 127 | if desired == 'sh': 128 | desired = 'sr' 129 | if supported == 'sh': 130 | supported = 'sr' 131 | if desired != supported: 132 | # don't try to define a non-zero distance for sr <=> sr 133 | desired_distance = tag_distances.setdefault(desired, {}) 134 | desired_distance[supported] = int(attribs['distance']) + 1 135 | 136 | return tag_distances 137 | 138 | 139 | def build_data(): 140 | lang_scripts = read_iana_registry_suppress_scripts() 141 | all_scripts = read_iana_registry_scripts() 142 | macrolanguages = read_iana_registry_macrolanguages() 143 | iana_replacements = read_iana_registry_replacements() 144 | language_distances = read_language_distances() 145 | 146 | alias_data = read_cldr_supplemental('aliases') 147 | likely_subtags = read_cldr_supplemental('likelySubtags') 148 | replacements = {} 149 | 150 | # Aliased codes can still have alpha3 codes, and there's no unified source 151 | # about what they are. It depends on whether the alias predates or postdates 152 | # ISO 639-2, which nobody should have to care about. So let's set all the 153 | # alpha3 codes for aliased alpha2 codes here. 
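    # (Added note: each key in the dict below is an aliased two-letter code,
    # and each value is the three-letter (alpha3) code of the language that
    # alias refers to.)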
154 | alpha3_mapping = { 155 | 'tl': 'tgl', # even though it normalizes to 'fil' 156 | 'in': 'ind', 157 | 'iw': 'heb', 158 | 'ji': 'yid', 159 | 'jw': 'jav', 160 | 'sh': 'hbs', 161 | } 162 | alpha3_biblio = {} 163 | norm_macrolanguages = {} 164 | for alias_type in ['languageAlias', 'scriptAlias', 'territoryAlias']: 165 | aliases = alias_data[alias_type] 166 | # Initially populate 'languageAlias' with the aliases from the IANA file 167 | if alias_type == 'languageAlias': 168 | replacements[alias_type] = iana_replacements 169 | replacements[alias_type]['root'] = 'und' 170 | else: 171 | replacements[alias_type] = {} 172 | for code, value in aliases.items(): 173 | # Make all keys lowercase so they can be looked up 174 | # case-insensitively 175 | code = code.lower() 176 | 177 | # If there are multiple replacements, take the first one. For example, 178 | # we just replace the Soviet Union (SU) with Russia (RU), instead of 179 | # trying to do something context-sensitive and poorly standardized 180 | # that selects one of the successor countries to the Soviet Union. 181 | replacement = value['_replacement'].split()[0] 182 | if value['_reason'] == 'macrolanguage': 183 | norm_macrolanguages[code] = replacement 184 | else: 185 | # CLDR tries to oversimplify some codes as it assigns aliases. 186 | # For example, 'nor' is the ISO alpha3 code for 'no', but CLDR 187 | # would prefer you use 'nb' over 'no', so it makes 'nor' an 188 | # alias of 'nb'. But 'nb' already has an alpha3 code, 'nob'. 189 | # 190 | # We undo this oversimplification so that we can get a 191 | # canonical mapping between alpha2 and alpha3 codes. 192 | if code == 'nor': 193 | replacement = 'no' 194 | elif code == 'mol': 195 | replacement = 'mo' 196 | elif code == 'twi': 197 | replacement = 'tw' 198 | elif code == 'bih': 199 | replacement = 'bh' 200 | 201 | replacements[alias_type][code] = replacement 202 | if alias_type == 'languageAlias': 203 | if value['_reason'] == 'overlong': 204 | if replacement in alpha3_mapping: 205 | raise ValueError( 206 | "{code!r} is an alpha3 for {replacement!r}, which" 207 | " already has an alpha3: {orig!r}".format( 208 | code=code, 209 | replacement=replacement, 210 | orig=alpha3_mapping[replacement], 211 | ) 212 | ) 213 | alpha3_mapping[replacement] = code 214 | elif value['_reason'] == 'bibliographic': 215 | alpha3_biblio[replacement] = code 216 | 217 | validity_regex = read_validity_regex() 218 | 219 | # Write the contents of data_dicts.py. 
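    # (Added note: the output path below is relative, so this script is
    # presumably run from inside the langcodes/ package directory, where the
    # generated data_dicts.py lives next to build_data.py.)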
220 | with open('data_dicts.py', 'w', encoding='utf-8') as outfile: 221 | print(GENERATED_HEADER, file=outfile) 222 | print("import re\n", file=outfile) 223 | write_python_dict(outfile, 'DEFAULT_SCRIPTS', lang_scripts) 224 | write_python_dict( 225 | outfile, 'LANGUAGE_REPLACEMENTS', replacements['languageAlias'] 226 | ) 227 | write_python_dict(outfile, 'LANGUAGE_ALPHA3', alpha3_mapping) 228 | write_python_dict(outfile, 'LANGUAGE_ALPHA3_BIBLIOGRAPHIC', alpha3_biblio) 229 | write_python_dict(outfile, 'SCRIPT_REPLACEMENTS', replacements['scriptAlias']) 230 | write_python_set(outfile, 'ALL_SCRIPTS', all_scripts) 231 | write_python_dict( 232 | outfile, 'TERRITORY_REPLACEMENTS', replacements['territoryAlias'] 233 | ) 234 | write_python_dict(outfile, 'MACROLANGUAGES', macrolanguages) 235 | write_python_dict(outfile, 'NORMALIZED_MACROLANGUAGES', norm_macrolanguages) 236 | write_python_dict(outfile, 'LIKELY_SUBTAGS', likely_subtags) 237 | write_python_dict(outfile, 'LANGUAGE_DISTANCES', language_distances) 238 | print(f"VALIDITY = re.compile({validity_regex!r})", file=outfile) 239 | 240 | 241 | if __name__ == '__main__': 242 | build_data() 243 | -------------------------------------------------------------------------------- /langcodes/language_distance.py: -------------------------------------------------------------------------------- 1 | from .data_dicts import LANGUAGE_DISTANCES 2 | from typing import Dict, Tuple 3 | 4 | 5 | TagTriple = Tuple[str, str, str] 6 | _DISTANCE_CACHE: Dict[Tuple[TagTriple, TagTriple], int] = {} 7 | DEFAULT_LANGUAGE_DISTANCE = LANGUAGE_DISTANCES["*"]["*"] 8 | DEFAULT_SCRIPT_DISTANCE = LANGUAGE_DISTANCES["*_*"]["*_*"] 9 | DEFAULT_TERRITORY_DISTANCE = 4 10 | 11 | 12 | # Territory clusters used in territory matching: 13 | # Maghreb (the western Arab world) 14 | MAGHREB = {"MA", "DZ", "TN", "LY", "MR", "EH"} 15 | 16 | # United States and its territories 17 | US = {"AS", "GU", "MH", "MP", "PR", "UM", "US", "VI"} 18 | 19 | # Special Autonomous Regions of China 20 | CNSAR = {"HK", "MO"} 21 | 22 | LATIN_AMERICA = { 23 | "419", 24 | # Central America 25 | "013", 26 | "BZ", 27 | "CR", 28 | "SV", 29 | "GT", 30 | "HN", 31 | "MX", 32 | "NI", 33 | "PA", 34 | # South America 35 | "005", 36 | "AR", 37 | "BO", 38 | "BR", 39 | "CL", 40 | "CO", 41 | "EC", 42 | "FK", 43 | "GF", 44 | "GY", 45 | "PY", 46 | "PE", 47 | "SR", 48 | "UY", 49 | "VE", 50 | } 51 | 52 | # North and South America 53 | AMERICAS = { 54 | "019", 55 | # Caribbean 56 | "029", 57 | "AI", 58 | "AG", 59 | "AW", 60 | "BS", 61 | "BB", 62 | "VG", 63 | "BQ", 64 | "KY", 65 | "CU", 66 | "CW", 67 | "DM", 68 | "DO", 69 | "GD", 70 | "GP", 71 | "HT", 72 | "JM", 73 | "MQ", 74 | "MS", 75 | "PR", 76 | "SX", 77 | "BL", 78 | "KN", 79 | "LC", 80 | "MF", 81 | "VC", 82 | "TT", 83 | "TC", 84 | "VI", 85 | # Northern America 86 | "021", 87 | "BM", 88 | "CA", 89 | "GL", 90 | "PM", 91 | "US", 92 | # North America as a whole 93 | "003", 94 | } | LATIN_AMERICA 95 | 96 | 97 | def tuple_distance_cached(desired: TagTriple, supported: TagTriple) -> int: 98 | """ 99 | Takes in triples of (language, script, territory), which can be derived by 100 | 'maximizing' a language tag. Returns a number from 0 to 135 indicating the 101 | 'distance' between these for the purposes of language matching. 102 | """ 103 | # First of all, if these are identical, return quickly: 104 | if supported == desired: 105 | return 0 106 | 107 | # If we've already figured it out, return the cached distance. 
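    # (Added note: the cache checked below is a plain module-level dict keyed
    # by the (desired, supported) pair of triples; it is never evicted, which
    # is fine in practice because applications typically compare a small,
    # fixed set of tags.)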
108 | if (desired, supported) in _DISTANCE_CACHE: 109 | return _DISTANCE_CACHE[desired, supported] 110 | else: 111 | result = _tuple_distance(desired, supported) 112 | _DISTANCE_CACHE[desired, supported] = result 113 | return result 114 | 115 | 116 | def _get2(dictionary: dict, key1: str, key2: str, default): 117 | return dictionary.get(key1, {}).get(key2, default) 118 | 119 | 120 | def _tuple_distance(desired: TagTriple, supported: TagTriple) -> int: 121 | desired_language, desired_script, desired_territory = desired 122 | supported_language, supported_script, supported_territory = supported 123 | distance = 0 124 | 125 | if desired_language != supported_language: 126 | distance += _get2( 127 | LANGUAGE_DISTANCES, 128 | desired_language, 129 | supported_language, 130 | DEFAULT_LANGUAGE_DISTANCE, 131 | ) 132 | 133 | desired_script_pair = f"{desired_language}_{desired_script}" 134 | supported_script_pair = f"{supported_language}_{supported_script}" 135 | 136 | if desired_script != supported_script: 137 | # Scripts can match other scripts, but only when paired with a 138 | # language. For example, there is no reason to assume someone who can 139 | # read 'Latn' can read 'Cyrl', but there is plenty of reason to believe 140 | # someone who can read 'sr-Latn' can read 'sr-Cyrl' because Serbian is 141 | # a language written in two scripts. 142 | distance += _get2( 143 | LANGUAGE_DISTANCES, 144 | desired_script_pair, 145 | supported_script_pair, 146 | DEFAULT_SCRIPT_DISTANCE, 147 | ) 148 | 149 | if desired_territory != supported_territory: 150 | # The rules for matching territories are too weird to implement the 151 | # general case efficiently. Instead of implementing all the possible 152 | # match rules the XML could define, instead we just reimplement the 153 | # rules of CLDR 36.1 here in code. 154 | 155 | tdist = DEFAULT_TERRITORY_DISTANCE 156 | if desired_script_pair == supported_script_pair: 157 | if desired_language == "ar": 158 | if (desired_territory in MAGHREB) != (supported_territory in MAGHREB): 159 | tdist = 5 160 | elif desired_language == "en": 161 | if (desired_territory == "GB") and (supported_territory not in US): 162 | tdist = 3 163 | elif (desired_territory not in US) and (supported_territory == "GB"): 164 | tdist = 3 165 | elif (desired_territory in US) != (supported_territory in US): 166 | tdist = 5 167 | # This is not a rule that's spelled out in CLDR, but is implied by things 168 | # about territory containment mentioned in other standards. Numeric values 169 | # for territories, like '003', represent broad regions that contain more 170 | # specific territories. 171 | # 172 | # 419 is the numeric value most often seen in language codes, particularly 173 | # 'es-419' for Latin American Spanish. If you have a language code that 174 | # differs only in that its territory is more specific, like 'es-PY', it should 175 | # be closer to a supported 'es-419' than anything with a territory difference. 176 | # 177 | # We can implement this for 419 without becoming responsible for keeping up 178 | # with which countries/territories/regions contain others in the general case. 
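            # Worked example (hypothetical input, not a CLDR rule quote): a
            # desired ('es', 'Latn', 'PY') against a supported
            # ('es', 'Latn', '419') reaches the branch below and gets a
            # territory distance of 1 instead of the default 4.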
179 | elif desired_territory in LATIN_AMERICA and supported_territory == "419": 180 | tdist = 1 181 | elif desired_language == "es" or desired_language == "pt": 182 | if (desired_territory in AMERICAS) != (supported_territory in AMERICAS): 183 | tdist = 5 184 | elif desired_script_pair == "zh_Hant": 185 | if (desired_territory in CNSAR) != (supported_territory in CNSAR): 186 | tdist = 5 187 | distance += tdist 188 | return distance 189 | -------------------------------------------------------------------------------- /langcodes/language_lists.py: -------------------------------------------------------------------------------- 1 | # This is the list of language codes with the 'modern' level of support in CLDR 2 | # (compared to 'full', which contains many more languages). We use this as the 3 | # list of languages that we store specific name-to-code mappings for. 4 | 5 | CLDR_LANGUAGES = { 6 | 'af', 7 | 'am', 8 | 'ar', 9 | 'az', 10 | 'be', 11 | 'bg', 12 | 'bn', 13 | 'bs', 14 | 'ca', 15 | 'cs', 16 | 'cy', 17 | 'da', 18 | 'de', 19 | 'el', 20 | 'en', 21 | 'es', 22 | 'et', 23 | 'eu', 24 | 'fa', 25 | 'fi', 26 | 'fil', 27 | 'fo', 28 | 'fr', 29 | 'ga', 30 | 'gl', 31 | 'gu', 32 | 'he', 33 | 'hi', 34 | 'hr', 35 | 'hu', 36 | 'hy', 37 | 'id', 38 | 'is', 39 | 'it', 40 | 'ja', 41 | 'ka', 42 | 'kk', 43 | 'km', 44 | 'kn', 45 | 'ko', 46 | 'ky', 47 | 'lo', 48 | 'lt', 49 | 'lv', 50 | 'mk', 51 | 'ml', 52 | 'mn', 53 | 'mr', 54 | 'ms', 55 | 'my', 56 | 'nb', 57 | 'ne', 58 | 'nl', 59 | 'pa', 60 | 'pl', 61 | 'pt', 62 | 'ro', 63 | 'ru', 64 | 'si', 65 | 'sk', 66 | 'sl', 67 | 'sq', 68 | 'sr', 69 | 'sv', 70 | 'sw', 71 | 'ta', 72 | 'te', 73 | 'th', 74 | 'ti', 75 | 'to', 76 | 'tr', 77 | 'uk', 78 | 'und', 79 | 'ur', 80 | 'uz', 81 | 'vi', 82 | 'yue', 83 | 'zh', 84 | 'zu', 85 | } 86 | 87 | 88 | # These are the names languages that have the most entries on the English and 89 | # German Wiktionaries. Wiktionary only consistently identifies languages by their 90 | # name, making it important to be able to recognize the names. 91 | # 92 | # These lists of names are used in `tests/test_wikt_languages.py`. 
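#
# As a quick, hedged sketch of the lookup these names are meant to support
# (using the package's public name search; 'French' is just an arbitrary entry
# from the English list):
#
#     >>> import langcodes
#     >>> str(langcodes.find('French'))
#     'fr'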
93 | WIKT_LANGUAGE_NAMES = {} 94 | 95 | WIKT_LANGUAGE_NAMES['en'] = [ 96 | "Spanish", 97 | "French", 98 | "Latvian", 99 | "Latin", 100 | "English", 101 | "Mandarin", 102 | "Italian", 103 | "Portuguese", 104 | "Cantonese", 105 | "Japanese", 106 | "German", 107 | "Swedish", 108 | "Korean", 109 | "Serbo-Croatian", 110 | "Serbian", 111 | "Croatian", 112 | "Bosnian", 113 | "Finnish", 114 | "Vietnamese", 115 | "Dutch", 116 | "Galician", 117 | "Catalan", 118 | "Polish", 119 | "Danish", 120 | "Norwegian Nynorsk", 121 | "Turkish", 122 | "Romanian", 123 | "Lithuanian", 124 | "Ido", 125 | "Old French", 126 | "Czech", 127 | "Norwegian", 128 | # Jèrriais -- same as Norman 129 | "Esperanto", 130 | "Icelandic", 131 | # Old Armenian 132 | "Norwegian Bokmål", 133 | "Asturian", 134 | "Hungarian", 135 | "Proto-Germanic", 136 | "Russian", 137 | "Slovene", 138 | "Min Nan", 139 | "Scottish Gaelic", 140 | "Greek", 141 | "Irish", 142 | "Lojban", 143 | "Middle French", 144 | "Malay", 145 | "Luxembourgish", 146 | "Slovak", 147 | "Estonian", 148 | "Persian", 149 | "Venetian", 150 | "Old English", 151 | "Volapük", 152 | "Ladin", 153 | "Faroese", 154 | "Scots", 155 | "Interlingua", 156 | "Romansch", 157 | "Urdu", 158 | # Middle Chinese 159 | "Indonesian", 160 | "Swahili", 161 | "Middle English", 162 | "Occitan", 163 | "Welsh", 164 | "Old Norse", 165 | "Albanian", 166 | "Old Irish", 167 | "Old Saxon", 168 | "Lower Sorbian", 169 | "Afrikaans", 170 | "Ukrainian", 171 | "Proto-Slavic", 172 | "Ancient Greek", 173 | "Gothic", 174 | "Hawaiian", 175 | "Kurdish", 176 | "Tagalog", 177 | "Old High German", 178 | "Crimean Tatar", 179 | "Manx", 180 | "Sanskrit", 181 | "Hiligaynon", 182 | "West Frisian", 183 | "Hebrew", 184 | "Tok Pisin", 185 | "Proto-Indo-European", 186 | "Macedonian", 187 | "Novial", 188 | "Armenian", 189 | "Arabic", 190 | "Maltese", 191 | "Hakka", 192 | "Sicilian", 193 | # "Ladino", -- same as Ladin 194 | "Basque", 195 | "Breton", 196 | # Guernésiais -- same as Norman 197 | "Vai", 198 | "Navajo", 199 | "Azeri", 200 | "Vilamovian", 201 | # Tarantino 202 | "Maori", 203 | "Friulian", 204 | "Hausa", 205 | "Haitian Creole", 206 | "Yiddish", 207 | "Tatar", 208 | "Proto-Malayo-Polynesian", 209 | "Aromanian", 210 | "Ottoman Turkish", 211 | "Old Provençal", 212 | "Northern Sami", 213 | "Dalmatian", 214 | "Bulgarian", 215 | "Neapolitan", 216 | "Cornish", 217 | "Middle Dutch", 218 | "Rapa Nui", 219 | # Old Portuguese 220 | "Egyptian Arabic", 221 | "Romani", 222 | "Tahitian", 223 | "Thai", 224 | "Limburgish", 225 | "Karelian", 226 | "Tajik", 227 | "Turkmen", 228 | "Kabardian", 229 | "Uzbek", 230 | "Samoan", 231 | "Mongolian", 232 | "Zulu", 233 | "Upper Sorbian", 234 | "Walloon", 235 | # Proto-Finnic 236 | "Frankish", 237 | "Mapudungun", 238 | "Pashto", 239 | "Low German", 240 | "Bashkir", 241 | "Kashubian", 242 | "Sranan Tongo", 243 | "Proto-Sino-Tibetan", 244 | "Norman", 245 | "Proto-Austronesian", 246 | "Marathi", 247 | "Rohingya", 248 | "Classical Nahuatl", 249 | # Proto-Malayic 250 | # German Low German 251 | "Fijian", 252 | "Zazaki", 253 | "Proto-Italic", 254 | "Old Dutch", 255 | "Egyptian", 256 | "Old Frisian", 257 | "Greenlandic", 258 | "Burmese", 259 | "Votic", 260 | "Ewe", 261 | "Cherokee", 262 | "Old Church Slavonic", 263 | "Quechua", 264 | "Mirandese", 265 | "Livonian", 266 | "Bengali", 267 | "Skolt Sami", 268 | # Proto-Balto-Slavic 269 | "Pitjantjatjara", 270 | "Georgian", 271 | "North Frisian", 272 | "Tetum", 273 | "Tongan", 274 | # Mauritian Creole 275 | "Torres Strait Creole", 276 | "Papiamentu", 277 | "Lao", 
278 | "Malagasy", 279 | "Interlingue", 280 | "Aragonese", 281 | "Istriot", 282 | "Sumerian", 283 | "Proto-Celtic", 284 | "Võro", 285 | # Proto-Polynesian 286 | "Nepali", 287 | "Chickasaw", 288 | "Akkadian", 289 | "Middle Armenian", 290 | "Cimbrian", 291 | "Somali", 292 | "Sardinian", 293 | "Tocharian B", 294 | "Telugu", 295 | "Javanese", 296 | "Taos", 297 | "Proto-Semitic", 298 | # Old Prussian 299 | "Kyrgyz", 300 | "Corsican", 301 | "Veps", 302 | "Baluchi", 303 | "Middle Low German", 304 | "Middle High German", 305 | "Uyghur", 306 | # Dutch Low Saxon 307 | "Belarusian", 308 | "Guaraní", 309 | "Undetermined", 310 | "Inuktitut", 311 | "Tocharian A", 312 | "Nigerian Pidgin", 313 | # Gallo 314 | # Saterland Frisian 315 | "Punjabi", 316 | "Proto-Algonquian", 317 | # Istro-Romanian 318 | "Wiradhuri", 319 | "Sichuan Yi", 320 | "Wu", 321 | # White Hmong 322 | "Ugaritic", 323 | "Sundanese", 324 | # Old East Slavic 325 | # Fala 326 | # Elfdalian 327 | "Tamil", 328 | "Pijin", 329 | "Okinawan", 330 | "Kazakh", 331 | "Hindi", 332 | "Tuvan", 333 | "Polabian", 334 | "Aramaic", 335 | "Malayalam", 336 | "Kumyk", 337 | "Inari Sami", 338 | "Ilocano", 339 | "Tswana", 340 | "Libyan Arabic", 341 | "Latgalian", 342 | "Yakut", 343 | "Sindhi", 344 | "Khmer", 345 | "Gamilaraay", 346 | "Ojibwe", 347 | "Choctaw", 348 | "Chinese", 349 | "Chamorro", 350 | "Yucatec Maya", 351 | "Picard", 352 | "Ngarrindjeri", 353 | "Kott", 354 | "Ingrian", 355 | # Crimean Gothic 356 | "Chamicuro", 357 | "Rajasthani", 358 | # Old Tupi 359 | "Old Spanish", 360 | "Gagauz", 361 | "Extremaduran", 362 | "Chinook Jargon", 363 | "Cahuilla", 364 | "Kannada", 365 | "Iban", 366 | "American Sign Language", 367 | "Adyghe", 368 | "Warlpiri", 369 | "Tibetan", 370 | "Ossetian", 371 | "Meriam", 372 | "Marshallese", 373 | "Khakas", 374 | "Balinese", 375 | "Zhuang", 376 | "Tuvaluan", 377 | "Niuean", 378 | "Martuthunira", 379 | "Guugu Yimidhirr", 380 | "Chechen", 381 | "Campidanese Sardinian", 382 | "Tolai", 383 | # Old Javanese 384 | "Nahuatl", 385 | "Lombard", 386 | "West Coast Bajau", 387 | "Romagnol", 388 | "Middle Irish", 389 | "Yoruba", 390 | "Wangaaybuwan-Ngiyambaa", 391 | # Old Swedish 392 | "Lingala", 393 | "Fiji Hindi", 394 | "Shabo", 395 | "Sasak", 396 | "Judeo-Arabic", 397 | "Central Kurdish", 398 | "Bislama", 399 | ] 400 | 401 | WIKT_LANGUAGE_NAMES['de'] = [ 402 | "Deutsch", 403 | "Englisch", 404 | "Polnisch", 405 | "Italienisch", 406 | "Französisch", 407 | "Esperanto", 408 | "Schwedisch", 409 | "Lateinisch", 410 | "Tschechisch", 411 | "Katalanisch", 412 | "Spanisch", 413 | "Okzitanisch", 414 | "Ungarisch", 415 | "Litauisch", 416 | "Finnisch", 417 | "Russisch", 418 | "Altgriechisch", 419 | "Niederländisch", 420 | "Kurdisch", 421 | "Baskisch", 422 | "Armenisch", 423 | "Isländisch", 424 | "Bulgarisch", 425 | "Färöisch", 426 | "Dänisch", 427 | "Portugiesisch", 428 | "Slowakisch", 429 | "Türkisch", 430 | "Maori", 431 | "Albanisch", 432 | "Japanisch", 433 | "Norwegisch", 434 | "Irisch", 435 | "Koreanisch", 436 | "Chinesisch", 437 | "Venezianisch", 438 | "Friaulisch", 439 | "Serbisch", 440 | "Indonesisch", 441 | "Walisisch", 442 | "Arabisch", 443 | "Zentral-Nahuatl", 444 | "Neugriechisch", 445 | "Sumerisch", 446 | "Obersorbisch", 447 | "Sesotho", 448 | "Rumänisch", 449 | "Suaheli", 450 | "Persisch", 451 | "Krimtatarisch", 452 | "Plattdeutsch", 453 | "Prußisch", 454 | "Thai", 455 | "Bosnisch", 456 | "Sardisch", 457 | "Maltesisch", 458 | "Akkadisch", 459 | "Hawaiianisch", 460 | "Hebräisch", 461 | "Gotisch", 462 | "Afrikaans", 463 | "Rätoromanisch", 
464 | "Tamil", 465 | "Bretonisch", 466 | "Ukrainisch", 467 | "Hindi", 468 | "Georgisch", 469 | "Panjabi", 470 | "Papiamentu", 471 | "Slowenisch", 472 | "Nauruisch", 473 | "Schottisch-Gälisch", 474 | "Balinesisch", 475 | "Estnisch", 476 | "Manx", 477 | "Korsisch", 478 | # "Frühneuhochdeutsch", 479 | "Lettisch", 480 | "isiZulu", 481 | "Tagalog", 482 | "Tok Pisin", 483 | # "Südpikenisch", 484 | "Kroatisch", 485 | "Niedersorbisch", 486 | "Kannada", 487 | "Guanche", 488 | "Belarussisch", 489 | "Sanskrit", 490 | "Aserbaidschanisch", 491 | "Mittelhochdeutsch", 492 | "Laotisch", 493 | "Altnordisch", 494 | "Altenglisch", 495 | "Vietnamesisch", 496 | "Tadschikisch", 497 | "Samoanisch", 498 | "Mazedonisch", 499 | "Luxemburgisch", 500 | "Hethitisch", 501 | # "Yukatekisch", 502 | "Kaschubisch", 503 | "Wallonisch", 504 | # "Klassisches Nahuatl", 505 | "Telugu", 506 | "Rapanui", 507 | "Jiddisch", 508 | "Ido", 509 | # "Galicisch", 510 | "Volapük", 511 | "Bengalisch", 512 | "Mapudungun", 513 | "Lojban", 514 | "Tuvaluisch", 515 | "Gujarati", 516 | "Assamesisch", 517 | ] 518 | -------------------------------------------------------------------------------- /langcodes/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rspeer/langcodes/0aebfa862ed86d820d0c96ce311ef661cf0a798a/langcodes/py.typed -------------------------------------------------------------------------------- /langcodes/registry_parser.py: -------------------------------------------------------------------------------- 1 | from langcodes.util import data_filename 2 | 3 | LIST_KEYS = {'Description', 'Prefix'} 4 | 5 | 6 | def parse_file(file): 7 | """ 8 | Take an open file containing the IANA subtag registry, and yield a 9 | dictionary of information for each subtag it describes. 10 | """ 11 | lines = [] 12 | for line in file: 13 | line = line.rstrip('\n') 14 | if line == '%%': 15 | # This is a separator between items. Parse the data we've 16 | # collected and yield the result. 17 | yield from parse_item(lines) 18 | lines.clear() 19 | elif line.startswith(' '): 20 | # This is a continuation line. Concatenate it to the previous 21 | # line, including one of the spaces. 22 | lines[-1] += line[1:] 23 | else: 24 | lines.append(line) 25 | yield from parse_item(lines) 26 | 27 | 28 | def parse_item(lines): 29 | """ 30 | Given the lines that form a subtag entry (after joining wrapped lines 31 | back together), parse the data they contain. 32 | 33 | Returns a generator that yields once if there was any data there 34 | (and an empty generator if this was just the header). 35 | """ 36 | info = {} 37 | for line in lines: 38 | key, value = line.split(': ', 1) 39 | if key in LIST_KEYS: 40 | info.setdefault(key, []).append(value) 41 | else: 42 | assert key not in info 43 | info[key] = value 44 | 45 | if 'Subtag' in info or 'Tag' in info: 46 | yield info 47 | 48 | 49 | def parse_registry(): 50 | """ 51 | Yield a sequence of dictionaries, containing the info in the included 52 | IANA subtag registry file. 53 | """ 54 | with open( 55 | data_filename('language-subtag-registry.txt'), encoding='utf-8' 56 | ) as data_file: 57 | # 'yield from' instead of returning, so that we only close the file 58 | # when finished. 
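        # Each yielded item is a plain dict of the registry's fields. For a
        # language subtag it looks roughly like (abridged):
        #     {'Type': 'language', 'Subtag': 'fr', 'Description': ['French'],
        #      'Suppress-Script': 'Latn', ...}
        # with 'Description' (and 'Prefix') always collected into lists.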
59 | yield from parse_file(data_file) 60 | -------------------------------------------------------------------------------- /langcodes/tag_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements a parser for language tags, according to the RFC 5646 3 | (BCP 47) standard. 4 | 5 | Here, we're only concerned with the syntax of the language tag. Looking up 6 | what they actually mean in a data file is a separate step. 7 | 8 | For a full description of the syntax of a language tag, see page 3 of 9 | http://tools.ietf.org/html/bcp47 10 | 11 | >>> parse_tag('en') 12 | [('language', 'en')] 13 | 14 | >>> parse_tag('en_US') 15 | [('language', 'en'), ('territory', 'US')] 16 | 17 | >>> parse_tag('en-Latn') 18 | [('language', 'en'), ('script', 'Latn')] 19 | 20 | >>> parse_tag('es-419') 21 | [('language', 'es'), ('territory', '419')] 22 | 23 | >>> parse_tag('zh-hant-tw') 24 | [('language', 'zh'), ('script', 'Hant'), ('territory', 'TW')] 25 | 26 | >>> parse_tag('zh-tw-hant') 27 | Traceback (most recent call last): 28 | ... 29 | langcodes.tag_parser.LanguageTagError: This script subtag, 'hant', is out of place. Expected variant, extension, or end of string. 30 | 31 | >>> parse_tag('de-DE-1901') 32 | [('language', 'de'), ('territory', 'DE'), ('variant', '1901')] 33 | 34 | >>> parse_tag('ja-latn-hepburn') 35 | [('language', 'ja'), ('script', 'Latn'), ('variant', 'hepburn')] 36 | 37 | >>> parse_tag('ja-hepburn-latn') 38 | Traceback (most recent call last): 39 | ... 40 | langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string. 41 | 42 | >>> parse_tag('zh-yue') 43 | [('language', 'zh'), ('extlang', 'yue')] 44 | 45 | >>> parse_tag('zh-yue-Hant') 46 | [('language', 'zh'), ('extlang', 'yue'), ('script', 'Hant')] 47 | 48 | >>> parse_tag('zh-min-nan') 49 | [('grandfathered', 'zh-min-nan')] 50 | 51 | >>> parse_tag('x-dothraki') 52 | [('language', 'x-dothraki')] 53 | 54 | >>> parse_tag('en-u-co-backward-x-pig-latin') 55 | [('language', 'en'), ('extension', 'u-co-backward'), ('private', 'x-pig-latin')] 56 | 57 | >>> parse_tag('en-x-pig-latin-u-co-backward') 58 | [('language', 'en'), ('private', 'x-pig-latin-u-co-backward')] 59 | 60 | >>> parse_tag('u-co-backward') 61 | Traceback (most recent call last): 62 | ... 63 | langcodes.tag_parser.LanguageTagError: Expected a language code, got 'u' 64 | 65 | >>> parse_tag('x-') 66 | Traceback (most recent call last): 67 | ... 68 | langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '' 69 | 70 | >>> parse_tag('und-u-') 71 | Traceback (most recent call last): 72 | ... 73 | langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '' 74 | 75 | >>> parse_tag('und-0-foo') 76 | [('language', 'und'), ('extension', '0-foo')] 77 | 78 | >>> parse_tag('und-?-foo') 79 | Traceback (most recent call last): 80 | ... 81 | langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '?' 82 | 83 | >>> parse_tag('und-x-123456789') 84 | Traceback (most recent call last): 85 | ... 86 | langcodes.tag_parser.LanguageTagError: Expected 1-8 alphanumeric characters, got '123456789' 87 | 88 | >>> parse_tag('en-a-b-foo') 89 | Traceback (most recent call last): 90 | ... 91 | langcodes.tag_parser.LanguageTagError: Tag extensions may not contain two singletons in a row 92 | 93 | >>> parse_tag('ar-٠٠١') 94 | Traceback (most recent call last): 95 | ... 
96 | langcodes.tag_parser.LanguageTagError: Language tags must be made of ASCII characters 97 | """ 98 | 99 | # These tags should not be parsed by the usual parser; they're grandfathered 100 | # in from RFC 3066. The 'irregular' ones don't fit the syntax at all; the 101 | # 'regular' ones do, but would give meaningless results when parsed. 102 | # 103 | # These are all lowercased so they can be matched case-insensitively, as the 104 | # standard requires. 105 | EXCEPTIONS = { 106 | # Irregular exceptions 107 | "en-gb-oed", 108 | "i-ami", 109 | "i-bnn", 110 | "i-default", 111 | "i-enochian", 112 | "i-hak", 113 | "i-klingon", 114 | "i-lux", 115 | "i-mingo", 116 | "i-navajo", 117 | "i-pwn", 118 | "i-tao", 119 | "i-tay", 120 | "i-tsu", 121 | "sgn-be-fr", 122 | "sgn-be-nl", 123 | "sgn-ch-de", 124 | # Regular exceptions 125 | "art-lojban", 126 | "cel-gaulish", 127 | "no-bok", 128 | "no-nyn", 129 | "zh-guoyu", 130 | "zh-hakka", 131 | "zh-min", 132 | "zh-min-nan", 133 | "zh-xiang", 134 | } 135 | 136 | # Define the order of subtags as integer constants, but also give them names 137 | # so we can describe them in error messages 138 | EXTLANG, SCRIPT, TERRITORY, VARIANT, EXTENSION = range(5) 139 | SUBTAG_TYPES = [ 140 | 'extlang', 141 | 'script', 142 | 'territory', 143 | 'variant', 144 | 'extension', 145 | 'end of string', 146 | ] 147 | 148 | 149 | def normalize_characters(tag): 150 | """ 151 | BCP 47 is case-insensitive, and CLDR's use of it considers underscores 152 | equivalent to hyphens. So here we smash tags into lowercase with hyphens, 153 | so we can make exact comparisons. 154 | 155 | >>> normalize_characters('en_US') 156 | 'en-us' 157 | >>> normalize_characters('zh-Hant_TW') 158 | 'zh-hant-tw' 159 | """ 160 | return tag.lower().replace('_', '-') 161 | 162 | 163 | def parse_tag(tag): 164 | """ 165 | Parse the syntax of a language tag, without looking up anything in the 166 | registry, yet. Returns a list of (type, value) tuples indicating what 167 | information will need to be looked up. 168 | """ 169 | if not tag.isascii(): 170 | raise LanguageTagError("Language tags must be made of ASCII characters") 171 | 172 | tag = normalize_characters(tag) 173 | if tag in EXCEPTIONS: 174 | return [('grandfathered', tag)] 175 | else: 176 | # The first subtag is always either the language code, or 'x' to mark 177 | # the entire tag as private-use. Other subtags are distinguished 178 | # by their length and format, but the language code is distinguished 179 | # by the fact that it is required to come first. 
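        # For example, for an input like 'zh-Hant_TW', normalize_characters
        # has already produced 'zh-hant-tw', so this split yields
        # ['zh', 'hant', 'tw'].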
180 | subtags = tag.split('-') 181 | 182 | # check all subtags for their shape: 1-8 alphanumeric characters 183 | for subtag in subtags: 184 | if len(subtag) < 1 or len(subtag) > 8 or not subtag.isalnum(): 185 | raise LanguageTagError( 186 | f"Expected 1-8 alphanumeric characters, got {subtag!r}" 187 | ) 188 | 189 | if subtags[0] == 'x': 190 | if len(subtags) == 1: 191 | raise LanguageTagError("'x' is not a language tag on its own") 192 | # the entire language tag is private use, but we know that, 193 | # whatever it is, it fills the "language" slot 194 | return [('language', tag)] 195 | elif 2 <= len(subtags[0]) <= 4: 196 | # Language codes should be 2 or 3 letters, but 4-letter codes 197 | # are allowed to parse for legacy Unicode reasons 198 | return [('language', subtags[0])] + parse_subtags(subtags[1:]) 199 | else: 200 | subtag_error(subtags[0], 'a language code') 201 | 202 | 203 | def parse_subtags(subtags, expect=EXTLANG): 204 | """ 205 | Parse everything that comes after the language tag: scripts, territories, 206 | variants, and assorted extensions. 207 | """ 208 | # We parse the parts of a language code recursively: each step of 209 | # language code parsing handles one component of the code, recurses 210 | # to handle the rest of the code, and adds what it found onto the 211 | # list of things that were in the rest of the code. 212 | # 213 | # This could just as well have been iterative, but the loops would have 214 | # been convoluted. 215 | # 216 | # So here's the base case. 217 | if not subtags: 218 | return [] 219 | 220 | # There's a subtag that comes next. We need to find out what it is. 221 | # 222 | # The primary thing that distinguishes different types of subtags is 223 | # length, but the subtags also come in a specified order. The 'expect' 224 | # parameter keeps track of where we are in that order. expect=TERRITORY, 225 | # for example, means we're expecting a territory code, or anything later 226 | # (because everything but the language is optional). 227 | subtag = subtags[0] 228 | tag_length = len(subtag) 229 | 230 | # In the usual case, our goal is to recognize what kind of tag this is, 231 | # and set it in 'tagtype' -- as an integer, so we can compare where it 232 | # should go in order. You can see the enumerated list of tagtypes above, 233 | # where the SUBTAG_TYPES global is defined. 234 | tagtype = None 235 | 236 | if tag_length == 1: 237 | # A one-letter subtag introduces an extension, which can itself have 238 | # sub-subtags, so we dispatch to a different function at this point. 239 | # 240 | # We don't need to check anything about the order, because extensions 241 | # necessarily come last. 242 | if subtag.isalnum(): 243 | return parse_extension(subtags) 244 | else: 245 | subtag_error(subtag) 246 | 247 | elif tag_length == 2: 248 | if subtag.isalpha(): 249 | # Two-letter alphabetic subtags are territories. These are the only 250 | # two-character subtags after the language. 251 | tagtype = TERRITORY 252 | 253 | elif tag_length == 3: 254 | if subtag.isalpha(): 255 | # Three-letter alphabetic subtags are 'extended languages'. 256 | # It's allowed for there to be up to three of them in a row, so we 257 | # need another function to enforce that. Before we dispatch to that 258 | # function, though, we need to check whether we're in the right 259 | # place in order. 
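            # (For example, in 'zh-yue' the three-letter subtag 'yue' arrives
            # here and is parsed as an extlang, as the doctests at the top of
            # this module show.)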
260 | if expect <= EXTLANG: 261 | return parse_extlang(subtags) 262 | else: 263 | order_error(subtag, EXTLANG, expect) 264 | elif subtag.isdigit(): 265 | # Three-digit subtags are territories representing broad regions, 266 | # such as Latin America (419). 267 | tagtype = TERRITORY 268 | 269 | elif tag_length == 4: 270 | if subtag.isalpha(): 271 | # Four-letter alphabetic subtags are scripts. 272 | tagtype = SCRIPT 273 | elif subtag[0].isdigit(): 274 | # Four-character subtags that start with a digit are variants. 275 | tagtype = VARIANT 276 | 277 | else: 278 | # Tags of length 5-8 are variants. 279 | tagtype = VARIANT 280 | 281 | # That's the end of the big elif block for figuring out what kind of 282 | # subtag we have based on its length. Now we should do something with that 283 | # kind of subtag. 284 | 285 | if tagtype is None: 286 | # We haven't recognized a type of tag. This subtag just doesn't fit the 287 | # standard. 288 | subtag_error(subtag) 289 | 290 | elif tagtype < expect: 291 | # We got a tag type that was supposed to appear earlier in the order. 292 | order_error(subtag, tagtype, expect) 293 | 294 | else: 295 | # We've recognized a subtag of a particular type. If it's a territory or 296 | # script, we expect the next subtag to be a strictly later type, because 297 | # there can be at most one territory and one script. Otherwise, we expect 298 | # the next subtag to be the type we got or later. 299 | 300 | if tagtype in (SCRIPT, TERRITORY): 301 | expect = tagtype + 1 302 | else: 303 | expect = tagtype 304 | 305 | # Get the name of this subtag type instead of its integer value. 306 | typename = SUBTAG_TYPES[tagtype] 307 | 308 | # Some subtags are conventionally written with capitalization. Apply 309 | # those conventions. 310 | if tagtype == SCRIPT: 311 | subtag = subtag.title() 312 | elif tagtype == TERRITORY: 313 | subtag = subtag.upper() 314 | 315 | # Recurse on the remaining subtags. 316 | return [(typename, subtag)] + parse_subtags(subtags[1:], expect) 317 | 318 | 319 | def parse_extlang(subtags): 320 | """ 321 | Parse an 'extended language' tag, which consists of 1 to 3 three-letter 322 | language codes. 323 | 324 | Extended languages are used for distinguishing dialects/sublanguages 325 | (depending on your view) of macrolanguages such as Arabic, Bahasa Malay, 326 | and Chinese. 327 | 328 | It's supposed to also be acceptable to just use the sublanguage as the 329 | primary language code, and your code should know what's a macrolanguage of 330 | what. For example, 'zh-yue' and 'yue' are the same language (Cantonese), 331 | and differ only in whether they explicitly spell out that Cantonese is a 332 | kind of Chinese. 333 | """ 334 | index = 0 335 | parsed = [] 336 | while index < len(subtags) and len(subtags[index]) == 3 and index < 3: 337 | parsed.append(('extlang', subtags[index])) 338 | index += 1 339 | return parsed + parse_subtags(subtags[index:], SCRIPT) 340 | 341 | 342 | def parse_extension(subtags): 343 | """ 344 | An extension tag consists of a 'singleton' -- a one-character subtag -- 345 | followed by other subtags. Extension tags are in the BCP 47 syntax, but 346 | their meaning is outside the scope of the standard. 347 | 348 | For example, there's the u- extension, which is used for setting Unicode 349 | properties in some context I'm not aware of. 350 | 351 | If the singleton is 'x', it's a private use extension, and consumes the 352 | rest of the tag. Otherwise, it stops at the next singleton. 
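
    Illustrative examples (inputs shown as parse_subtags would pass them):

    >>> parse_extension(['u', 'co', 'backward'])
    [('extension', 'u-co-backward')]

    >>> parse_extension(['x', 'dothraki'])
    [('private', 'x-dothraki')]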
353 | """ 354 | subtag = subtags[0] 355 | if len(subtags) == 1: 356 | raise LanguageTagError(f"The subtag {subtag!r} must be followed by something") 357 | 358 | if subtag == 'x': 359 | # Private use. Everything after this is arbitrary codes that we 360 | # can't look up. 361 | return [('private', '-'.join(subtags))] 362 | 363 | else: 364 | # Look for the next singleton, if there is one. 365 | boundary = 1 366 | while boundary < len(subtags) and len(subtags[boundary]) != 1: 367 | boundary += 1 368 | 369 | if boundary == 1: 370 | raise LanguageTagError( 371 | "Tag extensions may not contain two singletons in a row" 372 | ) 373 | # We've parsed a complete extension subtag. Return to the main 374 | # parse_subtags function, but expect to find nothing but more 375 | # extensions at this point. 376 | return [('extension', '-'.join(subtags[:boundary]))] + parse_subtags( 377 | subtags[boundary:], EXTENSION 378 | ) 379 | 380 | 381 | class LanguageTagError(ValueError): 382 | pass 383 | 384 | 385 | def order_error(subtag, got, expected): 386 | """ 387 | Output an error indicating that tags were out of order. 388 | """ 389 | options = SUBTAG_TYPES[expected:] 390 | if len(options) == 1: 391 | expect_str = options[0] 392 | elif len(options) == 2: 393 | expect_str = f'{options[0]} or {options[1]}' 394 | else: 395 | joined = ', '.join(options[:-1]) 396 | last = options[-1] 397 | expect_str = f'{joined}, or {last}' 398 | got_str = SUBTAG_TYPES[got] 399 | raise LanguageTagError( 400 | f"This {got_str} subtag, {subtag!r}, is out of place. Expected {expect_str}." 401 | ) 402 | 403 | 404 | def subtag_error(subtag, expected='a valid subtag'): 405 | """ 406 | Try to output a reasonably helpful error message based on our state of 407 | parsing. Most of this code is about how to list, in English, the kinds 408 | of things we were expecting to find. 409 | """ 410 | raise LanguageTagError(f"Expected {expected}, got {subtag!r}") 411 | -------------------------------------------------------------------------------- /langcodes/tests/README.md: -------------------------------------------------------------------------------- 1 | Most of the tests for langcodes are in doctests, intended to be run on Python 3. This directory contains additional tests that ensure langcodes can recognize language names as they are used on Wiktionary, the free multilingual dictionary. 2 | -------------------------------------------------------------------------------- /langcodes/tests/test_alpha3.py: -------------------------------------------------------------------------------- 1 | import string 2 | import langcodes 3 | 4 | def test_alpha2_to_alpha3(): 5 | """ 6 | Test that each valid alpha2 code has a corresponding, unique alpha3 code. 7 | """ 8 | seen = set() 9 | for letter1 in string.ascii_lowercase: 10 | for letter2 in string.ascii_lowercase: 11 | code = letter1 + letter2 12 | language = langcodes.get(code, normalize=False) 13 | if language.is_valid(): 14 | alpha3 = language.to_alpha3() 15 | 16 | # These four 2-letter codes exist only as aliases, and don't have 17 | # their own unique 3-letter codes. All other 2-letter codes should 18 | # uniquely map to 3-letter codes. 
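                # (For instance, 'iw' is the deprecated alias for Hebrew and
                # maps to the same alpha3, 'heb', as its replacement 'he'.)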
19 | if code not in {'in', 'iw', 'ji', 'jw'}: 20 | assert alpha3 not in seen 21 | seen.add(alpha3) 22 | -------------------------------------------------------------------------------- /langcodes/tests/test_issue_59.py: -------------------------------------------------------------------------------- 1 | from langcodes import closest_match 2 | 3 | 4 | def test_language_less_than(): 5 | spoken_language_1 = 'pa' 6 | spoken_language_2 = 'pa-PK' 7 | match = closest_match( 8 | spoken_language_1, [spoken_language_2], ignore_script=True 9 | ) 10 | print(match) 11 | assert match[0] != "und" 12 | 13 | 14 | def test_language_more_than(): 15 | spoken_language_1 = 'pa-PK' 16 | spoken_language_2 = 'pa' 17 | match = closest_match( 18 | spoken_language_1, [spoken_language_2], ignore_script=True 19 | ) 20 | print(match) 21 | assert match[0] != "und" -------------------------------------------------------------------------------- /langcodes/tests/test_language.py: -------------------------------------------------------------------------------- 1 | from langcodes import Language 2 | 3 | 4 | def test__hash__(): 5 | en1 = Language.get("en") 6 | # Disable caching 7 | Language._INSTANCES = {} 8 | Language._PARSE_CACHE = {} 9 | en2 = Language.get("en") 10 | assert hash(en1) == hash(en2) 11 | 12 | # Again, disable caching 13 | Language._INSTANCES = {} 14 | Language._PARSE_CACHE = {} 15 | en_us = Language.get("en-US") 16 | assert hash(en1) != hash(en_us) 17 | -------------------------------------------------------------------------------- /langcodes/tests/test_language_data.py: -------------------------------------------------------------------------------- 1 | import langcodes 2 | 3 | 4 | def test_updated_iana(): 5 | aqk = langcodes.get('aqk') 6 | assert aqk.language_name('en') == 'Aninka' 7 | 8 | 9 | def test_cldr_v40(): 10 | en = langcodes.get('en') 11 | assert en.language_name('dsb') == 'engelšćina' 12 | -------------------------------------------------------------------------------- /langcodes/tests/test_wikt_languages.py: -------------------------------------------------------------------------------- 1 | """ 2 | Here, we test that we can associate a language code with each language name 3 | that is commonly used on Wiktionary, that all the language codes are 4 | different, and that each language name matches only one code. 
5 | """ 6 | import pytest 7 | import langcodes 8 | from langcodes.language_lists import WIKT_LANGUAGE_NAMES 9 | 10 | LANGUAGES = ['en', 'de'] 11 | 12 | 13 | @pytest.mark.parametrize("target_lang", LANGUAGES) 14 | def test_check_wiktionary_language(target_lang): 15 | seen_codes = {} 16 | for lang_name in WIKT_LANGUAGE_NAMES[target_lang]: 17 | if lang_name.startswith('Proto-'): 18 | continue 19 | code = str(langcodes.find(lang_name)) 20 | assert code not in seen_codes, "%r and %r have the same code" % ( 21 | seen_codes[code], 22 | lang_name, 23 | ) 24 | seen_codes[code] = lang_name 25 | -------------------------------------------------------------------------------- /langcodes/util.py: -------------------------------------------------------------------------------- 1 | from importlib.resources import files 2 | 3 | DATA_ROOT = files('langcodes').joinpath('data') 4 | import os 5 | 6 | 7 | def data_filename(filename): 8 | return os.path.join(DATA_ROOT, filename) 9 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "langcodes" 3 | description = "Tools for labeling human languages with IETF language tags" 4 | authors = [{name = "Robyn Speer", email = "rspeer@arborelia.net"}] 5 | maintainers = [{name = "Georg Krause", email = "mail@georg-krause.net"}] 6 | readme = "README.md" 7 | classifiers = [ 8 | "Development Status :: 5 - Production/Stable", 9 | "License :: OSI Approved :: MIT License", 10 | "Programming Language :: Python :: 3", 11 | "Programming Language :: Python :: 3.9", 12 | "Programming Language :: Python :: 3.10", 13 | "Programming Language :: Python :: 3.11", 14 | "Programming Language :: Python :: 3.12", 15 | "Programming Language :: Python :: 3.13", 16 | ] 17 | dynamic = ["version"] 18 | 19 | requires-python = ">= 3.9" 20 | 21 | [project.urls] 22 | Homepage = "https://github.com/rspeer/langcodes" 23 | Repository = "https://github.com/rspeer/langcodes" 24 | Issues = "https://github.com/rspeer/langcodes/issues" 25 | 26 | [project.optional-dependencies] 27 | test = [ 28 | 'pytest', 29 | 'pytest-cov', 30 | 'language-data>=1.2' 31 | ] 32 | build = [ 33 | 'build', 34 | 'twine' 35 | ] 36 | data = [ 37 | "language-data>=1.2" 38 | ] 39 | 40 | [build-system] 41 | requires = ["setuptools>=60", "setuptools-scm>=8.0"] 42 | build-backend = "setuptools.build_meta" 43 | 44 | [tool.setuptools_scm] 45 | 46 | [tool.pytest.ini_options] 47 | addopts = "--doctest-modules --doctest-glob=README.md --ignore=setup.py --ignore=example.py --ignore=langcodes/data" 48 | norecursedirs = ".git ignore build __pycache__" 49 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:recommended" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36, py37, py38, py39, py310, py311, py312 3 | skipsdist = True 4 | 5 | [testenv] 6 | deps = 7 | pytest 8 | marisa_trie 9 | language_data 10 | commands = pip install . 11 | pytest 12 | --------------------------------------------------------------------------------