├── tests ├── __init__.py ├── test_uni_to_beta.py └── test_beta_to_uni.py ├── betacode ├── __init__.py ├── conv.py └── _map.py ├── .coveragerc ├── MANIFEST.in ├── .gitignore ├── .travis.yml ├── requirements.txt ├── Makefile ├── LICENSE ├── CHANGELOG.md ├── setup.py ├── README ├── README.md └── README.rst /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /betacode/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['conv'] 2 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | omit = 3 | */python?.?/* 4 | */site-packages/* 5 | *__init__* 6 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGELOG.md 2 | include LICENSE 3 | include README.md 4 | include README.rst 5 | include README 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .pytest_cache/ 3 | dist/ 4 | build/ 5 | venv/ 6 | 7 | MANIFEST 8 | .*.swp 9 | *.pyc 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.4" 4 | - "3.5" 5 | - "3.6" 6 | install: 7 | - pip install -r requirements.txt 8 | - pip install coveralls 9 | script: 10 | - coverage run --source betacode -m pytest 11 | after_success: 12 | - coveralls 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attrs==17.4.0 2 | certifi==2018.4.16 3 | chardet==3.0.4 4 | idna==2.6 5 | more-itertools==4.1.0 6 | pkginfo==1.4.2 7 | pluggy==0.6.0 8 | py==1.5.3 9 | pygtrie==2.2 10 | pytest==3.5.0 11 | requests==2.18.4 12 | requests-toolbelt==0.8.0 13 | six==1.11.0 14 | tqdm==4.23.3 15 | twine==1.11.0 16 | urllib3==1.22 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | python setup.py build 3 | 4 | sdist: 5 | python setup.py sdist 6 | 7 | publish: 8 | python setup.py sdist 9 | twine upload dist/* 10 | 11 | publishtest: 12 | python setup.py sdist 13 | twine upload --repository-url https://test.pypi.org/legacy/ dist/* 14 | 15 | test: 16 | pytest 17 | 18 | docs: 19 | pandoc --from=markdown --to=rst --output=README.rst README.md 20 | pandoc --from=markdown --to=plain --output=README README.md 21 | # Remove the first 3 lines of the README file which are badge related. 
22 | sed -i 1,3d README 23 | 24 | clean: 25 | if [ -d 'dist' ]; then \ 26 | rm -r dist; \ 27 | fi 28 | 29 | if [ -d 'build' ]; then \ 30 | rm -r build; \ 31 | fi 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018 Matias Grioni 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All changes between versions will be kept track of in this file. 4 | 5 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 6 | 7 | Although there have been previous versions of this software, I do not think there have been previous users so this will be the first version that is kept track of. 8 | 9 | ## 1.0 - 2020-03-08 10 | ### Fixed 11 | - Windows installation did not work since default encoding on Windows is CP-1252 and README read in during setup.py is encoded in UTF-8. 12 | 13 | ## 0.2 - 2018-05-25 14 | ### Added 15 | - Use strict or non-strict mode when coverting from betacode to unicode 16 | - Fix bug with word final sigma used when word final apostrophe after 17 | 18 | ## 0.1.6 - 2018-05-24 19 | ### Removed 20 | - Unnecessary test file 21 | 22 | ## 0.1.5 - 2018-05-24 23 | ### Added 24 | - Convert from betacode to unicode and back 25 | - Case insensitive 26 | - Diacritic order insensitive 27 | - Use oxeîa rather than tónos 28 | - This changelog 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | import os 3 | 4 | def read(fn): 5 | """ 6 | Read in the given file. 7 | 8 | Args: 9 | fn: The filename to read in, relative to the current folder. 10 | 11 | Returns: 12 | The text contents of the file. 
13 | """ 14 | with open(os.path.join(os.path.dirname(__file__), fn), encoding='utf-8') as f: 15 | return f.read() 16 | 17 | setup( 18 | name = 'betacode', 19 | packages = ['betacode'], 20 | version = '1.0', 21 | description = 'Betacode to Unicode converter.', 22 | long_description = read('README.rst'), 23 | author = 'Matias Grioni', 24 | author_email = 'matgrioni@gmail.com', 25 | url = 'https://github.com/matgrioni/betacode', 26 | license = 'MIT', 27 | keywords = ['encoding', 'unicode', 'betacode', 'greek'], 28 | classifiers = [ 29 | 'Programming Language :: Python', 30 | 'Programming Language :: Python :: 3', 31 | 'Operating System :: OS Independent', 32 | 'License :: OSI Approved :: MIT License', 33 | 'Development Status :: 4 - Beta', 34 | 'Intended Audience :: Science/Research', 35 | 'Topic :: Text Processing :: Linguistic', 36 | 'Natural Language :: Greek', 37 | ], 38 | ) 39 | -------------------------------------------------------------------------------- /tests/test_uni_to_beta.py: -------------------------------------------------------------------------------- 1 | import unicodedata 2 | 3 | import betacode.conv 4 | 5 | def _test_uni_beta_equality(uni, beta): 6 | """ 7 | Test that the result of converting uni is beta. 8 | 9 | Comparison is done via the NFC normalization for unicode. 10 | 11 | Args: 12 | uni: The unicode to convert. 13 | beta: The expected beta code result from conversion. 14 | """ 15 | conv = betacode.conv.uni_to_beta(uni) 16 | conv_normalized = unicodedata.normalize('NFC', conv) 17 | beta_normalized = unicodedata.normalize('NFC', beta) 18 | 19 | assert conv_normalized == beta_normalized 20 | 21 | def test_empty(): 22 | uni = '' 23 | beta = '' 24 | 25 | _test_uni_beta_equality(uni, beta) 26 | 27 | def test_simple_conv(): 28 | uni = 'αβ' 29 | beta = 'ab' 30 | 31 | _test_uni_beta_equality(uni, beta) 32 | 33 | def test_multi_word(): 34 | uni = 'βίον τέχνης καὶ εὐδαιμονίας.' 35 | beta = 'bi/on te/xnhs kai\ eu)daimoni/as.' 36 | 37 | _test_uni_beta_equality(uni, beta) 38 | 39 | def test_many_accents(): 40 | uni = 'Ἔφορος καὶ ἄλλοι' 41 | beta = '*)/eforos kai\ a)/lloi' 42 | 43 | _test_uni_beta_equality(uni, beta) 44 | 45 | def test_colon_punc(): 46 | uni = 'πλείους: ἔτι δὲ οἱ μετὰ' 47 | beta = 'plei/ous: e)/ti de\ oi( meta\\' 48 | 49 | _test_uni_beta_equality(uni, beta) 50 | 51 | def test_mixed_conversion(): 52 | uni = 'Many python packages cannot convert this: ἔτι δὲ οἱ' 53 | beta = 'Many python packages cannot convert this: e)/ti de\ oi(' 54 | 55 | _test_uni_beta_equality(uni, beta) 56 | -------------------------------------------------------------------------------- /tests/test_beta_to_uni.py: -------------------------------------------------------------------------------- 1 | import unicodedata 2 | 3 | import betacode.conv 4 | 5 | def _test_beta_uni_equality(beta, uni, strict=False): 6 | """ 7 | Test that the result of converting beta is uni. 8 | 9 | Comparison is done via the NFC normalization for unicode. 10 | 11 | Args: 12 | beta: The beta code to convert. 13 | uni: The expected unicode result from conversion. 14 | strict: Flag to set the strictness of betacode parsing. 
15 | """ 16 | conv = betacode.conv.beta_to_uni(beta, strict) 17 | conv_normalized = unicodedata.normalize('NFC', conv) 18 | uni_normalized = unicodedata.normalize('NFC', uni) 19 | 20 | assert conv_normalized == uni_normalized 21 | 22 | def test_empty(): 23 | beta = '' 24 | uni = '' 25 | 26 | _test_beta_uni_equality(beta, uni) 27 | 28 | def test_simple_conv(): 29 | beta = 'tou=' 30 | uni = 'τοῦ' 31 | 32 | _test_beta_uni_equality(beta, uni) 33 | 34 | def test_final_sigma(): 35 | beta = 'th=s' 36 | uni = 'τῆς' 37 | 38 | _test_beta_uni_equality(beta, uni) 39 | 40 | def test_numeric_sigma_id(): 41 | beta = 'th=s2' 42 | uni = 'τῆς' 43 | 44 | _test_beta_uni_equality(beta, uni) 45 | 46 | def test_keep_non_final_sigma_numeric(): 47 | beta = 'th=s3 tou=' 48 | uni = 'τῆϲ τοῦ' 49 | 50 | _test_beta_uni_equality(beta, uni) 51 | 52 | def test_final_sigma_word(): 53 | beta = 'th=s tou=' 54 | uni = 'τῆς τοῦ' 55 | 56 | _test_beta_uni_equality(beta, uni) 57 | 58 | def test_final_sigma_whitespace(): 59 | beta = 'th=s\ttou=' 60 | uni = 'τῆς\tτοῦ' 61 | 62 | _test_beta_uni_equality(beta, uni) 63 | 64 | def test_final_sigma_punctuation(): 65 | beta = 'th=s; tou=' 66 | uni = 'τῆς; τοῦ' 67 | 68 | _test_beta_uni_equality(beta, uni) 69 | 70 | def test_final_sigma_apostrophe(): 71 | beta = 'th=s\' tou=' 72 | uni = 'τῆσ’ τοῦ' 73 | 74 | _test_beta_uni_equality(beta, uni) 75 | 76 | def test_multi_word(): 77 | beta = 'analabo/ntes de\ kaq\' e(/kaston' 78 | uni = 'αναλαβόντες δὲ καθ’ ἕκαστον' 79 | 80 | _test_beta_uni_equality(beta, uni) 81 | 82 | def test_punctuation_semicolon(): 83 | beta = 'e)/oiken h)\ dida/skonti; nh\\' 84 | uni = 'ἔοικεν ἢ διδάσκοντι; νὴ' 85 | 86 | _test_beta_uni_equality(beta, uni) 87 | 88 | def test_punctuation_colon(): 89 | beta = 'dh=lon: oi(/ te' 90 | uni = 'δῆλον· οἵ τε' 91 | 92 | _test_beta_uni_equality(beta, uni) 93 | 94 | def test_out_of_order(): 95 | beta = 'e/)oiken h\) dida/skonti; nh\\ a=|)i+\\' 96 | uni = 'ἔοικεν ἢ διδάσκοντι; νὴ ᾆῒ' 97 | 98 | def test_cap_out_of_order(): 99 | beta = '*)/eforos ka*)/ei\ a/)lloi' 100 | uni = 'Ἔφορος καἜὶ ἄλλοι' 101 | 102 | _test_beta_uni_equality(beta, uni) 103 | 104 | def test_cap_out_of_order_with_iota(): 105 | beta = '*)/eforos ka*)/ei\ a/)lloi *)h\|' 106 | uni = 'Ἔφορος καἜὶ ἄλλοι ᾛ' 107 | 108 | _test_beta_uni_equality(beta, uni) 109 | 110 | def test_strict_correct(): 111 | beta = 'e)n d\' e)\pes\' w)keanw=|' 112 | uni = 'ἐν δ’ ἒπεσ’ ὠκεανῷ' 113 | 114 | _test_beta_uni_equality(beta, uni, strict=True) 115 | 116 | def test_strict_incorrect(): 117 | beta = 'e)n d\' e)\pes\' w)keanw|=' 118 | uni = 'ἐν δ’ ἒπεσ’ ὠκεανῳ=' 119 | 120 | _test_beta_uni_equality(beta, uni, strict=True) 121 | 122 | def test_unstrict(): 123 | beta = 'e)n d\' e)\pes\' w)keanw|=' 124 | uni = 'ἐν δ’ ἒπεσ’ ὠκεανῷ' 125 | 126 | _test_beta_uni_equality(beta, uni, strict=False) 127 | 128 | def test_unstrict_capitalization(): 129 | beta = '*)e/foros ka*e)/i\ a/)lloi *)\h|' 130 | uni = 'Ἔφορος καἜὶ ἄλλοι ᾛ' 131 | 132 | _test_beta_uni_equality(beta, uni, strict=False) 133 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | betacode 2 | 3 | Convert betacode to unicode and vice-versa easily. Tested on python 3.4, 4 | 3.5, and 3.6. The definition used is based off what is found at the TLG 5 | Beta Code Manual. Only the Greek sections were paid attention to. 
6 | 7 | 8 | Motivation 9 | 10 | I was working on a classics research project and had to use the Perseus 11 | catalog to extract some Greek work. Much to my surprise, however, the 12 | only download I could find was a betacode version, an encoding that is 13 | over 30 years old, rather than modern, fancy, clean unicode. There was 14 | no nice pip package that I could easily go to for this simple task, so I 15 | decided to roll my own. 16 | 17 | Install 18 | 19 | Installation is easy. Use pip or your preferred method to download from 20 | PyPI. 21 | 22 | pip install betacode 23 | 24 | Usage 25 | 26 | Note that in all examples, strings are unicode encoded. Input can be in 27 | upper or lower case. The official definition from TLG uses only 28 | uppercase, but many resources, such as the Perseus catalog, are encoded 29 | in lowercase, so this package accepts both. This package can also 30 | disregard the unnecessary canonical order of Greek diacritics from the 31 | official definition. The only thing that matters in order for the 32 | betacode to be unambiguous is that each unit must begin with either a * 33 | or a letter. As long as these constraints are followed, breathing marks, 34 | accents, and such can go in any order. However, the canonical order 35 | will be returned when going from unicode to betacode. Also note that 36 | currently, only individual, non-combining characters are handled. This 37 | means that you cannot do all combinations of letters and diacritics, 38 | only those defined as composite characters in the Greek and Greek 39 | Extended sections of unicode. 40 | 41 | Betacode to unicode 42 | 43 | import betacode.conv 44 | 45 | beta = 'analabo/ntes de\ kaq\' e(/kaston' 46 | betacode.conv.beta_to_uni(beta) # αναλαβόντες δὲ καθ᾽ ἕκαστον 47 | 48 | Note that polytonic accent marks will be used, and not monotonic accent 49 | marks. Both are de jure equivalent in Greece, but betacode was initially 50 | developed to encode classical works, so the polytonic diacritics are more 51 | fitting. In other words, the oxeîa will be used rather than tónos. The 52 | oxeîa form can be converted to the modern accent form easily, either 53 | through search and replace or unicode normalization, since oxeîa has a 54 | canonical decomposition into tónos. 55 | 56 | Conversion can also be made more strict by using the strict flag. 57 | 58 | beta_to_uni(text, strict=False) 59 | 60 | If set, only the canonical order of diacritics is accepted in betacode. 61 | If it is not set, then any order is allowed as long as capital letters 62 | begin with a * and lowercase letters begin with the letter and not a 63 | diacritic. 64 | 65 | Unicode to betacode 66 | 67 | import betacode.conv 68 | 69 | uni = 'αναλαβόντες δὲ καθ᾽ ἕκαστον' 70 | betacode.conv.uni_to_beta(uni) # analabo/ntes de\ kaq\' e(/kaston 71 | 72 | The unicode text can use either polytonic (oxeîa) or monotonic 73 | (tónos) accent marks. 74 | 75 | Speed 76 | 77 | The original implementation used a custom-made trie. This may not have been 78 | the fastest approach (I wasn't sure). So, I compared against a third-party trie 79 | implementation, pygtrie. pygtrie has nicer prefix methods which 80 | allowed for much faster processing of large texts. This changed 81 | converting all of Strabo or Herodotus in the Perseus catalog from a 82 | many-minute operation to a ~3-4 second operation. I have seen implementations 83 | that use regular expressions which I suspect might be faster since the 84 | underlying implementation is in C.
However, this package is much smaller 85 | and simpler than CLTK, for example, if betacode conversion is all that is 86 | needed. 87 | 88 | Modified Betacode 89 | 90 | There is talk of a modified betacode that I have seen around on the 91 | internet. I have never been able to find a definitive definition of this 92 | so I have not implemented it. Among the differences are word-final sigma 93 | usage, _ as macron, and uppercase and lowercase Roman letters instead of 94 | using *. 95 | 96 | 97 | Development 98 | 99 | I am no classicist, and this was done in my free time. It is very 100 | possible that there are some letters missing that are not accounted for, 101 | or some punctuation that is not properly handled. If that is the case, 102 | please tell me as it is easy to fix, or open a PR from your own 103 | branch. Write tests if you do add a feature. 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/matgrioni/betacode.svg?branch=master)](https://travis-ci.org/matgrioni/betacode) 2 | [![Coverage Status](https://coveralls.io/repos/github/matgrioni/betacode/badge.svg?branch=master)](https://coveralls.io/github/matgrioni/betacode?branch=master) 3 | 4 | ## betacode 5 | 6 | Convert betacode to unicode and vice-versa easily. Tested on python 3.4, 3.5, and 3.6. The definition used is based off what is found at the [TLG Beta Code Manual](http://www.tlg.uci.edu/encoding/BCM.pdf). Only the Greek sections were paid attention to. 7 | 8 | ## Motivation 9 | 10 | I was working on a classics research project and had to use the Perseus catalog to extract some Greek work. Much to my surprise, however, the only download I could find was a betacode version, an encoding that is over 30 years old, rather than modern, fancy, clean unicode. There was no nice pip package that I could easily go to for this simple task, so I decided to roll my own. 11 | 12 | ### Install 13 | 14 | Installation is easy. Use `pip` or your preferred method to download from PyPI. 15 | 16 | ``` 17 | pip install betacode 18 | ``` 19 | 20 | ### Usage 21 | 22 | Note that in all examples, strings are unicode encoded. Input can be in upper or lower case. The official definition from TLG uses only uppercase, but many resources, such as the Perseus catalog, are encoded in lowercase, so this package accepts both. This package can also disregard the unnecessary canonical order of Greek diacritics from the official definition. The only thing that matters in order for the betacode to be unambiguous is that each unit must begin with either a `*` or a letter. As long as these constraints are followed, breathing marks, accents, and such can go in any order. However, the canonical order will be returned when going from unicode to betacode. Also note that currently, only individual, non-combining characters are handled. This means that you cannot do all combinations of letters and diacritics, only those defined as composite characters in the Greek and Greek Extended sections of unicode. 23 | 24 | #### Betacode to unicode 25 | 26 | ``` 27 | import betacode.conv 28 | 29 | beta = 'analabo/ntes de\ kaq\' e(/kaston' 30 | betacode.conv.beta_to_uni(beta) # αναλαβόντες δὲ καθ᾽ ἕκαστον 31 | ``` 32 | 33 | Note that polytonic accent marks will be used, and not monotonic accent marks. Both are de jure equivalent in Greece, but betacode was initially developed to encode classical works, so the polytonic diacritics are more fitting. In other words, the oxeîa will be used rather than tónos. The oxeîa form can be converted to the modern accent form easily, either through search and replace or unicode normalization, since oxeîa has a canonical decomposition into tónos.
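For instance, here is a minimal sketch of the normalization route, using only the Python standard library (this snippet is illustrative and not part of the package):

```
import unicodedata

oxeia = '\u1f71'                             # GREEK SMALL LETTER ALPHA WITH OXIA (Greek Extended block)
tonos = unicodedata.normalize('NFC', oxeia)  # NFC recomposes the oxeîa form to its tónos counterpart
assert tonos == '\u03ac'                     # GREEK SMALL LETTER ALPHA WITH TONOS (Greek and Coptic block)
```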
34 | 35 | Conversion can also be made more strict by using the `strict` flag. 36 | 37 | ``` 38 | beta_to_uni(text, strict=False) 39 | ``` 40 | 41 | If set, only the canonical order of diacritics is accepted in betacode. If it is not set, then any order is allowed as long as capital letters begin with a `*` and lowercase letters begin with the letter and not a diacritic.
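As a concrete illustration of the difference, mirroring cases exercised in `tests/test_beta_to_uni.py` (the word and both expected outputs are taken from those tests), a token whose diacritics are written out of canonical order is converted fully in the default mode but left partially unconverted in strict mode:

```
import betacode.conv

beta = 'w)keanw|='                            # perispomeni (=) written after the iota subscript (|)
betacode.conv.beta_to_uni(beta)               # 'ὠκεανῷ'  (any diacritic order is accepted)
betacode.conv.beta_to_uni(beta, strict=True)  # 'ὠκεανῳ=' (the out-of-order '=' is left unconverted)
```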
42 | 43 | #### Unicode to betacode 44 | ``` 45 | import betacode.conv 46 | 47 | uni = 'αναλαβόντες δὲ καθ᾽ ἕκαστον' 48 | betacode.conv.uni_to_beta(uni) # analabo/ntes de\ kaq\' e(/kaston 49 | ``` 50 | 51 | The unicode text can use either polytonic (oxeîa) or monotonic (tónos) accent marks. 52 | 53 | ### Speed 54 | 55 | The original implementation used a custom-made trie. This may not have been the fastest approach (I wasn't sure). So, I compared against a third-party trie implementation, pygtrie. pygtrie has nicer prefix methods which allowed for much faster processing of large texts. This changed converting all of Strabo or Herodotus in the Perseus catalog from a many-minute operation to a ~3-4 second operation. I have seen implementations that use regular expressions which I suspect might be faster since the underlying implementation is in C. However, this package is much smaller and simpler than CLTK, for example, if betacode conversion is all that is needed. 56 | 57 | ### Modified Betacode 58 | 59 | There is talk of a modified betacode that I have seen around on the internet. I have never been able to find a definitive definition of this so I have not implemented it. Among the differences are word-final sigma usage, `_` as macron, and uppercase and lowercase Roman letters instead of using `*`. 60 | 61 | 62 | ## Development 63 | 64 | I am no classicist, and this was done in my free time. It is very possible that there are some letters missing that are not accounted for, or some punctuation that is not properly handled. If that is the case, please tell me as it is easy to fix, or open a PR from your own branch. Write tests if you do add a feature. 65 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |Build Status| |Coverage Status| 2 | 3 | betacode 4 | -------- 5 | 6 | Convert betacode to unicode and vice-versa easily. Tested on python 3.4, 7 | 3.5, and 3.6. The definition used is based off what is found at the `TLG 8 | Beta Code Manual <http://www.tlg.uci.edu/encoding/BCM.pdf>`__. Only the 9 | Greek sections were paid attention to. 10 | 11 | Motivation 12 | ---------- 13 | 14 | I was working on a classics research project and had to use the Perseus 15 | catalog to extract some Greek work. Much to my surprise, however, the 16 | only download I could find was a betacode version, an encoding that is 17 | over 30 years old, rather than modern, fancy, clean unicode. There was 18 | no nice pip package that I could easily go to for this simple task, so I 19 | decided to roll my own. 20 | 21 | Install 22 | ~~~~~~~ 23 | 24 | Installation is easy. Use ``pip`` or your preferred method to download 25 | from PyPI. 26 | 27 | :: 28 | 29 | pip install betacode 30 | 31 | Usage 32 | ~~~~~ 33 | 34 | Note that in all examples, strings are unicode encoded. Input can be in 35 | upper or lower case. The official definition from TLG uses only 36 | uppercase, but many resources, such as the Perseus catalog, are encoded 37 | in lowercase, so this package accepts both. This package can also 38 | disregard the unnecessary canonical order of Greek diacritics from the 39 | official definition. The only thing that matters in order for the 40 | betacode to be unambiguous is that each unit must begin with either a 41 | ``*`` or a letter. As long as these constraints are followed, breathing 42 | marks, accents, and such can go in any order. However, the canonical 43 | order will be returned when going from unicode to betacode. Also note 44 | that currently, only individual, non-combining characters are handled. 45 | This means that you cannot do all combinations of letters and 46 | diacritics, only those defined as composite characters in the Greek and 47 | Greek Extended sections of unicode. 48 | 49 | Betacode to unicode 50 | ^^^^^^^^^^^^^^^^^^^ 51 | 52 | :: 53 | 54 | import betacode.conv 55 | 56 | beta = 'analabo/ntes de\ kaq\' e(/kaston' 57 | betacode.conv.beta_to_uni(beta) # αναλαβόντες δὲ καθ᾽ ἕκαστον 58 | 59 | Note that polytonic accent marks will be used, and not monotonic accent 60 | marks. Both are de jure equivalent in Greece, but betacode was initially 61 | developed to encode classical works, so the polytonic diacritics are more 62 | fitting. In other words, the oxeîa will be used rather than tónos. The 63 | oxeîa form can be converted to the modern accent form easily, either 64 | through search and replace or unicode normalization, since oxeîa has a 65 | canonical decomposition into tónos. 66 | 67 | Conversion can also be made more strict by using the ``strict`` flag. 68 | 69 | :: 70 | 71 | beta_to_uni(text, strict=False) 72 | 73 | If set, only the canonical order of diacritics is accepted in betacode. 74 | If it is not set, then any order is allowed as long as capital letters 75 | begin with a ``*`` and lowercase letters begin with the letter and not a 76 | diacritic. 77 | 78 | Unicode to betacode 79 | ^^^^^^^^^^^^^^^^^^^ 80 | 81 | :: 82 | 83 | import betacode.conv 84 | 85 | uni = 'αναλαβόντες δὲ καθ᾽ ἕκαστον' 86 | betacode.conv.uni_to_beta(uni) # analabo/ntes de\ kaq\' e(/kaston 87 | 88 | The unicode text can use either polytonic (oxeîa) or monotonic 89 | (tónos) accent marks. 90 | 91 | Speed 92 | ~~~~~ 93 | 94 | The original implementation used a custom-made trie. This may not have 95 | been the fastest approach (I wasn't sure). So, I compared against a 96 | third-party trie implementation, pygtrie. pygtrie has nicer prefix 97 | methods which allowed for much faster processing of large texts. This 98 | changed converting all of Strabo or Herodotus in the Perseus catalog 99 | from a many-minute operation to a ~3-4 second operation. I have seen 100 | implementations that use regular expressions which I suspect might be 101 | faster since the underlying implementation is in C. However, this 102 | package is much smaller and simpler than CLTK, for example, if betacode 103 | conversion is all that is needed. 104 | 105 | Modified Betacode 106 | ~~~~~~~~~~~~~~~~~ 107 | 108 | There is talk of a modified betacode that I have seen around on the 109 | internet. I have never been able to find a definitive definition of this 110 | so I have not implemented it.
Among some differences is word final sigma 111 | usage, ``_`` as macron, and uppercase and lowercase roman letters 112 | instead of using ``*``. 113 | 114 | Development 115 | ----------- 116 | 117 | I am no classicist, and this was done in my free time. It is very 118 | possible that there are some letters missing that are not accounted for, 119 | or some punctuation that is not properly handled. If that is the case, 120 | please tell me as it is easy to fix, or please open a PR for your own 121 | branch. Write tests if you do add a feature. 122 | 123 | .. |Build Status| image:: https://travis-ci.org/matgrioni/betacode.svg?branch=master 124 | :target: https://travis-ci.org/matgrioni/betacode 125 | .. |Coverage Status| image:: https://coveralls.io/repos/github/matgrioni/betacode/badge.svg?branch=master 126 | :target: https://coveralls.io/github/matgrioni/betacode?branch=master 127 | -------------------------------------------------------------------------------- /betacode/conv.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import unicodedata 3 | 4 | import pygtrie 5 | 6 | from . import _map 7 | 8 | # Special characters that need their own references to rewrite with 9 | _FINAL_LC_SIGMA = '\u03c2' 10 | _MEDIAL_LC_SIGMA = '\u03c3' 11 | 12 | # Punctuation marks in the betacode map 13 | _BETA_PUNCTUATION = frozenset('\':-_') 14 | _BETA_APOSTROPHE = '\u2019' 15 | 16 | 17 | def _create_unicode_map(): 18 | """ 19 | Create the inverse map from unicode to betacode. 20 | 21 | Returns: 22 | The hash map to convert unicode characters to the beta code representation. 23 | """ 24 | unicode_map = {} 25 | 26 | for beta, uni in _map.BETACODE_MAP.items(): 27 | # Include decomposed equivalent where necessary. 28 | norm = unicodedata.normalize('NFC', uni) 29 | unicode_map[norm] = beta 30 | unicode_map[uni] = beta 31 | 32 | # Add the final sigmas. 33 | final_sigma_norm = unicodedata.normalize('NFC', _FINAL_LC_SIGMA) 34 | unicode_map[final_sigma_norm] = 's' 35 | unicode_map[_FINAL_LC_SIGMA] = 's' 36 | 37 | return unicode_map 38 | 39 | _UNICODE_MAP = _create_unicode_map() 40 | 41 | 42 | def _create_conversion_trie(strict): 43 | """ 44 | Create the trie for betacode conversion. 45 | 46 | Args: 47 | text: The beta code text to convert. All of this text must be betacode. 48 | strict: Flag to allow for flexible diacritic order on input. 49 | 50 | Returns: 51 | The trie for conversion. 52 | """ 53 | t = pygtrie.CharTrie() 54 | 55 | for beta, uni in _map.BETACODE_MAP.items(): 56 | if strict: 57 | t[beta] = uni 58 | else: 59 | # The order of accents is very strict and weak. Allow for many orders of 60 | # accents between asterisk and letter or after letter. This does not 61 | # introduce ambiguity since each betacode token only has one letter and 62 | # either starts with a asterisk or a letter. 63 | diacritics = beta[1:] 64 | 65 | perms = itertools.permutations(diacritics) 66 | for perm in perms: 67 | perm_str = beta[0] + ''.join(perm) 68 | t[perm_str.lower()] = uni 69 | t[perm_str.upper()] = uni 70 | 71 | return t 72 | 73 | 74 | def _find_max_beta_token_len(): 75 | """ 76 | Finds the maximum length of a single betacode token. 77 | 78 | Returns: 79 | The length of the longest key in the betacode map, which corresponds to the 80 | longest single betacode token. 
81 | """ 82 | max_beta_len = -1 83 | for beta, uni in _map.BETACODE_MAP.items(): 84 | if len(beta) > max_beta_len: 85 | max_beta_len = len(beta) 86 | 87 | return max_beta_len 88 | 89 | _MAX_BETA_TOKEN_LEN = _find_max_beta_token_len() 90 | 91 | def _penultimate_sigma_word_final(text): 92 | return len(text) > 1 and text[-2] == _MEDIAL_LC_SIGMA and \ 93 | not text[-1].isalnum() and text[-1] != _BETA_APOSTROPHE 94 | 95 | 96 | _BETA_CONVERSION_TRIES = {} 97 | def beta_to_uni(text, strict=False): 98 | """ 99 | Converts the given text from betacode to unicode. 100 | 101 | Args: 102 | text: The beta code text to convert. All of this text must be betacode. 103 | strict: Flag to allow for flexible diacritic order on input. 104 | 105 | Returns: 106 | The converted text. 107 | """ 108 | # Check if the requested configuration for conversion already has a trie 109 | # stored otherwise convert it. 110 | param_key = (strict,) 111 | try: 112 | t = _BETA_CONVERSION_TRIES[param_key] 113 | except KeyError: 114 | t = _create_conversion_trie(*param_key) 115 | _BETA_CONVERSION_TRIES[param_key] = t 116 | 117 | transform = [] 118 | idx = 0 119 | possible_word_boundary = False 120 | 121 | while idx < len(text): 122 | if possible_word_boundary and _penultimate_sigma_word_final(transform): 123 | transform[-2] = _FINAL_LC_SIGMA 124 | 125 | step = t.longest_prefix(text[idx:idx + _MAX_BETA_TOKEN_LEN]) 126 | 127 | if step: 128 | possible_word_boundary = text[idx] in _BETA_PUNCTUATION 129 | 130 | key, value = step 131 | transform.append(value) 132 | idx += len(key) 133 | else: 134 | possible_word_boundary = True 135 | 136 | transform.append(text[idx]) 137 | idx += 1 138 | 139 | # Check one last time in case there is some whitespace or punctuation at the 140 | # end and check if the last character is a sigma. 141 | if possible_word_boundary and _penultimate_sigma_word_final(transform): 142 | transform[-2] = _FINAL_LC_SIGMA 143 | elif len(transform) > 0 and transform[-1] == _MEDIAL_LC_SIGMA: 144 | transform[-1] = _FINAL_LC_SIGMA 145 | 146 | converted = ''.join(transform) 147 | return converted 148 | 149 | def uni_to_beta(text): 150 | """ 151 | Convert unicode text to a betacode equivalent. 152 | 153 | This method can handle tónos or oxeîa characters in the input. 154 | 155 | Args: 156 | text: The text to convert to betacode. This text does not have to all be 157 | Greek polytonic text, and only Greek characters will be converted. Note 158 | that in this case, you cannot convert to beta and then back to unicode. 159 | 160 | Returns: 161 | The betacode equivalent of the inputted text where applicable. 
162 | """ 163 | u = _UNICODE_MAP 164 | 165 | transform = [] 166 | 167 | for ch in text: 168 | try: 169 | conv = u[ch] 170 | except KeyError: 171 | conv = ch 172 | 173 | transform.append(conv) 174 | 175 | converted = ''.join(transform) 176 | return converted 177 | -------------------------------------------------------------------------------- /betacode/_map.py: -------------------------------------------------------------------------------- 1 | BETACODE_MAP = { 2 | # No marks 3 | 'a': '\u03b1', 4 | 'b': '\u03b2', 5 | 'g': '\u03b3', 6 | 'd': '\u03b4', 7 | 'e': '\u03b5', 8 | 'z': '\u03b6', 9 | 'h': '\u03b7', 10 | 'q': '\u03b8', 11 | 'i': '\u03b9', 12 | 'k': '\u03ba', 13 | 'l': '\u03bb', 14 | 'm': '\u03bc', 15 | 'n': '\u03bd', 16 | 'c': '\u03be', 17 | 'o': '\u03bf', 18 | 'p': '\u03c0', 19 | 'r': '\u03c1', 20 | 's': '\u03c3', 21 | 's1': '\u03c3', 22 | 's2': '\u03c2', 23 | 's3': '\u03f2', 24 | 't': '\u03c4', 25 | 'u': '\u03c5', 26 | 'f': '\u03c6', 27 | 'x': '\u03c7', 28 | 'y': '\u03c8', 29 | 'w': '\u03c9', 30 | '*a': '\u0391', 31 | '*b': '\u0392', 32 | '*g': '\u0393', 33 | '*d': '\u0394', 34 | '*e': '\u0395', 35 | '*z': '\u0396', 36 | '*h': '\u0397', 37 | '*q': '\u0398', 38 | '*i': '\u0399', 39 | '*k': '\u039a', 40 | '*l': '\u039b', 41 | '*m': '\u039c', 42 | '*n': '\u039d', 43 | '*c': '\u039e', 44 | '*o': '\u039f', 45 | '*p': '\u03a0', 46 | '*r': '\u03a1', 47 | '*s': '\u03a3', 48 | '*s3': '\u03f9', 49 | '*t': '\u03a4', 50 | '*u': '\u03a5', 51 | '*f': '\u03a6', 52 | '*x': '\u03a7', 53 | '*y': '\u03a8', 54 | '*w': '\u03a9', 55 | 56 | # Smooth breathing 57 | 'a)': '\u1f00', 58 | 'e)': '\u1f10', 59 | 'h)': '\u1f20', 60 | 'i)': '\u1f30', 61 | 'o)': '\u1f40', 62 | 'u)': '\u1f50', 63 | 'w)': '\u1f60', 64 | 'r)': '\u1fe4', 65 | '*)a': '\u1f08', 66 | '*)e': '\u1f18', 67 | '*)h': '\u1f28', 68 | '*)i': '\u1f38', 69 | '*)o': '\u1f48', 70 | '*)w': '\u1f68', 71 | 72 | # Rough breathing 73 | 'a(': '\u1f01', 74 | 'e(': '\u1f11', 75 | 'h(': '\u1f21', 76 | 'i(': '\u1f31', 77 | 'o(': '\u1f41', 78 | 'u(': '\u1f51', 79 | 'w(': '\u1f61', 80 | 'r(': '\u1fe5', 81 | '*(a': '\u1f09', 82 | '*(e': '\u1f19', 83 | '*(h': '\u1f29', 84 | '*(i': '\u1f39', 85 | '*(o': '\u1f49', 86 | '*(u': '\u1f59', 87 | '*(w': '\u1f69', 88 | '*(r': '\u1fec', 89 | 90 | # Acute accent and grave accent 91 | 'a\\': '\u1f70', 92 | 'a/': '\u1f71', 93 | 'e\\': '\u1f72', 94 | 'e/': '\u1f73', 95 | 'h\\': '\u1f74', 96 | 'h/': '\u1f75', 97 | 'i\\': '\u1f76', 98 | 'i/': '\u1f77', 99 | 'o\\': '\u1f78', 100 | 'o/': '\u1f79', 101 | 'u\\': '\u1f7a', 102 | 'u/': '\u1f7b', 103 | 'w\\': '\u1f7c', 104 | 'w/': '\u1f7d', 105 | '*\\a': '\u1fba', 106 | '*/a': '\u1fbb', 107 | '*\\e': '\u1fce', 108 | '*/e': '\u1fc9', 109 | '*\\h': '\u1fca', 110 | '*/h': '\u1fcb', 111 | '*\\i': '\u1fda', 112 | '*/i': '\u1fdb', 113 | '*\\o': '\u1ff8', 114 | '*/o': '\u1ff9', 115 | '*\\u': '\u1fea', 116 | '*/u': '\u1feb', 117 | '*\\w': '\u1ffa', 118 | '*/w': '\u1ffb', 119 | 120 | # Smooth breathing and acute accent 121 | 'a)/': '\u1f04', 122 | 'e)/': '\u1f14', 123 | 'h)/': '\u1f24', 124 | 'i)/': '\u1f34', 125 | 'o)/': '\u1f44', 126 | 'u)/': '\u1f54', 127 | 'w)/': '\u1f64', 128 | '*)/a': '\u1f0c', 129 | '*)/e': '\u1f1c', 130 | '*)/h': '\u1f2c', 131 | '*)/i': '\u1f3c', 132 | '*)/o': '\u1f4c', 133 | '*)/u': '\u1f5c', 134 | '*)/w': '\u1f6c', 135 | 136 | # Smooth breathing and grave accent 137 | 'a)\\': '\u1f02', 138 | 'e)\\': '\u1f12', 139 | 'h)\\': '\u1f22', 140 | 'i)\\': '\u1f32', 141 | 'o)\\': '\u1f42', 142 | 'u)\\': '\u1f52', 143 | 'w)\\': '\u1f62', 144 | '*)\\a': '\u1f0a', 145 | '*)\\e': 
'\u1f1a', 146 | '*)\\h': '\u1f2a', 147 | '*)\\i': '\u1f3a', 148 | '*)\\o': '\u1f4a', 149 | '*)\\u': '\u1f5a', 150 | '*)\\w': '\u1f6a', 151 | 152 | # Rough breathing and acute accent 153 | 'a(/': '\u1f05', 154 | 'e(/': '\u1f15', 155 | 'h(/': '\u1f25', 156 | 'i(/': '\u1f35', 157 | 'o(/': '\u1f45', 158 | 'u(/': '\u1f55', 159 | 'w(/': '\u1f65', 160 | '*(/a': '\u1f0d', 161 | '*(/e': '\u1f1d', 162 | '*(/h': '\u1f2d', 163 | '*(/i': '\u1f3d', 164 | '*(/o': '\u1f4d', 165 | '*(/u': '\u1f5d', 166 | '*(/w': '\u1f6d', 167 | 168 | # Rough breathing and grave accent 169 | 'a(\\': '\u1f03', 170 | 'e(\\': '\u1f13', 171 | 'h(\\': '\u1f23', 172 | 'i(\\': '\u1f33', 173 | 'o(\\': '\u1f43', 174 | 'u(\\': '\u1f53', 175 | 'w(\\': '\u1f63', 176 | '*(\\a': '\u1f0b', 177 | '*(\\e': '\u1f1b', 178 | '*(\\h': '\u1f2b', 179 | '*(\\i': '\u1f3b', 180 | '*(\\o': '\u1f4b', 181 | '*(\\u': '\u1f5b', 182 | '*(\\w': '\u1f6b', 183 | 184 | # Perispomeni 185 | 'a=': '\u1fb6', 186 | 'h=': '\u1fc6', 187 | 'i=': '\u1fd6', 188 | 'u=': '\u1fe6', 189 | 'w=': '\u1ff6', 190 | 191 | # Smooth breathing and perispomeni 192 | 'a)=': '\u1f06', 193 | 'h)=': '\u1f26', 194 | 'i)=': '\u1f36', 195 | 'u)=': '\u1f56', 196 | 'w)=': '\u1f66', 197 | '*)=a': '\u1f0e', 198 | '*)=h': '\u1f2e', 199 | '*)=i': '\u1f3e', 200 | '*)=w': '\u1f6e', 201 | 202 | # Rough breathing and perispomeni 203 | 'a(=': '\u1f07', 204 | 'h(=': '\u1f27', 205 | 'i(=': '\u1f37', 206 | 'u(=': '\u1f57', 207 | 'w(=': '\u1f67', 208 | '*(=a': '\u1f0f', 209 | '*(=h': '\u1f2f', 210 | '*(=i': '\u1f3f', 211 | '*(=u': '\u1f5f', 212 | '*(=w': '\u1f6f', 213 | 214 | # Perispomeni and ypogegrammeni 215 | 'a=|': '\u1fb7', 216 | 'h=|': '\u1fc7', 217 | 'w=|': '\u1ff7', 218 | 219 | # Ypogegrammeni 220 | 'a|': '\u1fb3', 221 | 'h|': '\u1fc3', 222 | 'w|': '\u1ff3', 223 | '*a|': '\u1fbc', 224 | '*h|': '\u1fcc', 225 | '*w|': '\u1ffc', 226 | 227 | # Acute accent and ypogegrammeni 228 | 'a/|': '\u1fb4', 229 | 'h/|': '\u1fc4', 230 | 'w/|': '\u1ff4', 231 | 232 | # Smooth breathing and ypogegrammeni 233 | 'a)|': '\u1f80', 234 | 'h)|': '\u1f90', 235 | 'w)|': '\u1fa0', 236 | '*)a|': '\u1f88', 237 | '*)h|': '\u1f98', 238 | '*)w|': '\u1fa8', 239 | 240 | # Rough breathing and ypogegrammeni 241 | 'a(|': '\u1f81', 242 | 'h(|': '\u1f91', 243 | 'w(|': '\u1fa1', 244 | '*(a|': '\u1f89', 245 | '*(h|': '\u1f99', 246 | '*(w|': '\u1fa9', 247 | 248 | # Smooth breathing, acute accent, and ypogegrammeni 249 | 'a)\|': '\u1f82', 250 | 'h)\|': '\u1f92', 251 | 'w)\|': '\u1fa2', 252 | '*)\\a|': '\u1f8a', 253 | '*)\h|': '\u1f9a', 254 | '*)\w|': '\u1faa', 255 | 256 | # Rough breathing, grave accent, and ypogegrammeni 257 | 'a(\|': '\u1f83', 258 | 'h)\|': '\u1f93', 259 | 'w)\|': '\u1fa3', 260 | '*(\\a|': '\u1f8b', 261 | '*)\h|': '\u1f9b', 262 | '*)\w|': '\u1fab', 263 | 264 | # Smooth breathing, accute accent, and ypogegrammeni 265 | 'a)/|': '\u1f84', 266 | 'h)/|': '\u1f94', 267 | 'w)/|': '\u1fa4', 268 | '*)/a|': '\u1f8c', 269 | '*)/h|': '\u1f9c', 270 | '*)/w|': '\u1fac', 271 | 272 | # Rough breating, acute accent, and ypogegrammeni 273 | 'a(/|': '\u1f85', 274 | 'h(/|': '\u1f95', 275 | 'w(/|': '\u1fa5', 276 | '*(/a|': '\u1f8d', 277 | '*(/h|': '\u1f9d', 278 | '*(/w|': '\u1fad', 279 | 280 | # Smooth breathing, ypogegrammeni, and perispomeni 281 | 'a)=|': '\u1f86', 282 | 'h)=|': '\u1f96', 283 | 'w)=|': '\u1fa6', 284 | '*)=a|': '\u1f8e', 285 | '*)=h|': '\u1f9e', 286 | '*)=w|': '\u1fae', 287 | 288 | # Rough breathing, ypogegrammeni, and perispomeni 289 | 'a(=|': '\u1f87', 290 | 'h(=|': '\u1f97', 291 | 'w(=|': '\u1fa7', 292 | '*(=a|': 
'\u1f8f', 293 | '*(=h|': '\u1f9f', 294 | '*(=w|': '\u1faf', 295 | 296 | # Diaeresis 297 | 'i+': '\u03ca', 298 | '*+i': '\u03aa', 299 | 'i\\+': '\u1fd2', 300 | 'i/+': '\u1fd3', 301 | 'i+/': '\u1fd3', 302 | 'i=+': '\u1fd7', 303 | 'u+': '\u03cb', 304 | '*+u': '\u03ab', 305 | 'u\\+': '\u1fe2', 306 | 'u/+': '\u1fe3', 307 | 'u=+': '\u1fe7', 308 | 309 | # Macron 310 | 'a&': '\u1fb0', 311 | 'i&': '\u1fd0', 312 | 'u&': '\u1fe0', 313 | 314 | # Breve 315 | 'a\'': '\u1fb1', 316 | 'i\'': '\u1fd1', 317 | 'u\'': '\u1fe1', 318 | 319 | # Basic punctuation 320 | ':': '\u00b7', 321 | '\'': '\u2019', 322 | '-': '\u2010', 323 | '_': '\u2014' 324 | } 325 | --------------------------------------------------------------------------------