├── tests ├── __init__.py ├── test_uni_to_beta.py └── test_beta_to_uni.py ├── betacode ├── __init__.py ├── conv.py └── _map.py ├── .coveragerc ├── MANIFEST.in ├── .gitignore ├── .travis.yml ├── requirements.txt ├── Makefile ├── LICENSE ├── CHANGELOG.md ├── setup.py ├── README ├── README.md └── README.rst /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /betacode/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['conv'] 2 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | omit = 3 | */python?.?/* 4 | */site-packages/* 5 | *__init__* 6 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGELOG.md 2 | include LICENSE 3 | include README.md 4 | include README.rst 5 | include README 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .pytest_cache/ 3 | dist/ 4 | build/ 5 | venv/ 6 | 7 | MANIFEST 8 | .*.swp 9 | *.pyc 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.4" 4 | - "3.5" 5 | - "3.6" 6 | install: 7 | - pip install -r requirements.txt 8 | - pip install coveralls 9 | script: 10 | - coverage run --source betacode -m pytest 11 | after_success: 12 | - coveralls 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attrs==17.4.0 2 | certifi==2018.4.16 3 | chardet==3.0.4 4 | idna==2.6 5 | more-itertools==4.1.0 6 | pkginfo==1.4.2 7 | pluggy==0.6.0 8 | py==1.5.3 9 | pygtrie==2.2 10 | pytest==3.5.0 11 | requests==2.18.4 12 | requests-toolbelt==0.8.0 13 | six==1.11.0 14 | tqdm==4.23.3 15 | twine==1.11.0 16 | urllib3==1.22 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | python setup.py build 3 | 4 | sdist: 5 | python setup.py sdist 6 | 7 | publish: 8 | python setup.py sdist 9 | twine upload dist/* 10 | 11 | publishtest: 12 | python setup.py sdist 13 | twine upload --repository-url https://test.pypi.org/legacy/ dist/* 14 | 15 | test: 16 | pytest 17 | 18 | docs: 19 | pandoc --from=markdown --to=rst --output=README.rst README.md 20 | pandoc --from=markdown --to=plain --output=README README.md 21 | # Remove the first 3 lines of the README file which are badge related. 
22 | sed -i 1,3d README 23 | 24 | clean: 25 | if [ -d 'dist' ]; then \ 26 | rm -r dist; \ 27 | fi 28 | 29 | if [ -d 'build' ]; then \ 30 | rm -r build; \ 31 | fi 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018 Matias Grioni 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All changes between versions will be kept track of in this file. 4 | 5 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 6 | 7 | Although there have been previous versions of this software, I do not think there have been previous users so this will be the first version that is kept track of. 8 | 9 | ## 1.0 - 2020-03-08 10 | ### Fixed 11 | - Windows installation did not work since default encoding on Windows is CP-1252 and README read in during setup.py is encoded in UTF-8. 12 | 13 | ## 0.2 - 2018-05-25 14 | ### Added 15 | - Use strict or non-strict mode when coverting from betacode to unicode 16 | - Fix bug with word final sigma used when word final apostrophe after 17 | 18 | ## 0.1.6 - 2018-05-24 19 | ### Removed 20 | - Unnecessary test file 21 | 22 | ## 0.1.5 - 2018-05-24 23 | ### Added 24 | - Convert from betacode to unicode and back 25 | - Case insensitive 26 | - Diacritic order insensitive 27 | - Use oxeîa rather than tónos 28 | - This changelog 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | import os 3 | 4 | def read(fn): 5 | """ 6 | Read in the given file. 7 | 8 | Args: 9 | fn: The filename to read in, relative to the current folder. 10 | 11 | Returns: 12 | The text contents of the file. 
13 | """ 14 | with open(os.path.join(os.path.dirname(__file__), fn), encoding='utf-8') as f: 15 | return f.read() 16 | 17 | setup( 18 | name = 'betacode', 19 | packages = ['betacode'], 20 | version = '1.0', 21 | description = 'Betacode to Unicode converter.', 22 | long_description = read('README.rst'), 23 | author = 'Matias Grioni', 24 | author_email = 'matgrioni@gmail.com', 25 | url = 'https://github.com/matgrioni/betacode', 26 | license = 'MIT', 27 | keywords = ['encoding', 'unicode', 'betacode', 'greek'], 28 | classifiers = [ 29 | 'Programming Language :: Python', 30 | 'Programming Language :: Python :: 3', 31 | 'Operating System :: OS Independent', 32 | 'License :: OSI Approved :: MIT License', 33 | 'Development Status :: 4 - Beta', 34 | 'Intended Audience :: Science/Research', 35 | 'Topic :: Text Processing :: Linguistic', 36 | 'Natural Language :: Greek', 37 | ], 38 | ) 39 | -------------------------------------------------------------------------------- /tests/test_uni_to_beta.py: -------------------------------------------------------------------------------- 1 | import unicodedata 2 | 3 | import betacode.conv 4 | 5 | def _test_uni_beta_equality(uni, beta): 6 | """ 7 | Test that the result of converting uni is beta. 8 | 9 | Comparison is done via the NFC normalization for unicode. 10 | 11 | Args: 12 | uni: The unicode to convert. 13 | beta: The expected beta code result from conversion. 14 | """ 15 | conv = betacode.conv.uni_to_beta(uni) 16 | conv_normalized = unicodedata.normalize('NFC', conv) 17 | beta_normalized = unicodedata.normalize('NFC', beta) 18 | 19 | assert conv_normalized == beta_normalized 20 | 21 | def test_empty(): 22 | uni = '' 23 | beta = '' 24 | 25 | _test_uni_beta_equality(uni, beta) 26 | 27 | def test_simple_conv(): 28 | uni = 'αβ' 29 | beta = 'ab' 30 | 31 | _test_uni_beta_equality(uni, beta) 32 | 33 | def test_multi_word(): 34 | uni = 'βίον τέχνης καὶ εὐδαιμονίας.' 35 | beta = 'bi/on te/xnhs kai\ eu)daimoni/as.' 36 | 37 | _test_uni_beta_equality(uni, beta) 38 | 39 | def test_many_accents(): 40 | uni = 'Ἔφορος καὶ ἄλλοι' 41 | beta = '*)/eforos kai\ a)/lloi' 42 | 43 | _test_uni_beta_equality(uni, beta) 44 | 45 | def test_colon_punc(): 46 | uni = 'πλείους: ἔτι δὲ οἱ μετὰ' 47 | beta = 'plei/ous: e)/ti de\ oi( meta\\' 48 | 49 | _test_uni_beta_equality(uni, beta) 50 | 51 | def test_mixed_conversion(): 52 | uni = 'Many python packages cannot convert this: ἔτι δὲ οἱ' 53 | beta = 'Many python packages cannot convert this: e)/ti de\ oi(' 54 | 55 | _test_uni_beta_equality(uni, beta) 56 | -------------------------------------------------------------------------------- /tests/test_beta_to_uni.py: -------------------------------------------------------------------------------- 1 | import unicodedata 2 | 3 | import betacode.conv 4 | 5 | def _test_beta_uni_equality(beta, uni, strict=False): 6 | """ 7 | Test that the result of converting beta is uni. 8 | 9 | Comparison is done via the NFC normalization for unicode. 10 | 11 | Args: 12 | beta: The beta code to convert. 13 | uni: The expected unicode result from conversion. 14 | strict: Flag to set the strictness of betacode parsing. 
15 | """ 16 | conv = betacode.conv.beta_to_uni(beta, strict) 17 | conv_normalized = unicodedata.normalize('NFC', conv) 18 | uni_normalized = unicodedata.normalize('NFC', uni) 19 | 20 | assert conv_normalized == uni_normalized 21 | 22 | def test_empty(): 23 | beta = '' 24 | uni = '' 25 | 26 | _test_beta_uni_equality(beta, uni) 27 | 28 | def test_simple_conv(): 29 | beta = 'tou=' 30 | uni = 'τοῦ' 31 | 32 | _test_beta_uni_equality(beta, uni) 33 | 34 | def test_final_sigma(): 35 | beta = 'th=s' 36 | uni = 'τῆς' 37 | 38 | _test_beta_uni_equality(beta, uni) 39 | 40 | def test_numeric_sigma_id(): 41 | beta = 'th=s2' 42 | uni = 'τῆς' 43 | 44 | _test_beta_uni_equality(beta, uni) 45 | 46 | def test_keep_non_final_sigma_numeric(): 47 | beta = 'th=s3 tou=' 48 | uni = 'τῆϲ τοῦ' 49 | 50 | _test_beta_uni_equality(beta, uni) 51 | 52 | def test_final_sigma_word(): 53 | beta = 'th=s tou=' 54 | uni = 'τῆς τοῦ' 55 | 56 | _test_beta_uni_equality(beta, uni) 57 | 58 | def test_final_sigma_whitespace(): 59 | beta = 'th=s\ttou=' 60 | uni = 'τῆς\tτοῦ' 61 | 62 | _test_beta_uni_equality(beta, uni) 63 | 64 | def test_final_sigma_punctuation(): 65 | beta = 'th=s; tou=' 66 | uni = 'τῆς; τοῦ' 67 | 68 | _test_beta_uni_equality(beta, uni) 69 | 70 | def test_final_sigma_apostrophe(): 71 | beta = 'th=s\' tou=' 72 | uni = 'τῆσ’ τοῦ' 73 | 74 | _test_beta_uni_equality(beta, uni) 75 | 76 | def test_multi_word(): 77 | beta = 'analabo/ntes de\ kaq\' e(/kaston' 78 | uni = 'αναλαβόντες δὲ καθ’ ἕκαστον' 79 | 80 | _test_beta_uni_equality(beta, uni) 81 | 82 | def test_punctuation_semicolon(): 83 | beta = 'e)/oiken h)\ dida/skonti; nh\\' 84 | uni = 'ἔοικεν ἢ διδάσκοντι; νὴ' 85 | 86 | _test_beta_uni_equality(beta, uni) 87 | 88 | def test_punctuation_colon(): 89 | beta = 'dh=lon: oi(/ te' 90 | uni = 'δῆλον· οἵ τε' 91 | 92 | _test_beta_uni_equality(beta, uni) 93 | 94 | def test_out_of_order(): 95 | beta = 'e/)oiken h\) dida/skonti; nh\\ a=|)i+\\' 96 | uni = 'ἔοικεν ἢ διδάσκοντι; νὴ ᾆῒ' 97 | 98 | def test_cap_out_of_order(): 99 | beta = '*)/eforos ka*)/ei\ a/)lloi' 100 | uni = 'Ἔφορος καἜὶ ἄλλοι' 101 | 102 | _test_beta_uni_equality(beta, uni) 103 | 104 | def test_cap_out_of_order_with_iota(): 105 | beta = '*)/eforos ka*)/ei\ a/)lloi *)h\|' 106 | uni = 'Ἔφορος καἜὶ ἄλλοι ᾛ' 107 | 108 | _test_beta_uni_equality(beta, uni) 109 | 110 | def test_strict_correct(): 111 | beta = 'e)n d\' e)\pes\' w)keanw=|' 112 | uni = 'ἐν δ’ ἒπεσ’ ὠκεανῷ' 113 | 114 | _test_beta_uni_equality(beta, uni, strict=True) 115 | 116 | def test_strict_incorrect(): 117 | beta = 'e)n d\' e)\pes\' w)keanw|=' 118 | uni = 'ἐν δ’ ἒπεσ’ ὠκεανῳ=' 119 | 120 | _test_beta_uni_equality(beta, uni, strict=True) 121 | 122 | def test_unstrict(): 123 | beta = 'e)n d\' e)\pes\' w)keanw|=' 124 | uni = 'ἐν δ’ ἒπεσ’ ὠκεανῷ' 125 | 126 | _test_beta_uni_equality(beta, uni, strict=False) 127 | 128 | def test_unstrict_capitalization(): 129 | beta = '*)e/foros ka*e)/i\ a/)lloi *)\h|' 130 | uni = 'Ἔφορος καἜὶ ἄλλοι ᾛ' 131 | 132 | _test_beta_uni_equality(beta, uni, strict=False) 133 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | betacode 2 | 3 | Convert betacode to unicode and vice-versa easily. Tested on python 3.4, 4 | 3.5, and 3.6. The definition used is based off what is found at the TLG 5 | Beta Code Manual. Only the Greek sections were paid attention to. 
6 | 7 | 8 | Motivation 9 | 10 | I was working on a classics research project and had to use the Perseus 11 | catalog to extract some Greek work. Much to my surprise, however, the 12 | only download I could find was a betacode version, an encoding that is 13 | over 30 years old, rather than modern, fancy, clean unicode. There was 14 | no nice pip package that I could easily go to for this simple task, so I 15 | decided to roll my own. 16 | 17 | Install 18 | 19 | Installation is easy. Use pip or your preferred method to download from 20 | PyPI. 21 | 22 | pip install betacode 23 | 24 | Usage 25 | 26 | Note that in all examples, strings are unicode encoded. Input can be in 27 | upper or lower case. The official definition from TLG uses only 28 | uppercase, but many resources, such as the Perseus catalog, are encoded 29 | in lowercase, so this package accepts both. This package can also 30 | disregard the unnecessary canonical order of Greek diacritics from the 31 | official definition. The only thing that matters in order for the 32 | betacode to be unambiguous is that each unit must begin with either a * 33 | or a letter. As long as these constraints are followed, breathing marks, 34 | accents, and such can go in any order. However, the canonical order 35 | will be returned when going from unicode to betacode. Also note that 36 | currently, only individual, non-combining characters are handled. This 37 | means that you cannot do all combinations of letters and diacritics, 38 | only those defined as composite characters in the Greek and Greek 39 | Extended sections of unicode. 40 | 41 | Betacode to unicode 42 | 43 | import betacode.conv 44 | 45 | beta = 'analabo/ntes de\ kaq\' e(/kaston' 46 | betacode.conv.beta_to_uni(beta) # αναλαβόντες δὲ καθ᾽ ἕκαστον 47 | 48 | Note that polytonic accent marks will be used, and not monotonic accent 49 | marks. Both are de jure equivalent in Greece, but betacode was initially 50 | developed to encode classical works, so the polytonic diacritics are more 51 | fitting. In other words, the oxeîa will be used rather than tónos. The 52 | oxeîa form can be converted to the modern accent form easily, either 53 | through search and replace or unicode normalization, since oxeîa has a 54 | canonical decomposition into tónos. 55 | 56 | Conversion can also be made more strict by using the strict flag. 57 | 58 | beta_to_uni(text, strict=False) 59 | 60 | If set, only the canonical order of diacritics is accepted in betacode. 61 | If it is not set, then any order is allowed as long as capital letters 62 | begin with a * and lowercase letters begin with the letter and not a 63 | diacritic. 64 | 65 | Unicode to betacode 66 | 67 | import betacode.conv 68 | 69 | uni = 'αναλαβόντες δὲ καθ᾽ ἕκαστον' 70 | betacode.conv.uni_to_beta(uni) # analabo/ntes de\ kaq\' e(/kaston 71 | 72 | The unicode text can use either polytonic (oxeîa) or monotonic 73 | (tónos) accent marks. 74 | 75 | Speed 76 | 77 | The original implementation used a custom-made trie. This may not have been 78 | the fastest approach (I wasn't sure). So, I compared against a third-party trie 79 | implementation, pygtrie. pygtrie has nicer prefix methods which 80 | allowed for much faster processing of large texts. This changed 81 | converting all of Strabo or Herodotus in the Perseus catalog from a 82 | many-minute operation to a ~3-4 second operation. I have seen implementations 83 | that use regular expressions which I suspect might be faster since the 84 | underlying implementation is in C.
However, this package is much smaller 85 | and simpler than CLTK, for example, if betacode conversion is all that is 86 | needed. 87 | 88 | Modified Betacode 89 | 90 | There is talk of a modified betacode that I have seen around on the 91 | internet. I have never been able to find a definitive definition of this 92 | so I have not implemented it. Among the differences are word-final sigma 93 | usage, _ as macron, and uppercase and lowercase Roman letters instead of 94 | using *. 95 | 96 | 97 | Development 98 | 99 | I am no classicist, and this was done in my free time. It is very 100 | possible that there are some letters missing that are not accounted for, 101 | or some punctuation that is not properly handled. If that is the case, 102 | please tell me as it is easy to fix, or open a PR from your own 103 | branch. Write tests if you do add a feature. 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/matgrioni/betacode.svg?branch=master)](https://travis-ci.org/matgrioni/betacode) 2 | [![Coverage Status](https://coveralls.io/repos/github/matgrioni/betacode/badge.svg?branch=master)](https://coveralls.io/github/matgrioni/betacode?branch=master) 3 | 4 | ## betacode 5 | 6 | Convert betacode to unicode and vice-versa easily. Tested on python 3.4, 3.5, and 3.6. The definition used is based off what is found at the [TLG Beta Code Manual](http://www.tlg.uci.edu/encoding/BCM.pdf). Only the Greek sections were paid attention to. 7 | 8 | ## Motivation 9 | 10 | I was working on a classics research project and had to use the Perseus catalog to extract some Greek work. Much to my surprise, however, the only download I could find was a betacode version, an encoding that is over 30 years old, rather than modern, fancy, clean unicode. There was no nice pip package that I could easily go to for this simple task, so I decided to roll my own. 11 | 12 | ### Install 13 | 14 | Installation is easy. Use `pip` or your preferred method to download from PyPI. 15 | 16 | ``` 17 | pip install betacode 18 | ``` 19 | 20 | ### Usage 21 | 22 | Note that in all examples, strings are unicode encoded. Input can be in upper or lower case. The official definition from TLG uses only uppercase, but many resources, such as the Perseus catalog, are encoded in lowercase, so this package accepts both. This package can also disregard the unnecessary canonical order of Greek diacritics from the official definition. The only thing that matters in order for the betacode to be unambiguous is that each unit must begin with either a `*` or a letter. As long as these constraints are followed, breathing marks, accents, and such can go in any order. However, the canonical order will be returned when going from unicode to betacode. Also note that currently, only individual, non-combining characters are handled. This means that you cannot do all combinations of letters and diacritics, only those defined as composite characters in the Greek and Greek Extended sections of unicode. 23 | 24 | #### Betacode to unicode 25 | 26 | ``` 27 | import betacode.conv 28 | 29 | beta = 'analabo/ntes de\ kaq\' e(/kaston' 30 | betacode.conv.beta_to_uni(beta) # αναλαβόντες δὲ καθ᾽ ἕκαστον 31 | ``` 32 | 33 | Note that polytonic accent marks will be used, and not monotonic accent marks. Both are de jure equivalent in Greece, but betacode was initially developed to encode classical works, so the polytonic diacritics are more fitting. In other words, the oxeîa will be used rather than tónos. The oxeîa form can be converted to the modern accent form easily, either through search and replace or unicode normalization, since oxeîa has a canonical decomposition into tónos.
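For instance, here is a minimal sketch of the normalization route, using only the Python standard library (this snippet is illustrative and not part of the package):

```
import unicodedata

oxeia = '\u1f71'                             # GREEK SMALL LETTER ALPHA WITH OXIA (Greek Extended block)
tonos = unicodedata.normalize('NFC', oxeia)  # NFC recomposes the oxeîa form to its tónos counterpart
assert tonos == '\u03ac'                     # GREEK SMALL LETTER ALPHA WITH TONOS (Greek and Coptic block)
```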
34 | 35 | Conversion can also be made more strict by using the `strict` flag. 36 | 37 | ``` 38 | beta_to_uni(text, strict=False) 39 | ``` 40 | 41 | If set, only the canonical order of diacritics is accepted in betacode. If it is not set, then any order is allowed as long as capital letters begin with a `*` and lowercase letters begin with the letter and not a diacritic.
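As a concrete illustration of the difference, mirroring cases exercised in `tests/test_beta_to_uni.py` (the word and both expected outputs are taken from those tests), a token whose diacritics are written out of canonical order is converted fully in the default mode but left partially unconverted in strict mode:

```
import betacode.conv

beta = 'w)keanw|='                            # perispomeni (=) written after the iota subscript (|)
betacode.conv.beta_to_uni(beta)               # 'ὠκεανῷ'  (any diacritic order is accepted)
betacode.conv.beta_to_uni(beta, strict=True)  # 'ὠκεανῳ=' (the out-of-order '=' is left unconverted)
```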
42 | 43 | #### Unicode to betacode 44 | ``` 45 | import betacode.conv 46 | 47 | uni = 'αναλαβόντες δὲ καθ᾽ ἕκαστον' 48 | betacode.conv.uni_to_beta(uni) # analabo/ntes de\ kaq\' e(/kaston 49 | ``` 50 | 51 | The unicode text can use either polytonic (oxeîa) or monotonic (tónos) accent marks. 52 | 53 | ### Speed 54 | 55 | The original implementation used a custom-made trie. This may not have been the fastest approach (I wasn't sure). So, I compared against a third-party trie implementation, pygtrie. pygtrie has nicer prefix methods which allowed for much faster processing of large texts. This changed converting all of Strabo or Herodotus in the Perseus catalog from a many-minute operation to a ~3-4 second operation. I have seen implementations that use regular expressions which I suspect might be faster since the underlying implementation is in C. However, this package is much smaller and simpler than CLTK, for example, if betacode conversion is all that is needed. 56 | 57 | ### Modified Betacode 58 | 59 | There is talk of a modified betacode that I have seen around on the internet. I have never been able to find a definitive definition of this so I have not implemented it. Among the differences are word-final sigma usage, `_` as macron, and uppercase and lowercase Roman letters instead of using `*`. 60 | 61 | 62 | ## Development 63 | 64 | I am no classicist, and this was done in my free time. It is very possible that there are some letters missing that are not accounted for, or some punctuation that is not properly handled. If that is the case, please tell me as it is easy to fix, or open a PR from your own branch. Write tests if you do add a feature. 65 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |Build Status| |Coverage Status| 2 | 3 | betacode 4 | -------- 5 | 6 | Convert betacode to unicode and vice-versa easily. Tested on python 3.4, 7 | 3.5, and 3.6. The definition used is based off what is found at the `TLG 8 | Beta Code Manual <http://www.tlg.uci.edu/encoding/BCM.pdf>`__. Only the 9 | Greek sections were paid attention to. 10 | 11 | Motivation 12 | ---------- 13 | 14 | I was working on a classics research project and had to use the Perseus 15 | catalog to extract some Greek work. Much to my surprise, however, the 16 | only download I could find was a betacode version, an encoding that is 17 | over 30 years old, rather than modern, fancy, clean unicode. There was 18 | no nice pip package that I could easily go to for this simple task, so I 19 | decided to roll my own. 20 | 21 | Install 22 | ~~~~~~~ 23 | 24 | Installation is easy. Use ``pip`` or your preferred method to download 25 | from PyPI. 26 | 27 | :: 28 | 29 | pip install betacode 30 | 31 | Usage 32 | ~~~~~ 33 | 34 | Note that in all examples, strings are unicode encoded. Input can be in 35 | upper or lower case. The official definition from TLG uses only 36 | uppercase, but many resources, such as the Perseus catalog, are encoded 37 | in lowercase, so this package accepts both. This package can also 38 | disregard the unnecessary canonical order of Greek diacritics from the 39 | official definition. The only thing that matters in order for the 40 | betacode to be unambiguous is that each unit must begin with either a 41 | ``*`` or a letter. As long as these constraints are followed, breathing 42 | marks, accents, and such can go in any order. However, the canonical 43 | order will be returned when going from unicode to betacode. Also note 44 | that currently, only individual, non-combining characters are handled. 45 | This means that you cannot do all combinations of letters and 46 | diacritics, only those defined as composite characters in the Greek and 47 | Greek Extended sections of unicode. 48 | 49 | Betacode to unicode 50 | ^^^^^^^^^^^^^^^^^^^ 51 | 52 | :: 53 | 54 | import betacode.conv 55 | 56 | beta = 'analabo/ntes de\ kaq\' e(/kaston' 57 | betacode.conv.beta_to_uni(beta) # αναλαβόντες δὲ καθ᾽ ἕκαστον 58 | 59 | Note that polytonic accent marks will be used, and not monotonic accent 60 | marks. Both are de jure equivalent in Greece, but betacode was initially 61 | developed to encode classical works, so the polytonic diacritics are more 62 | fitting. In other words, the oxeîa will be used rather than tónos. The 63 | oxeîa form can be converted to the modern accent form easily, either 64 | through search and replace or unicode normalization, since oxeîa has a 65 | canonical decomposition into tónos. 66 | 67 | Conversion can also be made more strict by using the ``strict`` flag. 68 | 69 | :: 70 | 71 | beta_to_uni(text, strict=False) 72 | 73 | If set, only the canonical order of diacritics is accepted in betacode. 74 | If it is not set, then any order is allowed as long as capital letters 75 | begin with a ``*`` and lowercase letters begin with the letter and not a 76 | diacritic. 77 | 78 | Unicode to betacode 79 | ^^^^^^^^^^^^^^^^^^^ 80 | 81 | :: 82 | 83 | import betacode.conv 84 | 85 | uni = 'αναλαβόντες δὲ καθ᾽ ἕκαστον' 86 | betacode.conv.uni_to_beta(uni) # analabo/ntes de\ kaq\' e(/kaston 87 | 88 | The unicode text can use either polytonic (oxeîa) or monotonic 89 | (tónos) accent marks. 90 | 91 | Speed 92 | ~~~~~ 93 | 94 | The original implementation used a custom-made trie. This may not have 95 | been the fastest approach (I wasn't sure). So, I compared against a 96 | third-party trie implementation, pygtrie. pygtrie has nicer prefix 97 | methods which allowed for much faster processing of large texts. This 98 | changed converting all of Strabo or Herodotus in the Perseus catalog 99 | from a many-minute operation to a ~3-4 second operation. I have seen 100 | implementations that use regular expressions which I suspect might be 101 | faster since the underlying implementation is in C. However, this 102 | package is much smaller and simpler than CLTK, for example, if betacode 103 | conversion is all that is needed. 104 | 105 | Modified Betacode 106 | ~~~~~~~~~~~~~~~~~ 107 | 108 | There is talk of a modified betacode that I have seen around on the 109 | internet. I have never been able to find a definitive definition of this 110 | so I have not implemented it.
Among some differences is word final sigma 111 | usage, ``_`` as macron, and uppercase and lowercase roman letters 112 | instead of using ``*``. 113 | 114 | Development 115 | ----------- 116 | 117 | I am no classicist, and this was done in my free time. It is very 118 | possible that there are some letters missing that are not accounted for, 119 | or some punctuation that is not properly handled. If that is the case, 120 | please tell me as it is easy to fix, or please open a PR for your own 121 | branch. Write tests if you do add a feature. 122 | 123 | .. |Build Status| image:: https://travis-ci.org/matgrioni/betacode.svg?branch=master 124 | :target: https://travis-ci.org/matgrioni/betacode 125 | .. |Coverage Status| image:: https://coveralls.io/repos/github/matgrioni/betacode/badge.svg?branch=master 126 | :target: https://coveralls.io/github/matgrioni/betacode?branch=master 127 | -------------------------------------------------------------------------------- /betacode/conv.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import unicodedata 3 | 4 | import pygtrie 5 | 6 | from . import _map 7 | 8 | # Special characters that need their own references to rewrite with 9 | _FINAL_LC_SIGMA = '\u03c2' 10 | _MEDIAL_LC_SIGMA = '\u03c3' 11 | 12 | # Punctuation marks in the betacode map 13 | _BETA_PUNCTUATION = frozenset('\':-_') 14 | _BETA_APOSTROPHE = '\u2019' 15 | 16 | 17 | def _create_unicode_map(): 18 | """ 19 | Create the inverse map from unicode to betacode. 20 | 21 | Returns: 22 | The hash map to convert unicode characters to the beta code representation. 23 | """ 24 | unicode_map = {} 25 | 26 | for beta, uni in _map.BETACODE_MAP.items(): 27 | # Include decomposed equivalent where necessary. 28 | norm = unicodedata.normalize('NFC', uni) 29 | unicode_map[norm] = beta 30 | unicode_map[uni] = beta 31 | 32 | # Add the final sigmas. 33 | final_sigma_norm = unicodedata.normalize('NFC', _FINAL_LC_SIGMA) 34 | unicode_map[final_sigma_norm] = 's' 35 | unicode_map[_FINAL_LC_SIGMA] = 's' 36 | 37 | return unicode_map 38 | 39 | _UNICODE_MAP = _create_unicode_map() 40 | 41 | 42 | def _create_conversion_trie(strict): 43 | """ 44 | Create the trie for betacode conversion. 45 | 46 | Args: 47 | text: The beta code text to convert. All of this text must be betacode. 48 | strict: Flag to allow for flexible diacritic order on input. 49 | 50 | Returns: 51 | The trie for conversion. 52 | """ 53 | t = pygtrie.CharTrie() 54 | 55 | for beta, uni in _map.BETACODE_MAP.items(): 56 | if strict: 57 | t[beta] = uni 58 | else: 59 | # The order of accents is very strict and weak. Allow for many orders of 60 | # accents between asterisk and letter or after letter. This does not 61 | # introduce ambiguity since each betacode token only has one letter and 62 | # either starts with a asterisk or a letter. 63 | diacritics = beta[1:] 64 | 65 | perms = itertools.permutations(diacritics) 66 | for perm in perms: 67 | perm_str = beta[0] + ''.join(perm) 68 | t[perm_str.lower()] = uni 69 | t[perm_str.upper()] = uni 70 | 71 | return t 72 | 73 | 74 | def _find_max_beta_token_len(): 75 | """ 76 | Finds the maximum length of a single betacode token. 77 | 78 | Returns: 79 | The length of the longest key in the betacode map, which corresponds to the 80 | longest single betacode token. 
81 | """ 82 | max_beta_len = -1 83 | for beta, uni in _map.BETACODE_MAP.items(): 84 | if len(beta) > max_beta_len: 85 | max_beta_len = len(beta) 86 | 87 | return max_beta_len 88 | 89 | _MAX_BETA_TOKEN_LEN = _find_max_beta_token_len() 90 | 91 | def _penultimate_sigma_word_final(text): 92 | return len(text) > 1 and text[-2] == _MEDIAL_LC_SIGMA and \ 93 | not text[-1].isalnum() and text[-1] != _BETA_APOSTROPHE 94 | 95 | 96 | _BETA_CONVERSION_TRIES = {} 97 | def beta_to_uni(text, strict=False): 98 | """ 99 | Converts the given text from betacode to unicode. 100 | 101 | Args: 102 | text: The beta code text to convert. All of this text must be betacode. 103 | strict: Flag to allow for flexible diacritic order on input. 104 | 105 | Returns: 106 | The converted text. 107 | """ 108 | # Check if the requested configuration for conversion already has a trie 109 | # stored otherwise convert it. 110 | param_key = (strict,) 111 | try: 112 | t = _BETA_CONVERSION_TRIES[param_key] 113 | except KeyError: 114 | t = _create_conversion_trie(*param_key) 115 | _BETA_CONVERSION_TRIES[param_key] = t 116 | 117 | transform = [] 118 | idx = 0 119 | possible_word_boundary = False 120 | 121 | while idx < len(text): 122 | if possible_word_boundary and _penultimate_sigma_word_final(transform): 123 | transform[-2] = _FINAL_LC_SIGMA 124 | 125 | step = t.longest_prefix(text[idx:idx + _MAX_BETA_TOKEN_LEN]) 126 | 127 | if step: 128 | possible_word_boundary = text[idx] in _BETA_PUNCTUATION 129 | 130 | key, value = step 131 | transform.append(value) 132 | idx += len(key) 133 | else: 134 | possible_word_boundary = True 135 | 136 | transform.append(text[idx]) 137 | idx += 1 138 | 139 | # Check one last time in case there is some whitespace or punctuation at the 140 | # end and check if the last character is a sigma. 141 | if possible_word_boundary and _penultimate_sigma_word_final(transform): 142 | transform[-2] = _FINAL_LC_SIGMA 143 | elif len(transform) > 0 and transform[-1] == _MEDIAL_LC_SIGMA: 144 | transform[-1] = _FINAL_LC_SIGMA 145 | 146 | converted = ''.join(transform) 147 | return converted 148 | 149 | def uni_to_beta(text): 150 | """ 151 | Convert unicode text to a betacode equivalent. 152 | 153 | This method can handle tónos or oxeîa characters in the input. 154 | 155 | Args: 156 | text: The text to convert to betacode. This text does not have to all be 157 | Greek polytonic text, and only Greek characters will be converted. Note 158 | that in this case, you cannot convert to beta and then back to unicode. 159 | 160 | Returns: 161 | The betacode equivalent of the inputted text where applicable. 
162 | """ 163 | u = _UNICODE_MAP 164 | 165 | transform = [] 166 | 167 | for ch in text: 168 | try: 169 | conv = u[ch] 170 | except KeyError: 171 | conv = ch 172 | 173 | transform.append(conv) 174 | 175 | converted = ''.join(transform) 176 | return converted 177 | -------------------------------------------------------------------------------- /betacode/_map.py: -------------------------------------------------------------------------------- 1 | BETACODE_MAP = { 2 | # No marks 3 | 'a': '\u03b1', 4 | 'b': '\u03b2', 5 | 'g': '\u03b3', 6 | 'd': '\u03b4', 7 | 'e': '\u03b5', 8 | 'z': '\u03b6', 9 | 'h': '\u03b7', 10 | 'q': '\u03b8', 11 | 'i': '\u03b9', 12 | 'k': '\u03ba', 13 | 'l': '\u03bb', 14 | 'm': '\u03bc', 15 | 'n': '\u03bd', 16 | 'c': '\u03be', 17 | 'o': '\u03bf', 18 | 'p': '\u03c0', 19 | 'r': '\u03c1', 20 | 's': '\u03c3', 21 | 's1': '\u03c3', 22 | 's2': '\u03c2', 23 | 's3': '\u03f2', 24 | 't': '\u03c4', 25 | 'u': '\u03c5', 26 | 'f': '\u03c6', 27 | 'x': '\u03c7', 28 | 'y': '\u03c8', 29 | 'w': '\u03c9', 30 | '*a': '\u0391', 31 | '*b': '\u0392', 32 | '*g': '\u0393', 33 | '*d': '\u0394', 34 | '*e': '\u0395', 35 | '*z': '\u0396', 36 | '*h': '\u0397', 37 | '*q': '\u0398', 38 | '*i': '\u0399', 39 | '*k': '\u039a', 40 | '*l': '\u039b', 41 | '*m': '\u039c', 42 | '*n': '\u039d', 43 | '*c': '\u039e', 44 | '*o': '\u039f', 45 | '*p': '\u03a0', 46 | '*r': '\u03a1', 47 | '*s': '\u03a3', 48 | '*s3': '\u03f9', 49 | '*t': '\u03a4', 50 | '*u': '\u03a5', 51 | '*f': '\u03a6', 52 | '*x': '\u03a7', 53 | '*y': '\u03a8', 54 | '*w': '\u03a9', 55 | 56 | # Smooth breathing 57 | 'a)': '\u1f00', 58 | 'e)': '\u1f10', 59 | 'h)': '\u1f20', 60 | 'i)': '\u1f30', 61 | 'o)': '\u1f40', 62 | 'u)': '\u1f50', 63 | 'w)': '\u1f60', 64 | 'r)': '\u1fe4', 65 | '*)a': '\u1f08', 66 | '*)e': '\u1f18', 67 | '*)h': '\u1f28', 68 | '*)i': '\u1f38', 69 | '*)o': '\u1f48', 70 | '*)w': '\u1f68', 71 | 72 | # Rough breathing 73 | 'a(': '\u1f01', 74 | 'e(': '\u1f11', 75 | 'h(': '\u1f21', 76 | 'i(': '\u1f31', 77 | 'o(': '\u1f41', 78 | 'u(': '\u1f51', 79 | 'w(': '\u1f61', 80 | 'r(': '\u1fe5', 81 | '*(a': '\u1f09', 82 | '*(e': '\u1f19', 83 | '*(h': '\u1f29', 84 | '*(i': '\u1f39', 85 | '*(o': '\u1f49', 86 | '*(u': '\u1f59', 87 | '*(w': '\u1f69', 88 | '*(r': '\u1fec', 89 | 90 | # Acute accent and grave accent 91 | 'a\\': '\u1f70', 92 | 'a/': '\u1f71', 93 | 'e\\': '\u1f72', 94 | 'e/': '\u1f73', 95 | 'h\\': '\u1f74', 96 | 'h/': '\u1f75', 97 | 'i\\': '\u1f76', 98 | 'i/': '\u1f77', 99 | 'o\\': '\u1f78', 100 | 'o/': '\u1f79', 101 | 'u\\': '\u1f7a', 102 | 'u/': '\u1f7b', 103 | 'w\\': '\u1f7c', 104 | 'w/': '\u1f7d', 105 | '*\\a': '\u1fba', 106 | '*/a': '\u1fbb', 107 | '*\\e': '\u1fce', 108 | '*/e': '\u1fc9', 109 | '*\\h': '\u1fca', 110 | '*/h': '\u1fcb', 111 | '*\\i': '\u1fda', 112 | '*/i': '\u1fdb', 113 | '*\\o': '\u1ff8', 114 | '*/o': '\u1ff9', 115 | '*\\u': '\u1fea', 116 | '*/u': '\u1feb', 117 | '*\\w': '\u1ffa', 118 | '*/w': '\u1ffb', 119 | 120 | # Smooth breathing and acute accent 121 | 'a)/': '\u1f04', 122 | 'e)/': '\u1f14', 123 | 'h)/': '\u1f24', 124 | 'i)/': '\u1f34', 125 | 'o)/': '\u1f44', 126 | 'u)/': '\u1f54', 127 | 'w)/': '\u1f64', 128 | '*)/a': '\u1f0c', 129 | '*)/e': '\u1f1c', 130 | '*)/h': '\u1f2c', 131 | '*)/i': '\u1f3c', 132 | '*)/o': '\u1f4c', 133 | '*)/u': '\u1f5c', 134 | '*)/w': '\u1f6c', 135 | 136 | # Smooth breathing and grave accent 137 | 'a)\\': '\u1f02', 138 | 'e)\\': '\u1f12', 139 | 'h)\\': '\u1f22', 140 | 'i)\\': '\u1f32', 141 | 'o)\\': '\u1f42', 142 | 'u)\\': '\u1f52', 143 | 'w)\\': '\u1f62', 144 | '*)\\a': '\u1f0a', 145 | '*)\\e': 
'\u1f1a', 146 | '*)\\h': '\u1f2a', 147 | '*)\\i': '\u1f3a', 148 | '*)\\o': '\u1f4a', 149 | '*)\\u': '\u1f5a', 150 | '*)\\w': '\u1f6a', 151 | 152 | # Rough breathing and acute accent 153 | 'a(/': '\u1f05', 154 | 'e(/': '\u1f15', 155 | 'h(/': '\u1f25', 156 | 'i(/': '\u1f35', 157 | 'o(/': '\u1f45', 158 | 'u(/': '\u1f55', 159 | 'w(/': '\u1f65', 160 | '*(/a': '\u1f0d', 161 | '*(/e': '\u1f1d', 162 | '*(/h': '\u1f2d', 163 | '*(/i': '\u1f3d', 164 | '*(/o': '\u1f4d', 165 | '*(/u': '\u1f5d', 166 | '*(/w': '\u1f6d', 167 | 168 | # Rough breathing and grave accent 169 | 'a(\\': '\u1f03', 170 | 'e(\\': '\u1f13', 171 | 'h(\\': '\u1f23', 172 | 'i(\\': '\u1f33', 173 | 'o(\\': '\u1f43', 174 | 'u(\\': '\u1f53', 175 | 'w(\\': '\u1f63', 176 | '*(\\a': '\u1f0b', 177 | '*(\\e': '\u1f1b', 178 | '*(\\h': '\u1f2b', 179 | '*(\\i': '\u1f3b', 180 | '*(\\o': '\u1f4b', 181 | '*(\\u': '\u1f5b', 182 | '*(\\w': '\u1f6b', 183 | 184 | # Perispomeni 185 | 'a=': '\u1fb6', 186 | 'h=': '\u1fc6', 187 | 'i=': '\u1fd6', 188 | 'u=': '\u1fe6', 189 | 'w=': '\u1ff6', 190 | 191 | # Smooth breathing and perispomeni 192 | 'a)=': '\u1f06', 193 | 'h)=': '\u1f26', 194 | 'i)=': '\u1f36', 195 | 'u)=': '\u1f56', 196 | 'w)=': '\u1f66', 197 | '*)=a': '\u1f0e', 198 | '*)=h': '\u1f2e', 199 | '*)=i': '\u1f3e', 200 | '*)=w': '\u1f6e', 201 | 202 | # Rough breathing and perispomeni 203 | 'a(=': '\u1f07', 204 | 'h(=': '\u1f27', 205 | 'i(=': '\u1f37', 206 | 'u(=': '\u1f57', 207 | 'w(=': '\u1f67', 208 | '*(=a': '\u1f0f', 209 | '*(=h': '\u1f2f', 210 | '*(=i': '\u1f3f', 211 | '*(=u': '\u1f5f', 212 | '*(=w': '\u1f6f', 213 | 214 | # Perispomeni and ypogegrammeni 215 | 'a=|': '\u1fb7', 216 | 'h=|': '\u1fc7', 217 | 'w=|': '\u1ff7', 218 | 219 | # Ypogegrammeni 220 | 'a|': '\u1fb3', 221 | 'h|': '\u1fc3', 222 | 'w|': '\u1ff3', 223 | '*a|': '\u1fbc', 224 | '*h|': '\u1fcc', 225 | '*w|': '\u1ffc', 226 | 227 | # Acute accent and ypogegrammeni 228 | 'a/|': '\u1fb4', 229 | 'h/|': '\u1fc4', 230 | 'w/|': '\u1ff4', 231 | 232 | # Smooth breathing and ypogegrammeni 233 | 'a)|': '\u1f80', 234 | 'h)|': '\u1f90', 235 | 'w)|': '\u1fa0', 236 | '*)a|': '\u1f88', 237 | '*)h|': '\u1f98', 238 | '*)w|': '\u1fa8', 239 | 240 | # Rough breathing and ypogegrammeni 241 | 'a(|': '\u1f81', 242 | 'h(|': '\u1f91', 243 | 'w(|': '\u1fa1', 244 | '*(a|': '\u1f89', 245 | '*(h|': '\u1f99', 246 | '*(w|': '\u1fa9', 247 | 248 | # Smooth breathing, acute accent, and ypogegrammeni 249 | 'a)\|': '\u1f82', 250 | 'h)\|': '\u1f92', 251 | 'w)\|': '\u1fa2', 252 | '*)\\a|': '\u1f8a', 253 | '*)\h|': '\u1f9a', 254 | '*)\w|': '\u1faa', 255 | 256 | # Rough breathing, grave accent, and ypogegrammeni 257 | 'a(\|': '\u1f83', 258 | 'h)\|': '\u1f93', 259 | 'w)\|': '\u1fa3', 260 | '*(\\a|': '\u1f8b', 261 | '*)\h|': '\u1f9b', 262 | '*)\w|': '\u1fab', 263 | 264 | # Smooth breathing, accute accent, and ypogegrammeni 265 | 'a)/|': '\u1f84', 266 | 'h)/|': '\u1f94', 267 | 'w)/|': '\u1fa4', 268 | '*)/a|': '\u1f8c', 269 | '*)/h|': '\u1f9c', 270 | '*)/w|': '\u1fac', 271 | 272 | # Rough breating, acute accent, and ypogegrammeni 273 | 'a(/|': '\u1f85', 274 | 'h(/|': '\u1f95', 275 | 'w(/|': '\u1fa5', 276 | '*(/a|': '\u1f8d', 277 | '*(/h|': '\u1f9d', 278 | '*(/w|': '\u1fad', 279 | 280 | # Smooth breathing, ypogegrammeni, and perispomeni 281 | 'a)=|': '\u1f86', 282 | 'h)=|': '\u1f96', 283 | 'w)=|': '\u1fa6', 284 | '*)=a|': '\u1f8e', 285 | '*)=h|': '\u1f9e', 286 | '*)=w|': '\u1fae', 287 | 288 | # Rough breathing, ypogegrammeni, and perispomeni 289 | 'a(=|': '\u1f87', 290 | 'h(=|': '\u1f97', 291 | 'w(=|': '\u1fa7', 292 | '*(=a|': 
'\u1f8f', 293 | '*(=h|': '\u1f9f', 294 | '*(=w|': '\u1faf', 295 | 296 | # Diaeresis 297 | 'i+': '\u03ca', 298 | '*+i': '\u03aa', 299 | 'i\\+': '\u1fd2', 300 | 'i/+': '\u1fd3', 301 | 'i+/': '\u1fd3', 302 | 'i=+': '\u1fd7', 303 | 'u+': '\u03cb', 304 | '*+u': '\u03ab', 305 | 'u\\+': '\u1fe2', 306 | 'u/+': '\u1fe3', 307 | 'u=+': '\u1fe7', 308 | 309 | # Macron 310 | 'a&': '\u1fb0', 311 | 'i&': '\u1fd0', 312 | 'u&': '\u1fe0', 313 | 314 | # Breve 315 | 'a\'': '\u1fb1', 316 | 'i\'': '\u1fd1', 317 | 'u\'': '\u1fe1', 318 | 319 | # Basic punctuation 320 | ':': '\u00b7', 321 | '\'': '\u2019', 322 | '-': '\u2010', 323 | '_': '\u2014' 324 | } 325 | --------------------------------------------------------------------------------