├── .gitignore
├── LICENSE
├── README.md
├── setup.py
├── test
│   ├── __init__.py
│   ├── range_of.py
│   ├── ulower.py
│   └── usplit.py
└── unicodeblock
    ├── __init__.py
    ├── blocks.py
    ├── lower.py
    └── sequence.py

/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64
__pycache__

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2013 Neuron Teckid

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# UnicodeBlock

Python Unicode Block Utilities

* Unicode block type lookup
* Unicode string splitting by block
* Conversion of full-width letters to half-width, lower-case letters

## Install

For Python 3:

    pip install unicodeblock

For Python 2:

    pip install unicodeblock==0.2.2

## Usage

    >>> import unicodeblock.blocks
    >>> print(unicodeblock.blocks.of('0'))
    DIGIT
    >>> print(unicodeblock.blocks.of('汉'))
    CJK_UNIFIED_IDEOGRAPHS
    >>> print(unicodeblock.blocks.of('あ'))
    HIRAGANA

    >>> import unicodeblock.sequence
    >>> for sequence in unicodeblock.sequence.usplit('攻殻機動隊ARISE border:1 Ghost Pain'):
    ...     print(sequence.lang, sequence)
    cjk 攻殻機動隊
    en ARISE
    en border
    digit 1
    en Ghost
    en Pain

    >>> import unicodeblock.lower
    >>> print(unicodeblock.lower.lower_fullwidths('Hello World'))
    hello world
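
Lookups return `None` for code points that belong to no named block, and
`ulower` also folds kana; both examples below mirror assertions in the test
suite:

    >>> print(unicodeblock.blocks.of(chr(0)))
    None
    >>> print(unicodeblock.lower.ulower('のび太のクセに'))
    ノビ太ノクセニ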

## Run tests

    python -m unittest test/*.py
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

_URI = 'https://github.com/neuront/pyunicodeblock'

setup(
    name='unicodeblock',
    version='0.3.1',
    author='Neuron Teckid',
    author_email='lene13@gmail.com',
    license='MIT',
    keywords='Python UnicodeBlock',
    url=_URI,
    description='Python Unicode Block Utilities',
    packages=['unicodeblock'],
    long_description='Visit ' + _URI + ' for details.',
    install_requires=[],
    zip_safe=False,
)
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zheplusplus/pyunicodeblock/8dabdcf9e99a7cd0e1635d4b5f192c3431358cbf/test/__init__.py
--------------------------------------------------------------------------------
/test/range_of.py:
--------------------------------------------------------------------------------
# encoding=utf-8

import string
import unittest

import unicodeblock.blocks


class RangeOf(unittest.TestCase):
    def test_range_of(self):
        self.assertEqual(None, unicodeblock.blocks.of(chr(0)))
        self.assertEqual('SPACE', unicodeblock.blocks.of(' '))
        for ch in string.ascii_letters:
            self.assertEqual('BASIC_LATIN', unicodeblock.blocks.of(ch))
        for ch in string.digits:
            self.assertEqual('DIGIT', unicodeblock.blocks.of(ch))
        for ch in string.punctuation:
            self.assertEqual('BASIC_PUNCTUATION', unicodeblock.blocks.of(ch))

        for ch in '·§©':
            self.assertEqual('LATIN_1_SUPPLEMENT', unicodeblock.blocks.of(ch))
        for ch in 'ÂÃÄÅÒÓÔÕâãäåòóôõ':
            self.assertEqual('LATIN_EXTENDED_LETTER',
                             unicodeblock.blocks.of(ch))

        for ch in '啊哦呃衣乌淤':
            self.assertEqual('CJK_UNIFIED_IDEOGRAPHS',
                             unicodeblock.blocks.of(ch))
        for ch in 'あいうえおまみむめも':
            self.assertEqual('HIRAGANA', unicodeblock.blocks.of(ch))
        for ch in 'アイウエオマミムメモー':
            self.assertEqual('KATAKANA', unicodeblock.blocks.of(ch))
        for ch in '〜【】〒〓〔〕『』「」《》。、〝〞':
            self.assertEqual('CJK_SYMBOLS_AND_PUNCTUATION',
                             unicodeblock.blocks.of(ch))
        for ch in '/~()!?:!,=':
            self.assertEqual('HALFWIDTH_AND_FULLWIDTH_FORMS',
                             unicodeblock.blocks.of(ch))
        for ch in '“”‘’…—※':
            self.assertEqual('GENERAL_PUNCTUATION', unicodeblock.blocks.of(ch))
        for ch in '☆★':
            self.assertEqual('MISCELLANEOUS_SYMBOLS',
                             unicodeblock.blocks.of(ch))
        for ch in '✧➀➁➂➃➄➅➆➇➈➉➊➋➌➍➎➏':
            self.assertEqual('DINGBATS', unicodeblock.blocks.of(ch))
        for ch in '←↑→↓↔↕↖↗↘↙↚↛↜↝↞↟':
            self.assertEqual('ARROWS', unicodeblock.blocks.of(ch))

        self.assertEqual('CJK_SYMBOLS_AND_PUNCTUATION',
                         unicodeblock.blocks.of(u'々'))
--------------------------------------------------------------------------------
/test/ulower.py:
--------------------------------------------------------------------------------
# encoding=utf-8

import unittest

import unicodeblock.lower
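

# "Lowering" kana in this library means mapping hiragana to katakana:
# lower_kanas('のび太のくせに') yields 'ノビ太ノクセニ'.  lower_fullwidths folds
# full-width ASCII to half-width lower case, and ulower chains both with
# str.lower().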
class UniversalLower(unittest.TestCase):
    def test_lower_kana(self):
        self.assertEqual(
            'ミカサ・アッカーマン',
            unicodeblock.lower.lower_kanas('ミカサ・アッカーマン'))
        self.assertEqual(
            'ノビ太ノクセニ',
            unicodeblock.lower.lower_kanas('のび太のくせに'))
        self.assertEqual(
            'ノビ太ノクセニ',
            unicodeblock.lower.lower_kanas('のび太のクセに'))

    def test_fullwidth_letters(self):
        self.assertEqual('the quick brown fox jumps over the lazy dog.',
                         unicodeblock.lower.lower_fullwidths(
                             'The quick brown fox ' +
                             'jumps over the lazy dog.'))

    def test_ulower(self):
        self.assertEqual(
            'ノビ太ノクセニ',
            unicodeblock.lower.ulower('のび太のクセに'))
        self.assertEqual('the quick brown fox jumps over the lazy dog.',
                         unicodeblock.lower.ulower(
                             'The quick brown fox ' +
                             'jumps over the lazy dog.'))
        self.assertEqual('ノビ太ノクセニ ' +
                         'the quick brown fox jumps over the lazy dog. ' +
                         "the browns' kitsune",
                         unicodeblock.lower.ulower(
                             'のび太のクセに ' +
                             'The quick brown fox ' +
                             'jumps over the lazy dog. ' +
                             "The Browns' kitsune"))
--------------------------------------------------------------------------------
/test/usplit.py:
--------------------------------------------------------------------------------
# encoding=utf-8

import unittest

import unicodeblock.sequence


class UnicodeSplit(unittest.TestCase):
    def _assert_sequence(self, s, expected_split):
        seqs = unicodeblock.sequence.usplit(s)
        self.assertEqual(len(expected_split), len(seqs))
        for i, seq in enumerate(seqs):
            self.assertEqual(expected_split[i][0], seq.lang, msg=f'index={i}')
            self.assertEqual(expected_split[i][1], seq.value, msg=f'index={i}')

    def test_usplit(self):
        self._assert_sequence('`12345t6y7u8iop[-09', (
            ('digit', '12345'),
            ('en', 't'),
            ('digit', '6'),
            ('en', 'y'),
            ('digit', '7'),
            ('en', 'u'),
            ('digit', '8'),
            ('en', 'iop'),
            ('digit', '09'),
        ))

        self._assert_sequence('unicodeblock.sequence.usplit', (
            ('en', 'unicodeblock'),
            ('en', 'sequence'),
            ('en', 'usplit'),
        ))

        self._assert_sequence(u"Kuroko's Basketball", (
            ('en', "Kuroko's"),
            ('en', 'Basketball'),
        ))

        self._assert_sequence('Kiss-Shot Acerola-Orion Heart-Under-Blade', (
            ('en', 'Kiss-Shot'),
            ('en', 'Acerola-Orion'),
            ('en', 'Heart-Under-Blade'),
        ))

    def test_latin_mixed(self):
        self._assert_sequence('Diomedéa', (
            ('latin', 'Diomedéa'),
        ))

    def test_fullwidth_latins(self):
        self._assert_sequence('The quick brown fox ' +
                              'jumps over the lazy dog.', (
            ('en', 'The'),
            ('en', 'quick'),
            ('en', 'brown'),
            ('en', 'fox'),
            ('en', 'jumps'),
            ('en', 'over'),
            ('en', 'the'),
            ('en', 'lazy'),
            ('en', 'dog'),
        ))

        self._assert_sequence('The Browns' kitsune', (
            ('en', 'The'),
            ('en', 'Browns''),
            ('en', 'kitsune'),
        ))
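
    # Kana and the repeat mark 々 pull a run into 'ja', so kanji mixed with
    # kana read as Japanese; kanji-only runs stay 'cjk', and characters with
    # no transition (e.g. ☆ or ・) close the current run.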
    def test_cjk_mixed(self):
        self._assert_sequence('らき☆すた', (
            ('ja', 'らき'),
            ('ja', 'すた'),
        ))

        self._assert_sequence('うたの☆プリンスさまっ♪ マジLOVE2000%', (
            ('ja', 'うたの'),
            ('ja', 'プリンスさまっ'),
            ('ja', 'マジ'),
            ('en', 'LOVE'),
            ('digit', '2000'),
        ))

        self._assert_sequence('荻上千佳', (
            ('cjk', '荻上千佳'),
        ))

        self._assert_sequence('涼宮ハルヒの憂鬱2006', (
            ('ja', '涼宮ハルヒの憂鬱'),
            ('digit', '2006'),
        ))

        self._assert_sequence('涼宮 ハルヒ', (
            ('cjk', '涼宮'),
            ('ja', 'ハルヒ'),
        ))

        self._assert_sequence('カードキャプターさくら', (
            ('ja', 'カードキャプターさくら'),
        ))

        self._assert_sequence('丹下桜단게사쿠라', (
            ('cjk', '丹下桜'),
            ('kr', '단게사쿠라'),
        ))

        self._assert_sequence('ミカサ・アッカーマン', (
            ('ja', 'ミカサ'),
            ('ja', 'アッカーマン'),
        ))

        self._assert_sequence('佐々木 純人', (
            ('ja', '佐々木'),
            ('cjk', '純人'),
        ))

        self._assert_sequence('々々木', (
            ('cjk', '木'),
        ))

        self._assert_sequence('島﨑 信長', (
            ('cjk', '島﨑'),
            ('cjk', '信長'),
        ))
--------------------------------------------------------------------------------
/unicodeblock/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zheplusplus/pyunicodeblock/8dabdcf9e99a7cd0e1635d4b5f192c3431358cbf/unicodeblock/__init__.py
--------------------------------------------------------------------------------
/unicodeblock/blocks.py:
--------------------------------------------------------------------------------
import bisect
from typing import Optional

# from java.lang.Character.UnicodeBlock, with slight differences

_BLOCK_STARTS, _BLOCK_NAMES = (lambda x: (
    [i[0] for i in x], [i[1] for i in x]))([
        (0x0000, None),
        (0x0020, 'SPACE'),
        (0x0021, 'BASIC_PUNCTUATION'),
        (0x0030, 'DIGIT'),
        (0x003A, 'BASIC_PUNCTUATION'),
        (0x0041, 'BASIC_LATIN'),
        (0x005B, 'BASIC_PUNCTUATION'),
        (0x0061, 'BASIC_LATIN'),
        (0x007B, 'BASIC_PUNCTUATION'),
        (0x007f, None),
        (0x00A0, 'LATIN_1_SUPPLEMENT'),
        (0x00C0, 'LATIN_EXTENDED_LETTER'),
        (0x0100, 'LATIN_EXTENDED_A'),
        (0x0180, 'LATIN_EXTENDED_B'),
        (0x0250, 'IPA_EXTENSIONS'),
        (0x02B0, 'SPACING_MODIFIER_LETTERS'),
        (0x0300, 'COMBINING_DIACRITICAL_MARKS'),
        (0x0370, 'GREEK'),
        (0x0400, 'CYRILLIC'),
        (0x0500, 'CYRILLIC_SUPPLEMENTARY'),
        (0x0530, 'ARMENIAN'),
        (0x0590, 'HEBREW'),
        (0x0600, 'ARABIC'),
        (0x0700, 'SYRIAC'),
        (0x0750, 'ARABIC_SUPPLEMENT'),
        (0x0780, 'THAANA'),
        (0x07C0, 'NKO'),
        (0x0800, 'SAMARITAN'),
        (0x0840, 'MANDAIC'),
        (0x0860, None),
        (0x0900, 'DEVANAGARI'),
        (0x0980, 'BENGALI'),
        (0x0A00, 'GURMUKHI'),
        (0x0A80, 'GUJARATI'),
        (0x0B00, 'ORIYA'),
        (0x0B80, 'TAMIL'),
        (0x0C00, 'TELUGU'),
        (0x0C80, 'KANNADA'),
        (0x0D00, 'MALAYALAM'),
        (0x0D80, 'SINHALA'),
        (0x0E00, 'THAI'),
        (0x0E80, 'LAO'),
        (0x0F00, 'TIBETAN'),
        (0x1000, 'MYANMAR'),
        (0x10A0, 'GEORGIAN'),
        (0x1100, 'HANGUL_JAMO'),
        (0x1200, 'ETHIOPIC'),
        (0x1380, 'ETHIOPIC_SUPPLEMENT'),
        (0x13A0, 'CHEROKEE'),
        (0x1400, 'UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS'),
        (0x1680, 'OGHAM'),
        (0x16A0, 'RUNIC'),
        (0x1700, 'TAGALOG'),
        (0x1720, 'HANUNOO'),
        (0x1740, 'BUHID'),
        (0x1760, 'TAGBANWA'),
        (0x1780, 'KHMER'),
        (0x1800, 'MONGOLIAN'),
        (0x18B0, 'UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED'),
        (0x1900, 'LIMBU'),
        (0x1950, 'TAI_LE'),
        (0x1980, 'NEW_TAI_LUE'),
        (0x19E0, 'KHMER_SYMBOLS'),
        (0x1A00, 'BUGINESE'),
        (0x1A20, 'TAI_THAM'),
        (0x1AB0, None),
        (0x1B00, 'BALINESE'),
        (0x1B80, 'SUNDANESE'),
        (0x1BC0, 'BATAK'),
        (0x1C00, 'LEPCHA'),
        (0x1C50, 'OL_CHIKI'),
        (0x1C80, None),
        (0x1CD0, 'VEDIC_EXTENSIONS'),
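        # a None name marks a gap: code points from such a start up to the
        # next entry belong to no block, and of() returns None for them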
        (0x1D00, 'PHONETIC_EXTENSIONS'),
        (0x1D80, 'PHONETIC_EXTENSIONS_SUPPLEMENT'),
        (0x1DC0, 'COMBINING_DIACRITICAL_MARKS_SUPPLEMENT'),
        (0x1E00, 'LATIN_EXTENDED_ADDITIONAL'),
        (0x1F00, 'GREEK_EXTENDED'),
        (0x2000, 'GENERAL_PUNCTUATION'),
        (0x2070, 'SUPERSCRIPTS_AND_SUBSCRIPTS'),
        (0x20A0, 'CURRENCY_SYMBOLS'),
        (0x20D0, 'COMBINING_MARKS_FOR_SYMBOLS'),
        (0x2100, 'LETTERLIKE_SYMBOLS'),
        (0x2150, 'NUMBER_FORMS'),
        (0x2190, 'ARROWS'),
        (0x2200, 'MATHEMATICAL_OPERATORS'),
        (0x2300, 'MISCELLANEOUS_TECHNICAL'),
        (0x2400, 'CONTROL_PICTURES'),
        (0x2440, 'OPTICAL_CHARACTER_RECOGNITION'),
        (0x2460, 'ENCLOSED_ALPHANUMERICS'),
        (0x2500, 'BOX_DRAWING'),
        (0x2580, 'BLOCK_ELEMENTS'),
        (0x25A0, 'GEOMETRIC_SHAPES'),
        (0x2600, 'MISCELLANEOUS_SYMBOLS'),
        (0x2700, 'DINGBATS'),
        (0x27C0, 'MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A'),
        (0x27F0, 'SUPPLEMENTAL_ARROWS_A'),
        (0x2800, 'BRAILLE_PATTERNS'),
        (0x2900, 'SUPPLEMENTAL_ARROWS_B'),
        (0x2980, 'MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B'),
        (0x2A00, 'SUPPLEMENTAL_MATHEMATICAL_OPERATORS'),
        (0x2B00, 'MISCELLANEOUS_SYMBOLS_AND_ARROWS'),
        (0x2C00, 'GLAGOLITIC'),
        (0x2C60, 'LATIN_EXTENDED_C'),
        (0x2C80, 'COPTIC'),
        (0x2D00, 'GEORGIAN_SUPPLEMENT'),
        (0x2D30, 'TIFINAGH'),
        (0x2D80, 'ETHIOPIC_EXTENDED'),
        (0x2DE0, 'CYRILLIC_EXTENDED_A'),
        (0x2E00, 'SUPPLEMENTAL_PUNCTUATION'),
        (0x2E80, 'CJK_RADICALS_SUPPLEMENT'),
        (0x2F00, 'KANGXI_RADICALS'),
        (0x2FE0, None),
        (0x2FF0, 'IDEOGRAPHIC_DESCRIPTION_CHARACTERS'),
        (0x3000, 'CJK_SYMBOLS_AND_PUNCTUATION'),
        (0x3041, 'HIRAGANA'),
        (0x3097, 'CJK_SYMBOLS_AND_PUNCTUATION'),
        (0x30A1, 'KATAKANA'),
        (0x30FB, 'CJK_SYMBOLS_AND_PUNCTUATION'),
        (0x30FC, 'KATAKANA'),
        (0x3100, 'BOPOMOFO'),
        (0x3130, 'HANGUL_COMPATIBILITY_JAMO'),
        (0x3190, 'KANBUN'),
        (0x31A0, 'BOPOMOFO_EXTENDED'),
        (0x31C0, 'CJK_STROKES'),
        (0x31F0, 'KATAKANA_PHONETIC_EXTENSIONS'),
        (0x3200, 'ENCLOSED_CJK_LETTERS_AND_MONTHS'),
        (0x3300, 'CJK_COMPATIBILITY'),
        (0x3400, 'CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A'),
        (0x4DC0, 'YIJING_HEXAGRAM_SYMBOLS'),
        (0x4E00, 'CJK_UNIFIED_IDEOGRAPHS'),
        (0xA000, 'YI_SYLLABLES'),
        (0xA490, 'YI_RADICALS'),
        (0xA4D0, 'LISU'),
        (0xA500, 'VAI'),
        (0xA640, 'CYRILLIC_EXTENDED_B'),
        (0xA6A0, 'BAMUM'),
        (0xA700, 'MODIFIER_TONE_LETTERS'),
        (0xA720, 'LATIN_EXTENDED_D'),
        (0xA800, 'SYLOTI_NAGRI'),
        (0xA830, 'COMMON_INDIC_NUMBER_FORMS'),
        (0xA840, 'PHAGS_PA'),
        (0xA880, 'SAURASHTRA'),
        (0xA8E0, 'DEVANAGARI_EXTENDED'),
        (0xA900, 'KAYAH_LI'),
        (0xA930, 'REJANG'),
        (0xA960, 'HANGUL_JAMO_EXTENDED_A'),
        (0xA980, 'JAVANESE'),
        (0xA9E0, None),
        (0xAA00, 'CHAM'),
        (0xAA60, 'MYANMAR_EXTENDED_A'),
        (0xAA80, 'TAI_VIET'),
        (0xAAE0, None),
        (0xAB00, 'ETHIOPIC_EXTENDED_A'),
        (0xAB30, None),
        (0xABC0, 'MEETEI_MAYEK'),
        (0xAC00, 'HANGUL_SYLLABLES'),
        (0xD7B0, 'HANGUL_JAMO_EXTENDED_B'),
        (0xD800, 'HIGH_SURROGATES'),
        (0xDB80, 'HIGH_PRIVATE_USE_SURROGATES'),
        (0xDC00, 'LOW_SURROGATES'),
        (0xE000, 'PRIVATE_USE_AREA'),
        (0xF900, 'CJK_COMPATIBILITY_IDEOGRAPHS'),
        (0xFB00, 'ALPHABETIC_PRESENTATION_FORMS'),
        (0xFB50, 'ARABIC_PRESENTATION_FORMS_A'),
        (0xFE00, 'VARIATION_SELECTORS'),
        (0xFE10, 'VERTICAL_FORMS'),
        (0xFE20, 'COMBINING_HALF_MARKS'),
        (0xFE30, 'CJK_COMPATIBILITY_FORMS'),
        (0xFE50, 'SMALL_FORM_VARIANTS'),
        (0xFE70, 'ARABIC_PRESENTATION_FORMS_B'),
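        # the FF00 block is subdivided below so full-width digits and letters
        # get their own names (one of the departures from Java noted above);
        # unicodeblock.sequence relies on FULLWIDTH_DIGIT and FULLWIDTH_LATIN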
        (0xFF00, 'HALFWIDTH_AND_FULLWIDTH_FORMS'),
        (0xFF10, 'FULLWIDTH_DIGIT'),
        (0xFF1A, 'HALFWIDTH_AND_FULLWIDTH_FORMS'),
        (0xFF21, 'FULLWIDTH_LATIN'),
        (0xFF3B, 'HALFWIDTH_AND_FULLWIDTH_FORMS'),
        (0xFF41, 'FULLWIDTH_LATIN'),
        (0xFF5B, 'HALFWIDTH_AND_FULLWIDTH_FORMS'),
        (0xFFF0, 'SPECIALS'),

        (0x10000, 'LINEAR_B_SYLLABARY'),
        (0x10080, 'LINEAR_B_IDEOGRAMS'),
        (0x10100, 'AEGEAN_NUMBERS'),
        (0x10140, 'ANCIENT_GREEK_NUMBERS'),
        (0x10190, 'ANCIENT_SYMBOLS'),
        (0x101D0, 'PHAISTOS_DISC'),
        (0x10200, None),
        (0x10280, 'LYCIAN'),
        (0x102A0, 'CARIAN'),
        (0x102E0, None),
        (0x10300, 'OLD_ITALIC'),
        (0x10330, 'GOTHIC'),
        (0x10350, None),
        (0x10380, 'UGARITIC'),
        (0x103A0, 'OLD_PERSIAN'),
        (0x103E0, None),
        (0x10400, 'DESERET'),
        (0x10450, 'SHAVIAN'),
        (0x10480, 'OSMANYA'),
        (0x104B0, None),
        (0x10800, 'CYPRIOT_SYLLABARY'),
        (0x10840, 'IMPERIAL_ARAMAIC'),
        (0x10860, None),
        (0x10900, 'PHOENICIAN'),
        (0x10920, 'LYDIAN'),
        (0x10940, None),
        (0x10A00, 'KHAROSHTHI'),
        (0x10A60, 'OLD_SOUTH_ARABIAN'),
        (0x10A80, None),
        (0x10B00, 'AVESTAN'),
        (0x10B40, 'INSCRIPTIONAL_PARTHIAN'),
        (0x10B60, 'INSCRIPTIONAL_PAHLAVI'),
        (0x10B80, None),
        (0x10C00, 'OLD_TURKIC'),
        (0x10C50, None),
        (0x10E60, 'RUMI_NUMERAL_SYMBOLS'),
        (0x10E80, None),
        (0x11000, 'BRAHMI'),
        (0x11080, 'KAITHI'),
        (0x110D0, None),
        (0x12000, 'CUNEIFORM'),
        (0x12400, 'CUNEIFORM_NUMBERS_AND_PUNCTUATION'),
        (0x12480, None),
        (0x13000, 'EGYPTIAN_HIEROGLYPHS'),
        (0x13430, None),
        (0x16800, 'BAMUM_SUPPLEMENT'),
        (0x16A40, None),
        (0x1B000, 'KANA_SUPPLEMENT'),
        (0x1B100, None),
        (0x1D000, 'BYZANTINE_MUSICAL_SYMBOLS'),
        (0x1D100, 'MUSICAL_SYMBOLS'),
        (0x1D200, 'ANCIENT_GREEK_MUSICAL_NOTATION'),
        (0x1D250, None),
        (0x1D300, 'TAI_XUAN_JING_SYMBOLS'),
        (0x1D360, 'COUNTING_ROD_NUMERALS'),
        (0x1D380, None),
        (0x1D400, 'MATHEMATICAL_ALPHANUMERIC_SYMBOLS'),
        (0x1D800, None),
        (0x1F000, 'MAHJONG_TILES'),
        (0x1F030, 'DOMINO_TILES'),
        (0x1F0A0, 'PLAYING_CARDS'),
        (0x1F100, 'ENCLOSED_ALPHANUMERIC_SUPPLEMENT'),
        (0x1F200, 'ENCLOSED_IDEOGRAPHIC_SUPPLEMENT'),
        (0x1F300, 'MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS'),
        (0x1F600, 'EMOTICONS'),
        (0x1F650, None),
        (0x1F680, 'TRANSPORT_AND_MAP_SYMBOLS'),
        (0x1F700, 'ALCHEMICAL_SYMBOLS'),
        (0x1F780, None),
        (0x20000, 'CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B'),
        (0x2A6E0, None),
        (0x2A700, 'CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C'),
        (0x2B740, 'CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D'),
        (0x2B820, None),
        (0x2F800, 'CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT'),
        (0x2FA20, None),
        (0xE0000, 'TAGS'),
        (0xE0080, None),
        (0xE0100, 'VARIATION_SELECTORS_SUPPLEMENT'),
        (0xE01F0, None),
        (0xF0000, 'SUPPLEMENTARY_PRIVATE_USE_AREA_A'),
        (0x100000, 'SUPPLEMENTARY_PRIVATE_USE_AREA_B'),
        (0x10FFFF, None),
    ])
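

# bisect_right finds the first table entry whose start exceeds ord(uchar),
# so the entry just before it names the block containing the character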
def of(uchar: str) -> Optional[str]:
    return _BLOCK_NAMES[bisect.bisect_right(_BLOCK_STARTS, ord(uchar)) - 1]
--------------------------------------------------------------------------------
/unicodeblock/lower.py:
--------------------------------------------------------------------------------
_KANA_LOW = ord('ぁ')
_KANA_HIGH = ord('ゖ')
_KANA_DIFF = ord('ア') - ord('あ')


def lower_kana(ch: str) -> str:
    code = ord(ch)
    if _KANA_LOW <= code <= _KANA_HIGH:
        return chr(code + _KANA_DIFF)
    return ch


def lower_kanas(u: str) -> str:
    return ''.join([lower_kana(ch) for ch in u])


_FULL_WIDTH_LOW = ord('!')
_FULL_WIDTH_HIGH = ord('~')
_FULL_WIDTH_LOWER_DIFF = _FULL_WIDTH_LOW - ord('!')
_FULL_WIDTH_CAPITAL_LOW = ord('A')
_FULL_WIDTH_CAPITAL_HIGH = ord('Z')
_FULL_WIDTH_CAPITAL_DIFF = _FULL_WIDTH_CAPITAL_LOW - ord('a')


def lower_fullwidth(ch: str) -> str:
    code = ord(ch)
    if _FULL_WIDTH_CAPITAL_LOW <= code <= _FULL_WIDTH_CAPITAL_HIGH:
        return chr(code - _FULL_WIDTH_CAPITAL_DIFF)
    if _FULL_WIDTH_LOW <= code <= _FULL_WIDTH_HIGH:
        return chr(code - _FULL_WIDTH_LOWER_DIFF)
    return ch


def lower_fullwidths(u: str) -> str:
    return ''.join([lower_fullwidth(ch) for ch in u])


def ulower(u: str) -> str:
    return ''.join([lower_kana(lower_fullwidth(ch)) for ch in u]).lower()
--------------------------------------------------------------------------------
/unicodeblock/sequence.py:
--------------------------------------------------------------------------------
from typing import List
from . import blocks


class UnicodeSequence:
    def __init__(self, val: str, lang: str):
        self.value: str = val
        self.lang: str = lang

    def __str__(self):
        return self.value

    def __unicode__(self):
        return self.value

    def __repr__(self):
        return self.value


def _init_states():
    class State(dict):
        def __init__(self, lang, d=None):
            dict.__init__(self, d or dict())
            self.lang = lang

        def next(self, ch):
            cht = blocks.of(ch)
            if cht in self:
                return self[cht](ch)
            return None

    number = State('digit')
    fullwidth_num = State('digit')
    basic_latin = State('en')
    fullwidth_latin = State('en')
    ext_latin = State('latin')
    basic_cjk = State('cjk')
    ja = State('ja')
    kr = State('kr')

    number['DIGIT'] = lambda _: number
    fullwidth_num['FULLWIDTH_DIGIT'] = lambda _: fullwidth_num

    basic_latin['BASIC_LATIN'] = lambda _: basic_latin
    basic_latin['BASIC_PUNCTUATION'] = (
        lambda c: basic_latin if c == "'" or c == '-' else None)
    basic_latin['LATIN_EXTENDED_LETTER'] = lambda _: ext_latin

    fullwidth_latin['FULLWIDTH_LATIN'] = lambda _: fullwidth_latin
    fullwidth_latin['HALFWIDTH_AND_FULLWIDTH_FORMS'] = (
        lambda c: fullwidth_latin if c == u''' or c == u'-' else None)
    fullwidth_latin['BASIC_PUNCTUATION'] = (
        lambda c: fullwidth_latin if c == "'" or c == '-' else None)

    ext_latin['BASIC_LATIN'] = lambda _: ext_latin
    ext_latin['BASIC_PUNCTUATION'] = (
        lambda c: ext_latin if c == "'" or c == '-' else None)
    ext_latin['LATIN_EXTENDED_LETTER'] = lambda _: ext_latin

    def ja_repeat(c):
        return ja if c == u'々' else None

    basic_cjk['CJK_SYMBOLS_AND_PUNCTUATION'] = ja_repeat
    basic_cjk['CJK_UNIFIED_IDEOGRAPHS'] = lambda _: basic_cjk
    basic_cjk['CJK_COMPATIBILITY_IDEOGRAPHS'] = lambda _: basic_cjk
    basic_cjk['HIRAGANA'] = lambda _: ja
    basic_cjk['KATAKANA'] = lambda _: ja

    ja['CJK_SYMBOLS_AND_PUNCTUATION'] = ja_repeat
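

# usplit scans left to right: while transitions succeed the current run grows;
# when a character has no transition, the finished run is flushed with its
# state's lang, and scanning restarts from that character at the start state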
    ja['CJK_UNIFIED_IDEOGRAPHS'] = lambda _: ja
    ja['CJK_COMPATIBILITY_IDEOGRAPHS'] = lambda _: ja
    ja['HIRAGANA'] = lambda _: ja
    ja['KATAKANA'] = lambda _: ja

    kr['HANGUL_SYLLABLES'] = lambda _: kr
    kr['HANGUL_JAMO_EXTENDED_B'] = lambda _: kr

    return State(None, {
        'DIGIT': lambda _: number,
        'FULLWIDTH_DIGIT': lambda _: fullwidth_num,
        'BASIC_LATIN': lambda _: basic_latin,
        'FULLWIDTH_LATIN': lambda _: fullwidth_latin,
        'HANGUL_SYLLABLES': lambda _: kr,
        'HANGUL_JAMO_EXTENDED_B': lambda _: kr,
        'CJK_UNIFIED_IDEOGRAPHS': lambda _: basic_cjk,
        'CJK_COMPATIBILITY_IDEOGRAPHS': lambda _: basic_cjk,
        'HIRAGANA': lambda _: ja,
        'KATAKANA': lambda _: ja,
    })


_START = _init_states()


def usplit(u: str) -> List[UnicodeSequence]:
    begin = 0
    record = False
    seqs = []
    state = _START

    for index, ch in enumerate(u):
        next_state = state.next(ch)
        if next_state is not None:
            state = next_state
            if not record:
                record = True
                begin = index
            continue
        if record:
            seqs.append(UnicodeSequence(u[begin: index], state.lang))
            record = False
        next_state = _START.next(ch)
        if next_state is None:
            state = _START
            continue
        state = next_state
        record = True
        begin = index

    if record:
        seqs.append(UnicodeSequence(u[begin:], state.lang))
    return seqs
--------------------------------------------------------------------------------
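
A minimal end-to-end sketch of how the three modules combine (not part of the
package; the inputs and expected tags are taken from the tests above):

    import unicodeblock.blocks
    import unicodeblock.lower
    import unicodeblock.sequence

    title = '涼宮ハルヒの憂鬱2006'
    for seq in unicodeblock.sequence.usplit(title):
        # prints 'ja 涼宮ハルヒの憂鬱' then 'digit 2006'
        print(seq.lang, seq.value)

    # block lookup for a single character
    print(unicodeblock.blocks.of(title[0]))  # CJK_UNIFIED_IDEOGRAPHS

    # fold full-width letters (and kana) before comparing strings
    print(unicodeblock.lower.ulower("The Browns' kitsune"))
    # prints "the browns' kitsune"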