├── bidi ├── __init__.py ├── mirror.pyc ├── algorithm.pyc ├── tests.py ├── arabic_reshaper.py ├── mirror.py └── algorithm.py ├── Default (Linux).sublime-keymap ├── Default (OSX).sublime-keymap ├── Context.sublime-menu ├── Default (Windows).sublime-keymap ├── Default.sublime-commands ├── Main.sublime-menu ├── rtl.py └── README.md /bidi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Default (Linux).sublime-keymap: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "keys": ["ctrl+b"], 4 | "command": "bidi" 5 | } 6 | ] -------------------------------------------------------------------------------- /Default (OSX).sublime-keymap: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "keys": ["ctrl+b"], 4 | "command": "bidi" 5 | } 6 | ] -------------------------------------------------------------------------------- /bidi/mirror.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HosseinRashno/Sublime-Text-2-BIDI/HEAD/bidi/mirror.pyc -------------------------------------------------------------------------------- /bidi/algorithm.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HosseinRashno/Sublime-Text-2-BIDI/HEAD/bidi/algorithm.pyc -------------------------------------------------------------------------------- /Context.sublime-menu: -------------------------------------------------------------------------------- 1 | [ 2 | { "command": "bidi", "caption":"Bidirectional text" }, 3 | { "command": "bidiselection", "caption":"Bidirectional Selection" } 4 | ] 5 | -------------------------------------------------------------------------------- /Default (Windows).sublime-keymap: 
-------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "keys": ["ctrl+b"], 4 | "command": "bidi" 5 | }, 6 | { 7 | "keys": ["ctrl+u"], 8 | "command": "bidiselection" 9 | } 10 | ] -------------------------------------------------------------------------------- /Default.sublime-commands: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "caption": "Bidirectional text", 4 | "command": "run_bidi" 5 | }, 6 | { 7 | "caption": "Bidirectional selection", 8 | "command": "run_bidiselection" 9 | } 10 | ] -------------------------------------------------------------------------------- /Main.sublime-menu: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "caption": "Tools", 4 | "mnemonic": "t", 5 | "id": "tools", 6 | "children": 7 | [ 8 | {"caption":"-"}, 9 | { 10 | "caption": "Bidirectional text", 11 | "mnemonic": "B", 12 | "command": "bidi" 13 | }, 14 | { 15 | "caption": "Bidirectional selection", 16 | "mnemonic": "S", 17 | "command": "bidiselection" 18 | } 19 | ] 20 | } 21 | ] -------------------------------------------------------------------------------- /rtl.py: -------------------------------------------------------------------------------- 1 | import sublime, sublime_plugin, sys 2 | 3 | sys.path.append( 'bidi' ) 4 | try: 5 | 6 | # Python 3 7 | 8 | from .bidi.arabic_reshaper import reshape 9 | from .bidi.algorithm import get_display 10 | except ValueError: 11 | 12 | # Python 2 13 | 14 | from bidi.arabic_reshaper import reshape 15 | from bidi.algorithm import get_display 16 | 17 | class bidiCommand(sublime_plugin.TextCommand): 18 | def run(self, edit): 19 | region = sublime.Region(0, self.view.size()) 20 | bidiRegion(region, self.view, edit) 21 | 22 | class bidiselectionCommand(sublime_plugin.TextCommand): 23 | def run(self, edit): 24 | selectionSet = self.view.sel() 25 | for selectionRegion in selectionSet: 26 | 
bidiRegion(selectionRegion, self.view, edit) 27 | 28 | def bidiRegion(region, view, edit): 29 | txt = view.substr(region) 30 | reshaped_text = reshape(txt) 31 | bdiText = get_display(reshaped_text) 32 | view.replace(edit, region, bdiText) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Bidirectional text support for Sublime Text 3 2 | =================== 3 | 4 | Sublime Text 3 does not currently support bidirectional languages such as Arabic and Hebrew. With this plugin you can view bidirectional text. 5 | 6 | Please note, I don't know Arabic or Hebrew. I have checked the results by pattern matching. It's a starting point. 7 | 8 | 9 | 10 | Install 11 | ----------------- 12 | Clone it into the Sublime Text Packages directory. 13 | 14 | 15 | 16 | Set the font face to any font that supports Arabic (e.g. Arial) in your user settings.
17 |
18 | 19 | 20 | 21 | Usage 22 | ---------------------- 23 | Open a file. 24 | Enter text. 25 | Tools > Bidirectional text (ctrl+b) 26 | 27 | 28 | 29 | 30 | 31 | Command Accessibility 32 | ------------------- 33 | Tools > Bidirectional text 34 | Ctrl + B 35 | Right click > Bidirectional text 36 | 37 | Bug tracker 38 | ---------- 39 | Post an issue here on GitHub. 40 | https://github.com/praveenvijayan/Sublime-Text-2-BIDI/issues 41 | 42 | Resources 43 | ---------- 44 | http://www.decodize.com/html/sublime-text-2-bidirectional-language-support-plugin/ 45 | 46 | Twitter 47 | ------------------ 48 | Follow for updates: @praveen_vijaya 49 | 50 | Thanks 51 | ---- 52 | https://github.com/MeirKriheli/python-bidi
53 | https://github.com/mpcabd/python-arabic-reshaper 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /bidi/tests.py: -------------------------------------------------------------------------------- 1 | # This file is part of python-bidi 2 | # 3 | # python-bidi is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or 6 | # (at your option) any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, 9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | # GNU Lesser General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Lesser General Public License 14 | # along with this program. If not, see . 15 | 16 | # Copyright (C) 2008-2010 Yaacov Zamir , 17 | # Meir kriheli 18 | """BiDi algorithm unit tests""" 19 | 20 | import unittest 21 | from bidi.algorithm import get_display, get_empty_storage, get_embedding_levels 22 | 23 | class TestBidiAlgorithm(unittest.TestCase): 24 | "Tests the bidi algorithm (based on GNU fribidi ones)" 25 | 26 | def test_surrogate(self): 27 | """Test for storage and base levels in case of surrogate pairs""" 28 | 29 | storage = get_empty_storage() 30 | 31 | text = u'HELLO \U0001d7f612' 32 | get_embedding_levels(text, storage, upper_is_rtl=True) 33 | 34 | # should return 9, not 10 even in --with-unicode=ucs2 35 | self.assertEqual(len(storage['chars']), 9) 36 | 37 | # Is the expected result ? 
should be EN 38 | _ch = storage['chars'][6] 39 | self.assertEqual(_ch['ch'], u'\U0001d7f6') 40 | self.assertEqual(_ch['type'], 'EN') 41 | 42 | display = get_display(text, upper_is_rtl=True) 43 | self.assertEqual(display, u'\U0001d7f612 OLLEH') 44 | 45 | def test_implict_with_upper_is_rtl(self): 46 | '''Implicit tests''' 47 | 48 | tests = ( 49 | (u'car is THE CAR in arabic', u'car is RAC EHT in arabic'), 50 | (u'CAR IS the car IN ENGLISH', u'HSILGNE NI the car SI RAC'), 51 | (u'he said "IT IS 123, 456, OK"', u'he said "KO ,456 ,123 SI TI"'), 52 | (u'he said "IT IS (123, 456), OK"', u'he said "KO ,(456 ,123) SI TI"'), 53 | (u'he said "IT IS 123,456, OK"', u'he said "KO ,123,456 SI TI"'), 54 | (u'he said "IT IS (123,456), OK"', u'he said "KO ,(123,456) SI TI"'), 55 | (u'HE SAID "it is 123, 456, ok"', u'"it is 123, 456, ok" DIAS EH'), 56 | (u'shalom', u'<123H/>shalom<123H>'), 57 | (u'SAALAM', u'MALAAS'), 58 | (u'HE SAID "it is a car!" AND RAN', u'NAR DNA "!it is a car" DIAS EH'), 59 | (u'HE SAID "it is a car!x" AND RAN', u'NAR DNA "it is a car!x" DIAS EH'), 60 | (u'SOLVE 1*5 1-5 1/5 1+5', u'1+5 1/5 1-5 5*1 EVLOS'), 61 | (u'THE RANGE IS 2.5..5', u'5..2.5 SI EGNAR EHT'), 62 | (u'-2 CELSIUS IS COLD', u'DLOC SI SUISLEC 2-'), 63 | ) 64 | 65 | for storage, display in tests: 66 | self.assertEqual(get_display(storage, upper_is_rtl=True), display) 67 | 68 | def test_override_base_dir(self): 69 | """Tests overriding the base paragraph direction""" 70 | 71 | # normaly the display should be :MOLAHS be since we're overriding the 72 | # base dir the colon should be at the end of the display 73 | storage = u'SHALOM:' 74 | display = u'MOLAHS:' 75 | 76 | self.assertEqual(get_display(storage, upper_is_rtl=True, base_dir='L'), display) 77 | 78 | 79 | 80 | def test_output_encoding(self): 81 | """Make sure the display is in the same encdoing as the incoming text""" 82 | 83 | storage = '\xf9\xec\xe5\xed' # Hebrew word shalom in cp1255 84 | display = '\xed\xe5\xec\xf9' 85 | 86 | 
self.assertEqual(get_display(storage, encoding='cp1255'), display) 87 | 88 | 89 | def test_explicit_with_upper_is_rtl(self): 90 | """Explicit tests""" 91 | tests = ( 92 | (u'this is _LJUST_o', u'this is JUST'), 93 | (u'a _lsimple _RteST_o th_oat', u'a simple TSet that'), 94 | (u'HAS A _LPDF missing', u'PDF missing A SAH'), 95 | (u'AnD hOw_L AbOuT, 123,987 tHiS_o', u'w AbOuT, 123,987 tHiSOh DnA'), 96 | (u'a GOOD - _L_oTEST.', u'a TSET - DOOG.'), 97 | (u'here_L is_o_o_o _R a good one_o', u'here is eno doog a'), 98 | (u'THE _rbest _lONE and', u'best ENO and EHT'), 99 | (u'A REAL BIG_l_o BUG!', u'!GUB GIB LAER A'), 100 | (u'a _L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_Rbug', u'a gub'), 101 | (u'AN ARABIC _l_o 123-456 NICE ONE!', u'!ENO ECIN 456-123 CIBARA NA'), 102 | (u'AN ARABIC _l _o 123-456 PAIR', u'RIAP 123-456 CIBARA NA'), 103 | (u'this bug 67_r_o89 catched!', u'this bug 6789 catched!'), 104 | ) 105 | 106 | # adopt fribidi's CapRtl encoding 107 | mappings = { 108 | u'_>': u"\u200E", 109 | u'_<': u"\u200F", 110 | u'_l': u"\u202A", 111 | u'_r': u"\u202B", 112 | u'_o': u"\u202C", 113 | u'_L': u"\u202D", 114 | u'_R': u"\u202E", 115 | u'__': '_', 116 | } 117 | 118 | for storage, display in tests: 119 | for key, val in mappings.items(): 120 | storage = storage.replace(key, val) 121 | self.assertEqual(get_display(storage, upper_is_rtl=True), display) 122 | 123 | 124 | if __name__ == '__main__': 125 | unittest.main() 126 | -------------------------------------------------------------------------------- /bidi/arabic_reshaper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # This work is licensed under the GNU Public License (GPL). 
4 | # To view a copy of this license, visit http://www.gnu.org/copyleft/gpl.html 5 | 6 | # Written by Abd Allah Diab (mpcabd) 7 | # Email: mpcabd ^at^ gmail ^dot^ com 8 | # Website: http://mpcabd.igeex.biz 9 | 10 | # Ported and tweaked from Java to Python, from Better Arabic Reshaper [https://github.com/agawish/Better-Arabic-Reshaper/] 11 | 12 | import re 13 | 14 | DEFINED_CHARACTERS_ORGINAL_ALF_UPPER_MDD = u'\u0622' 15 | DEFINED_CHARACTERS_ORGINAL_ALF_UPPER_HAMAZA = u'\u0623' 16 | DEFINED_CHARACTERS_ORGINAL_ALF_LOWER_HAMAZA = u'\u0625' 17 | DEFINED_CHARACTERS_ORGINAL_ALF = u'\u0627' 18 | DEFINED_CHARACTERS_ORGINAL_LAM = u'\u0644' 19 | 20 | LAM_ALEF_GLYPHS = [ 21 | [u'\u3BA6', u'\uFEF6', u'\uFEF5'], 22 | [u'\u3BA7', u'\uFEF8', u'\uFEF7'], 23 | [u'\u0627', u'\uFEFC', u'\uFEFB'], 24 | [u'\u0625', u'\uFEFA', u'\uFEF9'] 25 | ] 26 | 27 | HARAKAT = [ 28 | u'\u0600', u'\u0601', u'\u0602', u'\u0603', u'\u0606', u'\u0607', u'\u0608', u'\u0609', 29 | u'\u060A', u'\u060B', u'\u060D', u'\u060E', u'\u0610', u'\u0611', u'\u0612', u'\u0613', 30 | u'\u0614', u'\u0615', u'\u0616', u'\u0617', u'\u0618', u'\u0619', u'\u061A', u'\u061B', 31 | u'\u061E', u'\u061F', u'\u0621', u'\u063B', u'\u063C', u'\u063D', u'\u063E', u'\u063F', 32 | u'\u0640', u'\u064B', u'\u064C', u'\u064D', u'\u064E', u'\u064F', u'\u0650', u'\u0651', 33 | u'\u0652', u'\u0653', u'\u0654', u'\u0655', u'\u0656', u'\u0657', u'\u0658', u'\u0659', 34 | u'\u065A', u'\u065B', u'\u065C', u'\u065D', u'\u065E', u'\u0660', u'\u066A', u'\u066B', 35 | u'\u066C', u'\u066F', u'\u0670', u'\u0672', u'\u06D4', u'\u06D5', u'\u06D6', u'\u06D7', 36 | u'\u06D8', u'\u06D9', u'\u06DA', u'\u06DB', u'\u06DC', u'\u06DF', u'\u06E0', u'\u06E1', 37 | u'\u06E2', u'\u06E3', u'\u06E4', u'\u06E5', u'\u06E6', u'\u06E7', u'\u06E8', u'\u06E9', 38 | u'\u06EA', u'\u06EB', u'\u06EC', u'\u06ED', u'\u06EE', u'\u06EF', u'\u06D6', u'\u06D7', 39 | u'\u06D8', u'\u06D9', u'\u06DA', u'\u06DB', u'\u06DC', u'\u06DD', u'\u06DE', u'\u06DF', 40 | u'\u06F0', u'\u06FD', 
u'\uFE70', u'\uFE71', u'\uFE72', u'\uFE73', u'\uFE74', u'\uFE75', 41 | u'\uFE76', u'\uFE77', u'\uFE78', u'\uFE79', u'\uFE7A', u'\uFE7B', u'\uFE7C', u'\uFE7D', 42 | u'\uFE7E', u'\uFE7F', u'\uFC5E', u'\uFC5F', u'\uFC60', u'\uFC61', u'\uFC62', u'\uFC63' 43 | ] 44 | 45 | ARABIC_GLYPHS = { 46 | u'\u0622' : [u'\u0622', u'\uFE81', u'\uFE81', u'\uFE82', u'\uFE82', 2], 47 | u'\u0623' : [u'\u0623', u'\uFE83', u'\uFE83', u'\uFE84', u'\uFE84', 2], 48 | u'\u0624' : [u'\u0624', u'\uFE85', u'\uFE85', u'\uFE86', u'\uFE86', 2], 49 | u'\u0625' : [u'\u0625', u'\uFE87', u'\uFE87', u'\uFE88', u'\uFE88', 2], 50 | u'\u0626' : [u'\u0626', u'\uFE89', u'\uFE8B', u'\uFE8C', u'\uFE8A', 4], 51 | u'\u0627' : [u'\u0627', u'\u0627', u'\u0627', u'\uFE8E', u'\uFE8E', 2], 52 | u'\u0628' : [u'\u0628', u'\uFE8F', u'\uFE91', u'\uFE92', u'\uFE90', 4], 53 | u'\u0629' : [u'\u0629', u'\uFE93', u'\uFE93', u'\uFE94', u'\uFE94', 2], 54 | u'\u062A' : [u'\u062A', u'\uFE95', u'\uFE97', u'\uFE98', u'\uFE96', 4], 55 | u'\u062B' : [u'\u062B', u'\uFE99', u'\uFE9B', u'\uFE9C', u'\uFE9A', 4], 56 | u'\u062C' : [u'\u062C', u'\uFE9D', u'\uFE9F', u'\uFEA0', u'\uFE9E', 4], 57 | u'\u062D' : [u'\u062D', u'\uFEA1', u'\uFEA3', u'\uFEA4', u'\uFEA2', 4], 58 | u'\u062E' : [u'\u062E', u'\uFEA5', u'\uFEA7', u'\uFEA8', u'\uFEA6', 4], 59 | u'\u062F' : [u'\u062F', u'\uFEA9', u'\uFEA9', u'\uFEAA', u'\uFEAA', 2], 60 | u'\u0630' : [u'\u0630', u'\uFEAB', u'\uFEAB', u'\uFEAC', u'\uFEAC', 2], 61 | u'\u0631' : [u'\u0631', u'\uFEAD', u'\uFEAD', u'\uFEAE', u'\uFEAE', 2], 62 | u'\u0632' : [u'\u0632', u'\uFEAF', u'\uFEAF', u'\uFEB0', u'\uFEB0', 2], 63 | u'\u0633' : [u'\u0633', u'\uFEB1', u'\uFEB3', u'\uFEB4', u'\uFEB2', 4], 64 | u'\u0634' : [u'\u0634', u'\uFEB5', u'\uFEB7', u'\uFEB8', u'\uFEB6', 4], 65 | u'\u0635' : [u'\u0635', u'\uFEB9', u'\uFEBB', u'\uFEBC', u'\uFEBA', 4], 66 | u'\u0636' : [u'\u0636', u'\uFEBD', u'\uFEBF', u'\uFEC0', u'\uFEBE', 4], 67 | u'\u0637' : [u'\u0637', u'\uFEC1', u'\uFEC3', u'\uFEC4', u'\uFEC2', 4], 68 | u'\u0638' : 
[u'\u0638', u'\uFEC5', u'\uFEC7', u'\uFEC8', u'\uFEC6', 4], 69 | u'\u0639' : [u'\u0639', u'\uFEC9', u'\uFECB', u'\uFECC', u'\uFECA', 4], 70 | u'\u063A' : [u'\u063A', u'\uFECD', u'\uFECF', u'\uFED0', u'\uFECE', 4], 71 | u'\u0641' : [u'\u0641', u'\uFED1', u'\uFED3', u'\uFED4', u'\uFED2', 4], 72 | u'\u0642' : [u'\u0642', u'\uFED5', u'\uFED7', u'\uFED8', u'\uFED6', 4], 73 | u'\u0643' : [u'\u0643', u'\uFED9', u'\uFEDB', u'\uFEDC', u'\uFEDA', 4], 74 | u'\u0644' : [u'\u0644', u'\uFEDD', u'\uFEDF', u'\uFEE0', u'\uFEDE', 4], 75 | u'\u0645' : [u'\u0645', u'\uFEE1', u'\uFEE3', u'\uFEE4', u'\uFEE2', 4], 76 | u'\u0646' : [u'\u0646', u'\uFEE5', u'\uFEE7', u'\uFEE8', u'\uFEE6', 4], 77 | u'\u0647' : [u'\u0647', u'\uFEE9', u'\uFEEB', u'\uFEEC', u'\uFEEA', 4], 78 | u'\u0648' : [u'\u0648', u'\uFEED', u'\uFEED', u'\uFEEE', u'\uFEEE', 2], 79 | u'\u0649' : [u'\u0649', u'\uFEEF', u'\uFEEF', u'\uFEF0', u'\uFEF0', 2], 80 | u'\u0671' : [u'\u0671', u'\u0671', u'\u0671', u'\uFB51', u'\uFB51', 2], 81 | u'\u064A' : [u'\u064A', u'\uFEF1', u'\uFEF3', u'\uFEF4', u'\uFEF2', 4], 82 | u'\u066E' : [u'\u066E', u'\uFBE4', u'\uFBE8', u'\uFBE9', u'\uFBE5', 4], 83 | u'\u06AA' : [u'\u06AA', u'\uFB8E', u'\uFB90', u'\uFB91', u'\uFB8F', 4], 84 | u'\u06C1' : [u'\u06C1', u'\uFBA6', u'\uFBA8', u'\uFBA9', u'\uFBA7', 4], 85 | u'\u06E4' : [u'\u06E4', u'\u06E4', u'\u06E4', u'\u06E4', u'\uFEEE', 2], 86 | u'\u067E' : [u'\u067E', u'\uFB56', u'\uFB58', u'\uFB59', u'\uFB57', 4], 87 | u'\u0698' : [u'\u0698', u'\uFB8A', u'\uFB8A', u'\uFB8B', u'\uFB8B', 2], 88 | u'\u06A9' : [u'\u06A9', u'\uFB8E', u'\uFB90', u'\uFB91', u'\uFB8F', 4], 89 | u'\u06AF' : [u'\u06AF', u'\uFB92', u'\uFB94', u'\uFB95', u'\uFB93', 4], 90 | u'\u06CC' : [u'\u06CC', u'\uFBFC', u'\uFBFE', u'\uFBFF', u'\uFBFD', 4], 91 | u'\u0686' : [u'\u0686', u'\uFB7A', u'\uFB7C', u'\uFB7D', u'\uFB7B', 4] 92 | } 93 | 94 | ARABIC_GLYPHS_LIST = [ 95 | [u'\u0622', u'\uFE81', u'\uFE81', u'\uFE82', u'\uFE82', 2], 96 | [u'\u0623', u'\uFE83', u'\uFE83', u'\uFE84', u'\uFE84', 2], 
97 | [u'\u0624', u'\uFE85', u'\uFE85', u'\uFE86', u'\uFE86', 2], 98 | [u'\u0625', u'\uFE87', u'\uFE87', u'\uFE88', u'\uFE88', 2], 99 | [u'\u0626', u'\uFE89', u'\uFE8B', u'\uFE8C', u'\uFE8A', 4], 100 | [u'\u0627', u'\u0627', u'\u0627', u'\uFE8E', u'\uFE8E', 2], 101 | [u'\u0628', u'\uFE8F', u'\uFE91', u'\uFE92', u'\uFE90', 4], 102 | [u'\u0629', u'\uFE93', u'\uFE93', u'\uFE94', u'\uFE94', 2], 103 | [u'\u062A', u'\uFE95', u'\uFE97', u'\uFE98', u'\uFE96', 4], 104 | [u'\u062B', u'\uFE99', u'\uFE9B', u'\uFE9C', u'\uFE9A', 4], 105 | [u'\u062C', u'\uFE9D', u'\uFE9F', u'\uFEA0', u'\uFE9E', 4], 106 | [u'\u062D', u'\uFEA1', u'\uFEA3', u'\uFEA4', u'\uFEA2', 4], 107 | [u'\u062E', u'\uFEA5', u'\uFEA7', u'\uFEA8', u'\uFEA6', 4], 108 | [u'\u062F', u'\uFEA9', u'\uFEA9', u'\uFEAA', u'\uFEAA', 2], 109 | [u'\u0630', u'\uFEAB', u'\uFEAB', u'\uFEAC', u'\uFEAC', 2], 110 | [u'\u0631', u'\uFEAD', u'\uFEAD', u'\uFEAE', u'\uFEAE', 2], 111 | [u'\u0632', u'\uFEAF', u'\uFEAF', u'\uFEB0', u'\uFEB0', 2], 112 | [u'\u0633', u'\uFEB1', u'\uFEB3', u'\uFEB4', u'\uFEB2', 4], 113 | [u'\u0634', u'\uFEB5', u'\uFEB7', u'\uFEB8', u'\uFEB6', 4], 114 | [u'\u0635', u'\uFEB9', u'\uFEBB', u'\uFEBC', u'\uFEBA', 4], 115 | [u'\u0636', u'\uFEBD', u'\uFEBF', u'\uFEC0', u'\uFEBE', 4], 116 | [u'\u0637', u'\uFEC1', u'\uFEC3', u'\uFEC4', u'\uFEC2', 4], 117 | [u'\u0638', u'\uFEC5', u'\uFEC7', u'\uFEC8', u'\uFEC6', 4], 118 | [u'\u0639', u'\uFEC9', u'\uFECB', u'\uFECC', u'\uFECA', 4], 119 | [u'\u063A', u'\uFECD', u'\uFECF', u'\uFED0', u'\uFECE', 4], 120 | [u'\u0641', u'\uFED1', u'\uFED3', u'\uFED4', u'\uFED2', 4], 121 | [u'\u0642', u'\uFED5', u'\uFED7', u'\uFED8', u'\uFED6', 4], 122 | [u'\u0643', u'\uFED9', u'\uFEDB', u'\uFEDC', u'\uFEDA', 4], 123 | [u'\u0644', u'\uFEDD', u'\uFEDF', u'\uFEE0', u'\uFEDE', 4], 124 | [u'\u0645', u'\uFEE1', u'\uFEE3', u'\uFEE4', u'\uFEE2', 4], 125 | [u'\u0646', u'\uFEE5', u'\uFEE7', u'\uFEE8', u'\uFEE6', 4], 126 | [u'\u0647', u'\uFEE9', u'\uFEEB', u'\uFEEC', u'\uFEEA', 4], 127 | [u'\u0648', 
u'\uFEED', u'\uFEED', u'\uFEEE', u'\uFEEE', 2], 128 | [u'\u0649', u'\uFEEF', u'\uFEEF', u'\uFEF0', u'\uFEF0', 2], 129 | [u'\u0671', u'\u0671', u'\u0671', u'\uFB51', u'\uFB51', 2], 130 | [u'\u064A', u'\uFEF1', u'\uFEF3', u'\uFEF4', u'\uFEF2', 4], 131 | [u'\u066E', u'\uFBE4', u'\uFBE8', u'\uFBE9', u'\uFBE5', 4], 132 | [u'\u06AA', u'\uFB8E', u'\uFB90', u'\uFB91', u'\uFB8F', 4], 133 | [u'\u06C1', u'\uFBA6', u'\uFBA8', u'\uFBA9', u'\uFBA7', 4], 134 | [u'\u067E', u'\uFB56', u'\uFB58', u'\uFB59', u'\uFB57', 4], 135 | [u'\u0698', u'\uFB8A', u'\uFB8A', u'\uFB8B', u'\uFB8B', 2], 136 | [u'\u06A9', u'\uFB8E', u'\uFB90', u'\uFB91', u'\uFB8F', 4], 137 | [u'\u06AF', u'\uFB92', u'\uFB94', u'\uFB95', u'\uFB93', 4], 138 | [u'\u06CC', u'\uFBFC', u'\uFBFE', u'\uFBFF', u'\uFBFD', 4], 139 | [u'\u0686', u'\uFB7A', u'\uFB7C', u'\uFB7D', u'\uFB7B', 4], 140 | ] 141 | 142 | def get_reshaped_glyph(target, location): 143 | if target in ARABIC_GLYPHS: 144 | return ARABIC_GLYPHS[target][location] 145 | else: 146 | return target 147 | 148 | def get_glyph_type(target): 149 | if target in ARABIC_GLYPHS: 150 | return ARABIC_GLYPHS[target][5] 151 | else: 152 | return 2 153 | 154 | def is_haraka(target): 155 | return target in HARAKAT 156 | 157 | def replace_lam_alef(unshaped_word): 158 | list_word = list(unshaped_word) 159 | letter_before = u'' 160 | for i in range(len(unshaped_word)): 161 | if not is_haraka(unshaped_word[i]) and unshaped_word[i] != DEFINED_CHARACTERS_ORGINAL_LAM: 162 | letter_before = unshaped_word[i] 163 | 164 | if unshaped_word[i] == DEFINED_CHARACTERS_ORGINAL_LAM: 165 | candidate_lam = unshaped_word[i] 166 | lam_position = i 167 | haraka_position = i + 1 168 | 169 | while haraka_position < len(unshaped_word) and is_haraka(unshaped_word[haraka_position]): 170 | haraka_position += 1 171 | 172 | if haraka_position < len(unshaped_word): 173 | if lam_position > 0 and get_glyph_type(letter_before) > 2: 174 | lam_alef = get_lam_alef(list_word[haraka_position], candidate_lam, False) 175 
| else: 176 | lam_alef = get_lam_alef(list_word[haraka_position], candidate_lam, True) 177 | if lam_alef != '': 178 | list_word[lam_position] = lam_alef 179 | list_word[haraka_position] = u' ' 180 | 181 | return u''.join(list_word).replace(u' ', u'') 182 | 183 | def get_lam_alef(candidate_alef, candidate_lam, is_end_of_word): 184 | shift_rate = 1 185 | reshaped_lam_alef = u'' 186 | if is_end_of_word: 187 | shift_rate += 1 188 | 189 | if DEFINED_CHARACTERS_ORGINAL_LAM == candidate_lam: 190 | if DEFINED_CHARACTERS_ORGINAL_ALF_UPPER_MDD == candidate_alef: 191 | reshaped_lam_alef = LAM_ALEF_GLYPHS[0][shift_rate] 192 | 193 | if DEFINED_CHARACTERS_ORGINAL_ALF_UPPER_HAMAZA == candidate_alef: 194 | reshaped_lam_alef = LAM_ALEF_GLYPHS[1][shift_rate] 195 | 196 | if DEFINED_CHARACTERS_ORGINAL_ALF == candidate_alef: 197 | reshaped_lam_alef = LAM_ALEF_GLYPHS[2][shift_rate] 198 | 199 | if DEFINED_CHARACTERS_ORGINAL_ALF_LOWER_HAMAZA == candidate_alef: 200 | reshaped_lam_alef = LAM_ALEF_GLYPHS[3][shift_rate] 201 | 202 | return reshaped_lam_alef 203 | 204 | class DecomposedWord(object): 205 | def __init__(self, word): 206 | self.stripped_harakat = [] 207 | self.harakat_positions = [] 208 | self.stripped_regular_letters = [] 209 | self.letters_position = [] 210 | 211 | for i in range(len(word)): 212 | c = word[i] 213 | if is_haraka(c): 214 | self.harakat_positions.append(i) 215 | self.stripped_harakat.append(c) 216 | else: 217 | self.letters_position.append(i) 218 | self.stripped_regular_letters.append(c) 219 | 220 | def reconstruct_word(self, reshaped_word): 221 | l = list(u'\0' * (len(self.stripped_harakat) + len(reshaped_word))) 222 | for i in range(len(self.letters_position)): 223 | l[self.letters_position[i]] = reshaped_word[i] 224 | for i in range(len(self.harakat_positions)): 225 | l[self.harakat_positions[i]] = self.stripped_harakat[i] 226 | return u''.join(l) 227 | 228 | def get_reshaped_word(unshaped_word): 229 | unshaped_word = replace_lam_alef(unshaped_word) 230 | 
decomposed_word = DecomposedWord(unshaped_word) 231 | result = u'' 232 | if decomposed_word.stripped_regular_letters: 233 | result = reshape_it(u''.join(decomposed_word.stripped_regular_letters)) 234 | return decomposed_word.reconstruct_word(result) 235 | 236 | def reshape_it(unshaped_word): 237 | if not unshaped_word: 238 | return u'' 239 | if len(unshaped_word) == 1: 240 | return get_reshaped_glyph(unshaped_word[0], 1) 241 | reshaped_word = [] 242 | for i in range(len(unshaped_word)): 243 | before = False 244 | after = False 245 | if i == 0: 246 | after = get_glyph_type(unshaped_word[i]) == 4 247 | elif i == len(unshaped_word) - 1: 248 | before = get_glyph_type(unshaped_word[i - 1]) == 4 249 | else: 250 | after = get_glyph_type(unshaped_word[i]) == 4 251 | before = get_glyph_type(unshaped_word[i - 1]) == 4 252 | if after and before: 253 | reshaped_word.append(get_reshaped_glyph(unshaped_word[i], 3)) 254 | elif after and not before: 255 | reshaped_word.append(get_reshaped_glyph(unshaped_word[i], 2)) 256 | elif not after and before: 257 | reshaped_word.append(get_reshaped_glyph(unshaped_word[i], 4)) 258 | elif not after and not before: 259 | reshaped_word.append(get_reshaped_glyph(unshaped_word[i], 1)) 260 | 261 | return u''.join(reshaped_word) 262 | 263 | 264 | def is_arabic_character(target): 265 | return target in ARABIC_GLYPHS or target in HARAKAT 266 | 267 | def get_words(sentence): 268 | if sentence: 269 | return re.split('\\s', sentence) 270 | return [] 271 | 272 | def has_arabic_letters(word): 273 | for c in word: 274 | if is_arabic_character(c): 275 | return True 276 | return False 277 | 278 | def is_arabic_word(word): 279 | for c in word: 280 | if not is_arabic_character(c): 281 | return False 282 | return True 283 | 284 | def get_words_from_mixed_word(word): 285 | temp_word = u'' 286 | words = [] 287 | for c in word: 288 | if is_arabic_character(c): 289 | if temp_word and not is_arabic_word(temp_word): 290 | words.append(temp_word) 291 | temp_word = c 
292 | else: 293 | temp_word += c 294 | else: 295 | if temp_word and is_arabic_word(temp_word): 296 | words.append(temp_word) 297 | temp_word = c 298 | else: 299 | temp_word += c 300 | if temp_word: 301 | words.append(temp_word) 302 | return words 303 | 304 | def reshape(text): 305 | if text: 306 | lines = re.split('\\r?\\n', text) 307 | for i in range(len(lines)): 308 | lines[i] = reshape_sentence(lines[i]) 309 | return u'\n'.join(lines) 310 | return u'' 311 | 312 | def reshape_sentence(sentence): 313 | words = get_words(sentence) 314 | for i in range(len(words)): 315 | word = words[i] 316 | if has_arabic_letters(word): 317 | if is_arabic_word(word): 318 | words[i] = get_reshaped_word(word) 319 | else: 320 | mixed_words = get_words_from_mixed_word(word) 321 | for j in range(len(mixed_words)): 322 | mixed_words[j] = get_reshaped_word(mixed_words[j]) 323 | words[i] = u''.join(mixed_words) 324 | return u' '.join(words) 325 | -------------------------------------------------------------------------------- /bidi/mirror.py: -------------------------------------------------------------------------------- 1 | # This file is part of python-bidi 2 | # 3 | # python-bidi is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or 6 | # (at your option) any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, 9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | # GNU Lesser General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Lesser General Public License 14 | # along with this program. If not, see . 
15 | 16 | # Copyright (C) 2008-2010 Yaacov Zamir , 17 | # Meir kriheli 18 | """Mirrored chars""" 19 | 20 | # Can't seem to get this data from python's unicode data, so this is imported 21 | # from http://www.unicode.org/Public/UNIDATA/BidiMirroring.txt 22 | MIRRORED = { 23 | u'\u0028': u'\u0029', # LEFT PARENTHESIS 24 | u'\u0029': u'\u0028', # RIGHT PARENTHESIS 25 | u'\u003C': u'\u003E', # LESS-THAN SIGN 26 | u'\u003E': u'\u003C', # GREATER-THAN SIGN 27 | u'\u005B': u'\u005D', # LEFT SQUARE BRACKET 28 | u'\u005D': u'\u005B', # RIGHT SQUARE BRACKET 29 | u'\u007B': u'\u007D', # LEFT CURLY BRACKET 30 | u'\u007D': u'\u007B', # RIGHT CURLY BRACKET 31 | u'\u00AB': u'\u00BB', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 32 | u'\u00BB': u'\u00AB', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 33 | u'\u0F3A': u'\u0F3B', # TIBETAN MARK GUG RTAGS GYON 34 | u'\u0F3B': u'\u0F3A', # TIBETAN MARK GUG RTAGS GYAS 35 | u'\u0F3C': u'\u0F3D', # TIBETAN MARK ANG KHANG GYON 36 | u'\u0F3D': u'\u0F3C', # TIBETAN MARK ANG KHANG GYAS 37 | u'\u169B': u'\u169C', # OGHAM FEATHER MARK 38 | u'\u169C': u'\u169B', # OGHAM REVERSED FEATHER MARK 39 | u'\u2039': u'\u203A', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK 40 | u'\u203A': u'\u2039', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 41 | u'\u2045': u'\u2046', # LEFT SQUARE BRACKET WITH QUILL 42 | u'\u2046': u'\u2045', # RIGHT SQUARE BRACKET WITH QUILL 43 | u'\u207D': u'\u207E', # SUPERSCRIPT LEFT PARENTHESIS 44 | u'\u207E': u'\u207D', # SUPERSCRIPT RIGHT PARENTHESIS 45 | u'\u208D': u'\u208E', # SUBSCRIPT LEFT PARENTHESIS 46 | u'\u208E': u'\u208D', # SUBSCRIPT RIGHT PARENTHESIS 47 | u'\u2208': u'\u220B', # ELEMENT OF 48 | u'\u2209': u'\u220C', # NOT AN ELEMENT OF 49 | u'\u220A': u'\u220D', # SMALL ELEMENT OF 50 | u'\u220B': u'\u2208', # CONTAINS AS MEMBER 51 | u'\u220C': u'\u2209', # DOES NOT CONTAIN AS MEMBER 52 | u'\u220D': u'\u220A', # SMALL CONTAINS AS MEMBER 53 | u'\u2215': u'\u29F5', # DIVISION SLASH 54 | u'\u223C': u'\u223D', # TILDE OPERATOR 
55 | u'\u223D': u'\u223C', # REVERSED TILDE 56 | u'\u2243': u'\u22CD', # ASYMPTOTICALLY EQUAL TO 57 | u'\u2252': u'\u2253', # APPROXIMATELY EQUAL TO OR THE IMAGE OF 58 | u'\u2253': u'\u2252', # IMAGE OF OR APPROXIMATELY EQUAL TO 59 | u'\u2254': u'\u2255', # COLON EQUALS 60 | u'\u2255': u'\u2254', # EQUALS COLON 61 | u'\u2264': u'\u2265', # LESS-THAN OR EQUAL TO 62 | u'\u2265': u'\u2264', # GREATER-THAN OR EQUAL TO 63 | u'\u2266': u'\u2267', # LESS-THAN OVER EQUAL TO 64 | u'\u2267': u'\u2266', # GREATER-THAN OVER EQUAL TO 65 | u'\u2268': u'\u2269', # [BEST FIT] LESS-THAN BUT NOT EQUAL TO 66 | u'\u2269': u'\u2268', # [BEST FIT] GREATER-THAN BUT NOT EQUAL TO 67 | u'\u226A': u'\u226B', # MUCH LESS-THAN 68 | u'\u226B': u'\u226A', # MUCH GREATER-THAN 69 | u'\u226E': u'\u226F', # [BEST FIT] NOT LESS-THAN 70 | u'\u226F': u'\u226E', # [BEST FIT] NOT GREATER-THAN 71 | u'\u2270': u'\u2271', # [BEST FIT] NEITHER LESS-THAN NOR EQUAL TO 72 | u'\u2271': u'\u2270', # [BEST FIT] NEITHER GREATER-THAN NOR EQUAL TO 73 | u'\u2272': u'\u2273', # [BEST FIT] LESS-THAN OR EQUIVALENT TO 74 | u'\u2273': u'\u2272', # [BEST FIT] GREATER-THAN OR EQUIVALENT TO 75 | u'\u2274': u'\u2275', # [BEST FIT] NEITHER LESS-THAN NOR EQUIVALENT TO 76 | u'\u2275': u'\u2274', # [BEST FIT] NEITHER GREATER-THAN NOR EQUIVALENT TO 77 | u'\u2276': u'\u2277', # LESS-THAN OR GREATER-THAN 78 | u'\u2277': u'\u2276', # GREATER-THAN OR LESS-THAN 79 | u'\u2278': u'\u2279', # [BEST FIT] NEITHER LESS-THAN NOR GREATER-THAN 80 | u'\u2279': u'\u2278', # [BEST FIT] NEITHER GREATER-THAN NOR LESS-THAN 81 | u'\u227A': u'\u227B', # PRECEDES 82 | u'\u227B': u'\u227A', # SUCCEEDS 83 | u'\u227C': u'\u227D', # PRECEDES OR EQUAL TO 84 | u'\u227D': u'\u227C', # SUCCEEDS OR EQUAL TO 85 | u'\u227E': u'\u227F', # [BEST FIT] PRECEDES OR EQUIVALENT TO 86 | u'\u227F': u'\u227E', # [BEST FIT] SUCCEEDS OR EQUIVALENT TO 87 | u'\u2280': u'\u2281', # [BEST FIT] DOES NOT PRECEDE 88 | u'\u2281': u'\u2280', # [BEST FIT] DOES NOT SUCCEED 89 | 
u'\u2282': u'\u2283', # SUBSET OF 90 | u'\u2283': u'\u2282', # SUPERSET OF 91 | u'\u2284': u'\u2285', # [BEST FIT] NOT A SUBSET OF 92 | u'\u2285': u'\u2284', # [BEST FIT] NOT A SUPERSET OF 93 | u'\u2286': u'\u2287', # SUBSET OF OR EQUAL TO 94 | u'\u2287': u'\u2286', # SUPERSET OF OR EQUAL TO 95 | u'\u2288': u'\u2289', # [BEST FIT] NEITHER A SUBSET OF NOR EQUAL TO 96 | u'\u2289': u'\u2288', # [BEST FIT] NEITHER A SUPERSET OF NOR EQUAL TO 97 | u'\u228A': u'\u228B', # [BEST FIT] SUBSET OF WITH NOT EQUAL TO 98 | u'\u228B': u'\u228A', # [BEST FIT] SUPERSET OF WITH NOT EQUAL TO 99 | u'\u228F': u'\u2290', # SQUARE IMAGE OF 100 | u'\u2290': u'\u228F', # SQUARE ORIGINAL OF 101 | u'\u2291': u'\u2292', # SQUARE IMAGE OF OR EQUAL TO 102 | u'\u2292': u'\u2291', # SQUARE ORIGINAL OF OR EQUAL TO 103 | u'\u2298': u'\u29B8', # CIRCLED DIVISION SLASH 104 | u'\u22A2': u'\u22A3', # RIGHT TACK 105 | u'\u22A3': u'\u22A2', # LEFT TACK 106 | u'\u22A6': u'\u2ADE', # ASSERTION 107 | u'\u22A8': u'\u2AE4', # TRUE 108 | u'\u22A9': u'\u2AE3', # FORCES 109 | u'\u22AB': u'\u2AE5', # DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE 110 | u'\u22B0': u'\u22B1', # PRECEDES UNDER RELATION 111 | u'\u22B1': u'\u22B0', # SUCCEEDS UNDER RELATION 112 | u'\u22B2': u'\u22B3', # NORMAL SUBGROUP OF 113 | u'\u22B3': u'\u22B2', # CONTAINS AS NORMAL SUBGROUP 114 | u'\u22B4': u'\u22B5', # NORMAL SUBGROUP OF OR EQUAL TO 115 | u'\u22B5': u'\u22B4', # CONTAINS AS NORMAL SUBGROUP OR EQUAL TO 116 | u'\u22B6': u'\u22B7', # ORIGINAL OF 117 | u'\u22B7': u'\u22B6', # IMAGE OF 118 | u'\u22C9': u'\u22CA', # LEFT NORMAL FACTOR SEMIDIRECT PRODUCT 119 | u'\u22CA': u'\u22C9', # RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT 120 | u'\u22CB': u'\u22CC', # LEFT SEMIDIRECT PRODUCT 121 | u'\u22CC': u'\u22CB', # RIGHT SEMIDIRECT PRODUCT 122 | u'\u22CD': u'\u2243', # REVERSED TILDE EQUALS 123 | u'\u22D0': u'\u22D1', # DOUBLE SUBSET 124 | u'\u22D1': u'\u22D0', # DOUBLE SUPERSET 125 | u'\u22D6': u'\u22D7', # LESS-THAN WITH DOT 126 | u'\u22D7': 
u'\u22D6', # GREATER-THAN WITH DOT 127 | u'\u22D8': u'\u22D9', # VERY MUCH LESS-THAN 128 | u'\u22D9': u'\u22D8', # VERY MUCH GREATER-THAN 129 | u'\u22DA': u'\u22DB', # LESS-THAN EQUAL TO OR GREATER-THAN 130 | u'\u22DB': u'\u22DA', # GREATER-THAN EQUAL TO OR LESS-THAN 131 | u'\u22DC': u'\u22DD', # EQUAL TO OR LESS-THAN 132 | u'\u22DD': u'\u22DC', # EQUAL TO OR GREATER-THAN 133 | u'\u22DE': u'\u22DF', # EQUAL TO OR PRECEDES 134 | u'\u22DF': u'\u22DE', # EQUAL TO OR SUCCEEDS 135 | u'\u22E0': u'\u22E1', # [BEST FIT] DOES NOT PRECEDE OR EQUAL 136 | u'\u22E1': u'\u22E0', # [BEST FIT] DOES NOT SUCCEED OR EQUAL 137 | u'\u22E2': u'\u22E3', # [BEST FIT] NOT SQUARE IMAGE OF OR EQUAL TO 138 | u'\u22E3': u'\u22E2', # [BEST FIT] NOT SQUARE ORIGINAL OF OR EQUAL TO 139 | u'\u22E4': u'\u22E5', # [BEST FIT] SQUARE IMAGE OF OR NOT EQUAL TO 140 | u'\u22E5': u'\u22E4', # [BEST FIT] SQUARE ORIGINAL OF OR NOT EQUAL TO 141 | u'\u22E6': u'\u22E7', # [BEST FIT] LESS-THAN BUT NOT EQUIVALENT TO 142 | u'\u22E7': u'\u22E6', # [BEST FIT] GREATER-THAN BUT NOT EQUIVALENT TO 143 | u'\u22E8': u'\u22E9', # [BEST FIT] PRECEDES BUT NOT EQUIVALENT TO 144 | u'\u22E9': u'\u22E8', # [BEST FIT] SUCCEEDS BUT NOT EQUIVALENT TO 145 | u'\u22EA': u'\u22EB', # [BEST FIT] NOT NORMAL SUBGROUP OF 146 | u'\u22EB': u'\u22EA', # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP 147 | u'\u22EC': u'\u22ED', # [BEST FIT] NOT NORMAL SUBGROUP OF OR EQUAL TO 148 | u'\u22ED': u'\u22EC', # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL 149 | u'\u22F0': u'\u22F1', # UP RIGHT DIAGONAL ELLIPSIS 150 | u'\u22F1': u'\u22F0', # DOWN RIGHT DIAGONAL ELLIPSIS 151 | u'\u22F2': u'\u22FA', # ELEMENT OF WITH LONG HORIZONTAL STROKE 152 | u'\u22F3': u'\u22FB', # ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE 153 | u'\u22F4': u'\u22FC', # SMALL ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE 154 | u'\u22F6': u'\u22FD', # ELEMENT OF WITH OVERBAR 155 | u'\u22F7': u'\u22FE', # SMALL ELEMENT OF WITH OVERBAR 156 | 
u'\u22FA': u'\u22F2', # CONTAINS WITH LONG HORIZONTAL STROKE 157 | u'\u22FB': u'\u22F3', # CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE 158 | u'\u22FC': u'\u22F4', # SMALL CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE 159 | u'\u22FD': u'\u22F6', # CONTAINS WITH OVERBAR 160 | u'\u22FE': u'\u22F7', # SMALL CONTAINS WITH OVERBAR 161 | u'\u2308': u'\u2309', # LEFT CEILING 162 | u'\u2309': u'\u2308', # RIGHT CEILING 163 | u'\u230A': u'\u230B', # LEFT FLOOR 164 | u'\u230B': u'\u230A', # RIGHT FLOOR 165 | u'\u2329': u'\u232A', # LEFT-POINTING ANGLE BRACKET 166 | u'\u232A': u'\u2329', # RIGHT-POINTING ANGLE BRACKET 167 | u'\u2768': u'\u2769', # MEDIUM LEFT PARENTHESIS ORNAMENT 168 | u'\u2769': u'\u2768', # MEDIUM RIGHT PARENTHESIS ORNAMENT 169 | u'\u276A': u'\u276B', # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT 170 | u'\u276B': u'\u276A', # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT 171 | u'\u276C': u'\u276D', # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT 172 | u'\u276D': u'\u276C', # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT 173 | u'\u276E': u'\u276F', # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT 174 | u'\u276F': u'\u276E', # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT 175 | u'\u2770': u'\u2771', # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT 176 | u'\u2771': u'\u2770', # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT 177 | u'\u2772': u'\u2773', # LIGHT LEFT TORTOISE SHELL BRACKET 178 | u'\u2773': u'\u2772', # LIGHT RIGHT TORTOISE SHELL BRACKET 179 | u'\u2774': u'\u2775', # MEDIUM LEFT CURLY BRACKET ORNAMENT 180 | u'\u2775': u'\u2774', # MEDIUM RIGHT CURLY BRACKET ORNAMENT 181 | u'\u27C3': u'\u27C4', # OPEN SUBSET 182 | u'\u27C4': u'\u27C3', # OPEN SUPERSET 183 | u'\u27C5': u'\u27C6', # LEFT S-SHAPED BAG DELIMITER 184 | u'\u27C6': u'\u27C5', # RIGHT S-SHAPED BAG DELIMITER 185 | u'\u27C8': u'\u27C9', # REVERSE SOLIDUS PRECEDING SUBSET 186 | u'\u27C9': u'\u27C8', # SUPERSET PRECEDING SOLIDUS 187 | u'\u27D5': u'\u27D6', # LEFT OUTER JOIN 188 | 
u'\u27D6': u'\u27D5', # RIGHT OUTER JOIN 189 | u'\u27DD': u'\u27DE', # LONG RIGHT TACK 190 | u'\u27DE': u'\u27DD', # LONG LEFT TACK 191 | u'\u27E2': u'\u27E3', # WHITE CONCAVE-SIDED DIAMOND WITH LEFTWARDS TICK 192 | u'\u27E3': u'\u27E2', # WHITE CONCAVE-SIDED DIAMOND WITH RIGHTWARDS TICK 193 | u'\u27E4': u'\u27E5', # WHITE SQUARE WITH LEFTWARDS TICK 194 | u'\u27E5': u'\u27E4', # WHITE SQUARE WITH RIGHTWARDS TICK 195 | u'\u27E6': u'\u27E7', # MATHEMATICAL LEFT WHITE SQUARE BRACKET 196 | u'\u27E7': u'\u27E6', # MATHEMATICAL RIGHT WHITE SQUARE BRACKET 197 | u'\u27E8': u'\u27E9', # MATHEMATICAL LEFT ANGLE BRACKET 198 | u'\u27E9': u'\u27E8', # MATHEMATICAL RIGHT ANGLE BRACKET 199 | u'\u27EA': u'\u27EB', # MATHEMATICAL LEFT DOUBLE ANGLE BRACKET 200 | u'\u27EB': u'\u27EA', # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET 201 | u'\u27EC': u'\u27ED', # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET 202 | u'\u27ED': u'\u27EC', # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET 203 | u'\u27EE': u'\u27EF', # MATHEMATICAL LEFT FLATTENED PARENTHESIS 204 | u'\u27EF': u'\u27EE', # MATHEMATICAL RIGHT FLATTENED PARENTHESIS 205 | u'\u2983': u'\u2984', # LEFT WHITE CURLY BRACKET 206 | u'\u2984': u'\u2983', # RIGHT WHITE CURLY BRACKET 207 | u'\u2985': u'\u2986', # LEFT WHITE PARENTHESIS 208 | u'\u2986': u'\u2985', # RIGHT WHITE PARENTHESIS 209 | u'\u2987': u'\u2988', # Z NOTATION LEFT IMAGE BRACKET 210 | u'\u2988': u'\u2987', # Z NOTATION RIGHT IMAGE BRACKET 211 | u'\u2989': u'\u298A', # Z NOTATION LEFT BINDING BRACKET 212 | u'\u298A': u'\u2989', # Z NOTATION RIGHT BINDING BRACKET 213 | u'\u298B': u'\u298C', # LEFT SQUARE BRACKET WITH UNDERBAR 214 | u'\u298C': u'\u298B', # RIGHT SQUARE BRACKET WITH UNDERBAR 215 | u'\u298D': u'\u2990', # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER 216 | u'\u298E': u'\u298F', # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 217 | u'\u298F': u'\u298E', # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 218 | u'\u2990': u'\u298D', # RIGHT SQUARE BRACKET WITH TICK 
IN TOP CORNER 219 | u'\u2991': u'\u2992', # LEFT ANGLE BRACKET WITH DOT 220 | u'\u2992': u'\u2991', # RIGHT ANGLE BRACKET WITH DOT 221 | u'\u2993': u'\u2994', # LEFT ARC LESS-THAN BRACKET 222 | u'\u2994': u'\u2993', # RIGHT ARC GREATER-THAN BRACKET 223 | u'\u2995': u'\u2996', # DOUBLE LEFT ARC GREATER-THAN BRACKET 224 | u'\u2996': u'\u2995', # DOUBLE RIGHT ARC LESS-THAN BRACKET 225 | u'\u2997': u'\u2998', # LEFT BLACK TORTOISE SHELL BRACKET 226 | u'\u2998': u'\u2997', # RIGHT BLACK TORTOISE SHELL BRACKET 227 | u'\u29B8': u'\u2298', # CIRCLED REVERSE SOLIDUS 228 | u'\u29C0': u'\u29C1', # CIRCLED LESS-THAN 229 | u'\u29C1': u'\u29C0', # CIRCLED GREATER-THAN 230 | u'\u29C4': u'\u29C5', # SQUARED RISING DIAGONAL SLASH 231 | u'\u29C5': u'\u29C4', # SQUARED FALLING DIAGONAL SLASH 232 | u'\u29CF': u'\u29D0', # LEFT TRIANGLE BESIDE VERTICAL BAR 233 | u'\u29D0': u'\u29CF', # VERTICAL BAR BESIDE RIGHT TRIANGLE 234 | u'\u29D1': u'\u29D2', # BOWTIE WITH LEFT HALF BLACK 235 | u'\u29D2': u'\u29D1', # BOWTIE WITH RIGHT HALF BLACK 236 | u'\u29D4': u'\u29D5', # TIMES WITH LEFT HALF BLACK 237 | u'\u29D5': u'\u29D4', # TIMES WITH RIGHT HALF BLACK 238 | u'\u29D8': u'\u29D9', # LEFT WIGGLY FENCE 239 | u'\u29D9': u'\u29D8', # RIGHT WIGGLY FENCE 240 | u'\u29DA': u'\u29DB', # LEFT DOUBLE WIGGLY FENCE 241 | u'\u29DB': u'\u29DA', # RIGHT DOUBLE WIGGLY FENCE 242 | u'\u29F5': u'\u2215', # REVERSE SOLIDUS OPERATOR 243 | u'\u29F8': u'\u29F9', # BIG SOLIDUS 244 | u'\u29F9': u'\u29F8', # BIG REVERSE SOLIDUS 245 | u'\u29FC': u'\u29FD', # LEFT-POINTING CURVED ANGLE BRACKET 246 | u'\u29FD': u'\u29FC', # RIGHT-POINTING CURVED ANGLE BRACKET 247 | u'\u2A2B': u'\u2A2C', # MINUS SIGN WITH FALLING DOTS 248 | u'\u2A2C': u'\u2A2B', # MINUS SIGN WITH RISING DOTS 249 | u'\u2A2D': u'\u2A2E', # PLUS SIGN IN LEFT HALF CIRCLE 250 | u'\u2A2E': u'\u2A2D', # PLUS SIGN IN RIGHT HALF CIRCLE 251 | u'\u2A34': u'\u2A35', # MULTIPLICATION SIGN IN LEFT HALF CIRCLE 252 | u'\u2A35': u'\u2A34', # MULTIPLICATION SIGN IN RIGHT 
HALF CIRCLE 253 | u'\u2A3C': u'\u2A3D', # INTERIOR PRODUCT 254 | u'\u2A3D': u'\u2A3C', # RIGHTHAND INTERIOR PRODUCT 255 | u'\u2A64': u'\u2A65', # Z NOTATION DOMAIN ANTIRESTRICTION 256 | u'\u2A65': u'\u2A64', # Z NOTATION RANGE ANTIRESTRICTION 257 | u'\u2A79': u'\u2A7A', # LESS-THAN WITH CIRCLE INSIDE 258 | u'\u2A7A': u'\u2A79', # GREATER-THAN WITH CIRCLE INSIDE 259 | u'\u2A7D': u'\u2A7E', # LESS-THAN OR SLANTED EQUAL TO 260 | u'\u2A7E': u'\u2A7D', # GREATER-THAN OR SLANTED EQUAL TO 261 | u'\u2A7F': u'\u2A80', # LESS-THAN OR SLANTED EQUAL TO WITH DOT INSIDE 262 | u'\u2A80': u'\u2A7F', # GREATER-THAN OR SLANTED EQUAL TO WITH DOT INSIDE 263 | u'\u2A81': u'\u2A82', # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE 264 | u'\u2A82': u'\u2A81', # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE 265 | u'\u2A83': u'\u2A84', # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE RIGHT 266 | u'\u2A84': u'\u2A83', # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE LEFT 267 | u'\u2A8B': u'\u2A8C', # LESS-THAN ABOVE DOUBLE-LINE EQUAL ABOVE GREATER-THAN 268 | u'\u2A8C': u'\u2A8B', # GREATER-THAN ABOVE DOUBLE-LINE EQUAL ABOVE LESS-THAN 269 | u'\u2A91': u'\u2A92', # LESS-THAN ABOVE GREATER-THAN ABOVE DOUBLE-LINE EQUAL 270 | u'\u2A92': u'\u2A91', # GREATER-THAN ABOVE LESS-THAN ABOVE DOUBLE-LINE EQUAL 271 | u'\u2A93': u'\u2A94', # LESS-THAN ABOVE SLANTED EQUAL ABOVE GREATER-THAN ABOVE SLANTED EQUAL 272 | u'\u2A94': u'\u2A93', # GREATER-THAN ABOVE SLANTED EQUAL ABOVE LESS-THAN ABOVE SLANTED EQUAL 273 | u'\u2A95': u'\u2A96', # SLANTED EQUAL TO OR LESS-THAN 274 | u'\u2A96': u'\u2A95', # SLANTED EQUAL TO OR GREATER-THAN 275 | u'\u2A97': u'\u2A98', # SLANTED EQUAL TO OR LESS-THAN WITH DOT INSIDE 276 | u'\u2A98': u'\u2A97', # SLANTED EQUAL TO OR GREATER-THAN WITH DOT INSIDE 277 | u'\u2A99': u'\u2A9A', # DOUBLE-LINE EQUAL TO OR LESS-THAN 278 | u'\u2A9A': u'\u2A99', # DOUBLE-LINE EQUAL TO OR GREATER-THAN 279 | u'\u2A9B': u'\u2A9C', # DOUBLE-LINE SLANTED EQUAL TO OR LESS-THAN 280 | u'\u2A9C': u'\u2A9B', # 
DOUBLE-LINE SLANTED EQUAL TO OR GREATER-THAN 281 | u'\u2AA1': u'\u2AA2', # DOUBLE NESTED LESS-THAN 282 | u'\u2AA2': u'\u2AA1', # DOUBLE NESTED GREATER-THAN 283 | u'\u2AA6': u'\u2AA7', # LESS-THAN CLOSED BY CURVE 284 | u'\u2AA7': u'\u2AA6', # GREATER-THAN CLOSED BY CURVE 285 | u'\u2AA8': u'\u2AA9', # LESS-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL 286 | u'\u2AA9': u'\u2AA8', # GREATER-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL 287 | u'\u2AAA': u'\u2AAB', # SMALLER THAN 288 | u'\u2AAB': u'\u2AAA', # LARGER THAN 289 | u'\u2AAC': u'\u2AAD', # SMALLER THAN OR EQUAL TO 290 | u'\u2AAD': u'\u2AAC', # LARGER THAN OR EQUAL TO 291 | u'\u2AAF': u'\u2AB0', # PRECEDES ABOVE SINGLE-LINE EQUALS SIGN 292 | u'\u2AB0': u'\u2AAF', # SUCCEEDS ABOVE SINGLE-LINE EQUALS SIGN 293 | u'\u2AB3': u'\u2AB4', # PRECEDES ABOVE EQUALS SIGN 294 | u'\u2AB4': u'\u2AB3', # SUCCEEDS ABOVE EQUALS SIGN 295 | u'\u2ABB': u'\u2ABC', # DOUBLE PRECEDES 296 | u'\u2ABC': u'\u2ABB', # DOUBLE SUCCEEDS 297 | u'\u2ABD': u'\u2ABE', # SUBSET WITH DOT 298 | u'\u2ABE': u'\u2ABD', # SUPERSET WITH DOT 299 | u'\u2ABF': u'\u2AC0', # SUBSET WITH PLUS SIGN BELOW 300 | u'\u2AC0': u'\u2ABF', # SUPERSET WITH PLUS SIGN BELOW 301 | u'\u2AC1': u'\u2AC2', # SUBSET WITH MULTIPLICATION SIGN BELOW 302 | u'\u2AC2': u'\u2AC1', # SUPERSET WITH MULTIPLICATION SIGN BELOW 303 | u'\u2AC3': u'\u2AC4', # SUBSET OF OR EQUAL TO WITH DOT ABOVE 304 | u'\u2AC4': u'\u2AC3', # SUPERSET OF OR EQUAL TO WITH DOT ABOVE 305 | u'\u2AC5': u'\u2AC6', # SUBSET OF ABOVE EQUALS SIGN 306 | u'\u2AC6': u'\u2AC5', # SUPERSET OF ABOVE EQUALS SIGN 307 | u'\u2ACD': u'\u2ACE', # SQUARE LEFT OPEN BOX OPERATOR 308 | u'\u2ACE': u'\u2ACD', # SQUARE RIGHT OPEN BOX OPERATOR 309 | u'\u2ACF': u'\u2AD0', # CLOSED SUBSET 310 | u'\u2AD0': u'\u2ACF', # CLOSED SUPERSET 311 | u'\u2AD1': u'\u2AD2', # CLOSED SUBSET OR EQUAL TO 312 | u'\u2AD2': u'\u2AD1', # CLOSED SUPERSET OR EQUAL TO 313 | u'\u2AD3': u'\u2AD4', # SUBSET ABOVE SUPERSET 314 | u'\u2AD4': u'\u2AD3', # SUPERSET ABOVE SUBSET 
315 | u'\u2AD5': u'\u2AD6', # SUBSET ABOVE SUBSET 316 | u'\u2AD6': u'\u2AD5', # SUPERSET ABOVE SUPERSET 317 | u'\u2ADE': u'\u22A6', # SHORT LEFT TACK 318 | u'\u2AE3': u'\u22A9', # DOUBLE VERTICAL BAR LEFT TURNSTILE 319 | u'\u2AE4': u'\u22A8', # VERTICAL BAR DOUBLE LEFT TURNSTILE 320 | u'\u2AE5': u'\u22AB', # DOUBLE VERTICAL BAR DOUBLE LEFT TURNSTILE 321 | u'\u2AEC': u'\u2AED', # DOUBLE STROKE NOT SIGN 322 | u'\u2AED': u'\u2AEC', # REVERSED DOUBLE STROKE NOT SIGN 323 | u'\u2AF7': u'\u2AF8', # TRIPLE NESTED LESS-THAN 324 | u'\u2AF8': u'\u2AF7', # TRIPLE NESTED GREATER-THAN 325 | u'\u2AF9': u'\u2AFA', # DOUBLE-LINE SLANTED LESS-THAN OR EQUAL TO 326 | u'\u2AFA': u'\u2AF9', # DOUBLE-LINE SLANTED GREATER-THAN OR EQUAL TO 327 | u'\u2E02': u'\u2E03', # LEFT SUBSTITUTION BRACKET 328 | u'\u2E03': u'\u2E02', # RIGHT SUBSTITUTION BRACKET 329 | u'\u2E04': u'\u2E05', # LEFT DOTTED SUBSTITUTION BRACKET 330 | u'\u2E05': u'\u2E04', # RIGHT DOTTED SUBSTITUTION BRACKET 331 | u'\u2E09': u'\u2E0A', # LEFT TRANSPOSITION BRACKET 332 | u'\u2E0A': u'\u2E09', # RIGHT TRANSPOSITION BRACKET 333 | u'\u2E0C': u'\u2E0D', # LEFT RAISED OMISSION BRACKET 334 | u'\u2E0D': u'\u2E0C', # RIGHT RAISED OMISSION BRACKET 335 | u'\u2E1C': u'\u2E1D', # LEFT LOW PARAPHRASE BRACKET 336 | u'\u2E1D': u'\u2E1C', # RIGHT LOW PARAPHRASE BRACKET 337 | u'\u2E20': u'\u2E21', # LEFT VERTICAL BAR WITH QUILL 338 | u'\u2E21': u'\u2E20', # RIGHT VERTICAL BAR WITH QUILL 339 | u'\u2E22': u'\u2E23', # TOP LEFT HALF BRACKET 340 | u'\u2E23': u'\u2E22', # TOP RIGHT HALF BRACKET 341 | u'\u2E24': u'\u2E25', # BOTTOM LEFT HALF BRACKET 342 | u'\u2E25': u'\u2E24', # BOTTOM RIGHT HALF BRACKET 343 | u'\u2E26': u'\u2E27', # LEFT SIDEWAYS U BRACKET 344 | u'\u2E27': u'\u2E26', # RIGHT SIDEWAYS U BRACKET 345 | u'\u2E28': u'\u2E29', # LEFT DOUBLE PARENTHESIS 346 | u'\u2E29': u'\u2E28', # RIGHT DOUBLE PARENTHESIS 347 | u'\u3008': u'\u3009', # LEFT ANGLE BRACKET 348 | u'\u3009': u'\u3008', # RIGHT ANGLE BRACKET 349 | u'\u300A': u'\u300B', # 
LEFT DOUBLE ANGLE BRACKET 350 | u'\u300B': u'\u300A', # RIGHT DOUBLE ANGLE BRACKET 351 | u'\u300C': u'\u300D', # [BEST FIT] LEFT CORNER BRACKET 352 | u'\u300D': u'\u300C', # [BEST FIT] RIGHT CORNER BRACKET 353 | u'\u300E': u'\u300F', # [BEST FIT] LEFT WHITE CORNER BRACKET 354 | u'\u300F': u'\u300E', # [BEST FIT] RIGHT WHITE CORNER BRACKET 355 | u'\u3010': u'\u3011', # LEFT BLACK LENTICULAR BRACKET 356 | u'\u3011': u'\u3010', # RIGHT BLACK LENTICULAR BRACKET 357 | u'\u3014': u'\u3015', # LEFT TORTOISE SHELL BRACKET 358 | u'\u3015': u'\u3014', # RIGHT TORTOISE SHELL BRACKET 359 | u'\u3016': u'\u3017', # LEFT WHITE LENTICULAR BRACKET 360 | u'\u3017': u'\u3016', # RIGHT WHITE LENTICULAR BRACKET 361 | u'\u3018': u'\u3019', # LEFT WHITE TORTOISE SHELL BRACKET 362 | u'\u3019': u'\u3018', # RIGHT WHITE TORTOISE SHELL BRACKET 363 | u'\u301A': u'\u301B', # LEFT WHITE SQUARE BRACKET 364 | u'\u301B': u'\u301A', # RIGHT WHITE SQUARE BRACKET 365 | u'\uFE59': u'\uFE5A', # SMALL LEFT PARENTHESIS 366 | u'\uFE5A': u'\uFE59', # SMALL RIGHT PARENTHESIS 367 | u'\uFE5B': u'\uFE5C', # SMALL LEFT CURLY BRACKET 368 | u'\uFE5C': u'\uFE5B', # SMALL RIGHT CURLY BRACKET 369 | u'\uFE5D': u'\uFE5E', # SMALL LEFT TORTOISE SHELL BRACKET 370 | u'\uFE5E': u'\uFE5D', # SMALL RIGHT TORTOISE SHELL BRACKET 371 | u'\uFE64': u'\uFE65', # SMALL LESS-THAN SIGN 372 | u'\uFE65': u'\uFE64', # SMALL GREATER-THAN SIGN 373 | u'\uFF08': u'\uFF09', # FULLWIDTH LEFT PARENTHESIS 374 | u'\uFF09': u'\uFF08', # FULLWIDTH RIGHT PARENTHESIS 375 | u'\uFF1C': u'\uFF1E', # FULLWIDTH LESS-THAN SIGN 376 | u'\uFF1E': u'\uFF1C', # FULLWIDTH GREATER-THAN SIGN 377 | u'\uFF3B': u'\uFF3D', # FULLWIDTH LEFT SQUARE BRACKET 378 | u'\uFF3D': u'\uFF3B', # FULLWIDTH RIGHT SQUARE BRACKET 379 | u'\uFF5B': u'\uFF5D', # FULLWIDTH LEFT CURLY BRACKET 380 | u'\uFF5D': u'\uFF5B', # FULLWIDTH RIGHT CURLY BRACKET 381 | u'\uFF5F': u'\uFF60', # FULLWIDTH LEFT WHITE PARENTHESIS 382 | u'\uFF60': u'\uFF5F', # FULLWIDTH RIGHT WHITE PARENTHESIS 383 | 
u'\uFF62': u'\uFF63', # [BEST FIT] HALFWIDTH LEFT CORNER BRACKET 384 | u'\uFF63': u'\uFF62', # [BEST FIT] HALFWIDTH RIGHT CORNER BRACKET 385 | } 386 | -------------------------------------------------------------------------------- /bidi/algorithm.py: -------------------------------------------------------------------------------- 1 | # This file is part of python-bidi 2 | # 3 | # python-bidi is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or 6 | # (at your option) any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, 9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | # GNU Lesser General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Lesser General Public License 14 | # along with this program. If not, see . 

# Copyright (C) 2008-2010 Yaacov Zamir,
#                         Meir kriheli
"bidirectional algorithm implementation"

from unicodedata import bidirectional, mirrored
import inspect
import sys
from collections import deque

try:

    # Python 3 (module loaded as part of its package)

    from .mirror import MIRRORED
except (ImportError, SystemError, ValueError):

    # Python 2 raises ValueError for a relative import outside a package;
    # Python 3 raises ImportError (SystemError on 3.3 - 3.5). In every case
    # fall back to the absolute import.
    from bidi.mirror import MIRRORED

# Some definitions
PARAGRAPH_LEVELS = { 'L':0, 'AL':1, 'R': 1 }
EXPLICIT_LEVEL_LIMIT = 62

# Smallest odd / even level strictly greater than x (used by X2 - X5)
_LEAST_GREATER_ODD = lambda x: (x + 1) | 1
_LEAST_GREATER_EVEN = lambda x: (x + 2) & ~1

# bidi type -> (function computing the new embedding level, override type)
X2_X5_MAPPINGS = {
    'RLE': (_LEAST_GREATER_ODD, 'N'),
    'LRE': (_LEAST_GREATER_EVEN, 'N'),
    'RLO': (_LEAST_GREATER_ODD, 'R'),
    'LRO': (_LEAST_GREATER_EVEN, 'L'),
}

# Added 'B' so X6 won't execute in that case and X8 will run its course
X6_IGNORED = list(X2_X5_MAPPINGS.keys()) + ['BN', 'PDF', 'B']
X9_REMOVED = list(X2_X5_MAPPINGS.keys()) + ['BN', 'PDF']

# Even levels are left-to-right, odd levels are right-to-left
_embedding_direction = lambda x:('L', 'R')[x % 2]

_IS_UCS2 = sys.maxunicode == 65535
_SURROGATE_MIN, _SURROGATE_MAX = 55296, 56319 # D800, DBFF

def debug_storage(storage, base_info=False, chars=True, runs=False):
    """Display debug information for the storage on sys.stderr.

    `storage` is the dict built by `get_empty_storage`.

    `base_info` prints the base level and direction, `runs` prints the
    level runs and `chars` prints the characters with their resolved
    levels and types.

    """
    if sys.version_info[0] >= 3:
        # Python 3: sys.stderr is a text stream, wrapping it in a codecs
        # writer would hand it bytes and fail.
        stderr = sys.stderr
    else:
        import codecs
        import locale

        stderr = codecs.getwriter(locale.getpreferredencoding())(sys.stderr)

    # name of the function that asked for the debug output
    caller = inspect.stack()[1][3]
    stderr.write('in %s\n' % caller)

    # NOTE(review): label strings are reproduced as found; confirm the
    # intended column widths if aligned output matters.
    if base_info:
        stderr.write(u' base level : %d\n' % storage['base_level'])
        stderr.write(u' base dir : %s\n' % storage['base_dir'])

    if runs:
        stderr.write(u' runs : %s\n' % list(storage['runs']))

    if chars:
        output = u' Chars : '
        for _ch in storage['chars']:
            # show a placeholder instead of breaking the debug line
            if _ch['ch'] != '\n':
                output += _ch['ch']
            else:
                output += 'C'
        stderr.write(output + u'\n')

        # u'%s' formatting works on both Python 2 and 3 (no `unicode` name)
        output = u' Res. levels : %s\n' % u''.join(
            [u'%s' % (_ch['level'],) for _ch in storage['chars']])
        stderr.write(output)

        # types are up to 3 letters long, print them vertically, one row
        # per letter
        _types = [_ch['type'].ljust(3) for _ch in storage['chars']]

        for i in range(3):
            if i:
                output = u' %s\n'
            else:
                output = u' Res. types : %s\n'
            stderr.write(output % u''.join([_t[i] for _t in _types]))


def get_base_level(text, upper_is_rtl=False):
    """Get the paragraph base embedding level. Returns 0 for LTR,
    1 for RTL.

    `text` a unicode object.

    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
    for debugging (default: False).

    """

    base_level = None

    prev_surrogate = False
    # P2
    for _ch in text:
        # surrogate in case of ucs2
        if _IS_UCS2 and (_SURROGATE_MIN <= ord(_ch) <= _SURROGATE_MAX):
            prev_surrogate = _ch
            continue
        elif prev_surrogate:
            _ch = prev_surrogate + _ch
            prev_surrogate = False

        # treat upper as RTL ?
        if upper_is_rtl and _ch.isupper():
            base_level = 1
            break

        bidi_type = bidirectional(_ch)

        if bidi_type in ('AL', 'R'):
            base_level = 1
            break

        elif bidi_type == 'L':
            base_level = 0
            break

    # P3
    if base_level is None:
        base_level = 0

    return base_level

def get_embedding_levels(text, storage, upper_is_rtl=False, debug=False):
    """Fill storage['chars'] with one entry per character of `text`.

    Each entry is a dict holding the character ('ch'), its level (preset
    to storage['base_level']), its bidi type ('type') and a copy of that
    type ('orig').

    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'.

    """

    prev_surrogate = False
    base_level = storage['base_level']

    # preset the storage's chars
    for _ch in text:
        # surrogate in case of ucs2: join it with the following char
        if _IS_UCS2 and (_SURROGATE_MIN <= ord(_ch) <= _SURROGATE_MAX):
            prev_surrogate = _ch
            continue
        elif prev_surrogate:
            _ch = prev_surrogate + _ch
            prev_surrogate = False

        if upper_is_rtl and _ch.isupper():
            bidi_type = 'R'
        else:
            bidi_type = bidirectional(_ch)
        storage['chars'].append({'ch':_ch, 'level':base_level, 'type':bidi_type,
                                 'orig':bidi_type})
    if debug:
        debug_storage(storage, base_info=True)

def explicit_embed_and_overrides(storage, debug=False):
    """Apply X1 to X9 rules of the unicode algorithm.

    See http://unicode.org/reports/tr9/#Explicit_Levels_and_Directions

    Updates the level and type of the entries in storage['chars'], removes
    the explicit formatting characters and recalculates storage['runs'].

    """
    overflow_counter = almost_overflow_counter = 0
    directional_override = 'N'
    levels = deque()

    #X1
    embedding_level = storage['base_level']

    for _ch in storage['chars']:
        bidi_type = _ch['type']

        level_func, override = X2_X5_MAPPINGS.get(bidi_type, (None, None))

        if level_func:
            # So this is X2 to X5
            # if we've past EXPLICIT_LEVEL_LIMIT, note it and do nothing

            if overflow_counter != 0:
                overflow_counter += 1
                continue

            new_level = level_func(embedding_level)
            if new_level < EXPLICIT_LEVEL_LIMIT:
                levels.append( (embedding_level, directional_override) )
                embedding_level, directional_override = new_level, override

            elif embedding_level == EXPLICIT_LEVEL_LIMIT -2:
                # The new level is invalid, but a valid level can still be
                # achieved if this level is 60 and we encounter an RLE or
                # RLO further on. So record that we 'almost' overflowed.
                almost_overflow_counter += 1

            else:
                overflow_counter += 1
        else:
            # X6
            if bidi_type not in X6_IGNORED:
                _ch['level'] = embedding_level
                if directional_override != 'N':
                    _ch['type'] = directional_override

            # X7
            elif bidi_type == 'PDF':
                if overflow_counter:
                    overflow_counter -= 1
                elif almost_overflow_counter and \
                        embedding_level != EXPLICIT_LEVEL_LIMIT - 1:
                    almost_overflow_counter -= 1
                elif levels:
                    embedding_level, directional_override = levels.pop()

            # X8
            elif bidi_type == 'B':
                levels.clear()
                overflow_counter = almost_overflow_counter = 0
                embedding_level = _ch['level'] = storage['base_level']
                directional_override = 'N'

    #Removes the explicit embeds and overrides of types
    #RLE, LRE, RLO, LRO, PDF, and BN. Adjusts extended chars
    #next and prev as well

    #Applies X9. See http://unicode.org/reports/tr9/#X9
    storage['chars'] = [_ch for _ch in storage['chars']\
                        if _ch['type'] not in X9_REMOVED]

    calc_level_runs(storage)

    if debug:
        debug_storage(storage, runs=True)

def calc_level_runs(storage):
    """Split the storage to run of char types at the same level.

    Applies X10. See http://unicode.org/reports/tr9/#X10

    Rebuilds storage['runs']; each run is a dict with 'sor', 'eor',
    'start', 'length' and 'type' (the type of the last char of the run).

    """
    #run level depends on the higher of the two levels on either side of
    #the boundary If the higher level is odd, the type is R; otherwise,
    #it is L

    storage['runs'].clear()
    chars = storage['chars']

    #empty string ?
    if not chars:
        return

    calc_level_run = lambda b_l, b_r: ['L', 'R'][max(b_l, b_r) % 2]

    first_char = chars[0]

    sor = calc_level_run(storage['base_level'], first_char['level'])
    eor = None

    run_start = run_length = 0

    prev_level, prev_type = first_char['level'], first_char['type']

    for _ch in chars:
        curr_level, curr_type = _ch['level'], _ch['type']

        if curr_level == prev_level:
            run_length += 1
        else:
            # level changed: close the current run and open a new one
            eor = calc_level_run(prev_level, curr_level)
            storage['runs'].append({'sor':sor, 'eor':eor, 'start':run_start,
                                    'type': prev_type,'length': run_length})
            sor = eor
            run_start += run_length
            run_length = 1

        prev_level, prev_type = curr_level, curr_type

    # for the last char/runlevel
    eor = calc_level_run(curr_level, storage['base_level'])
    storage['runs'].append({'sor':sor, 'eor':eor, 'start':run_start,
                            'type':curr_type, 'length': run_length})

def resolve_weak_types(storage, debug=False):
    """Resolve weak type rules W1 - W7.

    See: http://unicode.org/reports/tr9/#Resolving_Weak_Types

    Works run by run and updates the 'type' of the entries in
    storage['chars'] in place.

    """

    for run in storage['runs']:
        prev_strong = prev_type = run['sor']
        start, length = run['start'], run['length']
        # the slice is a new list, but holds the same char dicts
        chars = storage['chars'][start:start+length]
        for _ch in chars:
            # W1. Examine each nonspacing mark (NSM) in the level run, and
            # change the type of the NSM to the type of the previous character.
            # If the NSM is at the start of the level run, it will get the type
            # of sor.
            bidi_type = _ch['type']

            if bidi_type == 'NSM':
                _ch['type'] = bidi_type = prev_type

            # W2. Search backward from each instance of a European number until
            # the first strong type (R, L, AL, or sor) is found. If an AL is
            # found, change the type of the European number to Arabic number.
            if bidi_type == 'EN' and prev_strong == 'AL':
                _ch['type'] = 'AN'

            # update prev_strong if needed
            if bidi_type in ('R', 'L', 'AL'):
                prev_strong = bidi_type

            prev_type = _ch['type']

        # W3. Change all ALs to R
        for _ch in chars:
            if _ch['type'] == 'AL':
                _ch['type'] = 'R'

        # W4. A single European separator between two European numbers changes
        # to a European number. A single common separator between two numbers of
        # the same type changes to that type.
        for idx in range(1, len(chars) -1 ):
            bidi_type = chars[idx]['type']
            prev_type = chars[idx-1]['type']
            next_type = chars[idx+1]['type']

            if bidi_type == 'ES' and (prev_type == next_type == 'EN'):
                chars[idx]['type'] = 'EN'

            if bidi_type == 'CS' and prev_type == next_type and \
                    prev_type in ('AN', 'EN'):
                chars[idx]['type'] = prev_type


        # W5. A sequence of European terminators adjacent to European numbers
        # changes to all European numbers.
        for idx in range(len(chars)):
            if chars[idx]['type'] == 'EN':
                for et_idx in range(idx-1, -1, -1):
                    if chars[et_idx]['type'] == 'ET':
                        chars[et_idx]['type'] = 'EN'
                    else:
                        break
                for et_idx in range(idx+1, len(chars)):
                    if chars[et_idx]['type'] == 'ET':
                        chars[et_idx]['type'] = 'EN'
                    else:
                        break

        # W6. Otherwise, separators and terminators change to Other Neutral.
        for _ch in chars:
            if _ch['type'] in ('ET', 'ES', 'CS'):
                _ch['type'] = 'ON'

        # W7. Search backward from each instance of a European number until the
        # first strong type (R, L, or sor) is found. If an L is found, then
        # change the type of the European number to L.
        prev_strong = run['sor']
        for _ch in chars:
            if _ch['type'] == 'EN' and prev_strong == 'L':
                _ch['type'] = 'L'

            if _ch['type'] in ('L', 'R'):
                prev_strong = _ch['type']

    if debug:
        debug_storage(storage, runs=True)

def resolve_neutral_types(storage, debug=False):
    """Resolving neutral types. Implements N1 and N2

    See: http://unicode.org/reports/tr9/#Resolving_Neutral_Types

    Works run by run and updates the 'type' of the neutral entries in
    storage['chars'] in place.

    """

    for run in storage['runs']:
        start, length = run['start'], run['length']
        # use sor and eor
        chars = [{'type':run['sor']}] + storage['chars'][start:start+length] +\
                [{'type':run['eor']}]
        total_chars = len(chars)

        seq_start = None
        for idx in range(total_chars):
            _ch = chars[idx]
            if _ch['type'] in ('B', 'S', 'WS', 'ON'):
                # N1. A sequence of neutrals takes the direction of the
                # surrounding strong text if the text on both sides has the same
                # direction. European and Arabic numbers act as if they were R
                # in terms of their influence on neutrals. Start-of-level-run
                # (sor) and end-of-level-run (eor) are used at level run
                # boundaries.
                if seq_start is None:
                    seq_start = idx
                    prev_bidi_type = chars[idx-1]['type']
            else:
                if seq_start is not None:
                    # first non neutral char after a sequence of neutrals
                    next_bidi_type = chars[idx]['type']

                    if prev_bidi_type in ('AN', 'EN'):
                        prev_bidi_type = 'R'

                    if next_bidi_type in ('AN', 'EN'):
                        next_bidi_type = 'R'

                    for seq_idx in range(seq_start, idx):
                        if prev_bidi_type == next_bidi_type:
                            chars[seq_idx]['type'] = prev_bidi_type
                        else:
                            # N2. Any remaining neutrals take the embedding
                            # direction. The embedding direction for the given
                            # neutral character is derived from its embedding
                            # level: L if the character is set to an even level,
                            # and R if the level is odd.
                            chars[seq_idx]['type'] = \
                                _embedding_direction(chars[seq_idx]['level'])

                    seq_start = None

    if debug:
        debug_storage(storage)

def resolve_implicit_levels(storage, debug=False):
    """Resolving implicit levels (I1, I2)

    See: http://unicode.org/reports/tr9/#Resolving_Implicit_Levels

    Raises the 'level' of the entries in storage['chars'] in place,
    according to their resolved type.

    """
    for run in storage['runs']:
        start, length = run['start'], run['length']
        chars = storage['chars'][start:start+length]

        for _ch in chars:
            # only those types are allowed at this stage
            assert _ch['type'] in ('L', 'R', 'EN', 'AN'),\
                    '%s not allowed here' % _ch['type']

            if _embedding_direction(_ch['level']) == 'L':
                # I1. For all characters with an even (left-to-right) embedding
                # direction, those of type R go up one level and those of type
                # AN or EN go up two levels.
                if _ch['type'] == 'R':
                    _ch['level'] += 1
                elif _ch['type'] != 'L':
                    _ch['level'] += 2
            else:
                # I2. For all characters with an odd (right-to-left) embedding
                # direction, those of type L, EN or AN go up one level.
                if _ch['type'] != 'R':
                    _ch['level'] += 1

    if debug:
        debug_storage(storage, runs=True)

def reverse_contiguous_sequence(chars, line_start, line_end, highest_level,
                                lowest_odd_level):
    """L2. From the highest level found in the text to the lowest odd
    level on each line, including intermediate levels not actually
    present in the text, reverse any contiguous sequence of characters
    that are at that level or higher.

    `chars` is reordered in place, only the entries between `line_start`
    and `line_end` (both inclusive) are examined.

    """
    for level in range(highest_level, lowest_odd_level - 1, -1):
        _start = _end = None

        for run_idx in range(line_start, line_end + 1):
            if chars[run_idx]['level'] >= level:
                if _start is None:
                    _start = run_idx
                _end = run_idx
            elif _start is not None:
                # The sequence ended on the previous char. Test against
                # None and not truthiness: a sequence made only of index 0
                # must be closed as well, otherwise it is merged with the
                # next sequence across the lower level chars in between.
                chars[_start:_end + 1] = reversed(chars[_start:_end + 1])
                _start = _end = None

        # anything remaining ?
        if _start is not None:
            chars[_start:_end + 1] = reversed(chars[_start:_end + 1])


def reorder_resolved_levels(storage, debug=False):
    """L1 and L2 rules

    Resets the levels of separators and trailing whitespace (L1), then
    reorders storage['chars'] in place, line by line (L2).

    """

    # Applies L1.

    should_reset = True
    chars = storage['chars']

    for _ch in chars[::-1]:
        # L1. On each line, reset the embedding level of the following
        # characters to the paragraph embedding level:
        if _ch['orig'] in ('B', 'S'):
            # 1. Segment separators,
            # 2. Paragraph separators,
            _ch['level'] = storage['base_level']
            should_reset = True
        elif should_reset and _ch['orig'] in ('BN', 'WS'):
            # 3. Any sequence of whitespace characters preceding a segment
            # separator or paragraph separator
            # 4. Any sequence of white space characters at the end of the
            # line.
            _ch['level'] = storage['base_level']
        else:
            should_reset = False

    max_len = len(chars)

    # L2 should be per line
    # Calculates highest level and lowest odd level on the fly.

    line_start = line_end = 0
    highest_level = 0
    lowest_odd_level = EXPLICIT_LEVEL_LIMIT

    for idx in range(max_len):
        _ch = chars[idx]

        # calc the levels
        char_level = _ch['level']
        if char_level > highest_level:
            highest_level = char_level

        if char_level % 2 and char_level < lowest_odd_level:
            lowest_odd_level = char_level

        if _ch['orig'] == 'B' or idx == max_len -1:
            line_end = idx
            # omit line breaks
            if _ch['orig'] == 'B':
                line_end -= 1

            reverse_contiguous_sequence(chars, line_start, line_end,
                                        highest_level, lowest_odd_level)

            # reset for next line run
            line_start = idx+1
            highest_level = 0
            lowest_odd_level = EXPLICIT_LEVEL_LIMIT

    if debug:
        debug_storage(storage)


def apply_mirroring(storage, debug=False):
    """Applies L4: mirroring

    See: http://unicode.org/reports/tr9/#L4

    Replaces the 'ch' of the mirrored entries in storage['chars'] using
    the MIRRORED mapping; chars missing from the mapping are left as is.

    """
    # L4. A character is depicted by a mirrored glyph if and only if (a) the
    # resolved directionality of that character is R, and (b) the
    # Bidi_Mirrored property value of that character is true.
    for _ch in storage['chars']:
        unichar = _ch['ch']
        if mirrored(unichar) and \
                _embedding_direction(_ch['level']) == 'R':
            _ch['ch'] = MIRRORED.get(unichar, unichar)

    if debug:
        debug_storage(storage)

def get_empty_storage():
    """Return an empty storage skeleton, usable for testing"""
    return {
        'base_level': None,
        'base_dir' : None,
        'chars': [],
        'runs' : deque(),
    }


def get_display(unicode_or_str, encoding='utf-8', upper_is_rtl=False,
                base_dir=None, debug=False):
    """Accepts unicode or string. In case it's a string, `encoding`
    is needed as it works on unicode ones (default:"utf-8").

    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
    for debugging (default: False).

    Set `base_dir` to 'L' or 'R' to override the calculated base_level.

    Set `debug` to True to display (using sys.stderr) the steps taken with the
    algorithm.

    Returns the display layout, either as unicode or `encoding` encoded
    string.

    """
    storage = get_empty_storage()

    # utf-8 ?
we need unicode 609 | if isinstance(unicode_or_str, str): 610 | text = unicode_or_str 611 | decoded = False 612 | else: 613 | text = unicode_or_str.decode(encoding) 614 | decoded = True 615 | 616 | if base_dir is None: 617 | base_level = get_base_level(text, upper_is_rtl) 618 | else: 619 | base_level = PARAGRAPH_LEVELS[base_dir] 620 | 621 | storage['base_level'] = base_level 622 | storage['base_dir'] = ('L', 'R')[base_level] 623 | 624 | get_embedding_levels(text, storage, upper_is_rtl, debug) 625 | explicit_embed_and_overrides(storage, debug) 626 | resolve_weak_types(storage, debug) 627 | resolve_neutral_types(storage, debug) 628 | resolve_implicit_levels(storage, debug) 629 | reorder_resolved_levels(storage, debug) 630 | apply_mirroring(storage, debug) 631 | 632 | chars = storage['chars'] 633 | display = u''.join([_ch['ch'] for _ch in chars]) 634 | 635 | if decoded: 636 | return display.encode(encoding) 637 | else: 638 | return display 639 | --------------------------------------------------------------------------------