├── bidi
├── __init__.py
├── mirror.pyc
├── algorithm.pyc
├── tests.py
├── arabic_reshaper.py
├── mirror.py
└── algorithm.py
├── Default (Linux).sublime-keymap
├── Default (OSX).sublime-keymap
├── Context.sublime-menu
├── Default (Windows).sublime-keymap
├── Default.sublime-commands
├── Main.sublime-menu
├── rtl.py
└── README.md
/bidi/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Default (Linux).sublime-keymap:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "keys": ["ctrl+b"],
4 | "command": "bidi"
5 | }
6 | ]
--------------------------------------------------------------------------------
/Default (OSX).sublime-keymap:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "keys": ["ctrl+b"],
4 | "command": "bidi"
5 | }
6 | ]
--------------------------------------------------------------------------------
/bidi/mirror.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HosseinRashno/Sublime-Text-2-BIDI/HEAD/bidi/mirror.pyc
--------------------------------------------------------------------------------
/bidi/algorithm.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HosseinRashno/Sublime-Text-2-BIDI/HEAD/bidi/algorithm.pyc
--------------------------------------------------------------------------------
/Context.sublime-menu:
--------------------------------------------------------------------------------
1 | [
2 | { "command": "bidi", "caption":"Bidirectional text" },
3 | { "command": "bidiselection", "caption":"Bidirectional Selection" }
4 | ]
5 |
--------------------------------------------------------------------------------
/Default (Windows).sublime-keymap:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "keys": ["ctrl+b"],
4 | "command": "bidi"
5 | },
6 | {
7 | "keys": ["ctrl+u"],
8 | "command": "bidiselection"
9 | }
10 | ]
--------------------------------------------------------------------------------
/Default.sublime-commands:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "caption": "Bidirectional text",
4 | "command": "bidi"
5 | },
6 | {
7 | "caption": "Bidirectional selection",
8 | "command": "bidiselection"
9 | }
10 | ]
--------------------------------------------------------------------------------
/Main.sublime-menu:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "caption": "Tools",
4 | "mnemonic": "t",
5 | "id": "tools",
6 | "children":
7 | [
8 | {"caption":"-"},
9 | {
10 | "caption": "Bidirectional text",
11 | "mnemonic": "B",
12 | "command": "bidi"
13 | },
14 | {
15 | "caption": "Bidirectional selection",
16 | "mnemonic": "S",
17 | "command": "bidiselection"
18 | }
19 | ]
20 | }
21 | ]
--------------------------------------------------------------------------------
/rtl.py:
--------------------------------------------------------------------------------
import sublime, sublime_plugin, sys

# Extra search-path entry used by the absolute-import fallback below.
sys.path.append( 'bidi' )
try:

    # Python 3: the plugin is imported as part of a package, so the bundled
    # modules are reached through explicit relative imports.

    from .bidi.arabic_reshaper import reshape
    from .bidi.algorithm import get_display
except (ValueError, ImportError, SystemError):

    # Python 2, or any interpreter where this module has no parent package.
    # A failed relative import surfaces as ValueError on Python 2,
    # SystemError on Python 3.3-3.5 and ImportError on Python 3.6+, so all
    # three must lead to the absolute-import fallback.

    from bidi.arabic_reshaper import reshape
    from bidi.algorithm import get_display
16 |
class bidiCommand(sublime_plugin.TextCommand):
    """Text command that applies the bidi conversion to the whole buffer."""

    def run(self, edit):
        """Convert every character of the view using the given *edit*."""
        # A region from offset 0 to the view's size spans all of its text.
        whole_buffer = sublime.Region(0, self.view.size())
        bidiRegion(whole_buffer, self.view, edit)
21 |
class bidiselectionCommand(sublime_plugin.TextCommand):
    """Text command that applies the bidi conversion to each selection."""

    def run(self, edit):
        """Convert the text of every selected region using *edit*."""
        # The converted text can differ in length from the original (for
        # example lam + alef collapse into a single ligature).  Working from
        # the end of the buffer towards its start guarantees that a
        # replacement never shifts the offsets of a region that is still
        # waiting to be processed.
        regions = sorted(
            self.view.sel(),
            key=lambda region: region.begin(),
            reverse=True,
        )
        for selectionRegion in regions:
            bidiRegion(selectionRegion, self.view, edit)
27 |
def bidiRegion(region, view, edit):
    """Replace the text of *region* in *view* with its display form.

    The region's text is passed through ``reshape`` and the result through
    ``get_display``; the outcome overwrites the region via *edit*.
    """
    display_text = get_display(reshape(view.substr(region)))
    view.replace(edit, region, display_text)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Bidirectional text support for Sublime Text 3
2 | ===================
3 |
4 | Currently, Sublime Text 3 does not support bidirectional languages such as Arabic and Hebrew. With this plugin you can view bidirectional text.
5 |
6 | Please note that I don't know Arabic or Hebrew; I have checked the results by pattern matching. It's a starting point.
7 |
8 |
9 |
10 | Install
11 | -----------------
12 | Clone it into Sublime Package directory.
13 |
14 |
15 |
16 | Set Font face to any Arabic supporting font (Arial) in user settings.
17 |
18 |
19 |
20 |
21 | Usage
22 | ----------------------
23 | Open file.
24 | Enter text
25 | Tools > Bidirectional text (ctrl+b)
26 |
27 |
28 |
29 |
30 |
31 | Command Accessibility
32 | -------------------
33 | Tools > Bidirectional text
34 | Ctrl + B
35 | Right click > Bidirectional text
36 |
37 | Bug tracker
38 | ----------
39 | Post an issue here on Github.
40 | https://github.com/praveenvijayan/Sublime-Text-2-BIDI/issues
41 |
42 | Resources
43 | ----------
44 | http://www.decodize.com/html/sublime-text-2-bidirectional-language-support-plugin/
45 |
46 | Twitter
47 | ------------------
48 | Follow for updates : @praveen_vijaya
49 |
50 | Thanks
51 | ----
52 | https://github.com/MeirKriheli/python-bidi
53 | https://github.com/mpcabd/python-arabic-reshaper
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/bidi/tests.py:
--------------------------------------------------------------------------------
1 | # This file is part of python-bidi
2 | #
3 | # python-bidi is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU Lesser General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU Lesser General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 |
16 | # Copyright (C) 2008-2010 Yaacov Zamir ,
17 | # Meir kriheli
18 | """BiDi algorithm unit tests"""
19 |
20 | import unittest
21 | from bidi.algorithm import get_display, get_empty_storage, get_embedding_levels
22 |
class TestBidiAlgorithm(unittest.TestCase):
    "Tests the bidi algorithm (based on GNU fribidi ones)"

    def test_surrogate(self):
        """Test for storage and base levels in case of surrogate pairs"""

        storage = get_empty_storage()

        text = u'HELLO \U0001d7f612'
        get_embedding_levels(text, storage, upper_is_rtl=True)

        # should return 9, not 10 even in --with-unicode=ucs2
        self.assertEqual(len(storage['chars']), 9)

        # Is the expected result ? should be EN
        _ch = storage['chars'][6]
        self.assertEqual(_ch['ch'], u'\U0001d7f6')
        self.assertEqual(_ch['type'], 'EN')

        display = get_display(text, upper_is_rtl=True)
        self.assertEqual(display, u'\U0001d7f612 OLLEH')

    def test_implict_with_upper_is_rtl(self):
        '''Implicit tests'''

        # Each pair is (logical input, expected display output).
        tests = (
            (u'car is THE CAR in arabic', u'car is RAC EHT in arabic'),
            (u'CAR IS the car IN ENGLISH', u'HSILGNE NI the car SI RAC'),
            (u'he said "IT IS 123, 456, OK"', u'he said "KO ,456 ,123 SI TI"'),
            (u'he said "IT IS (123, 456), OK"', u'he said "KO ,(456 ,123) SI TI"'),
            (u'he said "IT IS 123,456, OK"', u'he said "KO ,123,456 SI TI"'),
            (u'he said "IT IS (123,456), OK"', u'he said "KO ,(123,456) SI TI"'),
            (u'HE SAID "it is 123, 456, ok"', u'"it is 123, 456, ok" DIAS EH'),
            # The input needs the <H123> markup around the word: the expected
            # display contains the reordered tags, which a bare u'shalom'
            # input could never produce.
            (u'<H123>shalom</H123>', u'<123H/>shalom<123H>'),
            (u'SAALAM', u'MALAAS'),
            (u'HE SAID "it is a car!" AND RAN', u'NAR DNA "!it is a car" DIAS EH'),
            (u'HE SAID "it is a car!x" AND RAN', u'NAR DNA "it is a car!x" DIAS EH'),
            (u'SOLVE 1*5 1-5 1/5 1+5', u'1+5 1/5 1-5 5*1 EVLOS'),
            (u'THE RANGE IS 2.5..5', u'5..2.5 SI EGNAR EHT'),
            (u'-2 CELSIUS IS COLD', u'DLOC SI SUISLEC 2-'),
        )

        for storage, display in tests:
            self.assertEqual(get_display(storage, upper_is_rtl=True), display)

    def test_override_base_dir(self):
        """Tests overriding the base paragraph direction"""

        # normally the display should be :MOLAHS, but since we're overriding
        # the base dir the colon should be at the end of the display
        storage = u'SHALOM:'
        display = u'MOLAHS:'

        self.assertEqual(get_display(storage, upper_is_rtl=True, base_dir='L'), display)

    def test_output_encoding(self):
        """Make sure the display is in the same encoding as the incoming text"""

        # NOTE(review): these literals are byte strings only on Python 2;
        # confirm how get_display treats them before running on Python 3.
        storage = '\xf9\xec\xe5\xed'  # Hebrew word shalom in cp1255
        display = '\xed\xe5\xec\xf9'

        self.assertEqual(get_display(storage, encoding='cp1255'), display)

    def test_explicit_with_upper_is_rtl(self):
        """Explicit tests"""
        # Each pair is (input using the escape codes mapped below, expected
        # display output).
        tests = (
            (u'this is _LJUST_o', u'this is JUST'),
            (u'a _lsimple _RteST_o th_oat', u'a simple TSet that'),
            (u'HAS A _LPDF missing', u'PDF missing A SAH'),
            (u'AnD hOw_L AbOuT, 123,987 tHiS_o', u'w AbOuT, 123,987 tHiSOh DnA'),
            (u'a GOOD - _L_oTEST.', u'a TSET - DOOG.'),
            (u'here_L is_o_o_o _R a good one_o', u'here is eno doog a'),
            (u'THE _rbest _lONE and', u'best ENO and EHT'),
            (u'A REAL BIG_l_o BUG!', u'!GUB GIB LAER A'),
            (u'a _L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_L_Rbug', u'a gub'),
            (u'AN ARABIC _l_o 123-456 NICE ONE!', u'!ENO ECIN 456-123 CIBARA NA'),
            (u'AN ARABIC _l _o 123-456 PAIR', u'RIAP 123-456 CIBARA NA'),
            (u'this bug 67_r_o89 catched!', u'this bug 6789 catched!'),
        )

        # adopt fribidi's CapRtl encoding
        mappings = {
            u'_>': u"\u200E",
            u'_<': u"\u200F",
            u'_l': u"\u202A",
            u'_r': u"\u202B",
            u'_o': u"\u202C",
            u'_L': u"\u202D",
            u'_R': u"\u202E",
            u'__': '_',
        }

        for storage, display in tests:
            # Expand every escape code into the character it stands for.
            for key, val in mappings.items():
                storage = storage.replace(key, val)
            self.assertEqual(get_display(storage, upper_is_rtl=True), display)


if __name__ == '__main__':
    unittest.main()
126 |
--------------------------------------------------------------------------------
/bidi/arabic_reshaper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # This work is licensed under the GNU Public License (GPL).
4 | # To view a copy of this license, visit http://www.gnu.org/copyleft/gpl.html
5 |
6 | # Written by Abd Allah Diab (mpcabd)
7 | # Email: mpcabd ^at^ gmail ^dot^ com
8 | # Website: http://mpcabd.igeex.biz
9 |
10 | # Ported and tweaked from Java to Python, from Better Arabic Reshaper [https://github.com/agawish/Better-Arabic-Reshaper/]
11 |
12 | import re
13 |
# Code points of the letters that can take part in a lam-alef ligature.
DEFINED_CHARACTERS_ORGINAL_ALF_UPPER_MDD = u'\u0622'
DEFINED_CHARACTERS_ORGINAL_ALF_UPPER_HAMAZA = u'\u0623'
DEFINED_CHARACTERS_ORGINAL_ALF_LOWER_HAMAZA = u'\u0625'
DEFINED_CHARACTERS_ORGINAL_ALF = u'\u0627'
DEFINED_CHARACTERS_ORGINAL_LAM = u'\u0644'

# One row per alef variant: [alef, ligature at index 1, ligature at index 2].
# get_lam_alef() returns index 2 when its is_end_of_word argument is true and
# index 1 otherwise; index 0 is never read by it.  Index 0 of the first two
# rows used to hold u'\u3BA6' / u'\u3BA7' (decimal 15270 / 15271), typos for
# the alef code points 1570 / 1571, i.e. u'\u0622' / u'\u0623'.
LAM_ALEF_GLYPHS = [
    [u'\u0622', u'\uFEF6', u'\uFEF5'],
    [u'\u0623', u'\uFEF8', u'\uFEF7'],
    [u'\u0627', u'\uFEFC', u'\uFEFB'],
    [u'\u0625', u'\uFEFA', u'\uFEF9']
]
26 |
# Characters that is_haraka() reports as harakat.  DecomposedWord strips them
# from a word before shaping and puts them back at their original positions
# afterwards; is_arabic_character() also accepts them.
# The list is only used for membership tests, so the entries that appear
# twice (u'\u06D6'-u'\u06DC' and u'\u06DF') have no effect.
27 | HARAKAT = [
28 | u'\u0600', u'\u0601', u'\u0602', u'\u0603', u'\u0606', u'\u0607', u'\u0608', u'\u0609',
29 | u'\u060A', u'\u060B', u'\u060D', u'\u060E', u'\u0610', u'\u0611', u'\u0612', u'\u0613',
30 | u'\u0614', u'\u0615', u'\u0616', u'\u0617', u'\u0618', u'\u0619', u'\u061A', u'\u061B',
31 | u'\u061E', u'\u061F', u'\u0621', u'\u063B', u'\u063C', u'\u063D', u'\u063E', u'\u063F',
32 | u'\u0640', u'\u064B', u'\u064C', u'\u064D', u'\u064E', u'\u064F', u'\u0650', u'\u0651',
33 | u'\u0652', u'\u0653', u'\u0654', u'\u0655', u'\u0656', u'\u0657', u'\u0658', u'\u0659',
34 | u'\u065A', u'\u065B', u'\u065C', u'\u065D', u'\u065E', u'\u0660', u'\u066A', u'\u066B',
35 | u'\u066C', u'\u066F', u'\u0670', u'\u0672', u'\u06D4', u'\u06D5', u'\u06D6', u'\u06D7',
36 | u'\u06D8', u'\u06D9', u'\u06DA', u'\u06DB', u'\u06DC', u'\u06DF', u'\u06E0', u'\u06E1',
37 | u'\u06E2', u'\u06E3', u'\u06E4', u'\u06E5', u'\u06E6', u'\u06E7', u'\u06E8', u'\u06E9',
38 | u'\u06EA', u'\u06EB', u'\u06EC', u'\u06ED', u'\u06EE', u'\u06EF', u'\u06D6', u'\u06D7',
39 | u'\u06D8', u'\u06D9', u'\u06DA', u'\u06DB', u'\u06DC', u'\u06DD', u'\u06DE', u'\u06DF',
40 | u'\u06F0', u'\u06FD', u'\uFE70', u'\uFE71', u'\uFE72', u'\uFE73', u'\uFE74', u'\uFE75',
41 | u'\uFE76', u'\uFE77', u'\uFE78', u'\uFE79', u'\uFE7A', u'\uFE7B', u'\uFE7C', u'\uFE7D',
42 | u'\uFE7E', u'\uFE7F', u'\uFC5E', u'\uFC5F', u'\uFC60', u'\uFC61', u'\uFC62', u'\uFC63'
43 | ]
44 |
# Shaping table keyed by base letter.  Each value is a six-item row:
#   [0]   the base letter itself (same as the key)
#   [1-4] the glyph get_reshaped_glyph() returns for that index; reshape_it()
#         asks for 1 when the letter joins on neither side, 2 when it joins
#         only the following letter, 3 when it joins both neighbours and
#         4 when it joins only the preceding letter
#   [5]   the value returned by get_glyph_type(): 4 makes reshape_it() treat
#         the letter as joining the letter after it, 2 does not
# Characters without a row are left unchanged and get type 2.
45 | ARABIC_GLYPHS = {
46 | u'\u0622' : [u'\u0622', u'\uFE81', u'\uFE81', u'\uFE82', u'\uFE82', 2],
47 | u'\u0623' : [u'\u0623', u'\uFE83', u'\uFE83', u'\uFE84', u'\uFE84', 2],
48 | u'\u0624' : [u'\u0624', u'\uFE85', u'\uFE85', u'\uFE86', u'\uFE86', 2],
49 | u'\u0625' : [u'\u0625', u'\uFE87', u'\uFE87', u'\uFE88', u'\uFE88', 2],
50 | u'\u0626' : [u'\u0626', u'\uFE89', u'\uFE8B', u'\uFE8C', u'\uFE8A', 4],
51 | u'\u0627' : [u'\u0627', u'\u0627', u'\u0627', u'\uFE8E', u'\uFE8E', 2],
52 | u'\u0628' : [u'\u0628', u'\uFE8F', u'\uFE91', u'\uFE92', u'\uFE90', 4],
53 | u'\u0629' : [u'\u0629', u'\uFE93', u'\uFE93', u'\uFE94', u'\uFE94', 2],
54 | u'\u062A' : [u'\u062A', u'\uFE95', u'\uFE97', u'\uFE98', u'\uFE96', 4],
55 | u'\u062B' : [u'\u062B', u'\uFE99', u'\uFE9B', u'\uFE9C', u'\uFE9A', 4],
56 | u'\u062C' : [u'\u062C', u'\uFE9D', u'\uFE9F', u'\uFEA0', u'\uFE9E', 4],
57 | u'\u062D' : [u'\u062D', u'\uFEA1', u'\uFEA3', u'\uFEA4', u'\uFEA2', 4],
58 | u'\u062E' : [u'\u062E', u'\uFEA5', u'\uFEA7', u'\uFEA8', u'\uFEA6', 4],
59 | u'\u062F' : [u'\u062F', u'\uFEA9', u'\uFEA9', u'\uFEAA', u'\uFEAA', 2],
60 | u'\u0630' : [u'\u0630', u'\uFEAB', u'\uFEAB', u'\uFEAC', u'\uFEAC', 2],
61 | u'\u0631' : [u'\u0631', u'\uFEAD', u'\uFEAD', u'\uFEAE', u'\uFEAE', 2],
62 | u'\u0632' : [u'\u0632', u'\uFEAF', u'\uFEAF', u'\uFEB0', u'\uFEB0', 2],
63 | u'\u0633' : [u'\u0633', u'\uFEB1', u'\uFEB3', u'\uFEB4', u'\uFEB2', 4],
64 | u'\u0634' : [u'\u0634', u'\uFEB5', u'\uFEB7', u'\uFEB8', u'\uFEB6', 4],
65 | u'\u0635' : [u'\u0635', u'\uFEB9', u'\uFEBB', u'\uFEBC', u'\uFEBA', 4],
66 | u'\u0636' : [u'\u0636', u'\uFEBD', u'\uFEBF', u'\uFEC0', u'\uFEBE', 4],
67 | u'\u0637' : [u'\u0637', u'\uFEC1', u'\uFEC3', u'\uFEC4', u'\uFEC2', 4],
68 | u'\u0638' : [u'\u0638', u'\uFEC5', u'\uFEC7', u'\uFEC8', u'\uFEC6', 4],
69 | u'\u0639' : [u'\u0639', u'\uFEC9', u'\uFECB', u'\uFECC', u'\uFECA', 4],
70 | u'\u063A' : [u'\u063A', u'\uFECD', u'\uFECF', u'\uFED0', u'\uFECE', 4],
71 | u'\u0641' : [u'\u0641', u'\uFED1', u'\uFED3', u'\uFED4', u'\uFED2', 4],
72 | u'\u0642' : [u'\u0642', u'\uFED5', u'\uFED7', u'\uFED8', u'\uFED6', 4],
73 | u'\u0643' : [u'\u0643', u'\uFED9', u'\uFEDB', u'\uFEDC', u'\uFEDA', 4],
74 | u'\u0644' : [u'\u0644', u'\uFEDD', u'\uFEDF', u'\uFEE0', u'\uFEDE', 4],
75 | u'\u0645' : [u'\u0645', u'\uFEE1', u'\uFEE3', u'\uFEE4', u'\uFEE2', 4],
76 | u'\u0646' : [u'\u0646', u'\uFEE5', u'\uFEE7', u'\uFEE8', u'\uFEE6', 4],
77 | u'\u0647' : [u'\u0647', u'\uFEE9', u'\uFEEB', u'\uFEEC', u'\uFEEA', 4],
78 | u'\u0648' : [u'\u0648', u'\uFEED', u'\uFEED', u'\uFEEE', u'\uFEEE', 2],
79 | u'\u0649' : [u'\u0649', u'\uFEEF', u'\uFEEF', u'\uFEF0', u'\uFEF0', 2],
80 | u'\u0671' : [u'\u0671', u'\u0671', u'\u0671', u'\uFB51', u'\uFB51', 2],
81 | u'\u064A' : [u'\u064A', u'\uFEF1', u'\uFEF3', u'\uFEF4', u'\uFEF2', 4],
82 | u'\u066E' : [u'\u066E', u'\uFBE4', u'\uFBE8', u'\uFBE9', u'\uFBE5', 4],
83 | u'\u06AA' : [u'\u06AA', u'\uFB8E', u'\uFB90', u'\uFB91', u'\uFB8F', 4],
84 | u'\u06C1' : [u'\u06C1', u'\uFBA6', u'\uFBA8', u'\uFBA9', u'\uFBA7', 4],
# NOTE(review): u'\u06E4' is also listed in HARAKAT, so DecomposedWord strips
# it before shaping; confirm whether this row is ever reached.
85 | u'\u06E4' : [u'\u06E4', u'\u06E4', u'\u06E4', u'\u06E4', u'\uFEEE', 2],
86 | u'\u067E' : [u'\u067E', u'\uFB56', u'\uFB58', u'\uFB59', u'\uFB57', 4],
87 | u'\u0698' : [u'\u0698', u'\uFB8A', u'\uFB8A', u'\uFB8B', u'\uFB8B', 2],
88 | u'\u06A9' : [u'\u06A9', u'\uFB8E', u'\uFB90', u'\uFB91', u'\uFB8F', 4],
89 | u'\u06AF' : [u'\u06AF', u'\uFB92', u'\uFB94', u'\uFB95', u'\uFB93', 4],
90 | u'\u06CC' : [u'\u06CC', u'\uFBFC', u'\uFBFE', u'\uFBFF', u'\uFBFD', 4],
91 | u'\u0686' : [u'\u0686', u'\uFB7A', u'\uFB7C', u'\uFB7D', u'\uFB7B', 4]
92 | }
93 |
# List form of the shaping rows, in the same six-item layout as the values of
# ARABIC_GLYPHS.  It has no row for u'\u06E4', which ARABIC_GLYPHS does have.
# None of the functions visible in this module reads this list.
94 | ARABIC_GLYPHS_LIST = [
95 | [u'\u0622', u'\uFE81', u'\uFE81', u'\uFE82', u'\uFE82', 2],
96 | [u'\u0623', u'\uFE83', u'\uFE83', u'\uFE84', u'\uFE84', 2],
97 | [u'\u0624', u'\uFE85', u'\uFE85', u'\uFE86', u'\uFE86', 2],
98 | [u'\u0625', u'\uFE87', u'\uFE87', u'\uFE88', u'\uFE88', 2],
99 | [u'\u0626', u'\uFE89', u'\uFE8B', u'\uFE8C', u'\uFE8A', 4],
100 | [u'\u0627', u'\u0627', u'\u0627', u'\uFE8E', u'\uFE8E', 2],
101 | [u'\u0628', u'\uFE8F', u'\uFE91', u'\uFE92', u'\uFE90', 4],
102 | [u'\u0629', u'\uFE93', u'\uFE93', u'\uFE94', u'\uFE94', 2],
103 | [u'\u062A', u'\uFE95', u'\uFE97', u'\uFE98', u'\uFE96', 4],
104 | [u'\u062B', u'\uFE99', u'\uFE9B', u'\uFE9C', u'\uFE9A', 4],
105 | [u'\u062C', u'\uFE9D', u'\uFE9F', u'\uFEA0', u'\uFE9E', 4],
106 | [u'\u062D', u'\uFEA1', u'\uFEA3', u'\uFEA4', u'\uFEA2', 4],
107 | [u'\u062E', u'\uFEA5', u'\uFEA7', u'\uFEA8', u'\uFEA6', 4],
108 | [u'\u062F', u'\uFEA9', u'\uFEA9', u'\uFEAA', u'\uFEAA', 2],
109 | [u'\u0630', u'\uFEAB', u'\uFEAB', u'\uFEAC', u'\uFEAC', 2],
110 | [u'\u0631', u'\uFEAD', u'\uFEAD', u'\uFEAE', u'\uFEAE', 2],
111 | [u'\u0632', u'\uFEAF', u'\uFEAF', u'\uFEB0', u'\uFEB0', 2],
112 | [u'\u0633', u'\uFEB1', u'\uFEB3', u'\uFEB4', u'\uFEB2', 4],
113 | [u'\u0634', u'\uFEB5', u'\uFEB7', u'\uFEB8', u'\uFEB6', 4],
114 | [u'\u0635', u'\uFEB9', u'\uFEBB', u'\uFEBC', u'\uFEBA', 4],
115 | [u'\u0636', u'\uFEBD', u'\uFEBF', u'\uFEC0', u'\uFEBE', 4],
116 | [u'\u0637', u'\uFEC1', u'\uFEC3', u'\uFEC4', u'\uFEC2', 4],
117 | [u'\u0638', u'\uFEC5', u'\uFEC7', u'\uFEC8', u'\uFEC6', 4],
118 | [u'\u0639', u'\uFEC9', u'\uFECB', u'\uFECC', u'\uFECA', 4],
119 | [u'\u063A', u'\uFECD', u'\uFECF', u'\uFED0', u'\uFECE', 4],
120 | [u'\u0641', u'\uFED1', u'\uFED3', u'\uFED4', u'\uFED2', 4],
121 | [u'\u0642', u'\uFED5', u'\uFED7', u'\uFED8', u'\uFED6', 4],
122 | [u'\u0643', u'\uFED9', u'\uFEDB', u'\uFEDC', u'\uFEDA', 4],
123 | [u'\u0644', u'\uFEDD', u'\uFEDF', u'\uFEE0', u'\uFEDE', 4],
124 | [u'\u0645', u'\uFEE1', u'\uFEE3', u'\uFEE4', u'\uFEE2', 4],
125 | [u'\u0646', u'\uFEE5', u'\uFEE7', u'\uFEE8', u'\uFEE6', 4],
126 | [u'\u0647', u'\uFEE9', u'\uFEEB', u'\uFEEC', u'\uFEEA', 4],
127 | [u'\u0648', u'\uFEED', u'\uFEED', u'\uFEEE', u'\uFEEE', 2],
128 | [u'\u0649', u'\uFEEF', u'\uFEEF', u'\uFEF0', u'\uFEF0', 2],
129 | [u'\u0671', u'\u0671', u'\u0671', u'\uFB51', u'\uFB51', 2],
130 | [u'\u064A', u'\uFEF1', u'\uFEF3', u'\uFEF4', u'\uFEF2', 4],
131 | [u'\u066E', u'\uFBE4', u'\uFBE8', u'\uFBE9', u'\uFBE5', 4],
132 | [u'\u06AA', u'\uFB8E', u'\uFB90', u'\uFB91', u'\uFB8F', 4],
133 | [u'\u06C1', u'\uFBA6', u'\uFBA8', u'\uFBA9', u'\uFBA7', 4],
134 | [u'\u067E', u'\uFB56', u'\uFB58', u'\uFB59', u'\uFB57', 4],
135 | [u'\u0698', u'\uFB8A', u'\uFB8A', u'\uFB8B', u'\uFB8B', 2],
136 | [u'\u06A9', u'\uFB8E', u'\uFB90', u'\uFB91', u'\uFB8F', 4],
137 | [u'\u06AF', u'\uFB92', u'\uFB94', u'\uFB95', u'\uFB93', 4],
138 | [u'\u06CC', u'\uFBFC', u'\uFBFE', u'\uFBFF', u'\uFBFD', 4],
139 | [u'\u0686', u'\uFB7A', u'\uFB7C', u'\uFB7D', u'\uFB7B', 4],
140 | ]
141 |
def get_reshaped_glyph(target, location):
    """Return item *location* of the ARABIC_GLYPHS row for *target*.

    A character that has no row in the table is returned unchanged.
    """
    row = ARABIC_GLYPHS.get(target)
    if row is None:
        return target
    return row[location]
147 |
def get_glyph_type(target):
    """Return the type stored at index 5 of the ARABIC_GLYPHS row for *target*.

    Characters without a row are reported as type 2.
    """
    row = ARABIC_GLYPHS.get(target)
    return 2 if row is None else row[5]
153 |
def is_haraka(target):
    """Tell whether *target* is one of the entries of HARAKAT."""
    listed = target in HARAKAT
    return listed
156 |
def replace_lam_alef(unshaped_word):
    """Collapse every lam + alef pair of *unshaped_word* into one ligature.

    For each lam, the next character that is not a haraka is looked up with
    get_lam_alef().  When that yields a ligature, the lam is replaced by it
    and the alef is dropped; harakat in between are kept.  The form passed to
    get_lam_alef() depends on whether the letter before the lam joins it
    (glyph type greater than 2).

    Returns the word with the ligatures substituted.
    """
    chars = list(unshaped_word)
    length = len(chars)
    # Last non-haraka character seen so far.  Unlike the previous
    # implementation this includes lam itself, so a lam that directly
    # precedes a lam-alef pair is recognised as a joining neighbour.
    letter_before = u''
    for position, char in enumerate(unshaped_word):
        if is_haraka(char):
            continue

        if char == DEFINED_CHARACTERS_ORGINAL_LAM:
            # Skip the harakat that sit between the lam and its follower.
            follower = position + 1
            while follower < length and is_haraka(unshaped_word[follower]):
                follower += 1

            if follower < length:
                joins_previous = position > 0 and get_glyph_type(letter_before) > 2
                lam_alef = get_lam_alef(chars[follower], char, not joins_previous)
                if lam_alef != u'':
                    chars[position] = lam_alef
                    # None marks the consumed alef for removal; a space
                    # placeholder would also delete genuine spaces.
                    chars[follower] = None

        letter_before = char

    return u''.join(c for c in chars if c is not None)
182 |
def get_lam_alef(candidate_alef, candidate_lam, is_end_of_word):
    """Look up the ligature for a lam followed by an alef variant.

    Returns the entry of LAM_ALEF_GLYPHS for *candidate_alef*, taken from
    index 2 when *is_end_of_word* is true and from index 1 otherwise.  An
    empty string is returned when *candidate_lam* is not lam or
    *candidate_alef* is none of the four supported alef variants.
    """
    if candidate_lam != DEFINED_CHARACTERS_ORGINAL_LAM:
        return u''

    column = 2 if is_end_of_word else 1
    # Alef variants in the order of their rows in LAM_ALEF_GLYPHS.
    alef_variants = (
        DEFINED_CHARACTERS_ORGINAL_ALF_UPPER_MDD,
        DEFINED_CHARACTERS_ORGINAL_ALF_UPPER_HAMAZA,
        DEFINED_CHARACTERS_ORGINAL_ALF,
        DEFINED_CHARACTERS_ORGINAL_ALF_LOWER_HAMAZA,
    )
    ligature = u''
    for row, alef in enumerate(alef_variants):
        if alef == candidate_alef:
            ligature = LAM_ALEF_GLYPHS[row][column]
    return ligature
203 |
class DecomposedWord(object):
    """A word split into its harakat and its remaining letters.

    Both groups keep the index each character had in the original word so
    that reconstruct_word() can interleave them again.
    """

    def __init__(self, word):
        """Sort every character of *word* into the haraka or letter lists."""
        self.stripped_harakat = []
        self.harakat_positions = []
        self.stripped_regular_letters = []
        self.letters_position = []

        for position, char in enumerate(word):
            if is_haraka(char):
                positions, chars = self.harakat_positions, self.stripped_harakat
            else:
                positions, chars = self.letters_position, self.stripped_regular_letters
            positions.append(position)
            chars.append(char)

    def reconstruct_word(self, reshaped_word):
        """Merge *reshaped_word* with the stored harakat.

        The n-th character of *reshaped_word* goes to the n-th recorded
        letter position; every haraka returns to its recorded position.
        """
        total = len(self.stripped_harakat) + len(reshaped_word)
        slots = [u'\0'] * total
        for n, position in enumerate(self.letters_position):
            slots[position] = reshaped_word[n]
        for n, position in enumerate(self.harakat_positions):
            slots[position] = self.stripped_harakat[n]
        return u''.join(slots)
227 |
def get_reshaped_word(unshaped_word):
    """Return *unshaped_word* with ligatures and contextual forms applied.

    Lam-alef pairs are collapsed first, then the non-haraka letters are
    shaped by reshape_it() and the harakat are put back in place.
    """
    decomposed = DecomposedWord(replace_lam_alef(unshaped_word))
    letters = decomposed.stripped_regular_letters
    shaped = reshape_it(u''.join(letters)) if letters else u''
    return decomposed.reconstruct_word(shaped)
235 |
def reshape_it(unshaped_word):
    """Replace each letter of *unshaped_word* by its contextual glyph.

    A single-letter word gets glyph index 1.  In longer words a letter joins
    the following one when its own glyph type is 4 and it is not the last
    letter, and joins the preceding one when that letter's glyph type is 4.
    """
    if not unshaped_word:
        return u''
    if len(unshaped_word) == 1:
        return get_reshaped_glyph(unshaped_word[0], 1)

    # Glyph index keyed by (joins following letter, joins preceding letter).
    glyph_index = {
        (False, False): 1,
        (True, False): 2,
        (True, True): 3,
        (False, True): 4,
    }
    last = len(unshaped_word) - 1
    shaped = []
    for position, letter in enumerate(unshaped_word):
        joins_next = position != last and get_glyph_type(letter) == 4
        joins_previous = (
            position != 0
            and get_glyph_type(unshaped_word[position - 1]) == 4
        )
        location = glyph_index[(joins_next, joins_previous)]
        shaped.append(get_reshaped_glyph(letter, location))

    return u''.join(shaped)
262 |
263 |
def is_arabic_character(target):
    """Tell whether *target* is a key of ARABIC_GLYPHS or listed in HARAKAT."""
    if target in ARABIC_GLYPHS:
        return True
    return target in HARAKAT
266 |
def get_words(sentence):
    """Split *sentence* at every single whitespace character.

    Consecutive whitespace characters produce empty strings in the result.
    An empty or otherwise false *sentence* gives an empty list.
    """
    if not sentence:
        return []
    return re.split(r'\s', sentence)
271 |
def has_arabic_letters(word):
    """Tell whether at least one character of *word* is an Arabic character."""
    return any(is_arabic_character(char) for char in word)
277 |
def is_arabic_word(word):
    """Tell whether every character of *word* is an Arabic character.

    An empty word counts as Arabic.
    """
    return all(is_arabic_character(char) for char in word)
283 |
def get_words_from_mixed_word(word):
    """Cut *word* into maximal runs of Arabic and of non-Arabic characters.

    The runs are returned in their original order; joining them gives back
    *word*.
    """
    runs = []
    current_run = u''
    run_is_arabic = None
    for char in word:
        char_is_arabic = is_arabic_character(char)
        # A change of character class closes the run collected so far.
        if current_run and char_is_arabic != run_is_arabic:
            runs.append(current_run)
            current_run = char
        else:
            current_run += char
        run_is_arabic = char_is_arabic
    if current_run:
        runs.append(current_run)
    return runs
303 |
def reshape(text):
    """Reshape every line of *text* with reshape_sentence().

    Line terminators are kept exactly as they appear in *text*, so a CRLF
    ending is no longer rewritten to LF.  An empty or otherwise false *text*
    gives an empty string.
    """
    if not text:
        return u''
    # The capturing group makes re.split keep the terminators: even indexes
    # hold line contents, odd indexes hold the '\n' or '\r\n' between them.
    pieces = re.split('(\\r?\\n)', text)
    for index in range(0, len(pieces), 2):
        pieces[index] = reshape_sentence(pieces[index])
    return u''.join(pieces)
311 |
def reshape_sentence(sentence):
    """Reshape the Arabic words of a single line.

    The line is split at every whitespace character.  Words made only of
    Arabic characters are reshaped as a whole; words that mix Arabic and
    other characters are cut into runs which are reshaped one by one.  The
    whitespace between words is kept as it was, so tabs and other whitespace
    are no longer turned into plain spaces.

    An empty or otherwise false *sentence* gives an empty string.
    """
    if not sentence:
        return u''
    # The capturing group makes re.split keep the separators: even indexes
    # hold words, odd indexes hold the whitespace character between them.
    tokens = re.split('(\\s)', sentence)
    for index in range(0, len(tokens), 2):
        word = tokens[index]
        if not has_arabic_letters(word):
            continue
        if is_arabic_word(word):
            tokens[index] = get_reshaped_word(word)
        else:
            tokens[index] = u''.join(
                get_reshaped_word(part)
                for part in get_words_from_mixed_word(word)
            )
    return u''.join(tokens)
325 |
--------------------------------------------------------------------------------
/bidi/mirror.py:
--------------------------------------------------------------------------------
1 | # This file is part of python-bidi
2 | #
3 | # python-bidi is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU Lesser General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU Lesser General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 |
16 | # Copyright (C) 2008-2010 Yaacov Zamir ,
17 | # Meir kriheli
18 | """Mirrored chars"""
19 |
20 | # Can't seem to get this data from python's unicode data, so this is imported
21 | # from http://www.unicode.org/Public/UNIDATA/BidiMirroring.txt
22 | MIRRORED = {
23 | u'\u0028': u'\u0029', # LEFT PARENTHESIS
24 | u'\u0029': u'\u0028', # RIGHT PARENTHESIS
25 | u'\u003C': u'\u003E', # LESS-THAN SIGN
26 | u'\u003E': u'\u003C', # GREATER-THAN SIGN
27 | u'\u005B': u'\u005D', # LEFT SQUARE BRACKET
28 | u'\u005D': u'\u005B', # RIGHT SQUARE BRACKET
29 | u'\u007B': u'\u007D', # LEFT CURLY BRACKET
30 | u'\u007D': u'\u007B', # RIGHT CURLY BRACKET
31 | u'\u00AB': u'\u00BB', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
32 | u'\u00BB': u'\u00AB', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
33 | u'\u0F3A': u'\u0F3B', # TIBETAN MARK GUG RTAGS GYON
34 | u'\u0F3B': u'\u0F3A', # TIBETAN MARK GUG RTAGS GYAS
35 | u'\u0F3C': u'\u0F3D', # TIBETAN MARK ANG KHANG GYON
36 | u'\u0F3D': u'\u0F3C', # TIBETAN MARK ANG KHANG GYAS
37 | u'\u169B': u'\u169C', # OGHAM FEATHER MARK
38 | u'\u169C': u'\u169B', # OGHAM REVERSED FEATHER MARK
39 | u'\u2039': u'\u203A', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
40 | u'\u203A': u'\u2039', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
41 | u'\u2045': u'\u2046', # LEFT SQUARE BRACKET WITH QUILL
42 | u'\u2046': u'\u2045', # RIGHT SQUARE BRACKET WITH QUILL
43 | u'\u207D': u'\u207E', # SUPERSCRIPT LEFT PARENTHESIS
44 | u'\u207E': u'\u207D', # SUPERSCRIPT RIGHT PARENTHESIS
45 | u'\u208D': u'\u208E', # SUBSCRIPT LEFT PARENTHESIS
46 | u'\u208E': u'\u208D', # SUBSCRIPT RIGHT PARENTHESIS
47 | u'\u2208': u'\u220B', # ELEMENT OF
48 | u'\u2209': u'\u220C', # NOT AN ELEMENT OF
49 | u'\u220A': u'\u220D', # SMALL ELEMENT OF
50 | u'\u220B': u'\u2208', # CONTAINS AS MEMBER
51 | u'\u220C': u'\u2209', # DOES NOT CONTAIN AS MEMBER
52 | u'\u220D': u'\u220A', # SMALL CONTAINS AS MEMBER
53 | u'\u2215': u'\u29F5', # DIVISION SLASH
54 | u'\u223C': u'\u223D', # TILDE OPERATOR
55 | u'\u223D': u'\u223C', # REVERSED TILDE
56 | u'\u2243': u'\u22CD', # ASYMPTOTICALLY EQUAL TO
57 | u'\u2252': u'\u2253', # APPROXIMATELY EQUAL TO OR THE IMAGE OF
58 | u'\u2253': u'\u2252', # IMAGE OF OR APPROXIMATELY EQUAL TO
59 | u'\u2254': u'\u2255', # COLON EQUALS
60 | u'\u2255': u'\u2254', # EQUALS COLON
61 | u'\u2264': u'\u2265', # LESS-THAN OR EQUAL TO
62 | u'\u2265': u'\u2264', # GREATER-THAN OR EQUAL TO
63 | u'\u2266': u'\u2267', # LESS-THAN OVER EQUAL TO
64 | u'\u2267': u'\u2266', # GREATER-THAN OVER EQUAL TO
65 | u'\u2268': u'\u2269', # [BEST FIT] LESS-THAN BUT NOT EQUAL TO
66 | u'\u2269': u'\u2268', # [BEST FIT] GREATER-THAN BUT NOT EQUAL TO
67 | u'\u226A': u'\u226B', # MUCH LESS-THAN
68 | u'\u226B': u'\u226A', # MUCH GREATER-THAN
69 | u'\u226E': u'\u226F', # [BEST FIT] NOT LESS-THAN
70 | u'\u226F': u'\u226E', # [BEST FIT] NOT GREATER-THAN
71 | u'\u2270': u'\u2271', # [BEST FIT] NEITHER LESS-THAN NOR EQUAL TO
72 | u'\u2271': u'\u2270', # [BEST FIT] NEITHER GREATER-THAN NOR EQUAL TO
73 | u'\u2272': u'\u2273', # [BEST FIT] LESS-THAN OR EQUIVALENT TO
74 | u'\u2273': u'\u2272', # [BEST FIT] GREATER-THAN OR EQUIVALENT TO
75 | u'\u2274': u'\u2275', # [BEST FIT] NEITHER LESS-THAN NOR EQUIVALENT TO
76 | u'\u2275': u'\u2274', # [BEST FIT] NEITHER GREATER-THAN NOR EQUIVALENT TO
77 | u'\u2276': u'\u2277', # LESS-THAN OR GREATER-THAN
78 | u'\u2277': u'\u2276', # GREATER-THAN OR LESS-THAN
79 | u'\u2278': u'\u2279', # [BEST FIT] NEITHER LESS-THAN NOR GREATER-THAN
80 | u'\u2279': u'\u2278', # [BEST FIT] NEITHER GREATER-THAN NOR LESS-THAN
81 | u'\u227A': u'\u227B', # PRECEDES
82 | u'\u227B': u'\u227A', # SUCCEEDS
83 | u'\u227C': u'\u227D', # PRECEDES OR EQUAL TO
84 | u'\u227D': u'\u227C', # SUCCEEDS OR EQUAL TO
85 | u'\u227E': u'\u227F', # [BEST FIT] PRECEDES OR EQUIVALENT TO
86 | u'\u227F': u'\u227E', # [BEST FIT] SUCCEEDS OR EQUIVALENT TO
87 | u'\u2280': u'\u2281', # [BEST FIT] DOES NOT PRECEDE
88 | u'\u2281': u'\u2280', # [BEST FIT] DOES NOT SUCCEED
89 | u'\u2282': u'\u2283', # SUBSET OF
90 | u'\u2283': u'\u2282', # SUPERSET OF
91 | u'\u2284': u'\u2285', # [BEST FIT] NOT A SUBSET OF
92 | u'\u2285': u'\u2284', # [BEST FIT] NOT A SUPERSET OF
93 | u'\u2286': u'\u2287', # SUBSET OF OR EQUAL TO
94 | u'\u2287': u'\u2286', # SUPERSET OF OR EQUAL TO
95 | u'\u2288': u'\u2289', # [BEST FIT] NEITHER A SUBSET OF NOR EQUAL TO
96 | u'\u2289': u'\u2288', # [BEST FIT] NEITHER A SUPERSET OF NOR EQUAL TO
97 | u'\u228A': u'\u228B', # [BEST FIT] SUBSET OF WITH NOT EQUAL TO
98 | u'\u228B': u'\u228A', # [BEST FIT] SUPERSET OF WITH NOT EQUAL TO
99 | u'\u228F': u'\u2290', # SQUARE IMAGE OF
100 | u'\u2290': u'\u228F', # SQUARE ORIGINAL OF
101 | u'\u2291': u'\u2292', # SQUARE IMAGE OF OR EQUAL TO
102 | u'\u2292': u'\u2291', # SQUARE ORIGINAL OF OR EQUAL TO
103 | u'\u2298': u'\u29B8', # CIRCLED DIVISION SLASH
104 | u'\u22A2': u'\u22A3', # RIGHT TACK
105 | u'\u22A3': u'\u22A2', # LEFT TACK
106 | u'\u22A6': u'\u2ADE', # ASSERTION
107 | u'\u22A8': u'\u2AE4', # TRUE
108 | u'\u22A9': u'\u2AE3', # FORCES
109 | u'\u22AB': u'\u2AE5', # DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
110 | u'\u22B0': u'\u22B1', # PRECEDES UNDER RELATION
111 | u'\u22B1': u'\u22B0', # SUCCEEDS UNDER RELATION
112 | u'\u22B2': u'\u22B3', # NORMAL SUBGROUP OF
113 | u'\u22B3': u'\u22B2', # CONTAINS AS NORMAL SUBGROUP
114 | u'\u22B4': u'\u22B5', # NORMAL SUBGROUP OF OR EQUAL TO
115 | u'\u22B5': u'\u22B4', # CONTAINS AS NORMAL SUBGROUP OR EQUAL TO
116 | u'\u22B6': u'\u22B7', # ORIGINAL OF
117 | u'\u22B7': u'\u22B6', # IMAGE OF
118 | u'\u22C9': u'\u22CA', # LEFT NORMAL FACTOR SEMIDIRECT PRODUCT
119 | u'\u22CA': u'\u22C9', # RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT
120 | u'\u22CB': u'\u22CC', # LEFT SEMIDIRECT PRODUCT
121 | u'\u22CC': u'\u22CB', # RIGHT SEMIDIRECT PRODUCT
122 | u'\u22CD': u'\u2243', # REVERSED TILDE EQUALS
123 | u'\u22D0': u'\u22D1', # DOUBLE SUBSET
124 | u'\u22D1': u'\u22D0', # DOUBLE SUPERSET
125 | u'\u22D6': u'\u22D7', # LESS-THAN WITH DOT
126 | u'\u22D7': u'\u22D6', # GREATER-THAN WITH DOT
127 | u'\u22D8': u'\u22D9', # VERY MUCH LESS-THAN
128 | u'\u22D9': u'\u22D8', # VERY MUCH GREATER-THAN
129 | u'\u22DA': u'\u22DB', # LESS-THAN EQUAL TO OR GREATER-THAN
130 | u'\u22DB': u'\u22DA', # GREATER-THAN EQUAL TO OR LESS-THAN
131 | u'\u22DC': u'\u22DD', # EQUAL TO OR LESS-THAN
132 | u'\u22DD': u'\u22DC', # EQUAL TO OR GREATER-THAN
133 | u'\u22DE': u'\u22DF', # EQUAL TO OR PRECEDES
134 | u'\u22DF': u'\u22DE', # EQUAL TO OR SUCCEEDS
135 | u'\u22E0': u'\u22E1', # [BEST FIT] DOES NOT PRECEDE OR EQUAL
136 | u'\u22E1': u'\u22E0', # [BEST FIT] DOES NOT SUCCEED OR EQUAL
137 | u'\u22E2': u'\u22E3', # [BEST FIT] NOT SQUARE IMAGE OF OR EQUAL TO
138 | u'\u22E3': u'\u22E2', # [BEST FIT] NOT SQUARE ORIGINAL OF OR EQUAL TO
139 | u'\u22E4': u'\u22E5', # [BEST FIT] SQUARE IMAGE OF OR NOT EQUAL TO
140 | u'\u22E5': u'\u22E4', # [BEST FIT] SQUARE ORIGINAL OF OR NOT EQUAL TO
141 | u'\u22E6': u'\u22E7', # [BEST FIT] LESS-THAN BUT NOT EQUIVALENT TO
142 | u'\u22E7': u'\u22E6', # [BEST FIT] GREATER-THAN BUT NOT EQUIVALENT TO
143 | u'\u22E8': u'\u22E9', # [BEST FIT] PRECEDES BUT NOT EQUIVALENT TO
144 | u'\u22E9': u'\u22E8', # [BEST FIT] SUCCEEDS BUT NOT EQUIVALENT TO
145 | u'\u22EA': u'\u22EB', # [BEST FIT] NOT NORMAL SUBGROUP OF
146 | u'\u22EB': u'\u22EA', # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP
147 | u'\u22EC': u'\u22ED', # [BEST FIT] NOT NORMAL SUBGROUP OF OR EQUAL TO
148 | u'\u22ED': u'\u22EC', # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL
149 | u'\u22F0': u'\u22F1', # UP RIGHT DIAGONAL ELLIPSIS
150 | u'\u22F1': u'\u22F0', # DOWN RIGHT DIAGONAL ELLIPSIS
151 | u'\u22F2': u'\u22FA', # ELEMENT OF WITH LONG HORIZONTAL STROKE
152 | u'\u22F3': u'\u22FB', # ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
153 | u'\u22F4': u'\u22FC', # SMALL ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
154 | u'\u22F6': u'\u22FD', # ELEMENT OF WITH OVERBAR
155 | u'\u22F7': u'\u22FE', # SMALL ELEMENT OF WITH OVERBAR
156 | u'\u22FA': u'\u22F2', # CONTAINS WITH LONG HORIZONTAL STROKE
157 | u'\u22FB': u'\u22F3', # CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
158 | u'\u22FC': u'\u22F4', # SMALL CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
159 | u'\u22FD': u'\u22F6', # CONTAINS WITH OVERBAR
160 | u'\u22FE': u'\u22F7', # SMALL CONTAINS WITH OVERBAR
161 | u'\u2308': u'\u2309', # LEFT CEILING
162 | u'\u2309': u'\u2308', # RIGHT CEILING
163 | u'\u230A': u'\u230B', # LEFT FLOOR
164 | u'\u230B': u'\u230A', # RIGHT FLOOR
165 | u'\u2329': u'\u232A', # LEFT-POINTING ANGLE BRACKET
166 | u'\u232A': u'\u2329', # RIGHT-POINTING ANGLE BRACKET
167 | u'\u2768': u'\u2769', # MEDIUM LEFT PARENTHESIS ORNAMENT
168 | u'\u2769': u'\u2768', # MEDIUM RIGHT PARENTHESIS ORNAMENT
169 | u'\u276A': u'\u276B', # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
170 | u'\u276B': u'\u276A', # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT
171 | u'\u276C': u'\u276D', # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT
172 | u'\u276D': u'\u276C', # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT
173 | u'\u276E': u'\u276F', # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
174 | u'\u276F': u'\u276E', # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
175 | u'\u2770': u'\u2771', # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT
176 | u'\u2771': u'\u2770', # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT
177 | u'\u2772': u'\u2773', # LIGHT LEFT TORTOISE SHELL BRACKET
178 | u'\u2773': u'\u2772', # LIGHT RIGHT TORTOISE SHELL BRACKET
179 | u'\u2774': u'\u2775', # MEDIUM LEFT CURLY BRACKET ORNAMENT
180 | u'\u2775': u'\u2774', # MEDIUM RIGHT CURLY BRACKET ORNAMENT
181 | u'\u27C3': u'\u27C4', # OPEN SUBSET
182 | u'\u27C4': u'\u27C3', # OPEN SUPERSET
183 | u'\u27C5': u'\u27C6', # LEFT S-SHAPED BAG DELIMITER
184 | u'\u27C6': u'\u27C5', # RIGHT S-SHAPED BAG DELIMITER
185 | u'\u27C8': u'\u27C9', # REVERSE SOLIDUS PRECEDING SUBSET
186 | u'\u27C9': u'\u27C8', # SUPERSET PRECEDING SOLIDUS
187 | u'\u27D5': u'\u27D6', # LEFT OUTER JOIN
188 | u'\u27D6': u'\u27D5', # RIGHT OUTER JOIN
189 | u'\u27DD': u'\u27DE', # LONG RIGHT TACK
190 | u'\u27DE': u'\u27DD', # LONG LEFT TACK
191 | u'\u27E2': u'\u27E3', # WHITE CONCAVE-SIDED DIAMOND WITH LEFTWARDS TICK
192 | u'\u27E3': u'\u27E2', # WHITE CONCAVE-SIDED DIAMOND WITH RIGHTWARDS TICK
193 | u'\u27E4': u'\u27E5', # WHITE SQUARE WITH LEFTWARDS TICK
194 | u'\u27E5': u'\u27E4', # WHITE SQUARE WITH RIGHTWARDS TICK
195 | u'\u27E6': u'\u27E7', # MATHEMATICAL LEFT WHITE SQUARE BRACKET
196 | u'\u27E7': u'\u27E6', # MATHEMATICAL RIGHT WHITE SQUARE BRACKET
197 | u'\u27E8': u'\u27E9', # MATHEMATICAL LEFT ANGLE BRACKET
198 | u'\u27E9': u'\u27E8', # MATHEMATICAL RIGHT ANGLE BRACKET
199 | u'\u27EA': u'\u27EB', # MATHEMATICAL LEFT DOUBLE ANGLE BRACKET
200 | u'\u27EB': u'\u27EA', # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET
201 | u'\u27EC': u'\u27ED', # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET
202 | u'\u27ED': u'\u27EC', # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET
203 | u'\u27EE': u'\u27EF', # MATHEMATICAL LEFT FLATTENED PARENTHESIS
204 | u'\u27EF': u'\u27EE', # MATHEMATICAL RIGHT FLATTENED PARENTHESIS
205 | u'\u2983': u'\u2984', # LEFT WHITE CURLY BRACKET
206 | u'\u2984': u'\u2983', # RIGHT WHITE CURLY BRACKET
207 | u'\u2985': u'\u2986', # LEFT WHITE PARENTHESIS
208 | u'\u2986': u'\u2985', # RIGHT WHITE PARENTHESIS
209 | u'\u2987': u'\u2988', # Z NOTATION LEFT IMAGE BRACKET
210 | u'\u2988': u'\u2987', # Z NOTATION RIGHT IMAGE BRACKET
211 | u'\u2989': u'\u298A', # Z NOTATION LEFT BINDING BRACKET
212 | u'\u298A': u'\u2989', # Z NOTATION RIGHT BINDING BRACKET
213 | u'\u298B': u'\u298C', # LEFT SQUARE BRACKET WITH UNDERBAR
214 | u'\u298C': u'\u298B', # RIGHT SQUARE BRACKET WITH UNDERBAR
215 | u'\u298D': u'\u2990', # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
216 | u'\u298E': u'\u298F', # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
217 | u'\u298F': u'\u298E', # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
218 | u'\u2990': u'\u298D', # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER
219 | u'\u2991': u'\u2992', # LEFT ANGLE BRACKET WITH DOT
220 | u'\u2992': u'\u2991', # RIGHT ANGLE BRACKET WITH DOT
221 | u'\u2993': u'\u2994', # LEFT ARC LESS-THAN BRACKET
222 | u'\u2994': u'\u2993', # RIGHT ARC GREATER-THAN BRACKET
223 | u'\u2995': u'\u2996', # DOUBLE LEFT ARC GREATER-THAN BRACKET
224 | u'\u2996': u'\u2995', # DOUBLE RIGHT ARC LESS-THAN BRACKET
225 | u'\u2997': u'\u2998', # LEFT BLACK TORTOISE SHELL BRACKET
226 | u'\u2998': u'\u2997', # RIGHT BLACK TORTOISE SHELL BRACKET
227 | u'\u29B8': u'\u2298', # CIRCLED REVERSE SOLIDUS
228 | u'\u29C0': u'\u29C1', # CIRCLED LESS-THAN
229 | u'\u29C1': u'\u29C0', # CIRCLED GREATER-THAN
230 | u'\u29C4': u'\u29C5', # SQUARED RISING DIAGONAL SLASH
231 | u'\u29C5': u'\u29C4', # SQUARED FALLING DIAGONAL SLASH
232 | u'\u29CF': u'\u29D0', # LEFT TRIANGLE BESIDE VERTICAL BAR
233 | u'\u29D0': u'\u29CF', # VERTICAL BAR BESIDE RIGHT TRIANGLE
234 | u'\u29D1': u'\u29D2', # BOWTIE WITH LEFT HALF BLACK
235 | u'\u29D2': u'\u29D1', # BOWTIE WITH RIGHT HALF BLACK
236 | u'\u29D4': u'\u29D5', # TIMES WITH LEFT HALF BLACK
237 | u'\u29D5': u'\u29D4', # TIMES WITH RIGHT HALF BLACK
238 | u'\u29D8': u'\u29D9', # LEFT WIGGLY FENCE
239 | u'\u29D9': u'\u29D8', # RIGHT WIGGLY FENCE
240 | u'\u29DA': u'\u29DB', # LEFT DOUBLE WIGGLY FENCE
241 | u'\u29DB': u'\u29DA', # RIGHT DOUBLE WIGGLY FENCE
242 | u'\u29F5': u'\u2215', # REVERSE SOLIDUS OPERATOR
243 | u'\u29F8': u'\u29F9', # BIG SOLIDUS
244 | u'\u29F9': u'\u29F8', # BIG REVERSE SOLIDUS
245 | u'\u29FC': u'\u29FD', # LEFT-POINTING CURVED ANGLE BRACKET
246 | u'\u29FD': u'\u29FC', # RIGHT-POINTING CURVED ANGLE BRACKET
247 | u'\u2A2B': u'\u2A2C', # MINUS SIGN WITH FALLING DOTS
248 | u'\u2A2C': u'\u2A2B', # MINUS SIGN WITH RISING DOTS
249 | u'\u2A2D': u'\u2A2E', # PLUS SIGN IN LEFT HALF CIRCLE
250 | u'\u2A2E': u'\u2A2D', # PLUS SIGN IN RIGHT HALF CIRCLE
251 | u'\u2A34': u'\u2A35', # MULTIPLICATION SIGN IN LEFT HALF CIRCLE
252 | u'\u2A35': u'\u2A34', # MULTIPLICATION SIGN IN RIGHT HALF CIRCLE
253 | u'\u2A3C': u'\u2A3D', # INTERIOR PRODUCT
254 | u'\u2A3D': u'\u2A3C', # RIGHTHAND INTERIOR PRODUCT
255 | u'\u2A64': u'\u2A65', # Z NOTATION DOMAIN ANTIRESTRICTION
256 | u'\u2A65': u'\u2A64', # Z NOTATION RANGE ANTIRESTRICTION
257 | u'\u2A79': u'\u2A7A', # LESS-THAN WITH CIRCLE INSIDE
258 | u'\u2A7A': u'\u2A79', # GREATER-THAN WITH CIRCLE INSIDE
259 | u'\u2A7D': u'\u2A7E', # LESS-THAN OR SLANTED EQUAL TO
260 | u'\u2A7E': u'\u2A7D', # GREATER-THAN OR SLANTED EQUAL TO
261 | u'\u2A7F': u'\u2A80', # LESS-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
262 | u'\u2A80': u'\u2A7F', # GREATER-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
263 | u'\u2A81': u'\u2A82', # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
264 | u'\u2A82': u'\u2A81', # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
265 | u'\u2A83': u'\u2A84', # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE RIGHT
266 | u'\u2A84': u'\u2A83', # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE LEFT
267 | u'\u2A8B': u'\u2A8C', # LESS-THAN ABOVE DOUBLE-LINE EQUAL ABOVE GREATER-THAN
268 | u'\u2A8C': u'\u2A8B', # GREATER-THAN ABOVE DOUBLE-LINE EQUAL ABOVE LESS-THAN
269 | u'\u2A91': u'\u2A92', # LESS-THAN ABOVE GREATER-THAN ABOVE DOUBLE-LINE EQUAL
270 | u'\u2A92': u'\u2A91', # GREATER-THAN ABOVE LESS-THAN ABOVE DOUBLE-LINE EQUAL
271 | u'\u2A93': u'\u2A94', # LESS-THAN ABOVE SLANTED EQUAL ABOVE GREATER-THAN ABOVE SLANTED EQUAL
272 | u'\u2A94': u'\u2A93', # GREATER-THAN ABOVE SLANTED EQUAL ABOVE LESS-THAN ABOVE SLANTED EQUAL
273 | u'\u2A95': u'\u2A96', # SLANTED EQUAL TO OR LESS-THAN
274 | u'\u2A96': u'\u2A95', # SLANTED EQUAL TO OR GREATER-THAN
275 | u'\u2A97': u'\u2A98', # SLANTED EQUAL TO OR LESS-THAN WITH DOT INSIDE
276 | u'\u2A98': u'\u2A97', # SLANTED EQUAL TO OR GREATER-THAN WITH DOT INSIDE
277 | u'\u2A99': u'\u2A9A', # DOUBLE-LINE EQUAL TO OR LESS-THAN
278 | u'\u2A9A': u'\u2A99', # DOUBLE-LINE EQUAL TO OR GREATER-THAN
279 | u'\u2A9B': u'\u2A9C', # DOUBLE-LINE SLANTED EQUAL TO OR LESS-THAN
280 | u'\u2A9C': u'\u2A9B', # DOUBLE-LINE SLANTED EQUAL TO OR GREATER-THAN
281 | u'\u2AA1': u'\u2AA2', # DOUBLE NESTED LESS-THAN
282 | u'\u2AA2': u'\u2AA1', # DOUBLE NESTED GREATER-THAN
283 | u'\u2AA6': u'\u2AA7', # LESS-THAN CLOSED BY CURVE
284 | u'\u2AA7': u'\u2AA6', # GREATER-THAN CLOSED BY CURVE
285 | u'\u2AA8': u'\u2AA9', # LESS-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
286 | u'\u2AA9': u'\u2AA8', # GREATER-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
287 | u'\u2AAA': u'\u2AAB', # SMALLER THAN
288 | u'\u2AAB': u'\u2AAA', # LARGER THAN
289 | u'\u2AAC': u'\u2AAD', # SMALLER THAN OR EQUAL TO
290 | u'\u2AAD': u'\u2AAC', # LARGER THAN OR EQUAL TO
291 | u'\u2AAF': u'\u2AB0', # PRECEDES ABOVE SINGLE-LINE EQUALS SIGN
292 | u'\u2AB0': u'\u2AAF', # SUCCEEDS ABOVE SINGLE-LINE EQUALS SIGN
293 | u'\u2AB3': u'\u2AB4', # PRECEDES ABOVE EQUALS SIGN
294 | u'\u2AB4': u'\u2AB3', # SUCCEEDS ABOVE EQUALS SIGN
295 | u'\u2ABB': u'\u2ABC', # DOUBLE PRECEDES
296 | u'\u2ABC': u'\u2ABB', # DOUBLE SUCCEEDS
297 | u'\u2ABD': u'\u2ABE', # SUBSET WITH DOT
298 | u'\u2ABE': u'\u2ABD', # SUPERSET WITH DOT
299 | u'\u2ABF': u'\u2AC0', # SUBSET WITH PLUS SIGN BELOW
300 | u'\u2AC0': u'\u2ABF', # SUPERSET WITH PLUS SIGN BELOW
301 | u'\u2AC1': u'\u2AC2', # SUBSET WITH MULTIPLICATION SIGN BELOW
302 | u'\u2AC2': u'\u2AC1', # SUPERSET WITH MULTIPLICATION SIGN BELOW
303 | u'\u2AC3': u'\u2AC4', # SUBSET OF OR EQUAL TO WITH DOT ABOVE
304 | u'\u2AC4': u'\u2AC3', # SUPERSET OF OR EQUAL TO WITH DOT ABOVE
305 | u'\u2AC5': u'\u2AC6', # SUBSET OF ABOVE EQUALS SIGN
306 | u'\u2AC6': u'\u2AC5', # SUPERSET OF ABOVE EQUALS SIGN
307 | u'\u2ACD': u'\u2ACE', # SQUARE LEFT OPEN BOX OPERATOR
308 | u'\u2ACE': u'\u2ACD', # SQUARE RIGHT OPEN BOX OPERATOR
309 | u'\u2ACF': u'\u2AD0', # CLOSED SUBSET
310 | u'\u2AD0': u'\u2ACF', # CLOSED SUPERSET
311 | u'\u2AD1': u'\u2AD2', # CLOSED SUBSET OR EQUAL TO
312 | u'\u2AD2': u'\u2AD1', # CLOSED SUPERSET OR EQUAL TO
313 | u'\u2AD3': u'\u2AD4', # SUBSET ABOVE SUPERSET
314 | u'\u2AD4': u'\u2AD3', # SUPERSET ABOVE SUBSET
315 | u'\u2AD5': u'\u2AD6', # SUBSET ABOVE SUBSET
316 | u'\u2AD6': u'\u2AD5', # SUPERSET ABOVE SUPERSET
317 | u'\u2ADE': u'\u22A6', # SHORT LEFT TACK
318 | u'\u2AE3': u'\u22A9', # DOUBLE VERTICAL BAR LEFT TURNSTILE
319 | u'\u2AE4': u'\u22A8', # VERTICAL BAR DOUBLE LEFT TURNSTILE
320 | u'\u2AE5': u'\u22AB', # DOUBLE VERTICAL BAR DOUBLE LEFT TURNSTILE
321 | u'\u2AEC': u'\u2AED', # DOUBLE STROKE NOT SIGN
322 | u'\u2AED': u'\u2AEC', # REVERSED DOUBLE STROKE NOT SIGN
323 | u'\u2AF7': u'\u2AF8', # TRIPLE NESTED LESS-THAN
324 | u'\u2AF8': u'\u2AF7', # TRIPLE NESTED GREATER-THAN
325 | u'\u2AF9': u'\u2AFA', # DOUBLE-LINE SLANTED LESS-THAN OR EQUAL TO
326 | u'\u2AFA': u'\u2AF9', # DOUBLE-LINE SLANTED GREATER-THAN OR EQUAL TO
327 | u'\u2E02': u'\u2E03', # LEFT SUBSTITUTION BRACKET
328 | u'\u2E03': u'\u2E02', # RIGHT SUBSTITUTION BRACKET
329 | u'\u2E04': u'\u2E05', # LEFT DOTTED SUBSTITUTION BRACKET
330 | u'\u2E05': u'\u2E04', # RIGHT DOTTED SUBSTITUTION BRACKET
331 | u'\u2E09': u'\u2E0A', # LEFT TRANSPOSITION BRACKET
332 | u'\u2E0A': u'\u2E09', # RIGHT TRANSPOSITION BRACKET
333 | u'\u2E0C': u'\u2E0D', # LEFT RAISED OMISSION BRACKET
334 | u'\u2E0D': u'\u2E0C', # RIGHT RAISED OMISSION BRACKET
335 | u'\u2E1C': u'\u2E1D', # LEFT LOW PARAPHRASE BRACKET
336 | u'\u2E1D': u'\u2E1C', # RIGHT LOW PARAPHRASE BRACKET
337 | u'\u2E20': u'\u2E21', # LEFT VERTICAL BAR WITH QUILL
338 | u'\u2E21': u'\u2E20', # RIGHT VERTICAL BAR WITH QUILL
339 | u'\u2E22': u'\u2E23', # TOP LEFT HALF BRACKET
340 | u'\u2E23': u'\u2E22', # TOP RIGHT HALF BRACKET
341 | u'\u2E24': u'\u2E25', # BOTTOM LEFT HALF BRACKET
342 | u'\u2E25': u'\u2E24', # BOTTOM RIGHT HALF BRACKET
343 | u'\u2E26': u'\u2E27', # LEFT SIDEWAYS U BRACKET
344 | u'\u2E27': u'\u2E26', # RIGHT SIDEWAYS U BRACKET
345 | u'\u2E28': u'\u2E29', # LEFT DOUBLE PARENTHESIS
346 | u'\u2E29': u'\u2E28', # RIGHT DOUBLE PARENTHESIS
347 | u'\u3008': u'\u3009', # LEFT ANGLE BRACKET
348 | u'\u3009': u'\u3008', # RIGHT ANGLE BRACKET
349 | u'\u300A': u'\u300B', # LEFT DOUBLE ANGLE BRACKET
350 | u'\u300B': u'\u300A', # RIGHT DOUBLE ANGLE BRACKET
351 | u'\u300C': u'\u300D', # [BEST FIT] LEFT CORNER BRACKET
352 | u'\u300D': u'\u300C', # [BEST FIT] RIGHT CORNER BRACKET
353 | u'\u300E': u'\u300F', # [BEST FIT] LEFT WHITE CORNER BRACKET
354 | u'\u300F': u'\u300E', # [BEST FIT] RIGHT WHITE CORNER BRACKET
355 | u'\u3010': u'\u3011', # LEFT BLACK LENTICULAR BRACKET
356 | u'\u3011': u'\u3010', # RIGHT BLACK LENTICULAR BRACKET
357 | u'\u3014': u'\u3015', # LEFT TORTOISE SHELL BRACKET
358 | u'\u3015': u'\u3014', # RIGHT TORTOISE SHELL BRACKET
359 | u'\u3016': u'\u3017', # LEFT WHITE LENTICULAR BRACKET
360 | u'\u3017': u'\u3016', # RIGHT WHITE LENTICULAR BRACKET
361 | u'\u3018': u'\u3019', # LEFT WHITE TORTOISE SHELL BRACKET
362 | u'\u3019': u'\u3018', # RIGHT WHITE TORTOISE SHELL BRACKET
363 | u'\u301A': u'\u301B', # LEFT WHITE SQUARE BRACKET
364 | u'\u301B': u'\u301A', # RIGHT WHITE SQUARE BRACKET
365 | u'\uFE59': u'\uFE5A', # SMALL LEFT PARENTHESIS
366 | u'\uFE5A': u'\uFE59', # SMALL RIGHT PARENTHESIS
367 | u'\uFE5B': u'\uFE5C', # SMALL LEFT CURLY BRACKET
368 | u'\uFE5C': u'\uFE5B', # SMALL RIGHT CURLY BRACKET
369 | u'\uFE5D': u'\uFE5E', # SMALL LEFT TORTOISE SHELL BRACKET
370 | u'\uFE5E': u'\uFE5D', # SMALL RIGHT TORTOISE SHELL BRACKET
371 | u'\uFE64': u'\uFE65', # SMALL LESS-THAN SIGN
372 | u'\uFE65': u'\uFE64', # SMALL GREATER-THAN SIGN
373 | u'\uFF08': u'\uFF09', # FULLWIDTH LEFT PARENTHESIS
374 | u'\uFF09': u'\uFF08', # FULLWIDTH RIGHT PARENTHESIS
375 | u'\uFF1C': u'\uFF1E', # FULLWIDTH LESS-THAN SIGN
376 | u'\uFF1E': u'\uFF1C', # FULLWIDTH GREATER-THAN SIGN
377 | u'\uFF3B': u'\uFF3D', # FULLWIDTH LEFT SQUARE BRACKET
378 | u'\uFF3D': u'\uFF3B', # FULLWIDTH RIGHT SQUARE BRACKET
379 | u'\uFF5B': u'\uFF5D', # FULLWIDTH LEFT CURLY BRACKET
380 | u'\uFF5D': u'\uFF5B', # FULLWIDTH RIGHT CURLY BRACKET
381 | u'\uFF5F': u'\uFF60', # FULLWIDTH LEFT WHITE PARENTHESIS
382 | u'\uFF60': u'\uFF5F', # FULLWIDTH RIGHT WHITE PARENTHESIS
383 | u'\uFF62': u'\uFF63', # [BEST FIT] HALFWIDTH LEFT CORNER BRACKET
384 | u'\uFF63': u'\uFF62', # [BEST FIT] HALFWIDTH RIGHT CORNER BRACKET
385 | }
386 |
--------------------------------------------------------------------------------
/bidi/algorithm.py:
--------------------------------------------------------------------------------
1 | # This file is part of python-bidi
2 | #
3 | # python-bidi is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Lesser General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU Lesser General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU Lesser General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 |
16 | # Copyright (C) 2008-2010 Yaacov Zamir ,
17 | # Meir kriheli
18 | "bidirectional algorithm implementation"
19 |
20 | from unicodedata import bidirectional, mirrored
21 | import inspect
22 | import sys
23 | from collections import deque
24 |
25 | try:
26 |
27 | # Python 3
28 |
29 | from .mirror import MIRRORED
30 | except ValueError:
31 |
32 | # Python 2
33 | from bidi.mirror import MIRRORED
34 |
# Module-wide tables and helpers used by the rule implementations below.

# Paragraph embedding level implied by each strong bidi type.
PARAGRAPH_LEVELS = {'L': 0, 'AL': 1, 'R': 1}

# Explicit embedding levels must stay strictly below this value.
EXPLICIT_LEVEL_LIMIT = 62


def _LEAST_GREATER_ODD(x):
    "Return the smallest odd number strictly greater than `x`."
    return (x + 1) | 1


def _LEAST_GREATER_EVEN(x):
    "Return the smallest even number strictly greater than `x`."
    return (x + 2) & ~1


# For every explicit embedding/override code: the function computing the
# next embedding level and the directional override it installs
# ('N' meaning no override).
X2_X5_MAPPINGS = {
    'RLE': (_LEAST_GREATER_ODD, 'N'),
    'LRE': (_LEAST_GREATER_EVEN, 'N'),
    'RLO': (_LEAST_GREATER_ODD, 'R'),
    'LRO': (_LEAST_GREATER_EVEN, 'L'),
}

# 'B' is listed so that X6 does not run for paragraph separators and X8
# can run its course instead.
X6_IGNORED = list(X2_X5_MAPPINGS) + ['BN', 'PDF', 'B']
X9_REMOVED = list(X2_X5_MAPPINGS) + ['BN', 'PDF']


def _embedding_direction(x):
    "Return 'L' for an even embedding level and 'R' for an odd one."
    return ('L', 'R')[x % 2]


# True when the interpreter stores text as UCS-2 (narrow build).
_IS_UCS2 = sys.maxunicode == 0xFFFF
_SURROGATE_MIN = 0xD800
_SURROGATE_MAX = 0xDBFF
57 |
def debug_storage(storage, base_info=False, chars=True, runs=False):
    """Write a human-readable dump of `storage` to stderr.

    `storage` is the working dict of the algorithm. Depending on the
    flags the keys 'base_level', 'base_dir', 'runs' and 'chars' are read.

    `base_info` - also print the base level and base direction.
    `chars` - print the characters with their resolved levels and types.
    `runs` - also print the level runs.

    The name of the calling function is printed first.
    """

    import codecs
    import locale
    import sys

    if sys.version_info[0] < 3:
        # Python 2: sys.stderr is a byte stream, so encode on the way out.
        stderr = codecs.getwriter(locale.getpreferredencoding())(sys.stderr)
        to_text = unicode  # noqa: F821 - only evaluated on Python 2
    else:
        # Python 3: sys.stderr already accepts text; wrapping it in a codec
        # writer would hand it bytes and raise TypeError.
        stderr = sys.stderr
        to_text = str

    caller = inspect.stack()[1][3]
    stderr.write('in %s\n' % caller)

    if base_info:
        stderr.write(u' base level : %d\n' % storage['base_level'])
        stderr.write(u' base dir : %s\n' % storage['base_dir'])

    if runs:
        stderr.write(u' runs : %s\n' % list(storage['runs']))

    if chars:
        # Newlines are shown as 'C' so the dump stays on a single line.
        # The character itself has to be compared, not its storage dict.
        shown = u''.join(
            [u'C' if _ch['ch'] == u'\n' else _ch['ch']
             for _ch in storage['chars']])
        stderr.write(u' Chars : ' + shown + u'\n')

        output = u' Res. levels : %s\n' % u''.join(
            [to_text(_ch['level']) for _ch in storage['chars']])
        stderr.write(output)

        # Types are up to three letters long; print them vertically, one
        # row per letter position, so they line up under the characters.
        _types = [_ch['type'].ljust(3) for _ch in storage['chars']]

        for i in range(3):
            if i:
                output = u' %s\n'
            else:
                output = u' Res. types : %s\n'
            stderr.write(output % u''.join([_t[i] for _t in _types]))
98 |
99 |
def get_base_level(text, upper_is_rtl=False):
    """Return the paragraph base embedding level: 0 for LTR, 1 for RTL.

    `text` is a unicode object.

    When `upper_is_rtl` is True, upper case characters count as strong
    'R'; this is meant for debugging (default: False).

    The first strong character decides (rule P2). If none is found the
    level defaults to 0 (rule P3).

    """

    pending_surrogate = False

    # P2
    for char in text:
        # On UCS-2 builds a lead surrogate is held back and glued to the
        # character that follows it.
        if _IS_UCS2 and (_SURROGATE_MIN <= ord(char) <= _SURROGATE_MAX):
            pending_surrogate = char
            continue
        if pending_surrogate:
            char = pending_surrogate + char
            pending_surrogate = False

        # treat upper as RTL ?
        if upper_is_rtl and char.isupper():
            return 1

        strong_type = bidirectional(char)
        if strong_type in ('AL', 'R'):
            return 1
        if strong_type == 'L':
            return 0

    # P3
    return 0
144 |
def get_embedding_levels(text, storage, upper_is_rtl=False, debug=False):
    """Fill storage['chars'] with one record per character of `text`.

    Every record holds the character ('ch'), the paragraph base level read
    from storage['base_level'] ('level') and the character's bidi type,
    stored under both 'type' and 'orig'.

    With `upper_is_rtl` set, upper case characters are given type 'R'.
    With `debug` set, the storage is dumped once all records are added.
    """

    pending_surrogate = False
    base_level = storage['base_level']

    for char in text:
        # On UCS-2 builds a lead surrogate is held back and glued to the
        # character that follows it.
        if _IS_UCS2 and (_SURROGATE_MIN <= ord(char) <= _SURROGATE_MAX):
            pending_surrogate = char
            continue
        if pending_surrogate:
            char = pending_surrogate + char
            pending_surrogate = False

        if upper_is_rtl and char.isupper():
            char_type = 'R'
        else:
            char_type = bidirectional(char)

        storage['chars'].append(
            dict(ch=char, level=base_level, type=char_type, orig=char_type))

    if debug:
        debug_storage(storage, base_info=True)
169 |
def explicit_embed_and_overrides(storage, debug=False):
    """Apply X1 to X9 rules of the unicode algorithm.

    Walks storage['chars'] in order while tracking the current embedding
    level and directional override, pushing the previous pair on a stack
    for every accepted explicit code and popping it again on PDF.
    Afterwards the characters whose type is listed in X9_REMOVED are
    dropped from storage['chars'] and the level runs are recomputed with
    calc_level_runs(). When `debug` is true the storage is dumped.

    See http://unicode.org/reports/tr9/#Explicit_Levels_and_Directions

    """
    # Explicit codes seen while no further level could be opened; they are
    # counted so that their matching PDFs can be skipped later on.
    overflow_counter = almost_overflow_counter = 0
    directional_override = 'N'
    # Stack of (embedding_level, directional_override) pairs to restore.
    levels = deque()

    #X1
    embedding_level = storage['base_level']

    for _ch in storage['chars']:
        bidi_type = _ch['type']

        # level_func is None for everything but RLE, LRE, RLO and LRO
        level_func, override = X2_X5_MAPPINGS.get(bidi_type, (None, None))

        if level_func:
            # So this is X2 to X5
            # if we've past EXPLICIT_LEVEL_LIMIT, note it and do nothing

            if overflow_counter != 0:
                overflow_counter += 1
                continue

            new_level = level_func(embedding_level)
            if new_level < EXPLICIT_LEVEL_LIMIT:
                # remember what to restore on the matching PDF
                levels.append( (embedding_level, directional_override) )
                embedding_level, directional_override = new_level, override

            elif embedding_level == EXPLICIT_LEVEL_LIMIT -2:
                # The new level is invalid, but a valid level can still be
                # achieved if this level is 60 and we encounter an RLE or
                # RLO further on. So record that we 'almost' overflowed.
                almost_overflow_counter += 1

            else:
                overflow_counter += 1
        else:
            # X6: ordinary characters take the current level and, when an
            # override is active, the override's type
            if bidi_type not in X6_IGNORED:
                _ch['level'] = embedding_level
                if directional_override != 'N':
                    _ch['type'] = directional_override

            # X7: a PDF first cancels pending overflows and only then pops
            # the stack
            elif bidi_type == 'PDF':
                if overflow_counter:
                    overflow_counter -= 1
                elif almost_overflow_counter and \
                        embedding_level != EXPLICIT_LEVEL_LIMIT - 1:
                    almost_overflow_counter -= 1
                elif levels:
                    embedding_level, directional_override = levels.pop()

            # X8: a paragraph separator resets all explicit state
            elif bidi_type == 'B':
                levels.clear()
                overflow_counter = almost_overflow_counter = 0
                embedding_level = _ch['level'] = storage['base_level']
                directional_override = 'N'

    #Removes the explicit embeds and overrides of types
    #RLE, LRE, RLO, LRO, PDF, and BN. Adjusts extended chars
    #next and prev as well

    #Applies X9. See http://unicode.org/reports/tr9/#X9
    storage['chars'] = [_ch for _ch in storage['chars']\
                        if _ch['type'] not in X9_REMOVED]

    # the runs have to be rebuilt, characters were removed and levels changed
    calc_level_runs(storage)

    if debug:
        debug_storage(storage, runs=True)
245 |
def calc_level_runs(storage):
    """Rebuild storage['runs'] from the levels found in storage['chars'].

    A run is a maximal stretch of consecutive characters sharing one
    level. Each run is stored as a dict with its 'start' index, 'length',
    the 'type' of its last character and the 'sor'/'eor' boundary types.

    Applies X10. See http://unicode.org/reports/tr9/#X10
    """

    def boundary_type(level_a, level_b):
        # The boundary type depends on the higher of the two levels on
        # either side of the boundary: R when it is odd, L otherwise.
        return ['L', 'R'][max(level_a, level_b) % 2]

    runs = storage['runs']
    runs.clear()
    entries = storage['chars']

    # nothing to split for an empty string
    if not entries:
        return

    base_level = storage['base_level']
    head = entries[0]

    # the paragraph base level acts as the neighbour of the first run
    sor = boundary_type(base_level, head['level'])
    run_start = 0
    run_length = 0
    prev_level = head['level']
    prev_type = head['type']

    for entry in entries:
        level = entry['level']

        if level != prev_level:
            # level changed: close the run collected so far
            eor = boundary_type(prev_level, level)
            runs.append({'sor': sor, 'eor': eor, 'start': run_start,
                         'type': prev_type, 'length': run_length})
            sor = eor
            run_start += run_length
            run_length = 1
        else:
            run_length += 1

        prev_level = level
        prev_type = entry['type']

    # close the last run against the paragraph base level
    runs.append({'sor': sor,
                 'eor': boundary_type(prev_level, base_level),
                 'start': run_start,
                 'type': prev_type,
                 'length': run_length})
292 |
def resolve_weak_types(storage, debug=False):
    """Resolve weak types, rules W1 - W7, separately for every level run.

    The 'type' of the records in storage['chars'] is updated in place.
    When `debug` is true the storage is dumped afterwards.

    See: http://unicode.org/reports/tr9/#Resolving_Weak_Types

    """

    for run in storage['runs']:
        sor = run['sor']
        first = run['start']
        # The slice is a new list, but it holds the very same records, so
        # type changes made below show up in storage['chars'].
        run_chars = storage['chars'][first:first + run['length']]
        count = len(run_chars)

        last_strong = last_type = sor
        for entry in run_chars:
            current = entry['type']

            # W1. A nonspacing mark takes the type of the previous
            # character, or of sor at the start of the level run.
            if current == 'NSM':
                current = entry['type'] = last_type

            # W2. A European number whose nearest preceding strong type
            # (R, L, AL or sor) is AL becomes an Arabic number.
            if current == 'EN' and last_strong == 'AL':
                entry['type'] = 'AN'

            if current in ('R', 'L', 'AL'):
                last_strong = current

            last_type = entry['type']

        # W3. Change all ALs to R
        for entry in run_chars:
            if entry['type'] == 'AL':
                entry['type'] = 'R'

        # W4. A single European separator between two European numbers
        # becomes a European number. A single common separator between
        # two numbers of the same type becomes that type.
        for pos in range(1, count - 1):
            before = run_chars[pos - 1]['type']
            after = run_chars[pos + 1]['type']
            if before != after:
                continue

            middle = run_chars[pos]
            if middle['type'] == 'ES' and before == 'EN':
                middle['type'] = 'EN'
            elif middle['type'] == 'CS' and before in ('AN', 'EN'):
                middle['type'] = before

        # W5. A sequence of European terminators adjacent to European
        # numbers changes to all European numbers.
        for pos, entry in enumerate(run_chars):
            if entry['type'] != 'EN':
                continue

            left = pos - 1
            while left >= 0 and run_chars[left]['type'] == 'ET':
                run_chars[left]['type'] = 'EN'
                left -= 1

            right = pos + 1
            while right < count and run_chars[right]['type'] == 'ET':
                run_chars[right]['type'] = 'EN'
                right += 1

        # W6. Otherwise, separators and terminators change to Other Neutral.
        for entry in run_chars:
            if entry['type'] in ('ET', 'ES', 'CS'):
                entry['type'] = 'ON'

        # W7. A European number whose nearest preceding strong type
        # (R, L or sor) is L becomes L.
        last_strong = sor
        for entry in run_chars:
            if entry['type'] == 'EN' and last_strong == 'L':
                entry['type'] = 'L'

            if entry['type'] in ('L', 'R'):
                last_strong = entry['type']

    if debug:
        debug_storage(storage, runs=True)
380 |
def resolve_neutral_types(storage, debug):
    """Resolve neutral types (rules N1 and N2) for every level run.

    The 'type' of the neutral records in storage['chars'] is updated in
    place. When `debug` is true the storage is dumped afterwards.

    See: http://unicode.org/reports/tr9/#Resolving_Neutral_Types

    """

    for run in storage['runs']:
        first = run['start']

        # Frame the run with sor and eor stand-ins so that neutrals at
        # either end of the run still have a neighbour on both sides.
        window = [{'type': run['sor']}]
        window.extend(storage['chars'][first:first + run['length']])
        window.append({'type': run['eor']})

        neutral_from = None
        for pos, entry in enumerate(window):
            if entry['type'] in ('B', 'S', 'WS', 'ON'):
                # a sequence of neutrals starts (or simply continues)
                if neutral_from is None:
                    neutral_from = pos
                    before = window[pos - 1]['type']
                continue

            if neutral_from is None:
                continue

            # first non-neutral after a sequence of neutrals
            after = entry['type']

            # European and Arabic numbers act as R on the neutrals
            # around them.
            if before in ('AN', 'EN'):
                before = 'R'
            if after in ('AN', 'EN'):
                after = 'R'

            for neutral in window[neutral_from:pos]:
                if before == after:
                    # N1. Neutrals take the direction of the surrounding
                    # text when both sides agree.
                    neutral['type'] = before
                else:
                    # N2. Any remaining neutrals take the embedding
                    # direction: L for an even level, R for an odd one.
                    neutral['type'] = _embedding_direction(neutral['level'])

            neutral_from = None

    if debug:
        debug_storage(storage)
434 |
def resolve_implicit_levels(storage, debug):
    """Resolve implicit levels (rules I1 and I2) for every level run.

    The 'level' of the records in storage['chars'] is raised in place
    according to their resolved type. When `debug` is true the storage is
    dumped afterwards.

    See: http://unicode.org/reports/tr9/#Resolving_Implicit_Levels

    """
    for run in storage['runs']:
        first = run['start']

        for entry in storage['chars'][first:first + run['length']]:
            resolved = entry['type']

            # only those types are allowed at this stage
            assert resolved in ('L', 'R', 'EN', 'AN'), \
                '%s not allowed here' % resolved

            if _embedding_direction(entry['level']) == 'L':
                # I1. On an even (left-to-right) level, R goes up one
                # level and AN or EN go up two levels.
                if resolved == 'R':
                    entry['level'] += 1
                elif resolved != 'L':
                    entry['level'] += 2
            elif resolved != 'R':
                # I2. On an odd (right-to-left) level, L, EN and AN go
                # up one level.
                entry['level'] += 1

    if debug:
        debug_storage(storage, runs=True)
466 |
def reverse_contiguous_sequence(chars, line_start, line_end, highest_level,
                                lowest_odd_level):
    """L2. From the highest level found in the text to the lowest odd
    level on each line, including intermediate levels not actually
    present in the text, reverse any contiguous sequence of characters
    that are at that level or higher.

    `chars` is the list of character records (dicts with a 'level' key);
    it is reordered in place. `line_start` and `line_end` are the
    inclusive index bounds of the line inside `chars`. `highest_level`
    and `lowest_odd_level` bound the levels to process, both inclusive.

    """
    for level in range(highest_level, lowest_odd_level - 1, -1):
        # Index where the current sequence of `level`-or-higher characters
        # began, or None while outside such a sequence. Index 0 is a valid
        # start, so every test is against None and never a truth test.
        seq_start = None

        for idx in range(line_start, line_end + 1):
            if chars[idx]['level'] >= level:
                if seq_start is None:
                    seq_start = idx
            elif seq_start is not None:
                # the sequence ended just before idx
                chars[seq_start:idx] = chars[seq_start:idx][::-1]
                seq_start = None

        # a sequence that runs up to the end of the line
        if seq_start is not None:
            chars[seq_start:line_end + 1] = \
                chars[seq_start:line_end + 1][::-1]
496 |
497 |
def reorder_resolved_levels(storage, debug):
    """Apply rules L1 and L2 to ``storage['chars']`` in place.

    L1 resets the level of separators and of the whitespace that precedes
    them (or ends the text) to ``storage['base_level']``. L2 then reverses
    the character sequences of each line via
    ``reverse_contiguous_sequence``.
    """
    chars = storage['chars']

    # L1. On each line, reset the embedding level of the following
    # characters to the paragraph embedding level. Scanning backwards
    # lets us tell whether whitespace is trailing.
    trailing = True
    for entry in reversed(chars):
        kind = entry['orig']
        if kind in ('B', 'S'):
            # 1. Segment separators,
            # 2. Paragraph separators,
            entry['level'] = storage['base_level']
            trailing = True
        elif trailing and kind in ('BN', 'WS'):
            # 3. Any sequence of whitespace characters preceding a segment
            #    separator or paragraph separator
            # 4. Any sequence of white space characters at the end of the
            #    line.
            entry['level'] = storage['base_level']
        else:
            trailing = False

    # L2 is applied per line; the highest level and the lowest odd level
    # of the current line are tracked while scanning.
    last_idx = len(chars) - 1
    line_start = 0
    top_level = 0
    low_odd_level = EXPLICIT_LEVEL_LIMIT

    # The reversal below only rearranges positions up to `idx` and never
    # changes the list length, so iterating the live list is safe.
    for idx, entry in enumerate(chars):
        level = entry['level']
        top_level = max(top_level, level)
        if level % 2:
            low_odd_level = min(low_odd_level, level)

        is_break = entry['orig'] == 'B'
        if not (is_break or idx == last_idx):
            continue

        # The line break itself is left out of the reordering.
        line_end = idx - 1 if is_break else idx
        reverse_contiguous_sequence(chars, line_start, line_end,
                                    top_level, low_odd_level)

        # Start tracking afresh for the next line.
        line_start = idx + 1
        top_level = 0
        low_odd_level = EXPLICIT_LEVEL_LIMIT

    if debug:
        debug_storage(storage)
559 |
560 |
def apply_mirroring(storage, debug):
    """Apply rule L4 (mirroring) to ``storage['chars']`` in place.

    See: http://unicode.org/reports/tr9/#L4

    """
    # L4. A character is depicted by a mirrored glyph if and only if (a) the
    # resolved directionality of that character is R, and (b) the
    # Bidi_Mirrored property value of that character is true.
    for entry in storage['chars']:
        glyph = entry['ch']

        if not mirrored(glyph):
            continue
        if _embedding_direction(entry['level']) != 'R':
            continue

        # Characters without an entry in MIRRORED are kept unchanged.
        entry['ch'] = MIRRORED.get(glyph, glyph)

    if debug:
        debug_storage(storage)
578 |
def get_empty_storage():
    """Build a fresh, empty storage skeleton (handy for tests).

    The result has 'base_level' and 'base_dir' set to None, an empty
    'chars' list and an empty 'runs' deque.
    """
    skeleton = dict.fromkeys(('base_level', 'base_dir'))
    skeleton['chars'] = []
    skeleton['runs'] = deque()
    return skeleton
587 |
588 |
def get_display(unicode_or_str, encoding='utf-8', upper_is_rtl=False,
                base_dir=None, debug=False):
    """Return the display (visual) layout of the given text.

    A `str` argument is processed as is. Any other argument is first
    decoded with `encoding` (default: "utf-8"), and the result is encoded
    back with the same `encoding` before being returned.

    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
    for debugging (default: False).

    Set `base_dir` to 'L' or 'R' to override the calculated base_level.

    Set `debug` to True to have each step of the algorithm report its
    state through the debug helpers.

    """
    storage = get_empty_storage()

    # The algorithm works on text, so anything that is not `str` is
    # decoded here and encoded again on the way out.
    decoded = not isinstance(unicode_or_str, str)
    text = unicode_or_str.decode(encoding) if decoded else unicode_or_str

    if base_dir is not None:
        base_level = PARAGRAPH_LEVELS[base_dir]
    else:
        base_level = get_base_level(text, upper_is_rtl)

    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    get_embedding_levels(text, storage, upper_is_rtl, debug)

    # The remaining steps share one signature and must run in this order.
    pipeline = (
        explicit_embed_and_overrides,
        resolve_weak_types,
        resolve_neutral_types,
        resolve_implicit_levels,
        reorder_resolved_levels,
        apply_mirroring,
    )
    for step in pipeline:
        step(storage, debug)

    display = u''.join(entry['ch'] for entry in storage['chars'])

    return display.encode(encoding) if decoded else display
639 |
--------------------------------------------------------------------------------