├── LineBreakSample
├── LineBreakSample.opensdf
├── LineBreakSample.sdf
├── LineBreakSample.v11.suo
├── LineBreakSample
│ ├── stdafx.h
│ ├── stdafx.cpp
│ ├── targetver.h
│ ├── LineBreakSample.cpp
│ ├── ReadMe.txt
│ ├── LineBreakSample.vcxproj.filters
│ ├── LineBreak.h
│ ├── LineBreakSample.vcxproj
│ └── LineBreak.cpp
├── ipch
│ ├── linebreaksample-4a8ae08d
│ │ └── linebreaksample-cbf7993b.ipch
│ └── linebreaksample-656b7c4d
│ │ └── linebreaksample-cbf7993b.ipch
└── LineBreakSample.sln
├── .gitignore
├── README.md
└── LICENSE
/LineBreakSample/LineBreakSample.opensdf:
--------------------------------------------------------------------------------
1 | m a r l o n l u M A R L O N L U - P C 0
--------------------------------------------------------------------------------
/LineBreakSample/LineBreakSample.sdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/LineBreakSample.sdf
--------------------------------------------------------------------------------
/LineBreakSample/LineBreakSample.v11.suo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/LineBreakSample.v11.suo
--------------------------------------------------------------------------------
/LineBreakSample/LineBreakSample/stdafx.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/LineBreakSample/stdafx.h
--------------------------------------------------------------------------------
/LineBreakSample/LineBreakSample/stdafx.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/LineBreakSample/stdafx.cpp
--------------------------------------------------------------------------------
/LineBreakSample/LineBreakSample/targetver.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/LineBreakSample/targetver.h
--------------------------------------------------------------------------------
/LineBreakSample/LineBreakSample/LineBreakSample.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/LineBreakSample/LineBreakSample.cpp
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled Object files
2 | *.slo
3 | *.lo
4 | *.o
5 |
6 | # Compiled Dynamic libraries
7 | *.so
8 | *.dylib
9 |
10 | # Compiled Static libraries
11 | *.lai
12 | *.la
13 | *.a
14 |
--------------------------------------------------------------------------------
/LineBreakSample/ipch/linebreaksample-4a8ae08d/linebreaksample-cbf7993b.ipch:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/ipch/linebreaksample-4a8ae08d/linebreaksample-cbf7993b.ipch
--------------------------------------------------------------------------------
/LineBreakSample/ipch/linebreaksample-656b7c4d/linebreaksample-cbf7993b.ipch:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/ipch/linebreaksample-656b7c4d/linebreaksample-cbf7993b.ipch
--------------------------------------------------------------------------------
/LineBreakSample/LineBreakSample.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 2012
4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LineBreakSample", "LineBreakSample\LineBreakSample.vcxproj", "{BD60E6CB-A246-4DEF-B3F7-56B381692F78}"
5 | EndProject
6 | Global
7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
8 | Debug|Win32 = Debug|Win32
9 | Release|Win32 = Release|Win32
10 | EndGlobalSection
11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
12 | {BD60E6CB-A246-4DEF-B3F7-56B381692F78}.Debug|Win32.ActiveCfg = Debug|Win32
13 | {BD60E6CB-A246-4DEF-B3F7-56B381692F78}.Debug|Win32.Build.0 = Debug|Win32
14 | {BD60E6CB-A246-4DEF-B3F7-56B381692F78}.Release|Win32.ActiveCfg = Release|Win32
15 | {BD60E6CB-A246-4DEF-B3F7-56B381692F78}.Release|Win32.Build.0 = Release|Win32
16 | EndGlobalSection
17 | GlobalSection(SolutionProperties) = preSolution
18 | HideSolutionNode = FALSE
19 | EndGlobalSection
20 | EndGlobal
21 |
--------------------------------------------------------------------------------
/LineBreakSample/LineBreakSample/ReadMe.txt:
--------------------------------------------------------------------------------
1 | ========================================================================
2 | 控制台应用程序:LineBreakSample 项目概述
3 | ========================================================================
4 |
5 | 应用程序向导已为您创建了此 LineBreakSample 应用程序。
6 |
7 | 本文件概要介绍组成 LineBreakSample 应用程序的每个文件的内容。
8 |
9 |
10 | LineBreakSample.vcxproj
11 | 这是使用应用程序向导生成的 VC++ 项目的主项目文件,其中包含生成该文件的 Visual C++ 的版本信息,以及有关使用应用程序向导选择的平台、配置和项目功能的信息。
12 |
13 | LineBreakSample.vcxproj.filters
14 | 这是使用“应用程序向导”生成的 VC++ 项目筛选器文件。它包含有关项目文件与筛选器之间的关联信息。在 IDE 中,通过这种关联,在特定节点下以分组形式显示具有相似扩展名的文件。例如,“.cpp”文件与“源文件”筛选器关联。
15 |
16 | LineBreakSample.cpp
17 | 这是主应用程序源文件。
18 |
19 | /////////////////////////////////////////////////////////////////////////////
20 | 其他标准文件:
21 |
22 | StdAfx.h, StdAfx.cpp
23 | 这些文件用于生成名为 LineBreakSample.pch 的预编译头 (PCH) 文件和名为 StdAfx.obj 的预编译类型文件。
24 |
25 | /////////////////////////////////////////////////////////////////////////////
26 | 其他注释:
27 |
28 | 应用程序向导使用“TODO:”注释来指示应添加或自定义的源代码部分。
29 |
30 | /////////////////////////////////////////////////////////////////////////////
31 |
--------------------------------------------------------------------------------
/LineBreakSample/LineBreakSample/LineBreakSample.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hpp;hxx;hm;inl;inc;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | 头文件
23 |
24 |
25 | 头文件
26 |
27 |
28 | 头文件
29 |
30 |
31 |
32 |
33 | 源文件
34 |
35 |
36 | 源文件
37 |
38 |
39 | 源文件
40 |
41 |
42 |
--------------------------------------------------------------------------------
/LineBreakSample/LineBreakSample/LineBreak.h:
--------------------------------------------------------------------------------
1 | #ifndef _LINEBRK_H_ // include guard: must test the same macro that the next line defines
2 | #define _LINEBRK_H_
3 |
4 | /*
5 | 此换行算法核心规则来源于Unicode Line Breaking Algorithm (http://www.unicode.org/unicode/reports/tr14/),
6 | 由于英语换行有明确的规则,Unicode Line Breaking Algorithm预先定义规则,并利用表格驱动方法来判断字符是否可以换行,
7 |
8 | 具体代码绝大部分借鉴于http://www.unicode.org/Public/PROGRAMS/LineBreakSampleCpp/5.2.0/
9 | 作者仅修复其中不合理BUG并在此基础上添加对中文以及中文全角标点符号支持
10 | 中文全角字符的判断区间函数isIdeographic参考 Android 源代码 StaticLayout.java中的函数
11 | private static final boolean isIdeographic(char c, boolean includeNonStarters)
12 |
13 |
14 | 未经过大量测试,中英文 半角/全角符号 数学表达式换行比较完美,对于同样是CJK表意文字的韩语 日文支持良好
15 | @author marlonlu
16 | @date: 2014.2.28
17 | */
18 |
19 | #include
20 | #include
21 |
22 | namespace LINE_BREAK
23 | {
24 | typedef unsigned long u32; // NOTE(review): name suggests 32 bits, but unsigned long is 64 bits on LP64 targets -- confirm intended width
25 |
26 | // Text types used by the line breaker: wide strings and wide characters.
27 | typedef std::wstring UString;
28 | typedef wchar_t WChar;
29 | // TCHAR/LPTSTR are preprocessor macros (not typedefs): they are not confined to this namespace, and "const LPTSTR" expands to "const WChar*".
30 | #define TCHAR WChar
31 | #define LPTSTR WChar*
32 | // classifyLnBrk() classifies characters strictly greater than this value via isIdeographic() instead of LBClassFromCh().
33 | #define CHAR_FIRST_CJK 0x2E80
34 | // Line break classes. Enumerator order is significant: the pair table brkPairs is declared with JT+1 columns, so OP..JT are its indices.
35 | enum break_class
36 | {
37 | // input types
38 | OP = 0, // open
39 | CL, // closing punctuation
40 | CP, // closing parentheses (from 5.2.0) (before 5.2.0 treat like CL)
41 | QU, // quotation
42 | GL, // glue
43 | NS, // no-start
44 | EX, // exclamation/interrogation
45 | SY, // Syntax (slash)
46 | IS, // infix (numeric) separator
47 | PR, // prefix
48 | PO, // postfix
49 | NU, // numeric
50 | AL, // alphabetic
51 | ID, // ideograph (atomic)
52 | IN, // inseparable
53 | HY, // hyphen
54 | BA, // break after
55 | BB, // break before
56 | B2, // break both
57 | ZW, // ZW space
58 | CM, // combining mark
59 | WJ, // word joiner
60 |
61 | // used for Korean Syllable Block pair table
62 | H2, // Hangul 2 Jamo Syllable
63 | H3, // Hangul 3 Jamo Syllable
64 | JL, // Jamo leading consonant
65 | JV, // Jamo vowel
66 | JT, // Jamo trailing consonant (last class that has a row/column in the pair table)
67 |
68 | // these are not handled in the pair tables
69 | SA, // South (East) Asian
70 | SP, // space
71 | PS, // paragraph and line separators
72 | BK, // hard break (newline)
73 | CR, // carriage return
74 | LF, // line feed
75 | NL, // next line
76 | CB, // contingent break opportunity (classifyLnBrk() remaps this to B2)
77 | SG, // surrogate
78 | AI, // ambiguous (classifyLnBrk() remaps this to AL)
79 | XX, // unknown (classifyLnBrk() remaps this to AL)
80 | };
81 |
82 |
83 | // Break actions are the types of break opportunities that may occur at a particular
84 | // point in the input. Values for these are also needed in the UI portion of the code
85 | // so they are already defined here - for explanation see below in the line break
86 | // section.
87 | enum break_action
88 | {
89 | DIRECT_BRK, // break allowed
90 | INDIRECT_BRK, // only break across space
91 | COMBINING_INDIRECT_BRK, // indirect break for combining marks
92 | COMBINING_PROHIBITED_BRK, // prohibited break for combining marks
93 | PROHIBITED_BRK, // no break allowed
94 | EXPLICIT_BRK, // NOTE(review): presumably a mandatory break (hard line ending) -- confirm in findLineBrk
95 | HANGUL_SPACE_BRK, // break allowed, except when spaces are used with Hangul (not used)
96 | };
97 | // Classification and break-finding entry points. classifyLnBrk writes pcls[0..cch); NOTE(review): the other definitions are not shown here -- confirm their array contracts.
98 | int classifyLnBrk(const LPTSTR pszText, break_class * pcls, int cch); // stores the line break class of each of the cch characters of pszText into pcls
99 | int findLineBrk(break_class *pcls, break_action *pbrk, int cch); // NOTE(review): presumably fills pbrk with the break action for each position -- confirm
100 | int findComplexBreak(break_class cls, break_class *pcls, break_action *pbrk, int cch); // NOTE(review): purpose not evident from the declaration -- see definition
101 |
102 | break_class LBClassFromCh(TCHAR ch); // per-character class lookup; classifyLnBrk uses it for characters outside its CJK and curly-quote special cases
103 | int CharFromVisible(int ch); // NOTE(review): purpose not evident from the declaration -- see definition
104 | bool isIdeographic(TCHAR c); // classifyLnBrk maps characters above CHAR_FIRST_CJK to ID when this returns true, NS otherwise
105 |
106 | }
107 |
108 | #endif // ifndef _LINEBRK_H_
109 |
--------------------------------------------------------------------------------
/LineBreakSample/LineBreakSample/LineBreakSample.vcxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | Win32
7 |
8 |
9 | Release
10 | Win32
11 |
12 |
13 |
14 | {BD60E6CB-A246-4DEF-B3F7-56B381692F78}
15 | Win32Proj
16 | LineBreakSample
17 |
18 |
19 |
20 | Application
21 | true
22 | v110
23 | Unicode
24 |
25 |
26 | Application
27 | false
28 | v110
29 | true
30 | Unicode
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 | true
44 |
45 |
46 | false
47 |
48 |
49 |
50 | NotUsing
51 | Level3
52 | Disabled
53 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
54 | true
55 |
56 |
57 | Console
58 | true
59 |
60 |
61 |
62 |
63 | Level3
64 | Use
65 | MaxSpeed
66 | true
67 | true
68 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
69 | true
70 |
71 |
72 | Console
73 | true
74 | true
75 | true
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 | Create
91 | Create
92 |
93 |
94 |
95 |
96 |
97 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | LineBreak
2 | =========
3 |
4 | 国内游戏对于中英文混杂的字符串在自动断行上不够完善,根据网上Unicode Line Break算法重新完善,支持中英文 数字 全角半角标点符号自动换行
5 |
6 |
7 | #### Unicode Line Breaking Algorithm 实现
8 |
9 | ##### 算法背景:
10 |
11 | 由于公司内游戏对国际化支持不够友好,其中文字断行方面还不够完善,仅仅支持空格以及换行符,并未考虑中文全角标点符号、英语以及数字混合等复杂情况,本算法正是为了解决中文、英语、标点符号以及数学表达式的自动换行问题。
12 |
13 | ##### 算法综述:
14 |
15 | 此算法基于http://www.unicode.org/reports/tr14/规则并参考http://www.unicode.org/Public/PROGRAMS/LineBreakSampleCpp/5.2.0/实现。
16 |
17 | 由于英语换行是有明确规则,Unicode Line Breaking Algorithm上明确定义了所有英语换行规则,例如:
18 |
19 | 1.英语字母与空格一起时允许在空格后面换行
20 |
21 | 2.当英语字母处于连字符-前面时不允许换行,但允许在-后面换行,同时如果-后面是数字的话则不允许换行。
22 |
23 | 3.当左括号(后面接着英语字母或者数字时不允许换行,同时当英语字母或者数字紧挨着右括号)时不允许换行。
24 |
25 |
26 | 因此Unicode Line Breaking Algorithm先把所有字符归类,然后根据一定的规则判断字符间是否可以换行,算法主要用途在于解析出给定字符串可以换行的地方。
27 |
28 | ###### 一 字符归类
29 |
30 | 因此Unicode Line Breaking Algorithm首先遍历需要判断断行的字符串,并将其中的每个字符归入对应的类别:
31 |
32 | 1.大小写字母为AL,数字为NU,CJK表意文字(中文 韩文 日文)则为ID,空格则为SP
33 |
34 | 2.( 左括号 左引号则为OP, ) 则为CP
35 | ………
36 |
37 |
38 |
39 | 下表则是对ASCII编码 00—7F的字符归类处理
40 | ```
41 | break_class LnBrkClassFromChar[] =
42 | {
43 | // treat CB as BB for demo purposes
44 | // 0 1 2 3 4 5 6 7 8 9 a b c d e f
45 | AL, ZW, GL, GL, BA, GL, AL, B2, IN, BA, LF, CB, AL, CR, NL, AL, // 00-0f
46 | AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 10-1f
47 |
48 | // ' ' ! " $ % & ' ( ) * + , - . /
49 | SP, EX, QU, IN, PR, PO, BB, QU, OP, CP, BA, PR, IN, HY, IN, SY, // 20-2f
50 | // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
51 | NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NS, AL, AL, GL, AL, EX, // 30-3f
52 |
53 | // @, A B C D E F G H I J K L M N O
54 | CB, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 40-4f
55 | AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, OP, AL, CP, AL, IS, // 50-5f ... [ \ ] ^ _
56 | CM, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 60-6f
57 | AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, OP, AL, CL, AL, SA, // 70-7f ... { | } ~ DEL
58 | // p q r s t u v w x y z
59 | };
60 | ```
61 |
62 |
63 | ###### 二 字符断行判断
64 |
65 | 当预处理字符后,则根据在http://www.unicode.org/reports/tr14/ 预先定义好的规则进行判断两个字符中间是否可以断行,例如左括号OP紧挨着字母AL时,不允许断行
66 |
67 | 核心规则表:
68 |
69 | ```
70 | break_action brkPairs[][JT+1]=
71 | { // --- 'after' class ------
72 | // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
73 | // OP, CL, CP, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ, H2, H3, JL, JV, JT, = after class
74 | /*OP*/ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, CC, XX, XX, XX, XX, XX, XX, // OP open
75 | /*CL*/ oo, XX, XX, SS, SS, XX, XX, XX, XX, SS, SS, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CL close
76 | /*CP*/ oo, XX, XX, SS, SS, XX, XX, XX, XX, SS, SS, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CP closing parenthesis
77 | /*QU*/ XX, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // QU quotation
78 | /*GL*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // GL glue
79 | /*NS*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // NS no-start
80 | /*EX*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // EX exclamation/interrogation
81 | /*SY*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // SY Syntax (slash)
82 | /*IS*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // IS infix (numeric) separator
83 | /*PR*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, SS, oo, SS, SS, oo, oo, XX, cc, XX, SS, SS, SS, SS, SS, // PR prefix
84 | /*PO*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // PO postfix
85 |
86 | // Version 5.2.0 and higher
87 | /*NU*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // NU numeric
88 | /*AL*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // AL alphabetic
89 |
90 | /*ID*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // ID ideograph (atomic)
91 | /*IN*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // IN inseparable
92 |
93 | /*HY*/ oo, XX, XX, SS, oo, SS, XX, XX, XX, oo, oo, SS, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // HY hyphens and spaces
94 |
95 | /*BA*/ oo, XX, XX, SS, oo, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // BA break after
96 | /*BB*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // BB break before
97 | /*B2*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, XX, XX, cc, XX, oo, oo, oo, oo, oo, // B2 break either side, but not pair
98 | /*ZW*/ oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, XX, oo, oo, oo, oo, oo, oo, oo, // ZW zero width space
99 | /*CM*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CM combining mark
100 | /*WJ*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // WJ word joiner
101 |
102 | /*H2*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, SS, SS, // Hangul 2 Jamo syllable
103 | /*H3*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, SS, // Hangul 3 Jamo syllable
104 | /*JL*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, SS, SS, SS, SS, oo, // Jamo Leading Consonant
105 | /*JV*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, SS, SS, // Jamo Vowel
106 | /*JT*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, SS, // Jamo Trailing Consonant
107 |
108 | };
109 | ```
110 |
111 | 其中规则表里定义的动作(action):
112 | ```
113 | // Define some short-cuts for the table
114 | #define oo DIRECT_BRK // '_' break allowed
115 | #define SS INDIRECT_BRK // '%' only break across space (aka 'indirect break' below)
116 | #define cc COMBINING_INDIRECT_BRK // '#' indirect break for combining marks
117 | #define CC COMBINING_PROHIBITED_BRK // '@' prohibited break for combining marks
118 | #define XX PROHIBITED_BRK // '^' no break allowed_BRK
119 | #define xS HANGUL_SPACE_BRK // break allowed, except when spaces are used with Hangul (not used)
120 | ```
121 |
122 | oo 表示可以直接断行
123 | SS 表示如果两个字符间有空格则允许断行,否则不允许断行,例如两个英语字母之间不允许断行
124 | XX 任何情况下都不允许断行
125 |
126 | ###### 三 中文CJK判断
127 |
128 | 这部分实现主要参考 Android 源代码 StaticLayout.java中的函数
129 | private static final boolean isIdeographic(char c, boolean includeNonStarters)
130 |
131 | CJK文字主要Unicode范围判断则可以参考此文档
132 | 完整的CJK Unicode范围(5.0版)
133 |
134 |
135 | ###### 四 版权说明
136 |
137 | 此换行算法核心规则来源于Unicode Line Breaking Algorithm (http://www.unicode.org/unicode/reports/tr14/),
138 | 由于英语换行有明确的规则,Unicode Line Breaking Algorithm预先定义规则,并利用表格驱动方法来判断字符是否可以换行,
139 |
140 | 具体代码绝大部分借鉴于
141 | http://www.unicode.org/Public/PROGRAMS/LineBreakSampleCpp/5.2.0/,作者仅修复其中不合理BUG并在此基础上添加对中文以及中文全角标点符号支持
142 | 中文全角字符的判断区间函数isIdeographic参考 Android 源代码 StaticLayout.java中的函数
143 | boolean isIdeographic(char c, boolean includeNonStarters)
144 |
145 | 未经过大量测试,中英文 半角/全角符号 数学表达式换行比较完美.
146 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/LineBreakSample/LineBreakSample/LineBreak.cpp:
--------------------------------------------------------------------------------
1 | #include "LineBreak.h"
2 |
3 |
4 | // Line Break Character Types
5 | // These correspond to the line break class values defined in UAX#14, Version
6 | // 5.0.0. In a real implementation, there would be a mapping from character
7 | // code to line break class value. In this demo version, the mapping is from
8 | // a pseudo alphabet to these line break classes. The actual line break algorithm
9 | // takes as input only line break classes, so, by changing the mapping from
10 | // pseudo alphabet to actual Unicode Characters, this demo could be adapted
11 | // for use in actual line breaking.
12 |
13 | namespace LINE_BREAK
14 | {
15 |
16 |
17 | break_class LnBrkClassFromChar[] =
18 | {
19 | // Line break class per character code 0x00-0x7f, 16 entries per row. Original note: "treat CB as BB for demo purposes" -- NOTE(review): CB is stored unchanged here and classifyLnBrk() remaps CB to B2, so that note looks stale.
20 | // 0 1 2 3 4 5 6 7 8 9 a b c d e f
21 | AL, ZW, GL, GL, BA, GL, AL, B2, IN, BA, LF, CB, AL, CR, NL, AL, // 00-0f
22 | AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 10-1f
23 | // NOTE(review): several punctuation entries below follow the demo's pseudo alphabet (see the file header comment) rather than the characters' real Unicode classes -- confirm before reuse.
24 | // ' ' ! " # $ % & ' ( ) * + , - . /
25 | SP, EX, QU, IN, PR, PO, BB, QU, OP, CP, BA, PR, IN, HY, IN, SY, // 20-2f
26 | // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
27 | NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NS, AL, AL, GL, AL, EX, // 30-3f
28 |
29 | // @ A B C D E F G H I J K L M N O
30 | CB, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 40-4f
31 | AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, OP, AL, CP, AL, IS, // 50-5f ... [ \ ] ^ _
32 | CM, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 60-6f
33 | AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, OP, AL, CL, AL, SA, // 70-7f ... { | } ~ DEL
34 | // column legend for the first eleven entries of row 70-7f: p q r s t u v w x y z
35 | };
36 |
37 |
38 |
39 | /*---------------------------------------------------------------------------
40 | Function: classify
41 |
42 | Determines the character classes for all following
43 | passes of the algorithm
44 |
45 | This uses a pseudo alphabet as input - see the szExplain string
46 | above for a description. In a production version, this function
47 | would implement the line break property lookup for actual Unicode
48 | characters.
49 |
50 | Input: Text string
51 | Character count
52 |
53 | Output: Array of linebreak classes
54 |
55 | ----------------------------------------------------------------------------*/
56 | int classifyLnBrk(const LPTSTR pszText, break_class * pcls, int cch)
57 | {
58 | int ich;
59 | for (ich = 0; ich < cch; ich++)
60 | {
61 | if(pszText[ich] > CHAR_FIRST_CJK)
62 | {
63 | pcls[ich] = isIdeographic(pszText[ich]) ? ID : NS;
64 |
65 | //对于小括号 中括号 大括号进行特殊处理
66 | TCHAR c = pszText[ich];
67 | if(c == 0xFF08 || c == 0xFF3B || c == 0xFF5B) //小 中 大括号
68 | pcls[ich] = OP;
69 | else if(c == 0xFF09 || c == 0xFF3D || c == 0xFF5D)
70 | pcls[ich] = CP;
71 |
72 | continue;
73 | }
74 |
75 | TCHAR c = pszText[ich];
76 | //增加中文单引号 双引号支持
77 | if(c == 8216 || c == 8220)
78 | {
79 | pcls[ich] = OP;
80 | continue;
81 | }
82 |
83 | if(c == 8217 || c == 8221)
84 | {
85 | pcls[ich] = CP;
86 | continue;
87 | }
88 |
89 | pcls[ich] = LBClassFromCh(pszText[ich]);
90 |
91 | // map unknown, and ambiguous to AL by default
92 | if (pcls[ich] == XX || pcls[ich] == AI)
93 | pcls[ich] = AL;
94 |
95 | // map contingent break to B2 by default
96 | // this saves a row/col for CB in the table
97 | // but only approximates rule 20
98 | if (pcls[ich] == CB)
99 | pcls[ich] = B2;
100 |
101 | /* If the following remapping is enabled, all tests involving
102 | NL can be removed from the main loop below.
103 |
104 | // map NL to BK as there's no difference
105 | if (pcls[ich] == NL)
106 | pcls[ich] = BK;
107 | */
108 | }
109 | return ich;
110 | }
111 |
// Visible stand-ins for the special pseudo-alphabet codes, used for
// visualization. Entry i holds the Unicode symbol shown in place of
// special code i + 1 (codes 1..14). The leading label on each row is the
// class / code / name annotation carried over from the demo.
// NOTE(review): the label for code 9 says CM, while LnBrkClassFromChar
// lists BA at index 9 - confirm which one is intended.
int chVisibleFromSpecial[] =
{
    /* ZW  1 chZWSP     */ 0x2020, // U+2020 dagger
    /* GL  2 chZWNBSP   */ 0x2021, // U+2021 double dagger
    /* GL  3 chNBHY     */ 0x00AC, // U+00AC not sign
    /* BA  4 chSHY      */ 0x00B7, // U+00B7 middle dot
    /* GL  5 chNBSP     */ 0x2017, // U+2017 double low line
    /* --  6 chDummy1   */ 0x203E, // U+203E overline
    /* B2  7 chEM       */ 0x2014, // U+2014 em dash
    /* IN  8 chELLIPSIS */ 0x2026, // U+2026 horizontal ellipsis
    /* CM  9 chTB       */ 0x2310, // U+2310 reversed not sign
    /* LF 10 chLFx      */ 0x2580, // U+2580 upper half block
    /* CB 11 chOBJ      */ 0x2302, // U+2302 house
    /* -- 12 chdummy2   */ 0x2222, // U+2222 spherical angle
    /* CR 13 chCRx      */ 0x2584, // U+2584 lower half block
    /* NL 14 chNLx      */ 0x258C, // U+258C left half block
};

// Map a visible stand-in symbol back to its special character code.
// Returns the 1-based position of ch in chVisibleFromSpecial when ch is one
// of the stand-in symbols; any other value is returned unchanged.
int CharFromVisible(int ch)
{
    const int *const first = chVisibleFromSpecial;
    const int *const last  = first + sizeof chVisibleFromSpecial / sizeof chVisibleFromSpecial[0];

    for (const int *entry = first; entry != last; ++entry)
    {
        if (*entry == ch)
            return (int)(entry - first) + 1;
    }
    return ch;
}
143 |
144 | break_class LBClassFromCh(TCHAR ch)
145 | {
146 | ch = LINE_BREAK::CharFromVisible((int)ch);
147 | if (ch >= 0x7f)
148 | return XX;
149 | return LnBrkClassFromChar[ch];
150 | }
151 |
152 |
153 | //2 // === LINE BREAK DEFINITIONS ===================================================
154 |
// Define some short-cuts for the table
// NOTE: these macros are not #undef'd, so they stay active for the rest of
// this file. In particular, from here on XX expands to PROHIBITED_BRK,
// whereas the code above this point uses XX as a break_class value.
#define oo DIRECT_BRK                // '_' break allowed
#define SS INDIRECT_BRK              // '%' only break across space (aka 'indirect break' below)
#define cc COMBINING_INDIRECT_BRK    // '#' indirect break for combining marks
#define CC COMBINING_PROHIBITED_BRK  // '@' prohibited break for combining marks
#define XX PROHIBITED_BRK            // '^' no break allowed
#define xS HANGUL_SPACE_BRK          // break allowed, except when spaces are used with Hangul (not used)

// xS not yet assigned in the table below
164 |
165 | //2 // === LINE BREAK PAIR TABLE ===================================================
166 |
167 | // Line Break Pair Table corresponding to Table 2 of UAX#14, Version 5.0.0
168 | // plus Korean Syllable Block extensions - for details see that document
169 |
170 | // Additional rows added or replaced for versions 5.0.1, 5.1.0 or 5.2.0 as needed by conditional compilation
171 | // Additional column added for version 5.2.0 (CP). In earlier versions this acts identical to col for CL.
172 |
// Pair table: rows are the 'before' class, columns are the 'after' class.
// It is read as brkPairs[before][after] and has JT+1 columns.
break_action brkPairs[][JT+1]=
{   //     --- 'after' class ------
    //      1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27
    //     OP, CL, CP, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ, H2, H3, JL, JV, JT,   = after class
    /*OP*/ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, CC, XX, XX, XX, XX, XX, XX, // OP open
    /*CL*/ oo, XX, XX, SS, SS, XX, XX, XX, XX, SS, SS, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CL close
    /*CP*/ oo, XX, XX, SS, SS, XX, XX, XX, XX, SS, SS, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CP close parenthesis
    /*QU*/ XX, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // QU quotation
    /*GL*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // GL glue
    /*NS*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // NS no-start
    /*EX*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // EX exclamation/interrogation
    /*SY*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // SY Syntax (slash)
    /*IS*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // IS infix (numeric) separator
    /*PR*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, SS, oo, SS, SS, oo, oo, XX, cc, XX, SS, SS, SS, SS, SS, // PR prefix
    /*PO*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // PO postfix

    // Version 5.2.0 and higher
    /*NU*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // NU numeric
    /*AL*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // AL alphabetic

    /*ID*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // ID ideograph (atomic)
    /*IN*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // IN inseparable

    /*HY*/ oo, XX, XX, SS, oo, SS, XX, XX, XX, oo, oo, SS, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // HY hyphens and spaces

    /*BA*/ oo, XX, XX, SS, oo, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // BA break after
    /*BB*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // BB break before
    /*B2*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, XX, XX, cc, XX, oo, oo, oo, oo, oo, // B2 break either side, but not pair
    /*ZW*/ oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, XX, oo, oo, oo, oo, oo, oo, oo, // ZW zero width space
    /*CM*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CM combining mark
    /*WJ*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // WJ word joiner

    /*H2*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, SS, SS, // Hangul 2 Jamo syllable
    /*H3*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, SS, // Hangul 3 Jamo syllable
    /*JL*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, SS, SS, SS, SS, oo, // Jamo Leading Consonant
    /*JV*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, SS, SS, // Jamo Vowel
    /*JT*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, SS, // Jamo Trailing Consonant

};
212 |
213 | // handle spaces separately, all others by table
214 | // pcls - pointer to array of line breaking classes (input)
215 | // pbrk - pointer to array of line break opportunities (output)
216 | // cch - number of elements in the arrays (揷ount of characters? (input)
217 | // ich - current index into the arrays (variable) (returned value)
218 | // cls - current resolved line break class for 'before' character (variable)
219 | int findLineBrk(break_class *pcls, break_action *pbrk, int cch)
220 | {
221 | if (cch <= 0)
222 | return 0;
223 |
224 | break_class cls = pcls[0];
225 |
226 | // handle case where input starts with an LF
227 | if (cls == LF)
228 | cls = BK;
229 |
230 | // treat NL like BK
231 | if (cls == NL)
232 | cls = BK;
233 |
234 | // treat SP at start of input as if it followed WJ
235 | if (cls == SP)
236 | cls = WJ;
237 |
238 | // loop over all pairs in the string up to a hard break or CRLF pair
239 | int ich;
240 | for (ich = 1; (ich < cch) && (cls != BK) && (cls != CR || pcls[ich] == LF); ich++) {
241 |
242 | // handle spaces explicitly
243 | if (pcls[ich] == SP) {
244 | pbrk[ich-1] = PROHIBITED_BRK; // apply rule LB 7: ?SP
245 | continue; // do not update cls
246 | }
247 |
248 | if (pcls[ich] == BK || pcls[ich] == NL || pcls[ich] == LF) {
249 | pbrk[ich-1] = PROHIBITED_BRK;
250 | cls = BK;
251 | continue;
252 | }
253 |
254 | if (pcls[ich] == CR)
255 | {
256 | pbrk[ich-1] = PROHIBITED_BRK;
257 | cls = CR;
258 | continue;
259 | }
260 |
261 | // handle complex scripts in a separate function
262 | if (cls == SA || pcls[ich] == SA) {
263 | ich += findComplexBreak(cls, &pcls[ich-1], &pbrk[ich-1], cch - (ich-1));
264 | if (ich < cch)
265 | cls = pcls[ich];
266 | continue;
267 | }
268 |
269 | if(!(cls < SP) || !(pcls[ich] < SP))
270 | {
271 | continue;
272 | }
273 |
274 | // lookup pair table information in brkPairs[before, after];
275 | enum break_action brk = brkPairs[cls][pcls[ich]];
276 |
277 | pbrk[ich-1] = brk; // save break action in output array
278 |
279 | if (brk == INDIRECT_BRK) { // resolve indirect break
280 | if (pcls[ich - 1] == SP) // if context is A SP * B
281 | pbrk[ich - 1] = DIRECT_BRK; // break opportunity
282 | else // else
283 | pbrk[ich-1] = PROHIBITED_BRK; // no break opportunity
284 | } else if (brk == COMBINING_PROHIBITED_BRK) { // this is the case OP SP* CM
285 | pbrk[ich-1] = COMBINING_PROHIBITED_BRK; // no break allowed
286 | if (pcls[ich-1] != SP)
287 | continue; // apply rule 9: X CM* -> X
288 | } else if (brk == COMBINING_INDIRECT_BRK) { // resolve combining mark break
289 | pbrk[ich-1] = PROHIBITED_BRK; // don't break before CM
290 | if (pcls[ich-1] == SP){
291 | if (false) // new: SP is not a base
292 | pbrk[ich-1] = COMBINING_INDIRECT_BRK; // apply rule SP ?
293 | else
294 | {
295 | pbrk[ich-1] = PROHIBITED_BRK; // legacy: keep SP CM together
296 | if (ich > 1)
297 | pbrk[ich-2] = ((pcls[ich - 2] == SP) ? INDIRECT_BRK : DIRECT_BRK);
298 | }
299 | } else // apply rule 9: X CM * -> X
300 | continue; // don't update cls
301 | }
302 | cls = pcls[ich]; // save cls of current character
303 | }
304 | // always break at the end
305 | pbrk[ich-1] = EXPLICIT_BRK;
306 |
307 | return ich;
308 | }
309 |
310 | // placeholder function for complex break analysis
311 | // cls - last resolved line break class (this is !SA)
312 | // pcls - pointer to array of line breaking classes with pcls[0] == SA (input)
313 | // pbrk - pointer to array of line breaking opportunities (output)
314 | //
315 | int findComplexBreak(break_class cls, break_class *pcls, break_action *pbrk, int cch)
316 | {
317 | if (!cch)
318 | return 0;
319 |
320 | int ich;
321 | for (ich = 1; ich < cch; ich++) {
322 |
323 | // .. do complex break analysis here
324 | // and report any break opportunities in pbrk ..
325 |
326 | pbrk[ich-1] = PROHIBITED_BRK; // by default: no break
327 |
328 | if (pcls[ich] != SA)
329 | break;
330 | }
331 | return ich;
332 | }
333 |
334 |
335 | /**
336 | * Returns true if the specified character is one of those specified
337 | * as being Ideographic (class ID) by the Unicode Line Breaking Algorithm
338 | * (http://www.unicode.org/unicode/reports/tr14/), and is therefore OK
339 | * to break between a pair of.
340 | *
341 | * @see Android Source Code StaticLayout.Java (isIdeographic)
342 | */
343 | bool isIdeographic(WChar c) {
344 | if (c >= 0x2E80 && c <= 0x2FFF)
345 | {
346 | return true; // CJK, KANGXI RADICALS, DESCRIPTION SYMBOLS
347 | }
348 | if (c == 0x3000)
349 | {
350 | return true; // IDEOGRAPHIC SPACE
351 | }
352 |
353 | if(c >= 0x3000 && c <= 0x303f) //CJK标点符号
354 | {
355 | return false;
356 | }
357 |
358 | if (c >= 0x3040 && c <= 0x309F) //日文平假名
359 | {
360 | switch (c)
361 | {
362 | case 0x3041: // # HIRAGANA LETTER SMALL A
363 | case 0x3043: // # HIRAGANA LETTER SMALL I
364 | case 0x3045: // # HIRAGANA LETTER SMALL U
365 | case 0x3047: // # HIRAGANA LETTER SMALL E
366 | case 0x3049: // # HIRAGANA LETTER SMALL O
367 | case 0x3063: // # HIRAGANA LETTER SMALL TU
368 | case 0x3083: // # HIRAGANA LETTER SMALL YA
369 | case 0x3085: // # HIRAGANA LETTER SMALL YU
370 | case 0x3087: // # HIRAGANA LETTER SMALL YO
371 | case 0x308E: // # HIRAGANA LETTER SMALL WA
372 | case 0x3095: // # HIRAGANA LETTER SMALL KA
373 | case 0x3096: // # HIRAGANA LETTER SMALL KE
374 | case 0x309B: // # KATAKANA-HIRAGANA VOICED SOUND MARK
375 | case 0x309C: // # KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
376 | case 0x309D: // # HIRAGANA ITERATION MARK
377 | case 0x309E: // # HIRAGANA VOICED ITERATION MARK
378 | return false;
379 | break;
380 | default :
381 | return true;
382 |
383 | }
384 | }
385 |
386 | if (c >= 0x30A0 && c <= 0x30FF) //日文片假名
387 | {
388 | switch (c) {
389 | case 0x30A0: // # KATAKANA-HIRAGANA DOUBLE HYPHEN
390 | case 0x30A1: // # KATAKANA LETTER SMALL A
391 | case 0x30A3: // # KATAKANA LETTER SMALL I
392 | case 0x30A5: // # KATAKANA LETTER SMALL U
393 | case 0x30A7: // # KATAKANA LETTER SMALL E
394 | case 0x30A9: // # KATAKANA LETTER SMALL O
395 | case 0x30C3: // # KATAKANA LETTER SMALL TU
396 | case 0x30E3: // # KATAKANA LETTER SMALL YA
397 | case 0x30E5: // # KATAKANA LETTER SMALL YU
398 | case 0x30E7: // # KATAKANA LETTER SMALL YO
399 | case 0x30EE: // # KATAKANA LETTER SMALL WA
400 | case 0x30F5: // # KATAKANA LETTER SMALL KA
401 | case 0x30F6: // # KATAKANA LETTER SMALL KE
402 | case 0x30FB: // # KATAKANA MIDDLE DOT
403 | case 0x30FC: // # KATAKANA-HIRAGANA PROLONGED SOUND MARK
404 | case 0x30FD: // # KATAKANA ITERATION MARK
405 | case 0x30FE: // # KATAKANA VOICED ITERATION MARK
406 | return false;
407 | break;
408 | default :
409 | return true;
410 | }
411 | }
412 |
413 | if (c >= 0x3400 && c <= 0x4DB5) {
414 | return true; // CJK UNIFIED IDEOGRAPHS EXTENSION A
415 | }
416 | if (c >= 0x4E00 && c <= 0x9FBB) {
417 | return true; // CJK UNIFIED IDEOGRAPHS
418 | }
419 | if (c >= 0xF900 && c <= 0xFAD9) {
420 | return true; // CJK COMPATIBILITY IDEOGRAPHS
421 | }
422 | if (c >= 0xA000 && c <= 0xA48F) {
423 | return true; // YI SYLLABLES
424 | }
425 | if (c >= 0xA490 && c <= 0xA4CF) {
426 | return true; // YI RADICALS
427 | }
428 | if (c >= 0xFE62 && c <= 0xFE66) {
429 | return true; // SMALL PLUS SIGN to SMALL EQUALS SIGN
430 | }
431 | if (c >= 0xFF10 && c <= 0xFF19) {
432 | return true; // WIDE DIGITS
433 | }
434 |
435 | if(c >= 0xFF01 && c <= 0xFF0F) {
436 | return false;
437 | }
438 |
439 | if(c >= 0xFF1A && c <= 0xFF20) {
440 | return false;
441 | }
442 |
443 | if((c >= 0xFF21 && c <= 0xFF3A) || (c >= 0xFF41 && c <= 0xFF5A)) {
444 | return true; //WIDTH Letter
445 | }
446 |
447 | return false;
448 | }
449 |
450 | }
--------------------------------------------------------------------------------