├── LineBreakSample ├── LineBreakSample.opensdf ├── LineBreakSample.sdf ├── LineBreakSample.v11.suo ├── LineBreakSample │ ├── stdafx.h │ ├── stdafx.cpp │ ├── targetver.h │ ├── LineBreakSample.cpp │ ├── ReadMe.txt │ ├── LineBreakSample.vcxproj.filters │ ├── LineBreak.h │ ├── LineBreakSample.vcxproj │ └── LineBreak.cpp ├── ipch │ ├── linebreaksample-4a8ae08d │ │ └── linebreaksample-cbf7993b.ipch │ └── linebreaksample-656b7c4d │ │ └── linebreaksample-cbf7993b.ipch └── LineBreakSample.sln ├── .gitignore ├── README.md └── LICENSE /LineBreakSample/LineBreakSample.opensdf: -------------------------------------------------------------------------------- 1 | marlonluMARLONLU-PC0 -------------------------------------------------------------------------------- /LineBreakSample/LineBreakSample.sdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/LineBreakSample.sdf -------------------------------------------------------------------------------- /LineBreakSample/LineBreakSample.v11.suo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/LineBreakSample.v11.suo -------------------------------------------------------------------------------- /LineBreakSample/LineBreakSample/stdafx.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/LineBreakSample/stdafx.h -------------------------------------------------------------------------------- /LineBreakSample/LineBreakSample/stdafx.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/LineBreakSample/stdafx.cpp -------------------------------------------------------------------------------- 
/LineBreakSample/LineBreakSample/targetver.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/LineBreakSample/targetver.h -------------------------------------------------------------------------------- /LineBreakSample/LineBreakSample/LineBreakSample.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/LineBreakSample/LineBreakSample.cpp -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | 6 | # Compiled Dynamic libraries 7 | *.so 8 | *.dylib 9 | 10 | # Compiled Static libraries 11 | *.lai 12 | *.la 13 | *.a 14 | -------------------------------------------------------------------------------- /LineBreakSample/ipch/linebreaksample-4a8ae08d/linebreaksample-cbf7993b.ipch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/ipch/linebreaksample-4a8ae08d/linebreaksample-cbf7993b.ipch -------------------------------------------------------------------------------- /LineBreakSample/ipch/linebreaksample-656b7c4d/linebreaksample-cbf7993b.ipch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dm04806/LineBreak/HEAD/LineBreakSample/ipch/linebreaksample-656b7c4d/linebreaksample-cbf7993b.ipch -------------------------------------------------------------------------------- /LineBreakSample/LineBreakSample.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2012 4 | 
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LineBreakSample", "LineBreakSample\LineBreakSample.vcxproj", "{BD60E6CB-A246-4DEF-B3F7-56B381692F78}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|Win32 = Debug|Win32 9 | Release|Win32 = Release|Win32 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {BD60E6CB-A246-4DEF-B3F7-56B381692F78}.Debug|Win32.ActiveCfg = Debug|Win32 13 | {BD60E6CB-A246-4DEF-B3F7-56B381692F78}.Debug|Win32.Build.0 = Debug|Win32 14 | {BD60E6CB-A246-4DEF-B3F7-56B381692F78}.Release|Win32.ActiveCfg = Release|Win32 15 | {BD60E6CB-A246-4DEF-B3F7-56B381692F78}.Release|Win32.Build.0 = Release|Win32 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /LineBreakSample/LineBreakSample/ReadMe.txt: -------------------------------------------------------------------------------- 1 | ======================================================================== 2 | 控制台应用程序:LineBreakSample 项目概述 3 | ======================================================================== 4 | 5 | 应用程序向导已为您创建了此 LineBreakSample 应用程序。 6 | 7 | 本文件概要介绍组成 LineBreakSample 应用程序的每个文件的内容。 8 | 9 | 10 | LineBreakSample.vcxproj 11 | 这是使用应用程序向导生成的 VC++ 项目的主项目文件,其中包含生成该文件的 Visual C++ 的版本信息,以及有关使用应用程序向导选择的平台、配置和项目功能的信息。 12 | 13 | LineBreakSample.vcxproj.filters 14 | 这是使用“应用程序向导”生成的 VC++ 项目筛选器文件。它包含有关项目文件与筛选器之间的关联信息。在 IDE 中,通过这种关联,在特定节点下以分组形式显示具有相似扩展名的文件。例如,“.cpp”文件与“源文件”筛选器关联。 15 | 16 | LineBreakSample.cpp 17 | 这是主应用程序源文件。 18 | 19 | ///////////////////////////////////////////////////////////////////////////// 20 | 其他标准文件: 21 | 22 | StdAfx.h, StdAfx.cpp 23 | 这些文件用于生成名为 LineBreakSample.pch 的预编译头 (PCH) 文件和名为 StdAfx.obj 的预编译类型文件。 24 | 25 | ///////////////////////////////////////////////////////////////////////////// 26 | 其他注释: 27 | 
28 | 应用程序向导使用“TODO:”注释来指示应添加或自定义的源代码部分。 29 | 30 | ///////////////////////////////////////////////////////////////////////////// 31 | -------------------------------------------------------------------------------- /LineBreakSample/LineBreakSample/LineBreakSample.vcxproj.filters: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 头文件 23 | 24 | 25 | 头文件 26 | 27 | 28 | 头文件 29 | 30 | 31 | 32 | 33 | 源文件 34 | 35 | 36 | 源文件 37 | 38 | 39 | 源文件 40 | 41 | 42 | -------------------------------------------------------------------------------- /LineBreakSample/LineBreakSample/LineBreak.h: -------------------------------------------------------------------------------- 1 | #ifndef _LINEBRRK_H_ 2 | #define _LINEBRK_H_ 3 | 4 | /* 5 | 此换行算法核心规则来源于Unicode Line Breaking Algorithm (http://www.unicode.org/unicode/reports/tr14/), 6 | 由于英语换行有明确的规则,Unicode Line Breaking Algorithm预先定义规则,并利用表格驱动方法来判断字符是否可以换行, 7 | 8 | 具体代码绝大部分借鉴于http://www.unicode.org/Public/PROGRAMS/LineBreakSampleCpp/5.2.0/ 9 | 作者仅修复其中不合理BUG并在此基础上添加对中文以及中文全角标点符号支持 10 | 中文全角字符的判断区间函数isIdeographic参考 Android 源代码 StaticLayout.java中的函数 11 | private static final boolean isIdeographic(char c, boolean includeNonStarters) 12 | 13 | 14 | 未经过大量测试,中英文 半角/全角符号 数学表达式换行比较完美,对于同样是CJK表意文字的韩语 日文支持良好 15 | @author marlonlu 16 | @date: 2014.2.28 17 | */ 18 | 19 | #include 20 | #include 21 | 22 | namespace LINE_BREAK 23 | { 24 | typedef unsigned long u32; 25 | 26 | 27 | typedef std::wstring UString; 28 | typedef wchar_t WChar; 29 | 30 | #define TCHAR WChar 31 | #define LPTSTR WChar* 32 | 33 | #define CHAR_FIRST_CJK 0x2E80 34 | 35 | enum break_class 36 
| { 37 | // input types 38 | OP = 0, // open 39 | CL, // closing punctuation 40 | CP, // closing parentheses (from 5.2.0) (before 5.2.0 treat like CL) 41 | QU, // quotation 42 | GL, // glue 43 | NS, // no-start 44 | EX, // exclamation/interrogation 45 | SY, // Syntax (slash) 46 | IS, // infix (numeric) separator 47 | PR, // prefix 48 | PO, // postfix 49 | NU, // numeric 50 | AL, // alphabetic 51 | ID, // ideograph (atomic) 52 | IN, // inseparable 53 | HY, // hyphen 54 | BA, // break after 55 | BB, // break before 56 | B2, // break both 57 | ZW, // ZW space 58 | CM, // combining mark 59 | WJ, // word joiner 60 | 61 | // used for Korean Syllable Block pair table 62 | H2, // Hamgul 2 Jamo Syllable 63 | H3, // Hangul 3 Jamo Syllable 64 | JL, // Jamo leading consonant 65 | JV, // Jamo vowel 66 | JT, // Jamo trailing consonant 67 | 68 | // these are not handled in the pair tables 69 | SA, // South (East) Asian 70 | SP, // space 71 | PS, // paragraph and line separators 72 | BK, // hard break (newline) 73 | CR, // carriage return 74 | LF, // line feed 75 | NL, // next line 76 | CB, // contingent break opportunity 77 | SG, // surrogate 78 | AI, // ambiguous 79 | XX, // unknown 80 | }; 81 | 82 | 83 | // Break actions are the types of break opportunities that may occur at a particular 84 | // point in the input. Values for these are also needed in the UI portion of the code 85 | // so they are already defined here - for explanation see below in the line break 86 | // section. 
87 | enum break_action 88 | { 89 | DIRECT_BRK, 90 | INDIRECT_BRK, 91 | COMBINING_INDIRECT_BRK, 92 | COMBINING_PROHIBITED_BRK, 93 | PROHIBITED_BRK, 94 | EXPLICIT_BRK, 95 | HANGUL_SPACE_BRK, 96 | }; 97 | 98 | int classifyLnBrk(const LPTSTR pszText, break_class * pcls, int cch); 99 | int findLineBrk(break_class *pcls, break_action *pbrk, int cch); 100 | int findComplexBreak(break_class cls, break_class *pcls, break_action *pbrk, int cch); 101 | 102 | break_class LBClassFromCh(TCHAR ch); 103 | int CharFromVisible(int ch); 104 | bool isIdeographic(TCHAR c); 105 | 106 | } 107 | 108 | #endif // ifndef _LINEBRK_H_ 109 | -------------------------------------------------------------------------------- /LineBreakSample/LineBreakSample/LineBreakSample.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {BD60E6CB-A246-4DEF-B3F7-56B381692F78} 15 | Win32Proj 16 | LineBreakSample 17 | 18 | 19 | 20 | Application 21 | true 22 | v110 23 | Unicode 24 | 25 | 26 | Application 27 | false 28 | v110 29 | true 30 | Unicode 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | true 44 | 45 | 46 | false 47 | 48 | 49 | 50 | NotUsing 51 | Level3 52 | Disabled 53 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 54 | true 55 | 56 | 57 | Console 58 | true 59 | 60 | 61 | 62 | 63 | Level3 64 | Use 65 | MaxSpeed 66 | true 67 | true 68 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 69 | true 70 | 71 | 72 | Console 73 | true 74 | true 75 | true 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | Create 91 | Create 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | LineBreak 2 | ========= 3 | 4 | 国内游戏对于中英文混杂的字符串在自动断行上不够完善,根据网上Unicode Line Break算法重新完善,支持中英文 数字 全角半角标点符号自动换行 5 | 6 | 7 | 
#### Unicode Line Breaking Algorithm实现 8 | 9 | ##### 算法背景: 10 | 11 | 由于公司内游戏对国际化支持不够友好,其中文字断行方面不够完善,仅仅支持空格以及换行符,并未考虑中文全角标点符号,英语以及数字混合等复杂情况,本算法正是为了完美解决中文 英语以及标点符号 数学表达式的自动换行。 12 | 13 | ##### 算法综述: 14 | 15 | 此算法基于 http://www.unicode.org/reports/tr14/ 规则并参考 http://www.unicode.org/Public/PROGRAMS/LineBreakSampleCpp/5.2.0/ 实现。 16 | 17 | 由于英语换行是有明确规则,Unicode Line Breaking Algorithm上明确定义了所有英语换行规则,例如: 18 | 19 | 1.英语字母与空格一起时允许在空格后面换行 20 | 21 | 2.当英语字母处于连字符-前面时不允许换行,但允许在-后面换行,同时如果-后面是数字的话则不允许换行。 22 | 23 | 3.当左括号(后面接着英语字母或者数字时不允许换行,同时当英语字母或者数字紧挨着右括号)时不允许换行。 24 | 25 | 26 | 因此Unicode Line Breaking Algorithm先把所有字符归类,然后根据一定的规则判断字符间是否可以换行,算法主要用途在于解析出给定字符串可以换行的地方。 27 | 28 | ###### 一 字符归类 29 | 30 | Unicode Line Breaking Algorithm首先遍历需要判断断行的字符串,并将其中的每个字符归入对应的类别: 31 | 32 | 1.大小写字母为AL,数字为NU,CJK表意文字(中文 韩文 日文)则为ID,空格则为SP 33 | 34 | 2.( 左括号 左引号则为OP, ) 则为CP 35 | ……… 36 | 37 | 38 | 39 | 下表则是对ASCII编码 00—7F的字符归类处理 40 | ``` 41 | break_class LnBrkClassFromChar[] = 42 | { 43 | // treat CB as BB for demo purposes 44 | // 0 1 2 3 4 5 6 7 8 9 a b c d e f 45 | AL, ZW, GL, GL, BA, GL, AL, B2, IN, BA, LF, CB, AL, CR, NL, AL, // 00-0f 46 | AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 10-1f 47 | 48 | // ' ' ! " $ % & ' ( ) * + , - . / 49 | SP, EX, QU, IN, PR, PO, BB, QU, OP, CP, BA, PR, IN, HY, IN, SY, // 20-2f 50 | // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 51 | NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NS, AL, AL, GL, AL, EX, // 30-3f 52 | 53 | // @, A B C D E F G H I J K L M N O 54 | CB, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 40-4f 55 | AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, OP, AL, CP, AL, IS, // 50-5f ... [ \ ] ^ _ 56 | CM, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 60-6f 57 | AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, OP, AL, CL, AL, SA, // 70-7f ... 
{ | } ~ DEL 58 | // p q r s t u v w x y z 59 | }; 60 | ``` 61 | 62 | 63 | ######二 字符断行判断 64 | 65 | 当预处理字符后,则根据在http://www.unicode.org/reports/tr14/ 预先定义好的规则进行判断两个字符中间是否可以断行,例如左括号OP紧挨着字母AL时,不允许断行 66 | 67 | 核心规则表: 68 | 69 | ``` 70 | break_action brkPairs[][JT+1]= 71 | { // --- 'after' class ------ 72 | // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 73 | // OP, CL, CL, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ, H2, H3, JL, JV, JT, = after class 74 | /*OP*/ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, CC, XX, XX, XX, XX, XX, XX, // OP open 75 | /*CL*/ oo, XX, XX, SS, SS, XX, XX, XX, XX, SS, SS, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CL close 76 | /*CP*/ oo, XX, XX, SS, SS, XX, XX, XX, XX, SS, SS, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CL close 77 | /*QU*/ XX, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // QU quotation 78 | /*GL*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // GL glue 79 | /*NS*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // NS no-start 80 | /*EX*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // EX exclamation/interrogation 81 | /*SY*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // SY Syntax (slash) 82 | /*IS*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // IS infix (numeric) separator 83 | /*PR*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, SS, oo, SS, SS, oo, oo, XX, cc, XX, SS, SS, SS, SS, SS, // PR prefix 84 | /*PO*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, 
oo, // NU numeric 85 | 86 | // Version 5.2.0 and higher 87 | /*NU*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // AL alphabetic 88 | /*AL*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // AL alphabetic 89 | 90 | /*ID*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // ID ideograph (atomic) 91 | /*IN*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // IN inseparable 92 | 93 | /*HY*/ oo, XX, XX, SS, oo, SS, XX, XX, XX, oo, oo, SS, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // HY hyphens and spaces 94 | 95 | /*BA*/ oo, XX, XX, SS, oo, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // BA break after 96 | /*BB*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // BB break before 97 | /*B2*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, XX, XX, cc, XX, oo, oo, oo, oo, oo, // B2 break either side, but not pair 98 | /*ZW*/ oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, XX, oo, oo, oo, oo, oo, oo, oo, // ZW zero width space 99 | /*CM*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CM combining mark 100 | /*WJ*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // WJ word joiner 101 | 102 | /*H2*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, SS, SS, // Hangul 2 Jamo syllable 103 | /*H3*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, SS, // Hangul 3 Jamo syllable 104 | /*JL*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, 
SS, SS, SS, oo, oo, XX, cc, XX, SS, SS, SS, SS, oo, // Jamo Leading Consonant 105 | /*JV*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, SS, SS, // Jamo Vowel 106 | /*JT*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, SS, // Jamo Trailing Consonant 107 | 108 | }; 109 | ``` 110 | 111 | 其中规则表里定义的动作(action): 112 | ``` 113 | // Define some short-cuts for the table 114 | #define oo DIRECT_BRK // '_' break allowed 115 | #define SS INDIRECT_BRK // '%' only break across space (aka 'indirect break' below) 116 | #define cc COMBINING_INDIRECT_BRK // '#' indirect break for combining marks 117 | #define CC COMBINING_PROHIBITED_BRK // '@' prohibited break for combining marks 118 | #define XX PROHIBITED_BRK // '^' no break allowed 119 | #define xS HANGUL_SPACE_BRK // break allowed, except when spaces are used with Hangul (not used) 120 | ``` 121 | 122 | oo 表示可以直接断行 123 | SS 表示如果两个字符间有空格则允许断行,否则不允许断行,例如两个英语字母之间不允许断行 124 | XX 任何情况下都不允许断行 125 | 126 | ###### 三 中文CJK判断 127 | 128 | 这部分实现主要参考 Android 源代码 StaticLayout.java中的函数 129 | private static final boolean isIdeographic(char c, boolean includeNonStarters) 130 | 131 | CJK文字主要Unicode范围判断则可以参考此文档 132 | 完整的CJK Unicode范围(5.0版) 133 | 134 | 135 | ###### 四 版权说明 136 | 137 | 此换行算法核心规则来源于Unicode Line Breaking Algorithm (http://www.unicode.org/unicode/reports/tr14/), 138 | 由于英语换行有明确的规则,Unicode Line Breaking Algorithm预先定义规则,并利用表格驱动方法来判断字符是否可以换行, 139 | 140 | 具体代码绝大部分借鉴于 141 | http://www.unicode.org/Public/PROGRAMS/LineBreakSampleCpp/5.2.0/ ,作者仅修复其中不合理BUG并在此基础上添加对中文以及中文全角标点符号支持 142 | 中文全角字符的判断区间函数isIdeographic参考 Android 源代码 StaticLayout.java中的函数 143 | boolean isIdeographic(char c, boolean includeNonStarters) 144 | 145 | 未经过大量测试,中英文 半角/全角符号 数学表达式换行比较完美。 
146 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /LineBreakSample/LineBreakSample/LineBreak.cpp: -------------------------------------------------------------------------------- 1 | #include "LineBreak.h" 2 | 3 | 4 | // Line Break Character Types 5 | // These correspond to the line break class values defined in UAX#14, Version 6 | // 5.0.0. In a real implementation, there would be a mapping from character 7 | // code to line break class value. In this demo version, the mapping is from 8 | // a pseudo alphabet to these line break classes. The actual line break algorithm 9 | // takes as input only line break classes, so, by changing the mapping from 10 | // pseudo alphabet to actual Unicode Characters, this demo could be adapted 11 | // for use in actual line breaking. 
12 | 13 | namespace LINE_BREAK 14 | { 15 | 16 | 17 | break_class LnBrkClassFromChar[] = 18 | { 19 | // treat CB as BB for demo purposes 20 | // 0 1 2 3 4 5 6 7 8 9 a b c d e f 21 | AL, ZW, GL, GL, BA, GL, AL, B2, IN, BA, LF, CB, AL, CR, NL, AL, // 00-0f 22 | AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 10-1f 23 | 24 | // ' ' ! " $ % & ' ( ) * + , - . / 25 | SP, EX, QU, IN, PR, PO, BB, QU, OP, CP, BA, PR, IN, HY, IN, SY, // 20-2f 26 | // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 27 | NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NS, AL, AL, GL, AL, EX, // 30-3f 28 | 29 | // @, A B C D E F G H I J K L M N O 30 | CB, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 40-4f 31 | AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, OP, AL, CP, AL, IS, // 50-5f ... [ \ ] ^ _ 32 | CM, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 60-6f 33 | AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, OP, AL, CL, AL, SA, // 70-7f ... { | } ~ DEL 34 | // p q r s t u v w x y z 35 | }; 36 | 37 | 38 | 39 | /*--------------------------------------------------------------------------- 40 | Function: classify 41 | 42 | Determines the character classes for all following 43 | passes of the algorithm 44 | 45 | This uses a pseudo alphabet as input - see the szExplain string 46 | above for a description. In a production version, this function 47 | would implement the line break property lookup for actual Unicode 48 | characters. 49 | 50 | Input: Text string 51 | Character count 52 | 53 | Output: Array of linebreak classes 54 | 55 | ----------------------------------------------------------------------------*/ 56 | int classifyLnBrk(const LPTSTR pszText, break_class * pcls, int cch) 57 | { 58 | int ich; 59 | for (ich = 0; ich < cch; ich++) 60 | { 61 | if(pszText[ich] > CHAR_FIRST_CJK) 62 | { 63 | pcls[ich] = isIdeographic(pszText[ich]) ? 
ID : NS; 64 | 65 | //对于小括号 中括号 大括号进行特殊处理 66 | TCHAR c = pszText[ich]; 67 | if(c == 0xFF08 || c == 0xFF3B || c == 0xFF5B) //小 中 大括号 68 | pcls[ich] = OP; 69 | else if(c == 0xFF09 || c == 0xFF3D || c == 0xFF5D) 70 | pcls[ich] = CP; 71 | 72 | continue; 73 | } 74 | 75 | TCHAR c = pszText[ich]; 76 | //增加中文单引号 双引号支持 77 | if(c == 8216 || c == 8220) 78 | { 79 | pcls[ich] = OP; 80 | continue; 81 | } 82 | 83 | if(c == 8217 || c == 8221) 84 | { 85 | pcls[ich] = CP; 86 | continue; 87 | } 88 | 89 | pcls[ich] = LBClassFromCh(pszText[ich]); 90 | 91 | // map unknown, and ambiguous to AL by default 92 | if (pcls[ich] == XX || pcls[ich] == AI) 93 | pcls[ich] = AL; 94 | 95 | // map contingent break to B2 by default 96 | // this saves a row/col for CB in the table 97 | // but only approximates rule 20 98 | if (pcls[ich] == CB) 99 | pcls[ich] = B2; 100 | 101 | /* If the following remapping is enabled, all tests involving 102 | NL can be removed from the main loop below. 103 | 104 | // map NL to BK as there's no difference 105 | if (pcls[ich] == NL) 106 | pcls[ich] = BK; 107 | */ 108 | } 109 | return ich; 110 | } 111 | 112 | // mapping of special character codes to Unicode symbols for visualization 113 | int chVisibleFromSpecial[] = 114 | { 115 | /* ZW 1 chZWSP */ 0x2020, // show as dagger 116 | /* GL 2 chZWNBSP */ 0x2021, // show as double dagger 117 | /* GL 3 chNBHY */ 0x00AC, // show as not sign 118 | /* BA 4 chSHY */ 0x00B7, // show as dot 119 | /* GL 5 chNBSP */ 0x2017, // show as low line 120 | /* -- 6 chDummy1 */ 0x203E, // show as double low line 121 | /* B2 7 chEM */ 0x2014, // show as em dash 122 | /* IN 8 chELLIPSIS */ 0x2026, // show as ellipsis 123 | /* CM 9 chTB */ 0x2310, // show as not sign 124 | /* LF 10 chLFx */ 0x2580, // show as high square 125 | /* CB 11 chOBJ */ 0x2302, // show as house (delete) 126 | /* -- 12 chdummy2 */ 0x2222, 127 | /* CR 13 chCRx */ 0x2584, // show as low square 128 | /* NL 14 chNLx */ 0x258C, // show as left half block 129 | }; 130 | 131 | // 
map visible symbol to character 132 | int CharFromVisible(int ch) 133 | { 134 | for (int ich = 0; ich < sizeof chVisibleFromSpecial / sizeof (int); ich ++) 135 | { 136 | if (ch == chVisibleFromSpecial[ich]) 137 | { 138 | return ich + 1; 139 | } 140 | } 141 | return ch; 142 | } 143 | 144 | break_class LBClassFromCh(TCHAR ch) 145 | { 146 | ch = LINE_BREAK::CharFromVisible((int)ch); 147 | if (ch >= 0x7f) 148 | return XX; 149 | return LnBrkClassFromChar[ch]; 150 | } 151 | 152 | 153 | //2 // === LINE BREAK DEFINITIONS =================================================== 154 | 155 | // Define some short-cuts for the table 156 | #define oo DIRECT_BRK // '_' break allowed 157 | #define SS INDIRECT_BRK // '%' only break across space (aka 'indirect break' below) 158 | #define cc COMBINING_INDIRECT_BRK // '#' indirect break for combining marks 159 | #define CC COMBINING_PROHIBITED_BRK // '@' indirect break for combining marks 160 | #define XX PROHIBITED_BRK // '^' no break allowed_BRK 161 | #define xS HANGUL_SPACE_BRK // break allowed, except when spaces are used with Hangul (not used) 162 | 163 | // xS not yet assigned in the table below 164 | 165 | //2 // === LINE BREAK PAIR TABLE =================================================== 166 | 167 | // Line Break Pair Table corresponding to Table 2 of UAX#14, Version 5.0.0 168 | // plus Korean Syllable Block extensions - for details see that document 169 | 170 | // Additional rows added or replaced for versions 5.0.1, 5.1.0 or 5.2.0 as needed by conditional compilation 171 | // Additional column added for version 5.2.0 (CP). In earlier versions this acts identical to col for CL. 
172 | 173 | break_action brkPairs[][JT+1]= 174 | { // --- 'after' class ------ 175 | // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 176 | // OP, CL, CL, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ, H2, H3, JL, JV, JT, = after class 177 | /*OP*/ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, CC, XX, XX, XX, XX, XX, XX, // OP open 178 | /*CL*/ oo, XX, XX, SS, SS, XX, XX, XX, XX, SS, SS, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CL close 179 | /*CP*/ oo, XX, XX, SS, SS, XX, XX, XX, XX, SS, SS, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CL close 180 | /*QU*/ XX, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // QU quotation 181 | /*GL*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // GL glue 182 | /*NS*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // NS no-start 183 | /*EX*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // EX exclamation/interrogation 184 | /*SY*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // SY Syntax (slash) 185 | /*IS*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // IS infix (numeric) separator 186 | /*PR*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, SS, oo, SS, SS, oo, oo, XX, cc, XX, SS, SS, SS, SS, SS, // PR prefix 187 | /*PO*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // NU numeric 188 | 189 | // Version 5.2.0 and higher 190 | /*NU*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // AL alphabetic 191 | 
/*AL*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // AL alphabetic 192 | 193 | /*ID*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // ID ideograph (atomic) 194 | /*IN*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // IN inseparable 195 | 196 | /*HY*/ oo, XX, XX, SS, oo, SS, XX, XX, XX, oo, oo, SS, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // HY hyphens and spaces 197 | 198 | /*BA*/ oo, XX, XX, SS, oo, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // BA break after 199 | /*BB*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // BB break before 200 | /*B2*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, XX, XX, cc, XX, oo, oo, oo, oo, oo, // B2 break either side, but not pair 201 | /*ZW*/ oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, XX, oo, oo, oo, oo, oo, oo, oo, // ZW zero width space 202 | /*CM*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CM combining mark 203 | /*WJ*/ SS, XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // WJ word joiner 204 | 205 | /*H2*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, SS, SS, // Hangul 2 Jamo syllable 206 | /*H3*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, SS, // Hangul 3 Jamo syllable 207 | /*JL*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, SS, SS, SS, SS, oo, // Jamo Leading Consonant 208 | /*JV*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, 
oo, SS, SS, // Jamo Vowel 209 | /*JT*/ oo, XX, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, SS, // Jamo Trailing Consonant 210 | 211 | }; 212 | 213 | // handle spaces separately, all others by table 214 | // pcls - pointer to array of line breaking classes (input) 215 | // pbrk - pointer to array of line break opportunities (output) 216 | // cch - number of elements in the arrays (揷ount of characters? (input) 217 | // ich - current index into the arrays (variable) (returned value) 218 | // cls - current resolved line break class for 'before' character (variable) 219 | int findLineBrk(break_class *pcls, break_action *pbrk, int cch) 220 | { 221 | if (cch <= 0) 222 | return 0; 223 | 224 | break_class cls = pcls[0]; 225 | 226 | // handle case where input starts with an LF 227 | if (cls == LF) 228 | cls = BK; 229 | 230 | // treat NL like BK 231 | if (cls == NL) 232 | cls = BK; 233 | 234 | // treat SP at start of input as if it followed WJ 235 | if (cls == SP) 236 | cls = WJ; 237 | 238 | // loop over all pairs in the string up to a hard break or CRLF pair 239 | int ich; 240 | for (ich = 1; (ich < cch) && (cls != BK) && (cls != CR || pcls[ich] == LF); ich++) { 241 | 242 | // handle spaces explicitly 243 | if (pcls[ich] == SP) { 244 | pbrk[ich-1] = PROHIBITED_BRK; // apply rule LB 7: ?SP 245 | continue; // do not update cls 246 | } 247 | 248 | if (pcls[ich] == BK || pcls[ich] == NL || pcls[ich] == LF) { 249 | pbrk[ich-1] = PROHIBITED_BRK; 250 | cls = BK; 251 | continue; 252 | } 253 | 254 | if (pcls[ich] == CR) 255 | { 256 | pbrk[ich-1] = PROHIBITED_BRK; 257 | cls = CR; 258 | continue; 259 | } 260 | 261 | // handle complex scripts in a separate function 262 | if (cls == SA || pcls[ich] == SA) { 263 | ich += findComplexBreak(cls, &pcls[ich-1], &pbrk[ich-1], cch - (ich-1)); 264 | if (ich < cch) 265 | cls = pcls[ich]; 266 | continue; 267 | } 268 | 269 | if(!(cls < SP) || !(pcls[ich] < SP)) 270 | { 271 | continue; 272 | } 273 | 
274 | // lookup pair table information in brkPairs[before, after]; 275 | enum break_action brk = brkPairs[cls][pcls[ich]]; 276 | 277 | pbrk[ich-1] = brk; // save break action in output array 278 | 279 | if (brk == INDIRECT_BRK) { // resolve indirect break 280 | if (pcls[ich - 1] == SP) // if context is A SP * B 281 | pbrk[ich - 1] = DIRECT_BRK; // break opportunity 282 | else // else 283 | pbrk[ich-1] = PROHIBITED_BRK; // no break opportunity 284 | } else if (brk == COMBINING_PROHIBITED_BRK) { // this is the case OP SP* CM 285 | pbrk[ich-1] = COMBINING_PROHIBITED_BRK; // no break allowed 286 | if (pcls[ich-1] != SP) 287 | continue; // apply rule 9: X CM* -> X 288 | } else if (brk == COMBINING_INDIRECT_BRK) { // resolve combining mark break 289 | pbrk[ich-1] = PROHIBITED_BRK; // don't break before CM 290 | if (pcls[ich-1] == SP){ 291 | if (false) // new: SP is not a base 292 | pbrk[ich-1] = COMBINING_INDIRECT_BRK; // apply rule SP ? 293 | else 294 | { 295 | pbrk[ich-1] = PROHIBITED_BRK; // legacy: keep SP CM together 296 | if (ich > 1) 297 | pbrk[ich-2] = ((pcls[ich - 2] == SP) ? INDIRECT_BRK : DIRECT_BRK); 298 | } 299 | } else // apply rule 9: X CM * -> X 300 | continue; // don't update cls 301 | } 302 | cls = pcls[ich]; // save cls of current character 303 | } 304 | // always break at the end 305 | pbrk[ich-1] = EXPLICIT_BRK; 306 | 307 | return ich; 308 | } 309 | 310 | // placeholder function for complex break analysis 311 | // cls - last resolved line break class (this is !SA) 312 | // pcls - pointer to array of line breaking classes with pcls[0] == SA (input) 313 | // pbrk - pointer to array of line breaking opportunities (output) 314 | // 315 | int findComplexBreak(break_class cls, break_class *pcls, break_action *pbrk, int cch) 316 | { 317 | if (!cch) 318 | return 0; 319 | 320 | int ich; 321 | for (ich = 1; ich < cch; ich++) { 322 | 323 | // .. do complex break analysis here 324 | // and report any break opportunities in pbrk .. 
325 | 326 | pbrk[ich-1] = PROHIBITED_BRK; // by default: no break 327 | 328 | if (pcls[ich] != SA) 329 | break; 330 | } 331 | return ich; 332 | } 333 | 334 | 335 | /** 336 | * Returns true if the specified character is one of those specified 337 | * as being Ideographic (class ID) by the Unicode Line Breaking Algorithm 338 | * (http://www.unicode.org/unicode/reports/tr14/), and is therefore OK 339 | * to break between a pair of. 340 | * 341 | * @see Android Source Code StaticLayout.Java (isIdeographic) 342 | */ 343 | bool isIdeographic(WChar c) { 344 | if (c >= 0x2E80 && c <= 0x2FFF) 345 | { 346 | return true; // CJK, KANGXI RADICALS, DESCRIPTION SYMBOLS 347 | } 348 | if (c == 0x3000) 349 | { 350 | return true; // IDEOGRAPHIC SPACE 351 | } 352 | 353 | if(c >= 0x3000 && c <= 0x303f) //CJK标点符号 354 | { 355 | return false; 356 | } 357 | 358 | if (c >= 0x3040 && c <= 0x309F) //日文平假名 359 | { 360 | switch (c) 361 | { 362 | case 0x3041: // # HIRAGANA LETTER SMALL A 363 | case 0x3043: // # HIRAGANA LETTER SMALL I 364 | case 0x3045: // # HIRAGANA LETTER SMALL U 365 | case 0x3047: // # HIRAGANA LETTER SMALL E 366 | case 0x3049: // # HIRAGANA LETTER SMALL O 367 | case 0x3063: // # HIRAGANA LETTER SMALL TU 368 | case 0x3083: // # HIRAGANA LETTER SMALL YA 369 | case 0x3085: // # HIRAGANA LETTER SMALL YU 370 | case 0x3087: // # HIRAGANA LETTER SMALL YO 371 | case 0x308E: // # HIRAGANA LETTER SMALL WA 372 | case 0x3095: // # HIRAGANA LETTER SMALL KA 373 | case 0x3096: // # HIRAGANA LETTER SMALL KE 374 | case 0x309B: // # KATAKANA-HIRAGANA VOICED SOUND MARK 375 | case 0x309C: // # KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 376 | case 0x309D: // # HIRAGANA ITERATION MARK 377 | case 0x309E: // # HIRAGANA VOICED ITERATION MARK 378 | return false; 379 | break; 380 | default : 381 | return true; 382 | 383 | } 384 | } 385 | 386 | if (c >= 0x30A0 && c <= 0x30FF) //日文片假名 387 | { 388 | switch (c) { 389 | case 0x30A0: // # KATAKANA-HIRAGANA DOUBLE HYPHEN 390 | case 0x30A1: // # KATAKANA LETTER 
SMALL A 391 | case 0x30A3: // # KATAKANA LETTER SMALL I 392 | case 0x30A5: // # KATAKANA LETTER SMALL U 393 | case 0x30A7: // # KATAKANA LETTER SMALL E 394 | case 0x30A9: // # KATAKANA LETTER SMALL O 395 | case 0x30C3: // # KATAKANA LETTER SMALL TU 396 | case 0x30E3: // # KATAKANA LETTER SMALL YA 397 | case 0x30E5: // # KATAKANA LETTER SMALL YU 398 | case 0x30E7: // # KATAKANA LETTER SMALL YO 399 | case 0x30EE: // # KATAKANA LETTER SMALL WA 400 | case 0x30F5: // # KATAKANA LETTER SMALL KA 401 | case 0x30F6: // # KATAKANA LETTER SMALL KE 402 | case 0x30FB: // # KATAKANA MIDDLE DOT 403 | case 0x30FC: // # KATAKANA-HIRAGANA PROLONGED SOUND MARK 404 | case 0x30FD: // # KATAKANA ITERATION MARK 405 | case 0x30FE: // # KATAKANA VOICED ITERATION MARK 406 | return false; 407 | break; 408 | default : 409 | return true; 410 | } 411 | } 412 | 413 | if (c >= 0x3400 && c <= 0x4DB5) { 414 | return true; // CJK UNIFIED IDEOGRAPHS EXTENSION A 415 | } 416 | if (c >= 0x4E00 && c <= 0x9FBB) { 417 | return true; // CJK UNIFIED IDEOGRAPHS 418 | } 419 | if (c >= 0xF900 && c <= 0xFAD9) { 420 | return true; // CJK COMPATIBILITY IDEOGRAPHS 421 | } 422 | if (c >= 0xA000 && c <= 0xA48F) { 423 | return true; // YI SYLLABLES 424 | } 425 | if (c >= 0xA490 && c <= 0xA4CF) { 426 | return true; // YI RADICALS 427 | } 428 | if (c >= 0xFE62 && c <= 0xFE66) { 429 | return true; // SMALL PLUS SIGN to SMALL EQUALS SIGN 430 | } 431 | if (c >= 0xFF10 && c <= 0xFF19) { 432 | return true; // WIDE DIGITS 433 | } 434 | 435 | if(c >= 0xFF01 && c <= 0xFF0F) { 436 | return false; 437 | } 438 | 439 | if(c >= 0xFF1A && c <= 0xFF20) { 440 | return false; 441 | } 442 | 443 | if((c >= 0xFF21 && c <= 0xFF3A) || (c >= 0xFF41 && c <= 0xFF5A)) { 444 | return true; //WIDTH Letter 445 | } 446 | 447 | return false; 448 | } 449 | 450 | } --------------------------------------------------------------------------------