├── .gitignore
├── CMakeLists.txt
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── extlib
├── UTF8-CPP
│ └── include
│ │ └── utf8
│ │ ├── checked.h
│ │ ├── core.h
│ │ ├── unchecked.h
│ │ └── utf8.h
└── u5e
│ ├── LICENSE
│ ├── include
│ └── u5e
│ │ ├── basic_encodedstring.hpp
│ │ ├── basic_grapheme.hpp
│ │ ├── basic_grapheme_iterator.hpp
│ │ ├── canonical_combining_order.hpp
│ │ ├── canonical_composition.hpp
│ │ ├── canonical_decomposition.hpp
│ │ ├── codepoint.hpp
│ │ ├── codepoint_decomposition.hpp
│ │ ├── codepoint_traits.hpp
│ │ ├── compatibility_and_canonical_decomposition.hpp
│ │ ├── encoding_assertion.hpp
│ │ ├── filter.hpp
│ │ ├── iterator_assertion.hpp
│ │ ├── normalization_form_c.hpp
│ │ ├── normalization_form_d.hpp
│ │ ├── normalization_form_kc.hpp
│ │ ├── normalization_form_kd.hpp
│ │ ├── props
│ │ ├── canonical_combining_class.hpp
│ │ ├── canonical_composition_mapping.hpp
│ │ ├── canonical_decomposition_mapping.hpp
│ │ ├── compatibility_and_canonical_decomposition_mapping.hpp
│ │ └── grapheme_cluster_break.hpp
│ │ ├── utf32ne.hpp
│ │ ├── utf32ne_string.hpp
│ │ ├── utf32ne_string_grapheme.hpp
│ │ ├── utf32ne_string_grapheme_iterator.hpp
│ │ ├── utf32ne_string_view.hpp
│ │ ├── utf32ne_string_view_grapheme.hpp
│ │ ├── utf32ne_string_view_grapheme_iterator.hpp
│ │ ├── utf8.hpp
│ │ ├── utf8_bounds.hpp
│ │ ├── utf8_iterator.hpp
│ │ ├── utf8_string.hpp
│ │ ├── utf8_string_grapheme.hpp
│ │ ├── utf8_string_grapheme_iterator.hpp
│ │ ├── utf8_string_view.hpp
│ │ ├── utf8_string_view_grapheme.hpp
│ │ ├── utf8_string_view_grapheme_iterator.hpp
│ │ ├── utf8_util.hpp
│ │ └── version.hpp
│ └── src
│ └── u5e
│ ├── props
│ ├── CompositionExclusions.txt
│ ├── GraphemeBreakProperty.txt
│ ├── UnicodeData.txt
│ ├── canonical_combining_class.cpp
│ ├── canonical_combining_class_data.hpp
│ ├── canonical_combining_class_data.pl
│ ├── canonical_composition_mapping.cpp
│ ├── canonical_composition_mapping_data.hpp
│ ├── canonical_composition_mapping_data.pl
│ ├── canonical_decomposition_mapping.cpp
│ ├── canonical_decomposition_mapping_data.hpp
│ ├── canonical_decomposition_mapping_data.pl
│ ├── compatibility_and_canonical_decomposition_mapping.cpp
│ ├── compatibility_and_canonical_decomposition_mapping_data.hpp
│ ├── compatibility_and_canonical_decomposition_mapping_data.pl
│ ├── grapheme_cluster_break.cpp
│ ├── grapheme_cluster_break_data.hpp
│ └── grapheme_cluster_break_data.sh
│ └── version.cpp
├── include
└── Aheuiplusplus
│ ├── Aheuiplusplus.hpp
│ ├── code.hpp
│ ├── command_line.hpp
│ ├── cursor.hpp
│ ├── debugger.hpp
│ ├── element.hpp
│ ├── extension.hpp
│ ├── function.hpp
│ ├── interpreter.hpp
│ ├── mode.hpp
│ ├── namespace.hpp
│ ├── storage.hpp
│ └── version.hpp
└── src
├── command_line.cpp
├── cursor.cpp
├── debugger.cpp
├── element.cpp
├── extension.cpp
├── function.cpp
├── interpreter.cpp
├── main.cpp
├── mode.cpp
├── namespace.cpp
├── storage.cpp
└── version_.cpp
/.gitignore:
--------------------------------------------------------------------------------
1 | bin/
2 | test*.*
3 |
4 | # CMake
5 | CMakeFiles/
6 | CMakeScripts/
7 | CMakeCache.txt
8 | cmake_install.cmake
9 | install_manifest.txt
10 | compile_commands.json
11 | CTestTestfile.cmake
12 |
13 | # Make
14 | Makefile
15 |
16 | # Visual Studio
17 | .vs/
18 | *.sln
19 | *.vcxproj
20 | *.filters
21 | *.user
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Build script for the Aheui++ standard interpreter.
2 | #
3 | # Cache options:
4 | #   COMPILE_TARGET  - "Executable" (or "E"/"Exe") builds the interpreter binary;
5 | #                     "Library" (or "L"/"Lib") builds a static library and also
6 | #                     installs the public headers. Matching is case-insensitive.
7 | #   USE_EXTENSION   - ON defines AHEUIPLUSPLUS_USE_EXTENSION.
8 | #   PRINT_BENCHMARK - ON defines AHEUIPLUSPLUS_PRINT_BENCHMARK.
9 | cmake_minimum_required(VERSION 3.8.0)
10 | project(Aheuiplusplus CXX)
11 |
12 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
13 | set(CMAKE_CXX_STANDARD 17)
14 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
15 | set(CMAKE_CXX_EXTENSIONS OFF)
16 |
17 | # Paths are anchored explicitly: inputs live in the source tree, build output
18 | # goes to "bin" under the build tree (the same place the relative "./bin" resolved to).
19 | set(INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include")
20 | set(SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
21 | set(OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/bin")
22 | set(EXTLIB_DIR "${CMAKE_CURRENT_SOURCE_DIR}/extlib")
23 |
24 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${OUTPUT_DIR}")
25 |
26 | # Cache
27 | ## COMPILE_TARGET
28 | set(COMPILE_TARGET "Executable" CACHE STRING "아희++을 어떤 형태로 컴파일 할지 설정합니다.")
29 | set_property(CACHE COMPILE_TARGET PROPERTY STRINGS "Executable" "Library")
30 |
31 | # Normalise the user-supplied value into a normal variable that shadows the cache
32 | # entry. The expansion is quoted so that an empty value reaches the validation
33 | # below instead of making string() fail with a confusing argument-count error.
34 | string(TOLOWER "${COMPILE_TARGET}" COMPILE_TARGET)
35 |
36 | if("${COMPILE_TARGET}" STREQUAL "e" OR "${COMPILE_TARGET}" STREQUAL "exe")
37 |   set(COMPILE_TARGET "executable")
38 | elseif("${COMPILE_TARGET}" STREQUAL "l" OR "${COMPILE_TARGET}" STREQUAL "lib")
39 |   set(COMPILE_TARGET "library")
40 | endif()
41 |
42 | # Reject anything else up front; previously an unknown value silently produced
43 | # a build system that contained no target at all.
44 | if(NOT "${COMPILE_TARGET}" STREQUAL "executable" AND NOT "${COMPILE_TARGET}" STREQUAL "library")
45 |   message(FATAL_ERROR "COMPILE_TARGET must be Executable (E, Exe) or Library (L, Lib), got \"${COMPILE_TARGET}\".")
46 | endif()
47 |
48 | ## USE_EXTENSION
49 | set(USE_EXTENSION ON CACHE BOOL "아희++ 표준 인터프리터 확장을 사용할지 설정합니다.")
50 |
51 | ## PRINT_BENCHMARK
52 | set(PRINT_BENCHMARK OFF CACHE BOOL "아희++ 표준 인터프리터의 성능을 출력할지 설정합니다.")
53 |
54 | # Searching files
55 | ## Source files
56 | # NOTE: file(GLOB) is evaluated at configure time only; re-run CMake after
57 | # adding or removing a source file.
58 | file(GLOB SOURCE_LIST "${SOURCE_DIR}/*.cpp")
59 | file(GLOB EXTLIB_U5E_SOURCE_LIST "${EXTLIB_DIR}/u5e/src/u5e/*.cpp" "${EXTLIB_DIR}/u5e/src/u5e/props/*.cpp")
60 |
61 | # Compilation
62 | # AHEUIPLUSPLUS_TARGET is 1 for the executable build and 2 for the library build.
63 | if("${COMPILE_TARGET}" STREQUAL "executable")
64 |   set(AHEUIPLUSPLUS_TARGET_VALUE 1)
65 |   add_executable(${PROJECT_NAME} ${SOURCE_LIST} ${EXTLIB_U5E_SOURCE_LIST})
66 | else()
67 |   set(AHEUIPLUSPLUS_TARGET_VALUE 2)
68 |   add_library(${PROJECT_NAME} STATIC ${SOURCE_LIST} ${EXTLIB_U5E_SOURCE_LIST})
69 | endif()
70 |
71 | ## Header files
72 | target_include_directories(${PROJECT_NAME} PRIVATE
73 |   "${INCLUDE_DIR}"
74 |   "${EXTLIB_DIR}/u5e/include"
75 | )
76 |
77 | ## Preprocessor definitions
78 | target_compile_definitions(${PROJECT_NAME} PRIVATE
79 |   __STDC_CONSTANT_MACROS
80 |   __STDC_LIMIT_MACROS
81 |   AHEUIPLUSPLUS_TARGET=${AHEUIPLUSPLUS_TARGET_VALUE}
82 | )
83 |
84 | if(USE_EXTENSION)
85 |   target_compile_definitions(${PROJECT_NAME} PRIVATE AHEUIPLUSPLUS_USE_EXTENSION)
86 | endif()
87 |
88 | if(PRINT_BENCHMARK)
89 |   target_compile_definitions(${PROJECT_NAME} PRIVATE AHEUIPLUSPLUS_PRINT_BENCHMARK)
90 | endif()
91 |
92 | # Installation
93 | if("${COMPILE_TARGET}" STREQUAL "executable")
94 |   install(TARGETS ${PROJECT_NAME} DESTINATION bin)
95 | else()
96 |   install(TARGETS ${PROJECT_NAME} DESTINATION lib)
97 |
98 |   ## Header files (only the library build ships headers)
99 |   file(GLOB HEADER_LIST "${INCLUDE_DIR}/Aheuiplusplus/*.hpp")
100 |   file(GLOB EXTLIB_U5E_HEADER_LIST "${EXTLIB_DIR}/u5e/include/u5e/*.hpp")
101 |   file(GLOB EXTLIB_U5E_PROPS_HEADER_LIST "${EXTLIB_DIR}/u5e/include/u5e/props/*.hpp")
102 |   file(GLOB EXTLIB_UTF8_CPP_HEADER_LIST "${EXTLIB_DIR}/UTF8-CPP/include/utf8/*.h")
103 |
104 |   install(FILES ${HEADER_LIST} DESTINATION include/Aheuiplusplus)
105 |   install(FILES ${EXTLIB_U5E_HEADER_LIST} DESTINATION include/u5e)
106 |   install(FILES ${EXTLIB_U5E_PROPS_HEADER_LIST} DESTINATION include/u5e/props)
107 |   install(FILES ${EXTLIB_UTF8_CPP_HEADER_LIST} DESTINATION include/utf8)
108 | endif()
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # 아희++ 표준 인터프리터에 대한 기여에 관한 작은 규칙
2 | 최신 버전이 아닐 수 있습니다. 최신 버전은 [이곳](https://github.com/kmc7468/Aheuiplusplus/blob/master/CONTRIBUTING.md)에서 확인할 수 있습니다.
3 | ## 기본적인 예의
4 | - 커밋 제목 및 메세지, 커밋에 포함된 기여 내용 등 모든 부분에서 기본적인 예의를 준수하여 주십시오.
5 | - 예를 들어, 비존대어를 사용하거나, 비속어를 사용하지 마십시오.
6 | - 커밋 제목 및 메세지, 커밋에 포함된 기여 내용 등 모든 부분에서 인권 및 기여자의 소속 국가의 법률을 준수하여 주십시오.
7 | - 예를 들어, 라이선스 문제가 있는 소스 코드를 사용하거나, 커밋 메세지에 차별적 표현을 사용하지 마십시오.
8 | ## 파일
9 | - 모든 텍스트 파일은 **BOM이 있는 UTF-8**로 인코딩 해 주십시오.
10 | - 모든 헤더 파일 및 소스 파일의 줄바꿈 형식은 **CRLF**(`"\r\n"`)로 해주십시오.
11 | ## 커밋
12 | - 커밋 제목 및 메세지는 반드시 **한국어의 표준어**로 작성해 주십시오.
13 | - 커밋 제목은 동사로 끝나는 명사형(예: ~ 수정, ~ 업데이트, ~ 개선 등)을 사용해 주십시오.
14 | - 반드시 master 브랜치에만 커밋하여 주십시오.
15 | ## PR과 이슈
16 | - 반드시 master 브랜치에 대하여 열어 주십시오.
17 | - PR은 반드시 수락되는 것은 아님에 유의하십시오.
18 | ## 프로그래밍
19 | - 소스 코드는 반드시 크로스 플랫폼이 가능하도록 프로그래밍 하여 주십시오.
20 | ## 브랜치
21 | - **master 브랜치**
22 | 주 브랜치로, 모든 커밋은 반드시 이 브랜치에만 해야 합니다.
23 | - **stable 브랜치**
24 | master 브랜치에서 버전의 개발이 완료되어 릴리즈를 할 준비가 완료되었으며, 정식 출시가 가능한 안정화 된 버전일 경우 master 브랜치에서 stable 브랜치로 커밋을 병합합니다.
25 | - **pre-release 브랜치**
26 | master 브랜치에서 버전의 개발이 완료되어 릴리즈를 할 준비가 완료되었으나, 정식 출시 전 프리릴리즈일 경우 master 브랜치에서 pre-release 브랜치로 커밋을 병합합니다.
27 | ### 브랜칭 전략
28 | - 신규 기능을 구현할 때, 작업이 오래 걸릴 것으로 보이는 기능을 구현한다면 `"feature/(기능 이름)"`의 이름을 가진 브랜치를 master 브랜치에서 분기시킬 수 있습니다. 작업이 끝난 후에는 다시 master 브랜치로 병합시켜야 합니다. 병합을 한 후에는 브랜치를 삭제합니다.
29 | - 기능 이름은 알파벳 소문자, 언더바(`'_'`)로만 이루어진 명령문 형태의 영어 문장으로 되어 있어야 합니다. 문장은 최대한 간결하게 만듭니다. 예를 들어, `encoding` 클래스를 추가하는 작업을 할 예정이라면, 브랜치 이름을 `"feature/add_encoding_class"`로 지으면 됩니다.
30 | - 해당 브랜치에는 해당 기능 구현과 관련 없는 작업은 하지 마십시오.
31 | - 해당 브랜치에는 예외적으로 이슈와 PR을 넣을 수 있습니다.
32 | - 버그를 수정할 때, 수정 작업이 오래 걸릴 것으로 보인다면 `"bugfix/(버그 이름)"`의 이름을 가진 브랜치를 master 브랜치에서 분기시킬 수 있습니다. 작업이 끝난 후에는 다시 master 브랜치로 병합시켜야 합니다. 병합을 한 후에는 브랜치를 삭제합니다.
33 | - 버그 이름은 알파벳 소문자, 언더바(`'_'`)로만 이루어진 명령문 형태의 영어 문장으로 되어 있어야 합니다. 문장은 최대한 간결하게 만듭니다. 예를 들어, 리눅스에서 문자 입력이 되지 않는 버그를 수정할 예정이라면, 브랜치 이름을 `"bugfix/cannot_read_character"`로 지으면 됩니다.
34 | - 해당 브랜치에는 예외적으로 이슈와 PR을 넣을 수 있습니다.
35 | - 이미 출시된 릴리즈에 심각한 버그가 있을 경우 master 브랜치에서 stable 또는 pre-release 브랜치로 병합하는 커밋(안정된 릴리즈일 경우 stable 브랜치로, 프리릴리즈일 경우 pre-release 브랜치로 병합하는 커밋에서 분기합니다.)에서 `"bugfix/(버그 이름)"`의 이름을 가진 브랜치를 master 브랜치에서 분기한 후, 버그를 수정한 후 master 브랜치 및 stable 또는 pre-release 브랜치(안정된 릴리즈일 경우 stable 브랜치로, 프리릴리즈일 경우 pre-release 브랜치로 병합합니다.)로 병합합니다. 이때, master 브랜치에 먼저 병합을 한 후에 stable 또는 pre-release 브랜치에 병합해야 합니다. 병합을 한 후에는 브랜치를 삭제합니다.
36 | - 단, 프리릴리즈를 포함하여 가장 최신 릴리즈에서 심각한 버그가 발견되었을 경우, master 브랜치에 부 버전을 올려야 하는 기능 구현을 하지 않았다면 브랜치를 분기하지 않고 master 브랜치에서 작업합니다(단, 이 경우에도 버그의 수정이 오래 걸릴 것으로 보이면 브랜치를 분기할 수 있습니다.).
37 | - 해당 브랜치에는 예외적으로 이슈와 PR을 넣을 수 있습니다.
38 | ## 릴리즈 절차
39 | 1. master 브랜치에서 특정 버전에 대한 개발을 완료합니다.
40 | 2. 만약 정식 출시가 가능한 안정화 된 버전일 경우 stable 브랜치로 커밋을 병합합니다. 만약 정식 출시 전 프리릴리즈일 경우 pre-release 브랜치로 커밋을 병합합니다.
41 | 3. 만약 stable 브랜치로 병합했다면, stable 브랜치를 pre-release 브랜치에 병합합니다. master에서 pre-release로 병합하면 안 됩니다.
42 | 4. Release를 작성합니다. 반드시 기존에 작성된 Release 게시글의 형식을 따라 주십시오.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 kmc7468
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | **개발이 중단되었습니다! 최초로 통로를 제대로 구현한 아희 인터프리터 [톡희](https://github.com/kmc7468/talkheui)는 어떠신가요?**
2 |
3 | [](https://shields.io/) [](https://shields.io/)
4 | # 아희++
5 | 아희와 호환성이 있는 난해한 객체지향 한글 프로그래밍 언어
6 | - 인터프리터 버전: 2.0.0 (불안정한 버전)
7 | - 다른 버전: [1.2.1](https://github.com/kmc7468/Aheuiplusplus/tree/version/1.2.1)
8 | - 아직 개발중인 버전입니다.
9 |
10 | **개발이 중단되었습니다! 최초로 통로를 제대로 구현한 아희 인터프리터 [톡희](https://github.com/kmc7468/talkheui)는 어떠신가요?**
11 | ## [레퍼런스](https://github.com/kmc7468/Aheuiplusplus/wiki)
12 | 아희++의 표준안과 예제를 수록하고 있습니다. 표준안에 애매한 내용이나 질문이 있다면 이슈 등을 통해 알려주시면 감사하겠습니다.
13 | ## 아희++ 표준 인터프리터의 특징
14 | - **강력한 유니코드 지원**
15 | 코드에 이모지 등의 2개 이상의 코드 포인트로 구성되는 다양한 유니코드 문자를 사용해도 1글자로 정상적으로 인식합니다.
16 | - **범용성**
17 | 아희++은 물론이며, 아희도 인터프리팅 할 수 있습니다.
18 | - **통로 지원**
19 | 아희 구현체 최초로 통로를 *제대로* 지원합니다. C++로 작성된 '아희++ 표준 인터프리터 확장'을 연결하면 통로를 통해 확장과 통신할 수 있습니다. C++을 사용할 수 있다면 누구나 확장을 만들 수 있습니다.
20 | - **높은 이식성**
21 | 아희++ 표준 인터프리터의 모든 소스 코드는 컴파일러 확장 등이 사용되지 않아 C++17 표준을 정상적으로 지원하는 컴파일러 모두에서 정상적으로 컴파일 될 수 있습니다.
22 | ## 컴파일
23 | ### 필요한 소프트웨어
24 | - CMake 3.8.0 이상
25 | - C++17 표준을 정상적으로 지원하는 C++ 컴파일러
26 | ### 컴파일 방법
27 | 1. 이 레포지토리를 로컬에 복제합니다.
28 | 2. 복제된 디렉토리 내부에 있는 CMakeLists.txt 파일을 CMake로 실행합니다.
29 | 3. CMake가 생성한 빌드 스크립트를 적절한 소프트웨어로 실행합니다.
30 | ### Git+Makefile
31 | ```
32 | $ git clone -b stable https://github.com/kmc7468/Aheuiplusplus.git
33 | $ cd ./Aheuiplusplus
34 | $ cmake CMakeLists.txt
35 | $ make
36 | ```
37 | `-b stable` 옵션은 릴리즈 중 *정식 버전만* 보았을 때, 가장 최신의 릴리즈의 소스 코드를 복제하도록 하는 옵션입니다. `-b pre-release` 옵션으로 수정할 경우 모든 릴리즈 중 가장 최신의 릴리즈의 소스 코드를 복제하게 되며, 옵션을 삭제할 경우 여기에 릴리즈 되지 않은 소스 코드도 포함해 가장 최신의 소스 코드를 복제하게 됩니다. 옵션을 삭제하는 것은 권장되지 않습니다.
38 | ### CMake 옵션
39 | - `COMPILE_TARGET`:
40 | 아희++ 표준 인터프리터를 실행 파일의 형태로 컴파일 할지, 정적 라이브러리의 형태로 컴파일 할지 설정하는 옵션입니다.
41 | - 값은 `Executable`, `Library` 중 하나이며, 대소문자는 구분되지 않습니다. 전자는 실행 파일, 후자는 정적 라이브러리의 형태를 의미합니다.
42 | - `Executable`은 `E`와 `Exe`로 축약할 수 있으며, `Library`는 `L`과 `Lib`로 축약할 수 있습니다. 축약형 역시 대소문자는 구분되지 않습니다.
43 | - `USE_EXTENSION`:
44 | 아희++ 표준 인터프리터 확장을 사용할지 설정하는 옵션입니다.
45 | - `PRINT_BENCHMARK`:
46 | 아희++ 표준 인터프리터의 성능을 출력할지 설정하는 옵션입니다.
47 | ## 예제
48 | 더 많은 예제는 레퍼런스에서 확인하실 수 있습니다.
49 | ### [개발자 수다방](https://gist.github.com/RanolP/6ecb4b1030fccad19dc05f3716d6c2c7) by [RanolP](https://gist.github.com/RanolP)
50 | ```
51 | 개반뭉반붓밪두빥붖빠뭏밠뭉박누망뭏따뿌삭뿌밪붅파투밣뚜타댜뎌뭏뷺다두타두밢두밙뚜빥푸다뿑빠뿌빥분받뚜삽쑤밪불빥두받투밧누
52 | 발꾔바몽나몽망봀타뽀바몽맣본빠몽밤봃싹뫃빠소따뽅빥볼타빠쑺봃밠뽅소두봎뭏또두볻두봃쑵봃붖뽀뿌토붅또투도수소뚜도푸토뭏본뭉
53 | 자두변번뻕떠벌벚멓더떠벓벐더머퍼뻕더뻕벒뻕더벇뻕떠벐번멓서볻퍼두뫃불포두봀뭏뽅뭏뽅투뫃불속뭏볾뚜쏩뭏뽅투뫃뿑노투도분소붋
54 | 수뺝리밪밤따다맣밪타빥밠빥파타반밧나타타삭맣사맣밢타빥맣발다뽅맣속타뽅빥본밦토밦도밞토따도사뫃빠뽀밦도맣속반봇밠뽅삭뫃뿌
55 | 다총통각하만세삼창해멓북번붏멓뚜벖두뻕숙멓붊번붇썩투퍼투너뚜벓수멓두번푸뻕푸터두번불벚두벘뿑벐뿑더뿑벑숮멓투떠붍번뿌떠붐
56 | 방망희됴아하는난로당도너또범토더봆벌토벌토더토너뽀퍼뽅터봇번볻뻐속멓토머볾터포뻕뽅떠뫃더토더토퍼본더뫃뻐속멓봆더도뻕또더
57 | ```
58 | 출력: `2018.07.11. 개발자 수다방: 텔레그램 에디션 제 1회 개천절 경축!`
59 | ## 외부 라이브러리
60 | 아래에 열거된 외부 라이브러리들은 아희++ 표준 인터프리터를 컴파일 할 때 같이 컴파일 되므로 추가적인 작업이 필요하지 않습니다.
61 | - [u5e](https://github.com/ruoso/u5e)의 커밋 [3b970d5](https://github.com/ruoso/u5e/tree/3b970d5bc251fdef341d039d66c84ec5eaf4cb6a) - 2-clause BSD license
62 | - include/u5e/basic_grapheme_iterator.hpp 파일의 159번 줄 및 170번 줄이 수정되었습니다. (커밋 [abca129](https://github.com/kmc7468/Aheuiplusplus/commit/abca1292fe6c421d835516e00b33d62ae5710200))
63 | - [UTF8-CPP](https://github.com/nemtrif/utfcpp) 2.3.5
64 | ## 이런 프로젝트는 어떠세요?
65 | - [TemplatedAH](https://github.com/kmc7468/TemplatedAH) - 템플릿 메타 프로그래밍을 이용한 아희 인터프리터
66 | - **[톡희](https://github.com/kmc7468/talkheui) - 새로운 아희 인터프리터**
67 | 최초로 통로를 제대로 구현한 아희 인터프리터! 아희++의 정신적 후속작입니다.
68 | ## 라이선스
69 | 아희++ 표준 인터프리터의 모든 소스 코드는 MIT 라이선스가 적용됩니다. 단, 외부 라이브러리에는 적용되지 않습니다.
70 | ```
71 | MIT License
72 |
73 | Copyright (c) 2018 kmc7468
74 |
75 | Permission is hereby granted, free of charge, to any person obtaining a copy
76 | of this software and associated documentation files (the "Software"), to deal
77 | in the Software without restriction, including without limitation the rights
78 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
79 | copies of the Software, and to permit persons to whom the Software is
80 | furnished to do so, subject to the following conditions:
81 |
82 | The above copyright notice and this permission notice shall be included in all
83 | copies or substantial portions of the Software.
84 |
85 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
86 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
87 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
88 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
89 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
90 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
91 | SOFTWARE.
92 | ```
--------------------------------------------------------------------------------
/extlib/UTF8-CPP/include/utf8/checked.h:
--------------------------------------------------------------------------------
1 | // Copyright 2006-2016 Nemanja Trifunovic
2 |
3 | /*
4 | Permission is hereby granted, free of charge, to any person or organization
5 | obtaining a copy of the software and accompanying documentation covered by
6 | this license (the "Software") to use, reproduce, display, distribute,
7 | execute, and transmit the Software, and to prepare derivative works of the
8 | Software, and to permit third-parties to whom the Software is furnished to
9 | do so, all subject to the following:
10 |
11 | The copyright notices in the Software and this entire statement, including
12 | the above license grant, this restriction and the following disclaimer,
13 | must be included in all copies of the Software, in whole or in part, and
14 | all derivative works of the Software, unless such copies or derivative
15 | works are solely in the form of machine-executable object code generated by
16 | a source language processor.
17 |
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 | DEALINGS IN THE SOFTWARE.
25 | */
26 |
27 |
28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
30 |
31 | #include "core.h"
32 | #include <stdexcept> // std::out_of_range and std::logic_error thrown by utf8::iterator; also brings in std::exception
33 |
34 | namespace utf8
35 | {
36 | // Common base class: every exception class declared in this header derives from it, so catching utf8::exception covers them all
37 | class exception : public ::std::exception {
38 | };
39 |
40 | // Thrown when a value is not a valid Unicode code point (e.g. by append() and next()); carries the offending value.
41 | class invalid_code_point : public exception {
42 | uint32_t cp;
43 | public:
44 | invalid_code_point(uint32_t codepoint) : cp(codepoint) {}
45 | virtual const char* what() const throw() { return "Invalid code point"; }
46 | uint32_t code_point() const {return cp;} // the value passed to the constructor
47 | };
48 | // Thrown when an octet sequence is not well-formed UTF-8 (e.g. by next(), prior() and previous()); carries one octet of the bad input.
49 | class invalid_utf8 : public exception {
50 | uint8_t u8;
51 | public:
52 | invalid_utf8 (uint8_t u) : u8(u) {}
53 | virtual const char* what() const throw() { return "Invalid UTF-8"; }
54 | uint8_t utf8_octet() const {return u8;} // the octet passed to the constructor
55 | };
56 | // Thrown by utf16to8() for an unpaired or mismatched surrogate; carries the offending 16-bit unit.
57 | class invalid_utf16 : public exception {
58 | uint16_t u16;
59 | public:
60 | invalid_utf16 (uint16_t u) : u16(u) {}
61 | virtual const char* what() const throw() { return "Invalid UTF-16"; }
62 | uint16_t utf16_word() const {return u16;} // the unit passed to the constructor
63 | };
64 | // Thrown when the input range ends before a complete sequence could be read, or when prior() is called with it == start.
65 | class not_enough_room : public exception {
66 | public:
67 | virtual const char* what() const throw() { return "Not enough space"; }
68 | };
69 |
70 | /// The library API - functions intended to be called by the users
71 | // Writes the UTF-8 encoding of cp (1 to 4 octets) through result and returns the advanced iterator; throws invalid_code_point if cp fails is_code_point_valid().
72 | template <typename octet_iterator>
73 | octet_iterator append(uint32_t cp, octet_iterator result)
74 | {
75 | if (!utf8::internal::is_code_point_valid(cp))
76 | throw invalid_code_point(cp);
77 |
78 | if (cp < 0x80) // one octet
79 | *(result++) = static_cast<uint8_t>(cp);
80 | else if (cp < 0x800) { // two octets
81 | *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
82 | *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
83 | }
84 | else if (cp < 0x10000) { // three octets
85 | *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
86 | *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
87 | *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
88 | }
89 | else { // four octets
90 | *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
91 | *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
92 | *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
93 | *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
94 | }
95 | return result;
96 | }
97 | // Copies [start, end) to out, writing the encoding of replacement in place of each invalid sequence; returns the advanced out. Throws not_enough_room if the input ends mid-sequence.
98 | template <typename octet_iterator, typename output_iterator>
99 | output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
100 | {
101 | while (start != end) {
102 | octet_iterator sequence_start = start;
103 | internal::utf_error err_code = utf8::internal::validate_next(start, end);
104 | switch (err_code) {
105 | case internal::UTF8_OK :
106 | for (octet_iterator it = sequence_start; it != start; ++it)
107 | *out++ = *it;
108 | break;
109 | case internal::NOT_ENOUGH_ROOM:
110 | throw not_enough_room();
111 | case internal::INVALID_LEAD:
112 | out = utf8::append (replacement, out);
113 | ++start;
114 | break;
115 | case internal::INCOMPLETE_SEQUENCE:
116 | case internal::OVERLONG_SEQUENCE:
117 | case internal::INVALID_CODE_POINT:
118 | out = utf8::append (replacement, out);
119 | ++start;
120 | // just one replacement mark for the sequence
121 | while (start != end && utf8::internal::is_trail(*start))
122 | ++start;
123 | break;
124 | }
125 | }
126 | return out;
127 | }
128 | // Convenience overload of replace_invalid() that uses U+FFFD as the replacement marker.
129 | template <typename octet_iterator, typename output_iterator>
130 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
131 | {
132 | static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
133 | return utf8::replace_invalid(start, end, out, replacement_marker);
134 | }
135 |
136 | template
137 | uint32_t next(octet_iterator& it, octet_iterator end)
138 | {
139 | uint32_t cp = 0;
140 | internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
141 | switch (err_code) {
142 | case internal::UTF8_OK :
143 | break;
144 | case internal::NOT_ENOUGH_ROOM :
145 | throw not_enough_room();
146 | case internal::INVALID_LEAD :
147 | case internal::INCOMPLETE_SEQUENCE :
148 | case internal::OVERLONG_SEQUENCE :
149 | throw invalid_utf8(*it);
150 | case internal::INVALID_CODE_POINT :
151 | throw invalid_code_point(cp);
152 | }
153 | return cp;
154 | }
155 | // Same as next(), but it is taken by value so the caller's iterator is left untouched.
156 | template <typename octet_iterator>
157 | uint32_t peek_next(octet_iterator it, octet_iterator end)
158 | {
159 | return utf8::next(it, end);
160 | }
161 | // Moves it back to the first non-trail octet before its current position and returns the code point decoded from there; throws not_enough_room if it == start, invalid_utf8 if start is reached on a trail octet.
162 | template <typename octet_iterator>
163 | uint32_t prior(octet_iterator& it, octet_iterator start)
164 | {
165 | // can't do much if it == start
166 | if (it == start)
167 | throw not_enough_room();
168 |
169 | octet_iterator end = it;
170 | // Go back until we hit either a lead octet or start
171 | while (utf8::internal::is_trail(*(--it)))
172 | if (it == start)
173 | throw invalid_utf8(*it); // error - no lead byte in the sequence
174 | return utf8::peek_next(it, end);
175 | }
176 | // Like prior(), but performs no it == start check up front and throws invalid_utf8 when it reaches pass_start while still on a trail octet.
177 | /// Deprecated in versions that include "prior"
178 | template <typename octet_iterator>
179 | uint32_t previous(octet_iterator& it, octet_iterator pass_start)
180 | {
181 | octet_iterator end = it;
182 | while (utf8::internal::is_trail(*(--it)))
183 | if (it == pass_start)
184 | throw invalid_utf8(*it); // error - no lead byte in the sequence
185 | octet_iterator temp = it;
186 | return utf8::next(temp, end);
187 | }
188 | // Advances it by n code points by calling next() n times; any exception from next() propagates.
189 | template <typename octet_iterator, typename distance_type>
190 | void advance (octet_iterator& it, distance_type n, octet_iterator end)
191 | {
192 | for (distance_type i = 0; i < n; ++i)
193 | utf8::next(it, end);
194 | }
195 | // Returns the number of code points in [first, last), decoding each with next(); the loop test uses operator<, so the iterators must support it.
196 | template <typename octet_iterator>
197 | typename std::iterator_traits<octet_iterator>::difference_type
198 | distance (octet_iterator first, octet_iterator last)
199 | {
200 | typename std::iterator_traits<octet_iterator>::difference_type dist;
201 | for (dist = 0; first < last; ++dist)
202 | utf8::next(first, last);
203 | return dist;
204 | }
205 | // Converts the UTF-16 range [start, end) to UTF-8 written through result; returns the advanced result. Throws invalid_utf16 for an unpaired or mismatched surrogate.
206 | template <typename u16bit_iterator, typename octet_iterator>
207 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
208 | {
209 | while (start != end) {
210 | uint32_t cp = utf8::internal::mask16(*start++);
211 | // Take care of surrogate pairs first
212 | if (utf8::internal::is_lead_surrogate(cp)) {
213 | if (start != end) {
214 | uint32_t trail_surrogate = utf8::internal::mask16(*start++);
215 | if (utf8::internal::is_trail_surrogate(trail_surrogate))
216 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
217 | else
218 | throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
219 | }
220 | else
221 | throw invalid_utf16(static_cast<uint16_t>(cp));
222 |
223 | }
224 | // Lone trail surrogate
225 | else if (utf8::internal::is_trail_surrogate(cp))
226 | throw invalid_utf16(static_cast<uint16_t>(cp));
227 |
228 | result = utf8::append(cp, result);
229 | }
230 | return result;
231 | }
232 | // Converts the UTF-8 range [start, end) to UTF-16 written through result; code points above 0xffff become a surrogate pair. Returns the advanced result.
233 | template <typename u16bit_iterator, typename octet_iterator>
234 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
235 | {
236 | while (start < end) {
237 | uint32_t cp = utf8::next(start, end);
238 | if (cp > 0xffff) { //make a surrogate pair
239 | *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
240 | *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
241 | }
242 | else
243 | *result++ = static_cast<uint16_t>(cp);
244 | }
245 | return result;
246 | }
247 | // Encodes every code point of [start, end) with append() and returns the advanced result; append()'s invalid_code_point propagates.
248 | template <typename octet_iterator, typename u32bit_iterator>
249 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
250 | {
251 | while (start != end)
252 | result = utf8::append(*(start++), result);
253 |
254 | return result;
255 | }
256 | // Decodes every sequence of [start, end) with next() and writes the code points through result; returns the advanced result.
257 | template <typename octet_iterator, typename u32bit_iterator>
258 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
259 | {
260 | while (start < end)
261 | (*result++) = utf8::next(start, end);
262 |
263 | return result;
264 | }
265 |
266 | // Bidirectional iterator adaptor over an octet range: dereferencing yields a code point, ++/-- move by whole sequences via the checked next()/prior(), so they may throw.
267 | template <typename octet_iterator>
268 | class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
269 | octet_iterator it;
270 | octet_iterator range_start;
271 | octet_iterator range_end;
272 | public:
273 | iterator () {}
274 | explicit iterator (const octet_iterator& octet_it,
275 | const octet_iterator& rangestart,
276 | const octet_iterator& rangeend) :
277 | it(octet_it), range_start(rangestart), range_end(rangeend)
278 | {
279 | if (it < range_start || it > range_end)
280 | throw std::out_of_range("Invalid utf-8 iterator position");
281 | }
282 | // the default "big three" are OK
283 | octet_iterator base () const { return it; }
284 | uint32_t operator * () const
285 | {
286 | octet_iterator temp = it;
287 | return utf8::next(temp, range_end);
288 | }
289 | bool operator == (const iterator& rhs) const
290 | {
291 | if (range_start != rhs.range_start || range_end != rhs.range_end)
292 | throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
293 | return (it == rhs.it);
294 | }
295 | bool operator != (const iterator& rhs) const
296 | {
297 | return !(operator == (rhs));
298 | }
299 | iterator& operator ++ ()
300 | {
301 | utf8::next(it, range_end);
302 | return *this;
303 | }
304 | iterator operator ++ (int)
305 | {
306 | iterator temp = *this;
307 | utf8::next(it, range_end);
308 | return temp;
309 | }
310 | iterator& operator -- ()
311 | {
312 | utf8::prior(it, range_start);
313 | return *this;
314 | }
315 | iterator operator -- (int)
316 | {
317 | iterator temp = *this;
318 | utf8::prior(it, range_start);
319 | return temp;
320 | }
321 | }; // class iterator
322 |
323 | } // namespace utf8
324 |
325 | #endif //header guard
326 |
327 |
328 |
--------------------------------------------------------------------------------
/extlib/UTF8-CPP/include/utf8/core.h:
--------------------------------------------------------------------------------
1 | // Copyright 2006 Nemanja Trifunovic
2 |
3 | /*
4 | Permission is hereby granted, free of charge, to any person or organization
5 | obtaining a copy of the software and accompanying documentation covered by
6 | this license (the "Software") to use, reproduce, display, distribute,
7 | execute, and transmit the Software, and to prepare derivative works of the
8 | Software, and to permit third-parties to whom the Software is furnished to
9 | do so, all subject to the following:
10 |
11 | The copyright notices in the Software and this entire statement, including
12 | the above license grant, this restriction and the following disclaimer,
13 | must be included in all copies of the Software, in whole or in part, and
14 | all derivative works of the Software, unless such copies or derivative
15 | works are solely in the form of machine-executable object code generated by
16 | a source language processor.
17 |
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 | DEALINGS IN THE SOFTWARE.
25 | */
26 |
27 |
28 | #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 | #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
30 |
31 | #include <iterator> // std::iterator_traits, used by sequence_length()
32 |
33 | namespace utf8
34 | {
35 | // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers, declared inside namespace utf8.
36 | // You may need to change them to match your system.
37 | // These typedefs have the same names as ones from cstdint, or boost/cstdint
38 | typedef unsigned char uint8_t; // NOTE(review): assumes unsigned char is 8 bits wide on the target - confirm
39 | typedef unsigned short uint16_t; // NOTE(review): assumes unsigned short is 16 bits wide on the target - confirm
40 | typedef unsigned int uint32_t; // NOTE(review): assumes unsigned int is 32 bits wide on the target - confirm
41 |
42 | // Helper code - not intended to be directly called by the library users. May be changed at any time
43 | namespace internal
44 | {
45 | // Unicode constants
46 | // Leading (high) surrogates: 0xd800 - 0xdbff
47 | // Trailing (low) surrogates: 0xdc00 - 0xdfff
48 | const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
49 | const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
50 | const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
51 | const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
52 | const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); // added to (cp >> 10) by utf8to16() to form the lead surrogate
53 | const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; // added to (lead << 10) + trail by utf16to8() to form the code point
54 |
55 | // Maximum valid value for a Unicode code point
56 | const uint32_t CODE_POINT_MAX = 0x0010ffffu;
57 | // Returns the low 8 bits of oc as a uint8_t.
58 | template <typename octet_type>
59 | inline uint8_t mask8(octet_type oc)
60 | {
61 | return static_cast<uint8_t>(0xff & oc);
62 | }
63 | template <typename u16_type> // mask16: returns the low 16 bits of oc as a uint16_t
64 | inline uint16_t mask16(u16_type oc)
65 | {
66 | return static_cast<uint16_t>(0xffff & oc);
67 | }
68 | template <typename octet_type> // is_trail: true when the top two bits of the octet are 10, i.e. a UTF-8 continuation octet
69 | inline bool is_trail(octet_type oc)
70 | {
71 | return ((utf8::internal::mask8(oc) >> 6) == 0x2);
72 | }
73 | // True when cp lies in [LEAD_SURROGATE_MIN, LEAD_SURROGATE_MAX].
74 | template <typename u16>
75 | inline bool is_lead_surrogate(u16 cp)
76 | {
77 | return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
78 | }
79 | // True when cp lies in [TRAIL_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
80 | template <typename u16>
81 | inline bool is_trail_surrogate(u16 cp)
82 | {
83 | return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
84 | }
85 | // True when cp is any surrogate, i.e. lies in [LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
86 | template <typename u16>
87 | inline bool is_surrogate(u16 cp)
88 | {
89 | return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
90 | }
91 | // True when cp does not exceed CODE_POINT_MAX and is not a surrogate.
92 | template <typename u32>
93 | inline bool is_code_point_valid(u32 cp)
94 | {
95 | return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
96 | }
97 | // Returns the sequence length (1 to 4) announced by the lead octet at lead_it, or 0 when the octet matches no lead pattern.
98 | template <typename octet_iterator>
99 | inline typename std::iterator_traits<octet_iterator>::difference_type
100 | sequence_length(octet_iterator lead_it)
101 | {
102 | uint8_t lead = utf8::internal::mask8(*lead_it);
103 | if (lead < 0x80)
104 | return 1;
105 | else if ((lead >> 5) == 0x6)
106 | return 2;
107 | else if ((lead >> 4) == 0xe)
108 | return 3;
109 | else if ((lead >> 3) == 0x1e)
110 | return 4;
111 | else
112 | return 0;
113 | }
114 | // True when cp was encoded with a length other than the one its range requires (1, 2 or 3 octets); code points of 0x10000 and above are never reported as overlong.
115 | template <typename octet_difference_type>
116 | inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
117 | {
118 | if (cp < 0x80) {
119 | if (length != 1)
120 | return true;
121 | }
122 | else if (cp < 0x800) {
123 | if (length != 2)
124 | return true;
125 | }
126 | else if (cp < 0x10000) {
127 | if (length != 3)
128 | return true;
129 | }
130 |
131 | return false;
132 | }
133 | // Result codes returned by the decoding helpers in this namespace; UTF8_OK is the only success value.
134 | enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
135 |
136 | /// Helper for get_sequence_x: pre-increments it, then returns NOT_ENOUGH_ROOM if it reached end, INCOMPLETE_SEQUENCE if the new octet is not a trail octet, else UTF8_OK
137 | template <typename octet_iterator>
138 | utf_error increase_safely(octet_iterator& it, octet_iterator end)
139 | {
140 | if (++it == end)
141 | return NOT_ENOUGH_ROOM;
142 |
143 | if (!utf8::internal::is_trail(*it))
144 | return INCOMPLETE_SEQUENCE;
145 |
146 | return UTF8_OK;
147 | }
148 |
149 | #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
150 |
151 | /// get_sequence_x functions decode utf-8 sequences of the length x
152 | template
153 | utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
154 | {
155 | if (it == end)
156 | return NOT_ENOUGH_ROOM;
157 |
158 | code_point = utf8::internal::mask8(*it);
159 |
160 | return UTF8_OK;
161 | }
162 |
163 | template
164 | utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
165 | {
166 | if (it == end)
167 | return NOT_ENOUGH_ROOM;
168 |
169 | code_point = utf8::internal::mask8(*it);
170 |
171 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
172 |
173 | code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
174 |
175 | return UTF8_OK;
176 | }
177 |
178 | template
179 | utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
180 | {
181 | if (it == end)
182 | return NOT_ENOUGH_ROOM;
183 |
184 | code_point = utf8::internal::mask8(*it);
185 |
186 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
187 |
188 | code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
189 |
190 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
191 |
192 | code_point += (*it) & 0x3f;
193 |
194 | return UTF8_OK;
195 | }
196 |
197 | template
198 | utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
199 | {
200 | if (it == end)
201 | return NOT_ENOUGH_ROOM;
202 |
203 | code_point = utf8::internal::mask8(*it);
204 |
205 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
206 |
207 | code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
208 |
209 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
210 |
211 | code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
212 |
213 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
214 |
215 | code_point += (*it) & 0x3f;
216 |
217 | return UTF8_OK;
218 | }
219 |
220 | #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
221 |
222 | template
223 | utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
224 | {
225 | if (it == end)
226 | return NOT_ENOUGH_ROOM;
227 |
228 | // Save the original value of it so we can go back in case of failure
229 | // Of course, it does not make much sense with i.e. stream iterators
230 | octet_iterator original_it = it;
231 |
232 | uint32_t cp = 0;
233 | // Determine the sequence length based on the lead octet
234 | typedef typename std::iterator_traits::difference_type octet_difference_type;
235 | const octet_difference_type length = utf8::internal::sequence_length(it);
236 |
237 | // Get trail octets and calculate the code point
238 | utf_error err = UTF8_OK;
239 | switch (length) {
240 | case 0:
241 | return INVALID_LEAD;
242 | case 1:
243 | err = utf8::internal::get_sequence_1(it, end, cp);
244 | break;
245 | case 2:
246 | err = utf8::internal::get_sequence_2(it, end, cp);
247 | break;
248 | case 3:
249 | err = utf8::internal::get_sequence_3(it, end, cp);
250 | break;
251 | case 4:
252 | err = utf8::internal::get_sequence_4(it, end, cp);
253 | break;
254 | }
255 |
256 | if (err == UTF8_OK) {
257 | // Decoding succeeded. Now, security checks...
258 | if (utf8::internal::is_code_point_valid(cp)) {
259 | if (!utf8::internal::is_overlong_sequence(cp, length)){
260 | // Passed! Return here.
261 | code_point = cp;
262 | ++it;
263 | return UTF8_OK;
264 | }
265 | else
266 | err = OVERLONG_SEQUENCE;
267 | }
268 | else
269 | err = INVALID_CODE_POINT;
270 | }
271 |
272 | // Failure branch - restore the original value of the iterator
273 | it = original_it;
274 | return err;
275 | }
276 |
277 | template
278 | inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
279 | uint32_t ignored;
280 | return utf8::internal::validate_next(it, end, ignored);
281 | }
282 |
283 | } // namespace internal
284 |
285 | /// The library API - functions intended to be called by the users
286 |
287 | // Byte order mark
288 | const uint8_t bom[] = {0xef, 0xbb, 0xbf};
289 |
290 | template
291 | octet_iterator find_invalid(octet_iterator start, octet_iterator end)
292 | {
293 | octet_iterator result = start;
294 | while (result != end) {
295 | utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
296 | if (err_code != internal::UTF8_OK)
297 | return result;
298 | }
299 | return result;
300 | }
301 |
302 | template
303 | inline bool is_valid(octet_iterator start, octet_iterator end)
304 | {
305 | return (utf8::find_invalid(start, end) == end);
306 | }
307 |
308 | template
309 | inline bool starts_with_bom (octet_iterator it, octet_iterator end)
310 | {
311 | return (
312 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
313 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
314 | ((it != end) && (utf8::internal::mask8(*it)) == bom[2])
315 | );
316 | }
317 |
318 | //Deprecated in release 2.3
319 | template
320 | inline bool is_bom (octet_iterator it)
321 | {
322 | return (
323 | (utf8::internal::mask8(*it++)) == bom[0] &&
324 | (utf8::internal::mask8(*it++)) == bom[1] &&
325 | (utf8::internal::mask8(*it)) == bom[2]
326 | );
327 | }
328 | } // namespace utf8
329 |
330 | #endif // header guard
331 |
332 |
333 |
--------------------------------------------------------------------------------
/extlib/UTF8-CPP/include/utf8/unchecked.h:
--------------------------------------------------------------------------------
1 | // Copyright 2006 Nemanja Trifunovic
2 |
3 | /*
4 | Permission is hereby granted, free of charge, to any person or organization
5 | obtaining a copy of the software and accompanying documentation covered by
6 | this license (the "Software") to use, reproduce, display, distribute,
7 | execute, and transmit the Software, and to prepare derivative works of the
8 | Software, and to permit third-parties to whom the Software is furnished to
9 | do so, all subject to the following:
10 |
11 | The copyright notices in the Software and this entire statement, including
12 | the above license grant, this restriction and the following disclaimer,
13 | must be included in all copies of the Software, in whole or in part, and
14 | all derivative works of the Software, unless such copies or derivative
15 | works are solely in the form of machine-executable object code generated by
16 | a source language processor.
17 |
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 | DEALINGS IN THE SOFTWARE.
25 | */
26 |
27 |
28 | #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 | #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
30 |
31 | #include "core.h"
32 |
33 | namespace utf8
34 | {
35 | namespace unchecked
36 | {
/// Encodes \p cp as UTF-8 and writes the octets through \p result.
///
/// No validation is performed on \p cp.
///
/// \return the output iterator advanced past the octets written.
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
    if (cp < 0x80)                        // one octet
        *(result++) = static_cast<uint8_t>(cp);
    else if (cp < 0x800) {                // two octets
        *(result++) = static_cast<uint8_t>((cp >> 6)          | 0xc0);
        *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
    }
    else if (cp < 0x10000) {              // three octets
        *(result++) = static_cast<uint8_t>((cp >> 12)         | 0xe0);
        *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
        *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
    }
    else {                                // four octets
        *(result++) = static_cast<uint8_t>((cp >> 18)         | 0xf0);
        *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
        *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
        *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
    }
    return result;
}
59 |
60 | template
61 | uint32_t next(octet_iterator& it)
62 | {
63 | uint32_t cp = utf8::internal::mask8(*it);
64 | typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it);
65 | switch (length) {
66 | case 1:
67 | break;
68 | case 2:
69 | it++;
70 | cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
71 | break;
72 | case 3:
73 | ++it;
74 | cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
75 | ++it;
76 | cp += (*it) & 0x3f;
77 | break;
78 | case 4:
79 | ++it;
80 | cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
81 | ++it;
82 | cp += (utf8::internal::mask8(*it) << 6) & 0xfff;
83 | ++it;
84 | cp += (*it) & 0x3f;
85 | break;
86 | }
87 | ++it;
88 | return cp;
89 | }
90 |
91 | template
92 | uint32_t peek_next(octet_iterator it)
93 | {
94 | return utf8::unchecked::next(it);
95 | }
96 |
97 | template
98 | uint32_t prior(octet_iterator& it)
99 | {
100 | while (utf8::internal::is_trail(*(--it))) ;
101 | octet_iterator temp = it;
102 | return utf8::unchecked::next(temp);
103 | }
104 |
105 | // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
106 | template
107 | inline uint32_t previous(octet_iterator& it)
108 | {
109 | return utf8::unchecked::prior(it);
110 | }
111 |
112 | template
113 | void advance (octet_iterator& it, distance_type n)
114 | {
115 | for (distance_type i = 0; i < n; ++i)
116 | utf8::unchecked::next(it);
117 | }
118 |
119 | template
120 | typename std::iterator_traits::difference_type
121 | distance (octet_iterator first, octet_iterator last)
122 | {
123 | typename std::iterator_traits::difference_type dist;
124 | for (dist = 0; first < last; ++dist)
125 | utf8::unchecked::next(first);
126 | return dist;
127 | }
128 |
129 | template
130 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
131 | {
132 | while (start != end) {
133 | uint32_t cp = utf8::internal::mask16(*start++);
134 | // Take care of surrogate pairs first
135 | if (utf8::internal::is_lead_surrogate(cp)) {
136 | uint32_t trail_surrogate = utf8::internal::mask16(*start++);
137 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
138 | }
139 | result = utf8::unchecked::append(cp, result);
140 | }
141 | return result;
142 | }
143 |
144 | template
145 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
146 | {
147 | while (start < end) {
148 | uint32_t cp = utf8::unchecked::next(start);
149 | if (cp > 0xffff) { //make a surrogate pair
150 | *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET);
151 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
152 | }
153 | else
154 | *result++ = static_cast(cp);
155 | }
156 | return result;
157 | }
158 |
159 | template
160 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
161 | {
162 | while (start != end)
163 | result = utf8::unchecked::append(*(start++), result);
164 |
165 | return result;
166 | }
167 |
168 | template
169 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
170 | {
171 | while (start < end)
172 | (*result++) = utf8::unchecked::next(start);
173 |
174 | return result;
175 | }
176 |
177 | // The iterator class
178 | template
179 | class iterator : public std::iterator {
180 | octet_iterator it;
181 | public:
182 | iterator () {}
183 | explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
184 | // the default "big three" are OK
185 | octet_iterator base () const { return it; }
186 | uint32_t operator * () const
187 | {
188 | octet_iterator temp = it;
189 | return utf8::unchecked::next(temp);
190 | }
191 | bool operator == (const iterator& rhs) const
192 | {
193 | return (it == rhs.it);
194 | }
195 | bool operator != (const iterator& rhs) const
196 | {
197 | return !(operator == (rhs));
198 | }
199 | iterator& operator ++ ()
200 | {
201 | ::std::advance(it, utf8::internal::sequence_length(it));
202 | return *this;
203 | }
204 | iterator operator ++ (int)
205 | {
206 | iterator temp = *this;
207 | ::std::advance(it, utf8::internal::sequence_length(it));
208 | return temp;
209 | }
210 | iterator& operator -- ()
211 | {
212 | utf8::unchecked::prior(it);
213 | return *this;
214 | }
215 | iterator operator -- (int)
216 | {
217 | iterator temp = *this;
218 | utf8::unchecked::prior(it);
219 | return temp;
220 | }
221 | }; // class iterator
222 |
223 | } // namespace utf8::unchecked
224 | } // namespace utf8
225 |
226 |
227 | #endif // header guard
228 |
229 |
--------------------------------------------------------------------------------
/extlib/UTF8-CPP/include/utf8/utf8.h:
--------------------------------------------------------------------------------
1 | // Copyright 2006 Nemanja Trifunovic
2 |
3 | /*
4 | Permission is hereby granted, free of charge, to any person or organization
5 | obtaining a copy of the software and accompanying documentation covered by
6 | this license (the "Software") to use, reproduce, display, distribute,
7 | execute, and transmit the Software, and to prepare derivative works of the
8 | Software, and to permit third-parties to whom the Software is furnished to
9 | do so, all subject to the following:
10 |
11 | The copyright notices in the Software and this entire statement, including
12 | the above license grant, this restriction and the following disclaimer,
13 | must be included in all copies of the Software, in whole or in part, and
14 | all derivative works of the Software, unless such copies or derivative
15 | works are solely in the form of machine-executable object code generated by
16 | a source language processor.
17 |
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 | DEALINGS IN THE SOFTWARE.
25 | */
26 |
27 |
28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
30 |
31 | #include "checked.h"
32 | #include "unchecked.h"
33 |
34 | #endif // header guard
35 |
--------------------------------------------------------------------------------
/extlib/u5e/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015, Daniel Ruoso
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/basic_encodedstring.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_BASIC_ENCODEDSTRING
2 | #define INCLUDED_U5E_BASIC_ENCODEDSTRING
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 |
10 | namespace u5e {
11 | template class basic_grapheme_iterator;
12 |
13 | /**
14 | * \brief basic encoding support over string-like objects.
15 | *
16 | * u5e::basic_encodedstring implements encoding support on top of a
17 | * string-like object, it is implemented by simply wrapping the
18 | * native string type in order to provide a customized iterator
19 | * that offers codepoint-by-codepoint access instead of iterating
20 | * over the native type.
21 | *
22 | * \tparam Encoding Text is always represented in a specific
23 | * encoding, there is no such thing as a "natural", or "native"
24 | * representation of text, for that reason, the encoding is a part
25 | * of the type.
26 | *
27 | * \tparam NativeString In order to re-use the string support,
28 | * this will always be implemented as a wrapper around an
29 | * native string-like type. The idea is that the C++ string
30 | * libraries operate on unencoded memory, while the u5e types
31 | * offer a layer on top of that for the purposes of implementing
32 | * unicode in a type-safe way. Note that this applies to any
33 | * 'string-like' object, such as string or string_view.
34 | */
35 | template
37 | class basic_encodedstring {
38 | public:
39 | //@{
40 | /**
41 | * Offer an interface such that the size of the thing you're
42 | * iterating over is a codepoint, regardless of the native
43 | * type.
44 | */
45 | typedef u5e::codepoint_traits traits_type;
46 | typedef u5e::codepoint value_type;
47 | typedef u5e::codepoint_traits::pos_type size_type;
48 | typedef u5e::codepoint_traits::off_type difference_type;
49 | typedef value_type& reference;
50 | typedef const value_type& const_reference;
51 | typedef typename NativeString::pointer pointer;
52 | typedef typename NativeString::const_pointer const_pointer;
53 | //@}
54 |
55 | //@{
56 | /**
57 | * The Encoding template argument must provide iterator and
58 | * const_iterator member types. Those should iterate over
59 | * codepoints, regardless of the encoding and the native type.
60 | *
61 | * The iterator and const_iterator member types must be themselves
62 | * templates that take the NativeString type as a template
63 | * argument.
64 | */
65 | typedef typename Encoding::template iterator
66 | iterator;
67 | typedef typename Encoding::template const_iterator
68 | const_iterator;
69 | //@}
70 |
71 | //@{
72 | /**
73 | * Delegated to std::reverse_iterator
74 | */
75 | typedef std::reverse_iterator reverse_iterator;
76 | typedef std::reverse_iterator const_reverse_iterator;
77 | //@}
78 |
79 | /**
80 | * \brief Raw buffer as specified by the native type.
81 | *
82 | * This means that this class is exactly as expensive as whichever
83 | * native type is being used, it also means this class delegates
84 | * all memory management to that native type.
85 | *
86 | * This member is public because you should be able to completely
87 | * manage the native object if you need to.
88 | */
89 | NativeString native_string;
90 |
91 | /**
92 | * Default constructor, delegated to the native type.
93 | */
94 | basic_encodedstring() = default;
95 |
96 | /**
97 | * Implicit conversion from the native type.
98 | */
99 | basic_encodedstring(const NativeString& s)
100 | : native_string(s) { };
101 |
102 | /**
103 | * Assignment operator, assigns the native type.
104 | */
105 | basic_encodedstring&
106 | operator= (const basic_encodedstring &other) {
107 | native_string = other;
108 | }
109 |
110 | //@{
111 | /**
112 | * Get begin and end native iterators.
113 | */
114 | inline typename NativeString::iterator native_begin() {
115 | return native_string.begin();
116 | }
117 | inline typename NativeString::iterator native_end() {
118 | return native_string.end();
119 | }
120 | inline typename NativeString::const_iterator native_cbegin() {
121 | return native_string.cbegin();
122 | }
123 | inline typename NativeString::const_iterator native_cend() {
124 | return native_string.cend();
125 | }
126 | //@}
127 |
128 | //@{
129 | /**
130 | * Get begin and end codepoint iterators.
131 | */
132 | inline iterator codepoint_begin() {
133 | return iterator(native_string.begin());
134 | }
135 | inline iterator codepoint_end() {
136 | return iterator(native_string.end());
137 | }
138 | inline const_iterator codepoint_cbegin() {
139 | return const_iterator(native_string.cbegin());
140 | }
141 | inline const_iterator codepoint_cend() {
142 | return const_iterator(native_string.cend());
143 | }
144 | //@}
145 |
146 | //@{
147 | /**
148 | * Get begin and end grapheme iterators.
149 | * Graphemes are always built from the const iterators, since graphemes
150 | * are always immutable.
151 | */
152 | inline basic_grapheme_iterator grapheme_begin() {
153 | basic_grapheme_iterator i(codepoint_cbegin(),
154 | codepoint_cend());
155 | return i;
156 | }
157 | inline basic_grapheme_iterator grapheme_end() {
158 | basic_grapheme_iterator i(codepoint_cbegin(),
159 | codepoint_cend(),
160 | codepoint_cend());
161 | return i;
162 | }
163 | //@}
164 |
165 | //@{
166 | /**
167 | * Append from input iterators.
168 | *
169 | * Note that this is only possible from iterators of the same
170 | * encoding. This will not perform any conversion.
171 | */
// NOTE(review): the template parameter list and the template arguments of
// this overload appear to have been lost in this listing (bare "template",
// "basic_encodedstring::const_iterator", "native_const_iterator(first)");
// restore them from the upstream header before compiling.
// Appends the native range obtained by converting both codepoint iterators
// with Encoding::native_const_iterator.
172 | template
173 | inline basic_encodedstring& append
174 | (
175 | typename basic_encodedstring::const_iterator first,
176 | typename basic_encodedstring::const_iterator last
177 | ) {
178 | native_string.append
179 | (Encoding::template native_const_iterator(first),
180 | Encoding::template native_const_iterator(last)
181 | );
182 | return *this;
183 | }
184 |
// Overload for this string's own const_iterator type.
// NOTE(review): as printed, append(first,last) resolves to this very
// overload; presumably the call originally named the template overload with
// an explicit template argument - confirm.
185 | inline basic_encodedstring& append
186 | (const_iterator first,const_iterator last
187 | ) {
188 | return append(first,last);
189 | }
190 |
// Appends the range delimited by two grapheme iterators.
// NOTE(review): both first and last are dereferenced, and codepoint
// iterators (not native iterators) are handed to native_string.append -
// confirm this is intended.
191 | template
192 | inline basic_encodedstring& append
193 | (basic_grapheme_iterator>& first,
194 | basic_grapheme_iterator>& last)
195 | {
196 | native_string.append((*first).codepoint_begin(),
197 | (*last).codepoint_begin());
198 | return *this;
199 | }
200 |
// Overload for grapheme iterators.
// NOTE(review): as printed, append(first, last) resolves to this very
// overload; presumably an explicit template argument was lost - confirm.
201 | inline basic_encodedstring& append
202 | (basic_grapheme_iterator& first,
203 | basic_grapheme_iterator& last) {
204 | return append(first, last);
205 | }
206 |
// Appends from a utf32ne-encoded range; the work is delegated to
// Encoding::append_from_utf32ne, which receives the native iterators of the
// range and this object's native_string.
207 | template
208 | inline basic_encodedstring& append_from_utf32ne
209 | (
210 | typename basic_encodedstring::const_iterator first,
211 | typename basic_encodedstring::const_iterator last
212 | ) {
213 | Encoding::append_from_utf32ne
214 | (utf32ne::template native_const_iterator(first),
215 | utf32ne::template native_const_iterator(last),
216 | native_string);
217 | return *this;
218 | }
219 | //@}
220 |
221 | };
222 |
223 | }
224 |
225 | #endif
226 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/basic_grapheme.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_BASIC_GRAPHEME
2 | #define INCLUDED_U5E_BASIC_GRAPHEME
3 |
4 | namespace u5e {
/**
 * \brief Represents a single grapheme cluster
 *
 * It works by holding start and end values for an underlying
 * encodedstring_view-like object.
 *
 * \tparam UnderlyingEncodedStringView a basic_encodedstring
 * instantiation.
 */
template <typename UnderlyingEncodedStringView>
class basic_grapheme {
public:
  /**
   * const_codepoint_iterator allows you to traverse the codepoints
   * inside this grapheme.
   */
  typedef typename UnderlyingEncodedStringView::const_iterator
  const_codepoint_iterator;

private:
  /**
   * represents where the grapheme starts
   */
  const_codepoint_iterator d_begin;

  /**
   * represents where the grapheme ends
   */
  const_codepoint_iterator d_end;

public:
  /**
   * Default constructor is only valid if the underlying type allows it.
   * Both iterators are value-initialized.
   */
  basic_grapheme() : d_begin(), d_end() {}

  /**
   * Construct it with the iterators representing the begin and the
   * end of the grapheme.
   */
  basic_grapheme(const_codepoint_iterator b,
                 const_codepoint_iterator e)
    : d_begin(b), d_end(e) {}

  /**
   * get the beginning of the codepoints
   */
  const_codepoint_iterator codepoint_begin() const {
    return d_begin;
  }

  /**
   * get the end of the codepoints
   */
  const_codepoint_iterator codepoint_end() const {
    return d_end;
  }
};
63 | }
64 |
65 | #endif
66 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/basic_grapheme_iterator.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_BASIC_GRAPHEME_ITERATOR
2 | #define INCLUDED_U5E_BASIC_GRAPHEME_ITERATOR
3 |
4 | #include
5 | #include
6 |
7 | namespace u5e {
8 |
9 | /**
10 | * \brief Iterator that describes full graphemes.
11 | *
12 | * \tparam UnderlyingEncodedStringView the underlying encoded string
13 | * type with an underlying native string-like type.
14 | */
15 | template
16 | class basic_grapheme_iterator {
17 | public:
18 | /**
19 | * The type of the underlying encoded iterator
20 | */
21 | typedef typename UnderlyingEncodedStringView::const_iterator
22 | const_codepoint_iterator;
23 |
24 | /**
25 | * the specific grapheme type for this encoded string view
26 | */
27 | typedef basic_grapheme grapheme;
28 | typedef grapheme value_type;
29 |
30 | //@{
31 | /**
32 | * The begin and end iterators for the whole text are necessary for
33 | * bounds check, since the size of graphemes cannot be predicted.
34 | */
35 | const_codepoint_iterator begin_;
36 | const_codepoint_iterator end_;
37 | //@}
38 | //@{
39 | /**
40 | * This par of iterators point to where we are now and where the end
41 | * of the current grapheme is.
42 | */
43 | const_codepoint_iterator where_;
44 | const_codepoint_iterator end_of_grapheme_;
45 | //@}
46 |
47 | typedef props::grapheme_cluster_break::prop_value_type g_c_b_vt;
48 |
49 | /**
50 | * The unicode standard documents that a grapheme boundary can be
51 | * determined by looking just at two adjecent codepoints.
52 | */
53 | bool is_grapheme_boundary(codepoint a, codepoint b) {
54 | g_c_b_vt va = props::grapheme_cluster_break::resolve(a);
55 | g_c_b_vt vb = props::grapheme_cluster_break::resolve(b);
56 |
57 | if (va == g_c_b_vt::CR &&
58 | vb == g_c_b_vt::LF) {
59 | // GB3
60 | return false;
61 | } else if (va == g_c_b_vt::CR ||
62 | va == g_c_b_vt::LF ||
63 | va == g_c_b_vt::CONTROL) {
64 | // GB4
65 | return true;
66 | } else if (vb == g_c_b_vt::CR ||
67 | vb == g_c_b_vt::LF ||
68 | vb == g_c_b_vt::CONTROL) {
69 | // GB5
70 | return true;
71 | } else if (va == g_c_b_vt::L &&
72 | (vb == g_c_b_vt::L ||
73 | vb == g_c_b_vt::V ||
74 | vb == g_c_b_vt::LV ||
75 | vb == g_c_b_vt::LVT)) {
76 | // GB6
77 | return false;
78 | } else if ((va == g_c_b_vt::LV ||
79 | va == g_c_b_vt::V) &&
80 | (vb == g_c_b_vt::V ||
81 | vb == g_c_b_vt::T)) {
82 | // GB7
83 | return false;
84 | } else if ((va == g_c_b_vt::LVT ||
85 | va == g_c_b_vt::T) &&
86 | vb == g_c_b_vt::T) {
87 | // GB8
88 | return false;
89 | } else if (vb == g_c_b_vt::EXTEND ||
90 | vb == g_c_b_vt::ZWJ) {
91 | // GB9
92 | return false;
93 | } else if (vb == g_c_b_vt::SPACINGMARK) {
94 | // GB9a
95 | return false;
96 | } else if (va == g_c_b_vt::PREPEND) {
97 | // GB9b
98 | return false;
99 | } else if ( ( (va == g_c_b_vt::E_BASE ||
100 | va == g_c_b_vt::E_BASE_GAZ) &&
101 | vb == g_c_b_vt::E_MODIFIER) ||
102 | ( va == g_c_b_vt::EXTEND &&
103 | vb == g_c_b_vt::E_MODIFIER )) {
104 | // GB10 -- that is the interpretation I can make
105 | // of the combination of the fact that you should be able
106 | // to compare only two adjancent characters and the text of
107 | // the standard.
108 | return false;
109 | } else if (va == g_c_b_vt::ZWJ &&
110 | (vb == g_c_b_vt::GLUE_AFTER_ZWJ ||
111 | vb == g_c_b_vt::E_BASE_GAZ)) {
112 | // GB11
113 | return false;
114 | } else if (va == g_c_b_vt::REGIONAL_INDICATOR &&
115 | vb == g_c_b_vt::REGIONAL_INDICATOR) {
116 | // GB12, GB13
117 | // again, I take the liberty to assume the earlier part of the text
118 | // that says you only need to look at two adjacent characters
119 | return false;
120 | } else {
121 | // GB999
122 | return true;
123 | }
124 | }
125 |
126 | //@{
127 | /**
128 | * Use the data from the unicode database to find the start and
129 | * end of the current grapheme.
130 | */
131 | void find_end_of_grapheme() {
132 | // GB2
133 | if (end_of_grapheme_ == end_)
134 | return;
135 | // advance end_of_grapheme_ until it's no longer in the same grapheme
136 |
137 | // GB1
138 | // this always start as where_ == end_of_grapheme_;
139 | codepoint a = *end_of_grapheme_;
140 | end_of_grapheme_++;
141 |
142 | while (1) {
143 | // GB2
144 | if (end_of_grapheme_ == end_)
145 | return;
146 | codepoint b = *end_of_grapheme_;
147 |
148 | if (is_grapheme_boundary(a, b)) {
149 | return;
150 | }
151 |
152 | a = b;
153 | end_of_grapheme_++;
154 | }
155 | }
156 |
157 | void find_start_of_grapheme() {
158 | // GB2
159 | if (where_ == begin_ || where_ == end_)
160 | return;
161 | // rewind where_ until it's no longer in the same grapheme
162 |
163 | // GB1
164 | // this always start as copy = where_
165 | const_codepoint_iterator copy = where_;
166 | --copy;
167 | codepoint a = *copy;
168 |
169 | while (1) {
170 | if (where_ == begin_ || where_ == end_)
171 | return;
172 | codepoint b = *where_;
173 |
174 | if (is_grapheme_boundary(a, b)) {
175 | return;
176 | }
177 |
178 | a = b;
179 | --where_;
180 | }
181 | }
182 | //@}
183 |
184 | /**
185 | * \brief start at the beginning of the text
186 | */
187 | basic_grapheme_iterator(const_codepoint_iterator b,
188 | const_codepoint_iterator e)
189 | :begin_(b), end_(e), where_(b), end_of_grapheme_(b) {
190 | find_end_of_grapheme();
191 | };
192 |
193 | /**
194 | * \brief start at a specific point
195 | * find the start and the end of the grapheme
196 | */
197 | basic_grapheme_iterator(const_codepoint_iterator b,
198 | const_codepoint_iterator e,
199 | const_codepoint_iterator w)
200 | :begin_(b), end_(e), where_(w), end_of_grapheme_(w) {
201 | find_start_of_grapheme();
202 | find_end_of_grapheme();
203 | };
204 |
205 | /**
206 | * \brief start at a specific point - precalculated
207 | * start and end of grapheme
208 | */
209 | basic_grapheme_iterator(const_codepoint_iterator b,
210 | const_codepoint_iterator e,
211 | const_codepoint_iterator w,
212 | const_codepoint_iterator we)
213 | :begin_(b), end_(e), where_(w), end_of_grapheme_(we) {
214 | };
215 |
216 | /**
217 | * \brief copy constructor
218 | */
219 | basic_grapheme_iterator(const basic_grapheme_iterator& copy)
220 | :begin_(copy.begin_), end_(copy.end_),
221 | where_(copy.where_), end_of_grapheme_(copy.end_of_grapheme_) {}
222 |
223 | /**
224 | * dereference to a grapheme object
225 | */
226 | grapheme operator*() {
227 | return grapheme(where_, end_of_grapheme_);
228 | }
229 |
230 | //@{
231 | /**
232 | * advance one grapheme
233 | */
234 | basic_grapheme_iterator operator++() {
235 | where_ = end_of_grapheme_;
236 | find_end_of_grapheme();
237 | return *this;
238 | }
239 |
240 | basic_grapheme_iterator operator++(int i) {
241 | basic_grapheme_iterator copy(*this);
242 | ++(*this);
243 | return copy;
244 | }
245 | //@}
246 |
247 | /**
248 | * delegate the comparison to the underlying iterator
249 | */
250 | bool operator==(const_codepoint_iterator other) {
251 |   // Direct hit; also the only possible match for an empty grapheme.
252 |   if (where_ == other) {
253 |     return true;
254 |   }
255 |   // Otherwise accept any position inside [where_, end_of_grapheme_).
256 |   const_codepoint_iterator probe = where_;
257 |   while (probe != end_of_grapheme_) {
258 |     if (probe == other) { return true; }
259 |     probe++;
260 |   }
261 |   return false;
262 | }
263 |
264 | /**
265 | * equal when both are at the end, or when other covers a codepoint of this grapheme
266 | */
267 | bool operator==(basic_grapheme_iterator other) {
268 |   // Two exhausted iterators are equal (second test uses the codepoint overload).
269 |   const bool both_at_end = (where_ == end_) && (other == end_);
270 |   if (both_at_end) {
271 |     return true;
272 |   }
273 |   // Otherwise they match when `other` covers any codepoint of this grapheme.
274 |   const_codepoint_iterator probe = where_;
275 |   while (probe != end_of_grapheme_) {
276 |     if (other == probe) { return true; }
277 |     probe++;
278 |   }
279 |   return false;
280 | }
281 |
282 | /**
283 | * delegate the comparison to the underlying iterator
284 | */
285 | bool operator!=(basic_grapheme_iterator other) {
286 |   return !operator==(other);  // exact negation of the grapheme comparison
287 | }
288 |
289 | /**
290 | * delegate the comparison to the underlying iterator
291 | */
292 | bool operator!=(const_codepoint_iterator other) {
293 |   // true unless `other` is where_ or lies inside the current grapheme
294 |   return !operator==(other);
295 | }
296 | };
297 | };
298 |
299 | #endif
300 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/canonical_combining_order.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_CANONICAL_COMBINING_ORDER
2 | #define INCLUDED_U5E_CANONICAL_COMBINING_ORDER
3 |
4 | #include
5 |
6 | namespace u5e {
7 |
8 | /**
9 | * \brief compare codepoints according to the canonical combining order
10 | *
11 | * This is intended to be used with std::sort on a utf32ne string type.
12 | */
13 | inline bool canonical_combining_order(int a, int b) {
14 |   // strict "less than" on the combining class of each codepoint
15 |   const int class_a = props::canonical_combining_class::resolve(a);
16 |   const int class_b = props::canonical_combining_class::resolve(b);
17 |   return class_a < class_b;
18 | }
19 | }
20 |
21 | #endif
22 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/canonical_composition.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_CANONICAL_COMPOSITION
2 | #define INCLUDED_U5E_CANONICAL_COMPOSITION
3 |
4 | #include
5 |
6 | namespace u5e {
7 | /**
8 | * \brief performs in-place canonical composition.
9 | *
10 | * This will return the iterator in the end position after the
11 | * composition.
12 | *
13 | * \tparam StorageType the storage type where to apply it.
14 | *
15 | * Must support codepoint_begin, codepoint_cbegin, codepoint_end,
16 | * codepoint_cend, as well as the member types iterator and
17 | * const_iterator. It is also a requirement that you should be able
18 | * to write to it as you read it, which means that this must only be
19 | * used in utf32 iterators, otherwise the output may race ahead of
20 | * the input.
21 | *
22 | * \param data the object where the canonical composition will be
23 | * performed.
24 | *
25 | * \param count return pointer for how many compositions were performed
26 | */
27 | template <typename StorageType>
28 | inline typename StorageType::iterator
29 | canonical_composition(StorageType& data, int* count) {
30 |   // `write` trails `read`: everything before `write` is final output.
31 |   typename StorageType::iterator write(data.codepoint_begin());
32 |   typename StorageType::iterator read = write;
33 |   typename StorageType::iterator end(data.codepoint_end());
34 |
35 |   while (read != end) {
36 |     const int current = *read;
37 |     // One-codepoint lookahead; `read + 1` needs an iterator supporting +.
38 |     typename StorageType::iterator next = read + 1;
39 |
40 |     if (next == end) {
41 |       //
42 |       // Last codepoint: nothing left to pair it with, emit it as-is.
43 |       //
44 |       *(write++) = current;
45 |       read = next;
46 |       continue;
47 |     }
48 |
49 |     int composed;
50 |     if (u5e::props::canonical_composition_mapping::resolve
51 |         (current, *next, &composed)) {
52 |       //
53 |       // The pair composes. Store the result in the slot of the second
54 |       // codepoint and resume from there, because the composed
55 |       // codepoint may itself compose with whatever follows.
56 |       //
57 |       read = next;
58 |       *read = composed;
59 |       if (count) {  // tolerate callers that do not want the tally
60 |         *count = *count + 1;
61 |       }
62 |     } else {
63 |       //
64 |       // No composition for this pair: emit the first codepoint and
65 |       // move the input forward by one.
66 |       //
67 |       *(write++) = current;
68 |       read = next;
69 |     }
70 |   }
71 |
72 |   // One past the last codepoint written.
73 |   return write;
74 | }
75 | }
76 |
77 | #endif
78 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/canonical_decomposition.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_CANONICAL_DECOMPOSITION
2 | #define INCLUDED_U5E_CANONICAL_DECOMPOSITION
3 |
4 | #include
5 | #include
6 | #include
7 |
8 | #include
9 | #include
10 | #include
11 | #include
12 |
13 | namespace u5e {
14 |
15 | /**
16 | * \brief Perform codepoint-by-codepoint canonical decomposition
17 | *
18 | * This is one step of the normalization process, you probably want
19 | * to use that instead.
20 | *
21 | * This is meant to be used as an operation for u5e::filter.
22 | *
23 | *
24 | * \tparam OutputStringType the output string type to be used.
25 | * Because this reads data from the database, the returned data is
26 | * utf32ne, so you need an OutputStringType that is compatible with
27 | * that.
28 | *
29 | */
30 | template <typename OutputStringType>
31 | inline int canonical_decomposition(const codepoint input,
32 |                                    OutputStringType& output) {
33 |   // The shared helper does the work; this wrapper only supplies the
34 |   // canonical mapping lookup as the resolver.
35 |   return codepoint_decomposition
36 |     (input, output, props::canonical_decomposition_mapping::resolve);
37 | }
38 |
39 | }
40 |
41 | #endif
42 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/codepoint.hpp:
--------------------------------------------------------------------------------
1 |
2 | #ifndef INCLUDED_U5E_CODEPOINT_HPP
3 | #define INCLUDED_U5E_CODEPOINT_HPP
4 |
5 | #include
6 |
7 | namespace u5e {
8 | /**
9 | * \brief Native representation of a codepoint
10 | *
11 | * Explicit class in order to hijack overloads, such that we only
12 | * build codepoints out of known encodings and we only write to
13 | * encodings out of known codepoints.
14 | */
15 | class codepoint {
16 | public:
17 |   /**
18 |    * The numeric value of the codepoint.
19 |    */
20 |   codepoint_traits::int_type value;
21 |
22 |   /**
23 |    * Builds the codepoint with value 0.
24 |    */
25 |   constexpr codepoint() : value(0) {}
26 |
27 |   /**
28 |    * Converting constructor: lets a plain integer be used as a codepoint.
29 |    */
30 |   constexpr codepoint(int32_t v) : value(v) {}
31 |
32 |   /**
33 |    * Memberwise copy.
34 |    */
35 |   constexpr codepoint(const codepoint& other) = default;
36 |
37 |   /**
38 |    * Memberwise copy-assignment.
39 |    */
40 |   constexpr codepoint& operator=(const codepoint& other) = default;
41 |
42 |   /**
43 |    * Overwrites the stored value with the given integer.
44 |    */
45 |   constexpr codepoint& operator=(int c) { value = c; return *this; }
46 |
47 |   /**
48 |    * Implicit conversion back to int, yielding the stored value.
49 |    */
50 |   constexpr operator int() const { return value; }
51 | };
52 |
53 | /**
54 | * Compare two codepoints by comparing their values.
55 | */
56 | constexpr bool operator==(const codepoint& lhs, const codepoint& rhs) { return lhs.value == rhs.value; }
57 |
58 | //@{
59 | /**
60 |  * Mixed comparisons: an integer equals a codepoint when it equals the
61 |  * codepoint's stored value. Both argument orders are provided.
62 |  */
63 | constexpr bool operator==(const codepoint_traits::int_type lhs, const codepoint& rhs) { return lhs == rhs.value; }
64 | constexpr bool operator==(const codepoint& lhs, const codepoint_traits::int_type rhs) { return lhs.value == rhs; }
65 | //@}
66 | }
67 |
68 | #endif
69 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/codepoint_decomposition.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_CODEPOINT_DECOMPOSITION
2 | #define INCLUDED_U5E_CODEPOINT_DECOMPOSITION
3 |
4 | #include
5 | #include
6 | #include
7 |
8 | #include
9 | #include
10 | #include
11 |
12 | namespace u5e {
13 |
14 | /**
15 | * \brief Perform codepoint by codepoint decomposition
16 | *
17 | * This is one step of the normalization process, you probably want
18 | * to use that instead.
19 | *
20 | * This implements only the logic of dealing with the resolved data,
21 | * the actual database resolution is a template parameter.
22 | *
23 | * This is meant to be used as an operation for u5e::filter.
24 | *
25 | * \tparam PropResolver the function that resolves the input
26 | * codepoint into a sequence of decomposed codepoints.
27 | *
28 | * \tparam OutputStringType the output string type to be used.
29 | * Because this reads data from the database, the returned data is
30 | * utf32ne, so you need an OutputStringType that is compatible with
31 | * that.
32 | *
33 | */
34 | template // NOTE(review): the template parameter list is missing from this listing; presumably PropResolver and OutputStringType — confirm
35 | inline int codepoint_decomposition
36 | (const codepoint input,
37 | OutputStringType& output,
38 | PropResolver& resolver) {
39 | int const * mapping = resolver(input); // NULL is handled below as "no mapping for input"
40 | int const * begin;
41 | int const * end;
42 | int count = 0;
43 | if (mapping == NULL) {
44 | begin = &(input.value); // emit the input itself as a one-element sequence
45 | end = begin;
46 | end++;
47 | count = 1;
48 | } else {
49 | begin = mapping;
50 | end = begin;
51 | while (*end != 0) { // walk to the 0 terminator, counting the entries before it
52 | end++;
53 | count++;
54 | }
55 | }
56 | utf32ne_string_view from_database // wraps the count entries starting at begin
57 | (std::experimental::basic_string_view(begin, count)); // NOTE(review): template argument missing in this listing — confirm
58 | output.template append // NOTE(review): template argument missing in this listing — confirm
59 | (from_database.codepoint_cbegin(),
60 | from_database.codepoint_cend());
61 | return count; // 1 when there was no mapping, otherwise the mapping length
62 | }
63 |
64 | }
65 |
66 | #endif
67 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/codepoint_traits.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_CODEPOINT_TRAITS
2 | #define INCLUDED_U5E_CODEPOINT_TRAITS
3 |
4 | #include
5 |
6 | namespace u5e {
7 | /**
8 | * \brief Type information for codepoint
9 | *
10 | * This class exists only to provide an interface similar to that of
11 | * the stream and string types. But it is not truly parameterizable,
12 | * since a codepoint always means the same thing.
13 | */
14 | class codepoint_traits {
15 | public:
16 |   //@{
17 |   /**
18 |    * Member types named after those of the standard traits classes
19 |    */
20 |   using int_type = int32_t;
21 |   using pos_type = uint32_t;
22 |   using off_type = int32_t;
23 |   //@}
24 | };
25 | }
26 |
27 | #endif
28 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/compatibility_and_canonical_decomposition.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_COMPATIBILITY_AND_CANONICAL_DECOMPOSITION
2 | #define INCLUDED_U5E_COMPATIBILITY_AND_CANONICAL_DECOMPOSITION
3 |
4 | #include
5 | #include
6 | #include
7 |
8 | #include
9 | #include
10 | #include
11 | #include
12 |
13 | namespace u5e {
14 |
15 | /**
16 | * \brief Perform compatibility and canonical decomposition
17 | *
18 | * This is one step of the normalization process, you probably want
19 | * to use that instead.
20 | *
21 | * This is meant to be used as an operation for u5e::filter.
22 | *
23 | * \tparam OutputStringType the output string type to be used.
24 | * Because this reads data from the database, the returned data is
25 | * utf32ne, so you need an OutputStringType that is compatible with
26 | * that.
27 | *
28 | */
29 | template <typename OutputStringType>
30 | inline int compatibility_and_canonical_decomposition
31 | (const codepoint input,
32 |  OutputStringType& output) {
33 |   // The shared helper does the work; this wrapper only picks the resolver.
34 |   return codepoint_decomposition
35 |     (input, output,
36 |      props::compatibility_and_canonical_decomposition_mapping::resolve);
37 | }
38 |
39 | }
40 |
41 | #endif
42 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/encoding_assertion.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_ENCODING_ASSERTION
2 | #define INCLUDED_U5E_ENCODING_ASSERTION
3 |
4 | #include
5 | #include
6 |
7 | namespace u5e {
8 | /**
9 | * \brief Assert the encoding matches the native type
10 | *
11 | * Tests that the encoding can be used with the specific
12 | * native string type.
13 | */
14 | template // NOTE(review): the template parameter list is missing from this listing — confirm against the original header
15 | class encoding_assertion { // presumably the members exist only so that iterator_assertion's checks get instantiated
16 | iterator_assertion // NOTE(review): template arguments are missing on all four members — confirm what each one checks
17 | _assertion1;
18 | iterator_assertion
19 | _assertion2;
20 | iterator_assertion
21 | _assertion3;
22 | iterator_assertion
23 | _assertion4;
24 | };
25 | }
26 |
27 | #endif
28 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/filter.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_FILTER
2 | #define INCLUDED_U5E_FILTER
3 |
4 | namespace u5e {
5 |
6 | /**
7 | * \brief Walks an input iterator through a filter
8 | *
9 | * This will go from the begin to the end of the input iterator and
10 | * will execute the filter function once for every input element.
11 | *
12 | * Unlike std::transform, the filter function does not return the
13 | * output element, but it receives the output object and will do
14 | * whatever makes sense with the output object.
15 | *
16 | * That means that the type of filter will define what type of
17 | * object can be used as output. The filter function itself will not
18 | * touch the output object, but simply forward it to the operator
19 | * function.
20 | *
21 | * The operator function returns an int that is meant to indicate
22 | * how much output was produced. The filter function will accumulate
23 | * those values and return the sum.
24 | *
25 | * The filter is not required to produce a constant number of
26 | * outputs for each input. The function can produce many outputs
27 | * or even none at all during the processing of each element.
28 | *
29 | * The value type for input and output is not required to be the
30 | * same. The input type is resolved by the value_type member type of
31 | * the input iterator type.
32 | *
33 | * \tparam InputIteratorType the type of the input iterator
34 | * \tparam OutputType the type of the output object
35 | * \tparam Functor the callback function type called for each element
36 | *
37 | * \param input_from starting position for the input iterator
38 | * \param input_to end position for the input iterator
39 | * \param output output container sent to the operator function
40 | * \param operation function that takes the element, the output
41 | * container and returns the number of outputted elements
42 | */
43 | template <typename InputIteratorType, typename OutputType,
44 |           typename Functor>
45 | inline int
46 | filter(InputIteratorType input_from, InputIteratorType input_to,
47 |        OutputType& output, Functor operation) {
48 |   int counter = 0;
49 |   // each call reports how many elements it produced; add them up
50 |   for (; input_from != input_to; input_from++) {
51 |     counter += operation(*input_from, output);
52 |   }
53 |   return counter;
54 | }
55 |
56 | }
57 |
58 | #endif
59 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/iterator_assertion.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_ITERATOR_ASSERTION
2 | #define INCLUDED_U5E_ITERATOR_ASSERTION
3 |
4 | #include
5 |
6 | namespace u5e {
7 | /**
8 | * \brief Asserts the iterator is consistently defined
9 | */
10 | template // NOTE(review): the template parameter list is missing from this listing — confirm names and order
11 | class iterator_assertion {
12 | typedef typename std::iterator_traits::value_type VT; // NOTE(review): iterator_traits argument missing in this listing — confirm
13 | static_assert(sizeof(VT)==sizeof(T), // VT and T must have the same size...
14 | "sizeof value_type incompatible with encoding");
15 | static_assert(alignof(VT)==alignof(T), // ...and the same alignment
16 | "alignof value_type incompatible with encoding");
17 | static_assert(std::is_integral::value, // NOTE(review): argument missing in this listing; presumably VT, going by the message — confirm
18 | "value_type is not an integral type");
19 | };
20 | };
21 |
22 | #endif
23 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/normalization_form_c.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_NORMALIZATION_FORM_C
2 | #define INCLUDED_U5E_NORMALIZATION_FORM_C
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | namespace u5e {
13 | /**
14 | * \brief u5e::filter algorithm for normalizing graphemes
15 | *
16 | * This will work by reading an input grapheme iterator and,
17 | * grapheme by grapheme normalize them in form C.
18 | *
19 | * This will use the unicode database to search for equivalent
20 | * codepoint sequences.
21 | */
22 | template
24 | inline int normalization_form_c(basic_grapheme grapheme,
25 | OutputStorageType& output) {
26 |
27 | // first step is to decompose the grapheme
28 | utf32ne_string decomposed;
29 | int count = u5e::filter(grapheme.codepoint_begin(),
30 | grapheme.codepoint_end(),
31 | decomposed,
32 | canonical_decomposition);
33 |
34 | // then sort by canonical combining class; stable, so codepoints of equal class keep their order
35 | std::stable_sort(decomposed.codepoint_begin(), decomposed.codepoint_end(),
36 | canonical_combining_order);
37 | // NOTE(review): class-0 codepoints that follow a mark are moved ahead of it as well — confirm this is intended
38 | // finally recompose. we will do that in-place on the decomposed
39 | // string, since we never have to look back.
40 | int compositions = 0;
41 | utf32ne_string::iterator oi_begin(decomposed.codepoint_begin());
42 | utf32ne_string::iterator oi
43 | (u5e::canonical_composition(decomposed,&compositions));
44 |
45 | // finally append the output
46 | output.template append_from_utf32ne
47 | (oi_begin, oi);
48 |
49 | // we re-use the counter from the decomposition filter and
50 | // subtract how many pairs were composed into a single codepoint.
51 | return count - compositions;
52 | }
53 | }
54 |
55 | #endif
56 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/normalization_form_d.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_NORMALIZATION_FORM_D
2 | #define INCLUDED_U5E_NORMALIZATION_FORM_D
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | namespace u5e {
12 | /**
13 | * \brief u5e::filter algorithm for normalizing graphemes
14 | *
15 | * This will work by reading an input grapheme iterator and,
16 | * grapheme by grapheme normalize them in form D.
17 | *
18 | * This will use the unicode database to search for equivalent
19 | * codepoint sequences.
20 | */
21 | template
23 | inline int normalization_form_d(basic_grapheme grapheme,
24 | OutputStorageType& output) {
25 |
26 | // first step is to decompose the grapheme
27 | utf32ne_string decomposed;
28 | int count = u5e::filter(grapheme.codepoint_begin(),
29 | grapheme.codepoint_end(),
30 | decomposed,
31 | canonical_decomposition);
32 |
33 | // then sort by canonical combining class; stable, so codepoints of equal class keep their order
34 | std::stable_sort(decomposed.codepoint_begin(), decomposed.codepoint_end(),
35 | canonical_combining_order);
36 | // NOTE(review): class-0 codepoints that follow a mark are moved ahead of it as well — confirm this is intended
37 | // finally append the output
38 | output.template append_from_utf32ne
39 | (decomposed.codepoint_begin(),
40 | decomposed.codepoint_end());
41 |
42 | // we re-use the counter from the decomposition filter.
43 | return count;
44 | }
45 | }
46 |
47 | #endif
48 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/normalization_form_kc.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_NORMALIZATION_FORM_KC
2 | #define INCLUDED_U5E_NORMALIZATION_FORM_KC
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | namespace u5e {
13 | /**
14 | * \brief u5e::filter algorithm for normalizing graphemes
15 | *
16 | * This will work by reading an input grapheme iterator and,
17 | * grapheme by grapheme normalize them in form KC.
18 | *
19 | * This will use the unicode database to search for equivalent
20 | * codepoint sequences.
21 | */
22 | template
24 | inline int normalization_form_kc(basic_grapheme grapheme,
25 | OutputStorageType& output) {
26 |
27 | // first step is to decompose the grapheme
28 | utf32ne_string decomposed;
29 | int count = u5e::filter
30 | (grapheme.codepoint_begin(),
31 | grapheme.codepoint_end(),
32 | decomposed,
33 | compatibility_and_canonical_decomposition);
34 |
35 | // then sort by canonical combining class; stable, so codepoints of equal class keep their order
36 | std::stable_sort(decomposed.codepoint_begin(), decomposed.codepoint_end(),
37 | canonical_combining_order);
38 | // NOTE(review): class-0 codepoints that follow a mark are moved ahead of it as well — confirm this is intended
39 | // finally recompose. we will do that in-place on the decomposed
40 | // string, since we never have to look back.
41 | int compositions = 0;
42 | utf32ne_string::iterator oi_begin(decomposed.codepoint_begin());
43 | utf32ne_string::iterator oi
44 | (u5e::canonical_composition(decomposed,&compositions));
45 |
46 | // finally append the output
47 | output.template append_from_utf32ne
48 | (oi_begin, oi);
49 |
50 | // we re-use the counter from the decomposition filter and
51 | // subtract how many pairs were composed into a single codepoint.
52 | return count - compositions;
53 | }
54 | }
55 |
56 | #endif
57 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/normalization_form_kd.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_NORMALIZATION_FORM_KD
2 | #define INCLUDED_U5E_NORMALIZATION_FORM_KD
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | namespace u5e {
12 | /**
13 | * \brief u5e::filter algorithm for normalizing graphemes
14 | *
15 | * This will work by reading an input grapheme iterator and,
16 | * grapheme by grapheme normalize them in form KD.
17 | *
18 | * This will use the unicode database to search for equivalent
19 | * codepoint sequences.
20 | */
21 | template
23 | inline int normalization_form_kd(basic_grapheme grapheme,
24 | OutputStorageType& output) {
25 |
26 | // first step is to decompose the grapheme
27 | utf32ne_string decomposed;
28 | int count = u5e::filter
29 | (grapheme.codepoint_begin(),
30 | grapheme.codepoint_end(),
31 | decomposed,
32 | compatibility_and_canonical_decomposition);
33 |
34 | // then sort by canonical combining class; stable, so codepoints of equal class keep their order
35 | std::stable_sort(decomposed.codepoint_begin(), decomposed.codepoint_end(),
36 | canonical_combining_order);
37 | // NOTE(review): class-0 codepoints that follow a mark are moved ahead of it as well — confirm this is intended
38 | // finally append the output
39 | output.template append_from_utf32ne
40 | (decomposed.codepoint_begin(),
41 | decomposed.codepoint_end());
42 |
43 | // we re-use the counter from the decomposition filter.
44 | return count;
45 | }
46 | }
47 |
48 | #endif
49 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/props/canonical_combining_class.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_PROPS_CANONICAL_COMBINING_CLASS
2 | #define INCLUDED_U5E_PROPS_CANONICAL_COMBINING_CLASS
3 |
4 | namespace u5e {
5 | /**
6 | * \brief codepoint property handling
7 | */
8 | namespace props {
9 | /**
10 | * \brief Canonical_Combining_Class attribute
11 | */
12 | class canonical_combining_class {
13 | public:
14 | /**
15 | * Return the Canonical_Combining_Class value for the codepoint \p input
16 | */
17 | static int resolve(int input);
18 | };
19 | }
20 | }
21 |
22 | #endif
23 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/props/canonical_composition_mapping.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_PROPS_CANONICAL_COMPOSITION_MAPPING
2 | #define INCLUDED_U5E_PROPS_CANONICAL_COMPOSITION_MAPPING
3 |
4 | namespace u5e {
5 | /**
6 | * \brief codepoint property handling
7 | */
8 | namespace props {
9 | /**
10 | * \brief Derived property for canonical composition
11 | *
12 | * This has the fully resolved canonical composition for
13 | * characters, including the composition exclusions specified in
14 | * the standard.
15 | */
16 | class canonical_composition_mapping {
17 | public:
18 | /**
19 | * Look up the canonical composition of the pair (a, b).
20 | *
21 | * \param a the first codepoint in the decomposed pair
22 | *
23 | * \param b the second codepoint in the decomposed pair
24 | *
25 | * \param r_composed the pointer where the composed codepoint
26 | * will be set if the return is true.
27 | *
28 | * \return true if the pair has a canonical composition, false
29 | * otherwise.
30 | */
31 | static bool resolve(int a, int b, int* r_composed);
32 | };
33 | }
34 | }
35 |
36 | #endif
37 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/props/canonical_decomposition_mapping.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_PROPS_CANONICAL_DECOMPOSITION_MAPPING
2 | #define INCLUDED_U5E_PROPS_CANONICAL_DECOMPOSITION_MAPPING
3 |
4 | namespace u5e {
5 | /**
6 | * \brief codepoint property handling
7 | */
8 | namespace props {
9 | /**
10 | * \brief Subset of Decomposition_Mapping attribute
11 | *
12 | * This recursively resolves the canonical decomposition mapping.
13 | * The returned data is fully canonically decomposed.
14 | */
15 | class canonical_decomposition_mapping {
16 | public:
17 | /**
18 | * Look up the canonical decomposition of \p input.
19 | *
20 | * \return a zero-terminated int array, or NULL if the character
21 | * has no decomposition.
22 | */
23 | static int const * const resolve(int input);
24 | };
25 | }
26 | }
27 |
28 | #endif
29 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/props/compatibility_and_canonical_decomposition_mapping.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_PROPS_COMPATIBILITY_AND_CANONICAL_DECOMPOSITION_MAPPING
2 | #define INCLUDED_U5E_PROPS_COMPATIBILITY_AND_CANONICAL_DECOMPOSITION_MAPPING
3 |
4 | namespace u5e {
5 | /**
6 | * \brief codepoint property handling
7 | */
8 | namespace props {
9 | /**
10 | * \brief Subset of Decomposition_Mapping attribute
11 | *
12 | * This recursively resolves the compatibility and canonical decomposition mappings.
13 | * The returned data is fully compat and canonically decomposed.
14 | */
15 | class compatibility_and_canonical_decomposition_mapping {
16 | public:
17 | /**
18 | * Look up the compatibility and canonical decomposition of \p input.
19 | *
20 | * \return a zero-terminated int array, or NULL if the character
21 | * has no decomposition.
22 | */
23 | static int const * const resolve(int input);
24 | };
25 | }
26 | }
27 |
28 | #endif
29 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/props/grapheme_cluster_break.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_PROPS_GRAPHEME_CLUSTER_BREAK
2 | #define INCLUDED_U5E_PROPS_GRAPHEME_CLUSTER_BREAK
3 |
4 | #include
5 | #include
6 |
7 | namespace u5e {
8 | /**
9 | * \brief codepoint property handling
10 | */
11 | namespace props {
12 | /**
13 | * \brief Grapheme Cluster Break property for a codepoint
14 | */
15 | class grapheme_cluster_break {
16 | public:
17 | /**
18 | * Possible values for the property as specified by the standard; no explicit values are given, so OTHER is 0 and the rest count up in declaration order
19 | */
20 | enum prop_value_type {
21 | OTHER,
22 | PREPEND,
23 | CR,
24 | LF,
25 | CONTROL,
26 | EXTEND,
27 | REGIONAL_INDICATOR,
28 | SPACINGMARK,
29 | L,
30 | V,
31 | T,
32 | LV,
33 | LVT,
34 | E_BASE,
35 | E_MODIFIER,
36 | ZWJ,
37 | GLUE_AFTER_ZWJ,
38 | E_BASE_GAZ,
39 | };
40 |
41 | /**
42 | * Return the value of the property for the codepoint \p c by
43 | * looking at the database.
44 | */
45 | static prop_value_type resolve(codepoint c);
46 | };
47 | };
48 | };
49 |
50 | #endif
51 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/utf32ne.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_UTF32NE
2 | #define INCLUDED_U5E_UTF32NE
3 |
4 | #include
5 | #include
6 | #include
7 |
8 | namespace u5e {
9 | /**
10 | * \brief Architecture-specific type to interface UTF32BE or UTF32LE
11 | *
12 | * utf32ne is not an encoding. It is a type that should be used to
13 | * interface with either UTF32BE or with UTF32LE depending on what
14 | * the native endianness is.
15 | *
16 | * Because utf32 with the native endianness can be used natively,
17 | * there's no special logic and everything is delegated to the
18 | * native types.
19 | */
20 | class utf32ne {
21 | public:
22 | //@{
23 | /**
24 | * Delegate to the underlying iterator
25 | */
26 | template // NOTE(review): parameter lists are missing on every template in this listing; presumably <typename NativeString> here — confirm
27 | using iterator = typename NativeString::iterator;
28 | // const counterpart of the alias above
29 | template
30 | using const_iterator = typename NativeString::const_iterator;
31 | // identity: the iterator passed in is returned unchanged
32 | template
33 | static typename NativeString::const_iterator
34 | native_const_iterator(typename NativeString::const_iterator it) {
35 | return it;
36 | }
37 | // no conversion step: the range is forwarded straight to output.append
38 | template // NOTE(review): order of InputNativeIterator / OutputNativeString in the missing list is unknown — confirm
39 | static void append_from_utf32ne
40 | (InputNativeIterator first, InputNativeIterator last,
41 | OutputNativeString& output) {
42 | output.append(first, last);
43 | }
44 |
45 | //@}
46 | };
47 | }
48 |
49 | #endif
50 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/utf32ne_string.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_UTF32NE_STRING
2 | #define INCLUDED_U5E_UTF32NE_STRING
3 |
4 | #include
5 | #include
6 | #include
7 |
8 | namespace u5e {
9 | /**
10 | * \class u5e::utf32ne_string
11 | * \brief Typedef: basic_encodedstring of utf32ne and std::basic_string
12 | *
13 | * Although this is a typedef, it shows up in doxygen as a class for
14 | * better discoverability.
15 | *
16 | * \typedef utf32ne_string
17 | * \brief A basic_encodedstring of utf32ne and std::basic_string
18 | */
19 | typedef basic_encodedstring<utf32ne,
20 |                             std::basic_string<int>>  // NOTE(review): arguments rebuilt from the doc comment; `int` element type assumed — confirm
21 |   utf32ne_string;
22 | };
23 |
24 | #endif
25 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/utf32ne_string_grapheme.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_UTF32NE_STRING_GRAPHEME
2 | #define INCLUDED_U5E_UTF32NE_STRING_GRAPHEME
3 |
4 | #include
5 | #include
6 |
7 | namespace u5e {
8 | /**
9 | * \class u5e::utf32ne_string_grapheme
10 | * \brief Typedef: basic_grapheme of utf32ne_string
11 | *
12 | * Although this is a typedef, it shows up in doxygen as a class for
13 | * better discoverability.
14 | *
15 | * \typedef u5e::utf32ne_string_grapheme
16 | * \brief A basic_grapheme of utf32ne_string
17 | */
18 | typedef basic_grapheme<utf32ne_string> utf32ne_string_grapheme;
19 | };
20 |
21 | #endif
22 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/utf32ne_string_grapheme_iterator.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_UTF32NE_STRING_GRAPHEME_ITERATOR
2 | #define INCLUDED_U5E_UTF32NE_STRING_GRAPHEME_ITERATOR
3 |
4 | #include
5 | #include
6 |
7 | namespace u5e {
8 | /**
9 | * \class u5e::utf32ne_string_grapheme_iterator
10 | * \brief Typedef: basic_grapheme_iterator of utf32ne_string
11 | *
12 | * Although this is a typedef, it shows up in doxygen as a class for
13 | * better discoverability.
14 | *
15 | * \typedef u5e::utf32ne_string_grapheme_iterator
16 | * \brief A basic_grapheme_iterator of utf32ne_string
17 | */
18 | typedef basic_grapheme_iterator<utf32ne_string>
19 |   utf32ne_string_grapheme_iterator;
20 | };
21 |
22 | #endif
23 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/utf32ne_string_view.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_UTF32NE_STRING_VIEW
2 | #define INCLUDED_U5E_UTF32NE_STRING_VIEW
3 |
4 | #include <experimental/string_view> // NOTE(review): header name reconstructed from a damaged line - confirm
5 | #include <u5e/utf32ne.hpp>
6 | #include <u5e/basic_encodedstring.hpp>
7 |
8 | namespace u5e {
9 | /**
10 | * \class u5e::utf32ne_string_view
11 | * \brief Typedef: basic_encodedstring of utf32ne and basic_string_view
12 | *
13 | * This is only a typedef; it is additionally declared to doxygen as a
14 | * class so that it is listed where readers look for types.
15 | *
16 | * \typedef u5e::utf32ne_string_view
17 | * \brief A basic_encodedstring of utf32ne and basic_string_view
18 | */
19 | typedef basic_encodedstring<utf32ne,
20 | std::experimental::basic_string_view<int>> // NOTE(review): view namespace and element type reconstructed - confirm
21 | utf32ne_string_view;
22 | } // namespace u5e
23 |
24 | #endif
25 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/utf32ne_string_view_grapheme.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_UTF32NE_STRING_VIEW_GRAPHEME
2 | #define INCLUDED_U5E_UTF32NE_STRING_VIEW_GRAPHEME
3 |
4 | #include <u5e/basic_grapheme.hpp>
5 | #include <u5e/utf32ne_string_view.hpp>
6 |
7 | namespace u5e {
8 | /**
9 | * \class u5e::utf32ne_string_view_grapheme
10 | * \brief Typedef: basic_grapheme of utf32ne_string_view
11 | *
12 | * This is only a typedef; it is additionally declared to doxygen as a
13 | * class so that it is listed where readers look for types.
14 | *
15 | * \typedef u5e::utf32ne_string_view_grapheme
16 | * \brief A basic_grapheme instantiated with utf32ne_string_view
17 | */
18 | typedef basic_grapheme<utf32ne_string_view> utf32ne_string_view_grapheme;
19 | } // namespace u5e
20 |
21 | #endif
22 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/utf32ne_string_view_grapheme_iterator.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_UTF32NE_STRING_VIEW_GRAPHEME_ITERATOR
2 | #define INCLUDED_U5E_UTF32NE_STRING_VIEW_GRAPHEME_ITERATOR
3 |
4 | #include <u5e/basic_grapheme_iterator.hpp>
5 | #include <u5e/utf32ne_string_view.hpp>
6 |
7 | namespace u5e {
8 | /**
9 | * \class u5e::utf32ne_string_view_grapheme_iterator
10 | * \brief Typedef: basic_grapheme_iterator of utf32ne_string_view
11 | *
12 | * This is only a typedef; it is additionally declared to doxygen as a
13 | * class so that it is listed where readers look for types.
14 | *
15 | * \typedef u5e::utf32ne_string_view_grapheme_iterator
16 | * \brief A basic_grapheme_iterator instantiated with utf32ne_string_view
17 | */
18 | typedef basic_grapheme_iterator<utf32ne_string_view>
19 | utf32ne_string_view_grapheme_iterator;
20 | } // namespace u5e
21 |
22 | #endif
23 |
--------------------------------------------------------------------------------
/extlib/u5e/include/u5e/utf8.hpp:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDED_U5E_UTF8
2 | #define INCLUDED_U5E_UTF8
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 |
12 | namespace u5e {
13 | /**
14 | * \brief Encoding type for UTF8 text.
15 | * Unlike UTF16 and UTF32, UTF8 is endian independent.
16 | */
17 | class utf8 {
18 | public:
19 |
20 | /**
21 | * Delegated to utf8_iterator of the native type.
22 | * \tparam NativeString the native string type with utf8 data
23 | */
24 | template
25 | using iterator =
26 | utf8_iterator;
27 |
28 | /**
29 | * Delegated to utf8_const_iterator of the native type
30 | * \tparam NativeString the native string type with utf8 data
31 | */
32 | template
33 | using const_iterator =
34 | utf8_const_iterator;
35 |
36 | /**
37 | * Get access to the native const_iterator with the native data.
38 | */
39 | template
40 | static typename NativeString::const_iterator
41 | native_const_iterator
42 | (utf8_const_iterator it) {
43 | it.rewind_to_start_of_codepoint(*(it.raw_iterator_));
44 | return it.raw_iterator_;
45 | }
46 |
47 | template