├── .travis.yml ├── scripts ├── update-linux-main.sh └── LinuxMain.stencil ├── Package.resolved ├── Tests ├── LinuxMain.swift └── ICUTests │ ├── TestSupport.swift │ ├── SearchCursorTests.swift │ ├── RuleBasedBreakCursorTests.swift │ ├── CharacterBreakCursorTests.swift │ ├── SentenceBreakCursorTests.swift │ ├── LineBreakCursorTests.swift │ └── WordBreakCursorTests.swift ├── Sources └── ICU │ ├── UnsafeMutableBufferPointer+Deallocate.swift │ ├── UErrorCode+Additions.swift │ ├── UnicodeScalar+Numeric.swift │ ├── UnicodeNameKind.swift │ ├── UnicodeScalar+Miscellaneous.swift │ ├── BidiPairedBracketType.swift │ ├── JoiningType.swift │ ├── String+UTF16.swift │ ├── UnicodeScalar+Casing.swift │ ├── HangulSyllableType.swift │ ├── ConvertibleFromUnicodeIntProperty.swift │ ├── NumericType.swift │ ├── NormalizationCheckResult.swift │ ├── ParseErrorContext.swift │ ├── CInterop.swift │ ├── SentenceBreakType.swift │ ├── EastAsianWidth.swift │ ├── UnicodeScalar+Naming.swift │ ├── GraphemeClusterBreakType.swift │ ├── BreakRuleParseError.swift │ ├── WordBreakType.swift │ ├── DecompositionType.swift │ ├── UnicodeScalar+Internal.swift │ ├── BidiClass.swift │ ├── CharacterBreakCursor.swift │ ├── LineBreakType.swift │ ├── UnicodeScalar+Enumeration.swift │ ├── UnicodeVersion.swift │ ├── LineBreakCursor.swift │ ├── SentenceBreakCursor.swift │ ├── RuleBasedBreakCursor.swift │ ├── GeneralCategory.swift │ ├── WordBreakCursor.swift │ ├── BreakCursorImpl.swift │ ├── SearchCursor.swift │ └── JoiningGroup.swift ├── Package.swift ├── .gitignore ├── README.md └── LICENSE /.travis.yml: -------------------------------------------------------------------------------- 1 | # TODO(allevato): Add Linux once we have the renaming issues sorted out. 
2 | matrix: 3 | include: 4 | - os: osx 5 | language: swift 6 | osx_image: xcode9.1 7 | 8 | script: 9 | - swift build 10 | - swift test 11 | -------------------------------------------------------------------------------- /scripts/update-linux-main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | SCRIPTS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 5 | 6 | pushd "$SCRIPTS_DIR/.." > /dev/null 7 | 8 | sourcery \ 9 | --sources Tests \ 10 | --templates "$SCRIPTS_DIR/LinuxMain.stencil" \ 11 | --args testimports='@testable import ICUTests' 12 | 13 | popd > /dev/null 14 | -------------------------------------------------------------------------------- /Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "object": { 3 | "pins": [ 4 | { 5 | "package": "icu4c-swift", 6 | "repositoryURL": "https://github.com/allevato/icu4c-swift", 7 | "state": { 8 | "branch": null, 9 | "revision": "eb1ea105e25d17ce6481b5a3338711530b7bd13d", 10 | "version": "1.0.1" 11 | } 12 | } 13 | ] 14 | }, 15 | "version": 1 16 | } 17 | -------------------------------------------------------------------------------- /Tests/LinuxMain.swift: -------------------------------------------------------------------------------- 1 | // Generated using Sourcery 0.9.0 — https://github.com/krzysztofzablocki/Sourcery 2 | // DO NOT EDIT 3 | 4 | import XCTest 5 | @testable import ICUTests 6 | 7 | extension SearchCursorTests { 8 | static var allTests = [ 9 | ("testFirst", testFirst), 10 | ("testNext", testNext), 11 | ("testLast", testLast), 12 | ("testPrevious", testPrevious), 13 | ("testMoveToIndexFollowing", testMoveToIndexFollowing), 14 | ("testMoveToIndexPreceding", testMoveToIndexPreceding), 15 | ] 16 | } 17 | 18 | XCTMain([ 19 | testCase(SearchCursorTests.allTests), 20 | ]) 21 | -------------------------------------------------------------------------------- /scripts/LinuxMain.stencil: 
-------------------------------------------------------------------------------- 1 | // sourcery:file:Tests/LinuxMain.swift 2 | import XCTest 3 | {{ argument.testimports }} 4 | 5 | {% for type in types.classes|based:"XCTestCase" %} 6 | {% if not type.annotations.disableTests %}extension {{ type.name }} { 7 | static var allTests = [ 8 | {% for method in type.methods %}{% if method.parameters.count == 0 and method.shortName|hasPrefix:"test" %} ("{{ method.shortName }}", {{ method.shortName }}), 9 | {% endif %}{% endfor %}] 10 | } 11 | 12 | {% endif %}{% endfor %} 13 | XCTMain([ 14 | {% for type in types.classes|based:"XCTestCase" %}{% if not type.annotations.disableTests %} testCase({{ type.name }}.allTests), 15 | {% endif %}{% endfor %}]) 16 | // sourcery:end 17 | -------------------------------------------------------------------------------- /Sources/ICU/UnsafeMutableBufferPointer+Deallocate.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension UnsafeMutableBufferPointer { 18 | 19 | /// Deallocates the memory underlying this buffer pointer. 
20 | internal func deallocate() { 21 | baseAddress?.deallocate(capacity: count) // A nil base address (empty buffer) has nothing to free. 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /Sources/ICU/UErrorCode+Additions.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension UErrorCode { 18 | 19 | /// True if the error code is a success error code. 20 | var isSuccess: Bool { 21 | return rawValue <= U_ZERO_ERROR.rawValue 22 | } 23 | 24 | /// Creates a new error code equal to `U_ZERO_ERROR`. 25 | init() { 26 | self = U_ZERO_ERROR 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version:4.0 2 | 3 | // Copyright 2017 Tony Allevato. 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 
7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | import PackageDescription 18 | 19 | let package = Package( 20 | name: "icu-swift", 21 | products: [ 22 | .library(name: "swiftICU", type: .static, targets: ["ICU"]), 23 | ], 24 | dependencies: [ 25 | .package(url: "https://github.com/allevato/icu4c-swift", from: "1.0.1"), 26 | ], 27 | targets: [ 28 | .target(name: "ICU"), 29 | .testTarget(name: "ICUTests", dependencies: ["ICU"]), 30 | ] 31 | ) 32 | -------------------------------------------------------------------------------- /Sources/ICU/UnicodeScalar+Numeric.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | public extension UnicodeScalar { 18 | 19 | /// Returns the decimal digit value of the receiving scalar in the given 20 | /// radix. 21 | /// 22 | /// - Precondition: `radix` must be between 2 and 36 (inclusive). 
23 | /// 24 | /// - Parameter radix: The radix, between 2 and 36. 25 | /// - Returns: The decimal digit value of the scalar, or nil if it was not a 26 | /// numeric scalar. 27 | public func digitValue(withRadix radix: Int = 10) -> Int? { 28 | precondition((2...36).contains(radix), "Radix must be between 2 and 36") 29 | let result = u_digit(uchar32Value, Int8(truncatingIfNeeded: radix)) 30 | return result != -1 ? Int(result) : nil 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /Sources/ICU/UnicodeNameKind.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | /// Indicates which name for a `UnicodeScalar` should be returned by 18 | /// `UnicodeScalar.name(kind:)` or provided in enumerations. 19 | public enum UnicodeNameKind { 20 | 21 | /// The "modern" name of a Unicode scalar (defined by its "Name" property). 22 | case unicode 23 | 24 | /// The "extended" (standard or synthetic) name of a Unicode scalar, which is 25 | /// unique for each scalar. 26 | case extended 27 | 28 | /// The corrected name of a Unicode scalar from NameAliases.txt. 29 | case alias 30 | 31 | /// The C API value of type `UCharNameChoice` that corresponds to the 32 | /// receiving enum case. 
33 | var cValue: UCharNameChoice { 34 | switch self { 35 | case .unicode: return U_UNICODE_CHAR_NAME 36 | case .extended: return U_EXTENDED_CHAR_NAME 37 | case .alias: return U_CHAR_NAME_ALIAS 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /Tests/ICUTests/TestSupport.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import XCTest 16 | 17 | /// Asserts that the given index is a certain distance from the start of a 18 | /// collection. 19 | /// 20 | /// - Parameters: 21 | /// - index: The index to assert. A test failure occurs if this is `nil`. 22 | /// - distance: The expected distance from the start of the collection to 23 | /// `index`. 24 | /// - collection: The collection whose `startIndex` will be compared to 25 | /// `index`. 
26 | func assertIndex<C: Collection>( 27 | _ index: C.Index?, 28 | isDistance distance: C.IndexDistance, 29 | fromStartOf collection: C, 30 | file: StaticString = #file, 31 | line: UInt = #line 32 | ) { 33 | guard let index = index else { 34 | XCTFail("expected index to be non-nil", file: file, line: line) 35 | return 36 | } 37 | XCTAssertEqual( 38 | collection.distance(from: collection.startIndex, to: index), 39 | distance, 40 | file: file, 41 | line: line) 42 | } 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Xcode 2 | # 3 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore 4 | 5 | .DS_Store 6 | 7 | ## Build generated 8 | build/ 9 | DerivedData/ 10 | 11 | ## Various settings 12 | *.pbxuser 13 | !default.pbxuser 14 | *.mode1v3 15 | !default.mode1v3 16 | *.mode2v3 17 | !default.mode2v3 18 | *.perspectivev3 19 | !default.perspectivev3 20 | xcuserdata/ 21 | 22 | ## Other 23 | *.moved-aside 24 | *.xccheckout 25 | *.xcscmblueprint 26 | 27 | ## Obj-C/Swift specific 28 | *.hmap 29 | *.ipa 30 | *.dSYM.zip 31 | *.dSYM 32 | 33 | ## Playgrounds 34 | timeline.xctimeline 35 | playground.xcworkspace 36 | 37 | # Swift Package Manager 38 | # 39 | # Add this line if you want to avoid checking in source code from Swift Package Manager dependencies. 40 | # Packages/ 41 | # Package.pins 42 | .build/ 43 | 44 | # Don't check in the generated Xcode project. 45 | *.xcodeproj/ 46 | 47 | # CocoaPods 48 | # 49 | # We recommend against adding the Pods directory to your .gitignore. 
However 50 | # you should judge for yourself, the pros and cons are mentioned at: 51 | # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control 52 | # 53 | # Pods/ 54 | 55 | # Carthage 56 | # 57 | # Add this line if you want to avoid checking in source code from Carthage dependencies. 58 | # Carthage/Checkouts 59 | 60 | Carthage/Build 61 | 62 | # fastlane 63 | # 64 | # It is recommended to not store the screenshots in the git repo. Instead, use fastlane to re-generate the 65 | # screenshots whenever they are needed. 66 | # For more information about the recommended setup visit: 67 | # https://docs.fastlane.tools/best-practices/source-control/#source-control 68 | 69 | fastlane/report.xml 70 | fastlane/Preview.html 71 | fastlane/screenshots 72 | fastlane/test_output 73 | 74 | default.profraw 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ICU for Swift 2 | 3 | [![Build Status](https://travis-ci.org/allevato/icu-swift.svg?branch=master)](https://travis-ci.org/allevato/icu-swift) 4 | 5 | This package implements Swift-style APIs for [ICU (International Components 6 | for Unicode)](http://site.icu-project.org) for Swift 4. 7 | 8 | Swift already provides great support for working with strings, characters, code 9 | points, and code units of various encodings in a Unicode-safe way. This library 10 | fills in some of the more detailed functionality not in the standard library. 11 | 12 | ## Usage note 13 | 14 | If you are using this library (or ICU in general) in an application meant for 15 | distribution on the App Store, Apple considers the ICU dynamic library included 16 | with the operating system to be private API and will reject the submission. 
17 | 18 | This can be worked around; the solution is to **statically link ICU in your 19 | application** instead of relying on the system version. 20 | 21 | ## Changelog 22 | 23 | * **0.2.0 (2017.11.23):** Adds `CharacterBreakCursor`, `LineBreakCursor`, 24 | `RuleBasedBreakCursor`, `SentenceBreakCursor`, and `WordBreakCursor`, which 25 | wrap ICU's break iterators. 26 | 27 | * **0.1.3 (2017.10.30):** Fixes the missing `usearch.h` dependency by updating 28 | to the correct version of `icu4c`, which wasn't caught when working in Xcode. 29 | 30 | * **0.1.2 (2017.10.07):** Fix `text` and `pattern` setters in `SearchCursor`. 31 | 32 | * **0.1.1 (2017.10.07):** Add `SearchCursor`, which wraps ICU's search iterator. 33 | 34 | * **0.1.0 (2017.06.23):** Initial release. Adds a number of custom types to the 35 | `Unicode` namespace and corresponding properties and methods to 36 | `UnicodeScalar`. 37 | 38 | ## Future work 39 | 40 | * Wrap more ICU core features 41 | * Easier integration with iOS, tvOS, watchOS (not using Swift Package Manager) 42 | -------------------------------------------------------------------------------- /Sources/ICU/UnicodeScalar+Miscellaneous.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | import ICU4C 16 | 17 | public extension UnicodeScalar { 18 | 19 | /// The canonical combining class of the receiving scalar. 20 | public var canonicalCombiningClass: Int { 21 | let cValue = 22 | u_getIntPropertyValue(uchar32Value, UCHAR_CANONICAL_COMBINING_CLASS) 23 | return Int(cValue) 24 | } 25 | 26 | /// An ICU-specific property denoting the canonical combining class of the 27 | /// first code point of the decomposition. 28 | /// 29 | /// In other words, `lccc(c) = ccc(NFD(c)[0])`. This is useful for checking 30 | /// for canonically ordered text. 31 | public var leadingCanonicalCombiningClass: Int { 32 | let cValue = 33 | u_getIntPropertyValue(uchar32Value, UCHAR_LEAD_CANONICAL_COMBINING_CLASS) 34 | return Int(cValue) 35 | } 36 | 37 | /// An ICU-specific property denoting the canonical combining class of the 38 | /// last code point of the decomposition. 39 | /// 40 | /// In other words, `tccc(c) = ccc(NFD(c)[last])`. This is useful for checking 41 | /// for canonically ordered text. 42 | public var trailingCanonicalCombiningClass: Int { 43 | let cValue = 44 | u_getIntPropertyValue(uchar32Value, UCHAR_TRAIL_CANONICAL_COMBINING_CLASS) 45 | return Int(cValue) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /Sources/ICU/BidiPairedBracketType.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension Unicode { 18 | 19 | /// Enumerated property `Bidi_Paired_Bracket_Type` (new in Unicode 6.3). 20 | /// 21 | /// Used in UAX #9: Unicode Bidirectional Algorithm 22 | /// (http://www.unicode.org/reports/tr9/). 23 | public enum BidiPairedBracketType: ConvertibleFromUnicodeIntProperty { 24 | 25 | /// Open paired bracket. 26 | case open 27 | 28 | /// Closed paired bracket. 29 | case close 30 | 31 | /// The C API value of type `UBidiPairedBracketType` that corresponds to the 32 | /// receiving enum case. 33 | var cValue: UBidiPairedBracketType { 34 | switch self { 35 | case .open: return U_BPT_OPEN 36 | case .close: return U_BPT_CLOSE 37 | } 38 | } 39 | 40 | /// Creates a new value from the given ICU C API value. 41 | /// 42 | /// - Parameter cValue: The ICU C API value. 43 | init?(cValue: UBidiPairedBracketType) { 44 | switch cValue { 45 | case U_BPT_NONE: return nil 46 | case U_BPT_OPEN: self = .open 47 | case U_BPT_CLOSE: self = .close 48 | default: fatalError("Invalid UBidiPairedBracketType value: \(cValue)") 49 | } 50 | } 51 | } 52 | } 53 | 54 | extension UnicodeScalar { 55 | 56 | /// The bidirectional paired bracket type of the receiver. 57 | public var bidiPairedBracketType: Unicode.BidiPairedBracketType? { 58 | return value(of: UCHAR_BIDI_PAIRED_BRACKET_TYPE) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /Tests/ICUTests/SearchCursorTests.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU 16 | import XCTest 17 | 18 | /// Unit tests for `SearchCursor`. 19 | class SearchCursorTests: XCTestCase { 20 | 21 | let text = "This is the test, isn't it?" 22 | 23 | var cursor: SearchCursor! 24 | 25 | override func setUp() { 26 | cursor = SearchCursor(text: text, pattern: "is") 27 | } 28 | 29 | func testFirst() { 30 | assertIndex(cursor.first(), isDistance: 2, fromStartOf: text) 31 | } 32 | 33 | func testNext() { 34 | _ = cursor.first() 35 | assertIndex(cursor.next(), isDistance: 5, fromStartOf: text) 36 | assertIndex(cursor.next(), isDistance: 18, fromStartOf: text) 37 | XCTAssertNil(cursor.next()) 38 | } 39 | 40 | func testLast() { 41 | assertIndex(cursor.last(), isDistance: 18, fromStartOf: text) 42 | } 43 | 44 | func testPrevious() { 45 | _ = cursor.last() 46 | assertIndex(cursor.previous(), isDistance: 5, fromStartOf: text) 47 | assertIndex(cursor.previous(), isDistance: 2, fromStartOf: text) 48 | XCTAssertNil(cursor.previous()) 49 | } 50 | 51 | func testMoveToIndexFollowing() { 52 | let index = text.index(text.startIndex, offsetBy: 3) 53 | assertIndex( 54 | cursor.moveToIndex(following: index), isDistance: 5, fromStartOf: text) 55 | } 56 | 57 | func testMoveToIndexPreceding() { 58 | let index = text.index(text.startIndex, offsetBy: 16) 59 | assertIndex( 60 | cursor.moveToIndex(preceding: index), isDistance: 5, fromStartOf: text) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /Sources/ICU/JoiningType.swift: 
-------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension Unicode { 18 | 19 | /// Describes the Arabic cursive joining type of a scalar. 20 | public enum JoiningType: ConvertibleFromUnicodeIntProperty { 21 | 22 | case nonJoining 23 | case joinCausing 24 | case dualJoining 25 | case leftJoining 26 | case rightJoining 27 | case transparent 28 | 29 | /// The C API value of type `UJoiningType` that corresponds to the 30 | /// receiving enum case. 31 | var cValue: UJoiningType { 32 | switch self { 33 | case .nonJoining: return U_JT_NON_JOINING 34 | case .joinCausing: return U_JT_JOIN_CAUSING 35 | case .dualJoining: return U_JT_DUAL_JOINING 36 | case .leftJoining: return U_JT_LEFT_JOINING 37 | case .rightJoining: return U_JT_RIGHT_JOINING 38 | case .transparent: return U_JT_TRANSPARENT 39 | } 40 | } 41 | 42 | /// Creates a new value from the given ICU C API value. 43 | /// 44 | /// - Parameter cValue: The ICU C API value. 
45 | init(cValue: UJoiningType) { 46 | switch cValue { 47 | case U_JT_NON_JOINING: self = .nonJoining 48 | case U_JT_JOIN_CAUSING: self = .joinCausing 49 | case U_JT_DUAL_JOINING: self = .dualJoining 50 | case U_JT_LEFT_JOINING: self = .leftJoining 51 | case U_JT_RIGHT_JOINING: self = .rightJoining 52 | case U_JT_TRANSPARENT: self = .transparent 53 | default: fatalError("Invalid UJoiningType value: \(cValue)") 54 | } 55 | } 56 | } 57 | } 58 | 59 | extension UnicodeScalar { 60 | 61 | /// The joining type property of the receiver. 62 | public var joiningType: Unicode.JoiningType { 63 | return value(of: UCHAR_JOINING_TYPE) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /Sources/ICU/String+UTF16.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension String { 18 | 19 | /// Creates a new `String` from the given pointer to a buffer containing 20 | /// null-terminated UTF-16 code units. 21 | /// 22 | /// - Parameter pointer: A pointer to a buffer containing null-terminated 23 | /// UTF-16 code units. 
24 | internal init(unsafeUTF16CodeUnits pointer: UnsafePointer<UChar>) { 25 | var codec = UTF16() 26 | var result = "" 27 | 28 | var iteratingPointer = pointer 29 | var iterator = AnyIterator<UChar> { 30 | let uchar = iteratingPointer.pointee 31 | if uchar == 0 { 32 | return nil 33 | } 34 | iteratingPointer = iteratingPointer.advanced(by: 1) 35 | return uchar 36 | } 37 | 38 | decode: while true { 39 | switch codec.decode(&iterator) { 40 | case .scalarValue(let scalar): result.unicodeScalars.append(scalar) 41 | case .emptyInput: break decode 42 | case .error: 43 | print("Decoding error") 44 | break decode 45 | } 46 | } 47 | 48 | self = result 49 | } 50 | 51 | /// Returns a buffer pointer to a copy of the UTF-16 code units of the 52 | /// receiver. 53 | /// 54 | /// The code units are not zero-terminated. It is the responsibility of the 55 | /// caller to deallocate the pointer returned by this method. 56 | /// 57 | /// - Returns: An `UnsafeMutableBufferPointer` containing a copy of the UTF-16 58 | /// code units of the receiver. 59 | internal func unsafeUTF16CodeUnits() -> UnsafeMutableBufferPointer<UChar> { 60 | let length = utf16.count 61 | let pointer = UnsafeMutablePointer<UChar>.allocate(capacity: length) 62 | let bufferPointer = UnsafeMutableBufferPointer(start: pointer, count: length) 63 | _ = bufferPointer.initialize(from: utf16) 64 | return bufferPointer 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /Sources/ICU/UnicodeScalar+Casing.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | public extension UnicodeScalar { 18 | 19 | /// Returns the lowercase equivalent of the receiving scalar, or the scalar 20 | /// itself if it has no lowercase equivalent. 21 | /// 22 | /// - Returns: The lowercase equivalent of the receiving scalar. 23 | public func lowercased() -> UnicodeScalar { 24 | return UnicodeScalar(uchar32Value: u_tolower(uchar32Value))! 25 | } 26 | 27 | /// Returns the mirror-image of the receiving scalar, or the scalar itself if 28 | /// it has no mirror-image equivalent (via the `Bidi_Mirrored` Unicode 29 | /// property). 30 | /// 31 | /// - Returns: The mirror-image equivalent of the receiving scalar. 32 | public func mirrored() -> UnicodeScalar { 33 | return UnicodeScalar(uchar32Value: u_charMirror(uchar32Value))! 34 | } 35 | 36 | /// Returns the titlecase equivalent of the receiving scalar, or the scalar 37 | /// itself if it has no titlecase equivalent. 38 | /// 39 | /// Titlecase differs from uppercase in a small number of cases; particularly, 40 | /// those that represent digraphs. For example, U+01C6 (LATIN SMALL LETTER DZ 41 | /// WITH CARON) has a titlecase equivalent of U+01C5 (LATIN CAPITAL LETTER D 42 | /// WITH SMALL LETTER Z WITH CARON), whereas its uppercase equivalent is 43 | /// U+01C4 (LATIN CAPITAL LETTER DZ WITH CARON). 44 | /// 45 | /// - Returns: The titlecase equivalent of the receiving scalar. 46 | public func titlecased() -> UnicodeScalar { 47 | return UnicodeScalar(uchar32Value: u_totitle(uchar32Value))! 
48 | } 49 | 50 | /// Returns the uppercase equivalent of the receiving scalar, or the scalar 51 | /// itself if it has no uppercase equivalent. 52 | /// 53 | /// - Returns: The uppercase equivalent of the receiving scalar. 54 | public func uppercased() -> UnicodeScalar { 55 | return UnicodeScalar(uchar32Value: u_toupper(uchar32Value))! 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /Sources/ICU/HangulSyllableType.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension Unicode { 18 | 19 | /// Represents the Hangul syllable type of a scalar. 20 | public enum HangulSyllableType: ConvertibleFromUnicodeIntProperty { 21 | 22 | /// A leading consonant. 23 | case leadingJamo 24 | 25 | /// A vowel. 26 | case vowelJamo 27 | 28 | /// A trailing consonant. 29 | case trailingJamo 30 | 31 | /// A syllable composed of a leading consonant and a vowel. 32 | case lvSyllable 33 | 34 | /// A syllable composed of a leading consonant, a vowel, and a trailing 35 | /// consonant. 36 | case lvtSyllable 37 | 38 | /// The C API value of type `UHangulSyllableType` that corresponds to the 39 | /// receiving enum case. 
40 | var cValue: UHangulSyllableType { 41 | switch self { 42 | case .leadingJamo: return U_HST_LEADING_JAMO 43 | case .vowelJamo: return U_HST_VOWEL_JAMO 44 | case .trailingJamo: return U_HST_TRAILING_JAMO 45 | case .lvSyllable: return U_HST_LV_SYLLABLE 46 | case .lvtSyllable: return U_HST_LVT_SYLLABLE 47 | } 48 | } 49 | 50 | /// Creates a new value from the given ICU C API value. 51 | /// 52 | /// - Parameter cValue: The ICU C API value. 53 | init?(cValue: UHangulSyllableType) { 54 | switch cValue { 55 | case U_HST_NOT_APPLICABLE: return nil 56 | case U_HST_LEADING_JAMO: self = .leadingJamo 57 | case U_HST_VOWEL_JAMO: self = .vowelJamo 58 | case U_HST_TRAILING_JAMO: self = .trailingJamo 59 | case U_HST_LV_SYLLABLE: self = .lvSyllable 60 | case U_HST_LVT_SYLLABLE: self = .lvtSyllable 61 | default: fatalError("Invalid UHangulSyllableType value: \(cValue)") 62 | } 63 | } 64 | } 65 | } 66 | 67 | extension UnicodeScalar { 68 | 69 | /// The Hangul syllable type of the receiver. 70 | public var hangulSyllableType: Unicode.HangulSyllableType? { 71 | return value(of: UCHAR_HANGUL_SYLLABLE_TYPE) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /Sources/ICU/ConvertibleFromUnicodeIntProperty.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | import ICU4C 16 | 17 | /// Implemented internally by Swift types that map to ICU C types representing 18 | /// Unicode integer-valued or enumerated properties to support a chain of uniform 19 | /// conversions from raw integer values to ICU C types to native Swift types. 20 | protocol ConvertibleFromUnicodeIntProperty { 21 | 22 | /// The ICU C type of the property. 23 | associatedtype CICUType: RawRepresentable 24 | 25 | /// Creates a new instance of the value by converting the given ICU C value. 26 | /// 27 | /// This initializer can "fail" and return nil for zero-values that should be 28 | /// represented as `Optional.none`. 29 | /// 30 | /// - Parameter cValue: The ICU C value to convert. 31 | init?(cValue: CICUType) 32 | } 33 | 34 | extension ConvertibleFromUnicodeIntProperty where CICUType.RawValue == Int32 { 35 | 36 | /// Creates a new instance of the value by converting the given `Int32` value. 37 | /// 38 | /// - Parameter unicodeIntPropertyValue: The integer value of the property. 39 | init?(unicodeIntPropertyValue: Int32) { 40 | guard let cValue = CICUType.init(rawValue: unicodeIntPropertyValue) else { 41 | fatalError("Unknown integer value \(unicodeIntPropertyValue) for " + 42 | "property \(CICUType.self)") 43 | } 44 | self.init(cValue: cValue) 45 | } 46 | } 47 | 48 | extension ConvertibleFromUnicodeIntProperty where CICUType.RawValue == UInt32 { 49 | 50 | /// Creates a new instance of the value by reinterpreting the bit pattern of the 51 | /// given `Int32` as the `UInt32` raw value of the ICU C type. 52 | /// 53 | /// - Parameter unicodeIntPropertyValue: The integer value of the property.
54 | init?(unicodeIntPropertyValue: Int32) { 55 | let rawValue = UInt32(bitPattern: unicodeIntPropertyValue) 56 | guard let cValue = CICUType.init(rawValue: rawValue) else { 57 | fatalError("Unknown integer value \(unicodeIntPropertyValue) for " + 58 | "property \(CICUType.self)") 59 | } 60 | self.init(cValue: cValue) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /Sources/ICU/NumericType.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension Unicode { 18 | 19 | /// Denotes the numeric type and value of a scalar, if it has one. 20 | public enum NumericValue { 21 | 22 | /// Used for scalars whose numeric values are represented with an integer 23 | /// value in range `0...9`. Such scalars are restricted to digits that can be 24 | /// used in a decimal radix positional number system and which are encoded in 25 | /// a contiguous ascending range. 26 | case decimal(Int) 27 | 28 | /// Used for scalars whose numeric values are represented with an integer 29 | /// value in the range `0...9`. Unlike `decimal`, this case covers digits that 30 | /// need special handling, such as compatibility superscripts. 
31 | /// 32 | /// Starting with Unicode 6.3.0, no newly encoded numeric characters will be 33 | /// given type `digit` nor will any existing characters with type `numeric` be 34 | /// changed to `digit`. As such, the distinction between those two types is 35 | /// not considered useful. 36 | case digit(Int) 37 | 38 | /// Used for scalars whose numeric values are represented with a positive or 39 | /// negative integer or rational number, such as "1/5" for U+2155 VULGAR 40 | /// FRACTION ONE FIFTH. 41 | case numeric(Double) 42 | } 43 | } 44 | 45 | extension UnicodeScalar { 46 | 47 | /// The numeric value of the receiving scalar, if it has one. 48 | public var numericValue: Unicode.NumericValue? { 49 | let cValue = UNumericType( 50 | UInt32(bitPattern: u_getIntPropertyValue( 51 | uchar32Value, UCHAR_NUMERIC_TYPE))) 52 | let numericValue = u_getNumericValue(uchar32Value) 53 | 54 | switch cValue { 55 | case U_NT_NONE: return nil 56 | case U_NT_DECIMAL: return .decimal(Int(numericValue)) 57 | case U_NT_DIGIT: return .digit(Int(numericValue)) 58 | case U_NT_NUMERIC: return .numeric(numericValue) 59 | default: fatalError("Invalid UNumericType value: \(cValue)") 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /Sources/ICU/NormalizationCheckResult.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension Unicode { 18 | 19 | /// The result of any of the "quick check" properties that provide optimizations 20 | /// for implementations of Unicode normalization. 21 | public enum NormalizationCheckResult: ConvertibleFromUnicodeIntProperty { 22 | 23 | /// The scalar cannot ever occur in the checked normalization form. 24 | case no 25 | 26 | /// The scalar may occur in the checked normalization, depending on context. 27 | /// This result only occurs for NFC and NFKC checks, not for NFD and NFKD. 28 | case maybe 29 | 30 | /// Any other character not covered by the `no` or `maybe` cases. 31 | case yes 32 | 33 | /// Creates a new value from the given ICU C API value. 34 | /// 35 | /// - Parameter cValue: The ICU C API value. 36 | init(cValue: UNormalizationCheckResult) { 37 | switch cValue { 38 | case UNORM_NO: self = .no 39 | case UNORM_MAYBE: self = .maybe 40 | case UNORM_YES: self = .yes 41 | default: fatalError("Invalid UNormalizationCheckResult value: \(cValue)") 42 | } 43 | } 44 | } 45 | } 46 | 47 | extension UnicodeScalar { 48 | 49 | /// Indicates whether or not the receiving scalar can occur in NFD form. 50 | public var nfdQuickCheck: Unicode.NormalizationCheckResult { 51 | return value(of: UCHAR_NFD_QUICK_CHECK) 52 | } 53 | 54 | /// Indicates whether or not the receiving scalar can occur in NFKD form. 55 | public var nfkdQuickCheck: Unicode.NormalizationCheckResult { 56 | return value(of: UCHAR_NFKD_QUICK_CHECK) 57 | } 58 | 59 | /// Indicates whether or not the receiving scalar can occur in NFC form. 60 | public var nfcQuickCheck: Unicode.NormalizationCheckResult { 61 | return value(of: UCHAR_NFC_QUICK_CHECK) 62 | } 63 | 64 | /// Indicates whether or not the receiving scalar can occur in NFKC form. 
65 | public var nfkcQuickCheck: Unicode.NormalizationCheckResult { 66 | return value(of: UCHAR_NFKC_QUICK_CHECK) 67 | } 68 | } 69 | 70 | -------------------------------------------------------------------------------- /Tests/ICUTests/RuleBasedBreakCursorTests.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU 16 | import XCTest 17 | 18 | /// Unit tests for `RuleBasedBreakCursor`. 19 | class RuleBasedBreakCursorTests: XCTestCase { 20 | 21 | let text = "AB12" 22 | var cursor: RuleBasedBreakCursor! 23 | 24 | override func setUp() { 25 | do { 26 | let breakRules = """ 27 | [A-Z]{100}; 28 | [\\p{Lu}]{200}; 29 | [0-9]{300}; 30 | [\\p{N}]{400}; 31 | !.*; 32 | """ 33 | cursor = try RuleBasedBreakCursor(rules: breakRules, text: text) 34 | } catch { 35 | XCTFail("Setup should not throw error") 36 | } 37 | } 38 | 39 | func testInvalidRule() { 40 | let badRules = """ 41 | # This is a rule comment on line 1 42 | [:L:]; # This rule is OK. 
43 | abcdefg); # Error, mismatched parens 44 | """ 45 | XCTAssertThrowsError(try RuleBasedBreakCursor(rules: badRules)) { error in 46 | guard case BreakRuleParseError.mismatchedParentheses( 47 | let context) = error 48 | else { 49 | XCTFail("Expected mismatchedParentheses thrown, but got \(error)") 50 | return 51 | } 52 | guard case ParseErrorContext.Location.lineAndColumn( 53 | line: let line, column: let column) = context.location 54 | else { 55 | XCTFail("Expected lineAndColumn location, but got \(context.location)") 56 | return 57 | } 58 | XCTAssertEqual(line, 3) 59 | XCTAssertEqual(column, 8) 60 | } 61 | } 62 | 63 | func testRuleStatuses() { 64 | assertIndex(cursor.first(), isDistance: 0, fromStartOf: text) 65 | XCTAssertEqual(cursor.ruleStatuses, [0]) 66 | assertIndex(cursor.next(), isDistance: 1, fromStartOf: text) 67 | XCTAssertEqual(cursor.ruleStatuses, [100, 200]) 68 | assertIndex(cursor.next(), isDistance: 2, fromStartOf: text) 69 | XCTAssertEqual(cursor.ruleStatuses, [100, 200]) 70 | assertIndex(cursor.next(), isDistance: 3, fromStartOf: text) 71 | XCTAssertEqual(cursor.ruleStatuses, [300, 400]) 72 | assertIndex(cursor.next(), isDistance: 4, fromStartOf: text) 73 | XCTAssertEqual(cursor.ruleStatuses, [300, 400]) 74 | XCTAssertNil(cursor.next()) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /Sources/ICU/ParseErrorContext.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | /// Represents the location and text content of an error that occurred while 18 | /// parsing source text, such as break cursor rules. 19 | public struct ParseErrorContext { 20 | 21 | /// Indicates where in the source text the error occurred. 22 | /// 23 | /// Parsers that support line numbers will return locations using the 24 | /// `lineAndColumn` case. Parsers that do not support them will only return 25 | /// locations that are case `offset`. 26 | public enum Location { 27 | 28 | /// The line number and column offset (both starting at 1) where the error 29 | /// occurred. 30 | case lineAndColumn(line: Int, column: Int) 31 | 32 | /// The offset (in terms of UTF-16 code units) in the source text where the 33 | /// error occurred. 34 | case offset(Int) 35 | } 36 | 37 | /// The location where the error occurred in the source text. 38 | public let location: Location 39 | 40 | /// Some textual context before the error. 41 | /// 42 | /// If the parser does not support this, it will be empty. 43 | public let preContext: String 44 | 45 | /// The textual content of the error itself and some trailing content. 46 | /// 47 | /// If the parser does not support this, it will be empty. 48 | public let postContext: String 49 | 50 | /// Creates a new `ParseErrorContext` from the given ICU C value. 51 | /// 52 | /// - Parameter cValue: The `UParseError` value from an ICU C API that 53 | /// describes the error context. 
54 | init(cValue: UParseError) { 55 | if cValue.line >= 1 { 56 | self.location = 57 | .lineAndColumn(line: Int(cValue.line), column: Int(cValue.offset)) 58 | } else { 59 | self.location = .offset(Int(cValue.offset)) 60 | } 61 | 62 | var cValueCopy = cValue 63 | self.preContext = withUnsafeBytes(of: &cValueCopy.preContext) { 64 | (rawPointer) in 65 | let codeUnitPointer = 66 | rawPointer.baseAddress!.assumingMemoryBound(to: UChar.self) 67 | return String(unsafeUTF16CodeUnits: codeUnitPointer) 68 | } 69 | self.postContext = withUnsafeBytes(of: &cValueCopy.postContext) { 70 | (rawPointer) in 71 | let codeUnitPointer = 72 | rawPointer.baseAddress!.assumingMemoryBound(to: UChar.self) 73 | return String(unsafeUTF16CodeUnits: codeUnitPointer) 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /Sources/ICU/CInterop.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /// Namespace containing helper functions for interoperating with C APIs. 16 | enum CInterop { 17 | 18 | /// Wraps a value in a heap-allocated object and then executes the given 19 | /// block, passing it a pointer to the object. 
20 | /// 21 | /// This is intended to be used to pass value types and function references to 22 | /// C functions -- in particular, those that take callbacks and `void *` 23 | /// context arguments. The wrapping object, and thus the underlying value, is 24 | /// guaranteed to exist during the execution of the closure. 25 | /// 26 | /// Because the value type is wrapped, be aware that the value that you get 27 | /// back on the other side (by calling `unwrappedValue(from:)`) will be a copy 28 | /// and that mutating it will not affect the original value. 29 | /// 30 | /// - Parameters: 31 | /// - value: An arbitrary value. 32 | /// - block: A block that will be passed the raw pointer to the object 33 | /// wrapping the value. 34 | /// - Returns: Whatever `block` returns. 35 | /// - Throws: Whatever `block` throws. 36 | static func withPointer< 37 | Value, 38 | Result 39 | >( 40 | wrapping value: Value, 41 | _ block: (UnsafeMutableRawPointer) throws -> Result 42 | ) rethrows -> Result { 43 | let wrapper = ValueWrapper(wrapping: value) 44 | let pointer = Unmanaged.passUnretained(wrapper).toOpaque() 45 | return try withExtendedLifetime(wrapper) { try block(pointer) } // passUnretained does not retain; keep the wrapper alive while block runs 46 | } 47 | 48 | /// Returns a copy of a value that has been wrapped by 49 | /// `withPointer(wrapping:_:)`. 50 | /// 51 | /// The behavior of this function is undefined if the return type `T` does not 52 | /// match the type of the value originally passed to 53 | /// `withPointer(wrapping:_:)`. 54 | /// 55 | /// - Parameter pointer: An opaque pointer to a wrapped value that was 56 | /// obtained by calling `withPointer(wrapping:_:)`. 57 | /// - Returns: A copy of the value that was wrapped by the opaque pointer. 58 | static func unwrappedValue<T>(from pointer: UnsafeRawPointer) -> T { 59 | let unmanaged = Unmanaged<ValueWrapper<T>>.fromOpaque(pointer) 60 | return unmanaged.takeUnretainedValue().wrapped 61 | } 62 | } 63 | 64 | /// Used by `CInterop.withPointer(wrapping:_:)` to wrap arbitrary values in 65 | /// a heap-allocated object.
66 | private final class ValueWrapper<Value> { 67 | 68 | /// The wrapped value. 69 | let wrapped: Value 70 | 71 | /// Creates a new wrapper for the given value. 72 | /// 73 | /// - Parameter value: The value to wrap. 74 | init(wrapping value: Value) { 75 | self.wrapped = value 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /Sources/ICU/SentenceBreakType.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension Unicode { 18 | 19 | /// Denotes the sentence break type property of a scalar. 20 | /// 21 | /// The values in this enum can be used in an algorithm as described by 22 | /// http://www.unicode.org/reports/tr29/ to determine sentence boundaries 23 | /// within text. 24 | public enum SentenceBreakType: ConvertibleFromUnicodeIntProperty { 25 | 26 | case other 27 | case aterm 28 | case close 29 | case format 30 | case lower 31 | case numeric 32 | case oletter 33 | case sep 34 | case sp 35 | case sterm 36 | case upper 37 | case cr 38 | case extend 39 | case lf 40 | case scontinue 41 | 42 | /// The C API value of type `USentenceBreak` that corresponds to the 43 | /// receiving enum case.
44 | var cValue: USentenceBreak { 45 | switch self { 46 | case .other: return U_SB_OTHER 47 | case .aterm: return U_SB_ATERM 48 | case .close: return U_SB_CLOSE 49 | case .format: return U_SB_FORMAT 50 | case .lower: return U_SB_LOWER 51 | case .numeric: return U_SB_NUMERIC 52 | case .oletter: return U_SB_OLETTER 53 | case .sep: return U_SB_SEP 54 | case .sp: return U_SB_SP 55 | case .sterm: return U_SB_STERM 56 | case .upper: return U_SB_UPPER 57 | case .cr: return U_SB_CR 58 | case .extend: return U_SB_EXTEND 59 | case .lf: return U_SB_LF 60 | case .scontinue: return U_SB_SCONTINUE 61 | } 62 | } 63 | 64 | /// Creates a new value from the given ICU C API value. 65 | /// 66 | /// - Parameter cValue: The ICU C API value. 67 | init(cValue: USentenceBreak) { 68 | switch cValue { 69 | case U_SB_OTHER: self = .other 70 | case U_SB_ATERM: self = .aterm 71 | case U_SB_CLOSE: self = .close 72 | case U_SB_FORMAT: self = .format 73 | case U_SB_LOWER: self = .lower 74 | case U_SB_NUMERIC: self = .numeric 75 | case U_SB_OLETTER: self = .oletter 76 | case U_SB_SEP: self = .sep 77 | case U_SB_SP: self = .sp 78 | case U_SB_STERM: self = .sterm 79 | case U_SB_UPPER: self = .upper 80 | case U_SB_CR: self = .cr 81 | case U_SB_EXTEND: self = .extend 82 | case U_SB_LF: self = .lf 83 | case U_SB_SCONTINUE: self = .scontinue 84 | default: fatalError("Invalid USentenceBreak value: \(cValue)") 85 | } 86 | } 87 | } 88 | } 89 | 90 | extension UnicodeScalar { 91 | 92 | /// The sentence break type of the receiver. 93 | public var sentenceBreakType: Unicode.SentenceBreakType { 94 | return value(of: UCHAR_SENTENCE_BREAK) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /Sources/ICU/EastAsianWidth.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension Unicode { 18 | 19 | /// Enumerated property `East_Asian_Width`. 20 | /// 21 | /// Described by [http://www.unicode.org/reports/tr11/](http://www.unicode.org/reports/tr11/). 22 | public enum EastAsianWidth: ConvertibleFromUnicodeIntProperty { 23 | 24 | /// Any character that does not appear in East Asian typography. 25 | case neutral 26 | 27 | /// A character that can be sometimes narrow or sometimes wide, requiring 28 | /// additional contextual information not in the character itself to determine 29 | /// the correct width. 30 | case ambiguous 31 | 32 | /// A character with a decomposition of type of `narrow` to other characters 33 | /// that are implicitly wide but unmarked, as well as U+20A9 WON SIGN. 34 | case halfWidth 35 | 36 | /// A character with a decomposition of type of `wide` to other characters 37 | /// that are implicitly narrow but unmarked. 38 | case fullWidth 39 | 40 | /// A character that is always narrow and has an explicit full-width or wide 41 | /// counterpart. 42 | case narrow 43 | 44 | /// A character that is always wide, occurring only in the context of East 45 | /// Asian typography. 
This also includes all characters that have explicit 46 | /// half-width counterparts and all characters with emoji presentation except 47 | /// for U+1F1E6...U+1F1FF (REGIONAL INDICATOR SYMBOL LETTER A...REGIONAL 48 | /// INDICATOR SYMBOL LETTER Z). 49 | case wide 50 | 51 | /// Creates a new East Asian width value from the given ICU C API value. 52 | /// 53 | /// - Parameter cValue: The ICU C API value. 54 | init(cValue: UEastAsianWidth) { 55 | switch cValue { 56 | case U_EA_NEUTRAL: self = .neutral 57 | case U_EA_AMBIGUOUS: self = .ambiguous 58 | case U_EA_HALFWIDTH: self = .halfWidth 59 | case U_EA_FULLWIDTH: self = .fullWidth 60 | case U_EA_NARROW: self = .narrow 61 | case U_EA_WIDE: self = .wide 62 | default: fatalError("Invalid UEastAsianWidth value: \(cValue)") 63 | } 64 | } 65 | 66 | /// The C API value of type `UEastAsianWidth` that corresponds to the 67 | /// receiving enum case. 68 | var cValue: UEastAsianWidth { 69 | switch self { 70 | case .neutral: return U_EA_NEUTRAL 71 | case .ambiguous: return U_EA_AMBIGUOUS 72 | case .halfWidth: return U_EA_HALFWIDTH 73 | case .fullWidth: return U_EA_FULLWIDTH 74 | case .narrow: return U_EA_NARROW 75 | case .wide: return U_EA_WIDE 76 | } 77 | } 78 | } 79 | } 80 | 81 | extension UnicodeScalar { 82 | 83 | /// The East Asian width property of the receiver. 84 | public var eastAsianWidth: Unicode.EastAsianWidth { 85 | return value(of: UCHAR_EAST_ASIAN_WIDTH) 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /Tests/ICUTests/CharacterBreakCursorTests.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU 16 | import XCTest 17 | 18 | /// Unit tests for `CharacterBreakCursor`. 19 | class CharacterBreakCursorTests: XCTestCase { 20 | // 01 23 21 | let text = "a\u{1F937}\u{1F3FD}\u{200D}\u{2642}\u{FE0F}b" 22 | 23 | var cursor: CharacterBreakCursor! 24 | 25 | override func setUp() { 26 | cursor = CharacterBreakCursor(text: text) 27 | } 28 | 29 | func testBreaksForward() { 30 | assertBreak(cursor.first(), at: 0) 31 | assertBreak(cursor.next(), at: 1) 32 | assertBreak(cursor.next(), at: 2) 33 | assertBreak(cursor.next(), at: 3) 34 | XCTAssertNil(cursor.next()) 35 | } 36 | 37 | func testBreaksBackward() { 38 | assertBreak(cursor.last(), at: 3) 39 | assertBreak(cursor.previous(), at: 2) 40 | assertBreak(cursor.previous(), at: 1) 41 | assertBreak(cursor.previous(), at: 0) 42 | XCTAssertNil(cursor.previous()) 43 | } 44 | 45 | func testBreaksFollowing() { 46 | assertBreak(cursor.moveToIndex( 47 | following: text.index(text.startIndex, offsetBy: 2)), at: 3) 48 | XCTAssertNil(cursor.moveToIndex(following: text.endIndex)) 49 | } 50 | 51 | func testBreaksPreceding() { 52 | assertBreak(cursor.moveToIndex( 53 | preceding: text.index(text.startIndex, offsetBy: 3)), at: 2) 54 | XCTAssertNil(cursor.moveToIndex(preceding: text.startIndex)) 55 | } 56 | 57 | func testIsBoundary() { 58 | XCTAssertTrue(cursor.isBoundary(movingToOrAfter: text.startIndex)) 59 | assertBreak(cursor.index, at: 0) 60 | 61 | // This even works with String indices that are in the middle of a 62 | // character, like the following example where the 
index is in the middle of 63 | // a multi-scalar cluster that combines to make a single emoji character. 64 | // Swift's indices, combined with ICU functionality, properly determine that 65 | // the index is not a character boundary and move it to the correct 66 | // position. 67 | let scalars = text.unicodeScalars 68 | XCTAssertFalse(cursor.isBoundary( 69 | movingToOrAfter: scalars.index(scalars.startIndex, offsetBy: 2))) 70 | assertBreak(cursor.index, at: 2) 71 | } 72 | 73 | func testSetText() { 74 | cursor.text = "abc" 75 | _ = cursor.first() 76 | assertBreak(cursor.next(), at: 1) 77 | } 78 | 79 | /// Asserts that the given break index is `distance` from the start of the 80 | /// fixture string `text`, forwarding `file` and `line` to `assertIndex`. 81 | private func assertBreak( 82 | _ index: String.Index?, 83 | at distance: Int, 84 | file: StaticString = #file, 85 | line: UInt = #line 86 | ) { 87 | assertIndex( 88 | index, 89 | isDistance: distance, 90 | fromStartOf: text, 91 | file: file, 92 | line: line) 93 | } 94 | } 95 | 96 | -------------------------------------------------------------------------------- /Tests/ICUTests/SentenceBreakCursorTests.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU 16 | import XCTest 17 | 18 | /// Unit tests for `SentenceBreakCursor`.
class SentenceBreakCursorTests: XCTestCase {
  // The two ruler rows give the tens and ones digits of each character offset
  // in `text`; the gap sits under the second source character of `\n`.
  //          0000000000111111111122222222223 33333333344444
  //          0123456789012345678901234567890 12345678901234
  let text = "Here's a sentence. Another one\nAnd one more?"
  //          0                  0            1            0
  // 0 = .terminator
  // 1 = .separator

  var cursor: SentenceBreakCursor!

  override func setUp() {
    cursor = SentenceBreakCursor(text: text)
  }

  func testBreaksForward() {
    assertBreak(cursor.first(), at: 0, status: .terminator)
    assertBreak(cursor.next(), at: 19, status: .terminator)
    assertBreak(cursor.next(), at: 31, status: .separator)
    assertBreak(cursor.next(), at: 44, status: .terminator)
    XCTAssertNil(cursor.next())
  }

  func testBreaksBackward() {
    assertBreak(cursor.last(), at: 44, status: .terminator)
    assertBreak(cursor.previous(), at: 31, status: .separator)
    assertBreak(cursor.previous(), at: 19, status: .terminator)
    assertBreak(cursor.previous(), at: 0, status: .terminator)
    XCTAssertNil(cursor.previous())
  }

  func testBreaksFollowing() {
    assertBreak(cursor.moveToIndex(
      following: text.index(text.startIndex, offsetBy: 15)
    ), at: 19, status: .terminator)
    XCTAssertNil(cursor.moveToIndex(following: text.endIndex))
  }

  func testBreaksPreceding() {
    assertBreak(cursor.moveToIndex(
      preceding: text.index(text.startIndex, offsetBy: 35)
    ), at: 31, status: .separator)
    XCTAssertNil(cursor.moveToIndex(preceding: text.startIndex))
  }

  func testIsBoundary() {
    XCTAssertTrue(cursor.isBoundary(
      movingToOrAfter: text.index(text.startIndex, offsetBy: 19)))
    assertBreak(cursor.index, at: 19, status: .terminator)

    XCTAssertFalse(cursor.isBoundary(
      movingToOrAfter: text.index(text.startIndex, offsetBy: 28)))
    assertBreak(cursor.index, at: 31, status: .separator)
  }

  func testSetText() {
    cursor.text = "A. B."
    _ = cursor.first()
    assertBreak(cursor.next(), at: 3, status: .terminator)
  }

  /// Asserts that the given break index is `distance` from the start of the
  /// string and that the most recent rule status matches the given one.
  ///
  /// - Parameters:
  ///   - index: The index returned by the cursor operation under test.
  ///   - distance: The expected offset of `index` from the start of `text`.
  ///   - status: The range that the cursor's current `ruleStatus` must match.
  ///   - file: The file reported on failure (defaults to the call site).
  ///   - line: The line reported on failure (defaults to the call site).
  private func assertBreak(
    _ index: String.Index?,
    at distance: Int,
    status: SentenceBreakCursor.RuleStatusRange,
    file: StaticString = #file,
    line: UInt = #line
  ) {
    assertIndex(
      index,
      isDistance: distance,
      fromStartOf: text,
      file: file,
      line: line)
    XCTAssertTrue(status ~= cursor.ruleStatus, file: file, line: line)
  }
}

--------------------------------------------------------------------------------
/Sources/ICU/UnicodeScalar+Naming.swift:
--------------------------------------------------------------------------------
// Copyright 2017 Tony Allevato.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import ICU4C

extension Unicode {

  /// Indicates which name for a `UnicodeScalar` should be returned by
  /// `UnicodeScalar.name(kind:)` or provided in enumerations.
  public enum NameKind {

    /// The "modern" name of a Unicode scalar (defined by its "Name" property).
    case unicode

    /// The "extended" (standard or synthetic) name of a Unicode scalar, which is
    /// unique for each scalar.
    case extended

    /// The corrected name of a Unicode scalar from NameAliases.txt.
    case alias

    /// The C API value of type `UCharNameChoice` that corresponds to the
    /// receiving enum case.
    var cValue: UCharNameChoice {
      switch self {
      case .unicode: return U_UNICODE_CHAR_NAME
      case .extended: return U_EXTENDED_CHAR_NAME
      case .alias: return U_CHAR_NAME_ALIAS
      }
    }
  }
}

public extension UnicodeScalar {

  /// Creates a new Unicode scalar with the given name.
  ///
  /// The name passed into this initializer must match exactly. Unicode names
  /// (`nameKind == .unicode`) are all uppercase. Extended names
  /// (`nameKind == .extended`) are lowercase followed by an uppercase
  /// hexadecimal number, all enclosed by angled brackets.
  ///
  /// This initializer returns nil if no code point exists with the given name.
  ///
  /// - Parameters:
  ///   - name: The name of the code point.
  ///   - nameKind: The kind of the name specified by `name`.
  public init?(named name: String, nameKind: Unicode.NameKind = .unicode) {
    var error = UErrorCode()
    let value = u_charFromName(nameKind.cValue, name, &error)
    guard error.isSuccess else {
      return nil
    }

    self.init(uchar32Value: value)
  }

  /// Returns the Unicode name, or a variant, for the receiving scalar.
  ///
  /// - Parameters:
  ///   - kind: A value from `Unicode.NameKind` indicating which name should be
  ///     returned. If not provided, the default is `.unicode`.
  /// - Returns: The name of the scalar, or `nil` if the name does not exist or
  ///   ICU reported an error while retrieving it.
  public func name(kind: Unicode.NameKind = .unicode) -> String? {
    var error = UErrorCode()
    let buffer = UnsafeMutablePointer<CChar>.allocate(
      capacity: charNameBufferLength)
    defer { buffer.deallocate(capacity: charNameBufferLength) }

    let length = u_charName(
      uchar32Value,
      kind.cValue,
      buffer,
      Int32(truncatingIfNeeded: charNameBufferLength),
      &error)

    // A failed lookup is reported as "no name" rather than leaking a
    // diagnostic string to callers as though it were a real scalar name.
    guard error.isSuccess, length > 0 else {
      return nil
    }

    // Build the string from the length ICU returned instead of scanning for a
    // NUL terminator: ICU does not terminate the buffer when the name exactly
    // fills it. The count is clamped so we never read beyond the allocation.
    let storedLength = min(Int(length), charNameBufferLength)
    let nameBytes = UnsafeMutableBufferPointer(start: buffer, count: storedLength)
    return String(
      decoding: nameBytes.map { UInt8(truncatingIfNeeded: $0) },
      as: UTF8.self)
  }
}

/// The length of the C-string buffer that should be passed to `u_charName`.
private let charNameBufferLength = 256

--------------------------------------------------------------------------------
/Tests/ICUTests/LineBreakCursorTests.swift:
--------------------------------------------------------------------------------
// Copyright 2017 Tony Allevato.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import ICU
import XCTest

/// Unit tests for `LineBreakCursor`.
class LineBreakCursorTests: XCTestCase {
  // The two ruler rows give the tens and ones digits of each character offset
  // in `text`; the gap sits under the second source character of `\n`.
  //          000000000011 11111111222222222
  //          012345678901 23456789012345678
  let text = "Here's some\npossible—breaks."
  //          0      0     1       00      0
  // 0 = .soft
  // 1 = .hard

  var cursor: LineBreakCursor!

  override func setUp() {
    cursor = LineBreakCursor(text: text)
  }

  func testBreaksForward() {
    assertBreak(cursor.first(), at: 0, status: .soft)
    assertBreak(cursor.next(), at: 7, status: .soft)
    assertBreak(cursor.next(), at: 12, status: .hard)
    assertBreak(cursor.next(), at: 20, status: .soft)
    assertBreak(cursor.next(), at: 21, status: .soft)
    assertBreak(cursor.next(), at: 28, status: .soft)
    XCTAssertNil(cursor.next())
  }

  func testBreaksBackward() {
    assertBreak(cursor.last(), at: 28, status: .soft)
    assertBreak(cursor.previous(), at: 21, status: .soft)
    assertBreak(cursor.previous(), at: 20, status: .soft)
    assertBreak(cursor.previous(), at: 12, status: .hard)
    assertBreak(cursor.previous(), at: 7, status: .soft)
    assertBreak(cursor.previous(), at: 0, status: .soft)
    XCTAssertNil(cursor.previous())
  }

  func testBreaksFollowing() {
    assertBreak(cursor.moveToIndex(
      following: text.index(text.startIndex, offsetBy: 15)
    ), at: 20, status: .soft)
    XCTAssertNil(cursor.moveToIndex(following: text.endIndex))
  }

  func testBreaksPreceding() {
    assertBreak(cursor.moveToIndex(
      preceding: text.index(text.startIndex, offsetBy: 18)
    ), at: 12, status: .hard)
    XCTAssertNil(cursor.moveToIndex(preceding: text.startIndex))
  }

  func testIsBoundary() {
    XCTAssertTrue(cursor.isBoundary(
      movingToOrAfter: text.index(text.startIndex, offsetBy: 12)))
    assertBreak(cursor.index, at: 12, status: .hard)

    XCTAssertFalse(cursor.isBoundary(
      movingToOrAfter: text.index(text.startIndex, offsetBy: 17)))
    assertBreak(cursor.index, at: 20, status: .soft)
  }

  func testSetText() {
    cursor.text = "a\nb"
    _ = cursor.first()
    assertBreak(cursor.next(), at: 2, status: .hard)
  }

  /// Asserts that the given break index is `distance` from the start of the
  /// string and that the most recent rule status matches the given one.
  ///
  /// - Parameters:
  ///   - index: The index returned by the cursor operation under test.
  ///   - distance: The expected offset of `index` from the start of `text`.
  ///   - status: The range that the cursor's current `ruleStatus` must match.
  ///   - file: The file reported on failure (defaults to the call site).
  ///   - line: The line reported on failure (defaults to the call site).
  private func assertBreak(
    _ index: String.Index?,
    at distance: Int,
    status: LineBreakCursor.RuleStatusRange,
    file: StaticString = #file,
    line: UInt = #line
  ) {
    assertIndex(
      index,
      isDistance: distance,
      fromStartOf: text,
      file: file,
      line: line)
    XCTAssertTrue(status ~= cursor.ruleStatus, file: file, line: line)
  }
}


--------------------------------------------------------------------------------
/Sources/ICU/GraphemeClusterBreakType.swift:
--------------------------------------------------------------------------------
// Copyright 2017 Tony Allevato.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import ICU4C

extension Unicode {

  /// Denotes the grapheme cluster break type property of a scalar.
  ///
  /// The values in this enum can be used in an algorithm as described by
  /// http://www.unicode.org/reports/tr29/ to determine grapheme cluster
  /// boundaries within a sequence of scalars.
  public enum GraphemeClusterBreakType: ConvertibleFromUnicodeIntProperty {

    case other
    case control
    case cr
    case extend
    case l
    case lf
    case lv
    case lvt
    case t
    case v
    case spacingMark
    case prepend
    case regionalIndicator
    case eBase
    case eBaseGaz
    case eModifier
    case glueAfterZWJ
    case zwj

    /// The C API value of type `UGraphemeClusterBreak` that corresponds to the
    /// receiving enum case.
    var cValue: UGraphemeClusterBreak {
      switch self {
      case .other: return U_GCB_OTHER
      case .control: return U_GCB_CONTROL
      case .cr: return U_GCB_CR
      case .extend: return U_GCB_EXTEND
      case .l: return U_GCB_L
      case .lf: return U_GCB_LF
      case .lv: return U_GCB_LV
      case .lvt: return U_GCB_LVT
      case .t: return U_GCB_T
      case .v: return U_GCB_V
      case .spacingMark: return U_GCB_SPACING_MARK
      case .prepend: return U_GCB_PREPEND
      case .regionalIndicator: return U_GCB_REGIONAL_INDICATOR
      case .eBase: return U_GCB_E_BASE
      case .eBaseGaz: return U_GCB_E_BASE_GAZ
      case .eModifier: return U_GCB_E_MODIFIER
      case .glueAfterZWJ: return U_GCB_GLUE_AFTER_ZWJ
      case .zwj: return U_GCB_ZWJ
      }
    }

    /// Creates a new value from the given ICU C API value.
    ///
    /// Traps with `fatalError` if `cValue` is not one of the constants handled
    /// below.
    ///
    /// - Parameter cValue: The ICU C API value.
    init(cValue: UGraphemeClusterBreak) {
      switch cValue {
      case U_GCB_OTHER: self = .other
      case U_GCB_CONTROL: self = .control
      case U_GCB_CR: self = .cr
      case U_GCB_EXTEND: self = .extend
      case U_GCB_L: self = .l
      case U_GCB_LF: self = .lf
      case U_GCB_LV: self = .lv
      case U_GCB_LVT: self = .lvt
      case U_GCB_T: self = .t
      case U_GCB_V: self = .v
      case U_GCB_SPACING_MARK: self = .spacingMark
      case U_GCB_PREPEND: self = .prepend
      case U_GCB_REGIONAL_INDICATOR: self = .regionalIndicator
      case U_GCB_E_BASE: self = .eBase
      case U_GCB_E_BASE_GAZ: self = .eBaseGaz
      case U_GCB_E_MODIFIER: self = .eModifier
      case U_GCB_GLUE_AFTER_ZWJ: self = .glueAfterZWJ
      case U_GCB_ZWJ: self = .zwj
      default: fatalError("Invalid UGraphemeClusterBreak value: \(cValue)")
      }
    }
  }
}

extension UnicodeScalar {

  /// The grapheme cluster break property of the receiver.
  public var graphemeClusterBreakType: Unicode.GraphemeClusterBreakType {
    return value(of: UCHAR_GRAPHEME_CLUSTER_BREAK)
  }
}

--------------------------------------------------------------------------------
/Sources/ICU/BreakRuleParseError.swift:
--------------------------------------------------------------------------------
// Copyright 2017 Tony Allevato.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import ICU4C

/// The failures that constructing a `RuleBasedBreakCursor` can throw.
///
/// Every case carries the `ParseErrorContext` that was supplied when the error
/// value was created.
public enum BreakRuleParseError: Error {

  /// ICU detected an internal error (a bug).
  case internalError(context: ParseErrorContext)

  /// Hexadecimal digits were required (for example, in an escaped character
  /// inside a rule) but none were found.
  case hexDigitsExpected(context: ParseErrorContext)

  /// A rule was not terminated by a semicolon.
  case semicolonExpected(context: ParseErrorContext)

  /// A rule contained a syntax error.
  case ruleSyntax(context: ParseErrorContext)

  /// A Unicode set inside a rule had no closing "]".
  case unclosedSet(context: ParseErrorContext)

  /// An assignment statement in the rules contained a syntax error.
  case assignError(context: ParseErrorContext)

  /// A `$variable` was defined more than once.
  case variableRedefinition(context: ParseErrorContext)

  /// The parentheses in a rule did not balance.
  case mismatchedParentheses(context: ParseErrorContext)

  /// A quoted string in a rule was missing its closing quote.
  case newLineInQuotedString(context: ParseErrorContext)

  /// A rule referred to a `$variable` that was never defined.
  case undefinedVariable(context: ParseErrorContext)

  /// Initialization failed (maybe ICU data is missing?).
  case initializationError(context: ParseErrorContext)

  /// A rule contained a Unicode set with no members.
  case ruleEmptySet(context: ParseErrorContext)

  /// A rule used an `!!option` that was not recognized.
  case unrecognizedOption(context: ParseErrorContext)

  /// A rule's `{nnn}` tag was malformed.
  case malformedRuleTag(context: ParseErrorContext)

  /// Creates the parse error that corresponds to an ICU error code.
  ///
  /// Traps with `fatalError` if `cValue` is not one of the `U_BRK_*` codes
  /// handled below.
  ///
  /// - Parameters:
  ///   - cValue: The underlying ICU C error code.
  ///   - context: The `ParseErrorContext` that describes where the error
  ///     occurred during parsing.
  init(cValue: UErrorCode, context: ParseErrorContext) {
    switch cValue {
    case U_BRK_INTERNAL_ERROR:
      self = .internalError(context: context)
    case U_BRK_HEX_DIGITS_EXPECTED:
      self = .hexDigitsExpected(context: context)
    case U_BRK_SEMICOLON_EXPECTED:
      self = .semicolonExpected(context: context)
    case U_BRK_RULE_SYNTAX:
      self = .ruleSyntax(context: context)
    case U_BRK_UNCLOSED_SET:
      self = .unclosedSet(context: context)
    case U_BRK_ASSIGN_ERROR:
      self = .assignError(context: context)
    case U_BRK_VARIABLE_REDFINITION:
      // The misspelling is ICU's own constant name, not a typo here.
      self = .variableRedefinition(context: context)
    case U_BRK_MISMATCHED_PAREN:
      self = .mismatchedParentheses(context: context)
    case U_BRK_NEW_LINE_IN_QUOTED_STRING:
      self = .newLineInQuotedString(context: context)
    case U_BRK_UNDEFINED_VARIABLE:
      self = .undefinedVariable(context: context)
    case U_BRK_INIT_ERROR:
      self = .initializationError(context: context)
    case U_BRK_RULE_EMPTY_SET:
      self = .ruleEmptySet(context: context)
    case U_BRK_UNRECOGNIZED_OPTION:
      self = .unrecognizedOption(context: context)
    case U_BRK_MALFORMED_RULE_TAG:
      self = .malformedRuleTag(context: context)
    default:
      fatalError("Internal error: error code not supported")
    }
  }
}

--------------------------------------------------------------------------------
/Sources/ICU/WordBreakType.swift:
--------------------------------------------------------------------------------
// Copyright 2017 Tony Allevato.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import ICU4C

extension Unicode {

  /// The word break type property of a scalar.
  ///
  /// These values feed the word-boundary algorithm described in
  /// http://www.unicode.org/reports/tr29/. Each case pairs with exactly one
  /// ICU `U_WB_*` constant; see `cValue` and `init(cValue:)` for the mapping.
  public enum WordBreakType: ConvertibleFromUnicodeIntProperty {

    case other
    case aletter
    case format
    case katakana
    case midletter
    case midnum
    case numeric
    case extendnumlet
    case cr
    case extend
    case lf
    case midnumlet
    case newline
    case regionalIndicator
    case hebrewLetter
    case singleQuote
    case doubleQuote
    case eBase
    case eBaseGaz
    case eModifier
    case glueAfterZWJ
    case zwj

    /// The ICU `UWordBreakValues` constant that the receiver stands for.
    var cValue: UWordBreakValues {
      let constant: UWordBreakValues
      switch self {
      case .other:
        constant = U_WB_OTHER
      case .aletter:
        constant = U_WB_ALETTER
      case .format:
        constant = U_WB_FORMAT
      case .katakana:
        constant = U_WB_KATAKANA
      case .midletter:
        constant = U_WB_MIDLETTER
      case .midnum:
        constant = U_WB_MIDNUM
      case .numeric:
        constant = U_WB_NUMERIC
      case .extendnumlet:
        constant = U_WB_EXTENDNUMLET
      case .cr:
        constant = U_WB_CR
      case .extend:
        constant = U_WB_EXTEND
      case .lf:
        constant = U_WB_LF
      case .midnumlet:
        constant = U_WB_MIDNUMLET
      case .newline:
        constant = U_WB_NEWLINE
      case .regionalIndicator:
        constant = U_WB_REGIONAL_INDICATOR
      case .hebrewLetter:
        constant = U_WB_HEBREW_LETTER
      case .singleQuote:
        constant = U_WB_SINGLE_QUOTE
      case .doubleQuote:
        constant = U_WB_DOUBLE_QUOTE
      case .eBase:
        constant = U_WB_E_BASE
      case .eBaseGaz:
        constant = U_WB_E_BASE_GAZ
      case .eModifier:
        constant = U_WB_E_MODIFIER
      case .glueAfterZWJ:
        constant = U_WB_GLUE_AFTER_ZWJ
      case .zwj:
        constant = U_WB_ZWJ
      }
      return constant
    }

    /// Converts an ICU `UWordBreakValues` constant into the matching case.
    ///
    /// Traps with `fatalError` if `cValue` is not one of the constants handled
    /// below.
    ///
    /// - Parameter cValue: The ICU C API value.
    init(cValue: UWordBreakValues) {
      switch cValue {
      case U_WB_OTHER:
        self = .other
      case U_WB_ALETTER:
        self = .aletter
      case U_WB_FORMAT:
        self = .format
      case U_WB_KATAKANA:
        self = .katakana
      case U_WB_MIDLETTER:
        self = .midletter
      case U_WB_MIDNUM:
        self = .midnum
      case U_WB_NUMERIC:
        self = .numeric
      case U_WB_EXTENDNUMLET:
        self = .extendnumlet
      case U_WB_CR:
        self = .cr
      case U_WB_EXTEND:
        self = .extend
      case U_WB_LF:
        self = .lf
      case U_WB_MIDNUMLET:
        self = .midnumlet
      case U_WB_NEWLINE:
        self = .newline
      case U_WB_REGIONAL_INDICATOR:
        self = .regionalIndicator
      case U_WB_HEBREW_LETTER:
        self = .hebrewLetter
      case U_WB_SINGLE_QUOTE:
        self = .singleQuote
      case U_WB_DOUBLE_QUOTE:
        self = .doubleQuote
      case U_WB_E_BASE:
        self = .eBase
      case U_WB_E_BASE_GAZ:
        self = .eBaseGaz
      case U_WB_E_MODIFIER:
        self = .eModifier
      case U_WB_GLUE_AFTER_ZWJ:
        self = .glueAfterZWJ
      case U_WB_ZWJ:
        self = .zwj
      default:
        fatalError("Invalid UWordBreakValues value: \(cValue)")
      }
    }
  }
}

extension UnicodeScalar {

  /// The word break type of the receiver, read from ICU's `UCHAR_WORD_BREAK`
  /// property.
  public var wordBreakType: Unicode.WordBreakType {
    // The non-optional annotation selects the trapping `value(of:)` overload.
    let breakType: Unicode.WordBreakType = value(of: UCHAR_WORD_BREAK)
    return breakType
  }
}

--------------------------------------------------------------------------------
/Sources/ICU/DecompositionType.swift:
--------------------------------------------------------------------------------
// Copyright 2017 Tony Allevato.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import ICU4C

extension Unicode {

  /// The enumerated `Decomposition_Type` property.
  ///
  /// Any value other than `canonical` denotes a compatibility decomposition
  /// and names the kind of information that would be lost by converting the
  /// character to that decomposition.
  public enum DecompositionType: ConvertibleFromUnicodeIntProperty {

    /// A canonical decomposition: either a single Unicode scalar, or two
    /// scalars of which the first may recursively have its own canonical
    /// decomposition.
    case canonical

    /// A compatibility decomposition that fits none of the other categories.
    case compat

    /// A composite equivalent to its decomposition with a circle drawn around
    /// it.
    case circle

    /// Arabic final presentation form.
    case final

    /// A composite equal to its decomposition plus extra font or styling
    /// information (such as a black-letter equivalent).
    case font

    /// Vulgar fractions.
    case fraction

    /// Arabic initial presentation form.
    case initial

    /// Arabic isolated presentation form.
    case isolated

    /// Arabic medial presentation form.
    case medial

    /// Half-width forms of Japanese characters used in abbreviations.
    case narrow

    /// A space or hyphen variant that prevents line breaks. (Equivalent to its
    /// decomposition preceded and followed by U+2060 WORD JOINER.)
    case noBreak

    /// Small variants of ASCII punctuation and symbols, often used in Chinese
    /// text.
    case small

    /// A character equivalent to its decomposition but laid out in a square
    /// East Asian display cell.
    case square

    /// Equivalent to its decomposition, but in subscript position.
    case sub

    /// Equivalent to its decomposition, but in superscript position.
    case `super`

    /// Presentation forms for East Asian punctuation when rendered vertically.
    case vertical

    /// Full-width forms of Latin characters in East Asian text.
    case wide

    /// Converts an ICU `UDecompositionType` constant into the matching case.
    ///
    /// Produces `nil` for `U_DT_NONE` and traps with `fatalError` for any
    /// constant not handled below.
    ///
    /// - Parameter cValue: The ICU C API value.
    init?(cValue: UDecompositionType) {
      switch cValue {
      case U_DT_NONE:
        return nil
      case U_DT_CANONICAL:
        self = .canonical
      case U_DT_COMPAT:
        self = .compat
      case U_DT_CIRCLE:
        self = .circle
      case U_DT_FINAL:
        self = .final
      case U_DT_FONT:
        self = .font
      case U_DT_FRACTION:
        self = .fraction
      case U_DT_INITIAL:
        self = .initial
      case U_DT_ISOLATED:
        self = .isolated
      case U_DT_MEDIAL:
        self = .medial
      case U_DT_NARROW:
        self = .narrow
      case U_DT_NOBREAK:
        self = .noBreak
      case U_DT_SMALL:
        self = .small
      case U_DT_SQUARE:
        self = .square
      case U_DT_SUB:
        self = .sub
      case U_DT_SUPER:
        self = .super
      case U_DT_VERTICAL:
        self = .vertical
      case U_DT_WIDE:
        self = .wide
      default:
        fatalError("Invalid UDecompositionType value: \(cValue)")
      }
    }
  }
}

extension UnicodeScalar {

  /// The decomposition type of the receiver, read from ICU's
  /// `UCHAR_DECOMPOSITION_TYPE` property.
  public var decompositionType: Unicode.DecompositionType? {
    // The optional annotation selects the failable `value(of:)` overload.
    let type: Unicode.DecompositionType? = value(of: UCHAR_DECOMPOSITION_TYPE)
    return type
  }
}

--------------------------------------------------------------------------------
/Tests/ICUTests/WordBreakCursorTests.swift:
--------------------------------------------------------------------------------
// Copyright 2017 Tony Allevato.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import ICU
import XCTest

/// Unit tests for `WordBreakCursor`.
class WordBreakCursorTests: XCTestCase {
  // The two ruler rows give the tens and ones digits of each character offset
  // in `text`; the marker row shows the rule status expected at each break.
  //          000000000011111111112222222
  //          012345678901234567890123456
  let text = "This is the test, isn't it?"
  //          0   10 10  10   100    10 10
  // 0 = .none
  // 1 = .letter

  var cursor: WordBreakCursor!

  /// Every break in `text`, in ascending order, with its expected status.
  private let expectedBreaks: [(offset: Int, status: WordBreakCursor.RuleStatusRange)] = [
    (0, .none),
    (4, .letter),
    (5, .none),
    (7, .letter),
    (8, .none),
    (11, .letter),
    (12, .none),
    (16, .letter),
    (17, .none),
    (18, .none),
    (23, .letter),
    (24, .none),
    (26, .letter),
    (27, .none),
  ]

  override func setUp() {
    cursor = WordBreakCursor(text: text)
  }

  func testBreaksForward() {
    // Start at the first break and step forward once per expected entry; the
    // step after the last entry must run off the end.
    var current = cursor.first()
    for expected in expectedBreaks {
      assertBreak(current, at: expected.offset, status: expected.status)
      current = cursor.next()
    }
    XCTAssertNil(current)
  }

  func testBreaksBackward() {
    // Mirror image of the forward walk: start at the last break and step back.
    var current = cursor.last()
    for expected in expectedBreaks.reversed() {
      assertBreak(current, at: expected.offset, status: expected.status)
      current = cursor.previous()
    }
    XCTAssertNil(current)
  }

  func testBreaksFollowing() {
    let insideFirstWord = text.index(text.startIndex, offsetBy: 2)
    let found = cursor.moveToIndex(following: insideFirstWord)
    assertBreak(found, at: 4, status: .letter)

    let pastEnd = cursor.moveToIndex(following: text.endIndex)
    XCTAssertNil(pastEnd)
  }

  func testBreaksPreceding() {
    let insideWord = text.index(text.startIndex, offsetBy: 20)
    let found = cursor.moveToIndex(preceding: insideWord)
    assertBreak(found, at: 18, status: .none)

    let beforeStart = cursor.moveToIndex(preceding: text.startIndex)
    XCTAssertNil(beforeStart)
  }

  func testIsBoundary() {
    let onBoundary = text.index(text.startIndex, offsetBy: 12)
    XCTAssertTrue(cursor.isBoundary(movingToOrAfter: onBoundary))
    assertBreak(cursor.index, at: 12, status: .none)

    // Offset 14 is not a break, so the cursor lands on the next one (16).
    let offBoundary = text.index(text.startIndex, offsetBy: 14)
    XCTAssertFalse(cursor.isBoundary(movingToOrAfter: offBoundary))
    assertBreak(cursor.index, at: 16, status: .letter)
  }

  func testSetText() {
    cursor.text = "one two"
    _ = cursor.first()
    let secondBreak = cursor.next()
    assertBreak(secondBreak, at: 3, status: .letter)
  }

  /// Checks a break index and the cursor's current rule status together.
  ///
  /// - Parameters:
  ///   - index: The index returned by the cursor operation under test.
  ///   - distance: The expected offset of `index` from the start of `text`.
  ///   - status: The range that the cursor's current `ruleStatus` must match.
  ///   - file: The file reported on failure (defaults to the call site).
  ///   - line: The line reported on failure (defaults to the call site).
  private func assertBreak(
    _ index: String.Index?,
    at distance: Int,
    status: WordBreakCursor.RuleStatusRange,
    file: StaticString = #file,
    line: UInt = #line
  ) {
    assertIndex(
      index,
      isDistance: distance,
      fromStartOf: text,
      file: file,
      line: line)
    let statusMatches = status ~= cursor.ruleStatus
    XCTAssertTrue(statusMatches, file: file, line: line)
  }
}

--------------------------------------------------------------------------------
/Sources/ICU/UnicodeScalar+Internal.swift:
--------------------------------------------------------------------------------
// Copyright 2017 Tony Allevato.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import ICU4C

extension UnicodeScalar {

  /// Creates a Unicode scalar from a `UChar32` produced by one of the ICU C
  /// APIs.
  ///
  /// The value is reinterpreted bit-for-bit as a `UInt32` and handed to the
  /// standard failable initializer, so the result is `nil` whenever that
  /// initializer rejects the code point.
  ///
  /// - Parameter uchar32Value: A `UChar32` value representing the numeric code
  ///   point.
  internal init?(uchar32Value: UChar32) {
    let codePoint = UInt32(bitPattern: uchar32Value)
    self.init(codePoint)
  }

  /// The receiver's raw value reinterpreted as a `UChar32`, ready to pass to
  /// ICU's C APIs.
  internal var uchar32Value: UChar32 {
    return UChar32(bitPattern: value)
  }

  /// Returns the value of a Boolean Unicode property.
  ///
  /// - Parameter property: The C value representing the property to return.
  /// - Returns: `true` if `u_hasBinaryProperty` reports a nonzero result for
  ///   the receiver, `false` otherwise.
  internal func value(of property: UProperty) -> Bool {
    let rawFlag = u_hasBinaryProperty(uchar32Value, property)
    return rawFlag != 0
  }

  /// Returns the value of an integer-valued Unicode property whose C type has
  /// an `Int32` raw value.
  ///
  /// This is the failable form: it yields `nil` when the result type's
  /// initializer rejects the raw value (for example, "none-like" values).
  ///
  /// - Parameter property: The C value representing the property to return.
  /// - Returns: The property value converted to `Result`, or `nil` if the
  ///   conversion fails.
  internal func value<
    Result: ConvertibleFromUnicodeIntProperty
  >(
    of property: UProperty
  ) -> Result? where Result.CICUType.RawValue == Int32 {
    let rawValue = u_getIntPropertyValue(uchar32Value, property)
    return Result(unicodeIntPropertyValue: rawValue)
  }

  /// Returns the value of an integer-valued Unicode property whose C type has
  /// an `Int32` raw value.
  ///
  /// This is the non-failable form: it forwards to the optional overload and
  /// traps with `fatalError` if that overload returns `nil`.
  ///
  /// - Parameter property: The C value representing the property to return.
  /// - Returns: The property value converted to `Result`.
  internal func value<
    Result: ConvertibleFromUnicodeIntProperty
  >(
    of property: UProperty
  ) -> Result where Result.CICUType.RawValue == Int32 {
    // The optional annotation routes this call to the failable overload.
    let converted: Result? = value(of: property)
    if let result = converted {
      return result
    }
    fatalError(
      "Unexpectedly got nil; did you mean to use the optional form of value(of:)?")
  }

  /// Returns the value of an integer-valued Unicode property whose C type has
  /// a `UInt32` raw value.
  ///
  /// This is the failable form: it yields `nil` when the result type's
  /// initializer rejects the raw value.
  ///
  /// - Parameter property: The C value representing the property to return.
  /// - Returns: The property value converted to `Result`, or `nil` if the
  ///   conversion fails.
  internal func value<
    Result: ConvertibleFromUnicodeIntProperty
  >(
    of property: UProperty
  ) -> Result? where Result.CICUType.RawValue == UInt32 {
    let rawValue = u_getIntPropertyValue(uchar32Value, property)
    return Result(unicodeIntPropertyValue: rawValue)
  }

  /// Returns the value of an integer-valued Unicode property whose C type has
  /// a `UInt32` raw value.
  ///
  /// This is the non-failable form: it forwards to the optional overload and
  /// traps with `fatalError` if that overload returns `nil`.
  ///
  /// - Parameter property: The C value representing the property to return.
  /// - Returns: The property value converted to `Result`.
  internal func value<
    Result: ConvertibleFromUnicodeIntProperty
  >(
    of property: UProperty
  ) -> Result where Result.CICUType.RawValue == UInt32 {
    // The optional annotation routes this call to the failable overload.
    let converted: Result? = value(of: property)
    if let result = converted {
      return result
    }
    fatalError(
      "Unexpectedly got nil; did you mean to use the optional form of value(of:)?")
  }
}

--------------------------------------------------------------------------------
/Sources/ICU/BidiClass.swift:
--------------------------------------------------------------------------------
// Copyright 2017 Tony Allevato.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14 | 15 | import ICU4C 16 | 17 | extension Unicode { 18 | 19 | /// The bidirectional class property of a scalar. 20 | public enum BidiClass: ConvertibleFromUnicodeIntProperty { 21 | 22 | /// Any strong left-to-right character (abbreviated L). 23 | case leftToRight 24 | 25 | /// Any strong right-to-left (non-Arabic-type) character (abbreviated R). 26 | case rightToLeft 27 | 28 | /// Any strong right-to-left (Arabic-type) character (abbreviated AL). 29 | case arabicLetter 30 | 31 | /// Any ASCII digit or Eastern Arabic-Indic digit (abbreviated EN). 32 | case europeanNumber 33 | 34 | /// A plus or minus sign (abbreviated ES). 35 | case europeanNumberSeparator 36 | 37 | /// A terminator found in a numeric format context, including currency signs 38 | /// (abbreviated ET). 39 | case europeanNumberTerminator 40 | 41 | /// Any Arabic-Indic digit (abbreviated AN). 42 | case arabicNumber 43 | 44 | /// A comma, colon, or slash (abbreviated CS). 45 | case commonNumberSeparator 46 | 47 | /// Any nonspacing mark (abbreviated NSM). 48 | case nonspacingMark 49 | 50 | /// Most format characters, control codes, or noncharacters (abbreviated 51 | /// BN). 52 | case boundaryNeutral 53 | 54 | /// Various newline characters (abbreviated B). 55 | case paragraphSeparator 56 | 57 | /// Various segment-related control codes (abbreviated S). 58 | case segmentSeparator 59 | 60 | /// A space (abbreviated WS). 61 | case whitespace 62 | 63 | /// Most other symbols and punctuation marks (abbreviated ON). 64 | case otherNeutral 65 | 66 | /// U+202A, the LR embedding control (abbreviated LRE). 67 | case leftToRightEmbedding 68 | 69 | /// U+202D, the LR override control (abbreviated LRO). 70 | case leftToRightOverride 71 | 72 | /// U+202B, the RL embedding control (abbreviated RLE). 73 | case rightToLeftEmbedding 74 | 75 | /// U+202E, the RL override control (abbreviated RLO). 
76 | case rightToLeftOverride 77 | 78 | /// U+202C, which terminates an embedding or overriding control (abbreviated 79 | /// PDF). 80 | case popDirectionalFormat 81 | 82 | /// U+2066, the LR isolate control (abbreviated LRI). 83 | case leftToRightIsolate 84 | 85 | /// U+2067, the RL isolate control (abbreviated RLI). 86 | case rightToLeftIsolate 87 | 88 | /// U+2068, the first strong isolate control (abbreviated FSI). 89 | case firstStrongIsolate 90 | 91 | /// U+2069, which terminates an isolate control (abbreviated PDI). 92 | case popDirectionalIsolate 93 | 94 | /// Creates a new value from the given ICU C API value. 95 | /// 96 | /// - Parameter cValue: The ICU C API value. 97 | init(cValue: UCharDirection) { 98 | switch cValue { 99 | case U_LEFT_TO_RIGHT: self = .leftToRight 100 | case U_RIGHT_TO_LEFT: self = .rightToLeft 101 | case U_EUROPEAN_NUMBER: self = .europeanNumber 102 | case U_EUROPEAN_NUMBER_SEPARATOR: self = .europeanNumberSeparator 103 | case U_EUROPEAN_NUMBER_TERMINATOR: self = .europeanNumberTerminator 104 | case U_ARABIC_NUMBER: self = .arabicNumber 105 | case U_COMMON_NUMBER_SEPARATOR: self = .commonNumberSeparator 106 | case U_BLOCK_SEPARATOR: self = .paragraphSeparator 107 | case U_SEGMENT_SEPARATOR: self = .segmentSeparator 108 | case U_WHITE_SPACE_NEUTRAL: self = .whitespace 109 | case U_OTHER_NEUTRAL: self = .otherNeutral 110 | case U_LEFT_TO_RIGHT_EMBEDDING: self = .leftToRightEmbedding 111 | case U_LEFT_TO_RIGHT_OVERRIDE: self = .leftToRightOverride 112 | case U_RIGHT_TO_LEFT_ARABIC: self = .arabicLetter 113 | case U_RIGHT_TO_LEFT_EMBEDDING: self = .rightToLeftEmbedding 114 | case U_RIGHT_TO_LEFT_OVERRIDE: self = .rightToLeftOverride 115 | case U_POP_DIRECTIONAL_FORMAT: self = .popDirectionalFormat 116 | case U_DIR_NON_SPACING_MARK: self = .nonspacingMark 117 | case U_BOUNDARY_NEUTRAL: self = .boundaryNeutral 118 | case U_FIRST_STRONG_ISOLATE: self = .firstStrongIsolate 119 | case U_LEFT_TO_RIGHT_ISOLATE: self = .leftToRightIsolate 120 |
case U_RIGHT_TO_LEFT_ISOLATE: self = .rightToLeftIsolate 121 | case U_POP_DIRECTIONAL_ISOLATE: self = .popDirectionalIsolate 122 | default: fatalError("Invalid UCharDirection value: \(cValue)") 123 | } 124 | } 125 | } 126 | } 127 | 128 | extension UnicodeScalar { 129 | 130 | /// The bidirectional class of the receiver. 131 | public var bidiClass: Unicode.BidiClass { 132 | return value(of: UCHAR_BIDI_CLASS) 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /Sources/ICU/CharacterBreakCursor.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | /// Provides an interface for efficiently locating character boundaries in a 18 | /// text string. 19 | /// 20 | /// Note that this cursor essentially provides the same information that Swift's 21 | /// own treatment of `String` as a collection of `Character` values provides. It 22 | /// is included primarily for API completeness. 23 | /// 24 | /// Unlike other cursor types, character break cursors do not surface rule 25 | /// status values. 26 | /// 27 | /// **Terminology note:** The name "cursor" has been chosen instead of 28 | /// "iterator" to better map to the concepts found in Swift.
Swift's iterator 29 | /// types provide unidirectional traversal of a sequence; by contrast, a 30 | /// "cursor" more accurately describes this type, which can move forward and 31 | /// backward arbitrarily within a string. 32 | public final class CharacterBreakCursor { 33 | 34 | /// The actual break cursor implementation to which this class's operations 35 | /// are delegated. 36 | private var impl: BreakCursorImpl 37 | 38 | /// The text being scanned by the cursor. 39 | public var text: String? { 40 | get { return impl.text } 41 | set { impl.text = newValue } 42 | } 43 | 44 | /// The most recently returned text boundary. 45 | public var index: String.Index? { 46 | return impl.index 47 | } 48 | 49 | /// The locale used to determine the language rules for text breaking. 50 | public let locale: String? 51 | 52 | /// Creates a new character break cursor with the given rules. 53 | /// 54 | /// - Parameters: 55 | /// - text: The optional initial text that the cursor will scan. 56 | /// - locale: The locale used to determine the language rules for text 57 | /// breaking. 58 | public init(text: String? = nil, locale: String? = nil) { 59 | self.locale = locale 60 | self.impl = BreakCursorImpl( 61 | type: UBRK_CHARACTER, 62 | text: text, 63 | locale: locale 64 | ) { _ in () } 65 | } 66 | 67 | deinit { 68 | impl.release() 69 | } 70 | 71 | /// Returns the start index of the text being scanned. 72 | /// 73 | /// This method also adjusts the cursor such that its `index` is equal to the 74 | /// text's starting index. 75 | /// 76 | /// - Returns: The start index of the text being scanned. 77 | public func first() -> String.Index { 78 | return impl.first() 79 | } 80 | 81 | /// Returns the index past the last character of the text being scanned. 82 | /// 83 | /// This method also adjusts the cursor such that its `index` is equal to the 84 | /// index past the last character of the text. 85 | /// 86 | /// - Returns: The index past the last character of the text being scanned. 
87 | public func last() -> String.Index { 88 | return impl.last() 89 | } 90 | 91 | /// Returns the index of the boundary following the current boundary in the 92 | /// text. 93 | /// 94 | /// This method adjusts the cursor such that its `index` is equal to the 95 | /// position of the next boundary, or `nil` if all boundaries have been 96 | /// returned. 97 | /// 98 | /// - Returns: The index of the next boundary in the text, or nil if all 99 | /// boundaries have been returned. 100 | public func next() -> String.Index? { 101 | return impl.next() 102 | } 103 | 104 | /// Returns the index of the boundary preceding the current boundary in the 105 | /// text. 106 | /// 107 | /// This method adjusts the cursor such that its `index` is equal to the 108 | /// position of the previous boundary, or `nil` if all boundaries have been 109 | /// returned. 110 | /// 111 | /// - Returns: The index of the previous boundary in the text, or nil if all 112 | /// boundaries have been returned. 113 | public func previous() -> String.Index? { 114 | return impl.previous() 115 | } 116 | 117 | /// Returns the first index greater than `index` at which a boundary occurs. 118 | /// 119 | /// This method adjusts the cursor such that its `index` is equal to the 120 | /// boundary position if one was found, or `nil` if there were no boundaries 121 | /// after `index`. 122 | /// 123 | /// - Parameter index: The index at which scanning should begin. 124 | /// - Returns: The index of the first boundary following `index`, or nil if no 125 | /// boundaries were found. 126 | public func moveToIndex(following index: String.Index) -> String.Index? { 127 | return impl.moveToIndex(following: index) 128 | } 129 | 130 | /// Returns the first index less than `index` at which a boundary occurs. 131 | /// 132 | /// This method adjusts the cursor such that its `index` is equal to the 133 | /// boundary position if one was found, or `nil` if there were no boundaries 134 | /// before `index`. 
135 | /// 136 | /// - Parameter index: The index at which the scanning should begin. 137 | /// - Returns: The index of the first boundary preceding `index`, or nil if no 138 | /// boundaries were found. 139 | public func moveToIndex(preceding index: String.Index) -> String.Index? { 140 | return impl.moveToIndex(preceding: index) 141 | } 142 | 143 | /// Returns true if the given index represents a boundary position in the 144 | /// cursor's text, also moving the cursor to the first boundary at or 145 | /// following that index. 146 | /// 147 | /// - Parameter index: The index to check. 148 | /// - Returns: True if the given index is a boundary position. 149 | public func isBoundary(movingToOrAfter index: String.Index) -> Bool { 150 | return impl.isBoundary(movingToOrAfter: index) 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /Sources/ICU/LineBreakType.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension Unicode { 18 | 19 | /// Denotes the line break type property of a scalar. 20 | /// 21 | /// The values in this enum can be used in an algorithm as described by 22 | /// http://www.unicode.org/reports/tr14/ to determine line boundaries within 23 | /// text. 
24 | public enum LineBreakType: ConvertibleFromUnicodeIntProperty { 25 | 26 | case unknown 27 | case ambiguous 28 | case alphabetic 29 | case breakBoth 30 | case breakAfter 31 | case breakBefore 32 | case mandatoryBreak 33 | case contingentBreak 34 | case closePunctuation 35 | case combiningMark 36 | case carriageReturn 37 | case exclamation 38 | case glue 39 | case hyphen 40 | case ideographic 41 | case inseparable 42 | case inseperable 43 | case infixNumeric 44 | case lineFeed 45 | case nonstarter 46 | case numeric 47 | case openPunctuation 48 | case postfixNumeric 49 | case prefixNumeric 50 | case quotation 51 | case complexContext 52 | case surrogate 53 | case space 54 | case breakSymbols 55 | case zwspace 56 | case nextLine 57 | case wordJoiner 58 | case h2 59 | case h3 60 | case jl 61 | case jt 62 | case jv 63 | case closeParenthesis 64 | case conditionalJapaneseStarter 65 | case hebrewLetter 66 | case regionalIndicator 67 | case eBase 68 | case eModifier 69 | case zwj 70 | 71 | /// The C API value of type `ULineBreak` that corresponds to the 72 | /// receiving enum case. 
73 | var cValue: ULineBreak { 74 | switch self { 75 | case .unknown: return U_LB_UNKNOWN 76 | case .ambiguous: return U_LB_AMBIGUOUS 77 | case .alphabetic: return U_LB_ALPHABETIC 78 | case .breakBoth: return U_LB_BREAK_BOTH 79 | case .breakAfter: return U_LB_BREAK_AFTER 80 | case .breakBefore: return U_LB_BREAK_BEFORE 81 | case .mandatoryBreak: return U_LB_MANDATORY_BREAK 82 | case .contingentBreak: return U_LB_CONTINGENT_BREAK 83 | case .closePunctuation: return U_LB_CLOSE_PUNCTUATION 84 | case .combiningMark: return U_LB_COMBINING_MARK 85 | case .carriageReturn: return U_LB_CARRIAGE_RETURN 86 | case .exclamation: return U_LB_EXCLAMATION 87 | case .glue: return U_LB_GLUE 88 | case .hyphen: return U_LB_HYPHEN 89 | case .ideographic: return U_LB_IDEOGRAPHIC 90 | case .inseparable: return U_LB_INSEPARABLE 91 | case .inseperable: return U_LB_INSEPERABLE 92 | case .infixNumeric: return U_LB_INFIX_NUMERIC 93 | case .lineFeed: return U_LB_LINE_FEED 94 | case .nonstarter: return U_LB_NONSTARTER 95 | case .numeric: return U_LB_NUMERIC 96 | case .openPunctuation: return U_LB_OPEN_PUNCTUATION 97 | case .postfixNumeric: return U_LB_POSTFIX_NUMERIC 98 | case .prefixNumeric: return U_LB_PREFIX_NUMERIC 99 | case .quotation: return U_LB_QUOTATION 100 | case .complexContext: return U_LB_COMPLEX_CONTEXT 101 | case .surrogate: return U_LB_SURROGATE 102 | case .space: return U_LB_SPACE 103 | case .breakSymbols: return U_LB_BREAK_SYMBOLS 104 | case .zwspace: return U_LB_ZWSPACE 105 | case .nextLine: return U_LB_NEXT_LINE 106 | case .wordJoiner: return U_LB_WORD_JOINER 107 | case .h2: return U_LB_H2 108 | case .h3: return U_LB_H3 109 | case .jl: return U_LB_JL 110 | case .jt: return U_LB_JT 111 | case .jv: return U_LB_JV 112 | case .closeParenthesis: return U_LB_CLOSE_PARENTHESIS 113 | case .conditionalJapaneseStarter: return U_LB_CONDITIONAL_JAPANESE_STARTER 114 | case .hebrewLetter: return U_LB_HEBREW_LETTER 115 | case .regionalIndicator: return U_LB_REGIONAL_INDICATOR 116 | case 
.eBase: return U_LB_E_BASE 117 | case .eModifier: return U_LB_E_MODIFIER 118 | case .zwj: return U_LB_ZWJ 119 | } 120 | } 121 | 122 | /// Creates a new value from the given ICU C API value. 123 | /// 124 | /// - Parameter cValue: The ICU C API value. 125 | init(cValue: ULineBreak) { 126 | switch cValue { 127 | case U_LB_UNKNOWN: self = .unknown 128 | case U_LB_AMBIGUOUS: self = .ambiguous 129 | case U_LB_ALPHABETIC: self = .alphabetic 130 | case U_LB_BREAK_BOTH: self = .breakBoth 131 | case U_LB_BREAK_AFTER: self = .breakAfter 132 | case U_LB_BREAK_BEFORE: self = .breakBefore 133 | case U_LB_MANDATORY_BREAK: self = .mandatoryBreak 134 | case U_LB_CONTINGENT_BREAK: self = .contingentBreak 135 | case U_LB_CLOSE_PUNCTUATION: self = .closePunctuation 136 | case U_LB_COMBINING_MARK: self = .combiningMark 137 | case U_LB_CARRIAGE_RETURN: self = .carriageReturn 138 | case U_LB_EXCLAMATION: self = .exclamation 139 | case U_LB_GLUE: self = .glue 140 | case U_LB_HYPHEN: self = .hyphen 141 | case U_LB_IDEOGRAPHIC: self = .ideographic 142 | case U_LB_INSEPARABLE: self = .inseparable 143 | case U_LB_INSEPERABLE: self = .inseperable 144 | case U_LB_INFIX_NUMERIC: self = .infixNumeric 145 | case U_LB_LINE_FEED: self = .lineFeed 146 | case U_LB_NONSTARTER: self = .nonstarter 147 | case U_LB_NUMERIC: self = .numeric 148 | case U_LB_OPEN_PUNCTUATION: self = .openPunctuation 149 | case U_LB_POSTFIX_NUMERIC: self = .postfixNumeric 150 | case U_LB_PREFIX_NUMERIC: self = .prefixNumeric 151 | case U_LB_QUOTATION: self = .quotation 152 | case U_LB_COMPLEX_CONTEXT: self = .complexContext 153 | case U_LB_SURROGATE: self = .surrogate 154 | case U_LB_SPACE: self = .space 155 | case U_LB_BREAK_SYMBOLS: self = .breakSymbols 156 | case U_LB_ZWSPACE: self = .zwspace 157 | case U_LB_NEXT_LINE: self = .nextLine 158 | case U_LB_WORD_JOINER: self = .wordJoiner 159 | case U_LB_H2: self = .h2 160 | case U_LB_H3: self = .h3 161 | case U_LB_JL: self = .jl 162 | case U_LB_JT: self = .jt 163 | case 
U_LB_JV: self = .jv 164 | case U_LB_CLOSE_PARENTHESIS: self = .closeParenthesis 165 | case U_LB_CONDITIONAL_JAPANESE_STARTER: self = .conditionalJapaneseStarter 166 | case U_LB_HEBREW_LETTER: self = .hebrewLetter 167 | case U_LB_REGIONAL_INDICATOR: self = .regionalIndicator 168 | case U_LB_E_BASE: self = .eBase 169 | case U_LB_E_MODIFIER: self = .eModifier 170 | case U_LB_ZWJ: self = .zwj 171 | default: fatalError("Invalid ULineBreak value: \(cValue)") 172 | } 173 | } 174 | } 175 | } 176 | 177 | extension UnicodeScalar { 178 | 179 | /// The line break type of the receiver. 180 | public var lineBreakType: Unicode.LineBreakType { 181 | return value(of: UCHAR_LINE_BREAK) 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /Sources/ICU/UnicodeScalar+Enumeration.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension Unicode.Scalar { 18 | 19 | /// The type of a function passed to `UnicodeScalar.enumerateCodePointRanges`. 20 | /// It takes three arguments: the `GeneralCategory`, the `Range` of code 21 | /// points, and an `inout Bool` that should be set to `true` to stop 22 | /// enumeration. 
23 | public typealias EnumerateCodePointRangesCallback = 24 | (Unicode.GeneralCategory?, Range, inout Bool) -> Void 25 | 26 | /// The type of a function passed to `UnicodeScalar.enumerateScalarRanges`. It 27 | /// takes three arguments: the `GeneralCategory`, the 28 | /// `ClosedRange` of code points, and an `inout Bool` that 29 | /// should be set to `true` to stop enumeration. 30 | public typealias EnumerateScalarRangesCallback = 31 | (Unicode.GeneralCategory?, ClosedRange, inout Bool) -> Void 32 | 33 | /// The type of a function passed to `UnicodeScalar.enumerateScalars`. It 34 | /// takes three arguments: the current `UnicodeScalar` in the enumeration, 35 | /// a `String` containing the name of the code point, and an `inout Bool` that 36 | /// should be set to `true` to stop enumeration. 37 | public typealias EnumerateScalarsCallback = 38 | (UnicodeScalar, String, inout Bool) -> Void 39 | 40 | /// Enumerates contiguous ranges of Unicode code points such that the code 41 | /// points in each range have the same general category. 42 | /// 43 | /// This function enumerates all defined code points, including surrogates, so 44 | /// its callback receives integer ranges instead of `UnicodeScalar` values. 45 | /// To iterate only valid scalar ranges, see `enumerateScalarRanges` instead. 46 | /// 47 | /// - Parameters: 48 | /// - block: A block that will be called for each contiguous range of code 49 | /// points that share the same general category. It takes three arguments: 50 | /// the `GeneralCategory`, the `Range` of code points, and an `inout 51 | /// Bool` that should be set to `true` to stop enumeration. 
52 | public static func enumerateCodePointRanges( 53 | block: EnumerateCodePointRangesCallback 54 | ) { 55 | withoutActuallyEscaping(block) { block in 56 | CInterop.withPointer(wrapping: block) { blockPointer in 57 | u_enumCharTypes(enumerateCodePointRangesCCallback, blockPointer) 58 | } 59 | } 60 | } 61 | 62 | /// Enumerates contiguous ranges of Unicode code points such that the code 63 | /// points in each range have the same general category. 64 | /// 65 | /// This function enumerates only code points that are valid `UnicodeScalar` 66 | /// values (that is, it excludes surrogates). Thus, its callback receives 67 | /// closed ranges of `UnicodeScalar` values. To iterate *all* code points, see 68 | /// `enumerateCodePointRanges` instead. 69 | /// 70 | /// - Parameters: 71 | /// - includeUnassigned: If true, unassigned code points will be included in 72 | /// the enumeration. The default value is false. 73 | /// - block: A block that will be called for each contiguous range of code 74 | /// points that share the same general category. It takes three arguments: 75 | /// the `GeneralCategory`, the `ClosedRange` of code 76 | /// points, and an `inout Bool` that should be set to `true` to stop 77 | /// enumeration. 78 | public static func enumerateScalarRanges( 79 | includeUnassigned: Bool = false, 80 | block: EnumerateScalarRangesCallback 81 | ) { 82 | enumerateCodePointRanges { category, codePointRange, stop in 83 | guard category != .surrogate else { 84 | return 85 | } 86 | guard includeUnassigned || category != nil else { 87 | return 88 | } 89 | let startScalar = UnicodeScalar(codePointRange.lowerBound)! 90 | let endScalar = UnicodeScalar(codePointRange.upperBound - 1)! 91 | block(category, startScalar...endScalar, &stop) 92 | } 93 | } 94 | 95 | /// Enumerates the named Unicode scalars in the range `[start, end)`, calling 96 | /// a function for each one. 
97 | /// 98 | /// Only *valid, defined* scalars are enumerated; unassigned code points and 99 | /// other non-valid scalars (such as surrogates) are skipped. The desired name 100 | /// type also affects the set of scalars that are enumerated; for example, 101 | /// control characters U+0000-U+001F will be enumerated if 102 | /// `nameKind == .extended` but not if `nameKind == .unicode`. 103 | /// 104 | /// - Parameters: 105 | /// - start: The numeric value of the first code point to enumerate 106 | /// (inclusive). 107 | /// - end: The numeric value of the code point where enumeration should end 108 | /// (exclusive). 109 | /// - nameKind: The kind of name that should be passed to each call to the 110 | /// block (which also, in some cases, controls the code points that are 111 | /// enumerated). 112 | /// - block: A block that will be called for each Unicode scalar that is 113 | /// enumerated. It takes three arguments: a `UnicodeScalar`, a `String` 114 | /// containing the name of the code point, and an `inout Bool` that should 115 | /// be set to `true` to stop enumeration. 116 | public static func enumerateScalars( 117 | from start: Int, 118 | to end: Int, 119 | nameKind: Unicode.NameKind = .unicode, 120 | block: EnumerateScalarsCallback 121 | ) { 122 | withoutActuallyEscaping(block) { block in 123 | CInterop.withPointer(wrapping: block) { blockPointer in 124 | var error = UErrorCode() 125 | u_enumCharNames( 126 | Int32(truncatingIfNeeded: start), 127 | Int32(truncatingIfNeeded: end), 128 | enumerateScalarsCCallback, 129 | blockPointer, 130 | nameKind.cValue, 131 | &error 132 | ) 133 | } 134 | } 135 | } 136 | } 137 | 138 | /// Function passed to ICU's `u_enumCharTypes` C function, which delegates to 139 | /// the Swift block passed in via the `context` argument. 140 | /// 141 | /// - Parameters: 142 | /// - context: The Swift block to invoke, wrapped in an opaque pointer. 143 | /// - start: The first code point in the range.
144 | /// - limit: The code point after the last one in the range. 145 | /// - type: The `UCharCategory` of the code points in the range. 146 | /// - Returns: 0 if enumeration should be stopped, or 1 if it should continue. 147 | private func enumerateCodePointRangesCCallback( 148 | context: UnsafeRawPointer?, 149 | start: UChar32, 150 | limit: UChar32, 151 | type: UCharCategory 152 | ) -> UBool { 153 | let category = Unicode.GeneralCategory(cValue: type) 154 | let block: UnicodeScalar.EnumerateCodePointRangesCallback = 155 | CInterop.unwrappedValue(from: context!) 156 | 157 | var stop = false 158 | block(category, Int(start)..?, 179 | length: Int32 180 | ) -> UBool { 181 | let scalar = UnicodeScalar(uchar32Value: value)! 182 | let block: UnicodeScalar.EnumerateScalarsCallback = 183 | CInterop.unwrappedValue(from: context!) 184 | 185 | var stop = false 186 | block(scalar, String(cString: buffer!), &stop) 187 | return stop ? 0 : 1 188 | } 189 | -------------------------------------------------------------------------------- /Sources/ICU/UnicodeVersion.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension Unicode { 18 | 19 | /// A four-component version value. 
These values are used to represent, for 20 | /// example, the Unicode version at which a code point was introduced. 21 | /// 22 | /// Values of this type support comparison (equality and ordering) and can be 23 | /// converted to/from human-readable strings. 24 | /// 25 | /// See also `UnicodeScalar.unicodeAge`. 26 | public struct Version { 27 | 28 | /// The underlying C value representing the version. 29 | private(set) var cValue: UVersionInfo 30 | 31 | /// The major component of the version. 32 | public var major: Int { 33 | return Int(cValue.0) 34 | } 35 | 36 | /// The minor component of the version. 37 | public var minor: Int { 38 | return Int(cValue.1) 39 | } 40 | 41 | /// The patch level component of the version. 42 | public var patchLevel: Int { 43 | return Int(cValue.2) 44 | } 45 | 46 | /// The build level component of the version. 47 | public var buildLevel: Int { 48 | return Int(cValue.3) 49 | } 50 | 51 | /// Creates a new Unicode version value with the given components. 52 | /// 53 | /// - Precondition: All components must be in the range `0...255`. 54 | /// 55 | /// - Parameters: 56 | /// - major: The major component. 57 | /// - minor: The minor component. 58 | /// - patchLevel: The patch level component. 59 | /// - buildLevel: The build level component. 60 | public init( 61 | major: Int, 62 | minor: Int = 0, 63 | patchLevel: Int = 0, 64 | buildLevel: Int = 0 65 | ) { 66 | precondition((0...255).contains(major) && 67 | (0...255).contains(minor) && 68 | (0...255).contains(patchLevel) && 69 | (0...255).contains(buildLevel), 70 | "Version components must be in the range 0...255.") 71 | self.cValue.0 = UInt8(truncatingIfNeeded: major) 72 | self.cValue.1 = UInt8(truncatingIfNeeded: minor) 73 | self.cValue.2 = UInt8(truncatingIfNeeded: patchLevel) 74 | self.cValue.3 = UInt8(truncatingIfNeeded: buildLevel) 75 | } 76 | 77 | /// Creates a new Unicode version value equivalent to the given C value. 78 | /// 79 | /// - Parameter cValue: The C value to be wrapped. 
80 | init(cValue: UVersionInfo) { 81 | self.cValue = cValue 82 | } 83 | 84 | /// Invokes the given closure with an immutable `UInt8` pointer to the 85 | /// underlying C value. 86 | /// 87 | /// - Parameter body: A closure that takes an immutable pointer to the 88 | /// underlying C value as its sole argument, represented as a pointer to 89 | /// `UInt8`. 90 | /// - Returns: The return value of the `body` closure, if any. 91 | /// - Throws: The error thrown by the `body` closure, if any. 92 | func withUnsafeUInt8Pointer( 93 | _ body: (UnsafePointer) throws -> Result 94 | ) rethrows -> Result { 95 | var valueCopy = cValue 96 | return try withUnsafePointer(to: &valueCopy) { pointer in 97 | return try pointer.withMemoryRebound( 98 | to: UInt8.self, 99 | capacity: MemoryLayout.size 100 | ) { pointer in 101 | return try body(pointer) 102 | } 103 | } 104 | } 105 | 106 | /// Invokes the given closure with a mutable `UInt8` pointer to the 107 | /// underlying C value. 108 | /// 109 | /// - Parameter body: A closure that takes a mutable pointer to the 110 | /// underlying C value as its sole argument, represented as a pointer to 111 | /// `UInt8`. 112 | /// - Returns: The return value of the `body` closure, if any. 113 | /// - Throws: The error thrown by the `body` closure, if any. 114 | mutating func withUnsafeMutableUInt8Pointer( 115 | _ body: (UnsafeMutablePointer) throws -> Result 116 | ) rethrows -> Result { 117 | return try withUnsafeMutablePointer(to: &cValue) { pointer in 118 | return try pointer.withMemoryRebound( 119 | to: UInt8.self, 120 | capacity: MemoryLayout.size 121 | ) { pointer in 122 | return try body(pointer) 123 | } 124 | } 125 | } 126 | 127 | /// Performs a three-way comparison of the receiver with another version. 128 | /// 129 | /// - Parameter other: Another version.
130 | /// - Returns: Zero if the receiver is equal to `other`, a negative number 131 | /// if the receiver is ordered before `other`, or a positive number if the 132 | /// receiver is ordered after `other`. 133 | fileprivate func threeWayCompare(to other: Unicode.Version) -> Int { 134 | var lhsValue = cValue 135 | var rhsValue = other.cValue 136 | 137 | let result = withUnsafeBytes(of: &lhsValue) { lhsBuffer in 138 | return withUnsafeBytes(of: &rhsValue) { rhsBuffer in 139 | return memcmp(lhsBuffer.baseAddress!, 140 | rhsBuffer.baseAddress!, 141 | MemoryLayout.size) 142 | } 143 | } 144 | return Int(result) 145 | } 146 | } 147 | } 148 | 149 | extension Unicode.Version: Comparable { 150 | 151 | /// Returns a value indicating whether the first version is less than the 152 | /// second version. 153 | /// 154 | /// - Parameters: 155 | /// - lhs: A version. 156 | /// - rhs: A version. 157 | /// - Returns: True if the first version is less than the second version, or 158 | /// false otherwise. 159 | public static func < (lhs: Unicode.Version, rhs: Unicode.Version) -> Bool { 160 | return lhs.threeWayCompare(to: rhs) < 0 161 | } 162 | } 163 | 164 | extension Unicode.Version: Equatable { 165 | 166 | /// Returns a value indicating whether two versions are equal. 167 | /// 168 | /// - Parameters: 169 | /// - lhs: A version. 170 | /// - rhs: A version. 171 | /// - Returns: True if the versions are equal, or false otherwise. 172 | public static func == (lhs: Unicode.Version, rhs: Unicode.Version) -> Bool { 173 | return lhs.threeWayCompare(to: rhs) == 0 174 | } 175 | } 176 | 177 | extension Unicode.Version: LosslessStringConvertible { 178 | 179 | /// A string representation of the version, in the form 180 | /// `major.minor.patch.buildLevel` (trailing zero components are omitted). 
181 | public var description: String { 182 | let capacity = Int(U_MAX_VERSION_STRING_LENGTH) 183 | let stringPointer = UnsafeMutablePointer.allocate(capacity: capacity) 184 | defer { stringPointer.deallocate(capacity: capacity) } 185 | 186 | withUnsafeUInt8Pointer { versionPointer in 187 | u_versionToString(versionPointer, stringPointer) 188 | } 189 | 190 | return String(cString: stringPointer) 191 | } 192 | 193 | /// Creates a new `UnicodeVersion` by parsing the given string. 194 | /// 195 | /// Since the underlying ICU API does not provide error reporting, this 196 | /// initializer never fails. Instead, for values of `description` that do not 197 | /// meet the standard version string format, the created value is undefined. 198 | /// 199 | /// - Parameter description: A string representation of a version containing 200 | /// between 1 and 4 components separated by dots, where each component is an 201 | /// integer in the range `0...255`. 202 | public init(_ description: String) { 203 | var cValue: UVersionInfo = (0, 0, 0, 0) 204 | description.withCString { stringPointer in 205 | withUnsafeMutablePointer(to: &cValue) { pointer in 206 | pointer.withMemoryRebound( 207 | to: UInt8.self, 208 | capacity: MemoryLayout.size 209 | ) { pointer in 210 | u_versionFromString(pointer, stringPointer) 211 | } 212 | } 213 | } 214 | self.init(cValue: cValue) 215 | } 216 | } 217 | 218 | extension UnicodeScalar { 219 | 220 | /// The Unicode version in which the receiving scalar was first defined. 221 | public var unicodeAge: Unicode.Version { 222 | var version = Unicode.Version(major: 0) 223 | version.withUnsafeMutableUInt8Pointer { pointer in 224 | u_charAge(uchar32Value, pointer) 225 | } 226 | return version 227 | } 228 | } 229 | -------------------------------------------------------------------------------- /Sources/ICU/LineBreakCursor.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | /// Provides an interface for efficiently locating line boundaries in a text 18 | /// string. 19 | /// 20 | /// ICU defines the rule status tags for line breaks as _ranges_, which allows 21 | /// future versions to subdivide tag groups into finer subcategories. Because of 22 | /// this, they should not (and cannot) be compared using `==`. Instead, use 23 | /// pattern matching (i.e., `if case`, `switch`, or the `~=` operator directly) 24 | /// to determine if a status tag belongs to one of the known categories: 25 | /// 26 | /// ```swift 27 | /// if case .soft = cursor.ruleStatus { 28 | /// // do something 29 | /// } 30 | /// ``` 31 | /// 32 | /// **Terminology note:** The name "cursor" has been chosen instead of 33 | /// "iterator" to better map to the concepts found in Swift. Swift's iterator 34 | /// types provide unidirectional traversal of a sequence; by contrast, a 35 | /// "cursor" more accurately describes this type, which can move forward and 36 | /// backward arbitrarily within a string. 37 | public final class LineBreakCursor { 38 | 39 | /// Represents the status tag from a break rule. 40 | public struct RuleStatus { 41 | 42 | /// The raw integer value of the status tag. 43 | public let rawValue: Int 44 | 45 | /// Creates a new `RuleStatus` value with the given integer value.
46 | /// 47 | /// - Parameter rawValue: The integer value of the status tag. 48 | init(rawValue: Int) { 49 | self.rawValue = rawValue 50 | } 51 | } 52 | 53 | /// Represents a range of status tags from a break rule. 54 | /// 55 | /// The rule status tags for line breaks are meant to be treated as ranges, 56 | /// not exact values; this allows future versions of the library to further 57 | /// subdivide a break rule's tags. Thus, `RuleStatus` values should not be 58 | /// compared using equality, but instead use the `~=` operator (directly, or 59 | /// through pattern matching): 60 | /// 61 | /// ```swift 62 | /// if case .soft = cursor.ruleStatus { 63 | /// // do something 64 | /// } 65 | /// ``` 66 | public struct RuleStatusRange { 67 | 68 | /// The range of integer values. 69 | private let range: Range 70 | 71 | /// Creates a new rule status range from a range of unsigned integers. 72 | /// 73 | /// - Parameter range: The range of unsigned integers. 74 | private init(_ range: Range) { 75 | self.range = Int(range.lowerBound).. Bool { 97 | return range.range.contains(ruleStatus.rawValue) 98 | } 99 | } 100 | 101 | /// The status tag from the break rule that determined the most recently 102 | /// returned break position. 103 | /// 104 | /// If more than one break rule applied at the current position, then the 105 | /// numerically largest status tag is returned. 106 | public var ruleStatus: RuleStatus { 107 | return impl.ruleStatus 108 | } 109 | 110 | /// The status tags from the break rules that determined the most recently 111 | /// returned break position. 112 | public var ruleStatuses: [RuleStatus] { 113 | return impl.ruleStatuses 114 | } 115 | 116 | /// The actual break cursor implementation to which this class's operations 117 | /// are delegated. 118 | private var impl: BreakCursorImpl 119 | 120 | /// The text being scanned by the cursor. 121 | public var text: String?
{ 122 | get { return impl.text } 123 | set { impl.text = newValue } 124 | } 125 | 126 | /// The most recently returned text boundary. 127 | public var index: String.Index? { 128 | return impl.index 129 | } 130 | 131 | /// The locale used to determine the language rules for text breaking. 132 | public let locale: String? 133 | 134 | /// Creates a new line break cursor for the given text and locale. 135 | /// 136 | /// - Parameters: 137 | /// - text: The optional initial text that the cursor will scan. 138 | /// - locale: The locale used to determine the language rules for text 139 | /// breaking. 140 | public init(text: String? = nil, locale: String? = nil) { 141 | self.locale = locale 142 | self.impl = BreakCursorImpl( 143 | type: UBRK_LINE, 144 | text: text, 145 | locale: locale, 146 | ruleStatusFactory: RuleStatus.init) 147 | } 148 | 149 | deinit { 150 | impl.release() 151 | } 152 | 153 | /// Returns the start index of the text being scanned. 154 | /// 155 | /// This method also adjusts the cursor such that its `index` is equal to the 156 | /// text's starting index. 157 | /// 158 | /// - Returns: The start index of the text being scanned. 159 | public func first() -> String.Index { 160 | return impl.first() 161 | } 162 | 163 | /// Returns the index past the last character of the text being scanned. 164 | /// 165 | /// This method also adjusts the cursor such that its `index` is equal to the 166 | /// index past the last character of the text. 167 | /// 168 | /// - Returns: The index past the last character of the text being scanned. 169 | public func last() -> String.Index { 170 | return impl.last() 171 | } 172 | 173 | /// Returns the index of the boundary following the current boundary in the 174 | /// text. 175 | /// 176 | /// This method adjusts the cursor such that its `index` is equal to the 177 | /// position of the next boundary, or `nil` if all boundaries have been 178 | /// returned.
179 | /// 180 | /// - Returns: The index of the next boundary in the text, or nil if all 181 | /// boundaries have been returned. 182 | public func next() -> String.Index? { 183 | return impl.next() 184 | } 185 | 186 | /// Returns the index of the boundary preceding the current boundary in the 187 | /// text. 188 | /// 189 | /// This method adjusts the cursor such that its `index` is equal to the 190 | /// position of the previous boundary, or `nil` if all boundaries have been 191 | /// returned. 192 | /// 193 | /// - Returns: The index of the previous boundary in the text, or nil if all 194 | /// boundaries have been returned. 195 | public func previous() -> String.Index? { 196 | return impl.previous() 197 | } 198 | 199 | /// Returns the first index greater than `index` at which a boundary occurs. 200 | /// 201 | /// This method adjusts the cursor such that its `index` is equal to the 202 | /// boundary position if one was found, or `nil` if there were no boundaries 203 | /// after `index`. 204 | /// 205 | /// - Parameter index: The index at which scanning should begin. 206 | /// - Returns: The index of the first boundary following `index`, or nil if no 207 | /// boundaries were found. 208 | public func moveToIndex(following index: String.Index) -> String.Index? { 209 | return impl.moveToIndex(following: index) 210 | } 211 | 212 | /// Returns the first index less than `index` at which a boundary occurs. 213 | /// 214 | /// This method adjusts the cursor such that its `index` is equal to the 215 | /// boundary position if one was found, or `nil` if there were no boundaries 216 | /// before `index`. 217 | /// 218 | /// - Parameter index: The index at which the scanning should begin. 219 | /// - Returns: The index of the first boundary preceding `index`, or nil if no 220 | /// boundaries were found. 221 | public func moveToIndex(preceding index: String.Index) -> String.Index?
{ 222 | return impl.moveToIndex(preceding: index) 223 | } 224 | 225 | /// Returns true if the given index represents a boundary position in the 226 | /// cursor's text, also moving the cursor to the first boundary at or 227 | /// following that index. 228 | /// 229 | /// - Parameter index: The index to check. 230 | /// - Returns: True if the given index is a boundary position. 231 | public func isBoundary(movingToOrAfter index: String.Index) -> Bool { 232 | return impl.isBoundary(movingToOrAfter: index) 233 | } 234 | } 235 | 236 | -------------------------------------------------------------------------------- /Sources/ICU/SentenceBreakCursor.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | /// Provides an interface for efficiently locating sentence boundaries in a text 18 | /// string. 19 | /// 20 | /// ICU defines the rule status tags for sentence breaks as _ranges_, which 21 | /// allows future versions to subdivide tag groups into finer subcategories. 22 | /// Because of this, they should not (and cannot) be compared using `==`.
23 | /// Instead, use pattern matching (i.e., `if case`, `switch`, or the `~=` 24 | /// operator directly) to determine if a status tag belongs to one of the known 25 | /// categories: 26 | /// 27 | /// ```swift 28 | /// if case .separator = cursor.ruleStatus { 29 | /// // do something 30 | /// } 31 | /// ``` 32 | /// 33 | /// **Terminology note:** The name "cursor" has been chosen instead of 34 | /// "iterator" to better map to the concepts found in Swift. Swift's iterator 35 | /// types provide unidirectional traversal of a sequence; by contrast, a 36 | /// "cursor" more accurately describes this type, which can move forward and 37 | /// backward arbitrarily within a string. 38 | public final class SentenceBreakCursor { 39 | 40 | /// Represents the status tag from a break rule. 41 | public struct RuleStatus { 42 | 43 | /// The raw integer value of the status tag. 44 | public let rawValue: Int 45 | 46 | /// Creates a new `RuleStatus` value with the given integer value. 47 | /// 48 | /// - Parameter rawValue: The integer value of the status tag. 49 | init(rawValue: Int) { 50 | self.rawValue = rawValue 51 | } 52 | } 53 | 54 | /// Represents a range of status tags from a break rule. 55 | /// 56 | /// The rule status tags for sentence breaks are meant to be treated as ranges, 57 | /// not exact values; this allows future versions of the library to further 58 | /// subdivide a break rule's tags. Thus, `RuleStatus` values should not be 59 | /// compared using equality, but instead use the `~=` operator (directly, or 60 | /// through pattern matching): 61 | /// 62 | /// ```swift 63 | /// if case .separator = cursor.ruleStatus { 64 | /// // do something 65 | /// } 66 | /// ``` 67 | public struct RuleStatusRange { 68 | 69 | /// The range of integer values. 70 | private let range: Range 71 | 72 | /// Creates a new rule status range from a range of unsigned integers. 73 | /// 74 | /// - Parameter range: The range of unsigned integers.
75 | private init(_ range: Range) { 76 | self.range = Int(range.lowerBound).. Bool { 103 | return range.range.contains(ruleStatus.rawValue) 104 | } 105 | } 106 | 107 | /// The status tag from the break rule that determined the most recently 108 | /// returned break position. 109 | /// 110 | /// If more than one break rule applied at the current position, then the 111 | /// numerically largest status tag is returned. 112 | public var ruleStatus: RuleStatus { 113 | return impl.ruleStatus 114 | } 115 | 116 | /// The status tags from the break rules that determined the most recently 117 | /// returned break position. 118 | public var ruleStatuses: [RuleStatus] { 119 | return impl.ruleStatuses 120 | } 121 | 122 | /// The actual break cursor implementation to which this class's operations 123 | /// are delegated. 124 | private var impl: BreakCursorImpl 125 | 126 | /// The text being scanned by the cursor. 127 | public var text: String? { 128 | get { return impl.text } 129 | set { impl.text = newValue } 130 | } 131 | 132 | /// The most recently returned text boundary. 133 | public var index: String.Index? { 134 | return impl.index 135 | } 136 | 137 | /// The locale used to determine the language rules for text breaking. 138 | public let locale: String? 139 | 140 | /// Creates a new sentence break cursor for the given text and locale. 141 | /// 142 | /// - Parameters: 143 | /// - text: The optional initial text that the cursor will scan. 144 | /// - locale: The locale used to determine the language rules for text 145 | /// breaking. 146 | public init(text: String? = nil, locale: String? = nil) { 147 | self.locale = locale 148 | self.impl = BreakCursorImpl( 149 | type: UBRK_SENTENCE, 150 | text: text, 151 | locale: locale, 152 | ruleStatusFactory: RuleStatus.init) 153 | } 154 | 155 | deinit { 156 | impl.release() 157 | } 158 | 159 | /// Returns the start index of the text being scanned.
160 | /// 161 | /// This method also adjusts the cursor such that its `index` is equal to the 162 | /// text's starting index. 163 | /// 164 | /// - Returns: The start index of the text being scanned. 165 | public func first() -> String.Index { 166 | return impl.first() 167 | } 168 | 169 | /// Returns the index past the last character of the text being scanned. 170 | /// 171 | /// This method also adjusts the cursor such that its `index` is equal to the 172 | /// index past the last character of the text. 173 | /// 174 | /// - Returns: The index past the last character of the text being scanned. 175 | public func last() -> String.Index { 176 | return impl.last() 177 | } 178 | 179 | /// Returns the index of the boundary following the current boundary in the 180 | /// text. 181 | /// 182 | /// This method adjusts the cursor such that its `index` is equal to the 183 | /// position of the next boundary, or `nil` if all boundaries have been 184 | /// returned. 185 | /// 186 | /// - Returns: The index of the next boundary in the text, or nil if all 187 | /// boundaries have been returned. 188 | public func next() -> String.Index? { 189 | return impl.next() 190 | } 191 | 192 | /// Returns the index of the boundary preceding the current boundary in the 193 | /// text. 194 | /// 195 | /// This method adjusts the cursor such that its `index` is equal to the 196 | /// position of the previous boundary, or `nil` if all boundaries have been 197 | /// returned. 198 | /// 199 | /// - Returns: The index of the previous boundary in the text, or nil if all 200 | /// boundaries have been returned. 201 | public func previous() -> String.Index? { 202 | return impl.previous() 203 | } 204 | 205 | /// Returns the first index greater than `index` at which a boundary occurs. 206 | /// 207 | /// This method adjusts the cursor such that its `index` is equal to the 208 | /// boundary position if one was found, or `nil` if there were no boundaries 209 | /// after `index`.
210 | /// 211 | /// - Parameter index: The index at which scanning should begin. 212 | /// - Returns: The index of the first boundary following `index`, or nil if no 213 | /// boundaries were found. 214 | public func moveToIndex(following index: String.Index) -> String.Index? { 215 | return impl.moveToIndex(following: index) 216 | } 217 | 218 | /// Returns the first index less than `index` at which a boundary occurs. 219 | /// 220 | /// This method adjusts the cursor such that its `index` is equal to the 221 | /// boundary position if one was found, or `nil` if there were no boundaries 222 | /// before `index`. 223 | /// 224 | /// - Parameter index: The index at which the scanning should begin. 225 | /// - Returns: The index of the first boundary preceding `index`, or nil if no 226 | /// boundaries were found. 227 | public func moveToIndex(preceding index: String.Index) -> String.Index? { 228 | return impl.moveToIndex(preceding: index) 229 | } 230 | 231 | /// Returns true if the given index represents a boundary position in the 232 | /// cursor's text, also moving the cursor to the first boundary at or 233 | /// following that index. 234 | /// 235 | /// - Parameter index: The index to check. 236 | /// - Returns: True if the given index is a boundary position. 237 | public func isBoundary(movingToOrAfter index: String.Index) -> Bool { 238 | return impl.isBoundary(movingToOrAfter: index) 239 | } 240 | } 241 | 242 | -------------------------------------------------------------------------------- /Sources/ICU/RuleBasedBreakCursor.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | /// Provides an interface for efficiently locating boundaries in a text string 18 | /// based on a set of custom rules described using a string-based encoding. 19 | /// 20 | /// The generic `RuleStatus` type argument itself is unconstrained, but through 21 | /// extensions this cursor can only be instantiated when `RuleStatus` is one of 22 | /// the following types, and instances of these types will be returned by the 23 | /// `ruleStatus` and `ruleStatuses` properties: 24 | /// 25 | /// 1. Anything conforming to `RawRepresentable` where `RawValue == Int` 26 | /// 1. `Int` itself 27 | /// 28 | /// In the first case, the `RuleStatus` type must have a value for all of the 29 | /// tags that you specify in your rules, and also for zero; missing values will 30 | /// result in a runtime failure. In the second case, querying `ruleStatus(es)` 31 | /// simply returns the raw integer. 32 | /// 33 | /// **Terminology note:** The name "cursor" has been chosen instead of 34 | /// "iterator" to better map to the concepts found in Swift. Swift's iterator 35 | /// types provide unidirectional traversal of a sequence; by contrast, a 36 | /// "cursor" more accurately describes this type, which can move forward and 37 | /// backward arbitrarily within a string. 38 | public final class RuleBasedBreakCursor { 39 | 40 | /// The text being scanned by the cursor. 41 | public var text: String?
{ 42 | get { return impl.text } 43 | set { impl.text = newValue } 44 | } 45 | 46 | /// The status tag from the break rule that determined the most recently 47 | /// returned break position. 48 | /// 49 | /// If more than one break rule applied at the current position, then the 50 | /// numerically largest status tag is returned. 51 | public var ruleStatus: RuleStatus { 52 | return impl.ruleStatus 53 | } 54 | 55 | /// The status tags from the break rules that determined the most recently 56 | /// returned break position. 57 | public var ruleStatuses: [RuleStatus] { 58 | return impl.ruleStatuses 59 | } 60 | 61 | /// The actual break cursor implementation to which this class's operations 62 | /// are delegated. 63 | private var impl: BreakCursorImpl 64 | 65 | /// The string representation of the break rules used by the cursor. 66 | public let rules: String 67 | 68 | /// A pointer to a copy of the UTF-16 code units in `rules`. 69 | private var rulesPointer: UnsafeMutableBufferPointer 70 | 71 | /// The most recently returned text boundary. 72 | public var index: String.Index? { 73 | return impl.index 74 | } 75 | 76 | /// This is the designated initializer, which is internal. It is called by 77 | /// the public convenience initializers, which allow public instantiation of 78 | /// `RuleBasedBreakCursor` for generic type arguments that satisfy specific 79 | /// constraints. 80 | /// 81 | /// - Parameters: 82 | /// - rules: The string representation of the break rules used by the 83 | /// cursor. 84 | /// - text: The optional initial text that the cursor will scan. 85 | /// - ruleStatusFactory: A function that converts the raw integer values of 86 | /// rule status tags into instances of the generic `RuleStatus` type. 87 | /// - Throws: `BreakRuleParseError` if an error occurs while parsing the 88 | /// rules.
internal init(
  rules: String,
  text: String?,
  ruleStatusFactory: @escaping (Int) -> RuleStatus
) throws {
  self.rules = rules
  // Both calls produce buffers that this class (or `impl`) must free later.
  self.rulesPointer = rules.unsafeUTF16CodeUnits()
  let textPointer = text?.unsafeUTF16CodeUnits()

  var parseError = UParseError()
  var error = UErrorCode()
  let cBreak = ubrk_openRules(
    rulesPointer.baseAddress!,
    Int32(truncatingIfNeeded: rulesPointer.count),
    textPointer?.baseAddress,
    Int32(truncatingIfNeeded: textPointer?.count ?? 0),
    &parseError,
    &error)

  guard error.isSuccess else {
    // `deinit` is not run when the initializer throws before `impl` has been
    // set, so the UTF-16 buffers created above must be freed here; otherwise
    // every failed rule parse would leak them.
    rulesPointer.deallocate()
    textPointer?.deallocate()

    let errorContext = ParseErrorContext(cValue: parseError)
    throw BreakRuleParseError(cValue: error, context: errorContext)
  }

  // From here on `textPointer` and `cBreak` are handed to `impl`, which is
  // presumably responsible for freeing them in `release()` (confirm against
  // `BreakCursorImpl`).
  impl = BreakCursorImpl(
    cBreak: cBreak!,
    text: text,
    textPointer: textPointer,
    ruleStatusFactory: ruleStatusFactory)
}

deinit {
  impl.release()
  rulesPointer.deallocate()
}

/// Returns the start index of the text being scanned.
///
/// This method also adjusts the cursor such that its `index` is equal to the
/// text's starting index.
///
/// - Returns: The start index of the text being scanned.
public func first() -> String.Index {
  return impl.first()
}

/// Returns the index past the last character of the text being scanned.
///
/// This method also adjusts the cursor such that its `index` is equal to the
/// index past the last character of the text.
///
/// - Returns: The index past the last character of the text being scanned.
public func last() -> String.Index {
  return impl.last()
}

/// Returns the index of the boundary following the current boundary in the
/// text.
147 | /// 148 | /// This method adjusts the cursor such that its `index` is equal to the 149 | /// position of the next boundary, or `nil` if all boundaries have been 150 | /// returned. 151 | /// 152 | /// - Returns: The index of the next boundary in the text, or nil if all 153 | /// boundaries have been returned. 154 | public func next() -> String.Index? { 155 | return impl.next() 156 | } 157 | 158 | /// Returns the index of the boundary preceding the current boundary in the 159 | /// text. 160 | /// 161 | /// This method adjusts the cursor such that its `index` is equal to the 162 | /// position of the previous boundary, or `nil` if all boundaries have been 163 | /// returned. 164 | /// 165 | /// - Returns: The index of the previous boundary in the text, or nil if all 166 | /// boundaries have been returned. 167 | public func previous() -> String.Index? { 168 | return impl.previous() 169 | } 170 | 171 | /// Returns the first index greater than `index` at which a boundary occurs. 172 | /// 173 | /// This method adjusts the cursor such that its `index` is equal to the 174 | /// boundary position if one was found, or `nil` if there were no boundaries 175 | /// after `index`. 176 | /// 177 | /// - Parameter index: The index at which scanning should begin. 178 | /// - Returns: The index of the first boundary following `index`, or nil if no 179 | /// boundaries were found. 180 | public func moveToIndex(following index: String.Index) -> String.Index? { 181 | return impl.moveToIndex(following: index) 182 | } 183 | 184 | /// Returns the first index less than `index` at which a boundary occurs. 185 | /// 186 | /// This method adjusts the cursor such that its `index` is equal to the 187 | /// boundary position if one was found, or `nil` if there were no boundaries 188 | /// before `index`. 189 | /// 190 | /// - Parameter index: The index at which the scanning should begin.
191 | /// - Returns: The index of the first boundary preceding `index`, or nil if no 192 | /// boundaries were found. 193 | public func moveToIndex(preceding index: String.Index) -> String.Index? { 194 | return impl.moveToIndex(preceding: index) 195 | } 196 | 197 | /// Returns true if the given index represents a boundary position in the 198 | /// cursor's text, also moving the cursor to the first boundary at or 199 | /// following that index. 200 | /// 201 | /// - Parameter index: The index to check. 202 | /// - Returns: True if the given index is a boundary position. 203 | public func isBoundary(movingToOrAfter index: String.Index) -> Bool { 204 | return impl.isBoundary(movingToOrAfter: index) 205 | } 206 | } 207 | 208 | extension RuleBasedBreakCursor 209 | where RuleStatus: RawRepresentable, RuleStatus.RawValue == Int { 210 | 211 | /// Creates a new rule-based break cursor with the given rules. Status tags are converted with `RuleStatus(rawValue:)` and force-unwrapped, so a tag without a matching value traps at runtime. 212 | /// 213 | /// - Parameters: 214 | /// - rules: The string representation of the break rules used by the 215 | /// cursor. 216 | /// - text: The optional initial text that the cursor will scan. 217 | /// - Throws: `BreakRuleParseError` if an error occurs while parsing the 218 | /// rules. 219 | public convenience init(rules: String, text: String? = nil) throws { 220 | try self.init(rules: rules, text: text) { RuleStatus(rawValue: $0)! } 221 | } 222 | } 223 | 224 | extension RuleBasedBreakCursor where RuleStatus == Int { 225 | 226 | /// Creates a new rule-based break cursor with the given rules. 227 | /// 228 | /// - Parameters: 229 | /// - rules: The string representation of the break rules used by the 230 | /// cursor. 231 | /// - text: The optional initial text that the cursor will scan. 232 | /// - Throws: `BreakRuleParseError` if an error occurs while parsing the 233 | /// rules. 234 | public convenience init(rules: String, text: String?
= nil) throws { 235 | try self.init(rules: rules, text: text) { $0 } 236 | } 237 | } 238 | -------------------------------------------------------------------------------- /Sources/ICU/GeneralCategory.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension Unicode { 18 | 19 | /// The general category types of Unicode scalars. 20 | public enum GeneralCategory: ConvertibleFromUnicodeIntProperty { 21 | 22 | /// An uppercase letter (abbreviated Lu). 23 | case uppercaseLetter 24 | 25 | /// A lowercase letter (abbreviated Ll). 26 | case lowercaseLetter 27 | 28 | /// A digraph character whose first part is uppercase (abbreviated Lt). 29 | case titlecaseLetter 30 | 31 | /// A modifier letter (abbreviated Lm). 32 | case modifierLetter 33 | 34 | /// Other letters, including syllables and ideographs (abbreviated Lo). 35 | case otherLetter 36 | 37 | /// A non-spacing combining mark with zero advance width (abbreviated Mn). 38 | case nonspacingMark 39 | 40 | /// A spacing combining mark with positive advance width (abbreviated Mc). 41 | case spacingMark 42 | 43 | /// An enclosing combining mark (abbreviated Me). 44 | case enclosingMark 45 | 46 | /// A decimal digit (abbreviated Nd). 47 | case decimalNumber 48 | 49 | /// A letter-like numeric character (abbreviated Nl).
50 | case letterNumber 51 | 52 | /// A numeric character of another type (abbreviated No). 53 | case otherNumber 54 | 55 | /// A connecting punctuation mark like a tie (abbreviated Pc). 56 | case connectorPunctuation 57 | 58 | /// A dash or hyphen punctuation mark (abbreviated Pd). 59 | case dashPunctuation 60 | 61 | /// An opening punctuation mark of a pair (abbreviated Ps). 62 | case openPunctuation 63 | 64 | /// A closing punctuation mark of a pair (abbreviated Pe). 65 | case closePunctuation 66 | 67 | /// An initial quotation mark (abbreviated Pi). 68 | case initialPunctuation 69 | 70 | /// A final quotation mark (abbreviated Pf). 71 | case finalPunctuation 72 | 73 | /// A punctuation mark of another type (abbreviated Po). 74 | case otherPunctuation 75 | 76 | /// A symbol of mathematical use (abbreviated Sm). 77 | case mathSymbol 78 | 79 | /// A currency sign (abbreviated Sc). 80 | case currencySymbol 81 | 82 | /// A non-letterlike modifier symbol (abbreviated Sk). 83 | case modifierSymbol 84 | 85 | /// A symbol of another type (abbreviated So). 86 | case otherSymbol 87 | 88 | /// A space character of non-zero width (abbreviated Zs). 89 | case spaceSeparator 90 | 91 | /// A line separator, which is specifically (and only) U+2028 LINE SEPARATOR 92 | /// (abbreviated Zl). 93 | case lineSeparator 94 | 95 | /// A paragraph separator, which is specifically (and only) U+2029 PARAGRAPH 96 | /// SEPARATOR (abbreviated Zp). 97 | case paragraphSeparator 98 | 99 | /// A C0 or C1 control code (abbreviated Cc). 100 | case control 101 | 102 | /// A format control character (abbreviated Cf). 103 | case format 104 | 105 | /// A surrogate code point (abbreviated Cs). 106 | case surrogate 107 | 108 | /// A private-use character (abbreviated Co). 109 | case privateUse 110 | 111 | /// A reserved unassigned code point or a non-character (abbreviated Cn).
112 | case unassigned 113 | 114 | /// Indicates whether the general category is in the grouping that 115 | /// represents a cased letter: Lu, Ll, and Lt. 116 | /// 117 | /// This grouping is abbreviated LC in the Unicode standard. 118 | var isCasedLetter: Bool { 119 | switch self { 120 | case .uppercaseLetter, .lowercaseLetter, .titlecaseLetter: return true 121 | default: return false 122 | } 123 | } 124 | 125 | /// Indicates whether the general category is in the grouping that 126 | /// represents a letter: Lu, Ll, Lt, Lm, and Lo. 127 | /// 128 | /// This grouping is abbreviated L in the Unicode standard. 129 | var isLetter: Bool { 130 | switch self { 131 | case .uppercaseLetter, 132 | .lowercaseLetter, 133 | .titlecaseLetter, 134 | .modifierLetter, 135 | .otherLetter: 136 | return true 137 | default: return false 138 | } 139 | } 140 | 141 | /// Indicates whether the general category is in the grouping that 142 | /// represents a mark: Mn, Mc, and Me. 143 | /// 144 | /// This grouping is abbreviated M in the Unicode standard. 145 | var isMark: Bool { 146 | switch self { 147 | case .nonspacingMark, .spacingMark, .enclosingMark: return true 148 | default: return false 149 | } 150 | } 151 | 152 | /// Indicates whether the general category is in the grouping that 153 | /// represents a number: Nd, Nl, and No. 154 | /// 155 | /// This grouping is abbreviated N in the Unicode standard. 156 | var isNumber: Bool { 157 | switch self { 158 | case .decimalNumber, .letterNumber, .otherNumber: return true 159 | default: return false 160 | } 161 | } 162 | 163 | /// Indicates whether the general category is in the grouping that 164 | /// represents punctuation: Pc, Pd, Ps, Pe, Pi, Pf, and Po. 165 | /// 166 | /// This grouping is abbreviated P in the Unicode standard.
167 | var isPunctuation: Bool { 168 | switch self { 169 | case .connectorPunctuation, 170 | .dashPunctuation, 171 | .openPunctuation, 172 | .closePunctuation, 173 | .initialPunctuation, 174 | .finalPunctuation, 175 | .otherPunctuation: 176 | return true 177 | default: return false 178 | } 179 | } 180 | 181 | /// Indicates whether the general category is in the grouping that 182 | /// represents symbols: Sm, Sc, Sk, and So. 183 | /// 184 | /// This grouping is abbreviated S in the Unicode standard. 185 | var isSymbol: Bool { 186 | switch self { 187 | case .mathSymbol, 188 | .currencySymbol, 189 | .modifierSymbol, 190 | .otherSymbol: 191 | return true 192 | default: return false 193 | } 194 | } 195 | 196 | /// Indicates whether the general category is in the grouping that 197 | /// represents separators: Zs, Zl, and Zp. 198 | /// 199 | /// This grouping is abbreviated Z in the Unicode standard. 200 | var isSeparator: Bool { 201 | switch self { 202 | case .spaceSeparator, .lineSeparator, .paragraphSeparator: return true 203 | default: return false 204 | } 205 | } 206 | 207 | /// Indicates whether the general category is in the grouping that 208 | /// represents the "Other" categories: Cc, Cf, Cs, Co, and Cn. 209 | /// 210 | /// This grouping is abbreviated C in the Unicode standard; note that it covers more than the C0/C1 control codes (Cc). 211 | var isControl: Bool { 212 | switch self { 213 | case .control, 214 | .format, 215 | .surrogate, 216 | .privateUse, 217 | .unassigned: 218 | return true 219 | default: return false 220 | } 221 | } 222 | 223 | /// Creates a new general category value from the given ICU C API value. 224 | /// 225 | /// - Parameter cValue: The ICU C API value. 226 | init?(cValue: UCharCategory) { 227 | switch cValue { 228 | // UNASSIGNED and GENERAL_OTHER_TYPES have the same numeric value (zero) 229 | // so we can't actually distinguish them even though Swift imports them as 230 | // distinct values.
NOTE(review): consequently this initializer never yields `.unassigned`; confirm that is intended. 231 | case U_UNASSIGNED, U_GENERAL_OTHER_TYPES: return nil 232 | case U_UPPERCASE_LETTER: self = .uppercaseLetter 233 | case U_LOWERCASE_LETTER: self = .lowercaseLetter 234 | case U_TITLECASE_LETTER: self = .titlecaseLetter 235 | case U_MODIFIER_LETTER: self = .modifierLetter 236 | case U_OTHER_LETTER: self = .otherLetter 237 | case U_NON_SPACING_MARK: self = .nonspacingMark 238 | case U_ENCLOSING_MARK: self = .enclosingMark 239 | case U_COMBINING_SPACING_MARK: self = .spacingMark 240 | case U_DECIMAL_DIGIT_NUMBER: self = .decimalNumber 241 | case U_LETTER_NUMBER: self = .letterNumber 242 | case U_OTHER_NUMBER: self = .otherNumber 243 | case U_SPACE_SEPARATOR: self = .spaceSeparator 244 | case U_LINE_SEPARATOR: self = .lineSeparator 245 | case U_PARAGRAPH_SEPARATOR: self = .paragraphSeparator 246 | case U_CONTROL_CHAR: self = .control 247 | case U_FORMAT_CHAR: self = .format 248 | case U_PRIVATE_USE_CHAR: self = .privateUse 249 | case U_SURROGATE: self = .surrogate 250 | case U_DASH_PUNCTUATION: self = .dashPunctuation 251 | case U_START_PUNCTUATION: self = .openPunctuation 252 | case U_END_PUNCTUATION: self = .closePunctuation 253 | case U_CONNECTOR_PUNCTUATION: self = .connectorPunctuation 254 | case U_OTHER_PUNCTUATION: self = .otherPunctuation 255 | case U_MATH_SYMBOL: self = .mathSymbol 256 | case U_CURRENCY_SYMBOL: self = .currencySymbol 257 | case U_MODIFIER_SYMBOL: self = .modifierSymbol 258 | case U_OTHER_SYMBOL: self = .otherSymbol 259 | case U_INITIAL_PUNCTUATION: self = .initialPunctuation 260 | case U_FINAL_PUNCTUATION: self = .finalPunctuation 261 | default: fatalError("Invalid UCharCategory value: \(cValue)") 262 | } 263 | } 264 | } 265 | } 266 | 267 | extension UnicodeScalar { 268 | 269 | /// The Unicode general category of the receiving scalar. 270 | public var generalCategory: Unicode.GeneralCategory?
{ 271 | return value(of: UCHAR_GENERAL_CATEGORY) 272 | } 273 | } 274 | -------------------------------------------------------------------------------- /Sources/ICU/WordBreakCursor.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | /// Provides an interface for efficiently locating word boundaries in a text 18 | /// string. 19 | /// 20 | /// ICU defines the rule status tags for word breaks as _ranges_, which allows 21 | /// future versions to subdivide tag groups into finer subcategories. Because of 22 | /// this, they should not (and cannot) be compared using `==`. Instead, use 23 | /// pattern matching (i.e., `if case`, `switch`, or the `~=` operator directly) 24 | /// to determine if a status tag belongs to one of the known categories: 25 | /// 26 | /// ```swift 27 | /// if case .letter = cursor.ruleStatus { 28 | /// // do something 29 | /// } 30 | /// ``` 31 | /// 32 | /// **Terminology note:** The name "cursor" has been chosen instead of 33 | /// "iterator" to better map to the concepts found in Swift. Swift's iterator 34 | /// types provide unidirectional traversal of a sequence; by contrast, a 35 | /// "cursor" more accurately describes this type, which can move forward and 36 | /// backward arbitrarily within a string. 
37 | public final class WordBreakCursor { 38 | 39 | /// Represents the status tag from a break rule. 40 | public struct RuleStatus { 41 | 42 | /// The raw integer value of the status tag. 43 | public let rawValue: Int 44 | 45 | /// Creates a new `RuleStatus` value with the given integer value. 46 | /// 47 | /// - Parameter rawValue: The integer value of the status tag. 48 | init(rawValue: Int) { 49 | self.rawValue = rawValue 50 | } 51 | } 52 | 53 | /// Represents a range of status tags from a break rule. 54 | /// 55 | /// The rule status tags for word breaks are meant to be treated as ranges, 56 | /// not exact values; this allows future versions of the library to further 57 | /// subdivide a break rule's tags. Thus, `RuleStatus` values should not be 58 | /// compared using equality, but instead use the `~=` operator (directly, or 59 | /// through pattern matching): 60 | /// 61 | /// ```swift 62 | /// if case .letter = cursor.ruleStatus { 63 | /// // do something 64 | /// } 65 | /// ``` 66 | public struct RuleStatusRange { 67 | 68 | /// The range of integer values. 69 | private let range: Range 70 | 71 | /// Creates a new rule status range from a range of unsigned integers. 72 | /// 73 | /// - Parameter range: The range of unsigned integers. 74 | private init(_ range: Range) { 75 | self.range = Int(range.lowerBound).. Bool { 110 | return range.range.contains(ruleStatus.rawValue) 111 | } 112 | } 113 | 114 | /// The status tag from the break rule that determined the most recently 115 | /// returned break position. 116 | /// 117 | /// If more than one break rule applied at the current position, then the 118 | /// numerically largest status tag is returned. 119 | public var ruleStatus: RuleStatus { 120 | return impl.ruleStatus 121 | } 122 | 123 | /// The status tags from the break rules that determined the most recently 124 | /// returned break position. 
125 | public var ruleStatuses: [RuleStatus] { 126 | return impl.ruleStatuses 127 | } 128 | 129 | /// The actual break cursor implementation to which this class's operations 130 | /// are delegated. 131 | private var impl: BreakCursorImpl 132 | 133 | /// The text being scanned by the cursor. 134 | public var text: String? { 135 | get { return impl.text } 136 | set { impl.text = newValue } 137 | } 138 | 139 | /// The most recently returned text boundary. 140 | public var index: String.Index? { 141 | return impl.index 142 | } 143 | 144 | /// The locale used to determine the language rules for text breaking. 145 | public let locale: String? 146 | 147 | /// Creates a new word break cursor with the given rules. 148 | /// 149 | /// - Parameters: 150 | /// - text: The optional initial text that the cursor will scan. 151 | /// - locale: The locale used to determine the language rules for text 152 | /// breaking. 153 | public init(text: String? = nil, locale: String? = nil) { 154 | self.locale = locale 155 | self.impl = BreakCursorImpl( 156 | type: UBRK_WORD, 157 | text: text, 158 | locale: locale, 159 | ruleStatusFactory: RuleStatus.init) 160 | } 161 | 162 | deinit { 163 | impl.release() 164 | } 165 | 166 | /// Returns the start index of the text being scanned. 167 | /// 168 | /// This method also adjusts the cursor such that its `index` is equal to the 169 | /// text's starting index. 170 | /// 171 | /// - Returns: The start index of the text being scanned. 172 | public func first() -> String.Index { 173 | return impl.first() 174 | } 175 | 176 | /// Returns the index past the last character of the text being scanned. 177 | /// 178 | /// This method also adjusts the cursor such that its `index` is equal to the 179 | /// index past the last character of the text. 180 | /// 181 | /// - Returns: The index past the last character of the text being scanned. 
182 | public func last() -> String.Index { 183 | return impl.last() 184 | } 185 | 186 | /// Returns the index of the boundary following the current boundary in the 187 | /// text. 188 | /// 189 | /// This method adjusts the cursor such that its `index` is equal to the 190 | /// position of the next boundary, or `nil` if all boundaries have been 191 | /// returned. 192 | /// 193 | /// - Returns: The index of the next boundary in the text, or nil if all 194 | /// boundaries have been returned. 195 | public func next() -> String.Index? { 196 | return impl.next() 197 | } 198 | 199 | /// Returns the index of the boundary preceding the current boundary in the 200 | /// text. 201 | /// 202 | /// This method adjusts the cursor such that its `index` is equal to the 203 | /// position of the previous boundary, or `nil` if all boundaries have been 204 | /// returned. 205 | /// 206 | /// - Returns: The index of the previous boundary in the text, or nil if all 207 | /// boundaries have been returned. 208 | public func previous() -> String.Index? { 209 | return impl.previous() 210 | } 211 | 212 | /// Returns the first index greater than `index` at which a boundary occurs. 213 | /// 214 | /// This method adjusts the cursor such that its `index` is equal to the 215 | /// boundary position if one was found, or `nil` if there were no boundaries 216 | /// after `index`. 217 | /// 218 | /// - Parameter index: The index at which scanning should begin. 219 | /// - Returns: The index of the first boundary following `index`, or nil if no 220 | /// boundaries were found. 221 | public func moveToIndex(following index: String.Index) -> String.Index? { 222 | return impl.moveToIndex(following: index) 223 | } 224 | 225 | /// Returns the first index less than `index` at which a boundary occurs. 226 | /// 227 | /// This method adjusts the cursor such that its `index` is equal to the 228 | /// boundary position if one was found, or `nil` if there were no boundaries 229 | /// before `index`. 
230 | /// 231 | /// - Parameter index: The index at which the scanning should begin. 232 | /// - Returns: The index of the first boundary preceding `index`, or nil if no 233 | /// boundaries were found. 234 | public func moveToIndex(preceding index: String.Index) -> String.Index? { 235 | return impl.moveToIndex(preceding: index) 236 | } 237 | 238 | /// Returns true if the given index represents a boundary position in the 239 | /// cursor's text, also moving the cursor to the first boundary at or 240 | /// following that index. 241 | /// 242 | /// - Parameter index: The index to check. 243 | /// - Returns: True if the given index is a boundary position. 244 | public func isBoundary(movingToOrAfter index: String.Index) -> Bool { 245 | return impl.isBoundary(movingToOrAfter: index) 246 | } 247 | } 248 | -------------------------------------------------------------------------------- /Sources/ICU/BreakCursorImpl.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | /// Provides the actual implementation of a break cursor. 
///
/// This type allows the concrete break cursor types to handle creation of the
/// ICU cursor object (using `ubrk_open` or `ubrk_openRules`) and the precise
/// type used to represent rule status tags but otherwise share the
/// implementation of the other operations.
internal struct BreakCursorImpl<RuleStatus> {

  /// The string being scanned by the cursor.
  ///
  /// Assigning a new value obtains a fresh UTF-16 buffer for the string, hands
  /// that buffer to the ICU break iterator, and then frees the previous
  /// buffer. Assigning `nil` detaches the iterator from any text.
  var text: String? {
    didSet {
      // Swap in the new buffer (or nil) before freeing the old one, so that
      // neither `textPointer` nor the ICU iterator is ever left referring to
      // freed memory.
      let stalePointer = textPointer
      textPointer = text?.unsafeUTF16CodeUnits()

      // This call is made even when `text` is nil: a null pointer with a
      // length of zero resets the iterator to empty text instead of leaving
      // it pointed at the buffer that is about to be released.
      // NOTE(review): the resulting error code is not inspected, matching the
      // previous behavior.
      var error = UErrorCode()
      ubrk_setText(
        cBreak,
        textPointer?.baseAddress,
        Int32(textPointer?.count ?? 0),
        &error)

      stalePointer?.deallocate()
    }
  }

  /// The most recently returned text boundary, or `nil` if the cursor has no
  /// text.
  var index: String.Index? {
    // Without text there is nothing to index into; report that as `nil`
    // rather than treating it as an invalid offset.
    guard text != nil else { return nil }

    let offset = ubrk_current(cBreak)
    guard let result = stringIndex(forUTF16Offset: offset) else {
      preconditionFailure("Break cursor index was not valid")
    }
    return result
  }

  /// The status tag from the break rule that determined the most recently
  /// returned break position.
  ///
  /// If more than one break rule applied at the current position, then the
  /// numerically largest status tag is returned.
  var ruleStatus: RuleStatus {
    return ruleStatusFactory(Int(ubrk_getRuleStatus(cBreak)))
  }

  /// The status tags from the break rules that determined the most recently
  /// returned break position.
  var ruleStatuses: [RuleStatus] {
    // Sizing pass: with no buffer and zero capacity, only the returned count
    // is of interest.
    var error = UErrorCode()
    let capacity = ubrk_getRuleStatusVec(cBreak, nil, 0, &error)

    var rawStatuses = [Int32](repeating: 0, count: Int(capacity))
    rawStatuses.withUnsafeMutableBufferPointer { buffer in
      // Start from a clean status; the sizing pass may have recorded an
      // overflow because it was given no room to write into.
      error = UErrorCode()
      _ = ubrk_getRuleStatusVec(cBreak, buffer.baseAddress, capacity, &error)
    }
    return rawStatuses.map { ruleStatusFactory(Int($0)) }
  }

  /// The pointer to the underlying C ICU break iterator object.
private var cBreak: OpaquePointer

/// The pointer to the UTF-16 code units of the text being scanned by the
/// break iterator.
private var textPointer: UnsafeMutableBufferPointer<UChar>?

/// The function that converts the integer value of a rule status tag to a
/// typed value.
private var ruleStatusFactory: (Int) -> RuleStatus

/// Creates a new break cursor implementation that wraps an already-created
/// ICU break iterator.
///
/// - Parameters:
///   - cBreak: The ICU break iterator object.
///   - text: The text string to be iterated over.
///   - textPointer: A pointer to a copy of the UTF-16 code units of `text`.
///   - ruleStatusFactory: The function that converts the integer value of a
///     rule status tag to a typed value.
init(
  cBreak: OpaquePointer,
  text: String?,
  textPointer: UnsafeMutableBufferPointer<UChar>?,
  ruleStatusFactory: @escaping (Int) -> RuleStatus
) {
  self.cBreak = cBreak
  self.text = text
  self.textPointer = textPointer
  self.ruleStatusFactory = ruleStatusFactory
}

/// Creates a new break cursor of the given type.
///
/// - Parameters:
///   - type: The ICU break iterator type.
///   - text: The text string to be iterated over.
///   - locale: The locale used to determine the language rules for text
///     breaking. If nil, ICU's default locale is used.
///   - ruleStatusFactory: The function that converts the integer value of a
///     rule status tag to a typed value.
init(
  type: UBreakIteratorType,
  text: String?,
  locale: String?,
  ruleStatusFactory: @escaping (Int) -> RuleStatus
) {
  let textPointer = text?.unsafeUTF16CodeUnits()

  var error = UErrorCode()
  let openedBreak = ubrk_open(
    type,
    locale ?? String(cString: uloc_getDefault()),
    textPointer?.baseAddress,
    Int32(truncatingIfNeeded: textPointer?.count ?? 0),
    &error)

  // Failing to open the iterator is still fatal, but report the ICU error
  // code instead of trapping on an anonymous force-unwrap.
  guard let cBreak = openedBreak else {
    preconditionFailure("ubrk_open did not return a break iterator: \(error)")
  }

  self.init(
    cBreak: cBreak,
    text: text,
    textPointer: textPointer,
    ruleStatusFactory: ruleStatusFactory)
}

/// Closes the underlying ICU break iterator and frees the memory used to
/// manage the current copy of the text.
func release() {
  ubrk_close(cBreak)
  textPointer?.deallocate()
}

/// Returns the start index of the text being scanned.
///
/// This method also adjusts the cursor such that its `index` is equal to the
/// text's starting index.
///
/// - Precondition: The cursor has text to scan.
/// - Returns: The start index of the text being scanned.
func first() -> String.Index {
  let result = ubrk_first(cBreak)
  guard let index = stringIndex(forUTF16Offset: result) else {
    preconditionFailure("Break cursor start index was not valid")
  }
  return index
}

/// Returns the index past the last character of the text being scanned.
///
/// This method also adjusts the cursor such that its `index` is equal to the
/// index past the last character of the text.
///
/// - Precondition: The cursor has text to scan.
/// - Returns: The index past the last character of the text being scanned.
func last() -> String.Index {
  let result = ubrk_last(cBreak)
  guard let index = stringIndex(forUTF16Offset: result) else {
    preconditionFailure("Break cursor end index was not valid")
  }
  return index
}

/// Returns the index of the boundary following the current boundary in the
/// text.
///
/// This method adjusts the cursor such that its `index` is equal to the
/// position of the next boundary, or `nil` if all boundaries have been
/// returned.
///
/// - Returns: The index of the next boundary in the text, or nil if all
///   boundaries have been returned.
func next() -> String.Index? {
  let result = ubrk_next(cBreak)
  guard result != UBRK_DONE else { return nil }
  return stringIndex(forUTF16Offset: result)
}

/// Returns the index of the boundary preceding the current boundary in the
/// text.
///
/// This method adjusts the cursor such that its `index` is equal to the
/// position of the previous boundary, or `nil` if all boundaries have been
/// returned.
///
/// - Returns: The index of the previous boundary in the text, or nil if all
///   boundaries have been returned.
func previous() -> String.Index? {
  let result = ubrk_previous(cBreak)
  guard result != UBRK_DONE else { return nil }
  return stringIndex(forUTF16Offset: result)
}

/// Returns the first index greater than `index` at which a boundary occurs.
///
/// This method adjusts the cursor such that its `index` is equal to the
/// boundary position if one was found, or `nil` if there were no boundaries
/// after `index`.
///
/// - Parameter index: The index at which scanning should begin.
/// - Returns: The index of the first boundary following `index`, or nil if no
///   boundaries were found.
func moveToIndex(following index: String.Index) -> String.Index? {
  let offset = utf16Offset(forStringIndex: index)
  let result = ubrk_following(cBreak, offset)
  // Compare against the break iterator's own sentinel, as `next()` and
  // `previous()` do, rather than the string-search sentinel.
  guard result != UBRK_DONE else { return nil }
  return stringIndex(forUTF16Offset: result)
}

/// Returns the first index less than `index` at which a boundary occurs.
///
/// This method adjusts the cursor such that its `index` is equal to the
/// boundary position if one was found, or `nil` if there were no boundaries
/// before `index`.
///
/// - Parameter index: The index at which the scanning should begin.
/// - Returns: The index of the first boundary preceding `index`, or nil if no
///   boundaries were found.
func moveToIndex(preceding index: String.Index) -> String.Index? {
  let offset = utf16Offset(forStringIndex: index)
  let result = ubrk_preceding(cBreak, offset)
  // Compare against the break iterator's own sentinel, as `next()` and
  // `previous()` do, rather than the string-search sentinel.
  guard result != UBRK_DONE else { return nil }
  return stringIndex(forUTF16Offset: result)
}

/// Returns true if the given index represents a boundary position in the
/// cursor's text, also moving the cursor to the first boundary at or
/// following that index.
///
/// - Parameter index: The index to check.
/// - Returns: True if the given index is a boundary position.
func isBoundary(movingToOrAfter index: String.Index) -> Bool {
  let offset = utf16Offset(forStringIndex: index)
  // ICU reports the answer as a C boolean; any nonzero value means "true".
  return ubrk_isBoundary(cBreak, offset) != 0
}

/// Returns the index in the string being searched that corresponds to the
/// given offset in its UTF-16 code units.
///
/// - Parameter offset: An integer offset in the UTF-16 code units of the
///   string being searched.
/// - Returns: The corresponding string index, or nil if the cursor has no
///   text or the offset does not correspond to a valid position in the
///   string.
private func stringIndex(forUTF16Offset offset: Int32) -> String.Index? {
  guard let text = text else { return nil }
  let utf16 = text.utf16
  let utf16Index = utf16.index(utf16.startIndex, offsetBy: Int(offset))
  return utf16Index.samePosition(in: text)
}

/// Returns the offset in UTF-16 code units corresponding to an index in the
/// string being searched.
///
/// - Parameter index: An index into the string being searched.
/// - Returns: The integer offset in UTF-16 code units corresponding to the
///   index.
257 | private func utf16Offset(forStringIndex index: String.Index) -> Int32 { 258 | guard let text = text else { return 0 } 259 | let utf16 = text.utf16 260 | guard let utf16Index = index.samePosition(in: utf16) else { 261 | preconditionFailure("String index must be UTF-16 aligned") 262 | } 263 | let offset = utf16.distance(from: utf16.startIndex, to: utf16Index) 264 | return Int32(truncatingIfNeeded: offset) 265 | } 266 | } 267 | -------------------------------------------------------------------------------- /Sources/ICU/SearchCursor.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | /// Provides an interface for efficiently searching a text string for matches of 18 | /// a pattern in both forward and backward directions. 19 | /// 20 | /// The name "cursor" has been chosen instead of "iterator" to better map to the 21 | /// concepts found in Swift. Swift's iterator types provide unidirectional 22 | /// traversal of a sequence; by contrast, a "cursor" more accurately describes 23 | /// this type, which can move forward and backward arbitrarily within a string. 24 | /// 25 | /// TODO(allevato): Support collators, search attributes, and break iterators. 26 | public final class SearchCursor { 27 | 28 | /// The string being searched by the cursor. 
29 | /// 30 | /// Setting this property causes the cursor's position to be reset to the 31 | /// beginning. This allows a cursor to be reüsed to search for the same 32 | /// pattern in multiple text strings. 33 | public var text: String { 34 | didSet { 35 | textPointer.deallocate() 36 | textPointer = text.unsafeUTF16CodeUnits() 37 | 38 | var error = UErrorCode() 39 | usearch_setText( 40 | cSearch, textPointer.baseAddress!, Int32(textPointer.count), &error) 41 | } 42 | } 43 | 44 | /// The pattern used for matching. 45 | /// 46 | /// Setting this property causes the cursor's internal data structures (like 47 | /// the Boyer-Moore table) to be recalculated, but the cursor's position is 48 | /// unchanged. 49 | public var pattern: String { 50 | didSet { 51 | patternPointer.deallocate() 52 | patternPointer = pattern.unsafeUTF16CodeUnits() 53 | 54 | var error = UErrorCode() 55 | usearch_setPattern( 56 | cSearch, 57 | patternPointer.baseAddress!, 58 | Int32(patternPointer.count), 59 | &error) 60 | } 61 | } 62 | 63 | private let cSearch: OpaquePointer 64 | private var patternPointer: UnsafeMutableBufferPointer 65 | private var textPointer: UnsafeMutableBufferPointer 66 | 67 | /// The current index in the string being searched. 68 | /// 69 | /// Setting this property sets the index at which the next search will begin. 70 | public var index: String.Index? { 71 | get { 72 | let offset = usearch_getOffset(cSearch) 73 | guard offset != USEARCH_DONE else { return nil } 74 | guard let result = stringIndex(forUTF16Offset: offset) else { 75 | preconditionFailure("Search index was not valid") 76 | } 77 | return result 78 | } 79 | set { 80 | guard let newValue = newValue else { 81 | preconditionFailure("New value for index may not be nil") 82 | } 83 | var error = UErrorCode() 84 | let position = utf16Offset(forStringIndex: newValue) 85 | usearch_setOffset(cSearch, position, &error) 86 | } 87 | } 88 | 89 | /// The substring representing the cursor's current match. 
90 | public var matchedText: Substring? { 91 | let utf16Start = usearch_getMatchedStart(cSearch) 92 | let utf16Length = usearch_getMatchedLength(cSearch) 93 | guard let start = stringIndex(forUTF16Offset: utf16Start), 94 | let end = stringIndex(forUTF16Offset: utf16Start + utf16Length) else { 95 | preconditionFailure( 96 | "ICU unexpectedly returned an index that was not UTF-16 aligned") 97 | } 98 | return text[start.. String.Index? { 150 | var error = UErrorCode() 151 | let result = usearch_first(cSearch, &error) 152 | guard error.isSuccess && result != USEARCH_DONE else { return nil } 153 | return stringIndex(forUTF16Offset: result) 154 | } 155 | 156 | /// Returns the last index at which the text matches the search pattern. 157 | /// 158 | /// This method adjusts the cursor such that its `index` is equal to the match 159 | /// position if the pattern was found, or `nil` if it was not found. 160 | /// 161 | /// - Returns: The last index at which the text matches the search pattern, 162 | /// or nil if no match was found. 163 | public func last() -> String.Index? { 164 | var error = UErrorCode() 165 | let result = usearch_last(cSearch, &error) 166 | guard error.isSuccess && result != USEARCH_DONE else { return nil } 167 | return stringIndex(forUTF16Offset: result) 168 | } 169 | 170 | /// Returns the index of the next point where the text matches the search 171 | /// pattern, starting from the current position. 172 | /// 173 | /// This method adjusts the cursor such that its `index` is equal to the match 174 | /// position if the pattern was found, or `nil` if it was not found. 175 | /// 176 | /// - Returns: The index of the next point where the text matches the search 177 | /// pattern, or nil if no match was found. 178 | public func next() -> String.Index? 
{ 179 | var error = UErrorCode() 180 | let result = usearch_next(cSearch, &error) 181 | guard error.isSuccess && result != USEARCH_DONE else { return nil } 182 | return stringIndex(forUTF16Offset: result) 183 | } 184 | 185 | /// Returns the index of the previous point where the text matches the search 186 | /// pattern, starting from the current position. 187 | /// 188 | /// This method adjusts the cursor such that its `index` is equal to the match 189 | /// position if the pattern was found, or `nil` if it was not found. 190 | /// 191 | /// - Returns: The index of the previous point where the text matches the 192 | /// search pattern, or nil if no match was found. 193 | public func previous() -> String.Index? { 194 | var error = UErrorCode() 195 | let result = usearch_previous(cSearch, &error) 196 | guard error.isSuccess && result != USEARCH_DONE else { return nil } 197 | return stringIndex(forUTF16Offset: result) 198 | } 199 | 200 | /// Returns the first index equal to or greater than `index` at which the 201 | /// text matches the search pattern. 202 | /// 203 | /// This method adjusts the cursor such that its `index` is equal to the match 204 | /// position if the pattern was found, or `nil` if it was not found. 205 | /// 206 | /// - Parameter index: The index at which the search should be started. 207 | /// - Returns: The index of the first match following `index`, or nil if no 208 | /// match was found. 209 | public func moveToIndex(following index: String.Index) -> String.Index? { 210 | var error = UErrorCode() 211 | let offset = utf16Offset(forStringIndex: index) 212 | let result = usearch_following(cSearch, offset, &error) 213 | guard error.isSuccess && result != USEARCH_DONE else { return nil } 214 | return stringIndex(forUTF16Offset: result) 215 | } 216 | 217 | /// Returns the first index equal to or less than `index` at which the text 218 | /// matches the search pattern. 
219 | /// 220 | /// This method adjusts the cursor such that its `index` is equal to the match 221 | /// position if the pattern was found, or `nil` if it was not found. 222 | /// 223 | /// - Parameter index: The index at which the search should be started. 224 | /// - Returns: The index of the first match preceding `index`, or nil if no 225 | /// match was found. 226 | public func moveToIndex(preceding index: String.Index) -> String.Index? { 227 | var error = UErrorCode() 228 | let offset = utf16Offset(forStringIndex: index) 229 | let result = usearch_preceding(cSearch, offset, &error) 230 | guard error.isSuccess && result != USEARCH_DONE else { return nil } 231 | return stringIndex(forUTF16Offset: result) 232 | } 233 | 234 | /// Returns the index in the string being searched that corresponds to the 235 | /// given offset in its UTF-16 code units. 236 | /// 237 | /// - Parameters offset: An integer offset in the UTF-16 code units of the 238 | /// string being searched. 239 | /// - Returns: The corresponding string index. 240 | private func stringIndex(forUTF16Offset offset: Int32) -> String.Index? { 241 | let utf16 = text.utf16 242 | return utf16.index( 243 | utf16.startIndex, 244 | offsetBy: Int(offset) 245 | ).samePosition(in: text) 246 | } 247 | 248 | /// Returns the offset in UTF-16 code units corresponding to an index in the 249 | /// string being searched. 250 | /// 251 | /// - Parameter index: An index into the string being searched. 252 | /// - Returns: The integer offset in UTF-16 code units corresponding to the 253 | /// index. 
254 | private func utf16Offset(forStringIndex index: String.Index) -> Int32 { 255 | let utf16 = text.utf16 256 | guard let utf16Index = index.samePosition(in: utf16) else { 257 | preconditionFailure("String index must be UTF-16 aligned") 258 | } 259 | let offset = utf16.distance(from: utf16.startIndex, to: utf16Index) 260 | return Int32(truncatingIfNeeded: offset) 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /Sources/ICU/JoiningGroup.swift: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Tony Allevato. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | import ICU4C 16 | 17 | extension Unicode { 18 | 19 | /// Shaping group subdivisions of dual-joining and right-joining Arabic 20 | /// characters based on the behavior of their letter skeletons when shaped in 21 | /// context. 
22 | public enum JoiningGroup: ConvertibleFromUnicodeIntProperty { 23 | 24 | case ain 25 | case alaph 26 | case alef 27 | case beh 28 | case beth 29 | case dal 30 | case dalathRish 31 | case e 32 | case feh 33 | case finalSemkath 34 | case gaf 35 | case gamal 36 | case hah 37 | case tehMarbutaGoal 38 | case hamzaOnHehGoal 39 | case he 40 | case heh 41 | case hehGoal 42 | case heth 43 | case kaf 44 | case kaph 45 | case knottedHeh 46 | case lam 47 | case lamadh 48 | case meem 49 | case mim 50 | case noon 51 | case nun 52 | case pe 53 | case qaf 54 | case qaph 55 | case reh 56 | case reversedPe 57 | case sad 58 | case sadhe 59 | case seen 60 | case semkath 61 | case shin 62 | case swashKaf 63 | case syriacWaw 64 | case tah 65 | case taw 66 | case tehMarbuta 67 | case teth 68 | case waw 69 | case yeh 70 | case yehBarree 71 | case yehWithTail 72 | case yudh 73 | case yudhHe 74 | case zain 75 | case fe 76 | case khaph 77 | case zhain 78 | case burushaskiYehBarree 79 | case farsiYeh 80 | case nya 81 | case rohingyaYeh 82 | case manichaeanAleph 83 | case manichaeanAyin 84 | case manichaeanBeth 85 | case manichaeanDaleth 86 | case manichaeanDhamedh 87 | case manichaeanFive 88 | case manichaeanGimel 89 | case manichaeanHeth 90 | case manichaeanHundred 91 | case manichaeanKaph 92 | case manichaeanLamedh 93 | case manichaeanMem 94 | case manichaeanNun 95 | case manichaeanOne 96 | case manichaeanPe 97 | case manichaeanQoph 98 | case manichaeanResh 99 | case manichaeanSadhe 100 | case manichaeanSamekh 101 | case manichaeanTaw 102 | case manichaeanTen 103 | case manichaeanTeth 104 | case manichaeanThamedh 105 | case manichaeanTwenty 106 | case manichaeanWaw 107 | case manichaeanYodh 108 | case manichaeanZayin 109 | case straightWaw 110 | case africanFeh 111 | case africanNoon 112 | case africanQaf 113 | 114 | /// The C API value of type `UJoiningGroup` that corresponds to the 115 | /// receiving enum case.
116 | var cValue: UJoiningGroup { 117 | switch self { 118 | case .ain: return U_JG_AIN 119 | case .alaph: return U_JG_ALAPH 120 | case .alef: return U_JG_ALEF 121 | case .beh: return U_JG_BEH 122 | case .beth: return U_JG_BETH 123 | case .dal: return U_JG_DAL 124 | case .dalathRish: return U_JG_DALATH_RISH 125 | case .e: return U_JG_E 126 | case .feh: return U_JG_FEH 127 | case .finalSemkath: return U_JG_FINAL_SEMKATH 128 | case .gaf: return U_JG_GAF 129 | case .gamal: return U_JG_GAMAL 130 | case .hah: return U_JG_HAH 131 | case .tehMarbutaGoal: return U_JG_TEH_MARBUTA_GOAL 132 | case .hamzaOnHehGoal: return U_JG_HAMZA_ON_HEH_GOAL 133 | case .he: return U_JG_HE 134 | case .heh: return U_JG_HEH 135 | case .hehGoal: return U_JG_HEH_GOAL 136 | case .heth: return U_JG_HETH 137 | case .kaf: return U_JG_KAF 138 | case .kaph: return U_JG_KAPH 139 | case .knottedHeh: return U_JG_KNOTTED_HEH 140 | case .lam: return U_JG_LAM 141 | case .lamadh: return U_JG_LAMADH 142 | case .meem: return U_JG_MEEM 143 | case .mim: return U_JG_MIM 144 | case .noon: return U_JG_NOON 145 | case .nun: return U_JG_NUN 146 | case .pe: return U_JG_PE 147 | case .qaf: return U_JG_QAF 148 | case .qaph: return U_JG_QAPH 149 | case .reh: return U_JG_REH 150 | case .reversedPe: return U_JG_REVERSED_PE 151 | case .sad: return U_JG_SAD 152 | case .sadhe: return U_JG_SADHE 153 | case .seen: return U_JG_SEEN 154 | case .semkath: return U_JG_SEMKATH 155 | case .shin: return U_JG_SHIN 156 | case .swashKaf: return U_JG_SWASH_KAF 157 | case .syriacWaw: return U_JG_SYRIAC_WAW 158 | case .tah: return U_JG_TAH 159 | case .taw: return U_JG_TAW 160 | case .tehMarbuta: return U_JG_TEH_MARBUTA 161 | case .teth: return U_JG_TETH 162 | case .waw: return U_JG_WAW 163 | case .yeh: return U_JG_YEH 164 | case .yehBarree: return U_JG_YEH_BARREE 165 | case .yehWithTail: return U_JG_YEH_WITH_TAIL 166 | case .yudh: return U_JG_YUDH 167 | case .yudhHe: return U_JG_YUDH_HE 168 | case .zain: return U_JG_ZAIN 169 | case .fe:
return U_JG_FE 170 | case .khaph: return U_JG_KHAPH 171 | case .zhain: return U_JG_ZHAIN 172 | case .burushaskiYehBarree: return U_JG_BURUSHASKI_YEH_BARREE 173 | case .farsiYeh: return U_JG_FARSI_YEH 174 | case .nya: return U_JG_NYA 175 | case .rohingyaYeh: return U_JG_ROHINGYA_YEH 176 | case .manichaeanAleph: return U_JG_MANICHAEAN_ALEPH 177 | case .manichaeanAyin: return U_JG_MANICHAEAN_AYIN 178 | case .manichaeanBeth: return U_JG_MANICHAEAN_BETH 179 | case .manichaeanDaleth: return U_JG_MANICHAEAN_DALETH 180 | case .manichaeanDhamedh: return U_JG_MANICHAEAN_DHAMEDH 181 | case .manichaeanFive: return U_JG_MANICHAEAN_FIVE 182 | case .manichaeanGimel: return U_JG_MANICHAEAN_GIMEL 183 | case .manichaeanHeth: return U_JG_MANICHAEAN_HETH 184 | case .manichaeanHundred: return U_JG_MANICHAEAN_HUNDRED 185 | case .manichaeanKaph: return U_JG_MANICHAEAN_KAPH 186 | case .manichaeanLamedh: return U_JG_MANICHAEAN_LAMEDH 187 | case .manichaeanMem: return U_JG_MANICHAEAN_MEM 188 | case .manichaeanNun: return U_JG_MANICHAEAN_NUN 189 | case .manichaeanOne: return U_JG_MANICHAEAN_ONE 190 | case .manichaeanPe: return U_JG_MANICHAEAN_PE 191 | case .manichaeanQoph: return U_JG_MANICHAEAN_QOPH 192 | case .manichaeanResh: return U_JG_MANICHAEAN_RESH 193 | case .manichaeanSadhe: return U_JG_MANICHAEAN_SADHE 194 | case .manichaeanSamekh: return U_JG_MANICHAEAN_SAMEKH 195 | case .manichaeanTaw: return U_JG_MANICHAEAN_TAW 196 | case .manichaeanTen: return U_JG_MANICHAEAN_TEN 197 | case .manichaeanTeth: return U_JG_MANICHAEAN_TETH 198 | case .manichaeanThamedh: return U_JG_MANICHAEAN_THAMEDH 199 | case .manichaeanTwenty: return U_JG_MANICHAEAN_TWENTY 200 | case .manichaeanWaw: return U_JG_MANICHAEAN_WAW 201 | case .manichaeanYodh: return U_JG_MANICHAEAN_YODH 202 | case .manichaeanZayin: return U_JG_MANICHAEAN_ZAYIN 203 | case .straightWaw: return U_JG_STRAIGHT_WAW 204 | case .africanFeh: return U_JG_AFRICAN_FEH 205 | case .africanNoon: return U_JG_AFRICAN_NOON 206 | case .africanQaf: return
U_JG_AFRICAN_QAF 207 | } 208 | } 209 | 210 | /// Creates a new value from the given ICU C API value, or `nil` if that value is `U_JG_NO_JOINING_GROUP`. 211 | /// 212 | /// - Parameter cValue: The ICU C API value. A value matched by none of the cases in the switch reaches the `default` branch and stops the program with `fatalError`. NOTE(review): ICU's `uchar.h` appears to define `U_JG_HAMZA_ON_HEH_GOAL` as an alias of `U_JG_TEH_MARBUTA_GOAL`; if so, the `.hamzaOnHehGoal` branch can never be selected here - confirm against the ICU headers in use. 213 | init?(cValue: UJoiningGroup) { 214 | switch cValue { 215 | case U_JG_NO_JOINING_GROUP: return nil 216 | case U_JG_AIN: self = .ain 217 | case U_JG_ALAPH: self = .alaph 218 | case U_JG_ALEF: self = .alef 219 | case U_JG_BEH: self = .beh 220 | case U_JG_BETH: self = .beth 221 | case U_JG_DAL: self = .dal 222 | case U_JG_DALATH_RISH: self = .dalathRish 223 | case U_JG_E: self = .e 224 | case U_JG_FEH: self = .feh 225 | case U_JG_FINAL_SEMKATH: self = .finalSemkath 226 | case U_JG_GAF: self = .gaf 227 | case U_JG_GAMAL: self = .gamal 228 | case U_JG_HAH: self = .hah 229 | case U_JG_TEH_MARBUTA_GOAL: self = .tehMarbutaGoal 230 | case U_JG_HAMZA_ON_HEH_GOAL: self = .hamzaOnHehGoal 231 | case U_JG_HE: self = .he 232 | case U_JG_HEH: self = .heh 233 | case U_JG_HEH_GOAL: self = .hehGoal 234 | case U_JG_HETH: self = .heth 235 | case U_JG_KAF: self = .kaf 236 | case U_JG_KAPH: self = .kaph 237 | case U_JG_KNOTTED_HEH: self = .knottedHeh 238 | case U_JG_LAM: self = .lam 239 | case U_JG_LAMADH: self = .lamadh 240 | case U_JG_MEEM: self = .meem 241 | case U_JG_MIM: self = .mim 242 | case U_JG_NOON: self = .noon 243 | case U_JG_NUN: self = .nun 244 | case U_JG_PE: self = .pe 245 | case U_JG_QAF: self = .qaf 246 | case U_JG_QAPH: self = .qaph 247 | case U_JG_REH: self = .reh 248 | case U_JG_REVERSED_PE: self = .reversedPe 249 | case U_JG_SAD: self = .sad 250 | case U_JG_SADHE: self = .sadhe 251 | case U_JG_SEEN: self = .seen 252 | case U_JG_SEMKATH: self = .semkath 253 | case U_JG_SHIN: self = .shin 254 | case U_JG_SWASH_KAF: self = .swashKaf 255 | case U_JG_SYRIAC_WAW: self = .syriacWaw 256 | case U_JG_TAH: self = .tah 257 | case U_JG_TAW: self = .taw 258 | case U_JG_TEH_MARBUTA: self = .tehMarbuta 259 | case U_JG_TETH: self = .teth 260 | case U_JG_WAW: self = .waw 261 | case U_JG_YEH: self = .yeh 262 | case
U_JG_YEH_BARREE: self = .yehBarree 263 | case U_JG_YEH_WITH_TAIL: self = .yehWithTail 264 | case U_JG_YUDH: self = .yudh 265 | case U_JG_YUDH_HE: self = .yudhHe 266 | case U_JG_ZAIN: self = .zain 267 | case U_JG_FE: self = .fe 268 | case U_JG_KHAPH: self = .khaph 269 | case U_JG_ZHAIN: self = .zhain 270 | case U_JG_BURUSHASKI_YEH_BARREE: self = .burushaskiYehBarree 271 | case U_JG_FARSI_YEH: self = .farsiYeh 272 | case U_JG_NYA: self = .nya 273 | case U_JG_ROHINGYA_YEH: self = .rohingyaYeh 274 | case U_JG_MANICHAEAN_ALEPH: self = .manichaeanAleph 275 | case U_JG_MANICHAEAN_AYIN: self = .manichaeanAyin 276 | case U_JG_MANICHAEAN_BETH: self = .manichaeanBeth 277 | case U_JG_MANICHAEAN_DALETH: self = .manichaeanDaleth 278 | case U_JG_MANICHAEAN_DHAMEDH: self = .manichaeanDhamedh 279 | case U_JG_MANICHAEAN_FIVE: self = .manichaeanFive 280 | case U_JG_MANICHAEAN_GIMEL: self = .manichaeanGimel 281 | case U_JG_MANICHAEAN_HETH: self = .manichaeanHeth 282 | case U_JG_MANICHAEAN_HUNDRED: self = .manichaeanHundred 283 | case U_JG_MANICHAEAN_KAPH: self = .manichaeanKaph 284 | case U_JG_MANICHAEAN_LAMEDH: self = .manichaeanLamedh 285 | case U_JG_MANICHAEAN_MEM: self = .manichaeanMem 286 | case U_JG_MANICHAEAN_NUN: self = .manichaeanNun 287 | case U_JG_MANICHAEAN_ONE: self = .manichaeanOne 288 | case U_JG_MANICHAEAN_PE: self = .manichaeanPe 289 | case U_JG_MANICHAEAN_QOPH: self = .manichaeanQoph 290 | case U_JG_MANICHAEAN_RESH: self = .manichaeanResh 291 | case U_JG_MANICHAEAN_SADHE: self = .manichaeanSadhe 292 | case U_JG_MANICHAEAN_SAMEKH: self = .manichaeanSamekh 293 | case U_JG_MANICHAEAN_TAW: self = .manichaeanTaw 294 | case U_JG_MANICHAEAN_TEN: self = .manichaeanTen 295 | case U_JG_MANICHAEAN_TETH: self = .manichaeanTeth 296 | case U_JG_MANICHAEAN_THAMEDH: self = .manichaeanThamedh 297 | case U_JG_MANICHAEAN_TWENTY: self = .manichaeanTwenty 298 | case U_JG_MANICHAEAN_WAW: self = .manichaeanWaw 299 | case U_JG_MANICHAEAN_YODH: self = .manichaeanYodh 300 | case
U_JG_MANICHAEAN_ZAYIN: self = .manichaeanZayin 301 | case U_JG_STRAIGHT_WAW: self = .straightWaw 302 | case U_JG_AFRICAN_FEH: self = .africanFeh 303 | case U_JG_AFRICAN_NOON: self = .africanNoon 304 | case U_JG_AFRICAN_QAF: self = .africanQaf 305 | default: fatalError("Invalid UJoiningGroup value: \(cValue)") 306 | } 307 | } 308 | } 309 | } 310 | 311 | extension UnicodeScalar { 312 | 313 | /// The joining group property of the receiver, obtained from `value(of:)` for `UCHAR_JOINING_GROUP`; presumably `nil` when the scalar has no joining group (`value(of:)` is defined elsewhere - confirm). 314 | public var joiningGroup: Unicode.JoiningGroup? { 315 | return value(of: UCHAR_JOINING_GROUP) 316 | } 317 | } 318 | --------------------------------------------------------------------------------