├── .github └── workflows │ ├── rustfmt.yml │ └── tests.yml ├── .gitignore ├── COPYRIGHT ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── scripts └── unicode.py └── src ├── confusable_detection.rs ├── general_security_profile.rs ├── lib.rs ├── mixed_script.rs ├── restriction_level.rs ├── tables.rs └── tests.rs /.github/workflows/rustfmt.yml: -------------------------------------------------------------------------------- 1 | name: Rustfmt 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Run rustfmt 13 | run: cargo fmt -- --check 14 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v1 12 | - uses: actions-rs/toolchain@v1 13 | with: 14 | profile: minimal 15 | toolchain: beta 16 | override: true 17 | components: rustfmt 18 | - name: Build 19 | run: cargo build --verbose 20 | - name: Run tests 21 | run: cargo test 22 | - name: Regen 23 | run: python scripts/unicode.py && diff tables.rs src/tables.rs 24 | regen: 25 | runs-on: ubuntu-latest 26 | steps: 27 | - uses: actions/checkout@v2 28 | - name: Verify regenerated files 29 | run: ./scripts/unicode.py && diff tables.rs src/tables.rs 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | scripts/tmp 4 | scripts/*.rs 5 | scripts/*.txt -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | Licensed under the Apache License, Version 2.0 2 | or 
the MIT 4 | license , 5 | at your option. All files in the project carrying such 6 | notice may not be copied, modified, or distributed except 7 | according to those terms. 8 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "unicode-security" 3 | version = "0.1.2" 4 | authors = ["Charles Lew ", "Manish Goregaokar "] 5 | edition = "2018" 6 | homepage = "https://github.com/unicode-rs/unicode-security" 7 | repository = "https://github.com/unicode-rs/unicode-security" 8 | documentation = "https://docs.rs/unicode-security" 9 | license = "MIT/Apache-2.0" 10 | keywords = ["text", "security", "unicode"] 11 | readme = "README.md" 12 | description = """ 13 | Detect possible security problems with Unicode usage 14 | according to Unicode Technical Standard #39 rules. 15 | """ 16 | exclude = [ "target/*", "Cargo.lock" ] 17 | 18 | [dependencies] 19 | unicode-script = { version = "0.5.1", default-features = false } 20 | unicode-normalization = { version = "0.1.12", default-features = false } 21 | std = { version = "1.0", package = "rustc-std-workspace-std", optional = true } 22 | core = { version = "1.0", package = "rustc-std-workspace-core", optional = true } 23 | compiler_builtins = { version = "0.1", optional = true } 24 | 25 | [features] 26 | default = [] 27 | bench = [] 28 | rustc-dep-of-std = ['std', 'core', 'compiler_builtins'] 29 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 The Rust Project Developers 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # unicode-security 2 | 3 | [![Build Status](https://github.com/unicode-rs/unicode-security/workflows/Tests/badge.svg)](https://github.com/unicode-rs/unicode-security/actions) 4 | [![Current Version](https://img.shields.io/crates/v/unicode-security.svg)](https://crates.io/crates/unicode-security) 5 | [![License: MIT/Apache-2.0](https://img.shields.io/crates/l/unicode-security.svg)](#license) 6 | 7 | This crate exposes various utilities from [UTS #39 Unicode Security Mechanisms](https://www.unicode.org/reports/tr39/) 8 | -------------------------------------------------------------------------------- /scripts/unicode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT 4 | # file at the top-level directory of this distribution and at 5 | # http://rust-lang.org/COPYRIGHT. 6 | # 7 | # Licensed under the Apache License, Version 2.0 or the MIT license 9 | # , at your 10 | # option. This file may not be copied, modified, or distributed 11 | # except according to those terms. 
# Shared implementation of `fetch` and `fetch_unidata`.
#
# `url_template` is a %-format string taking (unicode version, file name).
# The file is stored in the current working directory under the base name
# of `f`; if a file with that name already exists nothing is downloaded.
# Returns `f` unchanged so the call can be nested inside a loader call.
# Writes a message to stderr and exits with status 1 when the file is
# still missing after the download attempt.
def _fetch_from(url_template, f):
    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        # --fail: have curl report HTTP errors (e.g. 404 for a wrong version)
        # instead of saving the server's error page as if it were the data
        # file, which the parsers would then silently read as an empty table.
        os.system("curl --fail -O " + url_template % (UNICODE_VERSION_NUMBER, f))

    if not os.path.exists(local_name):
        sys.stderr.write("cannot load %s\n" % f)
        # sys.exit rather than the site-provided `exit` helper, which is
        # meant for interactive use and is not guaranteed to be defined.
        sys.exit(1)

    return f

# Download a Unicode security table file
def fetch(f):
    return _fetch_from("https://www.unicode.org/Public/security/%s/%s", f)

# Download a UCD table file
def fetch_unidata(f):
    return _fetch_from("https://www.unicode.org/Public/%s/ucd/%s", f)
# Loads code point data from provided filename f
# Implementation adapted from unicode-segmentation
#
# Only the base name of `f` is used; the file is read from the current
# working directory. Returns a dict mapping each property value to a list
# of inclusive (first, last) code point pairs, in file order. When
# `interestingprops` is non-empty, only those property values are kept;
# None or an empty collection keeps everything.
def load_properties(f, interestingprops = None):
    props = {}
    single_re = re.compile(r"^ *([0-9A-F]+) *; *([^#\s]+) *#")
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#\s]+) *#")

    # A plain `with open` (instead of the module-global `fileinput` state)
    # releases the handle even if parsing raises part-way through, so a
    # failure here cannot poison later loader calls.
    with open(os.path.basename(f), encoding="utf-8") as data:
        for line in data:
            m = single_re.match(line)
            if m:
                d_lo = d_hi = m.group(1)
                prop = m.group(2)
            else:
                m = range_re.match(line)
                if not m:
                    # comment, blank line, or anything else we don't understand
                    continue
                d_lo, d_hi, prop = m.groups()
            prop = prop.strip()
            if interestingprops and prop not in interestingprops:
                continue
            props.setdefault(prop, []).append((int(d_lo, 16), int(d_hi, 16)))

    return props

# Loads confusables data from confusables.txt
#
# Returns a list of (source code point, [prototype code points]) tuples in
# file order. Raises an Exception if a data line has more than one code
# point in its first column.
def load_confusables(f):
    fetch(f)
    confusables = []
    entry_re = re.compile(r"^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);\t\w*")

    with open(os.path.basename(f), encoding="utf-8") as data:
        for line in data:
            m = entry_re.match(line)
            if not m:
                continue
            d_inputs = m.group(1).split()
            if len(d_inputs) != 1:
                raise Exception('More than one code point in first column')
            d_input = int(d_inputs[0], 16)
            d_outputs = [int(d_output, 16) for d_output in m.group(2).split()]
            confusables.append((d_input, d_outputs))

    return confusables

# Loads Unicode script name correspondence from PropertyValueAliases.txt
#
# Returns (longforms, shortforms): `longforms` maps a short script code to
# its long name, `shortforms` maps the long name back to the short code.
def aliases():
    # This function is taken from the `unicode-script` crate. If significant
    # changes are introduced, update accordingly.

    # Note that this file is in UCD directly, not security directory.
    # we use `fetch_unidata` function to download it.
    fetch_unidata("PropertyValueAliases.txt")
    longforms = {}
    shortforms = {}
    alias_re = re.compile(r"^ *sc *; *(\w+) *; *(\w+)")
    # UCD data files are UTF-8; be explicit rather than locale-dependent.
    with open("PropertyValueAliases.txt", encoding="utf-8") as data:
        for line in data:
            m = alias_re.match(line)
            if not m:
                continue
            short_name = m.group(1)
            long_name = m.group(2)
            # Each script must be listed once, otherwise the two maps
            # would stop being inverses of each other.
            assert short_name not in longforms
            assert long_name not in shortforms
            longforms[short_name] = long_name
            shortforms[long_name] = short_name

    return (longforms, shortforms)

# Loads Unicode script name list and correspondence mapping
#
# Returns (longforms, script_table) where `script_table` is a list of
# (first, last, short script code) tuples sorted by `first`. Every script
# found in the file is included; the mixed-script rules rely on
# Zyyy/Zinh/Zzzz entries being present in the table.
def load_scripts(f):
    # This function is taken from the `unicode-script` crate. If significant
    # changes are introduced, update accordingly.

    (longforms, shortforms) = aliases()
    scripts = load_properties(fetch_unidata(f), [])

    script_table = []
    for script, ranges in scripts.items():
        short_name = shortforms[script]
        script_table.extend((x, y, short_name) for (x, y) in ranges)
    script_table.sort(key=lambda w: w[0])
    return (longforms, script_table)

# Zinh, Zyyy and Zzzz (the short codes the mixed-script rules treat as
# "ignored") never make a string mixed-script on their own.
def is_script_ignored_in_mixedscript(source):
    return source in ('Zinh', 'Zyyy', 'Zzzz')
# When a codepoint's prototype consists of multiple codepoints,
# the situation is more complex. Here we make up a few rules
# to cover all the cases in confusables.txt .
# The principle is that when replacing the original codepoint with its prototype,
# a "non-ignored script" neither appears nor disappears.
#
# Returns a pair (processed, should_add):
#   True, True   when we want to consider it confusable,
#   True, False  when we want to consider it non-confusable,
#   False, False when none of the rules applies, i.e. a not-yet-processed case
#                was added in a newer Unicode version and a rule must be added.
#
# `script_i` is the script of the single code point `item_i`; `proto_lst` is
# its multi code point prototype. "Ignored" below means Zinh, Zyyy or Zzzz.
def process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts):
    script_lst = script_list(proto_lst, scripts)
    script_lst.sort()
    # The rules below were written against the cases occurring in the
    # confusables.txt this script was last updated for.
    script_lst_len = len(script_lst)
    assert script_lst_len > 0
    source_is_ignored = is_script_ignored_in_mixedscript(script_i)

    if script_lst_len == 1:
        # Rule: A - A -> Processed, DontAdd
        if script_lst[0] == script_i:
            return True, False
        # Rule: B - A(not ignored) -> Processed, Add
        # Rule: (ignored) - A(not ignored) -> Processed, Add
        # i.e. the prototype is written in a single script that differs from
        # the (non-ignored) script of the source, whatever that script is.
        if not source_is_ignored:
            return True, True
        return False, False

    # Rule: A ... - A -> Processed, DontAdd
    if script_i in script_lst:
        return True, False

    # From here on the source's script does not occur in the prototype.
    # Rule: (ignored) B - A(not ignored) -> Processed, Add
    # Rule: (ignored) (ignored) - A(not ignored) -> Processed, Add
    if (script_lst_len == 2 and not source_is_ignored
            and (is_script_ignored_in_mixedscript(script_lst[0])
                 or is_script_ignored_in_mixedscript(script_lst[1]))):
        return True, True

    # NotProcessed, DontAdd
    return False, False

# Returns True when code point `c` falls inside one of the inclusive ranges
# of `identifier_allowed` (each entry holds the first code point at index 0
# and the last at index 1).
def is_codepoint_identifier_allowed(c, identifier_allowed):
    return any(data[0] <= c <= data[1] for data in identifier_allowed)

# Returns the list stored for `item_text` under `script` in `confusable`,
# creating the `script` dict and the `(item, [])` entry on first use.
def _confusable_entry_item(confusable, script, item_text, item):
    script_entry = confusable.setdefault(script, {})
    if item_text not in script_entry:
        script_entry[item_text] = (item, [])
    return script_entry[item_text][1]

# Examines every pair of code points in one equivalence class `members` and
# records, for each member that is in a non-ignored script, the members of a
# different script it can be confused with.
def _record_cross_script_pairs(mixedscript_confusable, members, scripts):
    for i, item_i in enumerate(members):
        for item_j in members[i + 1:]:
            script_i = codepoint_script(item_i, scripts)
            script_j = codepoint_script(item_j, scripts)
            # If they're in the same script, just skip this pair.
            if script_i == script_j:
                continue
            # If `item_i` is in a non-ignored script and `item_j` is in a different one
            # (maybe ignored), then this usage of `item_i` can be suspicious when it
            # occurs in a document that is written in `script_j`.
            # We'll consider it a mixed_script_confusable code point, and store it
            # with as much information as possible, for further investigation
            # on the final results.
            if not is_script_ignored_in_mixedscript(script_i):
                _confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append(item_j)
            # Do the same in reverse from `item_j` to `item_i`
            if not is_script_ignored_in_mixedscript(script_j):
                _confusable_entry_item(mixedscript_confusable, script_j, escape_char(item_j), item_j).append(item_i)

# This function loads and generates a table of all the confusable characters.
# It returns a pair consisting of a `mixedscript_confusable` table and a
# `mixedscript_confusable_unresolved` table.
# The `mixedscript_confusable` is a dict, its keys are Unicode script names, and each
# entry has a value of an inner dict. The inner dict's keys are confusable code points
# converted to string with the `escape_char` function, and its values are pairs.
# pair[0] keeps a copy of the confusable code point itself but as integer.
# pair[1] keeps a list of all the code points that are mixed script confusable with it,
#         which is only used for debugging purposes.
#         Note that the string 'multi' will occur in the list when pair[0] is considered
#         confusable with its multiple code point prototype.
# Usually the `mixedscript_confusable_unresolved` table is empty, but it's possible
# that a future Unicode version update may cause that table to become nonempty, in which
# case more rules need to be added to the `process_mixedscript_single_to_multi` function
# above to cover those new cases. Its keys are prototypes converted with
# `escape_char_list`; its values are (prototype list, [unresolved source code points]).
def load_potential_mixedscript_confusables(f, identifier_allowed, scripts):
    # First, load all confusables data from confusables.txt
    confusables = load_confusables(f)

    # The confusables.txt is reductive, meaning that it is intended to be used in
    # on the fly substitutions. The code points that don't occur in the file can be
    # seen as substitutes to themselves. So if the confusables.txt says A -> C, B -> C,
    # and implicitly C -> C, it means A <-> B, A <-> C, B <-> C are confusable.

    # Here we're dividing all confusable lhs and rhs(prototype) operands of the
    # substitution into equivalence classes, using the rhs operand as the
    # representative element of its equivalence class.
    # Some rhs operands are a single code point, while some others are not;
    # we collect them separately into `codepoint_map` and `multicodepoint_map`.
    codepoint_map = {}
    multicodepoint_map = {}
    for d_source, d_proto_list in confusables:
        # According to the RFC, we'll skip those code points that are restricted from identifier usage.
        if not is_codepoint_identifier_allowed(d_source, identifier_allowed):
            continue
        if len(d_proto_list) == 1:
            # we use the escaped representation of rhs as key to the dict.
            d_proto = escape_char(d_proto_list[0])
            if d_proto not in codepoint_map:
                codepoint_map[d_proto] = []
                # Only when the equivalence class is created do we decide whether the
                # representative element itself belongs to it: it does if it is not
                # restricted from identifier usage.
                if is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed):
                    codepoint_map[d_proto].append(d_proto_list[0])
            # we collect the original code point to be substituted into this list.
            codepoint_map[d_proto].append(d_source)
        else:
            d_protos = escape_char_list(d_proto_list)
            # difference in the multi code point case: the rhs part is not directly usable
            # as a member, however we store it next to the member list (hence the extra
            # level of tuple) for the special examination between each lhs and this rhs.
            if d_protos not in multicodepoint_map:
                multicodepoint_map[d_protos] = (d_proto_list, [])
            multicodepoint_map[d_protos][1].append(d_source)

    mixedscript_confusable = {}

    # First let's examine the code points having a single code point prototype.
    for source in codepoint_map.values():
        _record_cross_script_pairs(mixedscript_confusable, source, scripts)

    # Then let's examine the code points having a multiple code point prototype:
    # we check between the code points that share the same prototype.
    for _, source in multicodepoint_map.values():
        _record_cross_script_pairs(mixedscript_confusable, source, scripts)

    mixedscript_confusable_unresolved = {}
    # We'll also check between each code point and its multiple code point prototype
    for proto_lst, source in multicodepoint_map.values():
        # If the prototype contains one or more restricted code points, then we skip it.
        if not all(is_codepoint_identifier_allowed(c, identifier_allowed) for c in proto_lst):
            continue
        for item_i in source:
            # So here we're just checking whether the single code point should be considered confusable.
            script_i = codepoint_script(item_i, scripts)
            # If it's in an ignored script, we don't need to do anything here.
            if is_script_ignored_in_mixedscript(script_i):
                continue
            # Here are some rules on examining whether the single code point should be considered
            # confusable. The principle is that, when substitution happens, no new non-ignored
            # script is introduced, and its own script is not lost.
            processed, should_add = process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts)
            if should_add:
                assert processed
                # Mark the single code point as confusable.
                _confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append('multi')
            if processed:
                # Finished dealing with this code point.
                continue
            # If it's not processed we must be dealing with a newer version of Unicode data, which
            # introduced some significant changes. We don't throw an exception here, instead we
            # collect it into a table for debugging purposes; the caller is expected to print the
            # table out and fail afterwards.
            proto_lst_text = escape_char_list(proto_lst)
            if proto_lst_text not in mixedscript_confusable_unresolved:
                mixedscript_confusable_unresolved[proto_lst_text] = (proto_lst, [])
            mixedscript_confusable_unresolved[proto_lst_text][1].append(item_i)
    return (mixedscript_confusable, mixedscript_confusable_unresolved)

# Returns the script of code point `c` according to the (first, last, script)
# entries of `scripts`; raises an Exception when no entry covers `c`.
def codepoint_script(c, scripts):
    for x, y, script in scripts:
        if x <= c <= y:
            return script
    raise Exception("Not in scripts: " + escape_char(c))
384 | def debug_emit_mixedscript_confusable(f, mixedscript_confusable, text, scripts): 385 | f.write("/* " + text + "\n") 386 | for script, lst in mixedscript_confusable.items(): 387 | f.write("/// Script - " + script + "\n") 388 | source_lst = [v[0] for (_, v) in lst.items()] 389 | source_lst.sort() 390 | for source in source_lst: 391 | source_text = escape_char(source) 392 | source_item_and_target_lst = lst[source_text] 393 | target_lst = source_item_and_target_lst[1] 394 | f.write(source_text + " => " + escape_char_list(target_lst) + " // " + escape_script_list(target_lst, scripts)+ "\n") 395 | f.write("*/\n") 396 | 397 | 398 | def script_list(char_lst, scripts): 399 | script_lst = [] 400 | for c in char_lst: 401 | if c == 'multi': 402 | script = 'Z~multi' 403 | else: 404 | script = codepoint_script(c, scripts) 405 | if script not in script_lst: 406 | script_lst.append(script) 407 | return script_lst 408 | 409 | def escape_script_list(char_lst, scripts): 410 | script_lst = script_list(char_lst, scripts) 411 | script_lst.sort() 412 | return str(script_lst) 413 | 414 | def debug_emit_mixedscript_confusable_unresolved(f, map, text, scripts): 415 | if len(map) == 0: 416 | return 417 | print("// " + text + "\n") 418 | for prototype_text, pair in map.items(): 419 | prototype = pair[0] 420 | source = pair[1] 421 | print(prototype_text + " => " + escape_char_list(source) + " // " + escape_script_list(prototype, scripts) + " => " + escape_script_list(source, scripts) + "\n") 422 | raise Exception("update the python script to add new rules for new data") 423 | 424 | def format_table_content(f, content, indent): 425 | line = " "*indent 426 | first = True 427 | for chunk in content.split(","): 428 | if len(line) + len(chunk) < 98: 429 | if first: 430 | line += chunk 431 | else: 432 | line += ", " + chunk 433 | first = False 434 | else: 435 | f.write(line + ",\n") 436 | line = " "*indent + chunk 437 | f.write(line) 438 | 439 | def escape_char(c): 440 | if c == 'multi': 441 | 
return "\"\"" 442 | return "'\\u{%x}'" % c 443 | 444 | def escape_char_list(l): 445 | line = "[" 446 | first = True 447 | for c in l: 448 | if first: 449 | line += escape_char(c) 450 | else: 451 | line += ", " + escape_char(c) 452 | first = False 453 | line += "]" 454 | return line 455 | 456 | def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True, 457 | pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True): 458 | pub_string = "const" 459 | if not is_const: 460 | pub_string = "let" 461 | if is_pub: 462 | pub_string = "pub " + pub_string 463 | f.write(" %s %s: %s = &[\n" % (pub_string, name, t_type)) 464 | data = "" 465 | first = True 466 | for dat in t_data: 467 | if not first: 468 | data += "," 469 | first = False 470 | data += pfun(dat) 471 | format_table_content(f, data, 8) 472 | f.write("\n ];\n\n") 473 | 474 | def emit_identifier_module(f): 475 | f.write("pub mod identifier {") 476 | f.write(""" 477 | 478 | #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)] 479 | #[allow(non_camel_case_types)] 480 | /// https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type 481 | pub enum IdentifierType { 482 | // Restricted 483 | Not_Character, 484 | Deprecated, 485 | Default_Ignorable, 486 | Not_NFKC, 487 | Not_XID, 488 | Exclusion, 489 | Obsolete, 490 | Technical, 491 | Uncommon_Use, 492 | Limited_Use, 493 | 494 | // Allowed 495 | Inclusion, 496 | Recommended 497 | } 498 | #[inline] 499 | pub fn identifier_status_allowed(c: char) -> bool { 500 | // FIXME: do we want to special case ASCII here? 501 | match c as usize { 502 | _ => super::util::bsearch_range_table(c, IDENTIFIER_STATUS) 503 | } 504 | } 505 | 506 | #[inline] 507 | pub fn identifier_type(c: char) -> Option { 508 | // FIXME: do we want to special case ASCII here? 
509 | match c as usize { 510 | _ => super::util::bsearch_range_value_table(c, IDENTIFIER_TYPE) 511 | } 512 | } 513 | """) 514 | 515 | f.write(" // Identifier status table:\n") 516 | identifier_status_table = load_properties(fetch("IdentifierStatus.txt")) 517 | emit_table(f, "IDENTIFIER_STATUS", identifier_status_table['Allowed'], "&'static [(char, char)]", is_pub=False, 518 | pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))) 519 | identifier_type = load_properties(fetch("IdentifierType.txt")) 520 | type_table = [] 521 | for ty in identifier_type: 522 | type_table.extend([(x, y, ty) for (x, y) in identifier_type[ty]]) 523 | 524 | type_table.sort(key=lambda w: w[0]) 525 | 526 | emit_table(f, "IDENTIFIER_TYPE", type_table, "&'static [(char, char, IdentifierType)]", is_pub=False, 527 | pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2])) 528 | f.write("}\n\n") 529 | 530 | def emit_default_ignorable_detection_module(f): 531 | f.write("pub mod default_ignorable_code_point {") 532 | f.write(""" 533 | 534 | #[inline] 535 | pub fn default_ignorable_code_point(c: char) -> bool { 536 | match c as usize { 537 | _ => super::util::bsearch_range_table(c, DEFAULT_IGNORABLE) 538 | } 539 | } 540 | 541 | """) 542 | 543 | f.write(" // Default ignorable code point table:\n") 544 | default_ignorable_table = load_properties(fetch_unidata("DerivedCoreProperties.txt"), ["Default_Ignorable_Code_Point"]) 545 | emit_table(f, "DEFAULT_IGNORABLE", default_ignorable_table["Default_Ignorable_Code_Point"], "&'static [(char, char)]", is_pub=False, 546 | pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))) 547 | 548 | f.write("}\n\n") 549 | 550 | def emit_confusable_detection_module(f): 551 | f.write("pub mod confusable_detection {") 552 | f.write(""" 553 | 554 | #[inline] 555 | pub fn char_confusable_prototype(c: char) -> Option<&'static [char]> { 556 | // FIXME: do we want to special case ASCII here? 
557 | match c as usize { 558 | _ => super::util::bsearch_value_table(c, CONFUSABLES) 559 | } 560 | } 561 | 562 | """) 563 | 564 | f.write(" // Confusable table:\n") 565 | confusable_table = load_confusables("confusables.txt") 566 | confusable_table.sort(key=lambda w: w[0]) 567 | 568 | last_key = None 569 | for (k, _) in confusable_table: 570 | if k == last_key: 571 | raise Exception("duplicate keys in confusables table: %s" % k) 572 | last_key = k 573 | 574 | emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, &'static [char])]", is_pub=False, 575 | pfun=lambda x: "(%s, &%s)" % (escape_char(x[0]), escape_char_list(x[1]))) 576 | f.write("}\n\n") 577 | 578 | def escape_script_constant(name, longforms): 579 | return "Script::" + longforms[name].strip() 580 | 581 | def emit_potiential_mixed_script_confusable(f): 582 | f.write("pub mod potential_mixed_script_confusable {") 583 | f.write(""" 584 | #[inline] 585 | pub fn potential_mixed_script_confusable(c: char) -> bool { 586 | match c as usize { 587 | _ => super::util::bsearch_table(c, CONFUSABLES) 588 | } 589 | } 590 | """) 591 | identifier_status_table = load_properties(fetch("IdentifierStatus.txt")) 592 | _, scripts = load_scripts("Scripts.txt") 593 | identifier_allowed = identifier_status_table['Allowed'] 594 | (mixedscript_confusable, mixedscript_confusable_unresolved) = load_potential_mixedscript_confusables("confusables.txt", identifier_allowed, scripts) 595 | debug = False 596 | if debug == True: 597 | debug_emit_mixedscript_confusable(f, mixedscript_confusable, "mixedscript_confusable", scripts) 598 | debug_emit_mixedscript_confusable_unresolved(f, mixedscript_confusable_unresolved, "mixedscript_confusable_unresolved", scripts) 599 | confusable_table = [] 600 | for script, lst in mixedscript_confusable.items(): 601 | for _, pair in lst.items(): 602 | source = pair[0] 603 | confusable_table.append((source, script)) 604 | confusable_table.sort(key=lambda w: w[0]) 605 | emit_table(f, "CONFUSABLES", 
confusable_table, "&'static [char]", is_pub=False, 606 | pfun=lambda x: "%s" % escape_char(x[0])) 607 | f.write("}\n\n") 608 | 609 | 610 | def emit_util_mod(f): 611 | f.write(""" 612 | pub mod util { 613 | use core::result::Result::{Ok, Err}; 614 | 615 | #[inline] 616 | pub fn bsearch_table(c: char, r: &'static [char]) -> bool { 617 | r.binary_search(&c).is_ok() 618 | } 619 | 620 | #[inline] 621 | pub fn bsearch_value_table(c: char, r: &'static [(char, T)]) -> Option { 622 | match r.binary_search_by_key(&c, |&(k, _)| k) { 623 | Ok(idx) => { 624 | let (_, v) = r[idx]; 625 | Some(v) 626 | } 627 | Err(_) => None 628 | } 629 | } 630 | 631 | #[inline] 632 | pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool { 633 | use core::cmp::Ordering::{Equal, Less, Greater}; 634 | r.binary_search_by(|&(lo,hi)| { 635 | if lo <= c && c <= hi { Equal } 636 | else if hi < c { Less } 637 | else { Greater } 638 | }).is_ok() 639 | } 640 | 641 | pub fn bsearch_range_value_table(c: char, r: &'static [(char, char, T)]) -> Option { 642 | use core::cmp::Ordering::{Equal, Less, Greater}; 643 | match r.binary_search_by(|&(lo, hi, _)| { 644 | if lo <= c && c <= hi { Equal } 645 | else if hi < c { Less } 646 | else { Greater } 647 | }) { 648 | Ok(idx) => { 649 | let (_, _, cat) = r[idx]; 650 | Some(cat) 651 | } 652 | Err(_) => None 653 | } 654 | } 655 | 656 | } 657 | 658 | """) 659 | 660 | if __name__ == "__main__": 661 | r = "tables.rs" 662 | if os.path.exists(r): 663 | os.remove(r) 664 | with open(r, "w") as rf: 665 | # write the file's preamble 666 | rf.write(preamble) 667 | 668 | rf.write(""" 669 | /// The version of [Unicode](http://www.unicode.org/) 670 | /// that this version of unicode-security is based on. 
671 | pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s); 672 | 673 | """ % UNICODE_VERSION) 674 | 675 | emit_util_mod(rf) 676 | ### identifier module 677 | emit_identifier_module(rf) 678 | ### default_ignorable_detection module 679 | emit_default_ignorable_detection_module(rf) 680 | ### confusable_detection module 681 | emit_confusable_detection_module(rf) 682 | ### mixed_script_confusable_detection module 683 | emit_potiential_mixed_script_confusable(rf) 684 | -------------------------------------------------------------------------------- /src/confusable_detection.rs: -------------------------------------------------------------------------------- 1 | //! [Confusable detection](https://www.unicode.org/reports/tr39/#Confusable_Detection) 2 | 3 | use core::iter; 4 | 5 | enum OnceOrMore { 6 | Once(iter::Once), 7 | More(I), 8 | } 9 | 10 | impl Iterator for OnceOrMore 11 | where 12 | I: Iterator, 13 | { 14 | type Item = T; 15 | 16 | fn next(&mut self) -> Option { 17 | use OnceOrMore::*; 18 | match self { 19 | Once(v) => v.next(), 20 | More(i) => i.next(), 21 | } 22 | } 23 | } 24 | 25 | type StaticSliceIterCloned = core::iter::Cloned>; 26 | 27 | fn char_prototype(c: char) -> OnceOrMore { 28 | use crate::tables::confusable_detection::char_confusable_prototype; 29 | match char_confusable_prototype(c) { 30 | None => OnceOrMore::Once(iter::once(c)), 31 | Some(l) => OnceOrMore::More(l.iter().cloned()), 32 | } 33 | } 34 | 35 | /// Calculate skeleton for string, as defined by UTS 39 36 | pub fn skeleton(s: &str) -> impl Iterator + '_ { 37 | use crate::tables::default_ignorable_code_point::default_ignorable_code_point; 38 | use unicode_normalization::UnicodeNormalization; 39 | 40 | s.chars() 41 | .nfd() 42 | .filter(|c| !default_ignorable_code_point(*c)) 43 | .flat_map(char_prototype) 44 | .nfd() 45 | } 46 | -------------------------------------------------------------------------------- /src/general_security_profile.rs: 
-------------------------------------------------------------------------------- 1 | //! Utilities for working with the [General Security Profile](https://www.unicode.org/reports/tr39/#General_Security_Profile) 2 | //! for identifiers 3 | 4 | use crate::tables::identifier; 5 | 6 | pub use identifier::IdentifierType; 7 | 8 | /// Methods for determining characters not restricted from use for identifiers. 9 | pub trait GeneralSecurityProfile { 10 | /// Returns whether the character is not restricted from use for identifiers. 11 | fn identifier_allowed(self) -> bool; 12 | 13 | /// Returns the [identifier type](https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type) 14 | fn identifier_type(self) -> Option; 15 | } 16 | 17 | impl GeneralSecurityProfile for char { 18 | #[inline] 19 | fn identifier_allowed(self) -> bool { 20 | identifier::identifier_status_allowed(self) 21 | } 22 | #[inline] 23 | fn identifier_type(self) -> Option { 24 | identifier::identifier_type(self) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | //! Detect possible security problems with Unicode usage according to 12 | //! [Unicode Technical Standard #39](http://www.unicode.org/reports/tr39/) 13 | //! rules. 14 | //! 15 | //! ```rust 16 | //! extern crate unicode_security; 17 | //! 18 | //! use unicode_security::GeneralSecurityProfile; 19 | //! 20 | //! fn main() { 21 | //! let ch = 'µ'; // U+00B5 MICRO SIGN 22 | //! 
let allowed = 'µ'.identifier_allowed(); 23 | //! println!("{}", ch); 24 | //! println!("The above char is {} in unicode identifiers.", 25 | //! if allowed { "allowed" } else { "restricted" }); 26 | //! } 27 | //! ``` 28 | //! 29 | //! # features 30 | //! 31 | //! unicode-security supports a `no_std` feature. This eliminates dependence 32 | //! on std, and instead uses equivalent functions from core. 33 | //! 34 | //! # crates.io 35 | //! 36 | //! You can use this package in your project by adding the following 37 | //! to your `Cargo.toml`: 38 | //! 39 | //! ```toml 40 | //! [dependencies] 41 | //! unicode-security = "0.0.1" 42 | //! ``` 43 | 44 | #![deny(missing_docs, unsafe_code)] 45 | #![doc( 46 | html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", 47 | html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" 48 | )] 49 | #![cfg_attr(feature = "bench", feature(test))] 50 | #![no_std] 51 | 52 | #[cfg(test)] 53 | #[macro_use] 54 | extern crate std; 55 | 56 | #[cfg(feature = "bench")] 57 | extern crate test; 58 | 59 | pub use tables::UNICODE_VERSION; 60 | 61 | pub mod confusable_detection; 62 | pub mod general_security_profile; 63 | pub mod mixed_script; 64 | pub mod restriction_level; 65 | 66 | pub use confusable_detection::skeleton; 67 | pub use general_security_profile::GeneralSecurityProfile; 68 | pub use mixed_script::is_potential_mixed_script_confusable_char; 69 | pub use mixed_script::MixedScript; 70 | pub use restriction_level::{RestrictionLevel, RestrictionLevelDetection}; 71 | 72 | #[rustfmt::skip] 73 | pub(crate) mod tables; 74 | 75 | #[cfg(test)] 76 | mod tests; 77 | -------------------------------------------------------------------------------- /src/mixed_script.rs: -------------------------------------------------------------------------------- 1 | //! 
[Mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection) 2 | 3 | use core::fmt::{self, Debug}; 4 | use unicode_script::{Script, ScriptExtension}; 5 | 6 | /// An Augmented script set, as defined by UTS 39 7 | /// 8 | /// https://www.unicode.org/reports/tr39/#def-augmented-script-set 9 | #[derive(Copy, Clone, PartialEq, Hash, Eq)] 10 | pub struct AugmentedScriptSet { 11 | /// The base ScriptExtension value 12 | pub base: ScriptExtension, 13 | /// Han With Bopomofo 14 | pub hanb: bool, 15 | /// Japanese 16 | pub jpan: bool, 17 | /// Korean 18 | pub kore: bool, 19 | } 20 | 21 | impl From for AugmentedScriptSet { 22 | fn from(ext: ScriptExtension) -> Self { 23 | let mut hanb = false; 24 | let mut jpan = false; 25 | let mut kore = false; 26 | 27 | if ext.is_common() || ext.is_inherited() || ext.contains_script(Script::Han) { 28 | hanb = true; 29 | jpan = true; 30 | kore = true; 31 | } else { 32 | if ext.contains_script(Script::Hiragana) || ext.contains_script(Script::Katakana) { 33 | jpan = true; 34 | } 35 | 36 | if ext.contains_script(Script::Hangul) { 37 | kore = true; 38 | } 39 | 40 | if ext.contains_script(Script::Bopomofo) { 41 | hanb = true; 42 | } 43 | } 44 | Self { 45 | base: ext, 46 | hanb, 47 | jpan, 48 | kore, 49 | } 50 | } 51 | } 52 | 53 | impl From for AugmentedScriptSet { 54 | fn from(c: char) -> Self { 55 | AugmentedScriptSet::for_char(c) 56 | } 57 | } 58 | 59 | impl From<&'_ str> for AugmentedScriptSet { 60 | fn from(s: &'_ str) -> Self { 61 | AugmentedScriptSet::for_str(s) 62 | } 63 | } 64 | 65 | impl Default for AugmentedScriptSet { 66 | fn default() -> Self { 67 | AugmentedScriptSet { 68 | base: Script::Common.into(), 69 | hanb: true, 70 | jpan: true, 71 | kore: true, 72 | } 73 | } 74 | } 75 | 76 | impl Debug for AugmentedScriptSet { 77 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 78 | if self.is_empty() { 79 | write!(f, "AugmentedScriptSet {{∅}}")?; 80 | } else if self.is_all() { 81 | write!(f, 
"AugmentedScriptSet {{ALL}}")?; 82 | } else { 83 | write!(f, "AugmentedScriptSet {{")?; 84 | let mut first_entry = true; 85 | let hanb = if self.hanb { Some("Hanb") } else { None }; 86 | let jpan = if self.jpan { Some("Jpan") } else { None }; 87 | let kore = if self.kore { Some("Kore") } else { None }; 88 | for writing_system in None 89 | .into_iter() 90 | .chain(hanb) 91 | .chain(jpan) 92 | .chain(kore) 93 | .chain(self.base.iter().map(Script::short_name)) 94 | { 95 | if !first_entry { 96 | write!(f, ", ")?; 97 | } else { 98 | first_entry = false; 99 | } 100 | write!(f, "{}", writing_system)?; 101 | } 102 | write!(f, "}}")?; 103 | } 104 | Ok(()) 105 | } 106 | } 107 | 108 | impl fmt::Display for AugmentedScriptSet { 109 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 110 | if self.is_empty() { 111 | write!(f, "Empty")?; 112 | } else if self.is_all() { 113 | write!(f, "All")?; 114 | } else { 115 | let mut first_entry = true; 116 | let hanb = if self.hanb { 117 | Some("Han with Bopomofo") 118 | } else { 119 | None 120 | }; 121 | let jpan = if self.jpan { Some("Japanese") } else { None }; 122 | let kore = if self.kore { Some("Korean") } else { None }; 123 | for writing_system in None 124 | .into_iter() 125 | .chain(hanb) 126 | .chain(jpan) 127 | .chain(kore) 128 | .chain(self.base.iter().map(Script::full_name)) 129 | { 130 | if !first_entry { 131 | write!(f, ", ")?; 132 | } else { 133 | first_entry = false; 134 | } 135 | write!(f, "{}", writing_system)?; 136 | } 137 | } 138 | Ok(()) 139 | } 140 | } 141 | 142 | impl AugmentedScriptSet { 143 | /// Intersect this set with another 144 | pub fn intersect_with(&mut self, other: Self) { 145 | self.base.intersect_with(other.base); 146 | self.hanb = self.hanb && other.hanb; 147 | self.jpan = self.jpan && other.jpan; 148 | self.kore = self.kore && other.kore; 149 | } 150 | 151 | /// Check if the set is empty 152 | pub fn is_empty(&self) -> bool { 153 | self.base.is_empty() && !self.hanb && !self.jpan && !self.kore 
154 | } 155 | 156 | /// Check if the set is "All" (Common or Inherited) 157 | pub fn is_all(&self) -> bool { 158 | self.base.is_common() || self.base.is_inherited() 159 | } 160 | 161 | /// Construct an AugmentedScriptSet for a given character 162 | pub fn for_char(c: char) -> Self { 163 | ScriptExtension::from(c).into() 164 | } 165 | 166 | /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string 167 | pub fn for_str(s: &str) -> Self { 168 | let mut set = AugmentedScriptSet::default(); 169 | for ch in s.chars() { 170 | set.intersect_with(ch.into()) 171 | } 172 | set 173 | } 174 | } 175 | 176 | /// Extension trait for [mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection) 177 | pub trait MixedScript { 178 | /// Check if a string is [single-script](https://www.unicode.org/reports/tr39/#def-single-script) 179 | /// 180 | /// Note that a single-script string may still contain multiple Script properties! 181 | fn is_single_script(self) -> bool; 182 | 183 | /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string 184 | fn resolve_script_set(self) -> AugmentedScriptSet; 185 | } 186 | 187 | impl MixedScript for &'_ str { 188 | fn is_single_script(self) -> bool { 189 | !AugmentedScriptSet::for_str(self).is_empty() 190 | } 191 | 192 | fn resolve_script_set(self) -> AugmentedScriptSet { 193 | self.into() 194 | } 195 | } 196 | 197 | /// Check if a character is considered potential mixed script confusable. 198 | /// 199 | /// If the specified character is not restricted from use for identifiers, 200 | /// this function returns whether it is considered mixed script confusable 201 | /// with another character that is not restricted from use for identifiers. 202 | /// 203 | /// If the specified character is restricted from use for identifiers, 204 | /// the return value is unspecified. 
205 | pub fn is_potential_mixed_script_confusable_char(c: char) -> bool { 206 | use crate::tables::potential_mixed_script_confusable::potential_mixed_script_confusable; 207 | 208 | potential_mixed_script_confusable(c) 209 | } 210 | -------------------------------------------------------------------------------- /src/restriction_level.rs: -------------------------------------------------------------------------------- 1 | //! For detecting the [restriction level](https://www.unicode.org/reports/tr39/#Restriction_Level_Detection) 2 | //! a string conforms to 3 | 4 | use crate::mixed_script::AugmentedScriptSet; 5 | use crate::GeneralSecurityProfile; 6 | use unicode_script::Script; 7 | 8 | #[derive(Copy, Clone, PartialEq, PartialOrd, Eq, Ord, Debug, Hash)] 9 | /// The [Restriction level](https://www.unicode.org/reports/tr39/#Restriction_Level_Detection) 10 | /// a string conforms to 11 | pub enum RestrictionLevel { 12 | /// https://www.unicode.org/reports/tr39/#ascii_only 13 | ASCIIOnly, 14 | /// https://www.unicode.org/reports/tr39/#single_script 15 | SingleScript, 16 | /// https://www.unicode.org/reports/tr39/#highly_restrictive 17 | HighlyRestrictive, 18 | /// https://www.unicode.org/reports/tr39/#moderately_restrictive 19 | ModeratelyRestrictive, 20 | /// https://www.unicode.org/reports/tr39/#minimally_restrictive 21 | MinimallyRestrictive, 22 | /// https://www.unicode.org/reports/tr39/#unrestricted 23 | Unrestricted, 24 | } 25 | 26 | /// Utilities for determining which [restriction level](https://www.unicode.org/reports/tr39/#Restriction_Level_Detection) 27 | /// a string satisfies 28 | pub trait RestrictionLevelDetection: Sized { 29 | /// Detect the [restriction level](https://www.unicode.org/reports/tr39/#Restriction_Level_Detection) 30 | /// 31 | /// This will _not_ check identifier well-formedness, as different applications may have different notions of well-formedness 32 | fn detect_restriction_level(self) -> RestrictionLevel; 33 | 34 | /// Check if a string 
satisfies the supplied [restriction level](https://www.unicode.org/reports/tr39/#Restriction_Level_Detection) 35 | /// 36 | /// This will _not_ check identifier well-formedness, as different applications may have different notions of well-formedness 37 | fn check_restriction_level(self, level: RestrictionLevel) -> bool { 38 | self.detect_restriction_level() <= level 39 | } 40 | } 41 | 42 | impl RestrictionLevelDetection for &'_ str { 43 | fn detect_restriction_level(self) -> RestrictionLevel { 44 | let mut ascii_only = true; 45 | let mut set = AugmentedScriptSet::default(); 46 | let mut exclude_latin_set = AugmentedScriptSet::default(); 47 | for ch in self.chars() { 48 | if !GeneralSecurityProfile::identifier_allowed(ch) { 49 | return RestrictionLevel::Unrestricted; 50 | } 51 | if !ch.is_ascii() { 52 | ascii_only = false; 53 | } 54 | let ch_set = ch.into(); 55 | set.intersect_with(ch_set); 56 | if !ch_set.base.contains_script(Script::Latin) { 57 | exclude_latin_set.intersect_with(ch_set); 58 | } 59 | } 60 | 61 | if ascii_only { 62 | return RestrictionLevel::ASCIIOnly; 63 | } else if !set.is_empty() { 64 | return RestrictionLevel::SingleScript; 65 | } else if exclude_latin_set.kore || exclude_latin_set.hanb || exclude_latin_set.jpan { 66 | return RestrictionLevel::HighlyRestrictive; 67 | } else if exclude_latin_set.base.len() == 1 { 68 | let script = exclude_latin_set.base.iter().next().unwrap(); 69 | if script.is_recommended() && script != Script::Cyrillic && script != Script::Greek { 70 | return RestrictionLevel::ModeratelyRestrictive; 71 | } 72 | } 73 | return RestrictionLevel::MinimallyRestrictive; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/tests.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2012-2015 The Rust Project Developers. 
See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | #[test] 12 | fn test_general_security_profile_identifier_allowed() { 13 | use crate::GeneralSecurityProfile; 14 | assert_eq!(GeneralSecurityProfile::identifier_allowed('A'), true); 15 | assert_eq!('A'.identifier_allowed(), true); 16 | assert_eq!(GeneralSecurityProfile::identifier_allowed('0'), true); 17 | assert_eq!('0'.identifier_allowed(), true); 18 | assert_eq!(GeneralSecurityProfile::identifier_allowed('_'), true); 19 | assert_eq!('_'.identifier_allowed(), true); 20 | assert_eq!(GeneralSecurityProfile::identifier_allowed('\x00'), false); 21 | assert_eq!('\x00'.identifier_allowed(), false); 22 | // U+00B5 MICRO SIGN 23 | assert_eq!(GeneralSecurityProfile::identifier_allowed('µ'), false); 24 | assert_eq!('µ'.identifier_allowed(), false); 25 | // U+2160 ROMAN NUMERAL ONE 26 | assert_eq!(GeneralSecurityProfile::identifier_allowed('Ⅰ'), false); 27 | assert_eq!('Ⅰ'.identifier_allowed(), false); 28 | } 29 | 30 | #[test] 31 | fn test_mixed_script() { 32 | use crate::MixedScript; 33 | assert_eq!("".is_single_script(), true); 34 | assert_eq!("".resolve_script_set().is_empty(), false); 35 | assert_eq!("".resolve_script_set().is_all(), true); 36 | assert_eq!("A".is_single_script(), true); 37 | assert_eq!("A".resolve_script_set().is_empty(), false); 38 | assert_eq!("A".resolve_script_set().is_all(), false); 39 | assert_eq!("A0".is_single_script(), true); 40 | assert_eq!("A0".resolve_script_set().is_empty(), false); 41 | assert_eq!("A0".resolve_script_set().is_all(), false); 42 | assert_eq!("0.".is_single_script(), true); 43 | assert_eq!("0.".resolve_script_set().is_empty(), false); 44 | assert_eq!("0.".resolve_script_set().is_all(), true); 
45 | assert_eq!("福".is_single_script(), true); 46 | assert_eq!("福".resolve_script_set().is_empty(), false); 47 | assert_eq!("福".resolve_script_set().is_all(), false); 48 | assert_eq!("冬の雪".is_single_script(), true); 49 | assert_eq!("冬の雪".resolve_script_set().is_empty(), false); 50 | assert_eq!("冬の雪".resolve_script_set().is_all(), false); 51 | assert_eq!("幻ㄒㄧㄤ".is_single_script(), true); 52 | assert_eq!("幻ㄒㄧㄤ".resolve_script_set().is_empty(), false); 53 | assert_eq!("幻ㄒㄧㄤ".resolve_script_set().is_all(), false); 54 | assert_eq!("日出은".is_single_script(), true); 55 | assert_eq!("日出은".resolve_script_set().is_empty(), false); 56 | assert_eq!("日出은".resolve_script_set().is_all(), false); 57 | assert_eq!("夏の幻ㄒㄧㄤ".is_single_script(), false); 58 | assert_eq!("夏の幻ㄒㄧㄤ".resolve_script_set().is_empty(), true); 59 | assert_eq!("夏の幻ㄒㄧㄤ".resolve_script_set().is_all(), false); 60 | } 61 | 62 | #[test] 63 | fn test_confusable_detection() { 64 | use crate::skeleton; 65 | use std::string::String; 66 | assert_eq!(&skeleton("").collect::(), ""); 67 | assert_eq!(&skeleton("s").collect::(), "s"); 68 | assert_eq!(&skeleton("sss").collect::(), "sss"); 69 | assert_eq!(&skeleton("s؜s؜s").collect::(), "sss"); 70 | assert_eq!(&skeleton("ﶛ").collect::(), "نمى"); 71 | assert_eq!(&skeleton("ﶛﶛ").collect::(), "نمىنمى"); 72 | } 73 | 74 | #[test] 75 | fn test_potential_mixed_script_detection() { 76 | use crate::is_potential_mixed_script_confusable_char; 77 | 78 | assert!(is_potential_mixed_script_confusable_char('A')); 79 | assert!(!is_potential_mixed_script_confusable_char('D')); 80 | } 81 | 82 | #[test] 83 | fn test_augmented_script_set_fmt_debug() { 84 | use crate::mixed_script::AugmentedScriptSet; 85 | let augmented_script_sets = vec![ 86 | AugmentedScriptSet::default(), 87 | AugmentedScriptSet::from('0'), 88 | AugmentedScriptSet::from('a'), 89 | AugmentedScriptSet::from('μ'), 90 | AugmentedScriptSet::from('汉'), 91 | AugmentedScriptSet::from('ひ'), 92 | AugmentedScriptSet::from('カ'), 93 | 
AugmentedScriptSet::from('한'), 94 | AugmentedScriptSet::from("汉ひ"), 95 | AugmentedScriptSet::from("汉a"), 96 | AugmentedScriptSet::from("汉μ"), 97 | AugmentedScriptSet::from("〆切"), 98 | ]; 99 | let debug_output = vec![ 100 | "AugmentedScriptSet {ALL}", 101 | "AugmentedScriptSet {ALL}", 102 | "AugmentedScriptSet {Latn}", 103 | "AugmentedScriptSet {Grek}", 104 | "AugmentedScriptSet {Hanb, Jpan, Kore, Hani}", 105 | "AugmentedScriptSet {Jpan, Hira}", 106 | "AugmentedScriptSet {Jpan, Kana}", 107 | "AugmentedScriptSet {Kore, Hang}", 108 | "AugmentedScriptSet {Jpan}", 109 | "AugmentedScriptSet {∅}", 110 | "AugmentedScriptSet {∅}", 111 | "AugmentedScriptSet {Hanb, Jpan, Kore, Hani}", 112 | ]; 113 | 114 | for (ss, output) in augmented_script_sets.into_iter().zip(debug_output) { 115 | assert_eq!(format!("{:?}", ss), output); 116 | } 117 | } 118 | 119 | #[test] 120 | fn test_augmented_script_set_fmt_display() { 121 | use crate::mixed_script::AugmentedScriptSet; 122 | let augmented_script_sets = vec![ 123 | AugmentedScriptSet::default(), 124 | AugmentedScriptSet::from('0'), 125 | AugmentedScriptSet::from('a'), 126 | AugmentedScriptSet::from('μ'), 127 | AugmentedScriptSet::from('汉'), 128 | AugmentedScriptSet::from('ひ'), 129 | AugmentedScriptSet::from('カ'), 130 | AugmentedScriptSet::from('한'), 131 | AugmentedScriptSet::from("汉ひ"), 132 | AugmentedScriptSet::from("汉a"), 133 | AugmentedScriptSet::from("汉μ"), 134 | AugmentedScriptSet::from("〆切"), 135 | ]; 136 | let debug_output = vec![ 137 | "All", 138 | "All", 139 | "Latin", 140 | "Greek", 141 | "Han with Bopomofo, Japanese, Korean, Han", 142 | "Japanese, Hiragana", 143 | "Japanese, Katakana", 144 | "Korean, Hangul", 145 | "Japanese", 146 | "Empty", 147 | "Empty", 148 | "Han with Bopomofo, Japanese, Korean, Han", 149 | ]; 150 | 151 | for (ss, output) in augmented_script_sets.into_iter().zip(debug_output) { 152 | assert_eq!(format!("{}", ss), output); 153 | } 154 | } 155 | 
--------------------------------------------------------------------------------