├── .github └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .gitmodules ├── Cargo.toml ├── LICENSE ├── README.md ├── resources └── tla-unicode.csv ├── src ├── lib.rs ├── main.rs └── strmeasure.rs └── tests ├── BlankFile.tla ├── InvalidSyntax.tla └── corpus_tests.rs /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Build & Test 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | branches: 8 | - main 9 | jobs: 10 | build-and-test: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [windows-latest, ubuntu-latest, macos-latest] 15 | fail-fast: false 16 | steps: 17 | - name: Clone repo 18 | uses: actions/checkout@v4 19 | with: 20 | submodules: true 21 | - name: Use stable rust toolchain 22 | run: rustup default stable 23 | - name: Build 24 | run: cargo build 25 | - name: Check Formatting 26 | run: cargo fmt --check 27 | - name: Test 28 | run: cargo test -- --nocapture 29 | 30 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | release: 4 | types: [created] 5 | jobs: 6 | release: 7 | runs-on: ${{ matrix.os }} 8 | strategy: 9 | matrix: 10 | os: [ubuntu-latest, macos-latest, windows-latest] 11 | include: 12 | - os: ubuntu-latest 13 | binname: tlauc-linux.tar.gz 14 | - os: macos-latest 15 | binname: tlauc-macos.tar.gz 16 | - os: windows-latest 17 | binname: tlauc-windows.zip 18 | fail-fast: true 19 | steps: 20 | - name: Clone repo 21 | uses: actions/checkout@v4 22 | - name: Set package version 23 | if: matrix.os != 'windows-latest' 24 | run: | 25 | sed -i -e "s/\"0\.0\.0\"/\"${{ github.ref_name }}\"/" Cargo.toml 26 | cat Cargo.toml 27 | - name: Set package version 28 | if: matrix.os == 'windows-latest' 29 | run: | 30 | function Convert-PackageFile { 31 | param($path, $source, $target) 32 | 
$packageFile = Get-Content -Path $path -Raw 33 | $updatedPackageFile = $packageFile -replace [Regex]::Escape($source), $target 34 | Set-Content -Path $path -Value $updatedPackageFile 35 | $updatedPackageFile 36 | } 37 | Convert-PackageFile 'Cargo.toml' '"0.0.0"' """${{ github.ref_name }}""" 38 | - name: Use stable rust toolchain 39 | run: rustup default stable 40 | - name: Build 41 | run: cargo build --release 42 | - name: Package Binary 43 | if: matrix.os == 'windows-latest' 44 | shell: pwsh 45 | run: Compress-Archive -Path target/release/tlauc.exe -DestinationPath ${{ matrix.binname }} 46 | - name: Package Binary 47 | if: matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest' 48 | run: tar -czvf ${{ matrix.binname }} -C target/release tlauc 49 | - name: Upload Binary 50 | uses: actions/upload-release-asset@v1 51 | env: 52 | GITHUB_TOKEN: ${{ secrets.GH_PAT }} 53 | with: 54 | upload_url: ${{ github.event.release.upload_url }} 55 | asset_path: ${{ matrix.binname }} 56 | asset_name: ${{ matrix.binname }} 57 | asset_content_type: application/gzip 58 | - name: Publish Crate 59 | if: matrix.os == 'ubuntu-latest' 60 | run: cargo publish --token ${{secrets.CRATES_AUTH_TOKEN}} --allow-dirty 61 | 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Cargo.lock 2 | target 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tests/corpus"] 2 | path = tests/corpus 3 | url = https://github.com/tlaplus/examples 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tlauc" 3 | description = "Rewrites TLA⁺ specs to use Unicode symbols instead of ASCII, and 
vice-versa" 4 | version = "0.0.0" 5 | authors = ["Andrew Helwer <2n8rn1w1f@mozmail.com>"] 6 | repository = "https://github.com/tlaplus-community/tlauc" 7 | license = "MIT" 8 | readme = "README.md" 9 | keywords = ["tla+", "tlaplus", "pluscal", "unicode"] 10 | categories = ["command-line-utilities", "text-editors"] 11 | edition = "2021" 12 | exclude = ["tests", ".github", ".gitignore", ".gitmodules"] 13 | 14 | [dependencies] 15 | anyhow = "1.0.81" 16 | clap = { version = "4.5.4", features = ["derive"] } 17 | csv = "1.3.0" 18 | serde = { version = "1.0.197", features = ["derive"] } 19 | streaming-iterator = "0.1.9" 20 | tree-sitter = "0.24.3" 21 | tree-sitter-language = "0.1.2" 22 | tree-sitter-tlaplus = "1.5.0" 23 | 24 | [dev-dependencies] 25 | glob = "0.3.1" 26 | rayon = "1.10.0" 27 | 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 tlaplus-community 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TLAUC: The TLA⁺ Unicode Converter 2 | [![Build & Test](https://github.com/tlaplus-community/tlauc/actions/workflows/ci.yml/badge.svg)](https://github.com/tlaplus-community/tlauc/actions/workflows/ci.yml) 3 | [![crates.io](https://img.shields.io/crates/v/tlauc.svg)](https://crates.io/crates/tlauc) 4 | 5 | Take the leap! Move from 6 | ```tla 7 | S^+ == {e \in S : e > 0} 8 | Infinitesimal == \A x \in Real^+: \E y \in Real^+: y < x 9 | ``` 10 | to 11 | ```tla 12 | S⁺ ≜ {e ∈ S : e > 0} 13 | Infinitesimal ≜ ∀ x ∈ ℝ⁺: ∃ y ∈ ℝ⁺: y < x 14 | ``` 15 | 16 | This package will take any ASCII TLA⁺ file and convert all its symbols to their Unicode equivalent, or take any Unicode TLA⁺ file and convert all its symbols to their ASCII equivalent. 17 | It consists of two crates: a library exposing this functionality (using [tree-sitter-tlaplus](https://github.com/tlaplus-community/tree-sitter-tlaplus) under the hood), and a command line wrapper. 
18 | 19 | Use this tool to: 20 | * Create a nice-looking copy of your spec that is pleasant to read but can still be edited and meaningfully tracked by source control 21 | * Convert your existing ASCII specs to Unicode and use them with Unicode-aware tooling like [tla-web](https://github.com/will62794/tla-web) or TLC 22 | * Confidently write specs in Unicode using [Neovim](https://github.com/tlaplus-community/tlaplus-nvim-plugin) or [Emacs](https://github.com/bugarela/tla-input) plugins then output their ASCII equivalent to a temporary file for use with legacy non-Unicode-aware tooling 23 | 24 | Note that GitHub itself uses the tree-sitter-tlaplus grammar for highlighting, so it supports Unicode TLA⁺ as shown in the highlighted code snippets here. 25 | SANY and TLC also now both support Unicode. 26 | 27 | The symbol mapping can be found in the [`./resources/tla-unicode.csv`](./resources/tla-unicode.csv) file, taken from the [TLA⁺ standard](https://github.com/tlaplus/rfcs/tree/2a772d9dd11acec5d7dedf30abfab91a49de48b8/accepted_rfcs/rfc5_unicode). 28 | The crate also provides programmatic access to these mappings. 29 | For an optimal TLA⁺ Unicode experience you'll want a monospace font that renders all these symbols in fixed width. 30 | 31 | ## Install & Use 32 | 33 | This crate contains both a library and its command line wrapper. 34 | 35 | To get the command line tool, either download it directly from [a release](https://github.com/tlaplus-community/tlauc/releases/latest) or install it with `cargo`: 36 | 1. Install rust: https://www.rust-lang.org/tools/install 37 | 1. Run `cargo install tlauc` 38 | 1. 
Ensure the [cargo installation directory](https://doc.rust-lang.org/cargo/commands/cargo-install.html#description) is on your path
39 | 
40 | From the command line, convert a TLA⁺ file from ASCII to Unicode in place as follows:
41 | ```sh
42 | tlauc Ascii.tla
43 | ```
44 | Convert from Unicode to ASCII in place:
45 | ```sh
46 | tlauc Unicode.tla --ascii
47 | ```
48 | To output to a separate file instead of overwriting the input, use the `--output` or `-o` parameter with a filepath.
49 | There are several safety checks performed during the translation process, like that the input spec parses correctly and that the output spec has the same parse tree as the input spec.
50 | You can override these safety checks with the `--force` or `-f` flag.
51 | 
52 | If parse errors exist their locations will be output as a best-effort list of line numbers.
53 | Unfortunately tree-sitter does not expose more advanced parse error reporting at this time.
54 | 
55 | To consume the library, add [the tlauc package](https://crates.io/crates/tlauc) as a dependency of your project then use it as follows:
56 | ```rs
57 | use tlauc::{rewrite, Mode};
58 | 
59 | fn main() {
60 |     let input = r#"---- MODULE TotalOrder ----
61 | EXTENDS Reals
62 | 
63 | Reflexive(S) == \A a \in S : a <= a
64 | Transitive(S) == \A a, b, c \in S : (a <= b /\ b <= c) => (a <= c)
65 | Antisymmetric(S) == \A a, b \in S : (a <= b /\ a >= b) => (a = b)
66 | Total(S) == \A a, b \in S : a <= b \/ a >= b
67 | IsTotallyOrdered(S) ==
68 |     /\ Reflexive(S)
69 |     /\ Transitive(S)
70 |     /\ Antisymmetric(S)
71 |     /\ Total(S)
72 | THEOREM RealsTotallyOrdered == IsTotallyOrdered(Real)
73 | ===="#;
74 |     println!("{}", rewrite(input, &Mode::AsciiToUnicode, false).unwrap());
75 | }
76 | ```
77 | which will output:
78 | ```tla
79 | ---- MODULE TotalOrder ----
80 | EXTENDS Reals
81 | 
82 | Reflexive(S) ≜ ∀ a ∈ S : a ≤ a
83 | Transitive(S) ≜ ∀ a, b, c ∈ S : (a ≤ b ∧ b ≤ c) ⇒ (a ≤ c)
84 | Antisymmetric(S) ≜ ∀ a, b ∈ S : (a ≤ b ∧ a ≥ b) ⇒ (a = b)
85 | Total(S) ≜ ∀ a, b ∈ S : a ≤ b ∨ a ≥ b
86 | IsTotallyOrdered(S) ≜
87 |     ∧ Reflexive(S)
88 |     ∧ Transitive(S)
89 |     ∧ Antisymmetric(S)
90 |     ∧ Total(S)
91 | THEOREM RealsTotallyOrdered ≜ IsTotallyOrdered(ℝ)
92 | ====
93 | ```
94 | Details of error handling and reading & writing files are left to the user, but you can look at the command line wrapper for an example.
95 | 
96 | Access the list of Unicode mappings as follows:
97 | ```rs
98 | use tlauc::{SymbolMapping, get_unicode_mappings};
99 | 
100 | fn main() {
101 |     let mappings: Vec<SymbolMapping> = get_unicode_mappings();
102 |     println!("{:#?}", mappings);
103 | }
104 | ```
105 | 
106 | ## Build & Test
107 | 
108 | 1. Install Rust: https://www.rust-lang.org/tools/install
109 | 1. Clone repo with the `--recurse-submodules` parameter
110 | 1. Run `cargo build`
111 | 1. Run `cargo test`
112 | 
113 | ## Details
114 | 
115 | TLA⁺ often has several ASCII symbols all representing the same operator (for example, `<=`, `=<`, and `\leq`); these will all map to the same Unicode symbol (`≤`), and when mapping back to ASCII the first ASCII symbol in the semicolon-separated CSV cell will be used (`<=`).
116 | 
117 | The reason this program isn't just a simple search & replace is that blank space and column alignment matters for some TLA⁺ constructs, specifically conjunction and disjunction lists (henceforth called jlists):
118 | 
119 | ```tla
120 | def == /\ A
121 |        /\ \/ B
122 |           \/ C
123 |        /\ D
124 | ```
125 | 
126 | If we were to naively replace every ASCII symbol with their Unicode
127 | equivalent, we would end up with:
128 | 
129 | ```tla
130 | def ≜ ∧ A
131 |        ∧ ∨ B
132 |           ∨ C
133 |        ∧ D
134 | ```
135 | 
136 | We see that both the jlists lost their alignment.
137 | This is unlikely to change the logical value of the expression, but is still undesirable.
138 | Thus we need to analyze the parse tree to find all jlists, and ensure our modifications maintain the alignments of their items.
139 | For this purpose we use [tree-sitter-tlaplus](https://github.com/tlaplus-community/tree-sitter-tlaplus), which correctly parses these constructs. 140 | The tree-sitter parse tree for the above (correctly aligned) code snippet is: 141 | 142 | ```sexp 143 | (operator_definition (identifier) (def_eq) 144 | (conj_list 145 | (conj_item (bullet_conj) (identifier_ref)) 146 | (conj_item 147 | (bullet_conj) 148 | (disj_list 149 | (disj_item (bullet_disj) (identifier_ref)) 150 | (disj_item (bullet_disj) (identifier_ref)) 151 | ) 152 | ) 153 | (conj_item (bullet_conj) (identifier_ref)) 154 | ) 155 | ) 156 | ``` 157 | For safety, the program checks to ensure the converted TLA⁺ file has the exact same parse tree as the original. 158 | It also will not convert the input file if it contains any parse errors. 159 | Both of these checks can be bypassed with the `--force` command line parameter (also exposed in the library). 160 | 161 | ## Algorithm 162 | 163 | The high-level conversion algorithm is as follows: 164 | 165 | 1. For each line in the input file, create two vectors: a jlist vector, and a symbol vector. 166 | 1. Parse the input file and use tree-sitter queries to identify the locations & scope of all jlists. 167 | For each line, push details of any jlists starting on that line onto the jlist vector, sorted from left to right. 168 | 1. Use tree-sitter queries to identify the locations of all symbols to be replaced. 169 | Sort the symbol locations by line and then push them onto the line's symbol vector, sorted from left to right. 170 | 1. For each line, iteratively pop the top element off the symbol vector and replace it in the text. 171 | If no jlists start to the right of that symbol the line, no further action is required; otherwise: 172 | 1. For each jlist starting to the right of the replaced symbol on that line, iterate through all subsequent bullet lines and add or remove spaces to fix the alignment. 
173 | Update the positions of entities in the jlist and symbol stacks on those lines. 174 | 1. For each jlist bullet alignment fixed, check whether any additional jlists start on that line; recursively fix their alignment with the same process until no jlists remain to be fixed. 175 | 176 | 1. After iterating through all lines, the process is complete; parse the converted tree and compare it to the original. 177 | They should be identical. 178 | 179 | ## Complications 180 | 181 | As always with variable-width UTF-8 encoded text, care must be taken to differentiate the byte index of a symbol (henceforth called a "codepoint") from its character index. 182 | We [long ago](https://www.joelonsoftware.com/2003/10/08/the-absolute-minimum-every-software-developer-absolutely-positively-must-know-about-unicode-and-character-sets-no-excuses/) left the world of "plain text = ASCII = characters are 1 byte". 183 | Now, each "character" is really a codepoint (an arbitrarily-large number identifying a symbol in the international Unicode standard) that can be 1 byte (as all the ASCII-equivalent codepoints remain, for seamless backward compatibility) or 2 bytes, or 3, 4, etc. 184 | Fundamentally this means that given a byte index, you can't know a codepoint's character index (here also called its "displayed" index) without reading from the beginning of whatever line you are on and counting how many codepoints you encounter. 185 | This complexity is of particular concern for this project, which involves a lot of maintaining text alignment, shifting, inserting Unicode symbols, and index arithmetic. 186 | Rust's type system proved very helpful here; instead of storing indices or offsets as primitive types like `usize` or `i8`, a number of wrapper types were defined to enforce index arithmetic safety at the type-checking level. 187 | You can only add or compare values of like types, and converting from one type to the other requires reading the indexed line of text from the beginning. 
188 | At the expense of some additional verbiage this greatly reduced the difficulty of keeping character and byte indices separate and reasoning about when it is appropriate to use each. 189 | For possible (but unlikely) future work there is even more complexity to be found with modifier codepoints, where multiple codepoints combine to form one "grapheme cluster" (what we would think of as a "character" in the ASCII world); for example, the grapheme cluster `é` can either be written directly as codepoint `U+00E9` or as codepoints `U+0301 U+0065`, where `U+0301` is the accent modifier ("combining diacritical mark") applied to `U+0065`, which is our familiar ASCII-equivalent code for `e`. 190 | This program does not handle grapheme clusters and (wrongly, but conveniently) assumes one codepoint = one displayed character. 191 | This would only ever be an issue if someone were to use modifiers in comments prepending alignment-sensitive syntax (see below), which is such a niche use case that for simplicity it will not be handled at this time. 192 | 193 | For actual syntax processing, the most troublesome edge case is as follows: 194 | ```tla 195 | op == /\ A 196 | /\ B 197 | => C 198 | ``` 199 | When converting from ASCII to Unicode using the naive algorithm, this results in: 200 | ```tla 201 | op ≜ ∧ A 202 | ∧ B 203 | ⇒ C 204 | ``` 205 | So this changes `(A ∧ B) ⇒ C` into `A ∧ (B ⇒ C)`, absolutely a different logical expression. 206 | The solution to this edge case is to look for infix operator nodes that are the parent of jlist nodes where the jlist is the left-hand expression. 207 | Thankfully this is easily done with the tree-sitter query `(bound_infix_op lhs: [(conj_list) (disj_list)]) @capture`. 208 | Then, record the operator symbol column offset relative to the jlist column, and maintain it as much as possible as the jlist is shifted. 
209 | The edge case is also present in the other direction when converting from Unicode to ASCII: 210 | ```tla 211 | op ≜ ∧ A 212 | ∧ B 213 | = C 214 | ∧ D 215 | = E 216 | ``` 217 | Which converts to: 218 | ```tla 219 | op == /\ A 220 | /\ B 221 | = C 222 | /\ D 223 | = E 224 | ``` 225 | So `(A ∧ (B = C)) ∧ (D = E)` is changed to `((A ∧ B) = C) ∧ D) = E`. 226 | This direction is substantially more difficult to detect via tree-sitter queries, since `B = C` can be an arbitrarily-long and complicated expression that eventually spills onto additional lines. 227 | Since this scenario is very unlikely to occur in the wild until large numbers of TLA⁺ specs are being written in Unicode first, this case is not currently handled by the program (see issue https://github.com/tlaplus-community/tlauc/issues/1). 228 | 229 | Another edge case involves block comments in the (usually empty) space before jlist items: 230 | ```tla 231 | op == /\ A 232 | (***) /\ B 233 | (***) /\ C 234 | ``` 235 | If one or more comments are present in this way they function as hard constraints on how much the jlist can be shifted to the left. 236 | This turns jlist shifting from a simple greedy algorithm into more of a constraint satisfaction problem, especially once nested jlists are involved or even combined with the infix operator edge case up above, forming a tricky corner case: 237 | ```tla 238 | op == /\ A 239 | (***) /\ \/ B 240 | (******) \/ C 241 | (***) => D 242 | ``` 243 | Note also that comments can include arbitrary Unicode symbols so care must be taken to use character indices instead of byte indices for column alignment (see discussion of Unicode difficulties above). 244 | Of course this means the jlists will not be aligned in non-Unicode-aware tooling, but that is the concern of the user; this tool does not modify comment text. 
245 | It really only seems feasible to assume one codepoint = one displayed character; alignment according to grapheme clusters would add unnecessary complication to a very niche use case. 246 | 247 | The block comment edge case has not been observed in the wild and so is not yet supported; see issue https://github.com/tlaplus-community/tlauc/issues/2. 248 | 249 | ## Prior Art 250 | 251 | [Ron Pressler](https://pron.github.io/) did [a lot of work](https://github.com/pron/tlaplus/commits/unicode-presentation-2) in early 2017 trying to add Unicode support to SANY and the TLA⁺ Toolbox, including replacing ASCII symbols with Unicode as the user types. 252 | He also wrote [a similar Unicode conversion tool](https://github.com/pron/tlaplus/tree/unicode-presentation-2/tlatools/src/tla2unicode) in Java, which faced many of the same challenges around jlist alignment. 253 | Unfortunately none of this work was upstreamed. 254 | 255 | -------------------------------------------------------------------------------- /resources/tla-unicode.csv: -------------------------------------------------------------------------------- 1 | Name,ASCII,Unicode,Unicode ID 2 | def_eq,==,≜,U+225C 3 | set_in,\in,∈,U+2208 4 | gets,<-,←,U+2190 5 | forall,\A;\forall,∀,U+2200 6 | exists,\E;\exists,∃,U+2203 7 | all_map_to,|->,↦,U+21A6 8 | maps_to,->,→,U+2192 9 | langle_bracket,<<,⟨,U+27E8 10 | rangle_bracket,>>,⟩,U+27E9 11 | rangle_bracket_sub,>>_,⟩_,U+27E9;U+005F 12 | case_box,[],□,U+25A1 13 | case_arrow,->,→,U+2192 14 | label_as,::,∷,U+2237 15 | lnot,~;\lnot;\neg,¬,U+00AC 16 | always,[],□,U+25A1 17 | eventually,<>,◇,U+25C7 18 | implies,=>,⇒,U+21D2 19 | plus_arrow,-+->,⇸,U+21F8 20 | equiv,\equiv,≡,U+2261 21 | iff,<=>,⇔,U+21D4 22 | leads_to,~>,↝,U+219D 23 | land,/\;\land,∧,U+2227 24 | lor,\/;\lor,∨,U+2228 25 | assign,:=,≔,U+2254 26 | bnf_rule,::=,⩴,U+2A74 27 | neq,/=;#,≠,U+2260 28 | leq,<=;=<;\leq,≤,U+2264 29 | geq,>=;\geq,≥,U+2265 30 | approx,\approx,≈,U+2248 31 | rs_ttile,|-,⊢,U+22A2 32 | 
rd_ttile,|=,⊨,U+22A8
33 | ls_ttile,-|,⊣,U+22A3
34 | ld_ttile,=|,⫤,U+2AE4
35 | asymp,\asymp,≍,U+224D
36 | cong,\cong,≅,U+2245
37 | doteq,\doteq,≐,U+2250
38 | gg,\gg,≫,U+226B
39 | ll,\ll,≪,U+226A
40 | in,\in,∈,U+2208
41 | notin,\notin,∉,U+2209
42 | prec,\prec,≺,U+227A
43 | succ,\succ,≻,U+227B
44 | preceq,\preceq,⪯,U+2AAF
45 | succeq,\succeq,⪰,U+2AB0
46 | propto,\propto,∝,U+221D
47 | sim,\sim,∼,U+223C
48 | simeq,\simeq,≃,U+2243
49 | sqsubset,\sqsubset,⊏,U+228F
50 | sqsupset,\sqsupset,⊐,U+2290
51 | sqsubseteq,\sqsubseteq,⊑,U+2291
52 | sqsupseteq,\sqsupseteq,⊒,U+2292
53 | subset,\subset,⊂,U+2282
54 | supset,\supset,⊃,U+2283
55 | subseteq,\subseteq,⊆,U+2286
56 | supseteq,\supseteq,⊇,U+2287
57 | cap,\intersect;\cap,∩,U+2229
58 | cup,\union;\cup,∪,U+222A
59 | dots_2,..,‥,U+2025
60 | dots_3,...,…,U+2026
61 | oplus,(+);\oplus,⊕,U+2295
62 | ominus,(-);\ominus,⊖,U+2296
63 | vertvert,||,‖,U+2016
64 | odot,(.);\odot,⊙,U+2299
65 | oslash,(/);\oslash,⊘,U+2298
66 | otimes,(\X);\otimes,⊗,U+2297
67 | bigcirc,\bigcirc,◯,U+25EF
68 | bullet,\bullet,●,U+25CF
69 | div,\div,÷,U+00F7
70 | circ,\o;\circ,∘,U+2218
71 | star,\star,⋆,U+22C6
72 | excl,!!,‼,U+203C
73 | qq,??,⁇,U+2047
74 | sqcap,\sqcap,⊓,U+2293
75 | sqcup,\sqcup,⊔,U+2294
76 | uplus,\uplus,⊎,U+228E
77 | times,\X;\times,×,U+00D7
78 | wr,\wr,≀,U+2240
79 | cdot,\cdot,⋅,U+22C5
80 | sup_plus,^+,⁺,U+207A
81 | nat_number_set,Nat,ℕ,U+2115
82 | int_number_set,Int,ℤ,U+2124
83 | real_number_set,Real,ℝ,U+211D
84 | bullet_conj,/\,∧,U+2227
85 | bullet_disj,\/,∨,U+2228
86 | 
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | mod strmeasure;
2 | use crate::strmeasure::*;
3 | 
4 | use serde::{Deserialize, Deserializer};
5 | use std::ops::Range;
6 | use streaming_iterator::StreamingIterator;
7 | use tree_sitter::{Node, Parser, Query, QueryCursor, Tree, TreeCursor};
8 | 
9 | pub enum Mode {
10 |     AsciiToUnicode,
11 |     
UnicodeToAscii, 12 | } 13 | 14 | #[derive(Debug)] 15 | pub enum TlaError { 16 | InputFileParseError { 17 | parse_tree: Tree, 18 | error_lines: Vec, 19 | }, 20 | OutputFileParseError { 21 | output_tree: Tree, 22 | output: String, 23 | }, 24 | InvalidTranslationError { 25 | input_tree: Tree, 26 | output_tree: Tree, 27 | output: String, 28 | first_diff: String, 29 | }, 30 | } 31 | 32 | pub fn rewrite(input: &str, mode: &Mode, force: bool) -> Result { 33 | let mut parser = Parser::new(); 34 | parser 35 | .set_language(&tree_sitter_tlaplus::LANGUAGE.into()) 36 | .expect("Error loading TLA⁺ grammar"); 37 | let mut cursor = QueryCursor::new(); 38 | 39 | // Parse input TLA⁺ file and construct data structures to hold information about it 40 | let input_tree = parser.parse(input, None).unwrap(); 41 | if !force && input_tree.root_node().has_error() { 42 | let error_lines = find_error_lines(&input_tree); 43 | return Err(TlaError::InputFileParseError { 44 | parse_tree: input_tree, 45 | error_lines, 46 | }); 47 | } 48 | 49 | let mut tla_lines = TlaLine::construct_from(input); 50 | 51 | // Identify & replace symbols 52 | mark_jlists(&input_tree, &mut cursor, &mut tla_lines); 53 | mark_symbols(&input_tree, &mut cursor, &mut tla_lines, mode); 54 | //println!("{:#?}", tla_lines); 55 | replace_symbols(&mut tla_lines); 56 | 57 | // if the input ends with '\n', we should put the '\n' back to output 58 | let extra_newline = input 59 | .chars() 60 | .last() 61 | .map_or("", |x| if x == '\n' { "\n" } else { "" }); 62 | 63 | // Ensure output parse tree is identical to input parse tree 64 | let output = TlaLine::output_from_lines(&tla_lines, &extra_newline); 65 | 66 | let output_tree = parser.parse(&output, None).unwrap(); 67 | if !force { 68 | if output_tree.root_node().has_error() { 69 | return Err(TlaError::OutputFileParseError { 70 | output_tree, 71 | output, 72 | }); 73 | } 74 | if let Err(first_diff) = compare_parse_trees(&input_tree, &output_tree) { 75 | return 
Err(TlaError::InvalidTranslationError { 76 | input_tree, 77 | output_tree, 78 | output, 79 | first_diff, 80 | }); 81 | } 82 | } 83 | 84 | Ok(output) 85 | } 86 | 87 | fn find_error_lines(tree: &Tree) -> Vec { 88 | let mut error_lines: Vec = vec![]; 89 | traverse_parse_tree(tree, |n| { 90 | if n.is_error() || n.is_missing() { 91 | error_lines.push(n.start_position().row + 1); 92 | } 93 | }); 94 | error_lines 95 | } 96 | 97 | fn traverse_parse_tree(tree: &Tree, mut visit: F) 98 | where 99 | F: FnMut(Node), 100 | { 101 | let mut cursor: TreeCursor = tree.walk(); 102 | loop { 103 | // Every time a new node is found the control flow passes here 104 | visit(cursor.node()); 105 | // Descend as far as possible 106 | if !cursor.goto_first_child() { 107 | loop { 108 | // Attempt to go to sibling 109 | if cursor.goto_next_sibling() { 110 | // If sibling exists, break out into descent loop 111 | break; 112 | } else { 113 | // If sibling does not exist, go to parent, then 114 | // parent's sibling in next loop iteration 115 | if !cursor.goto_parent() { 116 | // If parent does not exist, we are done 117 | return; 118 | } 119 | } 120 | } 121 | } 122 | } 123 | } 124 | 125 | fn compare_parse_trees(input_tree: &Tree, output_tree: &Tree) -> Result<(), String> { 126 | let mut input_cursor: TreeCursor = input_tree.walk(); 127 | let mut output_cursor: TreeCursor = output_tree.walk(); 128 | 129 | loop { 130 | check_node_equality(&input_cursor, &output_cursor)?; 131 | if !simultaneous_step(&mut input_cursor, &mut output_cursor, |c| { 132 | c.goto_first_child() 133 | })? { 134 | loop { 135 | if !simultaneous_step(&mut input_cursor, &mut output_cursor, |c| { 136 | c.goto_next_sibling() 137 | })? { 138 | if !simultaneous_step(&mut input_cursor, &mut output_cursor, |c| { 139 | c.goto_parent() 140 | })? 
{ 141 | return Ok(()); 142 | } 143 | } else { 144 | break; 145 | } 146 | } 147 | } 148 | } 149 | } 150 | 151 | fn simultaneous_step( 152 | input_cursor: &mut TreeCursor, 153 | output_cursor: &mut TreeCursor, 154 | step: fn(&mut TreeCursor) -> bool, 155 | ) -> Result { 156 | let (input_next, output_next) = (step(input_cursor), step(output_cursor)); 157 | if input_next != output_next { 158 | return Err(format!( 159 | "First diff: Input {:?} Output {:?}", 160 | input_cursor.node(), 161 | output_cursor.node() 162 | )); 163 | } 164 | 165 | Ok(input_next) 166 | } 167 | 168 | fn check_node_equality( 169 | input_cursor: &TreeCursor, 170 | output_cursor: &TreeCursor, 171 | ) -> Result<(), String> { 172 | if (input_cursor.node().is_named() || output_cursor.node().is_named()) 173 | && input_cursor.node().kind() != output_cursor.node().kind() 174 | { 175 | return Err(format!( 176 | "First diff: Input {:?} Output {:?}", 177 | input_cursor.node(), 178 | output_cursor.node() 179 | )); 180 | } 181 | 182 | Ok(()) 183 | } 184 | 185 | #[derive(Debug, Deserialize)] 186 | pub struct SymbolMapping { 187 | #[serde(rename = "Name")] 188 | name: String, 189 | #[serde( 190 | rename = "ASCII", 191 | deserialize_with = "vec_from_semicolon_separated_str" 192 | )] 193 | ascii: Vec, 194 | #[serde(rename = "Unicode")] 195 | unicode: String, 196 | } 197 | 198 | impl SymbolMapping { 199 | pub fn canonical_ascii(&self) -> &str { 200 | self.ascii.first().unwrap() 201 | } 202 | 203 | pub fn ascii_query(&self) -> String { 204 | let query = self 205 | .ascii 206 | .iter() 207 | .map(|a| a.replace('\\', "\\\\")) 208 | .map(|a| format!("\"{}\"", a)) 209 | .reduce(|a, b| a + " " + &b) 210 | .unwrap(); 211 | let name = &self.name; 212 | format!("({name} [{query}] @{name})") 213 | } 214 | 215 | pub fn unicode_query(&self) -> String { 216 | let name = &self.name; 217 | let unicode = &self.unicode; 218 | format!("({name} \"{unicode}\" @{name})") 219 | } 220 | 221 | fn target_symbol(&self, mode: &Mode) -> &str 
{ 222 | match mode { 223 | Mode::AsciiToUnicode => &self.unicode, 224 | Mode::UnicodeToAscii => self.canonical_ascii(), 225 | } 226 | } 227 | 228 | fn source_query(&self, mode: &Mode) -> String { 229 | match mode { 230 | Mode::AsciiToUnicode => self.ascii_query(), 231 | Mode::UnicodeToAscii => self.unicode_query(), 232 | } 233 | } 234 | 235 | fn chars_added(&self, mode: &Mode, src_symbol: &str) -> CharDiff { 236 | match mode { 237 | Mode::AsciiToUnicode => { 238 | CharQuantity(self.unicode.chars().count()) 239 | - CharQuantity(src_symbol.chars().count()) 240 | } 241 | Mode::UnicodeToAscii => { 242 | CharQuantity(self.canonical_ascii().chars().count()) 243 | - CharQuantity(self.unicode.chars().count()) 244 | } 245 | } 246 | } 247 | } 248 | 249 | fn vec_from_semicolon_separated_str<'de, D>(deserializer: D) -> Result, D::Error> 250 | where 251 | D: Deserializer<'de>, 252 | { 253 | let s: &str = Deserialize::deserialize(deserializer)?; 254 | Ok(s.split(';').map(|s| s.to_string()).collect()) 255 | } 256 | 257 | pub fn get_unicode_mappings() -> Vec { 258 | let csv = include_str!("../resources/tla-unicode.csv"); 259 | let mut reader = csv::Reader::from_reader(csv.as_bytes()); 260 | reader.deserialize().map(|result| result.unwrap()).collect() 261 | } 262 | 263 | #[derive(Debug)] 264 | struct TlaLine { 265 | text: String, 266 | jlists: Vec, 267 | symbols: Vec, 268 | } 269 | 270 | impl TlaLine { 271 | fn construct_from(input: &str) -> Vec { 272 | input 273 | .lines() 274 | .map(|line| TlaLine { 275 | jlists: Vec::new(), 276 | symbols: Vec::new(), 277 | text: line.to_string(), 278 | }) 279 | .collect() 280 | } 281 | 282 | // same as join("\n") + extra, 283 | // but to avoid unnecessary the reallocation, 284 | // ref: https://doc.rust-lang.org/src/alloc/slice.rs.html#787 285 | fn output_from_lines(tla_lines: &Vec, extra: &str) -> String { 286 | let mut iter = tla_lines.iter(); 287 | let first = match iter.next() { 288 | Some(first) => first, 289 | None => return 
extra.to_string(), 290 | }; 291 | let text_size = tla_lines.iter().map(|v| v.text.len()).sum::(); 292 | // Note: tla_lines.len() > 0 is always true 293 | let size = text_size + tla_lines.len() - 1 + extra.len(); 294 | let mut result = String::with_capacity(size); 295 | result.push_str(&first.text); 296 | for v in iter { 297 | result.push('\n'); 298 | result.push_str(&v.text); 299 | } 300 | result.push_str(extra); 301 | result 302 | } 303 | 304 | fn shift_jlists(&mut self, &diff: &CharDiff, &start_index: &CharQuantity) { 305 | for jlist in &mut self.jlists { 306 | if jlist.column > start_index { 307 | jlist.column = jlist.column + diff; 308 | } 309 | } 310 | } 311 | 312 | fn shift_symbols(&mut self, diff: &StrElementDiff, start_index: &StrElementQuantity) { 313 | for symbol in &mut self.symbols { 314 | if symbol.src_range.start.byte >= start_index.byte { 315 | symbol.src_range.start.byte = symbol.src_range.start.byte + diff.byte; 316 | symbol.src_range.end.byte = symbol.src_range.end.byte + diff.byte; 317 | } 318 | if symbol.src_range.start.char >= start_index.char { 319 | symbol.src_range.start.char = symbol.src_range.start.char + diff.char; 320 | symbol.src_range.end.char = symbol.src_range.end.char + diff.char; 321 | } 322 | } 323 | } 324 | } 325 | 326 | #[derive(Debug)] 327 | struct JList { 328 | column: CharQuantity, 329 | bullet_line_offsets: Vec, 330 | terminating_infix_op_offset: Option, 331 | } 332 | 333 | #[derive(Debug)] 334 | struct InfixOp { 335 | line_offset: usize, 336 | column: CharQuantity, 337 | } 338 | 339 | impl JList { 340 | fn query() -> Query { 341 | Query::new( 342 | &tree_sitter_tlaplus::LANGUAGE.into(), 343 | "[(conj_list) (disj_list)] @jlist", 344 | ) 345 | .unwrap() 346 | } 347 | 348 | fn terminating_infix_op_query() -> Query { 349 | Query::new( 350 | &tree_sitter_tlaplus::LANGUAGE.into(), 351 | "(bound_infix_op lhs: [(conj_list) (disj_list)]) @capture", 352 | ) 353 | .unwrap() 354 | } 355 | 356 | fn is_jlist_item_node(cursor: 
&TreeCursor) -> bool { 357 | "conj_item" == cursor.node().kind() || "disj_item" == cursor.node().kind() 358 | } 359 | } 360 | 361 | fn mark_jlists(tree: &Tree, query_cursor: &mut QueryCursor, tla_lines: &mut [TlaLine]) { 362 | let mut tree_cursor: TreeCursor = tree.walk(); 363 | let query = JList::query(); 364 | let mut captures = query_cursor.matches(&query, tree.root_node(), "".as_bytes()); 365 | while let Some(capture) = captures.next() { 366 | let node = capture.captures[0].node; 367 | let start_line = node.start_position().row; 368 | let line = &mut tla_lines[start_line]; 369 | let column = 370 | CharQuantity::from_byte_index(&ByteQuantity(node.start_position().column), &line.text); 371 | let mut jlist = JList { 372 | column, 373 | bullet_line_offsets: Vec::new(), 374 | terminating_infix_op_offset: None, 375 | }; 376 | tree_cursor.reset(node); 377 | tree_cursor.goto_first_child(); 378 | while { 379 | if JList::is_jlist_item_node(&tree_cursor) { 380 | jlist 381 | .bullet_line_offsets 382 | .push(tree_cursor.node().start_position().row - start_line); 383 | } 384 | 385 | tree_cursor.goto_next_sibling() 386 | } {} 387 | 388 | line.jlists.push(jlist); 389 | } 390 | 391 | let query = JList::terminating_infix_op_query(); 392 | let mut captures = query_cursor.matches(&query, tree.root_node(), "".as_bytes()); 393 | while let Some(capture) = captures.next() { 394 | let infix_op_node = capture.captures[0].node; 395 | let jlist_node = infix_op_node.child_by_field_name("lhs").unwrap(); 396 | let jlist_start_line_index = jlist_node.start_position().row; 397 | let (prefix, suffix) = tla_lines.split_at_mut(jlist_start_line_index + 1); 398 | let jlist_start_line = &mut prefix[jlist_start_line_index]; 399 | let jlist_column = CharQuantity::from_byte_index( 400 | &ByteQuantity(jlist_node.start_position().column), 401 | &jlist_start_line.text, 402 | ); 403 | let jlist = jlist_start_line 404 | .jlists 405 | .iter_mut() 406 | .find(|j| j.column == jlist_column) 407 | .unwrap(); 408 
| let symbol_node = infix_op_node.child_by_field_name("symbol").unwrap(); 409 | let symbol_line_offset = symbol_node.start_position().row - jlist_start_line_index; 410 | let symbol_line = &suffix[symbol_line_offset - 1]; 411 | let symbol_column = ByteQuantity(symbol_node.start_position().column); 412 | jlist.terminating_infix_op_offset = Some(InfixOp { 413 | line_offset: symbol_line_offset, 414 | column: CharQuantity::from_byte_index(&symbol_column, &symbol_line.text), 415 | }); 416 | } 417 | } 418 | 419 | #[derive(Debug)] 420 | struct Symbol { 421 | diff: CharDiff, 422 | src_range: Range, 423 | target: String, 424 | } 425 | 426 | fn mark_symbols(tree: &Tree, cursor: &mut QueryCursor, tla_lines: &mut [TlaLine], mode: &Mode) { 427 | let mappings = get_unicode_mappings(); 428 | let queries = &mappings 429 | .iter() 430 | .map(|s| s.source_query(mode)) 431 | .collect::>() 432 | .join(""); 433 | let query = Query::new(&tree_sitter_tlaplus::LANGUAGE.into(), queries).unwrap(); 434 | 435 | let mut captures = cursor.matches(&query, tree.root_node(), "".as_bytes()); 436 | while let Some(capture) = captures.next() { 437 | let capture = capture.captures[0]; 438 | let mapping = &mappings[capture.index as usize]; 439 | let start_position = capture.node.start_position(); 440 | let end_position = capture.node.end_position(); 441 | assert!(start_position.row == end_position.row); 442 | let line = &mut tla_lines[start_position.row]; 443 | let src_range = 444 | StrElementQuantity::from_byte_index(&ByteQuantity(start_position.column), &line.text) 445 | ..StrElementQuantity::from_byte_index( 446 | &ByteQuantity(end_position.column), 447 | &line.text, 448 | ); 449 | let src_symbol = &line.text[StrElementQuantity::as_byte_range(&src_range)]; 450 | let target = mapping.target_symbol(mode).to_string(); 451 | line.symbols.push(Symbol { 452 | diff: mapping.chars_added(mode, src_symbol), 453 | src_range, 454 | target, 455 | }); 456 | } 457 | } 458 | 459 | fn replace_symbols(tla_lines: &mut 
[TlaLine]) { 460 | for line_number in 0..tla_lines.len().saturating_add_signed(-1) { 461 | let (prefix, suffix) = tla_lines.split_at_mut(line_number + 1); 462 | let line = &mut prefix[line_number]; 463 | while let Some(symbol) = line.symbols.pop() { 464 | line.text.replace_range( 465 | StrElementQuantity::as_byte_range(&symbol.src_range), 466 | &symbol.target, 467 | ); 468 | line.shift_jlists(&symbol.diff, &symbol.src_range.start.char); 469 | fix_alignment(line, suffix, &symbol.diff, &symbol.src_range.start); 470 | } 471 | } 472 | } 473 | 474 | fn fix_alignment( 475 | line: &mut TlaLine, 476 | suffix: &mut [TlaLine], 477 | &diff: &CharDiff, 478 | symbol_start_index: &StrElementQuantity, 479 | ) { 480 | // If there was no net change in character count, there is no need to fix alignment 481 | if diff == CharDiff(0) { 482 | return; 483 | } 484 | 485 | // Recursively fix alignment of all jlist bullets 486 | for jlist in &mut line.jlists { 487 | // Ignore jlists starting before the index of modification in this line 488 | if jlist.column <= symbol_start_index.char { 489 | continue; 490 | } 491 | 492 | // Add or remove spaces from the start of the line for each bullet in this jlist 493 | let mod_index = StrElementQuantity { 494 | char: CharQuantity(0), 495 | byte: ByteQuantity(0), 496 | }; 497 | for &line_offset in &jlist.bullet_line_offsets { 498 | // Alignment of first element of jlist was already changed by original modification 499 | if line_offset == 0 { 500 | continue; 501 | } 502 | 503 | let (suffix_prefix, suffix_suffix) = suffix.split_at_mut(line_offset); 504 | let bullet_line = &mut suffix_prefix[line_offset - 1]; 505 | let bullet_column = jlist.column - diff; 506 | pad(bullet_line, &diff, &mod_index, &bullet_column); 507 | 508 | // Recursively fix alignment of any jlists starting on this line 509 | fix_alignment(bullet_line, suffix_suffix, &diff, &mod_index); 510 | } 511 | 512 | // Fix alignment of terminating infix op for this jlist, if it exists 513 | if let 
Some(infix_op_offset) = &mut jlist.terminating_infix_op_offset { 514 | let (suffix_prefix, suffix_suffix) = suffix.split_at_mut(infix_op_offset.line_offset); 515 | let infix_op_line = &mut suffix_prefix[infix_op_offset.line_offset - 1]; 516 | let diff = pad(infix_op_line, &diff, &mod_index, &infix_op_offset.column); 517 | infix_op_offset.column = infix_op_offset.column + diff; 518 | fix_alignment(infix_op_line, suffix_suffix, &diff, &mod_index); 519 | } 520 | } 521 | } 522 | 523 | fn pad( 524 | line: &mut TlaLine, 525 | &diff: &CharDiff, 526 | mod_index: &StrElementQuantity, 527 | &first_symbol_index: &CharQuantity, 528 | ) -> CharDiff { 529 | if diff < CharDiff(0) { 530 | // Calculate min to ensure we don't move a symbol to before the end of the line 531 | let spaces_to_remove = CharQuantity::min(diff.magnitude(), first_symbol_index); 532 | let bytes_to_remove = ByteQuantity::from_char_index(&spaces_to_remove, &line.text); 533 | line.text.drain(bytes_to_remove.range_to()); 534 | let diff = StrElementDiff { 535 | char: mod_index.char - spaces_to_remove, 536 | byte: mod_index.byte - bytes_to_remove, 537 | }; 538 | line.shift_jlists(&diff.char, &mod_index.char); 539 | line.shift_symbols(&diff, &mod_index); 540 | diff.char 541 | } else { 542 | let spaces_to_add = diff.magnitude(); 543 | line.text.insert_str(0, &spaces_to_add.repeat(" ")); 544 | let spaces_added_in_bytes = ByteQuantity::from_char_index(&spaces_to_add, &line.text); 545 | let diff = StrElementDiff { 546 | char: diff, 547 | byte: spaces_added_in_bytes - mod_index.byte, 548 | }; 549 | line.shift_jlists(&diff.char, &mod_index.char); 550 | line.shift_symbols(&diff, &mod_index); 551 | diff.char 552 | } 553 | } 554 | 555 | #[cfg(test)] 556 | mod tests { 557 | use super::*; 558 | use std::iter::zip; 559 | 560 | fn check_ascii_replaced(text: &str) { 561 | let mut parser = Parser::new(); 562 | parser 563 | .set_language(&tree_sitter_tlaplus::LANGUAGE.into()) 564 | .unwrap(); 565 | let tree = parser.parse(&text, 
None).unwrap(); 566 | assert!(!tree.root_node().has_error()); 567 | let mut cursor = QueryCursor::new(); 568 | let queries = get_unicode_mappings() 569 | .iter() 570 | .map(|s| s.ascii_query()) 571 | .collect::>() 572 | .join(""); 573 | let query = Query::new(&tree_sitter_tlaplus::LANGUAGE.into(), &queries).unwrap(); 574 | assert!(cursor 575 | .matches(&query, tree.root_node(), "".as_bytes()) 576 | .is_done()); 577 | } 578 | 579 | fn unwrap_conversion(input: Result) -> String { 580 | match input { 581 | Ok(converted) => converted, 582 | Err(TlaError::InputFileParseError { 583 | parse_tree, 584 | error_lines, 585 | }) => { 586 | panic!("{:?}\n{}", error_lines, parse_tree.root_node().to_sexp()) 587 | } 588 | Err(TlaError::OutputFileParseError { 589 | output_tree, 590 | output, 591 | }) => { 592 | panic!("{}\n{}", output, output_tree.root_node().to_sexp()) 593 | } 594 | Err(TlaError::InvalidTranslationError { 595 | input_tree: _, 596 | output_tree: _, 597 | output: _, 598 | first_diff, 599 | }) => { 600 | panic!("{}", first_diff) 601 | } 602 | } 603 | } 604 | 605 | fn run_roundtrip_test(expected: &str) { 606 | let intermediate = unwrap_conversion(rewrite(expected, &Mode::AsciiToUnicode, false)); 607 | check_ascii_replaced(&intermediate); 608 | let actual = unwrap_conversion(rewrite(&intermediate, &Mode::UnicodeToAscii, false)); 609 | assert_eq!( 610 | expected, actual, 611 | "\nExpected:\n{}\nActual:\n{}", 612 | expected, actual 613 | ); 614 | } 615 | 616 | #[test] 617 | fn basic_roundtrip() { 618 | run_roundtrip_test( 619 | r#" 620 | ---- MODULE Test ---- 621 | op == \A n \in Nat: n >= 0 622 | ===="#, 623 | ); 624 | } 625 | 626 | #[test] 627 | fn all_canonical_symbols_roundtrip() { 628 | run_roundtrip_test( 629 | r#" 630 | ---- MODULE Test ---- 631 | op == \A n \in Nat : \E r \in Real : ~(n = r) 632 | op == {x \in R : TRUE} 633 | op == INSTANCE Module WITH x <- y 634 | op == [n \in Nat |-> n] 635 | op == [Nat -> Real] 636 | op == <<1,2,3>> 637 | op == <<<>F>>_vars 
638 | op == CASE A -> B [] C -> D [] OTHER -> E 639 | op == label :: []P => Q 640 | op == A -+-> B \equiv C <=> D ~> E /\ F \/ G 641 | op == A := B ::= C /= D <= E >= F \approx G 642 | op == A |- B |= C -| D =| E \asymp F \cong G 643 | op == A \doteq B \gg C \ll D \in E \notin F \prec G 644 | op == A \succ B \preceq C \succeq D \propto E \sim F \simeq G 645 | op == A \sqsubset B \sqsupset C \sqsubseteq D \sqsupseteq E 646 | op == A \subset B \supset C \subseteq D \supseteq E 647 | op == A \intersect B \union C .. D ... E (+) F (-) G 648 | op == A || B (.) C (/) D (\X) E \bigcirc F \bullet G 649 | op == A \div B \o C \star D !! E ?? F \sqcap G 650 | op == A \sqcup B \uplus C \X D \wr E \cdot F ^+ 651 | ===="#, 652 | ); 653 | } 654 | 655 | #[test] 656 | fn all_non_canonical_symbols_roundtrip() { 657 | let expected = r#" 658 | ---- MODULE Test ---- 659 | op == \forall n \in Nat : TRUE 660 | op == \exists r \in Real : TRUE 661 | op == \neg P 662 | op == P \land Q 663 | op == P \lor Q 664 | op == x # y 665 | op == x =< y 666 | op == x \leq y 667 | op == x \geq y 668 | op == P \cap Q 669 | op == P \cup Q 670 | op == x \oplus y 671 | op == x \ominus y 672 | op == x \odot y 673 | op == x \oslash y 674 | op == x \otimes y 675 | op == x \circ y 676 | op == P \times Q 677 | ===="#; 678 | let intermediate = unwrap_conversion(rewrite(expected, &Mode::AsciiToUnicode, false)); 679 | check_ascii_replaced(&intermediate); 680 | let actual = unwrap_conversion(rewrite(&intermediate, &Mode::UnicodeToAscii, false)); 681 | // Only first and last lines should be the same 682 | for (i, (expected_line, actual_line)) in zip(expected.lines(), actual.lines()).enumerate() { 683 | if i <= 1 || i == expected.lines().count() - 1 { 684 | assert_eq!(expected_line, actual_line); 685 | } else { 686 | assert_ne!(expected_line, actual_line); 687 | } 688 | } 689 | } 690 | 691 | #[test] 692 | fn test_basic_jlist() { 693 | run_roundtrip_test( 694 | r#" 695 | ---- MODULE Test ---- 696 | op == /\ A 697 | /\ 
B 698 | /\ C 699 | /\ D 700 | ===="#, 701 | ); 702 | } 703 | 704 | #[test] 705 | fn test_nested_jlist() { 706 | run_roundtrip_test( 707 | r#" 708 | ---- MODULE Test ---- 709 | op == /\ A 710 | /\ \/ B 711 | \/ C 712 | /\ D 713 | ===="#, 714 | ); 715 | } 716 | 717 | #[test] 718 | fn test_full_binary_tree_jlist() { 719 | run_roundtrip_test( 720 | r#" 721 | ---- MODULE Test ---- 722 | op == /\ \/ /\ \/ /\ A 723 | /\ B 724 | \/ /\ C 725 | /\ D 726 | /\ \/ /\ E 727 | /\ F 728 | \/ /\ G 729 | /\ H 730 | \/ /\ \/ /\ I 731 | /\ J 732 | \/ /\ K 733 | /\ L 734 | /\ \/ /\ M 735 | /\ N 736 | \/ /\ O 737 | /\ P 738 | /\ \/ /\ \/ /\ Q 739 | /\ R 740 | \/ /\ S 741 | /\ T 742 | /\ \/ /\ U 743 | /\ V 744 | \/ /\ W 745 | /\ X 746 | \/ /\ \/ /\ Y 747 | /\ Z 748 | \/ /\ A 749 | /\ B 750 | /\ \/ /\ C 751 | /\ D 752 | \/ /\ E 753 | /\ F 754 | ===="#, 755 | ); 756 | } 757 | 758 | #[test] 759 | fn jlist_with_comments() { 760 | run_roundtrip_test( 761 | r#" 762 | ---- MODULE Test ---- 763 | op == /\ A 764 | /\ \/ B 765 | \* This is a comment 766 | \/ C 767 | (* This is another comment *) 768 | /\ D 769 | ===="#, 770 | ); 771 | } 772 | 773 | #[test] 774 | fn test_aligned_trailing_infix_op() { 775 | run_roundtrip_test( 776 | r#" 777 | ---- MODULE Test ---- 778 | op == /\ A 779 | /\ B 780 | => C 781 | ===="#, 782 | ); 783 | } 784 | 785 | #[test] 786 | fn test_trailing_infix_op_at_line_start() { 787 | let expected = r#" 788 | ---- MODULE Test ---- 789 | op == /\ A 790 | /\ B 791 | => C 792 | ===="#; 793 | let intermediate = unwrap_conversion(rewrite(expected, &Mode::AsciiToUnicode, false)); 794 | check_ascii_replaced(&intermediate); 795 | unwrap_conversion(rewrite(&intermediate, &Mode::UnicodeToAscii, false)); 796 | } 797 | 798 | #[test] 799 | fn test_nested_trailing_infix_op() { 800 | let expected = r#" 801 | ---- MODULE Test ---- 802 | op == /\ A 803 | /\ B 804 | => /\ C 805 | /\ \/ D 806 | \/ E 807 | => /\ F 808 | /\ G 809 | => H 810 | op == A <=> /\ B 811 | /\ C 812 | => D 813 | ===="#; 
814 | let intermediate = unwrap_conversion(rewrite(expected, &Mode::AsciiToUnicode, false)); 815 | check_ascii_replaced(&intermediate); 816 | unwrap_conversion(rewrite(&intermediate, &Mode::UnicodeToAscii, false)); 817 | } 818 | 819 | #[test] 820 | fn test_misaligned_jlist() { 821 | run_roundtrip_test( 822 | r#" 823 | ---- MODULE Test ---- 824 | op == /\ A 825 | /\ B 826 | /\ C 827 | ===="#, 828 | ); 829 | } 830 | 831 | // See https://github.com/tlaplus-community/tlauc/issues/11 832 | // Test translation of number sets in their three forms: 833 | // 1. As an expression 834 | // 2. As the left-hand-side of an operator definition 835 | // 3. As a reference to an imported module 836 | #[test] 837 | fn test_translate_number_set() { 838 | run_roundtrip_test( 839 | r#" 840 | ---- MODULE Test ---- 841 | Nat == Nat \union A!B!Nat 842 | Int == Int \union A!B!Int 843 | Real == Real \union A!B!Real 844 | ===="#, 845 | ); 846 | } 847 | 848 | // https://github.com/tlaplus-community/tlauc/issues/1 849 | #[ignore] 850 | #[test] 851 | fn test_infix_op_jlist_from_unicode() { 852 | run_roundtrip_test( 853 | r#" 854 | ---- MODULE Test ---- 855 | op ≜ ∧ A 856 | ∧ B 857 | = C 858 | ∧ D 859 | = E 860 | ===="#, 861 | ); 862 | } 863 | 864 | // https://github.com/tlaplus-community/tlauc/issues/2 865 | #[ignore] 866 | #[test] 867 | fn test_block_comments_prefixing_jlist_items() { 868 | run_roundtrip_test( 869 | r#" 870 | ---- MODULE Test ---- 871 | op == /\ A 872 | (***) /\ \/ B 873 | (******) \/ C 874 | (***) => D 875 | ===="#, 876 | ); 877 | } 878 | 879 | // Tests that file ends with newline (or without newline) 880 | #[test] 881 | fn test_empty_input() { 882 | let input = ""; 883 | let output = rewrite(&input, &Mode::UnicodeToAscii, true); 884 | assert_eq!(input, output.unwrap()); 885 | let output = rewrite(&input, &Mode::AsciiToUnicode, true); 886 | assert_eq!(input, output.unwrap()); 887 | } 888 | 889 | #[test] 890 | fn test_single_newline() { 891 | let input = "\n"; 892 | let output = 
rewrite(&input, &Mode::UnicodeToAscii, true); 893 | assert_eq!(input, output.unwrap()); 894 | let output = rewrite(&input, &Mode::AsciiToUnicode, true); 895 | assert_eq!(input, output.unwrap()); 896 | } 897 | 898 | #[test] 899 | fn test_normal_input_without_newline() { 900 | run_roundtrip_test( 901 | r#" 902 | ---- MODULE Test ---- 903 | op == 1 904 | ===="#, 905 | ); 906 | } 907 | 908 | #[test] 909 | fn test_normal_input_with_newline() { 910 | run_roundtrip_test( 911 | r#" 912 | ---- MODULE Test ---- 913 | op == 1 914 | ==== 915 | "#, 916 | ); 917 | } 918 | } 919 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{anyhow, Context, Result}; 2 | use clap::Parser; 3 | use std::fs::File; 4 | use std::io::{Read, Write}; 5 | use std::path::{Path, PathBuf}; 6 | use tlauc::{rewrite, Mode, TlaError}; 7 | 8 | #[derive(Parser)] 9 | #[command(author, version, about, long_about = None)] 10 | struct Args { 11 | #[arg(help = "Path to TLA⁺ file to convert")] 12 | input: PathBuf, 13 | 14 | #[arg( 15 | short, 16 | long, 17 | help = "Optional path to output; will overwrite input file by default" 18 | )] 19 | output: Option, 20 | 21 | #[arg( 22 | short, 23 | long, 24 | default_value_t = false, 25 | help = "Whether to force a best-effort conversion, ignoring TLA⁺ parse errors" 26 | )] 27 | force: bool, 28 | 29 | #[arg( 30 | long, 31 | default_value_t = false, 32 | help = "Convert the TLA⁺ file to ASCII instead of Unicode" 33 | )] 34 | ascii: bool, 35 | } 36 | 37 | fn main() -> Result<()> { 38 | let args = Args::parse(); 39 | let output_path = if let Some(output_path) = args.output { 40 | output_path 41 | } else { 42 | args.input.clone() 43 | }; 44 | convert( 45 | args.input.as_path(), 46 | output_path.as_path(), 47 | if args.ascii { 48 | Mode::UnicodeToAscii 49 | } else { 50 | Mode::AsciiToUnicode 51 | }, 52 | args.force, 53 | ) 54 | } 55 | 56 
| fn convert(input_path: &Path, output_path: &Path, mode: Mode, force: bool) -> Result<()> { 57 | let mut input = String::new(); 58 | { 59 | let mut input_file = File::open(input_path) 60 | .context(format!("Failed to open input file [{:?}]", input_path))?; 61 | input_file 62 | .read_to_string(&mut input) 63 | .context(format!("Failed to read input file [{:?}]", input_path))?; 64 | } 65 | 66 | match rewrite(&input, &mode, force) { 67 | Ok(output) => { 68 | let mut output_file = File::create(output_path)?; 69 | output_file.write_all(output.as_bytes()).context(format!("Failed to write to output file [{:?}]", output_path))?; 70 | Ok(()) 71 | }, 72 | Err(TlaError::InputFileParseError { error_lines, .. }) => { 73 | let line_msg = match error_lines.as_slice() { 74 | [] => "Could not identify line of first syntax error.".to_string(), 75 | [..] => format!("Syntax errors might occur on or near the following lines: {:?}.", error_lines) 76 | }; 77 | Err(anyhow!("Failed to correctly parse input TLA⁺ file; use --force flag to bypass this check.\n".to_string() + &line_msg)) 78 | } 79 | Err(TlaError::OutputFileParseError{..}) => Err(anyhow!("Failed to correctly parse converted TLA⁺ output; this is a bug, please report it to the maintainer! Use --force to bypass this check (not recommended).")), 80 | Err(TlaError::InvalidTranslationError { input_tree: _, output_tree: _, output: _, first_diff }) => { 81 | let err_msg = "Converted TLA⁺ parse tree differs from original; this is a bug, please report it to the maintainer! 
Use --force to bypass this check (not recommended)."; 82 | Err(anyhow!("{}\n{}", err_msg, first_diff)) 83 | } 84 | } 85 | } 86 | 87 | #[cfg(test)] 88 | mod tests { 89 | use super::*; 90 | 91 | #[test] 92 | // https://github.com/tlaplus-community/tlauc/issues/14 93 | fn test_input_file_unchanged_on_parse_failure() { 94 | let project_root = std::env::var("CARGO_MANIFEST_DIR").unwrap(); 95 | let input_path = PathBuf::from(project_root) 96 | .join("tests") 97 | .join("InvalidSyntax.tla"); 98 | let expected = std::fs::read_to_string(&input_path).unwrap(); 99 | let output_path = input_path.clone(); 100 | let result: Result<()> = convert( 101 | input_path.as_path(), 102 | output_path.as_path(), 103 | tlauc::Mode::AsciiToUnicode, 104 | false, 105 | ); 106 | assert!(result.is_err()); 107 | let actual = std::fs::read_to_string(&output_path).unwrap(); 108 | assert_eq!(expected, actual); 109 | } 110 | 111 | #[test] 112 | fn test_blank_input_file() { 113 | let project_root = std::env::var("CARGO_MANIFEST_DIR").unwrap(); 114 | let input_path = PathBuf::from(project_root) 115 | .join("tests") 116 | .join("BlankFile.tla"); 117 | let output_path = input_path.clone(); 118 | let result: Result<()> = convert( 119 | input_path.as_path(), 120 | output_path.as_path(), 121 | tlauc::Mode::AsciiToUnicode, 122 | false, 123 | ); 124 | assert!(result.is_err()); 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/strmeasure.rs: -------------------------------------------------------------------------------- 1 | use std::ops::{Add, Neg, Range, RangeTo, Sub}; 2 | 3 | #[derive(Debug, PartialEq, PartialOrd)] 4 | pub struct StrElementQuantity { 5 | pub char: CharQuantity, 6 | pub byte: ByteQuantity, 7 | } 8 | 9 | #[derive(Debug)] 10 | pub struct StrElementDiff { 11 | pub char: CharDiff, 12 | pub byte: ByteDiff, 13 | } 14 | 15 | impl StrElementQuantity { 16 | pub fn from_byte_index(&byte_index: &ByteQuantity, text: &str) -> Self { 17 | 
StrElementQuantity { 18 | char: CharQuantity::from_byte_index(&byte_index, text), 19 | byte: byte_index, 20 | } 21 | } 22 | 23 | pub fn as_byte_range(range: &Range) -> Range { 24 | let range = range.start.byte..range.end.byte; 25 | ByteQuantity::as_range(&range) 26 | } 27 | } 28 | 29 | #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] 30 | pub struct CharQuantity(pub usize); 31 | 32 | #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] 33 | pub struct ByteQuantity(pub usize); 34 | 35 | #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] 36 | pub struct CharDiff(pub i8); 37 | 38 | #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] 39 | pub struct ByteDiff(pub i8); 40 | 41 | impl CharQuantity { 42 | pub fn from_byte_index(byte_index: &ByteQuantity, text: &str) -> Self { 43 | CharQuantity(text[byte_index.range_to()].chars().count()) 44 | } 45 | 46 | pub fn repeat(&self, text: &str) -> String { 47 | text.repeat(self.0) 48 | } 49 | } 50 | 51 | impl ByteQuantity { 52 | pub fn from_char_index(char_index: &CharQuantity, text: &str) -> Self { 53 | match text.char_indices().nth(char_index.0) { 54 | Some((byte_index, _)) => ByteQuantity(byte_index), 55 | None => panic!("Cannot get character {} in string {}", char_index.0, text), 56 | } 57 | } 58 | 59 | pub fn as_range(range: &Range) -> Range { 60 | range.start.range(&range.end) 61 | } 62 | 63 | pub fn range_to(&self) -> RangeTo { 64 | ..self.0 65 | } 66 | 67 | fn range(&self, other: &ByteQuantity) -> Range { 68 | self.0..other.0 69 | } 70 | } 71 | 72 | impl CharDiff { 73 | pub fn magnitude(&self) -> CharQuantity { 74 | CharQuantity(i8::abs(self.0) as usize) 75 | } 76 | } 77 | 78 | impl Add for CharQuantity { 79 | type Output = Self; 80 | 81 | fn add(self, offset: CharDiff) -> Self::Output { 82 | let result = self.0 as i32 + offset.0 as i32; 83 | assert!( 84 | result >= 0, 85 | "Adding char offset to char index results in negative value: {} {}", 86 | self.0, 87 | offset.0 88 | ); 89 | 
CharQuantity(result as usize) 90 | } 91 | } 92 | 93 | impl Add for ByteQuantity { 94 | type Output = Self; 95 | 96 | fn add(self, offset: ByteDiff) -> Self::Output { 97 | let result = self.0 as i32 + offset.0 as i32; 98 | assert!( 99 | result >= 0, 100 | "Adding byte offset to byte index results in negative value: {} {}", 101 | self.0, 102 | offset.0 103 | ); 104 | ByteQuantity(result as usize) 105 | } 106 | } 107 | 108 | impl Add for CharDiff { 109 | type Output = Self; 110 | 111 | fn add(self, other: CharDiff) -> Self::Output { 112 | CharDiff(self.0 + other.0) 113 | } 114 | } 115 | 116 | impl Sub for CharQuantity { 117 | type Output = CharDiff; 118 | 119 | fn sub(self, other: CharQuantity) -> Self::Output { 120 | CharDiff((self.0 as i32 - other.0 as i32) as i8) 121 | } 122 | } 123 | 124 | impl Sub for CharQuantity { 125 | type Output = Self; 126 | 127 | fn sub(self, other: CharDiff) -> Self::Output { 128 | self + -other 129 | } 130 | } 131 | 132 | impl Sub for ByteQuantity { 133 | type Output = ByteDiff; 134 | 135 | fn sub(self, other: ByteQuantity) -> Self::Output { 136 | ByteDiff((self.0 as i32 - other.0 as i32) as i8) 137 | } 138 | } 139 | 140 | impl Sub for CharDiff { 141 | type Output = Self; 142 | 143 | fn sub(self, other: CharDiff) -> Self::Output { 144 | CharDiff(self.0 + -other.0) 145 | } 146 | } 147 | 148 | impl Neg for CharDiff { 149 | type Output = Self; 150 | 151 | fn neg(self) -> Self::Output { 152 | CharDiff(-self.0) 153 | } 154 | } 155 | 156 | impl Neg for ByteDiff { 157 | type Output = Self; 158 | 159 | fn neg(self) -> Self::Output { 160 | ByteDiff(-self.0) 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /tests/BlankFile.tla: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tlaplus-community/tlauc/39f45e218ec02c57eb5991b967bef6d0ac0e4968/tests/BlankFile.tla 
-------------------------------------------------------------------------------- /tests/InvalidSyntax.tla: -------------------------------------------------------------------------------- 1 | ---- MODULE test ---- 2 | foo == invalid, 3 | ==== 4 | 5 | -------------------------------------------------------------------------------- /tests/corpus_tests.rs: -------------------------------------------------------------------------------- 1 | mod corpus_tests { 2 | use glob::glob; 3 | use rayon::prelude::*; 4 | use std::ffi::OsStr; 5 | use std::fs::File; 6 | use std::io::Read; 7 | use std::path::PathBuf; 8 | use std::time::Instant; 9 | use tlauc::{rewrite, Mode, TlaError}; 10 | 11 | fn unwrap_conversion(input: Result, path: &PathBuf) -> String { 12 | match input { 13 | Ok(converted) => converted, 14 | Err(TlaError::InputFileParseError { .. }) => { 15 | panic!("Failed to parse input file [{:?}]", path) 16 | } 17 | Err(TlaError::OutputFileParseError { .. }) => { 18 | panic!("Failed to parse output file [{:?}]", path) 19 | } 20 | Err(TlaError::InvalidTranslationError { 21 | input_tree: _, 22 | output_tree: _, 23 | output: _, 24 | first_diff, 25 | }) => panic!( 26 | "Input/output parse tree mismatch for [{:?}]: [{:?}]", 27 | path, first_diff 28 | ), 29 | } 30 | } 31 | 32 | #[test] 33 | fn roundtrip_all_example_specs() { 34 | let start = Instant::now(); 35 | let skip: Vec<&str> = vec!["SomeSpecName.tla"]; 36 | println!("SKIPPING {:?}", skip); 37 | let skip: Vec<&OsStr> = skip.iter().map(|s| OsStr::new(s)).collect(); 38 | let paths: Vec = glob("tests/corpus/**/*.tla") 39 | .unwrap() 40 | .into_iter() 41 | .filter_map(|path| path.ok()) 42 | .filter(|path| !skip.contains(&path.file_name().unwrap())) 43 | .collect(); 44 | 45 | paths.par_iter().for_each(|path| { 46 | println!("{:?}", path); 47 | let mut input = String::new(); 48 | { 49 | let mut input_file = 50 | File::open(&path).expect(&format!("Failed to open input file [{:?}]", path)); 51 | input_file 52 | .read_to_string(&mut 
input) 53 | .expect(&format!("Failed to read input file [{:?}]", path)); 54 | } 55 | 56 | let intermediate = 57 | unwrap_conversion(rewrite(&input, &Mode::AsciiToUnicode, false), path); 58 | unwrap_conversion(rewrite(&intermediate, &Mode::UnicodeToAscii, false), path); 59 | }); 60 | 61 | println!("Corpus tests took {} seconds", start.elapsed().as_secs()); 62 | } 63 | } 64 | --------------------------------------------------------------------------------