├── .github └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .gitmodules ├── Cargo.toml ├── LICENSE ├── README.md ├── resources └── tla-unicode.csv ├── src ├── lib.rs ├── main.rs └── strmeasure.rs └── tests ├── BlankFile.tla ├── InvalidSyntax.tla └── corpus_tests.rs /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Build & Test 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | branches: 8 | - main 9 | jobs: 10 | build-and-test: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [windows-latest, ubuntu-latest, macos-latest] 15 | fail-fast: false 16 | steps: 17 | - name: Clone repo 18 | uses: actions/checkout@v4 19 | with: 20 | submodules: true 21 | - name: Use stable rust toolchain 22 | run: rustup default stable 23 | - name: Build 24 | run: cargo build 25 | - name: Check Formatting 26 | run: cargo fmt --check 27 | - name: Test 28 | run: cargo test -- --nocapture 29 | 30 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | release: 4 | types: [created] 5 | jobs: 6 | release: 7 | runs-on: ${{ matrix.os }} 8 | strategy: 9 | matrix: 10 | os: [ubuntu-latest, macos-latest, windows-latest] 11 | include: 12 | - os: ubuntu-latest 13 | binname: tlauc-linux.tar.gz 14 | - os: macos-latest 15 | binname: tlauc-macos.tar.gz 16 | - os: windows-latest 17 | binname: tlauc-windows.zip 18 | fail-fast: true 19 | steps: 20 | - name: Clone repo 21 | uses: actions/checkout@v4 22 | - name: Set package version 23 | if: matrix.os != 'windows-latest' 24 | run: | 25 | sed -i -e "s/\"0\.0\.0\"/\"${{ github.ref_name }}\"/" Cargo.toml 26 | cat Cargo.toml 27 | - name: Set package version 28 | if: matrix.os == 'windows-latest' 29 | run: | 30 | function Convert-PackageFile { 31 | param($path, $source, $target) 32 | 
$packageFile = Get-Content -Path $path -Raw 33 | $updatedPackageFile = $packageFile -replace [Regex]::Escape($source), $target 34 | Set-Content -Path $path -Value $updatedPackageFile 35 | $updatedPackageFile 36 | } 37 | Convert-PackageFile 'Cargo.toml' '"0.0.0"' """${{ github.ref_name }}""" 38 | - name: Use stable rust toolchain 39 | run: rustup default stable 40 | - name: Build 41 | run: cargo build --release 42 | - name: Package Binary 43 | if: matrix.os == 'windows-latest' 44 | shell: pwsh 45 | run: Compress-Archive -Path target/release/tlauc.exe -DestinationPath ${{ matrix.binname }} 46 | - name: Package Binary 47 | if: matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest' 48 | run: tar -czvf ${{ matrix.binname }} -C target/release tlauc 49 | - name: Upload Binary 50 | uses: actions/upload-release-asset@v1 51 | env: 52 | GITHUB_TOKEN: ${{ secrets.GH_PAT }} 53 | with: 54 | upload_url: ${{ github.event.release.upload_url }} 55 | asset_path: ${{ matrix.binname }} 56 | asset_name: ${{ matrix.binname }} 57 | asset_content_type: application/gzip 58 | - name: Publish Crate 59 | if: matrix.os == 'ubuntu-latest' 60 | run: cargo publish --token ${{secrets.CRATES_AUTH_TOKEN}} --allow-dirty 61 | 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Cargo.lock 2 | target 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tests/corpus"] 2 | path = tests/corpus 3 | url = https://github.com/tlaplus/examples 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tlauc" 3 | description = "Rewrites TLA⁺ specs to use Unicode symbols instead of ASCII, and 
vice-versa" 4 | version = "0.0.0" 5 | authors = ["Andrew Helwer <2n8rn1w1f@mozmail.com>"] 6 | repository = "https://github.com/tlaplus-community/tlauc" 7 | license = "MIT" 8 | readme = "README.md" 9 | keywords = ["tla+", "tlaplus", "pluscal", "unicode"] 10 | categories = ["command-line-utilities", "text-editors"] 11 | edition = "2021" 12 | exclude = ["tests", ".github", ".gitignore", ".gitmodules"] 13 | 14 | [dependencies] 15 | anyhow = "1.0.81" 16 | clap = { version = "4.5.4", features = ["derive"] } 17 | csv = "1.3.0" 18 | serde = { version = "1.0.197", features = ["derive"] } 19 | streaming-iterator = "0.1.9" 20 | tree-sitter = "0.24.3" 21 | tree-sitter-language = "0.1.2" 22 | tree-sitter-tlaplus = "1.5.0" 23 | 24 | [dev-dependencies] 25 | glob = "0.3.1" 26 | rayon = "1.10.0" 27 | 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 tlaplus-community 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TLAUC: The TLA⁺ Unicode Converter 2 | [![Build & Test](https://github.com/tlaplus-community/tlauc/actions/workflows/ci.yml/badge.svg)](https://github.com/tlaplus-community/tlauc/actions/workflows/ci.yml) 3 | [![crates.io](https://img.shields.io/crates/v/tlauc.svg)](https://crates.io/crates/tlauc) 4 | 5 | Take the leap! Move from 6 | ```tla 7 | S^+ == {e \in S : e > 0} 8 | Infinitesimal == \A x \in Real^+: \E y \in Real^+: y < x 9 | ``` 10 | to 11 | ```tla 12 | S⁺ ≜ {e ∈ S : e > 0} 13 | Infinitesimal ≜ ∀ x ∈ ℝ⁺: ∃ y ∈ ℝ⁺: y < x 14 | ``` 15 | 16 | This package will take any ASCII TLA⁺ file and convert all its symbols to their Unicode equivalent, or take any Unicode TLA⁺ file and convert all its symbols to their ASCII equivalent. 17 | It consists of two crates: a library exposing this functionality (using [tree-sitter-tlaplus](https://github.com/tlaplus-community/tree-sitter-tlaplus) under the hood), and a command line wrapper. 
18 | 19 | Use this tool to: 20 | * Create a nice-looking copy of your spec that is pleasant to read but can still be edited and meaningfully tracked by source control 21 | * Convert your existing ASCII specs to Unicode and use them with Unicode-aware tooling like [tla-web](https://github.com/will62794/tla-web) or TLC 22 | * Confidently write specs in Unicode using [Neovim](https://github.com/tlaplus-community/tlaplus-nvim-plugin) or [Emacs](https://github.com/bugarela/tla-input) plugins then output their ASCII equivalent to a temporary file for use with legacy non-Unicode-aware tooling 23 | 24 | Note that GitHub itself uses the tree-sitter-tlaplus grammar for highlighting, so it supports Unicode TLA⁺ as shown in the highlighted code snippets here. 25 | SANY and TLC also now both support Unicode. 26 | 27 | The symbol mapping can be found in the [`./resources/tla-unicode.csv`](./resources/tla-unicode.csv) file, taken from the [TLA⁺ standard](https://github.com/tlaplus/rfcs/tree/2a772d9dd11acec5d7dedf30abfab91a49de48b8/accepted_rfcs/rfc5_unicode). 28 | The crate also provides programmatic access to these mappings. 29 | For an optimal TLA⁺ Unicode experience you'll want a monospace font that renders all these symbols in fixed width. 30 | 31 | ## Install & Use 32 | 33 | This crate contains both a library and its command line wrapper. 34 | 35 | To get the command line tool, either download it directly from [a release](https://github.com/tlaplus-community/tlauc/releases/latest) or install it with `cargo`: 36 | 1. Install rust: https://www.rust-lang.org/tools/install 37 | 1. Run `cargo install tlauc` 38 | 1. 
Ensure the [cargo installation directory](https://doc.rust-lang.org/cargo/commands/cargo-install.html#description) is on your path
39 | 
40 | From the command line, convert a TLA⁺ file from ASCII to Unicode in place as follows:
41 | ```sh
42 | tlauc Ascii.tla
43 | ```
44 | Convert from Unicode to ASCII in place:
45 | ```sh
46 | tlauc Unicode.tla --ascii
47 | ```
48 | To output to a separate file instead of overwriting the input, use the `--output` or `-o` parameter with a filepath.
49 | There are several safety checks performed during the translation process, like that the input spec parses correctly and that the output spec has the same parse tree as the input spec.
50 | You can override these safety checks with the `--force` or `-f` flag.
51 | 
52 | If parse errors exist their locations will be output as a best-effort list of line numbers.
53 | Unfortunately tree-sitter does not expose more advanced parse error reporting at this time.
54 | 
55 | To consume the library, add [the tlauc package](https://crates.io/crates/tlauc) as a dependency of your project then use it as follows:
56 | ```rs
57 | use tlauc::{rewrite, Mode};
58 | 
59 | fn main() {
60 |     let input = r#"---- MODULE TotalOrder ----
61 | EXTENDS Reals
62 | 
63 | Reflexive(S) == \A a \in S : a <= a
64 | Transitive(S) == \A a, b, c \in S : (a <= b /\ b <= c) => (a <= c)
65 | Antisymmetric(S) == \A a, b \in S : (a <= b /\ a >= b) => (a = b)
66 | Total(S) == \A a, b \in S : a <= b \/ a >= b
67 | IsTotallyOrdered(S) ==
68 |     /\ Reflexive(S)
69 |     /\ Transitive(S)
70 |     /\ Antisymmetric(S)
71 |     /\ Total(S)
72 | THEOREM RealsTotallyOrdered == IsTotallyOrdered(Real)
73 | ===="#;
74 |     println!("{}", rewrite(input, &Mode::AsciiToUnicode, false).unwrap());
75 | }
76 | ```
77 | which will output:
78 | ```tla
79 | ---- MODULE TotalOrder ----
80 | EXTENDS Reals
81 | 
82 | Reflexive(S) ≜ ∀ a ∈ S : a ≤ a
83 | Transitive(S) ≜ ∀ a, b, c ∈ S : (a ≤ b ∧ b ≤ c) ⇒ (a ≤ c)
84 | Antisymmetric(S) ≜ ∀ a, b ∈ S : (a ≤ b ∧ a ≥ b) ⇒ (a = b)
85 | Total(S) ≜ ∀ a, b ∈ S : a ≤ b ∨ a ≥ b
86 | IsTotallyOrdered(S) ≜
87 |     ∧ Reflexive(S)
88 |     ∧ Transitive(S)
89 |     ∧ Antisymmetric(S)
90 |     ∧ Total(S)
91 | THEOREM RealsTotallyOrdered ≜ IsTotallyOrdered(ℝ)
92 | ====
93 | ```
94 | Details of error handling and reading & writing files are left to the user, but you can look at the command line wrapper for an example.
95 | 
96 | Access the list of Unicode mappings as follows:
97 | ```rs
98 | use tlauc::{SymbolMapping, get_unicode_mappings};
99 | 
100 | fn main() {
101 |     let mappings: Vec<SymbolMapping> = get_unicode_mappings();
102 |     println!("{:#?}", mappings);
103 | }
104 | ```
105 | 
106 | ## Build & Test
107 | 
108 | 1. Install Rust: https://www.rust-lang.org/tools/install
109 | 1. Clone repo with the `--recurse-submodules` parameter
110 | 1. Run `cargo build`
111 | 1. Run `cargo test`
112 | 
113 | ## Details
114 | 
115 | TLA⁺ often has several ASCII symbols all representing the same operator (for example, `<=`, `=<`, and `\leq`); these will all map to the same Unicode symbol (`≤`), and when mapping back to ASCII the first ASCII symbol in the semicolon-separated CSV cell will be used (`<=`).
116 | 
117 | The reason this program isn't just a simple search & replace is that blank space and column alignment matters for some TLA⁺ constructs, specifically conjunction and disjunction lists (henceforth called jlists):
118 | 
119 | ```tla
120 | def == /\ A
121 |        /\ \/ B
122 |           \/ C
123 |        /\ D
124 | ```
125 | 
126 | If we were to naively replace every ASCII symbol with their Unicode
127 | equivalent, we would end up with:
128 | 
129 | ```tla
130 | def ≜ ∧ A
131 |        ∧ ∨ B
132 |           ∨ C
133 |        ∧ D
134 | ```
135 | 
136 | We see that both the jlists lost their alignment.
137 | This is unlikely to change the logical value of the expression, but is still undesirable.
138 | Thus we need to analyze the parse tree to find all jlists, and ensure our modifications maintain the alignments of their items.
139 | For this purpose we use [tree-sitter-tlaplus](https://github.com/tlaplus-community/tree-sitter-tlaplus), which correctly parses these constructs. 140 | The tree-sitter parse tree for the above (correctly aligned) code snippet is: 141 | 142 | ```sexp 143 | (operator_definition (identifier) (def_eq) 144 | (conj_list 145 | (conj_item (bullet_conj) (identifier_ref)) 146 | (conj_item 147 | (bullet_conj) 148 | (disj_list 149 | (disj_item (bullet_disj) (identifier_ref)) 150 | (disj_item (bullet_disj) (identifier_ref)) 151 | ) 152 | ) 153 | (conj_item (bullet_conj) (identifier_ref)) 154 | ) 155 | ) 156 | ``` 157 | For safety, the program checks to ensure the converted TLA⁺ file has the exact same parse tree as the original. 158 | It also will not convert the input file if it contains any parse errors. 159 | Both of these checks can be bypassed with the `--force` command line parameter (also exposed in the library). 160 | 161 | ## Algorithm 162 | 163 | The high-level conversion algorithm is as follows: 164 | 165 | 1. For each line in the input file, create two vectors: a jlist vector, and a symbol vector. 166 | 1. Parse the input file and use tree-sitter queries to identify the locations & scope of all jlists. 167 | For each line, push details of any jlists starting on that line onto the jlist vector, sorted from left to right. 168 | 1. Use tree-sitter queries to identify the locations of all symbols to be replaced. 169 | Sort the symbol locations by line and then push them onto the line's symbol vector, sorted from left to right. 170 | 1. For each line, iteratively pop the top element off the symbol vector and replace it in the text. 171 | If no jlists start to the right of that symbol the line, no further action is required; otherwise: 172 | 1. For each jlist starting to the right of the replaced symbol on that line, iterate through all subsequent bullet lines and add or remove spaces to fix the alignment. 
173 | Update the positions of entities in the jlist and symbol stacks on those lines. 174 | 1. For each jlist bullet alignment fixed, check whether any additional jlists start on that line; recursively fix their alignment with the same process until no jlists remain to be fixed. 175 | 176 | 1. After iterating through all lines, the process is complete; parse the converted tree and compare it to the original. 177 | They should be identical. 178 | 179 | ## Complications 180 | 181 | As always with variable-width UTF-8 encoded text, care must be taken to differentiate the byte index of a symbol (henceforth called a "codepoint") from its character index. 182 | We [long ago](https://www.joelonsoftware.com/2003/10/08/the-absolute-minimum-every-software-developer-absolutely-positively-must-know-about-unicode-and-character-sets-no-excuses/) left the world of "plain text = ASCII = characters are 1 byte". 183 | Now, each "character" is really a codepoint (an arbitrarily-large number identifying a symbol in the international Unicode standard) that can be 1 byte (as all the ASCII-equivalent codepoints remain, for seamless backward compatibility) or 2 bytes, or 3, 4, etc. 184 | Fundamentally this means that given a byte index, you can't know a codepoint's character index (here also called its "displayed" index) without reading from the beginning of whatever line you are on and counting how many codepoints you encounter. 185 | This complexity is of particular concern for this project, which involves a lot of maintaining text alignment, shifting, inserting Unicode symbols, and index arithmetic. 186 | Rust's type system proved very helpful here; instead of storing indices or offsets as primitive types like `usize` or `i8`, a number of wrapper types were defined to enforce index arithmetic safety at the type-checking level. 187 | You can only add or compare values of like types, and converting from one type to the other requires reading the indexed line of text from the beginning. 
188 | At the expense of some additional verbiage this greatly reduced the difficulty of keeping character and byte indices separate and reasoning about when it is appropriate to use each. 189 | For possible (but unlikely) future work there is even more complexity to be found with modifier codepoints, where multiple codepoints combine to form one "grapheme cluster" (what we would think of as a "character" in the ASCII world); for example, the grapheme cluster `é` can either be written directly as codepoint `U+00E9` or as codepoints `U+0301 U+0065`, where `U+0301` is the accent modifier ("combining diacritical mark") applied to `U+0065`, which is our familiar ASCII-equivalent code for `e`. 190 | This program does not handle grapheme clusters and (wrongly, but conveniently) assumes one codepoint = one displayed character. 191 | This would only ever be an issue if someone were to use modifiers in comments prepending alignment-sensitive syntax (see below), which is such a niche use case that for simplicity it will not be handled at this time. 192 | 193 | For actual syntax processing, the most troublesome edge case is as follows: 194 | ```tla 195 | op == /\ A 196 | /\ B 197 | => C 198 | ``` 199 | When converting from ASCII to Unicode using the naive algorithm, this results in: 200 | ```tla 201 | op ≜ ∧ A 202 | ∧ B 203 | ⇒ C 204 | ``` 205 | So this changes `(A ∧ B) ⇒ C` into `A ∧ (B ⇒ C)`, absolutely a different logical expression. 206 | The solution to this edge case is to look for infix operator nodes that are the parent of jlist nodes where the jlist is the left-hand expression. 207 | Thankfully this is easily done with the tree-sitter query `(bound_infix_op lhs: [(conj_list) (disj_list)]) @capture`. 208 | Then, record the operator symbol column offset relative to the jlist column, and maintain it as much as possible as the jlist is shifted. 
209 | The edge case is also present in the other direction when converting from Unicode to ASCII: 210 | ```tla 211 | op ≜ ∧ A 212 | ∧ B 213 | = C 214 | ∧ D 215 | = E 216 | ``` 217 | Which converts to: 218 | ```tla 219 | op == /\ A 220 | /\ B 221 | = C 222 | /\ D 223 | = E 224 | ``` 225 | So `(A ∧ (B = C)) ∧ (D = E)` is changed to `((A ∧ B) = C) ∧ D) = E`. 226 | This direction is substantially more difficult to detect via tree-sitter queries, since `B = C` can be an arbitrarily-long and complicated expression that eventually spills onto additional lines. 227 | Since this scenario is very unlikely to occur in the wild until large numbers of TLA⁺ specs are being written in Unicode first, this case is not currently handled by the program (see issue https://github.com/tlaplus-community/tlauc/issues/1). 228 | 229 | Another edge case involves block comments in the (usually empty) space before jlist items: 230 | ```tla 231 | op == /\ A 232 | (***) /\ B 233 | (***) /\ C 234 | ``` 235 | If one or more comments are present in this way they function as hard constraints on how much the jlist can be shifted to the left. 236 | This turns jlist shifting from a simple greedy algorithm into more of a constraint satisfaction problem, especially once nested jlists are involved or even combined with the infix operator edge case up above, forming a tricky corner case: 237 | ```tla 238 | op == /\ A 239 | (***) /\ \/ B 240 | (******) \/ C 241 | (***) => D 242 | ``` 243 | Note also that comments can include arbitrary Unicode symbols so care must be taken to use character indices instead of byte indices for column alignment (see discussion of Unicode difficulties above). 244 | Of course this means the jlists will not be aligned in non-Unicode-aware tooling, but that is the concern of the user; this tool does not modify comment text. 
245 | It really only seems feasible to assume one codepoint = one displayed character; alignment according to grapheme clusters would add unnecessary complication to a very niche use case. 246 | 247 | The block comment edge case has not been observed in the wild and so is not yet supported; see issue https://github.com/tlaplus-community/tlauc/issues/2. 248 | 249 | ## Prior Art 250 | 251 | [Ron Pressler](https://pron.github.io/) did [a lot of work](https://github.com/pron/tlaplus/commits/unicode-presentation-2) in early 2017 trying to add Unicode support to SANY and the TLA⁺ Toolbox, including replacing ASCII symbols with Unicode as the user types. 252 | He also wrote [a similar Unicode conversion tool](https://github.com/pron/tlaplus/tree/unicode-presentation-2/tlatools/src/tla2unicode) in Java, which faced many of the same challenges around jlist alignment. 253 | Unfortunately none of this work was upstreamed. 254 | 255 | -------------------------------------------------------------------------------- /resources/tla-unicode.csv: -------------------------------------------------------------------------------- 1 | Name,ASCII,Unicode,Unicode ID 2 | def_eq,==,≜,U+225C 3 | set_in,\in,∈,U+2208 4 | gets,<-,←,U+2190 5 | forall,\A;\forall,∀,U+2200 6 | exists,\E;\exists,∃,U+2203 7 | all_map_to,|->,↦,U+21A6 8 | maps_to,->,→,U+2192 9 | langle_bracket,<<,⟨,U+27E8 10 | rangle_bracket,>>,⟩,U+27E9 11 | rangle_bracket_sub,>>_,⟩_,U+27E9;U+005F 12 | case_box,[],□,U+25A1 13 | case_arrow,->,→,U+2192 14 | label_as,::,∷,U+2237 15 | lnot,~;\lnot;\neg,¬,U+00AC 16 | always,[],□,U+25A1 17 | eventually,<>,◇,U+25C7 18 | implies,=>,⇒,U+21D2 19 | plus_arrow,-+->,⇸,U+21F8 20 | equiv,\equiv,≡,U+2261 21 | iff,<=>,⇔,U+21D4 22 | leads_to,~>,↝,U+219D 23 | land,/\;\land,∧,U+2227 24 | lor,\/;\lor,∨,U+2228 25 | assign,:=,≔,U+2254 26 | bnf_rule,::=,⩴,U+2A74 27 | neq,/=;#,≠,U+2260 28 | leq,<=;=<;\leq,≤,U+2264 29 | geq,>=;\geq,≥,U+2265 30 | approx,\approx,≈,U+2248 31 | rs_ttile,|-,⊢,U+22A2 32 | 
rd_ttile,|=,⊨,U+22A8
33 | ls_ttile,-|,⊣,U+22A3
34 | ld_ttile,=|,⫤,U+2AE4
35 | asymp,\asymp,≍,U+224D
36 | cong,\cong,≅,U+2245
37 | doteq,\doteq,≐,U+2250
38 | gg,\gg,≫,U+226B
39 | ll,\ll,≪,U+226A
40 | in,\in,∈,U+2208
41 | notin,\notin,∉,U+2209
42 | prec,\prec,≺,U+227A
43 | succ,\succ,≻,U+227B
44 | preceq,\preceq,⪯,U+2AAF
45 | succeq,\succeq,⪰,U+2AB0
46 | propto,\propto,∝,U+221D
47 | sim,\sim,∼,U+223C
48 | simeq,\simeq,≃,U+2243
49 | sqsubset,\sqsubset,⊏,U+228F
50 | sqsupset,\sqsupset,⊐,U+2290
51 | sqsubseteq,\sqsubseteq,⊑,U+2291
52 | sqsupseteq,\sqsupseteq,⊒,U+2292
53 | subset,\subset,⊂,U+2282
54 | supset,\supset,⊃,U+2283
55 | subseteq,\subseteq,⊆,U+2286
56 | supseteq,\supseteq,⊇,U+2287
57 | cap,\intersect;\cap,∩,U+2229
58 | cup,\union;\cup,∪,U+222A
59 | dots_2,..,‥,U+2025
60 | dots_3,...,…,U+2026
61 | oplus,(+);\oplus,⊕,U+2295
62 | ominus,(-);\ominus,⊖,U+2296
63 | vertvert,||,‖,U+2016
64 | odot,(.);\odot,⊙,U+2299
65 | oslash,(/);\oslash,⊘,U+2298
66 | otimes,(\X);\otimes,⊗,U+2297
67 | bigcirc,\bigcirc,◯,U+25EF
68 | bullet,\bullet,●,U+25CF
69 | div,\div,÷,U+00F7
70 | circ,\o;\circ,∘,U+2218
71 | star,\star,⋆,U+22C6
72 | excl,!!,‼,U+203C
73 | qq,??,⁇,U+2047
74 | sqcap,\sqcap,⊓,U+2293
75 | sqcup,\sqcup,⊔,U+2294
76 | uplus,\uplus,⊎,U+228E
77 | times,\X;\times,×,U+00D7
78 | wr,\wr,≀,U+2240
79 | cdot,\cdot,⋅,U+22C5
80 | sup_plus,^+,⁺,U+207A
81 | nat_number_set,Nat,ℕ,U+2115
82 | int_number_set,Int,ℤ,U+2124
83 | real_number_set,Real,ℝ,U+211D
84 | bullet_conj,/\,∧,U+2227
85 | bullet_disj,\/,∨,U+2228
86 | 
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | mod strmeasure;
2 | use crate::strmeasure::*;
3 | 
4 | use serde::{Deserialize, Deserializer};
5 | use std::ops::Range;
6 | use streaming_iterator::StreamingIterator;
7 | use tree_sitter::{Node, Parser, Query, QueryCursor, Tree, TreeCursor};
8 | 
9 | pub enum Mode {
10 |     AsciiToUnicode,
11 |     
UnicodeToAscii, 12 | } 13 | 14 | #[derive(Debug)] 15 | pub enum TlaError { 16 | InputFileParseError { 17 | parse_tree: Tree, 18 | error_lines: Vec, 19 | }, 20 | OutputFileParseError { 21 | output_tree: Tree, 22 | output: String, 23 | }, 24 | InvalidTranslationError { 25 | input_tree: Tree, 26 | output_tree: Tree, 27 | output: String, 28 | first_diff: String, 29 | }, 30 | } 31 | 32 | pub fn rewrite(input: &str, mode: &Mode, force: bool) -> Result { 33 | let mut parser = Parser::new(); 34 | parser 35 | .set_language(&tree_sitter_tlaplus::LANGUAGE.into()) 36 | .expect("Error loading TLA⁺ grammar"); 37 | let mut cursor = QueryCursor::new(); 38 | 39 | // Parse input TLA⁺ file and construct data structures to hold information about it 40 | let input_tree = parser.parse(input, None).unwrap(); 41 | if !force && input_tree.root_node().has_error() { 42 | let error_lines = find_error_lines(&input_tree); 43 | return Err(TlaError::InputFileParseError { 44 | parse_tree: input_tree, 45 | error_lines, 46 | }); 47 | } 48 | 49 | let mut tla_lines = TlaLine::construct_from(input); 50 | 51 | // Identify & replace symbols 52 | mark_jlists(&input_tree, &mut cursor, &mut tla_lines); 53 | mark_symbols(&input_tree, &mut cursor, &mut tla_lines, mode); 54 | //println!("{:#?}", tla_lines); 55 | replace_symbols(&mut tla_lines); 56 | 57 | // if the input ends with '\n', we should put the '\n' back to output 58 | let extra_newline = input 59 | .chars() 60 | .last() 61 | .map_or("", |x| if x == '\n' { "\n" } else { "" }); 62 | 63 | // Ensure output parse tree is identical to input parse tree 64 | let output = TlaLine::output_from_lines(&tla_lines, &extra_newline); 65 | 66 | let output_tree = parser.parse(&output, None).unwrap(); 67 | if !force { 68 | if output_tree.root_node().has_error() { 69 | return Err(TlaError::OutputFileParseError { 70 | output_tree, 71 | output, 72 | }); 73 | } 74 | if let Err(first_diff) = compare_parse_trees(&input_tree, &output_tree) { 75 | return 
Err(TlaError::InvalidTranslationError { 76 | input_tree, 77 | output_tree, 78 | output, 79 | first_diff, 80 | }); 81 | } 82 | } 83 | 84 | Ok(output) 85 | } 86 | 87 | fn find_error_lines(tree: &Tree) -> Vec { 88 | let mut error_lines: Vec = vec![]; 89 | traverse_parse_tree(tree, |n| { 90 | if n.is_error() || n.is_missing() { 91 | error_lines.push(n.start_position().row + 1); 92 | } 93 | }); 94 | error_lines 95 | } 96 | 97 | fn traverse_parse_tree(tree: &Tree, mut visit: F) 98 | where 99 | F: FnMut(Node), 100 | { 101 | let mut cursor: TreeCursor = tree.walk(); 102 | loop { 103 | // Every time a new node is found the control flow passes here 104 | visit(cursor.node()); 105 | // Descend as far as possible 106 | if !cursor.goto_first_child() { 107 | loop { 108 | // Attempt to go to sibling 109 | if cursor.goto_next_sibling() { 110 | // If sibling exists, break out into descent loop 111 | break; 112 | } else { 113 | // If sibling does not exist, go to parent, then 114 | // parent's sibling in next loop iteration 115 | if !cursor.goto_parent() { 116 | // If parent does not exist, we are done 117 | return; 118 | } 119 | } 120 | } 121 | } 122 | } 123 | } 124 | 125 | fn compare_parse_trees(input_tree: &Tree, output_tree: &Tree) -> Result<(), String> { 126 | let mut input_cursor: TreeCursor = input_tree.walk(); 127 | let mut output_cursor: TreeCursor = output_tree.walk(); 128 | 129 | loop { 130 | check_node_equality(&input_cursor, &output_cursor)?; 131 | if !simultaneous_step(&mut input_cursor, &mut output_cursor, |c| { 132 | c.goto_first_child() 133 | })? { 134 | loop { 135 | if !simultaneous_step(&mut input_cursor, &mut output_cursor, |c| { 136 | c.goto_next_sibling() 137 | })? { 138 | if !simultaneous_step(&mut input_cursor, &mut output_cursor, |c| { 139 | c.goto_parent() 140 | })? 
{ 141 | return Ok(()); 142 | } 143 | } else { 144 | break; 145 | } 146 | } 147 | } 148 | } 149 | } 150 | 151 | fn simultaneous_step( 152 | input_cursor: &mut TreeCursor, 153 | output_cursor: &mut TreeCursor, 154 | step: fn(&mut TreeCursor) -> bool, 155 | ) -> Result { 156 | let (input_next, output_next) = (step(input_cursor), step(output_cursor)); 157 | if input_next != output_next { 158 | return Err(format!( 159 | "First diff: Input {:?} Output {:?}", 160 | input_cursor.node(), 161 | output_cursor.node() 162 | )); 163 | } 164 | 165 | Ok(input_next) 166 | } 167 | 168 | fn check_node_equality( 169 | input_cursor: &TreeCursor, 170 | output_cursor: &TreeCursor, 171 | ) -> Result<(), String> { 172 | if (input_cursor.node().is_named() || output_cursor.node().is_named()) 173 | && input_cursor.node().kind() != output_cursor.node().kind() 174 | { 175 | return Err(format!( 176 | "First diff: Input {:?} Output {:?}", 177 | input_cursor.node(), 178 | output_cursor.node() 179 | )); 180 | } 181 | 182 | Ok(()) 183 | } 184 | 185 | #[derive(Debug, Deserialize)] 186 | pub struct SymbolMapping { 187 | #[serde(rename = "Name")] 188 | name: String, 189 | #[serde( 190 | rename = "ASCII", 191 | deserialize_with = "vec_from_semicolon_separated_str" 192 | )] 193 | ascii: Vec, 194 | #[serde(rename = "Unicode")] 195 | unicode: String, 196 | } 197 | 198 | impl SymbolMapping { 199 | pub fn canonical_ascii(&self) -> &str { 200 | self.ascii.first().unwrap() 201 | } 202 | 203 | pub fn ascii_query(&self) -> String { 204 | let query = self 205 | .ascii 206 | .iter() 207 | .map(|a| a.replace('\\', "\\\\")) 208 | .map(|a| format!("\"{}\"", a)) 209 | .reduce(|a, b| a + " " + &b) 210 | .unwrap(); 211 | let name = &self.name; 212 | format!("({name} [{query}] @{name})") 213 | } 214 | 215 | pub fn unicode_query(&self) -> String { 216 | let name = &self.name; 217 | let unicode = &self.unicode; 218 | format!("({name} \"{unicode}\" @{name})") 219 | } 220 | 221 | fn target_symbol(&self, mode: &Mode) -> &str 
{ 222 | match mode { 223 | Mode::AsciiToUnicode => &self.unicode, 224 | Mode::UnicodeToAscii => self.canonical_ascii(), 225 | } 226 | } 227 | 228 | fn source_query(&self, mode: &Mode) -> String { 229 | match mode { 230 | Mode::AsciiToUnicode => self.ascii_query(), 231 | Mode::UnicodeToAscii => self.unicode_query(), 232 | } 233 | } 234 | 235 | fn chars_added(&self, mode: &Mode, src_symbol: &str) -> CharDiff { 236 | match mode { 237 | Mode::AsciiToUnicode => { 238 | CharQuantity(self.unicode.chars().count()) 239 | - CharQuantity(src_symbol.chars().count()) 240 | } 241 | Mode::UnicodeToAscii => { 242 | CharQuantity(self.canonical_ascii().chars().count()) 243 | - CharQuantity(self.unicode.chars().count()) 244 | } 245 | } 246 | } 247 | } 248 | 249 | fn vec_from_semicolon_separated_str<'de, D>(deserializer: D) -> Result, D::Error> 250 | where 251 | D: Deserializer<'de>, 252 | { 253 | let s: &str = Deserialize::deserialize(deserializer)?; 254 | Ok(s.split(';').map(|s| s.to_string()).collect()) 255 | } 256 | 257 | pub fn get_unicode_mappings() -> Vec { 258 | let csv = include_str!("../resources/tla-unicode.csv"); 259 | let mut reader = csv::Reader::from_reader(csv.as_bytes()); 260 | reader.deserialize().map(|result| result.unwrap()).collect() 261 | } 262 | 263 | #[derive(Debug)] 264 | struct TlaLine { 265 | text: String, 266 | jlists: Vec, 267 | symbols: Vec, 268 | } 269 | 270 | impl TlaLine { 271 | fn construct_from(input: &str) -> Vec { 272 | input 273 | .lines() 274 | .map(|line| TlaLine { 275 | jlists: Vec::new(), 276 | symbols: Vec::new(), 277 | text: line.to_string(), 278 | }) 279 | .collect() 280 | } 281 | 282 | // same as join("\n") + extra, 283 | // but to avoid unnecessary the reallocation, 284 | // ref: https://doc.rust-lang.org/src/alloc/slice.rs.html#787 285 | fn output_from_lines(tla_lines: &Vec, extra: &str) -> String { 286 | let mut iter = tla_lines.iter(); 287 | let first = match iter.next() { 288 | Some(first) => first, 289 | None => return 
extra.to_string(), 290 | }; 291 | let text_size = tla_lines.iter().map(|v| v.text.len()).sum::(); 292 | // Note: tla_lines.len() > 0 is always true 293 | let size = text_size + tla_lines.len() - 1 + extra.len(); 294 | let mut result = String::with_capacity(size); 295 | result.push_str(&first.text); 296 | for v in iter { 297 | result.push('\n'); 298 | result.push_str(&v.text); 299 | } 300 | result.push_str(extra); 301 | result 302 | } 303 | 304 | fn shift_jlists(&mut self, &diff: &CharDiff, &start_index: &CharQuantity) { 305 | for jlist in &mut self.jlists { 306 | if jlist.column > start_index { 307 | jlist.column = jlist.column + diff; 308 | } 309 | } 310 | } 311 | 312 | fn shift_symbols(&mut self, diff: &StrElementDiff, start_index: &StrElementQuantity) { 313 | for symbol in &mut self.symbols { 314 | if symbol.src_range.start.byte >= start_index.byte { 315 | symbol.src_range.start.byte = symbol.src_range.start.byte + diff.byte; 316 | symbol.src_range.end.byte = symbol.src_range.end.byte + diff.byte; 317 | } 318 | if symbol.src_range.start.char >= start_index.char { 319 | symbol.src_range.start.char = symbol.src_range.start.char + diff.char; 320 | symbol.src_range.end.char = symbol.src_range.end.char + diff.char; 321 | } 322 | } 323 | } 324 | } 325 | 326 | #[derive(Debug)] 327 | struct JList { 328 | column: CharQuantity, 329 | bullet_line_offsets: Vec, 330 | terminating_infix_op_offset: Option, 331 | } 332 | 333 | #[derive(Debug)] 334 | struct InfixOp { 335 | line_offset: usize, 336 | column: CharQuantity, 337 | } 338 | 339 | impl JList { 340 | fn query() -> Query { 341 | Query::new( 342 | &tree_sitter_tlaplus::LANGUAGE.into(), 343 | "[(conj_list) (disj_list)] @jlist", 344 | ) 345 | .unwrap() 346 | } 347 | 348 | fn terminating_infix_op_query() -> Query { 349 | Query::new( 350 | &tree_sitter_tlaplus::LANGUAGE.into(), 351 | "(bound_infix_op lhs: [(conj_list) (disj_list)]) @capture", 352 | ) 353 | .unwrap() 354 | } 355 | 356 | fn is_jlist_item_node(cursor: 
&TreeCursor) -> bool { 357 | "conj_item" == cursor.node().kind() || "disj_item" == cursor.node().kind() 358 | } 359 | } 360 | 361 | fn mark_jlists(tree: &Tree, query_cursor: &mut QueryCursor, tla_lines: &mut [TlaLine]) { 362 | let mut tree_cursor: TreeCursor = tree.walk(); 363 | let query = JList::query(); 364 | let mut captures = query_cursor.matches(&query, tree.root_node(), "".as_bytes()); 365 | while let Some(capture) = captures.next() { 366 | let node = capture.captures[0].node; 367 | let start_line = node.start_position().row; 368 | let line = &mut tla_lines[start_line]; 369 | let column = 370 | CharQuantity::from_byte_index(&ByteQuantity(node.start_position().column), &line.text); 371 | let mut jlist = JList { 372 | column, 373 | bullet_line_offsets: Vec::new(), 374 | terminating_infix_op_offset: None, 375 | }; 376 | tree_cursor.reset(node); 377 | tree_cursor.goto_first_child(); 378 | while { 379 | if JList::is_jlist_item_node(&tree_cursor) { 380 | jlist 381 | .bullet_line_offsets 382 | .push(tree_cursor.node().start_position().row - start_line); 383 | } 384 | 385 | tree_cursor.goto_next_sibling() 386 | } {} 387 | 388 | line.jlists.push(jlist); 389 | } 390 | 391 | let query = JList::terminating_infix_op_query(); 392 | let mut captures = query_cursor.matches(&query, tree.root_node(), "".as_bytes()); 393 | while let Some(capture) = captures.next() { 394 | let infix_op_node = capture.captures[0].node; 395 | let jlist_node = infix_op_node.child_by_field_name("lhs").unwrap(); 396 | let jlist_start_line_index = jlist_node.start_position().row; 397 | let (prefix, suffix) = tla_lines.split_at_mut(jlist_start_line_index + 1); 398 | let jlist_start_line = &mut prefix[jlist_start_line_index]; 399 | let jlist_column = CharQuantity::from_byte_index( 400 | &ByteQuantity(jlist_node.start_position().column), 401 | &jlist_start_line.text, 402 | ); 403 | let jlist = jlist_start_line 404 | .jlists 405 | .iter_mut() 406 | .find(|j| j.column == jlist_column) 407 | .unwrap(); 408 
| let symbol_node = infix_op_node.child_by_field_name("symbol").unwrap(); 409 | let symbol_line_offset = symbol_node.start_position().row - jlist_start_line_index; 410 | let symbol_line = &suffix[symbol_line_offset - 1]; 411 | let symbol_column = ByteQuantity(symbol_node.start_position().column); 412 | jlist.terminating_infix_op_offset = Some(InfixOp { 413 | line_offset: symbol_line_offset, 414 | column: CharQuantity::from_byte_index(&symbol_column, &symbol_line.text), 415 | }); 416 | } 417 | } 418 | 419 | #[derive(Debug)] 420 | struct Symbol { 421 | diff: CharDiff, 422 | src_range: Range, 423 | target: String, 424 | } 425 | 426 | fn mark_symbols(tree: &Tree, cursor: &mut QueryCursor, tla_lines: &mut [TlaLine], mode: &Mode) { 427 | let mappings = get_unicode_mappings(); 428 | let queries = &mappings 429 | .iter() 430 | .map(|s| s.source_query(mode)) 431 | .collect::>() 432 | .join(""); 433 | let query = Query::new(&tree_sitter_tlaplus::LANGUAGE.into(), queries).unwrap(); 434 | 435 | let mut captures = cursor.matches(&query, tree.root_node(), "".as_bytes()); 436 | while let Some(capture) = captures.next() { 437 | let capture = capture.captures[0]; 438 | let mapping = &mappings[capture.index as usize]; 439 | let start_position = capture.node.start_position(); 440 | let end_position = capture.node.end_position(); 441 | assert!(start_position.row == end_position.row); 442 | let line = &mut tla_lines[start_position.row]; 443 | let src_range = 444 | StrElementQuantity::from_byte_index(&ByteQuantity(start_position.column), &line.text) 445 | ..StrElementQuantity::from_byte_index( 446 | &ByteQuantity(end_position.column), 447 | &line.text, 448 | ); 449 | let src_symbol = &line.text[StrElementQuantity::as_byte_range(&src_range)]; 450 | let target = mapping.target_symbol(mode).to_string(); 451 | line.symbols.push(Symbol { 452 | diff: mapping.chars_added(mode, src_symbol), 453 | src_range, 454 | target, 455 | }); 456 | } 457 | } 458 | 459 | fn replace_symbols(tla_lines: &mut 
[TlaLine]) { 460 | for line_number in 0..tla_lines.len().saturating_add_signed(-1) { 461 | let (prefix, suffix) = tla_lines.split_at_mut(line_number + 1); 462 | let line = &mut prefix[line_number]; 463 | while let Some(symbol) = line.symbols.pop() { 464 | line.text.replace_range( 465 | StrElementQuantity::as_byte_range(&symbol.src_range), 466 | &symbol.target, 467 | ); 468 | line.shift_jlists(&symbol.diff, &symbol.src_range.start.char); 469 | fix_alignment(line, suffix, &symbol.diff, &symbol.src_range.start); 470 | } 471 | } 472 | } 473 | 474 | fn fix_alignment( 475 | line: &mut TlaLine, 476 | suffix: &mut [TlaLine], 477 | &diff: &CharDiff, 478 | symbol_start_index: &StrElementQuantity, 479 | ) { 480 | // If there was no net change in character count, there is no need to fix alignment 481 | if diff == CharDiff(0) { 482 | return; 483 | } 484 | 485 | // Recursively fix alignment of all jlist bullets 486 | for jlist in &mut line.jlists { 487 | // Ignore jlists starting before the index of modification in this line 488 | if jlist.column <= symbol_start_index.char { 489 | continue; 490 | } 491 | 492 | // Add or remove spaces from the start of the line for each bullet in this jlist 493 | let mod_index = StrElementQuantity { 494 | char: CharQuantity(0), 495 | byte: ByteQuantity(0), 496 | }; 497 | for &line_offset in &jlist.bullet_line_offsets { 498 | // Alignment of first element of jlist was already changed by original modification 499 | if line_offset == 0 { 500 | continue; 501 | } 502 | 503 | let (suffix_prefix, suffix_suffix) = suffix.split_at_mut(line_offset); 504 | let bullet_line = &mut suffix_prefix[line_offset - 1]; 505 | let bullet_column = jlist.column - diff; 506 | pad(bullet_line, &diff, &mod_index, &bullet_column); 507 | 508 | // Recursively fix alignment of any jlists starting on this line 509 | fix_alignment(bullet_line, suffix_suffix, &diff, &mod_index); 510 | } 511 | 512 | // Fix alignment of terminating infix op for this jlist, if it exists 513 | if let 
Some(infix_op_offset) = &mut jlist.terminating_infix_op_offset { 514 | let (suffix_prefix, suffix_suffix) = suffix.split_at_mut(infix_op_offset.line_offset); 515 | let infix_op_line = &mut suffix_prefix[infix_op_offset.line_offset - 1]; 516 | let diff = pad(infix_op_line, &diff, &mod_index, &infix_op_offset.column); 517 | infix_op_offset.column = infix_op_offset.column + diff; 518 | fix_alignment(infix_op_line, suffix_suffix, &diff, &mod_index); 519 | } 520 | } 521 | } 522 | 523 | fn pad( 524 | line: &mut TlaLine, 525 | &diff: &CharDiff, 526 | mod_index: &StrElementQuantity, 527 | &first_symbol_index: &CharQuantity, 528 | ) -> CharDiff { 529 | if diff < CharDiff(0) { 530 | // Calculate min to ensure we don't move a symbol to before the end of the line 531 | let spaces_to_remove = CharQuantity::min(diff.magnitude(), first_symbol_index); 532 | let bytes_to_remove = ByteQuantity::from_char_index(&spaces_to_remove, &line.text); 533 | line.text.drain(bytes_to_remove.range_to()); 534 | let diff = StrElementDiff { 535 | char: mod_index.char - spaces_to_remove, 536 | byte: mod_index.byte - bytes_to_remove, 537 | }; 538 | line.shift_jlists(&diff.char, &mod_index.char); 539 | line.shift_symbols(&diff, &mod_index); 540 | diff.char 541 | } else { 542 | let spaces_to_add = diff.magnitude(); 543 | line.text.insert_str(0, &spaces_to_add.repeat(" ")); 544 | let spaces_added_in_bytes = ByteQuantity::from_char_index(&spaces_to_add, &line.text); 545 | let diff = StrElementDiff { 546 | char: diff, 547 | byte: spaces_added_in_bytes - mod_index.byte, 548 | }; 549 | line.shift_jlists(&diff.char, &mod_index.char); 550 | line.shift_symbols(&diff, &mod_index); 551 | diff.char 552 | } 553 | } 554 | 555 | #[cfg(test)] 556 | mod tests { 557 | use super::*; 558 | use std::iter::zip; 559 | 560 | fn check_ascii_replaced(text: &str) { 561 | let mut parser = Parser::new(); 562 | parser 563 | .set_language(&tree_sitter_tlaplus::LANGUAGE.into()) 564 | .unwrap(); 565 | let tree = parser.parse(&text, 
None).unwrap(); 566 | assert!(!tree.root_node().has_error()); 567 | let mut cursor = QueryCursor::new(); 568 | let queries = get_unicode_mappings() 569 | .iter() 570 | .map(|s| s.ascii_query()) 571 | .collect::>() 572 | .join(""); 573 | let query = Query::new(&tree_sitter_tlaplus::LANGUAGE.into(), &queries).unwrap(); 574 | assert!(cursor 575 | .matches(&query, tree.root_node(), "".as_bytes()) 576 | .is_done()); 577 | } 578 | 579 | fn unwrap_conversion(input: Result) -> String { 580 | match input { 581 | Ok(converted) => converted, 582 | Err(TlaError::InputFileParseError { 583 | parse_tree, 584 | error_lines, 585 | }) => { 586 | panic!("{:?}\n{}", error_lines, parse_tree.root_node().to_sexp()) 587 | } 588 | Err(TlaError::OutputFileParseError { 589 | output_tree, 590 | output, 591 | }) => { 592 | panic!("{}\n{}", output, output_tree.root_node().to_sexp()) 593 | } 594 | Err(TlaError::InvalidTranslationError { 595 | input_tree: _, 596 | output_tree: _, 597 | output: _, 598 | first_diff, 599 | }) => { 600 | panic!("{}", first_diff) 601 | } 602 | } 603 | } 604 | 605 | fn run_roundtrip_test(expected: &str) { 606 | let intermediate = unwrap_conversion(rewrite(expected, &Mode::AsciiToUnicode, false)); 607 | check_ascii_replaced(&intermediate); 608 | let actual = unwrap_conversion(rewrite(&intermediate, &Mode::UnicodeToAscii, false)); 609 | assert_eq!( 610 | expected, actual, 611 | "\nExpected:\n{}\nActual:\n{}", 612 | expected, actual 613 | ); 614 | } 615 | 616 | #[test] 617 | fn basic_roundtrip() { 618 | run_roundtrip_test( 619 | r#" 620 | ---- MODULE Test ---- 621 | op == \A n \in Nat: n >= 0 622 | ===="#, 623 | ); 624 | } 625 | 626 | #[test] 627 | fn all_canonical_symbols_roundtrip() { 628 | run_roundtrip_test( 629 | r#" 630 | ---- MODULE Test ---- 631 | op == \A n \in Nat : \E r \in Real : ~(n = r) 632 | op == {x \in R : TRUE} 633 | op == INSTANCE Module WITH x <- y 634 | op == [n \in Nat |-> n] 635 | op == [Nat -> Real] 636 | op == <<1,2,3>> 637 | op == <<<>F>>_vars 
638 | op == CASE A -> B [] C -> D [] OTHER -> E 639 | op == label :: []P => Q 640 | op == A -+-> B \equiv C <=> D ~> E /\ F \/ G 641 | op == A := B ::= C /= D <= E >= F \approx G 642 | op == A |- B |= C -| D =| E \asymp F \cong G 643 | op == A \doteq B \gg C \ll D \in E \notin F \prec G 644 | op == A \succ B \preceq C \succeq D \propto E \sim F \simeq G 645 | op == A \sqsubset B \sqsupset C \sqsubseteq D \sqsupseteq E 646 | op == A \subset B \supset C \subseteq D \supseteq E 647 | op == A \intersect B \union C .. D ... E (+) F (-) G 648 | op == A || B (.) C (/) D (\X) E \bigcirc F \bullet G 649 | op == A \div B \o C \star D !! E ?? F \sqcap G 650 | op == A \sqcup B \uplus C \X D \wr E \cdot F ^+ 651 | ===="#, 652 | ); 653 | } 654 | 655 | #[test] 656 | fn all_non_canonical_symbols_roundtrip() { 657 | let expected = r#" 658 | ---- MODULE Test ---- 659 | op == \forall n \in Nat : TRUE 660 | op == \exists r \in Real : TRUE 661 | op == \neg P 662 | op == P \land Q 663 | op == P \lor Q 664 | op == x # y 665 | op == x =< y 666 | op == x \leq y 667 | op == x \geq y 668 | op == P \cap Q 669 | op == P \cup Q 670 | op == x \oplus y 671 | op == x \ominus y 672 | op == x \odot y 673 | op == x \oslash y 674 | op == x \otimes y 675 | op == x \circ y 676 | op == P \times Q 677 | ===="#; 678 | let intermediate = unwrap_conversion(rewrite(expected, &Mode::AsciiToUnicode, false)); 679 | check_ascii_replaced(&intermediate); 680 | let actual = unwrap_conversion(rewrite(&intermediate, &Mode::UnicodeToAscii, false)); 681 | // Only first and last lines should be the same 682 | for (i, (expected_line, actual_line)) in zip(expected.lines(), actual.lines()).enumerate() { 683 | if i <= 1 || i == expected.lines().count() - 1 { 684 | assert_eq!(expected_line, actual_line); 685 | } else { 686 | assert_ne!(expected_line, actual_line); 687 | } 688 | } 689 | } 690 | 691 | #[test] 692 | fn test_basic_jlist() { 693 | run_roundtrip_test( 694 | r#" 695 | ---- MODULE Test ---- 696 | op == /\ A 697 | /\ 
B 698 | /\ C 699 | /\ D 700 | ===="#, 701 | ); 702 | } 703 | 704 | #[test] 705 | fn test_nested_jlist() { 706 | run_roundtrip_test( 707 | r#" 708 | ---- MODULE Test ---- 709 | op == /\ A 710 | /\ \/ B 711 | \/ C 712 | /\ D 713 | ===="#, 714 | ); 715 | } 716 | 717 | #[test] 718 | fn test_full_binary_tree_jlist() { 719 | run_roundtrip_test( 720 | r#" 721 | ---- MODULE Test ---- 722 | op == /\ \/ /\ \/ /\ A 723 | /\ B 724 | \/ /\ C 725 | /\ D 726 | /\ \/ /\ E 727 | /\ F 728 | \/ /\ G 729 | /\ H 730 | \/ /\ \/ /\ I 731 | /\ J 732 | \/ /\ K 733 | /\ L 734 | /\ \/ /\ M 735 | /\ N 736 | \/ /\ O 737 | /\ P 738 | /\ \/ /\ \/ /\ Q 739 | /\ R 740 | \/ /\ S 741 | /\ T 742 | /\ \/ /\ U 743 | /\ V 744 | \/ /\ W 745 | /\ X 746 | \/ /\ \/ /\ Y 747 | /\ Z 748 | \/ /\ A 749 | /\ B 750 | /\ \/ /\ C 751 | /\ D 752 | \/ /\ E 753 | /\ F 754 | ===="#, 755 | ); 756 | } 757 | 758 | #[test] 759 | fn jlist_with_comments() { 760 | run_roundtrip_test( 761 | r#" 762 | ---- MODULE Test ---- 763 | op == /\ A 764 | /\ \/ B 765 | \* This is a comment 766 | \/ C 767 | (* This is another comment *) 768 | /\ D 769 | ===="#, 770 | ); 771 | } 772 | 773 | #[test] 774 | fn test_aligned_trailing_infix_op() { 775 | run_roundtrip_test( 776 | r#" 777 | ---- MODULE Test ---- 778 | op == /\ A 779 | /\ B 780 | => C 781 | ===="#, 782 | ); 783 | } 784 | 785 | #[test] 786 | fn test_trailing_infix_op_at_line_start() { 787 | let expected = r#" 788 | ---- MODULE Test ---- 789 | op == /\ A 790 | /\ B 791 | => C 792 | ===="#; 793 | let intermediate = unwrap_conversion(rewrite(expected, &Mode::AsciiToUnicode, false)); 794 | check_ascii_replaced(&intermediate); 795 | unwrap_conversion(rewrite(&intermediate, &Mode::UnicodeToAscii, false)); 796 | } 797 | 798 | #[test] 799 | fn test_nested_trailing_infix_op() { 800 | let expected = r#" 801 | ---- MODULE Test ---- 802 | op == /\ A 803 | /\ B 804 | => /\ C 805 | /\ \/ D 806 | \/ E 807 | => /\ F 808 | /\ G 809 | => H 810 | op == A <=> /\ B 811 | /\ C 812 | => D 813 | ===="#; 
814 | let intermediate = unwrap_conversion(rewrite(expected, &Mode::AsciiToUnicode, false)); 815 | check_ascii_replaced(&intermediate); 816 | unwrap_conversion(rewrite(&intermediate, &Mode::UnicodeToAscii, false)); 817 | } 818 | 819 | #[test] 820 | fn test_misaligned_jlist() { 821 | run_roundtrip_test( 822 | r#" 823 | ---- MODULE Test ---- 824 | op == /\ A 825 | /\ B 826 | /\ C 827 | ===="#, 828 | ); 829 | } 830 | 831 | // See https://github.com/tlaplus-community/tlauc/issues/11 832 | // Test translation of number sets in their three forms: 833 | // 1. As an expression 834 | // 2. As the left-hand-side of an operator definition 835 | // 3. As a reference to an imported module 836 | #[test] 837 | fn test_translate_number_set() { 838 | run_roundtrip_test( 839 | r#" 840 | ---- MODULE Test ---- 841 | Nat == Nat \union A!B!Nat 842 | Int == Int \union A!B!Int 843 | Real == Real \union A!B!Real 844 | ===="#, 845 | ); 846 | } 847 | 848 | // https://github.com/tlaplus-community/tlauc/issues/1 849 | #[ignore] 850 | #[test] 851 | fn test_infix_op_jlist_from_unicode() { 852 | run_roundtrip_test( 853 | r#" 854 | ---- MODULE Test ---- 855 | op ≜ ∧ A 856 | ∧ B 857 | = C 858 | ∧ D 859 | = E 860 | ===="#, 861 | ); 862 | } 863 | 864 | // https://github.com/tlaplus-community/tlauc/issues/2 865 | #[ignore] 866 | #[test] 867 | fn test_block_comments_prefixing_jlist_items() { 868 | run_roundtrip_test( 869 | r#" 870 | ---- MODULE Test ---- 871 | op == /\ A 872 | (***) /\ \/ B 873 | (******) \/ C 874 | (***) => D 875 | ===="#, 876 | ); 877 | } 878 | 879 | // Tests that file ends with newline (or without newline) 880 | #[test] 881 | fn test_empty_input() { 882 | let input = ""; 883 | let output = rewrite(&input, &Mode::UnicodeToAscii, true); 884 | assert_eq!(input, output.unwrap()); 885 | let output = rewrite(&input, &Mode::AsciiToUnicode, true); 886 | assert_eq!(input, output.unwrap()); 887 | } 888 | 889 | #[test] 890 | fn test_single_newline() { 891 | let input = "\n"; 892 | let output = 
rewrite(&input, &Mode::UnicodeToAscii, true); 893 | assert_eq!(input, output.unwrap()); 894 | let output = rewrite(&input, &Mode::AsciiToUnicode, true); 895 | assert_eq!(input, output.unwrap()); 896 | } 897 | 898 | #[test] 899 | fn test_normal_input_without_newline() { 900 | run_roundtrip_test( 901 | r#" 902 | ---- MODULE Test ---- 903 | op == 1 904 | ===="#, 905 | ); 906 | } 907 | 908 | #[test] 909 | fn test_normal_input_with_newline() { 910 | run_roundtrip_test( 911 | r#" 912 | ---- MODULE Test ---- 913 | op == 1 914 | ==== 915 | "#, 916 | ); 917 | } 918 | } 919 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{anyhow, Context, Result}; 2 | use clap::Parser; 3 | use std::fs::File; 4 | use std::io::{Read, Write}; 5 | use std::path::{Path, PathBuf}; 6 | use tlauc::{rewrite, Mode, TlaError}; 7 | 8 | #[derive(Parser)] 9 | #[command(author, version, about, long_about = None)] 10 | struct Args { 11 | #[arg(help = "Path to TLA⁺ file to convert")] 12 | input: PathBuf, 13 | 14 | #[arg( 15 | short, 16 | long, 17 | help = "Optional path to output; will overwrite input file by default" 18 | )] 19 | output: Option, 20 | 21 | #[arg( 22 | short, 23 | long, 24 | default_value_t = false, 25 | help = "Whether to force a best-effort conversion, ignoring TLA⁺ parse errors" 26 | )] 27 | force: bool, 28 | 29 | #[arg( 30 | long, 31 | default_value_t = false, 32 | help = "Convert the TLA⁺ file to ASCII instead of Unicode" 33 | )] 34 | ascii: bool, 35 | } 36 | 37 | fn main() -> Result<()> { 38 | let args = Args::parse(); 39 | let output_path = if let Some(output_path) = args.output { 40 | output_path 41 | } else { 42 | args.input.clone() 43 | }; 44 | convert( 45 | args.input.as_path(), 46 | output_path.as_path(), 47 | if args.ascii { 48 | Mode::UnicodeToAscii 49 | } else { 50 | Mode::AsciiToUnicode 51 | }, 52 | args.force, 53 | ) 54 | } 55 | 56 
| fn convert(input_path: &Path, output_path: &Path, mode: Mode, force: bool) -> Result<()> { 57 | let mut input = String::new(); 58 | { 59 | let mut input_file = File::open(input_path) 60 | .context(format!("Failed to open input file [{:?}]", input_path))?; 61 | input_file 62 | .read_to_string(&mut input) 63 | .context(format!("Failed to read input file [{:?}]", input_path))?; 64 | } 65 | 66 | match rewrite(&input, &mode, force) { 67 | Ok(output) => { 68 | let mut output_file = File::create(output_path)?; 69 | output_file.write_all(output.as_bytes()).context(format!("Failed to write to output file [{:?}]", output_path))?; 70 | Ok(()) 71 | }, 72 | Err(TlaError::InputFileParseError { error_lines, .. }) => { 73 | let line_msg = match error_lines.as_slice() { 74 | [] => "Could not identify line of first syntax error.".to_string(), 75 | [..] => format!("Syntax errors might occur on or near the following lines: {:?}.", error_lines) 76 | }; 77 | Err(anyhow!("Failed to correctly parse input TLA⁺ file; use --force flag to bypass this check.\n".to_string() + &line_msg)) 78 | } 79 | Err(TlaError::OutputFileParseError{..}) => Err(anyhow!("Failed to correctly parse converted TLA⁺ output; this is a bug, please report it to the maintainer! Use --force to bypass this check (not recommended).")), 80 | Err(TlaError::InvalidTranslationError { input_tree: _, output_tree: _, output: _, first_diff }) => { 81 | let err_msg = "Converted TLA⁺ parse tree differs from original; this is a bug, please report it to the maintainer! 
Use --force to bypass this check (not recommended)."; 82 | Err(anyhow!("{}\n{}", err_msg, first_diff)) 83 | } 84 | } 85 | } 86 | 87 | #[cfg(test)] 88 | mod tests { 89 | use super::*; 90 | 91 | #[test] 92 | // https://github.com/tlaplus-community/tlauc/issues/14 93 | fn test_input_file_unchanged_on_parse_failure() { 94 | let project_root = std::env::var("CARGO_MANIFEST_DIR").unwrap(); 95 | let input_path = PathBuf::from(project_root) 96 | .join("tests") 97 | .join("InvalidSyntax.tla"); 98 | let expected = std::fs::read_to_string(&input_path).unwrap(); 99 | let output_path = input_path.clone(); 100 | let result: Result<()> = convert( 101 | input_path.as_path(), 102 | output_path.as_path(), 103 | tlauc::Mode::AsciiToUnicode, 104 | false, 105 | ); 106 | assert!(result.is_err()); 107 | let actual = std::fs::read_to_string(&output_path).unwrap(); 108 | assert_eq!(expected, actual); 109 | } 110 | 111 | #[test] 112 | fn test_blank_input_file() { 113 | let project_root = std::env::var("CARGO_MANIFEST_DIR").unwrap(); 114 | let input_path = PathBuf::from(project_root) 115 | .join("tests") 116 | .join("BlankFile.tla"); 117 | let output_path = input_path.clone(); 118 | let result: Result<()> = convert( 119 | input_path.as_path(), 120 | output_path.as_path(), 121 | tlauc::Mode::AsciiToUnicode, 122 | false, 123 | ); 124 | assert!(result.is_err()); 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/strmeasure.rs: -------------------------------------------------------------------------------- 1 | use std::ops::{Add, Neg, Range, RangeTo, Sub}; 2 | 3 | #[derive(Debug, PartialEq, PartialOrd)] 4 | pub struct StrElementQuantity { 5 | pub char: CharQuantity, 6 | pub byte: ByteQuantity, 7 | } 8 | 9 | #[derive(Debug)] 10 | pub struct StrElementDiff { 11 | pub char: CharDiff, 12 | pub byte: ByteDiff, 13 | } 14 | 15 | impl StrElementQuantity { 16 | pub fn from_byte_index(&byte_index: &ByteQuantity, text: &str) -> Self { 17 | 
StrElementQuantity { 18 | char: CharQuantity::from_byte_index(&byte_index, text), 19 | byte: byte_index, 20 | } 21 | } 22 | 23 | pub fn as_byte_range(range: &Range) -> Range { 24 | let range = range.start.byte..range.end.byte; 25 | ByteQuantity::as_range(&range) 26 | } 27 | } 28 | 29 | #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] 30 | pub struct CharQuantity(pub usize); 31 | 32 | #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] 33 | pub struct ByteQuantity(pub usize); 34 | 35 | #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] 36 | pub struct CharDiff(pub i8); 37 | 38 | #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] 39 | pub struct ByteDiff(pub i8); 40 | 41 | impl CharQuantity { 42 | pub fn from_byte_index(byte_index: &ByteQuantity, text: &str) -> Self { 43 | CharQuantity(text[byte_index.range_to()].chars().count()) 44 | } 45 | 46 | pub fn repeat(&self, text: &str) -> String { 47 | text.repeat(self.0) 48 | } 49 | } 50 | 51 | impl ByteQuantity { 52 | pub fn from_char_index(char_index: &CharQuantity, text: &str) -> Self { 53 | match text.char_indices().nth(char_index.0) { 54 | Some((byte_index, _)) => ByteQuantity(byte_index), 55 | None => panic!("Cannot get character {} in string {}", char_index.0, text), 56 | } 57 | } 58 | 59 | pub fn as_range(range: &Range) -> Range { 60 | range.start.range(&range.end) 61 | } 62 | 63 | pub fn range_to(&self) -> RangeTo { 64 | ..self.0 65 | } 66 | 67 | fn range(&self, other: &ByteQuantity) -> Range { 68 | self.0..other.0 69 | } 70 | } 71 | 72 | impl CharDiff { 73 | pub fn magnitude(&self) -> CharQuantity { 74 | CharQuantity(i8::abs(self.0) as usize) 75 | } 76 | } 77 | 78 | impl Add for CharQuantity { 79 | type Output = Self; 80 | 81 | fn add(self, offset: CharDiff) -> Self::Output { 82 | let result = self.0 as i32 + offset.0 as i32; 83 | assert!( 84 | result >= 0, 85 | "Adding char offset to char index results in negative value: {} {}", 86 | self.0, 87 | offset.0 88 | ); 89 | 
CharQuantity(result as usize) 90 | } 91 | } 92 | 93 | impl Add for ByteQuantity { 94 | type Output = Self; 95 | 96 | fn add(self, offset: ByteDiff) -> Self::Output { 97 | let result = self.0 as i32 + offset.0 as i32; 98 | assert!( 99 | result >= 0, 100 | "Adding byte offset to byte index results in negative value: {} {}", 101 | self.0, 102 | offset.0 103 | ); 104 | ByteQuantity(result as usize) 105 | } 106 | } 107 | 108 | impl Add for CharDiff { 109 | type Output = Self; 110 | 111 | fn add(self, other: CharDiff) -> Self::Output { 112 | CharDiff(self.0 + other.0) 113 | } 114 | } 115 | 116 | impl Sub for CharQuantity { 117 | type Output = CharDiff; 118 | 119 | fn sub(self, other: CharQuantity) -> Self::Output { 120 | CharDiff((self.0 as i32 - other.0 as i32) as i8) 121 | } 122 | } 123 | 124 | impl Sub for CharQuantity { 125 | type Output = Self; 126 | 127 | fn sub(self, other: CharDiff) -> Self::Output { 128 | self + -other 129 | } 130 | } 131 | 132 | impl Sub for ByteQuantity { 133 | type Output = ByteDiff; 134 | 135 | fn sub(self, other: ByteQuantity) -> Self::Output { 136 | ByteDiff((self.0 as i32 - other.0 as i32) as i8) 137 | } 138 | } 139 | 140 | impl Sub for CharDiff { 141 | type Output = Self; 142 | 143 | fn sub(self, other: CharDiff) -> Self::Output { 144 | CharDiff(self.0 + -other.0) 145 | } 146 | } 147 | 148 | impl Neg for CharDiff { 149 | type Output = Self; 150 | 151 | fn neg(self) -> Self::Output { 152 | CharDiff(-self.0) 153 | } 154 | } 155 | 156 | impl Neg for ByteDiff { 157 | type Output = Self; 158 | 159 | fn neg(self) -> Self::Output { 160 | ByteDiff(-self.0) 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /tests/BlankFile.tla: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tlaplus-community/tlauc/39f45e218ec02c57eb5991b967bef6d0ac0e4968/tests/BlankFile.tla 
-------------------------------------------------------------------------------- /tests/InvalidSyntax.tla: -------------------------------------------------------------------------------- 1 | ---- MODULE test ---- 2 | foo == invalid, 3 | ==== 4 | 5 | -------------------------------------------------------------------------------- /tests/corpus_tests.rs: -------------------------------------------------------------------------------- 1 | mod corpus_tests { 2 | use glob::glob; 3 | use rayon::prelude::*; 4 | use std::ffi::OsStr; 5 | use std::fs::File; 6 | use std::io::Read; 7 | use std::path::PathBuf; 8 | use std::time::Instant; 9 | use tlauc::{rewrite, Mode, TlaError}; 10 | 11 | fn unwrap_conversion(input: Result, path: &PathBuf) -> String { 12 | match input { 13 | Ok(converted) => converted, 14 | Err(TlaError::InputFileParseError { .. }) => { 15 | panic!("Failed to parse input file [{:?}]", path) 16 | } 17 | Err(TlaError::OutputFileParseError { .. }) => { 18 | panic!("Failed to parse output file [{:?}]", path) 19 | } 20 | Err(TlaError::InvalidTranslationError { 21 | input_tree: _, 22 | output_tree: _, 23 | output: _, 24 | first_diff, 25 | }) => panic!( 26 | "Input/output parse tree mismatch for [{:?}]: [{:?}]", 27 | path, first_diff 28 | ), 29 | } 30 | } 31 | 32 | #[test] 33 | fn roundtrip_all_example_specs() { 34 | let start = Instant::now(); 35 | let skip: Vec<&str> = vec!["SomeSpecName.tla"]; 36 | println!("SKIPPING {:?}", skip); 37 | let skip: Vec<&OsStr> = skip.iter().map(|s| OsStr::new(s)).collect(); 38 | let paths: Vec = glob("tests/corpus/**/*.tla") 39 | .unwrap() 40 | .into_iter() 41 | .filter_map(|path| path.ok()) 42 | .filter(|path| !skip.contains(&path.file_name().unwrap())) 43 | .collect(); 44 | 45 | paths.par_iter().for_each(|path| { 46 | println!("{:?}", path); 47 | let mut input = String::new(); 48 | { 49 | let mut input_file = 50 | File::open(&path).expect(&format!("Failed to open input file [{:?}]", path)); 51 | input_file 52 | .read_to_string(&mut 
input) 53 | .expect(&format!("Failed to read input file [{:?}]", path)); 54 | } 55 | 56 | let intermediate = 57 | unwrap_conversion(rewrite(&input, &Mode::AsciiToUnicode, false), path); 58 | unwrap_conversion(rewrite(&intermediate, &Mode::UnicodeToAscii, false), path); 59 | }); 60 | 61 | println!("Corpus tests took {} seconds", start.elapsed().as_secs()); 62 | } 63 | } 64 | --------------------------------------------------------------------------------