├── .vscode
    ├── settings.json
    ├── extensions.json
    └── launch.json
├── .gitignore
├── crlf.sh
├── src
    └── files_to_parse
    │   └── demo.zig
├── LICENSE
└── README.md


/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | 	"editor.detectIndentation": false,
3 | 	"editor.insertSpaces": true
4 | }
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | zig-cache
2 | .zig-cache
3 | zig-out
4 | zig-out-old
5 | *.js
6 | .token_data1
7 | .token_data2
8 | 


--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 |     "recommendations": [
3 |         "AugusteRame.zls-vscode",
4 |         "esbenp.prettier-vscode",
5 |         "xaver.clang-format",
6 |         "vadimcn.vscode-lldb"
7 |     ]
8 | }
9 | 


--------------------------------------------------------------------------------
/crlf.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Convert every *.zig file under ./src/files_to_parse to CRLF or LF line endings.
 3 | # Usage: crlf.sh [crlf|lf]
 4 | if [ "$1" == "crlf" ] || [ "$1" == "lf" ]; then
 5 |     # IFS= and -r keep surrounding whitespace and backslashes in file names intact.
 6 |     find ./src/files_to_parse -type f -name '*.zig' | while IFS= read -r file; do
 7 |         if [ "$1" == "crlf" ]; then
 8 |             # \r?\n also matches an existing CRLF, so re-running does not double the \r.
 9 |             perl -pi -e 's/\r?\n/\r\n/g' "$file"
10 |         else
11 |             perl -pi -e 's/\r\n/\n/g' "$file"
12 |         fi
13 |     done
14 | else
15 |     cat << EOF
16 | Usage: crlf.sh [crlf|lf]
17 | EOF
18 |     exit 1
19 | fi
20 | 


--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": "0.2.0",
 3 |     "configurations": [
 4 |         {
 5 |             "type": "lldb",
 6 |             "request": "launch",
 7 |             "name": "DynSDT test",
 8 |             "program": "./zig-out/bin/exe",
 9 |             "args": [],
10 |             "cwd": "${workspaceFolder}",
11 |             "env": {
12 |                 "FORCE_COLOR": "1"
13 |             },
14 |             "console": "internalConsole"
15 |         }
16 |     ]
17 | }
18 | 


--------------------------------------------------------------------------------
/src/files_to_parse/demo.zig:
--------------------------------------------------------------------------------
  1 | const std = @import("std");
  2 | const builtin = @import("builtin");
  3 | const math = std.math;
  4 | /// Brute-force search for a collision-free multiply-shift hash over the strings in `ops`; prints a candidate `f{n}` function via std.debug.print for every hit.
  5 | pub fn main() void {
  6 |     const ops = [_][]const u8{ ".**", "!", "|", "||", "|=", "=", "==", "=>", "!=", "%", "%=", ".", ".*", "..", "...", "^", "^=", "+", "++", "+=", "+%", "+%=", "+|", "+|=", "-", "-=", "-%", "-%=", "-|", "-|=", "*", "*=", "**", "*%", "*%=", "*|", "*|=", "->", "/", "/=", "&", "&=", "<", "<=", "<<", "<<=", "<<|", "<<|=", ">", ">=", ">>", ">>=", "~", "//", "///", "//!", "\\\\", "+!", "-!", "*!", ".^", ".$", ".#", ":", ";", "[", "]", "?", "(", ")", "{", "}", ",", "*^" };
  7 |     // Pad every entry of `ops` with NUL bytes to exactly 4 bytes so it can later be read as a single u32.
  8 |     comptime var keywords: [ops.len][4]u8 = undefined;
  9 | 
 10 |     inline for (ops, 0..) |op, i| {
 11 |         keywords[i] = (op ++ ("\x00" ** (4 - op.len))).*;
 12 |     }
 13 | 
 14 |     // for (keywords) |keyword| std.debug.print("{}: \"{s}\"\n", .{ std.mem.readIntLittle(u32, keyword[0..4]), keyword });
 15 | 
 16 |     // {
 17 |     //     var taken: u64 = 0;
 18 |     //     for (keywords) |keyword| {
 19 |     //         const len: u16 = @truncate(keyword.len);
 20 |     //         const a = std.mem.readIntLittle(u16, keyword[0..2]);
 21 |     //         const b = std.mem.readIntLittle(u16, keyword[len - 2 ..][0..2]);
 22 | 
 23 |     //         var c: u6 = @truncate(((@as(u32, a ^ (len >> 1)) *% 316177) *% (b >> 1)) >> 23);
 24 |     //         if (c == 15 or c == 35) c ^= @truncate(len & 6);
 25 | 
 26 |     //         const slot = @as(@TypeOf(taken), 1) << c;
 27 |     //         std.debug.print("{s}: {}\n", .{ keyword, c });
 28 |     //         if ((taken & slot) != 0) std.debug.print("\tCollision!\n", .{});
 29 |     //         taken |= slot;
 30 |     //     }
 31 |     // }
 32 | 
 33 |     // if (1 == 1) return;
 34 | 
 35 |     // const tokens = [_]u8{ '!', '|', '=', '(', ')', ';', '%', '{', '}', '[', ']', '.', '^', '+', '-', '*', ':', '/', ',', '&', '?', '<', '>', '~' };
 36 |     // _ = tokens;
 37 |     // // for (tokens) |token| std.debug.print("{}\n", .{token});
 38 |     // var taken_global: u128 = 0;
 39 |     // // comptime for (tokens) |token| {
 40 |     // //     taken_global |= @as(u256, 1) << token;
 41 |     // // };
 42 | 
 43 |     // // ((a >> 1) *% (b >> 1)) >> 6)
 44 | 
 45 |     // var num_set: std.AutoHashMapUnmanaged(u64, void) = .{};
 46 |     // Candidate multipliers; each one is tried as `x` in the hash computed in the main loop below.
 47 |     const xes = [_]u32{ 21838721, 43677442, 58758119, 75459609, 87354884, 112379007, 117516238, 150919218, 156056449, 174709767, 174709768, 192975847, 209677337, 224758014, 235032476, 246596735, 290274177, 301838436, 312112898, 327193575, 343895065, 349419534, 349419535, 349419536, 380814463, 385951694, 419354674, 424491905, 449516028, 461411303, 470064952, 478112793, 493193470, 515032191, 558709633, 580548354, 595629031, 603676872, 612330521, 624225796, 649249919, 654387150, 687790130, 692927361, 698839068, 698839070, 698839071, 698839072, 729846759, 746548249, 761628926, 771903388, 783467647, 827145089, 838709348, 848983810, 864064487, 880765977, 899032056, 899032057, 917685375, 922822606, 940129904, 940129905, 956225586, 961362817, 986386940, 998282215, 1014983705, 1030064382, 1051903103, 1095580545, 1117419266, 1132499943, 1149201433, 1161096708, 1186120831, 1191258062, 1207353743, 1207353744, 1224661042, 1229798273, 1248451591, 1248451592, 1266717671, 1283419161, 1298499838, 1308774300, 1320338559, 1364016001, 1385854722, 1454556287, 1498233729, 1523257852, 1566935294, 1588774015, 1632451457, 1654290178, 1697967620, 1722991743, 1766669185, 1798064112, 1798064113, 1798064114, 1835370750, 1857209471, 1900886913, 1922725634, 1972773880, 1972773881, 1991427199, 2029967410, 2035104641, 2060128764, 2072024039, 2088725529, 2103806206, 2125644927, 2169322369, 2191161090, 2206241767, 2222943257, 2234838532, 2259862655, 2264999886, 2298402866, 2303540097, 2322193415, 2322193416, 2340459495, 2357160985, 2372241662, 2382516124, 2394080383, 2414707486, 2414707488, 2437757825, 2449322084, 2459596546, 2474677223, 2491378713, 2496903182, 2496903183, 2496903184, 2528298111, 2533435342, 2566838322, 2571975553, 2596999676, 2608894951, 2617548600, 2625596441, 2640677118, 2662515839, 2706193281, 2728032002, 2743112679, 2751160520, 2759814169, 2771709444, 2796733567, 2801870798, 2835273778, 2840411009, 2877330407, 2894031897, 2909112574, 2919387036, 2930951295, 2974628737, 
2986192996, 2996467458, 3011548135, 3028249625, 3046515704, 3046515705, 3065169023, 3070306254, 3087613552, 3087613553, 3103709234, 3108846465, 3133870588, 3145765863, 3162467353, 3177548030, 3199386751, 3243064193, 3264902914, 3279983591, 3296685081, 3308580356, 3333604479, 3338741710, 3354837391, 3354837392, 3372144690, 3377281921, 3395935239, 3395935240, 3414201319, 3430902809, 3445983486, 3456257948, 3467822207, 3511499649, 3523063908, 3533338370, 3548419047, 3565120537, 3596128224, 3596128225, 3596128226, 3596128228, 3602039935, 3607177166, 3640580146, 3645717377, 3670741500, 3682636775, 3691290424, 3699338265, 3714418942, 3736257663, 3779935105, 3801773826, 3816854503, 3824902344, 3833555993, 3845451268, 3870475391, 3875612622, 3909015602, 3914152833, 3945547760, 3945547761, 3945547762, 3951072231, 3967773721, 3982854398, 3993128860, 4004693119, 4048370561, 4059934820, 4070209282, 4085289959, 4101991449, 4120257528, 4120257529, 4138910847, 4144048078, 4177451058, 4182588289, 4207612412, 4219507687, 4236209177, 4251289854, 4273128575 };
 48 |     // NOTE(review): l_s is only consumed by the no-op loop below; what its values mean is not visible here.
 49 |     const l_s = [_]u5{ 16, 17, 16, 16, 18, 16, 17, 17, 16, 19, 19, 16, 16, 17, 18, 16, 16, 18, 17, 16, 16, 20, 20, 20, 16, 17, 17, 6, 18, 16, 19, 16, 17, 16, 16, 17, 16, 19, 16, 18, 16, 17, 17, 16, 21, 21, 21, 21, 16, 16, 17, 18, 16, 16, 18, 7, 16, 16, 19, 19, 16, 17, 20, 20, 17, 16, 18, 16, 16, 17, 16, 16, 17, 16, 16, 18, 16, 17, 20, 20, 17, 16, 19, 9, 16, 16, 17, 18, 16, 16, 17, 16, 16, 18, 17, 16, 16, 17, 18, 16, 16, 20, 20, 20, 17, 16, 16, 17, 19, 19, 16, 7, 16, 18, 16, 16, 17, 16, 16, 17, 16, 16, 18, 16, 17, 17, 16, 19, 19, 16, 16, 17, 18, 16, 21, 21, 16, 18, 17, 6, 16, 20, 20, 20, 16, 17, 17, 16, 18, 16, 19, 16, 17, 16, 16, 17, 16, 19, 16, 18, 16, 17, 17, 16, 16, 16, 17, 8, 16, 16, 18, 17, 16, 16, 19, 19, 16, 17, 20, 20, 17, 16, 18, 16, 16, 17, 16, 16, 17, 16, 16, 18, 16, 17, 20, 0, 17, 16, 19, 19, 16, 16, 17, 18, 16, 16, 18, 17, 16, 16, 21, 21, 21, 21, 16, 17, 17, 16, 18, 16, 19, 16, 17, 6, 16, 17, 16, 19, 16, 18, 16, 17, 17, 16, 20, 20, 20, 16, 16, 17, 18, 16, 16, 18, 17, 16, 16, 19, 19, 16, 17, 7, 16, 18, 16, 16, 17, 16 };
 50 |     // The body discards both values; iterating xes and l_s in lockstep only requires them to have equal length.
 51 |     for (xes, l_s) |x, l| {
 52 |         _ = l;
 53 |         _ = x;
 54 |     }
 55 | 
 56 |     // var x: u32 = 1320338559;
 57 |     var ii: u7 = 0;
 58 |     _ = ii;
 59 |     var n: u32 = 0; // running count of hits; used as the suffix of the printed function name
 60 |     for (xes) |x| {
 61 |         // ii +%= 1;
 62 |         // if (ii == 0) break;
 63 |         // x +%= 1;
 64 |         // if (x == 0) break;
 65 |         var i: i6 = 0;
 66 |         while (true) {
 67 |             // var j: i4 = -7;
 68 |             // while (true) {
 69 |             var k: i2 = 0;
 70 |             while (true) {
 71 |                 const uint = std.meta.Int(.unsigned, 128);
 72 |                 var takens = std.mem.zeroes([26]uint);
 73 |                 for (keywords) |keyword| {
 74 |                     // const val = switch (keyword[0]) {
 75 |                     //     ':', ';', '[', ']', '?', '(', ')', '{', '}', ',', '@' => |c| c + ii,
 76 |                     //     else => blk: {
 77 |                     const a: u32 = std.mem.readIntLittle(u32, keyword[0..4]);
 78 |                     // const b: u8 = std.mem.readIntLittle(u8, keyword[keyword.len - 1 ..][0..1]);
 79 |                     const c: u3 = @truncate(keyword.len);
 80 |                     // NOTE(review): `c` above is always 4 because keyword is [4]u8 — confirm the padded length (not the original operator length) is intended.
 81 |                     const a_i = if (i == -15) 1 else std.math.shl(u32, a, i);
 82 |                     // const b_j = if (j == -7) 1 else std.math.shl(u32, b, j);
 83 |                     const c_k = std.math.shl(u32, c, k);
 84 |                     const val = (a_i *% x) >> @truncate(c_k); // wrapping multiply by the candidate x, then shift right by c_k
 85 |                     // break :blk val;
 86 |                     //     },
 87 |                     // };
 88 | 
 89 |                     // (a_i *% x) ^ (b_j *% c_k);
 90 |                     // (a_i *% x) *% (b_j *% c_k);
 91 |                     // (a_i ^ x) ^ (b_j ^ c_k);
 92 |                     // ((a_i ^ b_j) ^ c_k) *% x;
 93 |                     // ((a_i *% b_j) ^ c_k) *% x;
 94 |                     // ((a_i *% b_j) *% x) >> @truncate(c_k)
 95 |                     // ((a_i ^ b_j) *% x) >> @truncate(c_k);
 96 |                     // ((a_i ^ b_j) *% x) ^ c_k;
 97 |                     // ((a_i ^ b_j) >> @truncate(c_k)) *% x;
 98 |                     // ((a_i *% b_j) >> @truncate(c_k)) *% x;
 99 |                     // ((a_i ^ b_j) *% c_k) *% x;
100 | 
101 |                     // 53	59	61
102 |                     // std.debug.print("{} {}\n", .{ a_i, c_k });
103 |                     // takens[l] is a 128-bit occupancy set for the hash `val >> l`; hashes >= 128 are not recorded.
104 |                     for (&takens, 0..) |*taken, l| {
105 |                         if (val >> @intCast(l) < 128) {
106 |                             const slot = @as(uint, 1) << @truncate(val >> @intCast(l));
107 |                             taken.* |= slot;
108 |                         }
109 |                     }
110 |                 }
111 |                 // A shift l is a hit when the number of set bits equals keywords.len: every keyword was recorded and no two shared a slot.
112 |                 for (&takens, 0..) |taken, l| {
113 |                     // std.debug.print("{}\n", .{@popCount(taken)});
114 |                     if (@popCount(taken) == keywords.len) {
115 |                         n += 1;
116 |                         const str =
117 |                             \\export fn f{}(a: u8, b: u8, len: u4) u7 {{
118 |                             \\    const a_i = a {s} {};
119 |                             // \\    const b_j = b {s} {};
120 |                             \\    const c_k = len {s} {};
121 |                             \\    const x = {};
122 |                             \\    const val = (a_i *% x) >> @truncate(c_k);
123 |                             \\    return @truncate(val >> {});
124 |                             \\}}
125 |                             \\
126 |                         ;
127 |                         // std.debug.print("Close call:\n", .{});
128 |                         std.debug.print(str, .{
129 |                             n,
130 |                             if (i > 0) "<<" else ">>",
131 |                             std.math.absCast(i),
132 |                             // if (j > 0) "<<" else ">>",
133 |                             // std.math.absCast(j),
134 |                             if (k > 0) "<<" else ">>",
135 |                             std.math.absCast(k),
136 |                             x,
137 |                             l,
138 |                         });
139 |                     }
140 |                 }
141 |                 break; // the increment of k below is commented out, so only k == 0 is tried
142 |                 // k = std.math.add(i2, k, 1) catch break;
143 |             }
144 |             // j = std.math.add(i4, j, 1) catch break;
145 |             // }
146 |             // i += 1;
147 |             // if (i == 16) break;
148 |             break; // likewise the increment of i above is commented out, so only i == 0 is tried
149 |         }
150 |     }
151 | }
152 | 
153 | // 162
154 | 
155 | // [5,13,17,30,31,33,38,43,48,49,57,59,75,76,82,84,85,86,89,91,93,95,101,103,107,109,111,113,114,117,121,128,134,139,141,142,153,155,156,158,161,162,164,177,180,190,195,213,217]
156 | // ['!', '|', '=', '(', ')', ';', '%', '{', '}', '[', ']', '.', '^', '+', '-', '*', ':', '/', ',', '&', '?', '<', '>', '~']
157 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | MIT License
  2 | 
  3 | Copyright (c) 2023 Niles Salter
  4 | 
  5 | Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | of this software and associated documentation files (the "Software"), to deal
  7 | in the Software without restriction, including without limitation the rights
  8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | copies of the Software, and to permit persons to whom the Software is
 10 | furnished to do so, subject to the following conditions:
 11 | 
 12 | The above copyright notice and this permission notice shall be included in all
 13 | copies or substantial portions of the Software.
 14 | 
 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | SOFTWARE.
 22 | 
 23 | -------------------------------------------------------------------------
 24 | 
 25 | The pieces of this software that are derived from simdjson and simdjzon are marked as such in the source code, and are released under the Apache license:
 26 | 
 27 | 
 28 |                                  Apache License
 29 |                            Version 2.0, January 2004
 30 |                         http://www.apache.org/licenses/
 31 | 
 32 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 33 | 
 34 |    1. Definitions.
 35 | 
 36 |       "License" shall mean the terms and conditions for use, reproduction,
 37 |       and distribution as defined by Sections 1 through 9 of this document.
 38 | 
 39 |       "Licensor" shall mean the copyright owner or entity authorized by
 40 |       the copyright owner that is granting the License.
 41 | 
 42 |       "Legal Entity" shall mean the union of the acting entity and all
 43 |       other entities that control, are controlled by, or are under common
 44 |       control with that entity. For the purposes of this definition,
 45 |       "control" means (i) the power, direct or indirect, to cause the
 46 |       direction or management of such entity, whether by contract or
 47 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 48 |       outstanding shares, or (iii) beneficial ownership of such entity.
 49 | 
 50 |       "You" (or "Your") shall mean an individual or Legal Entity
 51 |       exercising permissions granted by this License.
 52 | 
 53 |       "Source" form shall mean the preferred form for making modifications,
 54 |       including but not limited to software source code, documentation
 55 |       source, and configuration files.
 56 | 
 57 |       "Object" form shall mean any form resulting from mechanical
 58 |       transformation or translation of a Source form, including but
 59 |       not limited to compiled object code, generated documentation,
 60 |       and conversions to other media types.
 61 | 
 62 |       "Work" shall mean the work of authorship, whether in Source or
 63 |       Object form, made available under the License, as indicated by a
 64 |       copyright notice that is included in or attached to the work
 65 |       (an example is provided in the Appendix below).
 66 | 
 67 |       "Derivative Works" shall mean any work, whether in Source or Object
 68 |       form, that is based on (or derived from) the Work and for which the
 69 |       editorial revisions, annotations, elaborations, or other modifications
 70 |       represent, as a whole, an original work of authorship. For the purposes
 71 |       of this License, Derivative Works shall not include works that remain
 72 |       separable from, or merely link (or bind by name) to the interfaces of,
 73 |       the Work and Derivative Works thereof.
 74 | 
 75 |       "Contribution" shall mean any work of authorship, including
 76 |       the original version of the Work and any modifications or additions
 77 |       to that Work or Derivative Works thereof, that is intentionally
 78 |       submitted to Licensor for inclusion in the Work by the copyright owner
 79 |       or by an individual or Legal Entity authorized to submit on behalf of
 80 |       the copyright owner. For the purposes of this definition, "submitted"
 81 |       means any form of electronic, verbal, or written communication sent
 82 |       to the Licensor or its representatives, including but not limited to
 83 |       communication on electronic mailing lists, source code control systems,
 84 |       and issue tracking systems that are managed by, or on behalf of, the
 85 |       Licensor for the purpose of discussing and improving the Work, but
 86 |       excluding communication that is conspicuously marked or otherwise
 87 |       designated in writing by the copyright owner as "Not a Contribution."
 88 | 
 89 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 90 |       on behalf of whom a Contribution has been received by Licensor and
 91 |       subsequently incorporated within the Work.
 92 | 
 93 |    2. Grant of Copyright License. Subject to the terms and conditions of
 94 |       this License, each Contributor hereby grants to You a perpetual,
 95 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 96 |       copyright license to reproduce, prepare Derivative Works of,
 97 |       publicly display, publicly perform, sublicense, and distribute the
 98 |       Work and such Derivative Works in Source or Object form.
 99 | 
100 |    3. Grant of Patent License. Subject to the terms and conditions of
101 |       this License, each Contributor hereby grants to You a perpetual,
102 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
103 |       (except as stated in this section) patent license to make, have made,
104 |       use, offer to sell, sell, import, and otherwise transfer the Work,
105 |       where such license applies only to those patent claims licensable
106 |       by such Contributor that are necessarily infringed by their
107 |       Contribution(s) alone or by combination of their Contribution(s)
108 |       with the Work to which such Contribution(s) was submitted. If You
109 |       institute patent litigation against any entity (including a
110 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
111 |       or a Contribution incorporated within the Work constitutes direct
112 |       or contributory patent infringement, then any patent licenses
113 |       granted to You under this License for that Work shall terminate
114 |       as of the date such litigation is filed.
115 | 
116 |    4. Redistribution. You may reproduce and distribute copies of the
117 |       Work or Derivative Works thereof in any medium, with or without
118 |       modifications, and in Source or Object form, provided that You
119 |       meet the following conditions:
120 | 
121 |       (a) You must give any other recipients of the Work or
122 |           Derivative Works a copy of this License; and
123 | 
124 |       (b) You must cause any modified files to carry prominent notices
125 |           stating that You changed the files; and
126 | 
127 |       (c) You must retain, in the Source form of any Derivative Works
128 |           that You distribute, all copyright, patent, trademark, and
129 |           attribution notices from the Source form of the Work,
130 |           excluding those notices that do not pertain to any part of
131 |           the Derivative Works; and
132 | 
133 |       (d) If the Work includes a "NOTICE" text file as part of its
134 |           distribution, then any Derivative Works that You distribute must
135 |           include a readable copy of the attribution notices contained
136 |           within such NOTICE file, excluding those notices that do not
137 |           pertain to any part of the Derivative Works, in at least one
138 |           of the following places: within a NOTICE text file distributed
139 |           as part of the Derivative Works; within the Source form or
140 |           documentation, if provided along with the Derivative Works; or,
141 |           within a display generated by the Derivative Works, if and
142 |           wherever such third-party notices normally appear. The contents
143 |           of the NOTICE file are for informational purposes only and
144 |           do not modify the License. You may add Your own attribution
145 |           notices within Derivative Works that You distribute, alongside
146 |           or as an addendum to the NOTICE text from the Work, provided
147 |           that such additional attribution notices cannot be construed
148 |           as modifying the License.
149 | 
150 |       You may add Your own copyright statement to Your modifications and
151 |       may provide additional or different license terms and conditions
152 |       for use, reproduction, or distribution of Your modifications, or
153 |       for any such Derivative Works as a whole, provided Your use,
154 |       reproduction, and distribution of the Work otherwise complies with
155 |       the conditions stated in this License.
156 | 
157 |    5. Submission of Contributions. Unless You explicitly state otherwise,
158 |       any Contribution intentionally submitted for inclusion in the Work
159 |       by You to the Licensor shall be under the terms and conditions of
160 |       this License, without any additional terms or conditions.
161 |       Notwithstanding the above, nothing herein shall supersede or modify
162 |       the terms of any separate license agreement you may have executed
163 |       with Licensor regarding such Contributions.
164 | 
165 |    6. Trademarks. This License does not grant permission to use the trade
166 |       names, trademarks, service marks, or product names of the Licensor,
167 |       except as required for reasonable and customary use in describing the
168 |       origin of the Work and reproducing the content of the NOTICE file.
169 | 
170 |    7. Disclaimer of Warranty. Unless required by applicable law or
171 |       agreed to in writing, Licensor provides the Work (and each
172 |       Contributor provides its Contributions) on an "AS IS" BASIS,
173 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
174 |       implied, including, without limitation, any warranties or conditions
175 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
176 |       PARTICULAR PURPOSE. You are solely responsible for determining the
177 |       appropriateness of using or redistributing the Work and assume any
178 |       risks associated with Your exercise of permissions under this License.
179 | 
180 |    8. Limitation of Liability. In no event and under no legal theory,
181 |       whether in tort (including negligence), contract, or otherwise,
182 |       unless required by applicable law (such as deliberate and grossly
183 |       negligent acts) or agreed to in writing, shall any Contributor be
184 |       liable to You for damages, including any direct, indirect, special,
185 |       incidental, or consequential damages of any character arising as a
186 |       result of this License or out of the use or inability to use the
187 |       Work (including but not limited to damages for loss of goodwill,
188 |       work stoppage, computer failure or malfunction, or any and all
189 |       other commercial damages or losses), even if such Contributor
190 |       has been advised of the possibility of such damages.
191 | 
192 |    9. Accepting Warranty or Additional Liability. While redistributing
193 |       the Work or Derivative Works thereof, You may choose to offer,
194 |       and charge a fee for, acceptance of support, warranty, indemnity,
195 |       or other liability obligations and/or rights consistent with this
196 |       License. However, in accepting such obligations, You may act only
197 |       on Your own behalf and on Your sole responsibility, not on behalf
198 |       of any other Contributor, and only if You agree to indemnify,
199 |       defend, and hold each Contributor harmless for any liability
200 |       incurred by, or claims asserted against, such Contributor by reason
201 |       of your accepting any such warranty or additional liability.
202 | 
203 |    END OF TERMS AND CONDITIONS
204 | 
205 |    APPENDIX: How to apply the Apache License to your work.
206 | 
207 |       To apply the Apache License to your work, attach the following
208 |       boilerplate notice, with the fields enclosed by brackets "[]"
209 |       replaced with your own identifying information. (Don't include
210 |       the brackets!)  The text should be enclosed in the appropriate
211 |       comment syntax for the file format. We also recommend that a
212 |       file or class name and description of purpose be included on the
213 |       same "printed page" as the copyright notice for easier
214 |       identification within third-party archives.
215 | 
216 |    Copyright [yyyy] [name of copyright owner]
217 | 
218 |    Licensed under the Apache License, Version 2.0 (the "License");
219 |    you may not use this file except in compliance with the License.
220 |    You may obtain a copy of the License at
221 | 
222 |        http://www.apache.org/licenses/LICENSE-2.0
223 | 
224 |    Unless required by applicable law or agreed to in writing, software
225 |    distributed under the License is distributed on an "AS IS" BASIS,
226 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
227 |    See the License for the specific language governing permissions and
228 |    limitations under the License.
229 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # [<sub><sub><img src="https://raw.githubusercontent.com/Validark/Validark/master/zig-z.svg" alt="Zig" height="40"></sub></sub>](https://ziglang.org/) Accelerated Zig Parser
  2 | 
  3 | A high-throughput tokenizer and parser (soon™️) for the Zig programming language.
  4 | 
  5 | The mainline Zig tokenizer uses a deterministic finite state machine. Those are pretty good for some applications, but tokenizing can often employ the use of other techniques for added speed.
  6 | 
  7 | Two tokenizer implementations are provided.
  8 | 
  9 | 1. A version that produces a few bitstrings per 64-byte chunk and uses those to skip over continuation-character matching. I gave [two](https://www.youtube.com/watch?v=oN8LDpWuPWw&t=530s) [talks](https://www.youtube.com/live/FDiUKafPs0U&t=210s) on this subject. (**Currently this code has gone poof, but I will resurrect this for comparison's sake within 3 months (when I give my final Utah-Zig talk on the subject of the Zig Tokenizer in July)**)
 10 | 
 11 | 2. A version that produces bitstrings for EVERYTHING we want to do within a 64-byte chunk, and utilizes vector compression to find the extents of all tokens simultaneously. See [this animation](https://validark.dev/presentations/simd-intro#keywords-lookup). I also gave a talk (really more of a rant) about my grand plans [here](https://www.youtube.com/watch?v=NM1FNB5nagk). Unfortunately it did not turn out how I had hoped because I got sick before I had time to give it the love it deserves. But my next talk shall knocketh thy socks off, guaranteed!
 12 | 
 13 | The test bench as it sits on my computer right now prints this out when I run it:
 14 | 
 15 | ```
 16 |        Read in files in 26.479ms (1775.63 MB/s) and used 47.018545MB memory with 3504899722 lines across 3253 files
 17 | Legacy Tokenizing took              91.419ms (0.51 GB/s, 38.34B loc/s) and used 40.07934MB memory
 18 | Tokenizing with compression took    33.301ms (1.41 GB/s, 105.25B loc/s) and used 16.209284MB memory
 19 |        That's 2.75x faster and 2.47x less memory than the mainline implementation!
 20 | ```
 21 | 
 22 | And I still have more optimization plans >:D !!! Stay tuned!
 23 | 
 24 | See my article on the new tokenizer, here: https://validark.dev/posts/deus-lex-machina/
 25 | 
 26 | ---
 27 | 
 28 | ## Tokenizer 1:
 29 | 
 30 | Everything beneath this notice was written with regard to Tokenizer 1. The information is a little out-of-date but the optimization strategies are still applicable.
 31 | 
 32 | 
 33 | [Click here to see my latest work.](#latest-work)
 34 | 
 35 | # Results
 36 | 
 37 | **Currently the utf8 validator is turned off! I did a lot of performance optimization the past few days and did not finish porting my changes over yet.**
 38 | 
 39 | The test bench fully reads in all of the Zig files under the folders in the `src/files_to_parse` folder. In my test I installed the Zig compiler, ZLS, and a few other Zig projects in my `src/files_to_parse` folder. The test bench iterates over the source bytes from each Zig file (with added sentinels) and calls the tokenization function on each **with the utf8 validator turned off**.
 40 | 
 41 | To tokenize 3,218 Zig files with 1,298,380 newlines, the original tokenizer and my new tokenizer have the following characteristics:
 42 | 
 43 | |  | memory (megabytes)|
 44 | |:-:|:-:|
 45 | | raw source files | 59.162811MB |
 46 | | original (tokens) | 46.089775MB |
 47 | | this (tokens) | 18.50827MB |
 48 | 
 49 | That's 2.49x less memory!
 50 | 
 51 | Please keep in mind that comparing to the legacy tokenizer's speed is not necessarily straightforward. It is not difficult for me to see the legacy tokenizer's performance change by ~15% by making a trivial change in my code. It varies heavily depending on the particular compile. That said, here are some numbers I am seeing on my machine (with the utf8 validator turned off on my implementation):
 52 | 
 53 | ### x86_64 Zen 3
 54 | 
 55 | **Currently the utf8 validator is turned off! I did a lot of performance optimization the past few days and did not finish porting my changes over yet.**
 56 | 
 57 | |  | run-time (milliseconds) | throughput (megabytes per second) |throughput (million lines of code per second) |
 58 | |:-:|:-:|:-:|:-:|
 59 | | read files (baseline) |   37.03ms  | 1597.85 MB/s | 35.06M loc/s |
 60 | | original              | 218.512ms  |  270.78 MB/s |  5.94M loc/s |
 61 | | this                  |  72.107ms  |  820.57 MB/s | 18.01M loc/s |
 62 | 
 63 | That's ~3.03x faster! **Currently the utf8 validator is turned off! I did a lot of performance optimization the past few days and did not finish porting my changes over yet.**
 64 | 
 65 | ### RISC-V SiFive U74
 66 | 
 67 | **Currently the utf8 validator is turned off! I did a lot of performance optimization the past few days and did not finish porting my changes over yet.**
 68 | 
 69 | |  | run-time (milliseconds) | throughput (megabytes per second) |throughput (million lines of code per second) |
 70 | |:-:|:-:|:-:|:-:|
 71 | | read files (baseline) | 318.989ms | 185.47 MB/s | 4.07M loc/s |
 72 | | original              |    2.206s |  26.81 MB/s | 0.59M loc/s |
 73 | | this                  | 894.963ms |  66.11 MB/s | 1.45M loc/s |
 74 | 
 75 | That's ~2.47x faster! **Currently the utf8 validator is turned off! I did a lot of performance optimization the past few days and did not finish porting my changes over yet.**
 76 | 
 77 | ## To-do
 78 | 
 79 | - Fix utf8 validator and get a good SWAR implementation.
 80 | - Make it so we can return memory which holds the non-newline bitmaps.
 81 | - Actually implement the AST parser.
 82 | 
 83 | # Maintenance note
 84 | 
 85 | Oddly enough, I think some of this code is more maintainable too, as adding an operator or keyword to the tokenizer is literally just adding another string into the relevant array. All of the assumptions and tricks I use are explicitly checked for in compile-time assertions (`grep` for `comptime assert`), so violating any of those invariants will result in compile errors that tell you why you can't change certain things.
 86 | 
 87 | However, I do have a bunch of weird SWAR tricks that the compiler will hopefully perform automatically one day.
 88 | 
 89 | # Designing for high performance
 90 | 
 91 | In the delicate balancing act that is performance optimization, you generally want:
 92 | 
 93 |   1. The ability to process more than one thing at once
 94 |   2. Fewer unpredictable branches
 95 |   3. A linear traversal over a smaller amount of contiguous memory
 96 | 
 97 | 
 98 | I try to achieve each of these in the following ways:
 99 | 
100 | 1. SIMD, i.e. single-instruction, multiple data. Instead of operating on a single element at a time, you can operate on 16, 32, or 64 elements simultaneously. Instead of going character-by-character, we use SIMD to check for the length of identifiers/keywords, the length of quotes, the length of whitespace, and the length of comments or single-line quotes. This allows us to move quicker than one byte at a time. We also use a SIMD technique to validate proper utf8 conformance, ported from [simdjson](https://github.com/simdjson/simdjson) by [travisstaloch](https://github.com/travisstaloch/) for use in [simdjzon](https://github.com/travisstaloch/simdjzon/). Please note that that particular code is licensed under the Apache license, included at the bottom of the `main.zig` file.
101 |     - I do not actually use SIMD to find "character literals" of the form `'a'` because these are generally extremely short and did not actually give much benefit in tests.
102 | 
103 |     - We can't and don't want to use SIMD for absolutely everything because:
104 |       - Comments can be inside of quotes and quotes can be inside of comments
105 |         - Selecting which bitstring to match against next is (probably?) not that efficient. You'd have to multiply each vector and then OR all the vectors together, get the next position, then repeat. I might try out this approach, but I doubt it will be that practical. I also note when I look at the arm64 output that it takes *many* more vector instructions than on x86_64, and doing everything in SIMD generates several hundred instructions on arm64. It might still be worth it though, especially on x86_64, but I doubt it.
106 |       - Operators are all over the place and doing everything in SIMD would require a lot of work that's not that bad for scalar code to do.
107 | 
108 | 2. SWAR, i.e., SIMD within a register. This is where we read multiple bytes into a 4 or 8 byte register and use conventional arithmetic and logical instructions to operate on multiple bytes simultaneously.
109 |     - SWAR fallbacks are provided for machines which lack proper SIMD instructions.
110 |         - We can check for equality against a character by broadcasting the character and performing an xor operation:
111 |         ```
112 |           0xaabbccddeeffgghh
113 |         ^ 0xcccccccccccccccc
114 |         --------------------
115 |           0x~~~~00~~~~~~~~~~
116 |         ```
117 |         - The previous step will result in 0's in the byte array in the positions where we found our target byte (in this case, `cc`). We can then add a broadcasted `0x7F`.
118 |         ```
119 |           0x~~~~00~~~~~~~~~~
120 |         + 0x7F7F7F7F7F7F7F7F
121 |           ----------------
122 |           0x8~8~7F8~8~8~8~8~
123 |         ```
124 |         - This will result in a 1 bit in the most significant bit of each byte that didn't start out as a 0 after the previous step. The only problem with the technique as I have presented it thus far is the potential for overflow across bytes. To remedy this, we mask out the highest bit of each byte before starting this algorithm. That way, when we add 7F we know it cannot overflow beyond the most significant bit of each byte, and then we know we can look at the most significant bit of each byte to tell us whether our target byte was **not** there.
125 |         - Then we can mask out the most significant bit of each byte and emulate a movmask operation, i.e. concentrate the bits together, with a multiplication:
126 |         ```
127 |         Example with 32 bit integers:
128 |         We want to concentrate the upper bits of each byte into a single nibble.
129 |         Doing the gradeschool multiplication algorithm, we can see that each 1 bit
130 |         in the bottom multiplicand shifts the upper multiplicand, and then we add all these
131 |         shifted bitstrings together. (Note `.` represents a 0)
132 |           a.......b.......c.......d.......
133 |         * ..........1......1......1......1
134 |         -------------------------------------------------------------------------
135 |           a.......b.......c.......d.......
136 |           .b.......c.......d..............
137 |           ..c.......d.....................
138 |         + ...d............................
139 |         -------------------------------------------------------------------------
140 |           abcd....bcd.....cd......d.......
141 | 
142 |         Then we simply shift to the right by `32 - 4` (bitstring size minus the number of relevant
143 |         bits) to isolate the desired `abcd` bits in the least significant byte!
144 |         ```
145 |     - Even on machines with vectors and powerful instructions, SWAR techniques may still be employed for operator matching.
146 | 3. Reducing unpredictable branches through:
147 |     - Using SIMD/SWAR. Using a conventional while loop to capture a completely unpredictable number of characters in the aforementioned categories all but guarantees a branch mispredict every time we exit the loop, and possibly multiple throughout the loop if the branch predictor is having a bad day. Using SIMD/SWAR, we can instead produce a bitstring with 0's marked in the place corresponding to target characters like the matching `"`, shift the bitstring according to our cursor's position, and count the trailing ones (the reason the bits are the inverse of what you might expect is that when we shift the bitstring it will be filled with 0's). In most cases, a single "count trailing ones" operation is all we need to find the position we are supposed to go to next. No need for a totally unpredictable while loop that goes character-by-character!
148 | 
149 |     - Using perfect hash functions. Specifically, keywords like `var` and `const` are mapped into a 7 bit address space by a perfect hash function. Identifiers can be checked against the list of keywords by applying the perfect hash function to each identifier and doing a table lookup to find what keyword it may match, then doing a single 16-byte vs 16-byte comparison to see if the identifier matches that keyword. The keywords are padded in memory to be 16 bytes and have a `len` stored in the final byte so we can check that the incoming identifier has the same length as the prospective keyword. We also use Phil Bagwell's array-mapped trie compression technique, meaning we have a 128-bit bitmap and find which position to check using the bitmap, thereby enabling us to have a packed buffer that need not have 128 slots. We do a similar trick for operators.
150 |       - One cool thing I can do because of Zig's comptime execution feature is tell Zig that a dummy operator/keyword is needed when we do not have an operator or keyword which hashes to the maximum 7 bit value, i.e. 127 (because I am hashing these to 7 bits of address space). If an operator or keyword is added or removed which hashed to 127, the comptime logic will automatically remove or add the dummy operator/keyword. Very nifty! At the moment, one of the perfect hash schemes needs a dummy element and the other does not. It's nice knowing that if we make a change like changing the hash function or adding/removing an operator or keyword, it will automatically figure out the right thing to do. These kinds of tricks are not good in conventional programming languages. We either have to do this work at start-up time or, even worse, someone bakes all the assumptions into the code and then changing it becomes a game of Jenga, except harder because the pieces are not all in one place. In Zig, we write it once and compile-time execution takes care of the rest.
151 | 
152 |     - I use a trick where I just allocate the upper-bound amount of memory for tokens per-file, and use the `resize` facility of the allocator to reclaim the space I did not fill. The nice thing about this trick is I can always assume there is sufficient space, which eliminates the need to check that such a thing is safe.
153 | 
154 |     - I place sentinels at the end of files (and I place a newline at the front) to make the rest of the code simpler. This allows us to safely go back a character at any point if the perfect hash function wants us to grab the last two characters from an identifier with only one character, and allows us to safely go past the end of the source file as well. By placing `"` and `'` characters at the end of our buffer, we can eliminate bounds-checking in the code that searches for those characters, and simply check whether we hit the sentinel node after the hot loop finishes. We currently don't break out of these for newlines though, which we should probably do. All other validation for these should occur when actually trying to allocate the string or character they are supposed to represent.
155 | 
156 |     - Some things we do unconditionally that could be hidden behind a branch, but are very inexpensive so there is no point. Other things we hide behind a branch when it's expensive and generally predictable. E.g. utf8 validation is typically just making sure all bytes are less than 128, i.e. 0x80. Once we see some non-ascii sequences, then we have to do the more computationally expensive work of making sure the byte sequence is valid.
157 | 
158 |     - Table lookups. I consolidate the SIMD/SWAR code into one so that we go down the exact same codepaths to find how many non_newline/identifier/non_unescaped_quote/space characters to jump over. This is probably much more efficient than having 4 separate copies of the same hot loop.
159 | 
160 |     - Inlining the SIMD/SWAR loop, even on machines that need to unroll 8 times. This turns out to be worth it in my tests, probably because it is an extremely hot loop!
161 | 
162 | 4. We reduce memory consumption by not storing start indices explicitly, which typically need to match the address space of the source length. In the case of Zig, where source files are constrained to be at most ~4GiB, only 32 bits of address space is needed for any given file. Thus the goal becomes reducing 32-bit start indices to something smaller. Quasi-succinct schemes for reducing the space consumption of monotonically increasing integer sequences immediately spring to mind, such as [Elias-Fano encoding](https://www.antoniomallia.it/sorted-integers-compression-with-elias-fano-encoding.html). However, we can achieve good space compression by simply storing the length of each token rather than the start index. Because tokens almost always have a length that can fit in a byte, we try to store all lengths in a byte. In the event that the length is too large to be stored in a byte, we store a `0` instead and make the next 4 bytes the true length. This works because tokens cannot have a length of 0, else they would not exist, therefore we can use lengths of `0` to trigger special behavior. We also know that this idea does not affect the upper bound on the number of Token elements we need to allocate because in order for a token to take up 3 times as much space as a typical token, it needs to have a length of at least 256, which the astute reader may note is significantly larger than 3.
163 | 
164 | 5. Use fewer variables where possible. While machines nowadays have *a lot* more registers than they used to, you still only have access to 16 or 32 general purpose registers! If you have more variables than that, you have to spill to the stack (it's actually worse than this, because intermediate values in expressions temporarily need their own registers too). While machines do have extra registers they can use under the hood, you do not! Therefore, we can get better performance by
165 |    - Using pointers rather than pointers + index
166 |    - Being clever about how we write out our `non_newlines` bitstrings. Instead of storing all of the bitstrings I get from the SIMD/SWAR code on the stack in a `[4]u64` (on 64 bit machines), and then writing separately to a `non_newlines` pointer, I write *all* the bitstrings into the memory allocated for the `non_newlines` bitstrings. Each time, I increment the place we are writing in the allocation by the width of a single bitstring, i.e. 8 bytes on 64 bit machines. Since I always write the `non_newlines` into the current position in the allocated memory and the other bitstrings are written after it, we will be left at the end with only `non_newlines` bitstrings. The only downside is that we need to allocate 3 more u64's than we otherwise would, but that's hardly any trouble. Here is a diagram of how this strategy looks in memory:
167 | 
168 |    ```
169 |    |0|1|2|3|4|5|6|7|8|9| <- slots
170 |    |a|b|c|d|   <- We write our bitstrings to 4 slots. (`a` is `non_newlines`)
171 |      |a|b|c|d| <- Each time, we move one slot forward
172 |        |a|b|c|d|
173 |          |a|b|c|d|
174 |            |a|b|c|d|
175 |              |a|b|c|d|
176 |                |a|b|c|d|
177 |    |a|a|a|a|a|a|a|b|c|d| <- In the end, we are left with this
178 |    ```
179 | 
180 | # Still to-do
181 | 
182 | Aside from the to-do's listed in the `main.zig` file, the plan with this is to rewrite the Zig parser which produces the Abstract Syntax Tree as well. I have a number of ideas on how to dramatically improve efficiency there as well. Stay tuned!
183 | 
184 | My ultimate goal is that this repository will be integrated with the Zig compiler.
185 | 
186 | # How to use
187 | 
188 | ```
189 | git clone https://github.com/Validark/Accelerated-Zig-Parser.git
190 | ```
191 | 
192 | Next, install one or more Zig projects under the `src/files_to_parse` folder.
193 | 
194 | ```
195 | cd Accelerated-Zig-Parser/src/files_to_parse
196 | git clone https://github.com/ziglang/zig.git
197 | git clone https://github.com/zigtools/zls.git
198 | ```
199 | 
200 | Then run it!
201 | 
202 | ```
203 | cd ../..
204 | zig build -Doptimize=ReleaseFast run
205 | ```
206 | 
207 | 
208 | # Latest work
209 | 
210 | In the last few days, I have:
211 | 
212 | - Disabled loop unrolling for the quote parsing loop on SWAR-enabled machines. Only a 1% uplift on my SiFive U74, but considering that's about 9 milliseconds at the moment, I'll take it.
213 | 
214 | - Updated the keyword lookup algorithm for non-vector architectures to use aligned loads where possible. There still could be room for improvement but today I saw a **~5% performance uplift**.
215 | 
216 | - Updated to a [slightly better optimized version](https://github.com/simdjson/simdjson/pull/2042) of the escape-detection algorithm.
217 | 
218 | - Made toggles so that `"` and `'` can be moved between the SIMD/SWAR section and a naïve scalar version very easily. It appears that for machines which have to use SWAR, it is faster to do the naïve scalar version (almost **~8% uplift on my RISC-V SiFive U74**). On the other hand, it's still more efficient on my desktop to do quote classification in SIMD, but for other less-powerful devices, it may not be worth it.
219 |     - There is also a trade-off to be made on big-endian hardware. The SIMD `'`/`"` escape detection algorithm currently has to be done in little-endian, so there necessarily has to be a reversal somewhere (or byte-reversal on the vectors) if we want to not use the naïve scalar version.
220 |         - With SIMD, we need to do a vector reversal, unless we have fast machine-word bit-reverse instructions. At the moment I am not aware of any ISA's supported by the Zig compiler with a fast bit-reverse instruction besides arm/aarch64.
221 |             - We take advantage of arm's bit-reverse instruction (`rbit`) so that we can use `clz` directly in our bitstring queries, rather than `rbit`+`clz`. On little-endian machines, we do the flip after the escape-detection algorithm. On big-endian, we can do it before, but then we can just bitreverse the backslashes before and after the escape detection algorithm. arm is typically little-endian these days, but who knows, maybe a future ISA can take advantage of the flexibility.
222 |         - mips64 and powerpc64 have built-in `clz` instructions, and emulate `ctz` via `@bitSizeOf(usize) - clz(~x & (x -% 1))`. Therefore, if we wanted to do quotes in SIMD *and* use `clz`, we would have to flip our bitstrings twice! Ouch! Hopefully I or someone else figures out how to make a big-endian equivalent of the escape character bitstring generation algorithm.
223 |         - some sparc64 machines have `popc` (e.g. niagara 2 and up), which can emulate `ctz` via `popc(~x & (x -% 1))`. To do `clz` we have to do `x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16; x |= x >> 32;` to spread the most-significant bit all the way right-wards, then we can invert the bitstring to get a mask of the leading zeroes, and popcount that. So on big-endian sparc64 machines, we WANT to do a bitreverse. Also, LLVM's Vectorization currently does not work on sparc64 (or powerpc) machines, so we probably have to use the SWAR algorithm for the time being.
224 |         - machines which do not have `clz` builtin can probably emulate a `ctz` faster than a `clz`.
225 | 
226 |         - With SWAR, we can do either a `@byteSwap` on the register being treated as a vector or we can reverse the bits with a bitreversing movmask algorithm. The problem with the latter is that we have to do an extra shift right on the bitstring before multiplying because the most significant bit of the most significant byte has to be moved to the lowest bit position of its byte. We *can* avoid this extra shift by instead using a multiply-high operation and concentrating the bits in the upper half of a double machine-word, and maintaining a 3 instruction movmask. However, the problem with this idea is that multiply-high might be a function call or have terrible throughput / port constraints, whereas multiplies typically have a throughput of 1 per cycle. Is the throughput actually a problem in practice though? Unsure. We *do* have quite a lot of other work to do in between multiplies.
227 |             - To generate 3 bitstrings for a chunk, we need 3 extra instructions for the bitreversed movmask (assuming we don't do `'`/`"` in SWAR). Therefore, if we can do a machine-word byte-reverse faster than 3 shifts and/or in fewer than 3 instructions, it would be smarter to do the byte-reverse. Alternatively, if we have fast multiply-high's somehow, we could use that to eliminate the 3 extra shifts (per native subchunk).
228 | 
229 |         - For now, I think the bitstring escape sequence algorithm is best left disabled on big-endian hardware without a fast bit-reverse (so at the moment it would only stay enabled on arm).
230 | 
231 |         - Since sparc64 and powerpc have to use SWAR until LLVM improves, they should do quote/escape logic in scalar code, not vector code. sparc64 machines lack bit-reverse and byte-reverse, so we can use the movemask-reversed function on sparc64.
232 |         - powerpc can stay in big-endian land and use `clz`.
233 |         - mips64 has an extension to the ISA which adds vector instructions, although I'm not sure if it has gone into real hardware yet or whether those vectors are actually useful for the kind of SIMD we are doing here.
234 |             - therefore mips64 can stay in big-endian land and use `clz`.
235 | 
236 | - Partially added some control character bans but there is still more to be done. Still, as of yet, incomplete.
237 | 
238 | - Replaced the SWAR movmask algorithm with one significantly better on typical hardware. Before, we were using [an algorithm from Wojciech Muła](http://0x80.pl/articles/scalar-sse-movmask.html) which for 64 bit operand `x` would basically do: `(@as(u128, x) * constant) >> 64`. Now, we can stay within the lower 64 bits by concentrating the target bits in the most significant byte, so no widening is necessary. This is really good news for basically every machine I could find info on for the difference between `mulhi` vs `mul`. Typically `mulhi` instructions have much higher latency and significantly worse throughput, and some machines do not even have a `mulhi` instruction at all. My algorithm modifies [Wojciech Muła's algorithm](http://0x80.pl/articles/scalar-sse-movmask.html) to use only the lower 64 bits of the product of the multiplication:
239 |     ```
240 |     Example with 32 bit integers:
241 |     We want to concentrate the upper bits of each byte into a single nibble.
242 |     Doing the gradeschool multiplication algorithm, we can see that each 1 bit
243 |     in the bottom multiplicand shifts the upper multiplicand, and then we add all these
244 |     shifted bitstrings together. (Note `.` represents a 0)
245 |       a.......b.......c.......d.......
246 |     * ..........1......1......1......1
247 |     -------------------------------------------------------------------------
248 |       a.......b.......c.......d.......
249 |       .b.......c.......d..............
250 |       ..c.......d.....................
251 |     + ...d............................
252 |     -------------------------------------------------------------------------
253 |       abcd....bcd.....cd......d.......
254 | 
255 |     Then we simply shift to the right by `32 - 4` (bitstring size minus the number of relevant
256 |     bits) to isolate the desired `abcd` bits in the least significant byte!
257 |     ```
258 | 
259 | - Laid groundwork for exporting non_newline bitmaps, that way we can use it later on in the compiler to figure out what line we are on [without needing to go byte-by-byte later on in the pipeline](https://github.com/ziglang/zig/blob/91e117697ad90430d9266203415712b6cc59f669/src/AstGen.zig#L12498C10-L12515).
260 |     - Use a clever trick where we write out the SIMD/SWAR movmasked bitstrings into the allocated area, but we shift where we are writing by the width of one bitstring each time. That way, in the end we have filled our buffer with the first bitstring we write out in each step, with the overhead of basically one instruction (the pointer increment) per chunk (64 bytes on 64 bit machines).
261 |     ```
262 |     |0|1|2|3|4|5|6|7|8|9| <- slots
263 |     |a|b|c|d|   <- We write our bitstrings to 4 slots. (`a` is `non_newlines`)
264 |       |a|b|c|d| <- Each time, we move one slot forward
265 |         |a|b|c|d|
266 |           |a|b|c|d|
267 |             |a|b|c|d|
268 |               |a|b|c|d|
269 |                 |a|b|c|d|
270 |     |a|a|a|a|a|a|a|b|c|d| <- In the end, we are left with this
271 |     ```
272 | 
273 | 
274 | - Fixed random performance issues, like the compiler not realizing that our SIMD/SWAR chunks are always aligned loads. (It matters a lot on less-mainstream machines!)
275 | 
276 | - Made the SIMD/SWAR code go chunk by chunk in lockstep rather than have each individual component load its 64 (on 64-bit machines) bytes separately. I am assuming that LLVM was able to reuse loaded vectors on some occasions, but in practice I saw a massive speedup in the last week. Granted, the utf8 validator was turned off temporarily while it is being reworked. However, on my Zen 3 machine I typically saw basically no performance difference between running the utf8 validator versus not. The reason for this is because we can almost always exit early (when the entire chunk is ascii). Due to alignment/cache/happenstance, I typically saw my tokenization times go *down* with the utf8 validator turned *on*, so I don't think I am unfairly advantaging my most recent measurements.
277 | 
278 | - Turned off the utf8 validator. I need to fix the types for it so it can be re-enabled. We also need to port a SWAR version. simdjson or Golang might have some tricks we can use.
279 | 
280 | - Added an option to enable or disable the folding of comments into adjacent nodes (`FOLD_COMMENTS_INTO_ADJACENT_NODES`). This should make it a little easier to change my mind on the particulars of the AST implementation.
281 | 
282 | - Added more tests and compile-time assertions. We're getting there!
283 | 


--------------------------------------------------------------------------------