├── .gitignore
├── README.md
├── build.jai
├── eval.jai
├── linux.jai
├── macos.jai
├── movegen.jai
├── nnue_avx2.jai
├── nnue_cpu.jai
├── nnue_probe.jai
├── nnue_sse.jai
├── resources
    ├── AnonymousPro.ttf
    ├── capture.wav
    ├── chess_pieces.png
    ├── move.wav
    ├── nn-04cf2b4ed1da.nnue
    └── settings_icon.png
├── search.jai
├── uci.jai
├── ui.jai
└── windows.jai


/.gitignore:
--------------------------------------------------------------------------------
 1 | .build/
 2 | chess
 3 | ceij
 4 | jai
 5 | berserk
 6 | stockfish
 7 | koivisto
 8 | Chess
 9 | Ceij
10 | Jai
11 | Berserk
12 | Stockfish
13 | Koivisto
14 | *.pdb
15 | *.nnue
16 | *.pdf
17 | *.exe
18 | *
19 | resources/
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Chess Engine in Jai
  2 | 
  3 | This Chess Engine in Jai is a hobby project designed to find bugs and benchmark the Jai Compiler.
  4 | You can find video demos of this Chess Engine [here](https://www.youtube.com/watch?v=2OvE0I_rdpI&list=PL2fmKE0pL4IyET-eKbbBPw_i9IHN1QmFZ&index=1) 
  5 | 
  6 | ## How to Build:
  7 | This project includes both a Chess Graphical User Interface and a UCI compatible Chess Engine.
  8 | 
  9 | Type `jai build.jai` to build both the UI and AI in `release` mode.
 10 | 
 11 | Type `jai build.jai - ui` to build the GUI.
 12 | 
 13 | Type `jai build.jai - ai` to build the AI.
 14 | 
 15 | Type `jai build.jai - ui ai` to build both the GUI and the AI.
 16 | 
 17 | Type `jai build.jai - release` to build an optimized build.
 18 | 
 19 | Type `jai build.jai - release ai nnue_probe` to build the engine using Daniel Shawul's NNUE-Probe library. This C library is entirely optional, and is used to provide the best SIMD code for unsupported computers (e.g. AVX512, ARM NEON, MMX, etc.)
 20 | 
 21 | The build script will try to detect the CPU you are using, and pick either `AVX2`, `SSE`, or `cpu` automatically based on your processor. Send the flags `avx2`, `sse`, or `cpu` to manually toggle what you want. `cpu` means running with no SIMD support. Because Neural Networks are based on matrix multiplication, SIMD is needed to obtain the best performance.
 22 | 
 23 | ## Code Organization
 24 | This code is divided up into the following files:
 25 | * `build.jai` is the build script for both the User Interface (GUI) and the AI engine.
 26 | * `eval.jai` contains chess evaluation parameters
 27 | * `movegen.jai` contains the `Chess` structs and Piece definitions
 28 | * `search.jai` is the main `Chess` search function. This file contains code for Multi-threaded Parallel Search
 29 | * `uci.jai` contains the `main` function for the AI. This file parses the Universal Chess Interface protocol for the engine
 30 | * `ui.jai` is the `main` function for the UI. This file handles the User Interface with `Simp` and `GetRect`
 31 | * `linux.jai` is code specific to the Linux OS.
 32 | * `windows.jai` is code specific to the Windows OS.
 33 | * `macos.jai` is code specific to Mac OS.
 34 | * the `resources` directory contains the following:
 35 |   * Fonts
 36 |   * Sound Effects
 37 |   * Chess Piece Picture
 38 |   * Neural Network Model for AI
 39 | * The Efficiently Updatable Neural Network Code is organized as follows:
 40 |   * `nnue_avx2.jai` contains code for AVX2 processors.
 41 |   * `nnue_sse.jai` contains code for SSE processors.
 42 |   * `nnue_probe.jai` contains code for interacting with the NNUE-Probe C library by Daniel Shawul. This can be used to get the best optimization for SIMD architectures unsupported by the existing Jai code (e.g. AVX512 SIMD, ARM NEON SIMD, X86-64 MMX SIMD, etc.)
 43 |   * `nnue_cpu.jai` is a default code with no SIMD. Because it is general with no SIMD whatsoever, this allows someone to run it on any platform.
 44 | 
 45 | ## User Interface Features
 46 | 
 47 | * Drag and drop behavior to move pieces
 48 | * Click to move pieces
 49 | * Legal move generation and detection
 50 | * Flip the board using the `X` Key
 51 | * New game using the `Ctrl+N` Key
 52 | * Undo move using the `Ctrl+Z` Key
 53 | * Redo move using the `Ctrl+Y` Key
 54 | * Implements UCI (Universal Chess Interface) Protocol to communicate with any UCI compatible chess engine (e.g. Stockfish, RubiChess)
 55 | * Can do Human vs Computer, Computer vs Human, Human vs Human, and Computer vs Computer.
 56 | * Parses and loads FEN strings
 57 | * Engine vs. Engine Implemented (e.g. Stockfish vs Komodo)
 58 | * Multi-engine support
 59 | * UCI Engine Options
 60 | * Console command-line
 61 | * Windows, Mac, and Linux support
 62 | * Chess Clock with Increment
 63 | * Blindfold Mode
 64 | * Highlight Squares with Right Click
 65 | * Draw Arrows on the Board with Right Click
 66 | * Display Engine Lines
 67 | * Dark Background/Light Background Theme
 68 | 
 69 | ## Chess Engine in Jai
 70 | * Estimated elo: 3100 
 71 | * UCI protocol
 72 | 
 73 | ### Chess Engine Options
 74 | * Clear Hash - clears out the transposition table
 75 | * Transposition Table Memory Size (in MB). - adjusts the size of the transposition table in megabytes
 76 | * Number of Threads - adjusts the number of threads used by the CPU. increasing the number of threads should increase the playing strength of the CPU
 77 | * Multiple Principal Variation Lines - detects multiple PV lines. Set the number of PV lines to 1 for the best playing strength (the default)
 78 | * Difficulty Levels 1 to 8 - adjusts the difficulty of the engine.
 79 | 
 80 | ### Parallel Search
 81 | * Lazy Shared Memory Processing Parallel Search
 82 | * Uses the `Thread_Group` Jai Module to implement threading
 83 | 
 84 | ### Board Representation
 85 | * 8x8 Board
 86 | * Legal/Pseudo-Legal Move Generation
 87 | * Staged Move Generation
 88 | * Bitboards with Little Endian Rank-File Mapping
 89 | * Fancy Magic Bitboards with Parallel Bit Extract
 90 | * Kogge-Stone Algorithm
 91 | * Move generator generates 324 million positions per second
 92 | * Moves encoded as 16-bit integers
 93 | 
 94 | ### Search
 95 | * Negamax Search with Alpha-Beta Pruning
 96 | * Principal Variation Search
 97 | * Iterative Deepening
 98 | * Aspiration Window Search
 99 | * Internal Iterative Deepening
100 | 
101 | ### Pruning and Reductions
102 | * Null Move Pruning w/ Verification
103 | * Low Depth SEE Pruning
104 | * Reverse Futility Pruning/Static Move Pruning
105 | * Delta Pruning
106 | * SEE Quiescence Search Pruning
107 | * Razoring
108 | * Mate Distance Pruning
109 | * History Leaf Reduction/Pruning
110 | * Prob Cut
111 | * Late Move Reduction/Pruning
112 | 
113 | ### Extensions
114 | * Singular Search Extensions
115 | * Check Extensions
116 | * Recapture Extensions
117 | 
118 | ### Transposition Table
119 | * 16-byte Transposition Table Hash Entries
120 | * 3-fold repetition & Fifty-move Rule
121 | * Zobrist Hashing w/ Incremental Update
122 | * Transposition Table Probing in Non PV Nodes
123 | * Prefetch Transposition Table Entries
124 | * Depth-Preferred Replacement Scheme
125 | * Aging
126 | 
127 | ### Move Ordering
128 | * MVV-LVA (Most Valuable Victim, Least Valuable Attacker) Implementation
129 | * Transposition Table Hash Move
130 | * Killer Moves Move Ordering
131 | * History Moves Move Ordering
132 | * Tactical History Move Ordering
133 | * Countermove and Follow Up History Move Ordering
134 | * Relative History Move Ordering
135 | 
136 | ### Evaluation
137 | * Efficiently Updatable Neural Networks with Incremental Update
138 | * Simple Mop-Up Endgame Evaluation
139 | * NNUE Stockfish 12 HalfKP Architecture
140 | * Supports the following CPU Architectures:
141 |   * AVX2
142 |   * SSE 
143 |   * CPU w/o SIMD
144 | 
145 | ### Credits
146 | * Thank you to Jai Beta Users Patrik Smělý and Don Swet (github.com/cookednick) for testing the Chess program on Mac OS.
147 | * Thank you to Maksim Korzh, author of the BBC chess engine. His work provided the main inspiration for this project. Here is a link to the [BBC Chess Engine](https://github.com/maksimKorzh/bbc).
148 | * Thank you to Daniel Shawul for translating Stockfish NNUE into C. His work was the basis for my NNUE implementation. [nnue-probe](https://github.com/dshawul/nnue-probe)
149 | * Thank you to Jonathan Blow, for allowing me access to the Jai Compiler Beta. [Jai-Community-Wiki](https://github.com/Jai-Community/Jai-Community-Library)
150 | * [Berserk Chess Engine](https://github.com/jhonnold/berserk) was a great chess engine I learned a lot from. The code is well-organized and well-written. I highly recommend looking through Jay Honnold's work if you want to learn how to take your chess engine to the next level.
151 | * [The Stockfish PyTorch NNUE Guide](https://github.com/glinscott/nnue-pytorch/blob/master/docs/nnue.md) is an incredibly detailed look at how NNUE works. I recommend reading it over and over again to get a full grasp about what is going on.
152 | 
153 | 


--------------------------------------------------------------------------------
/build.jai:
--------------------------------------------------------------------------------
  1 | #import "Basic";
  2 | #import "Compiler";
  3 | #import "Machine_X64";
  4 | #import "String";
  5 | #import "File";
  6 | #import "File_Utilities";
  7 | #import "Process";
  8 | 
  9 | ui_exe_name :: "chess";
 10 | ai_exe_name :: "ceij";
 11 | author :: "Daniel Tan";
 12 | 
 13 | add_chess_engine_files :: (w: Workspace, cpu_info: *Cpu_X86, type: NNUE_Type) {
 14 |   // Adds the engine's source files, one NNUE back end and the generated UCI banner to workspace `w`.
 15 |   create_uci_message :: () -> string #expand { // returns Jai source text declaring `uci_response` as a `#string END` heredoc
 16 |     NEWLINE := ifx OS == .WINDOWS "\r\n" else "\n";
 17 |     builder: String_Builder;
 18 |     print_to_builder(*builder, "uci_response :: #string END%1", NEWLINE);
 19 |     print_to_builder(*builder, "id name %1%2", ai_exe_name, NEWLINE);
 20 |     print_to_builder(*builder, "id author %1%2", author, NEWLINE);
 21 |     print_to_builder(*builder, "info string neural network type %1%2", type, NEWLINE);
 22 |     print_to_builder(*builder, "END%1", NEWLINE);
 23 |     return builder_to_string(*builder);
 24 |   }
 25 | 
 26 |   // OS specific code.
 27 |   #if OS == .LINUX {
 28 |     add_build_file("linux.jai", w);  
 29 |   } else #if OS == .WINDOWS {
 30 |     add_build_file("windows.jai", w);  
 31 |   } else #if OS == .MACOS {
 32 |     add_build_file("macos.jai", w);  
 33 |   }
 34 | 
 35 |   // cross platform layer.
 36 |   add_build_file("uci.jai", w);  
 37 |   add_build_file("search.jai", w);  
 38 |   add_build_file("eval.jai", w);  
 39 |   add_build_file("movegen.jai", w);  
 40 | 
 41 |   // determine which NNUE SIMD to build.
 42 |   if type == .auto {
 43 |     // automatically add based on whether the CPU has AVX features or not.
 44 |     print("Detecting CPU instruction set.\n");
 45 |     if check_feature(cpu_info.feature_leaves, .AVX2) {
 46 |       type = .avx2;
 47 |     } else if check_feature(cpu_info.feature_leaves, .SSE) {
 48 |       type = .sse;
 49 |     } else {
 50 |       type = .cpu;
 51 |     }
 52 |   }
 53 |   // From here on `type` is a concrete back end; exactly one nnue_*.jai file is added below.
 54 |   if #complete type == {
 55 |   case .auto;
 56 |     assert(false); // .auto was replaced by the detection above, so this case is not expected to run
 57 |   case .cpu;
 58 |     print("Building Chess Engine with NNUE CPU\n");
 59 |     add_build_file("nnue_cpu.jai", w);  
 60 |   case .sse;
 61 |     print("Building Chess Engine with NNUE SSE\n");
 62 |     add_build_file("nnue_sse.jai", w);  
 63 |   case .avx2; 
 64 |     print("Building Chess Engine with NNUE AVX2\n");
 65 |     add_build_file("nnue_avx2.jai", w);
 66 |   case .nnue_probe;
 67 |     print("Building Chess Engine with NNUE Probe\n");
 68 |     // The shared library is expected in the current directory; it is cloned and built on demand.
 69 |     // This path has only been tested on Linux. nnue-probe may not build or run on
 70 |     // Windows/macOS; the names below for those systems are untested guesses.
 71 |     library: string;
 72 |     location: string;
 73 |     if OS == .LINUX {
 74 |       library = "libnnueprobe.so";
 75 |       location = "nnue-probe/src/libnnueprobe.so";
 76 |     } else if OS == .WINDOWS {
 77 |       library =  "libnnueprobe.dll";
 78 |       location = "nnue-probe/src/libnnueprobe.dll";
 79 |     } else if OS == .MACOS {
 80 |       library =  "libnnueprobe.dylib";
 81 |       location = "nnue-probe/src/libnnueprobe.dylib";
 82 |     }
 83 |     if !file_exists(library) {
 84 |       if !file_exists("nnue-probe") {
 85 |         print("Downloading nnue-probe library by Daniel Shawul.\n");
 86 |         run_command("git", "clone", "https://github.com/dshawul/nnue-probe");
 87 |       }
 88 |       // NOTE(review): the results of run_command and file_move are not checked; a failed clone or make goes unreported here.
 89 |       run_command("make", "--directory=nnue-probe/src");
 90 |       file_move(location, library);
 91 |     }
 92 | 
 93 |     add_build_file("nnue_probe.jai", w);
 94 |   case;
 95 |     assert(false, "Invalid build script.\n");
 96 |   }
 97 |   // Generated after the selection above so the banner reports the back end stored in `type`.
 98 |   uci_message := create_uci_message();
 99 |   add_build_string(uci_message, w);
100 | 
101 | 
102 | }
103 | 
104 | add_ui_files :: (w: Workspace) { // adds the GUI's source files to workspace `w`
105 |   // OS specific code.
106 |   #if OS == .LINUX {
107 |     add_build_file("linux.jai", w);  
108 |   } else #if OS == .WINDOWS {
109 |     add_build_file("windows.jai", w);  
110 |   } else #if OS == .MACOS {
111 |     add_build_file("macos.jai", w);  
112 |   }
113 | 
114 |   // cross platform layer.
115 |   add_build_file("ui.jai", w);  // entry point of the GUI executable (per README)
116 |   add_build_file("movegen.jai", w);  // shared with the engine build
117 | }
118 | 
119 | #run { // build-script entry point: parse the command line, then create one workspace per requested target
120 |   cpu_info := get_cpu_info();
121 |   defer set_build_options_dc(.{do_output=false}); // the build script itself produces no executable
122 |   target_options := get_build_options();
123 |   args := target_options.compile_time_command_line;
124 |   flags: CompileFlags = 0;
125 |   nnue_type: NNUE_Type = .auto;
126 |   if args.count == 0 then flags = CompileFlags.ui | .ai | .release; // no arguments: optimized build of both targets
127 |   for arg: args { // unrecognized arguments are ignored; the last NNUE selector given wins
128 |     if arg == {
129 |     case "ui";
130 |       flags |= .ui;
131 |     case "ai";
132 |       flags |= .ai;
133 |     case "debug";
134 |       flags &= ~.release;
135 |     case "release";
136 |       flags |= .release;
137 |     case "avx2";
138 |       nnue_type = .avx2;
139 |     case "sse";
140 |       nnue_type = .sse;
141 |     case "cpu";
142 |       nnue_type = .cpu;
143 |     case "nnue_probe";
144 |       nnue_type = .nnue_probe;
145 |     }
146 |   }
147 |   no_target := true; // arguments such as `release` or `avx2` alone name no target; previously that built nothing
148 |   if flags & .ui then no_target = false;
149 |   if flags & .ai then no_target = false;
150 |   if no_target then flags |= CompileFlags.ui | .ai; // default to both, matching the documented `jai build.jai - release`
151 | 
152 |   if flags & .ai {
153 |     w: Workspace;
154 |     if flags & .release {
155 |       w = build_release(*cpu_info, ai_exe_name);
156 |     } else {
157 |       w = build_debug(*cpu_info, ai_exe_name);
158 |     }
159 |     add_chess_engine_files(w, *cpu_info, nnue_type);
160 |     print("Building Chess Engine [%]\n", ai_exe_name);
161 |   }
162 | 
163 |   if flags & .ui {
164 |     w: Workspace;
165 |     if flags & .release {
166 |       w = build_release(*cpu_info, ui_exe_name);
167 |     } else {
168 |       w = build_debug(*cpu_info, ui_exe_name);
169 |     }
170 |     add_ui_files(w);
171 |     print("Building Chess User Interface [%]\n", ui_exe_name);
172 |   }
173 | }
174 | 
175 | CompileFlags :: enum_flags { // what the build script should produce; combined from command-line arguments
176 |   release; // use build_release instead of build_debug
177 |   ui;      // build the GUI executable (ui_exe_name)
178 |   ai;      // build the engine executable (ai_exe_name)
179 | }
180 | 
181 | NNUE_Type :: enum { // which nnue_*.jai implementation add_chess_engine_files adds to the engine
182 |   auto;       // choose avx2, sse or cpu from the build machine's CPU features
183 |   cpu;        // nnue_cpu.jai
184 |   sse;        // nnue_sse.jai
185 |   avx2;       // nnue_avx2.jai
186 |   nnue_probe; // nnue_probe.jai plus the external nnue-probe shared library
187 | }
188 | 
189 | build_debug :: (cpu_info: *Cpu_X86, executable_name: string) -> Workspace { // new workspace configured for a debug build
190 |   workspace := compiler_create_workspace(executable_name);
191 |   build_options := get_build_options(workspace);
192 |   build_options.output_executable_name = executable_name;
193 |   set_optimization(*build_options, .DEBUG);
194 |   build_options.backend = .X64;      // set after set_optimization, as in the original ordering
195 |   build_options.stack_trace = true;
196 |   set_build_options(build_options, workspace);
197 |   movegen_features(cpu_info, workspace); // injects the CPU-dependent constants
198 |   print("Creating Debug Build for: %\n", executable_name);
199 |   return workspace;
200 | }
201 | 
202 | build_release :: (cpu_info: *Cpu_X86, executable_name: string) -> Workspace { // new workspace configured for an optimized build
203 |   workspace := compiler_create_workspace(executable_name);
204 |   build_options := get_build_options(workspace);
205 |   build_options.output_executable_name = executable_name;
206 |   set_optimization(*build_options, .OPTIMIZED);
207 |   build_options.llvm_options.enable_split_modules = false;
208 |   build_options.stack_trace = false;
209 |   set_build_options(build_options, workspace);
210 |   movegen_features(cpu_info, workspace); // injects the CPU-dependent constants
211 |   print("Creating Release Build for: %\n", executable_name);
212 |   return workspace;
213 | }
214 | 
215 | movegen_features :: (cpu_info: *Cpu_X86, w: Workspace) { // adds two bool constants, derived from CPU features, to workspace `w`
216 |   has_bmi1 := check_feature(cpu_info.feature_leaves, .BMI1);
217 |   blsr_decl := tprint("ENABLE_BLSR :: %;\n", has_bmi1);
218 |   print(blsr_decl); // echo the generated declaration to the build log
219 |   add_build_string(blsr_decl, w);
220 | 
221 |   has_bmi2 := check_feature(cpu_info.feature_leaves, .BMI2);
222 |   magic_decl := tprint("FANCY_MAGIC_BITBOARDS :: %;\n", has_bmi2);
223 |   print(magic_decl);
224 |   add_build_string(magic_decl, w);
225 | }
226 | 


--------------------------------------------------------------------------------
/eval.jai:
--------------------------------------------------------------------------------
 1 | // for NNUE resources: https://hxim.github.io/Stockfish-Evaluation-Guide/
 2 | 
 3 | uci_evaluate :: (chess: *Chess) -> int #expand { // macro wrapper around nnue_evaluate_board
 4 |   return nnue_evaluate_board(chess); // NOTE(review): nnue_avx2.jai declares the parameter as `Chess` by value while a pointer is passed here — confirm
 5 | }
 6 | 
 7 | evaluate :: (chess: *ChessGame, fifty: int) -> int { // trivial-endgame score if one applies, otherwise the NNUE score scaled by (100 - fifty) / 200
 8 |   is_trivial, trivial_score := trivial_evaluation(chess, fifty);
 9 |   if is_trivial then return trivial_score;
10 |   nnue_score := nnue_evaluate(chess);
11 |   return (nnue_score * (100-fifty)) / 200;
12 | }
13 | 
14 | #scope_file
15 | 
16 | 
17 | // We use trivial evaluation to deal with endgame positions.
18 | trivial_evaluation :: (using chess: *Chess, fifty: int) -> bool, int { // NOTE: `fifty` is accepted but not read here
19 |   // Returns (true, score) when one side has only its king and the other has a queen, rook or bishop; otherwise (false, 0).
20 |   WIN_SCORE :: 5_000;
21 | 
22 |   eval_winning_position :: (strong_king: u64, q: u64, r: u64, b: u64, n: u64, p: u64, weak_king: u64) -> int {
23 |     // Score from the stronger side's point of view: WIN_SCORE plus king-placement, material and pawn bonuses.
24 |     push_to_edge :: (sq: int) -> int {
25 |       r := sq / 8;
26 |       f := sq % 8;
27 |       fd := min(f, 7-f); // distance to the nearer edge along each axis, 0..3
28 |       rd := min(r, 7-r);
29 |       return 90 - (7 * fd * fd / 2 + 7 * rd * rd / 2); // largest (90) when both distances are 0
30 |     }
31 |     // Per-pawn bonus indexed by square. The white caller passes its pawns through bit_reverse64, the black caller passes them as-is.
32 |     eg_pawn_table :: int.[
33 |       0,   0,   0,   0,   0,   0,   0,   0,
34 |     750, 750, 750, 750, 750, 750, 750, 750,
35 |     550, 550, 550, 550, 550, 550, 550, 550,
36 |     250, 250, 250, 250, 250, 250, 250, 250,
37 |     150, 150, 150, 150, 150, 150, 150, 150,
38 |      50,  50,  50,  50,  50,  50,  50,  50,
39 |      25,  25,  25,  25,  25,  25,  25,  25,
40 |       0,   0,   0,   0,   0,   0,   0,   0,
41 |     ];
42 | 
43 |     win := bit_scan_forward(strong_king);
44 |     los := bit_scan_forward(weak_king);
45 |     // Split both king squares into their /8 and %8 components.
46 |     winx := win / 8;
47 |     winy := win % 8;
48 | 
49 |     losx := los / 8;
50 |     losy := los % 8;
51 |     // Manhattan distance between the kings; a smaller distance gives a larger push_close bonus.
52 |     distance   := abs(winx - losx) + abs(winy - losy);
53 |     push_close := 140 - 20 * distance;
54 |     edge       := push_to_edge(los); // rewards a losing king near the board edge
55 |     mat        := popcount(q)*900 + popcount(r)*500 + popcount(b)*350 + popcount(n)*300;
56 |     pawn_points:= 0;
57 |     while p {
58 |       i := bit_scan_forward(p);
59 |       pawn_points += eg_pawn_table[i];
60 |       p &= p-1; // clear the lowest set bit
61 |     }
62 | 
63 |     return WIN_SCORE + push_close + edge + mat + pawn_points;
64 |   }
65 | 
66 |   // used for trivial checkmates/draws.
67 |   w_pieces := w_queen|w_rook|w_bishop|w_knight|w_pawn;
68 |   b_pieces := b_queen|b_rook|b_bishop|b_knight|b_pawn;
69 |   if b_pieces == 0 {
70 |     if w_pieces & (~(w_pawn|w_knight)) { // white has at least one queen, rook or bishop
71 |       side := ifx turn == Turn.WHITE then 1 else -1; 
72 |       eval := eval_winning_position(w_king, q=w_queen, r=w_rook, b=w_bishop, n=w_knight, p=bit_reverse64(w_pawn), b_king) * side;
73 |       return true, eval;
74 |     } 
75 |   }
76 |   // NOTE(review): a lone bishop also passes these tests, so K+B vs K is scored as a win — confirm this is intended.
77 |   if w_pieces == 0 {
78 |     if b_pieces & (~(b_pawn|b_knight)) { // black has at least one queen, rook or bishop
79 |       side := ifx turn == Turn.WHITE then -1 else 1;
80 |       eval := eval_winning_position(b_king, q=b_queen, r=b_rook, b=b_bishop, n=b_knight, p=b_pawn, w_king) * side;
81 |       return true, eval;
82 |     } 
83 |   }
84 |   return false, 0;
85 | }
86 | 
87 | #import "File";
88 | #import "String";
89 | 
90 | 


--------------------------------------------------------------------------------
/linux.jai:
--------------------------------------------------------------------------------
 1 | // contains linux OS specific code.
 2 | 
 3 | EXE :: ""; // nothing.
 4 | NEWLINE :: "\n";
 5 | 
 6 | OS :: struct {}
 7 | 
 8 | // defines the AI uci message loop.
 9 | getline :: (os: *OS, loop_body: Code, flags: For_Flags) #expand {
10 |   // UCI message loop with a for_expansion-shaped signature: runs loop_body once per
11 |   // '\n'-terminated line read from stdin, with `it` bound to the line (it_index stays 0).
12 |   // stdin is switched to non-blocking first; read_input polls the same descriptor.
13 |   stdin_flags := fcntl(STDIN_FILENO, F_GETFL, 0) | O_NONBLOCK;
14 |   fcntl(STDIN_FILENO, F_SETFL, stdin_flags);
15 |   `it_index := 0;
16 |   while outer := true {
17 |     stopping = false; // every pass re-arms the stop flag and clears both buffers
18 |     memset(buffera.data, 0, size_of(type_of(buffera)));
19 |     memset(bufferb.data, 0, size_of(type_of(bufferb)));
20 |     bytes_read := read(STDIN_FILENO, buffera.data, buffera.count-1);
21 |     if bytes_read <= 0 {
22 |       // < 0: no data yet or a read error; 0: end of input. Sleeping on 0 as well avoids a busy loop.
23 |       sleep_milliseconds(25);
24 |       continue;
25 |     }
26 |     messages := to_string(buffera.data, bytes_read);
27 |     while messages {
28 |       found, `it, rest := split_from_left(messages, #char "\n");
29 |       if !found break; // text after the last '\n' of this read is discarded
30 |       #insert (break=break outer) loop_body; // a `break` in the body ends the whole message loop
31 |       messages = rest;
32 |     }
33 |   }
34 | }
35 | 
36 | read_input :: (main_thread: bool, nodes: int, maxnodes: int, time_begin: float64, movetime: int) #expand {
37 |   // Search-time stop check. A backticked return leaves the calling procedure with 0; a plain return only leaves this macro.
38 |   if stopping == true then `return 0;
39 |   // Everything below runs only when the low 13 bits of nodes are all set (once per 8192 nodes).
40 |   if (nodes & 8191) == 8191 {
41 |     if nodes >= maxnodes {
42 |       stopping = true;
43 |       `return 0;
44 |     }
45 |     // Only the main thread looks at the clock and at stdin.
46 |     if main_thread == false
47 |       return;
48 |     // Elapsed time since time_begin, converted from seconds to milliseconds and compared with movetime.
49 |     time := seconds_since_init();
50 |     left: int = xx (1000.0 * (time - time_begin));
51 |     if left > movetime {
52 |       stopping = true;
53 |       `return 0;
54 |     }
55 | 
56 |     bytes_read := read(STDIN_FILENO, bufferb.data, bufferb.count-1);
57 |     if bytes_read > 0 {
58 |       str := to_string(bufferb.data, bytes_read-1); // -1 the '\n' char
59 |       while str {
60 |         found, line, rest := split_from_left(str, #char "\n");
61 |         // Match one line at a time; comparing the whole remaining buffer missed commands that arrived together.
62 |         msg := ifx found then line else str;
63 |         if equal(msg, "isready") {
64 |           print("readyok\n");
65 |         }
66 | 
67 |         if equal(msg, "stop") {
68 |           stopping = true;
69 |           `return 0;
70 |         }
71 | 
72 |         if equal(msg, "quit") {
73 |           exit(0);
74 |         }
75 | 
76 |         str = rest;
77 |         if !found break;
78 |       }
79 |     }
80 |   }
81 | }
82 | 
83 | 
84 | stop :: () -> bool #expand { // reports whether the file-scope `stopping` flag is set
85 |   return stopping;
86 | }
87 | 
88 | #scope_file
89 | buffera: [4096] u8;
90 | bufferb: [4096] u8;
91 | stopping: bool = false;
92 | 
93 | #import "POSIX";
94 | #import "Basic";
95 | #import "String";
96 | #import "System"; // For get_path_of_running_executable.
97 | 


--------------------------------------------------------------------------------
/macos.jai:
--------------------------------------------------------------------------------
  1 | // contains mac OS specific code.
  2 | // this is just copy/paste the linux code as macos.
  3 | 
  4 | EXE :: ""; // nothing.
  5 | NEWLINE :: "\n";
  6 | 
  7 | OS :: struct {}
  8 | 
  9 | // defines the AI uci message loop.
 10 | getline :: (os: *OS, loop_body: Code, flags: For_Flags) #expand {
 11 |   // UCI message loop with a for_expansion-shaped signature: runs loop_body once per
 12 |   // '\n'-terminated line read from stdin, with `it` bound to the line (it_index stays 0).
 13 |   // stdin is switched to non-blocking first; read_input polls the same descriptor.
 14 |   stdin_flags := fcntl(STDIN_FILENO, F_GETFL, 0) | O_NONBLOCK;
 15 |   fcntl(STDIN_FILENO, F_SETFL, stdin_flags);
 16 | 
 17 |   `it_index := 0;
 18 |   while outer := true {
 19 |     stopping = false; // every pass re-arms the stop flag and clears both buffers
 20 |     memset(buffera.data, 0, size_of(type_of(buffera)));
 21 |     memset(bufferb.data, 0, size_of(type_of(bufferb)));
 22 |     bytes_read := read(STDIN_FILENO, buffera.data, buffera.count-1);
 23 |     if bytes_read <= 0 {
 24 |       // < 0: no data yet or a read error; 0: end of input. Sleeping on 0 as well avoids a busy loop.
 25 |       sleep_milliseconds(25);
 26 |       continue;
 27 |     }
 28 | 
 29 |     messages := to_string(buffera.data, bytes_read);
 30 |     while messages {
 31 |       found, `it, rest := split_from_left(messages, #char "\n");
 32 |       if !found break; // text after the last '\n' of this read is discarded
 33 |       #insert (break=break outer) loop_body; // a `break` in the body ends the whole message loop
 34 |       messages = rest;
 35 |     }
 36 |   }
 37 | }
 38 | 
 39 | read_input :: (main_thread: bool, nodes: int, maxnodes: int, time_begin: float64, movetime: int) #expand {
 40 |   // Search-time stop check. A backticked return leaves the calling procedure with 0; a plain return only leaves this macro.
 41 |   if stopping == true then `return 0;
 42 |   // Everything below runs only when the low 13 bits of nodes are all set (once per 8192 nodes).
 43 |   if (nodes & 8191) == 8191 {
 44 |     if nodes >= maxnodes {
 45 |       stopping = true;
 46 |       `return 0;
 47 |     }
 48 |     // Only the main thread looks at the clock and at stdin.
 49 |     if main_thread == false
 50 |       return;
 51 |     // Elapsed time since time_begin, in milliseconds; uses seconds_since_init like linux.jai (this file used get_time).
 52 |     time := seconds_since_init();
 53 |     left: int = xx (1000.0 * (time - time_begin));
 54 |     if left > movetime {
 55 |       stopping = true;
 56 |       `return 0;
 57 |     }
 58 | 
 59 |     bytes_read := read(STDIN_FILENO, bufferb.data, bufferb.count-1);
 60 |     if bytes_read > 0 {
 61 |       str := to_string(bufferb.data, bytes_read-1); // -1 the '\n' char
 62 |       while str {
 63 |         found, line, rest := split_from_left(str, #char "\n");
 64 |         // Match one line at a time; comparing the whole remaining buffer missed commands that arrived together.
 65 |         msg := ifx found then line else str;
 66 |         if equal(msg, "isready") {
 67 |           print("readyok\n");
 68 |         }
 69 | 
 70 |         if equal(msg, "stop") {
 71 |           stopping = true;
 72 |           `return 0;
 73 |         }
 74 | 
 75 |         if equal(msg, "quit") {
 76 |           exit(0);
 77 |         }
 78 | 
 79 |         str = rest;
 80 |         if !found break;
 81 |       }
 82 |     }
 83 |   }
 84 | }
 85 | 
 86 | 
 87 | stop :: () -> bool #expand { // reports whether the file-scope `stopping` flag is set
 88 |   return stopping;
 89 | }
 90 | 
 91 | #scope_file
 92 | buffera: [4096] u8;
 93 | bufferb: [4096] u8;
 94 | stopping: bool = false;
 95 | 
 96 | #import "POSIX";
 97 | #import "Basic";
 98 | #import "String";
 99 | #import "System"; // For get_path_of_running_executable.
100 | 


--------------------------------------------------------------------------------
/nnue_avx2.jai:
--------------------------------------------------------------------------------
  1 | #run { // loads the default network while compiling; a failed load stops on the assert
  2 |   nnue_default :: "resources/nn-04cf2b4ed1da.nnue";
  3 |   if !nnue_init(nnue_default) {
  4 |     assert(false, "Error. Neural Network is not initialized.\n"); 
  5 |   } else {
  6 |     print("NNUE % initialized\n", nnue_default);
  7 |   }
  8 | }
  9 | 
 10 | nnue_startup :: () #expand {} // initialization is done at compile time.
 11 | 
 12 | nnue_init :: (file_name: string) -> bool {
 13 |   // Reads the whole network file, validates it, and copies its contents into the file-scope model arrays. Returns false if the file cannot be opened, read or verified.
 14 |   read_hidden_weights :: (weight: []s8, dims: int, d: *s8) -> *s8 {
 15 |     // Consumes 32*dims bytes from d, scattering each one to wt_idx(r, c, dims); returns the advanced pointer.
 16 |     wt_idx :: (r: u32, c: u32, dims: int) -> u32 {
 17 |       if dims > 32 {
 18 |         b: u32 = c & 0x18;
 19 |         b = (b << 1) | (b >> 1);
 20 |         c = xx ((c & ~0x18) | (b & 0x18)); // net effect: bits 3 and 4 of c are swapped
 21 |       }
 22 |       return c * 32 + r;
 23 |     }
 24 | 
 25 |     i := 0;
 26 |     for r: 0..cast(u32)31 {
 27 |       for c: 0..cast(u32)(dims-1) {
 28 |         index := wt_idx(r, c, dims);
 29 |         weight[index] = d.*;
 30 |         d += 1;
 31 |       }
 32 |     }
 33 | 
 34 |     return d;
 35 |   }
 36 |   // Copies 32 consecutive bytes from data into weight.
 37 |   read_output_weights :: (weight: []s8, data: *s8) {
 38 |     for i: 0..31 {
 39 |       weight[i] = << data;
 40 |       data += 1;
 41 |     }
 42 |   }
 43 |   // Reorders the eight 16-byte groups g0..g7 of a 32-entry s32 array in place to g0,g4,g1,g5,g2,g6,g3,g7 (bytes 0..15 and 112..127 are untouched).
 44 |   permute_biases :: (biases: *s32) #expand {
 45 |     rdi := biases;
 46 |     // translated from godbolt's clang -O3 assembly language output.
 47 |     #asm AVX {
 48 |       movdqa.x xmm0: vec, [rdi+16]; 
 49 |       movdqa.x xmm1: vec, [rdi+32];
 50 |       movdqa.x xmm2: vec, [rdi+48];
 51 |       movdqa.x xmm3: vec, [rdi+64];
 52 |       movdqa.x xmm4: vec, [rdi+80];
 53 |       movdqa.x xmm5: vec, [rdi+96];
 54 | 
 55 |       movdqa.x [rdi+16], xmm3; 
 56 |       movdqa.x [rdi+32], xmm0;
 57 |       movdqa.x [rdi+48], xmm4;
 58 |       movdqa.x [rdi+64], xmm1;
 59 |       movdqa.x [rdi+80], xmm5;
 60 |       movdqa.x [rdi+96], xmm2;
 61 |     }
 62 |   }
 63 |   verify_file :: (buffer: [] u8) -> bool { // accepts only a file of the exact expected size with the expected u32 markers
 64 |     if buffer.count != 21022697 then
 65 |       return false;
 66 |     d := buffer.data;
 67 |     if <<cast(*u32)d != NnueVersion then
 68 |       return false;
 69 |     if <<cast(*u32)(d+4) != 0x3e5aa6ee then
 70 |       return false;
 71 |     if <<cast(*u32)(d+8) != 177 then // TransformerStart is defined as 3*4 + 177
 72 |       return false;
 73 |     if <<cast(*u32)(d + TransformerStart) != 0x5d69d7b8 then
 74 |       return false;
 75 |     if <<cast(*u32)(d + NetworkStart) != 0x63337156 then
 76 |       return false;
 77 |     return true;
 78 |   }
 79 |   // Walks the buffer sequentially from just past the u32 at TransformerStart, filling each model array in file order.
 80 |   init_weights :: (buffer: [] u8) {
 81 |     data := cast(*s8) (buffer.data + TransformerStart + 4);
 82 | 
 83 |     // Read transformer
 84 |     for i: 0..(kHalfDimensions-1) {
 85 |       ft_biases[i] = <<cast, no_check (*s16)(data);
 86 |       data += 2;
 87 |     }
 88 | 
 89 |     for i: 0..(kHalfDimensions*FtInDims)-1 {
 90 |       ft_weights[i] = <<cast, no_check(*s16)(data);
 91 |       data += 2;
 92 |     }
 93 | 
 94 |     // Read network
 95 |     data += 4; // skips the u32 that verify_file checks at NetworkStart
 96 |     for i: 0..31 {
 97 |       hidden1_biases[i] = <<cast, no_check(*s32)(data);
 98 |       data += 4;
 99 |     }
100 | 
101 |     data = read_hidden_weights(hidden1_weights, 512, data);
102 | 
103 |     for i: 0..31 {
104 |       hidden2_biases[i] = <<cast, no_check(*s32)(data);
105 |       data += 4;
106 |     }
107 | 
108 |     data = read_hidden_weights(hidden2_weights, 32, data);
109 |     
110 |     for i: 0..0 {
111 |       output_biases[i] = <<cast(*s32)(data);
112 |       data += 4;
113 |     }
114 | 
115 |     read_output_weights(output_weights, data);
116 | 
117 |     // only for AVX2
118 |     permute_biases(hidden1_biases.data);
119 |     permute_biases(hidden2_biases.data);
120 |   }
121 | 
122 |   file, success :=  file_open(file_name);
123 |   if !success {
124 |     return false;
125 |   }
126 |   length :=  file_length(file);
127 |   buffer := NewArray(length, u8);
128 |   defer { // runs on every return below: releases the temporary buffer and closes the file
129 |     array_free(buffer);
130 |     file_close(*file);
131 |   }
132 | 
133 |   if !file_read(file, buffer.data, length) {
134 |     return false;
135 |   }
136 | 
137 |   // verify that the file is correct.
138 |   if !verify_file(buffer) then
139 |     return false;
140 | 
141 |   init_weights(buffer);
142 |   return true;
143 | 
144 | }
145 | 
146 | nnue_evaluate :: (chess: *ChessGame) -> int {
147 |   // Collects up to three NNUE records for nnue_evaluate_pos: slot 0 is the
148 |   // record at the current ply, slots 1 and 2 are the records one and two
149 |   // plies earlier. Slots that would reach before ply 0 are null.
150 |   history: [3] *NNUEdata;
151 |   for back: 0..2 {
152 |     if chess.ply >= back
153 |       history[back] = *chess.nnue[chess.ply - back];
154 |     else
155 |       history[back] = null;
156 |   }
157 | 
158 |   using chess.chess;
159 |   return nnue_evaluate_pos(chess, history);
160 | }
161 | 
162 | nnue_evaluate_board :: (chess: Chess) -> int { // evaluates a position using a fresh local NNUEdata and no earlier plies
163 |   nnue: NNUEdata #align 32;
164 |   nnue.accumulator.computedAccumulation = 0; // 0 means "not computed" to update_accumulator, which returns early on non-zero
165 |   nnue_data: [3] *NNUEdata;
166 |   nnue_data[0] = *nnue;
167 |   nnue_data[1] = null; // no previous-ply records are supplied
168 |   nnue_data[2] = null;
169 |   return nnue_evaluate_pos(*chess, nnue_data); // `chess` is a by-value parameter, so this passes the address of the local copy
170 | }
171 | 
172 | DirtyPiece :: struct { // NOTE(review): presumably the pieces changed by the last move, for incremental updates — confirm against the code that fills it
173 |   dirtyNum: s32;     // presumably how many of the three slots below are in use
174 |   pc      : [3] s32; // presumably piece codes
175 |   from    : [3] s32; // presumably origin squares
176 |   to      : [3] s32; // presumably destination squares
177 | }
178 | 
179 | Accumulator :: struct {
180 |   padding: [1088] u8; // 1088 = 17 * 64; the fields below need 1024 + 4 bytes
181 |   #place padding;     // the following fields are laid out over `padding` rather than after it
182 |   accumulation: [2][256] s16 #align 64; // two rows of 256 s16 values
183 |   computedAccumulation: s32; // tested as a boolean by update_accumulator: non-zero means already computed
184 | } 
185 | 
186 | NNUEdata :: struct { // the NNUE record handled through the [3] *NNUEdata arrays above
187 |   padding: [1152] u8; // 1152 = 18 * 64; Accumulator (1088 bytes of padding) plus DirtyPiece (40 bytes of fields) fit inside
188 |   #place padding;     // the following fields are laid out over `padding` rather than after it
189 |   accumulator: Accumulator;
190 |   dirtyPiece: DirtyPiece;
191 | } 
192 | 
193 | #scope_file
194 | NNUE_Model :: struct { // all network parameters; filled by init_weights inside nnue_init
195 |   // features:
196 |   ft_biases:  [kHalfDimensions] s16 #align 64;
197 |   ft_weights: [kHalfDimensions*FtInDims] s16 #align 64;
198 | 
199 |   // weights:
200 |   hidden1_weights: [64*512] s8 #align 64; // the loader writes indices c*32 + r with c < 512, r < 32, i.e. only below 32*512
201 |   hidden2_weights: [64*32]  s8 #align 64; // the loader writes indices c*32 + r with c < 32, r < 32, i.e. only below 32*32
202 |   output_weights:  [1*32]   s8 #align 64;
203 | 
204 |   // biases:
205 |   hidden1_biases: [32] s32 #align 64; // reordered by permute_biases after loading
206 |   hidden2_biases: [32] s32 #align 64; // reordered by permute_biases after loading
207 |   output_biases : [1]  s32 #align 64;
208 | }
209 | 
210 | using #no_reset nnue_model: NNUE_Model #align 64;
211 | 
// dimensions
kHalfDimensions :: 256;           // s16 lanes per perspective in the accumulator
FtInDims :: 64*PS_END; // 64 * 641
FtOutDims :: kHalfDimensions*2;   // both perspectives concatenated
NnueVersion : u32 : 0x7AF32F16;
// NOTE(review): presumably three u32 header fields followed by a 177-byte description — confirm against the file verifier.
TransformerStart :: 3*4 + 177;
// Transformer section: 4 bytes, then 256 s16 biases, then 256*64*641 s16 weights.
NetworkStart :: TransformerStart+4 + 2*256 + 2*256*64*641;
219 | 
// Fixed-capacity list of feature indices.
IndexList :: struct {
  size: s32;          // number of used entries in `values`
  values: [30] s32;   // feature indices; capacity is 30 entries
}
224 | 
// Runs the full network on `chess` and returns the scaled evaluation.
// Pipeline: feature transform -> 512x32 affine layer -> 32x32 affine layer ->
// 32x1 output layer, with the raw output divided by FV_SCALE.
// `nnue` holds the NNUE data of the current ply and up to two previous plies.
nnue_evaluate_pos :: (chess: *Chess, nnue: [3] *NNUEdata) -> s32 {
  FV_SCALE :: 16;

  // Activation buffers, grouped so one alignment request covers all of them.
  Activations :: struct {
    input: [FtOutDims] s8;
    hidden1_out: [32] s8;
    hidden2_out: [32] s8;
  }

  // One bit per activation; set where the activation is > 0.
  input_mask:   [FtOutDims / (8 * size_of(u32)) ] u32;
  hidden1_mask: [8 / size_of(u32)] u32;
  using layers: Activations #align 32;

  transform(chess, nnue, *input[0], *input_mask[0]);

  affine_txfm(*input[0], *hidden1_out[0], FtOutDims, 32,
              *hidden1_biases[0], *hidden1_weights[0],
              *input_mask[0], *hidden1_mask[0], true);

  affine_txfm(*hidden1_out[0], *hidden2_out[0], 32, 32,
              *hidden2_biases[0], *hidden2_weights[0],
              *hidden1_mask[0], null, false);

  raw_score := inline affine_propagate(*hidden2_out[0], output_biases[0], *output_weights[0]);
  return raw_score / FV_SCALE;
}
243 | 
// 32-byte value viewed as lanes of different widths. In this file it is used
// as the stride when indexing weight rows (cast(*m256) weights)[idx].
m256 :: union {
  i8x32 : [32] s8;
  i16x16: [16] s16;
  i32x8 : [8]  s32;
  i64x4 : [4]  s64;
}
250 | 
// Tries to bring nnue[0]'s accumulator up to date incrementally from an
// earlier ply.
// Returns true when nnue[0].accumulator is valid on exit (it was already
// computed, or it was updated here). Returns false when neither nnue[1] nor
// nnue[2] provides a computed accumulator; transform then does a full refresh.
update_accumulator :: (chess: *Chess, nnue: [3] *NNUEdata) -> bool {

  // Returns true when `nnue` cannot serve as a base (null, or not computed).
  // For a non-null entry, prevAcc is pointed at its accumulator as a side effect.
  acc_if :: inline (prevAcc: **Accumulator, nnue: *NNUEdata) -> bool {
    if !nnue then
      return true;
    prevAcc.* = *nnue.accumulator;
    return !prevAcc.*.computedAccumulation;
  }

  accumulator := *nnue[0].accumulator;
  if accumulator.computedAccumulation then
    return true;
  prevAcc: *Accumulator = null;
  // Short-circuit: nnue[2] is only examined when nnue[1] is unusable, so
  // prevAcc ends up at the most recent computed accumulator.
  if acc_if(*prevAcc, nnue[1]) && acc_if(*prevAcc, nnue[2]) then
    return false;
  removed_indices: [2] IndexList;
  added_indices: [2] IndexList;
  reset: [2] bool;
  removed_indices[0].size = 0;
  removed_indices[1].size = 0;
  added_indices[0].size = 0;
  added_indices[1].size = 0;
  append_changed_indices(chess, nnue, removed_indices, added_indices, reset);
  // c selects the perspective (0 and 1).
  for c: 0..1 {
    accTile := *accumulator.accumulation[c][0];
    r := reset[c];
    // On reset start from the biases (added_indices then holds every active
    // feature); otherwise start from the previous accumulator.
    tile := ifx r then *ft_biases[0] else *prevAcc.accumulation[c][0];
    // Load the 256 s16 lanes (16 x 32 bytes) of the start vector. The vector
    // operands declared here are reused by the #asm blocks below.
    // movdqa is the aligned move, so `tile` must be 32-byte aligned.
    #asm AVX, AVX2 {
      movdqa.y ymm0: vec,  [tile + 0x000];
      movdqa.y ymm1: vec,  [tile + 0x020];
      movdqa.y ymm2: vec,  [tile + 0x040];
      movdqa.y ymm3: vec,  [tile + 0x060];
      movdqa.y ymm4: vec,  [tile + 0x080];
      movdqa.y ymm5: vec,  [tile + 0x0a0];
      movdqa.y ymm6: vec,  [tile + 0x0c0];
      movdqa.y ymm7: vec,  [tile + 0x0e0];
      movdqa.y ymm8: vec,  [tile + 0x100];
      movdqa.y ymm9: vec,  [tile + 0x120];
      movdqa.y ymm10: vec, [tile + 0x140];
      movdqa.y ymm11: vec, [tile + 0x160];
      movdqa.y ymm12: vec, [tile + 0x180];
      movdqa.y ymm13: vec, [tile + 0x1a0];
      movdqa.y ymm14: vec, [tile + 0x1c0];
      movdqa.y ymm15: vec, [tile + 0x1e0];
    }
    if r == false {
      // Difference calculation for the deactivated features
      for k: 0..removed_indices[c].size-1 {
        // Each feature owns a row of kHalfDimensions s16 weights.
        index := removed_indices[c].values[k] * kHalfDimensions;
        subtile := *ft_weights[index];
        #asm AVX, AVX2 {
          psubw.y ymm0,  ymm0,  [subtile + 0x000];
          psubw.y ymm1,  ymm1,  [subtile + 0x020];
          psubw.y ymm2,  ymm2,  [subtile + 0x040];
          psubw.y ymm3,  ymm3,  [subtile + 0x060];
          psubw.y ymm4,  ymm4,  [subtile + 0x080];
          psubw.y ymm5,  ymm5,  [subtile + 0x0a0];
          psubw.y ymm6,  ymm6,  [subtile + 0x0c0];
          psubw.y ymm7,  ymm7,  [subtile + 0x0e0];
          psubw.y ymm8,  ymm8,  [subtile + 0x100];
          psubw.y ymm9,  ymm9,  [subtile + 0x120];
          psubw.y ymm10, ymm10, [subtile + 0x140];
          psubw.y ymm11, ymm11, [subtile + 0x160];
          psubw.y ymm12, ymm12, [subtile + 0x180];
          psubw.y ymm13, ymm13, [subtile + 0x1a0];
          psubw.y ymm14, ymm14, [subtile + 0x1c0];
          psubw.y ymm15, ymm15, [subtile + 0x1e0];
        }
      }
    }

    // Difference calculation for the activated features
    for k: 0..added_indices[c].size-1 {
      index := added_indices[c].values[k] * kHalfDimensions;
      addtile := *ft_weights[index];
      #asm AVX, AVX2 {
        paddw.y ymm0,  ymm0,  [addtile + 0x000];
        paddw.y ymm1,  ymm1,  [addtile + 0x020];
        paddw.y ymm2,  ymm2,  [addtile + 0x040];
        paddw.y ymm3,  ymm3,  [addtile + 0x060];
        paddw.y ymm4,  ymm4,  [addtile + 0x080];
        paddw.y ymm5,  ymm5,  [addtile + 0x0a0];
        paddw.y ymm6,  ymm6,  [addtile + 0x0c0];
        paddw.y ymm7,  ymm7,  [addtile + 0x0e0];
        paddw.y ymm8,  ymm8,  [addtile + 0x100];
        paddw.y ymm9,  ymm9,  [addtile + 0x120];
        paddw.y ymm10, ymm10, [addtile + 0x140];
        paddw.y ymm11, ymm11, [addtile + 0x160];
        paddw.y ymm12, ymm12, [addtile + 0x180];
        paddw.y ymm13, ymm13, [addtile + 0x1a0];
        paddw.y ymm14, ymm14, [addtile + 0x1c0];
        paddw.y ymm15, ymm15, [addtile + 0x1e0];
      }
    }

    // Write the updated lanes into this position's accumulator.
    #asm AVX, AVX2 {
      movdqa.y [accTile + 0x000],ymm0;
      movdqa.y [accTile + 0x020],ymm1;
      movdqa.y [accTile + 0x040],ymm2;
      movdqa.y [accTile + 0x060],ymm3;
      movdqa.y [accTile + 0x080],ymm4;
      movdqa.y [accTile + 0x0a0],ymm5;
      movdqa.y [accTile + 0x0c0],ymm6;
      movdqa.y [accTile + 0x0e0],ymm7;
      movdqa.y [accTile + 0x100],ymm8;
      movdqa.y [accTile + 0x120],ymm9;
      movdqa.y [accTile + 0x140],ymm10;
      movdqa.y [accTile + 0x160],ymm11;
      movdqa.y [accTile + 0x180],ymm12; 
      movdqa.y [accTile + 0x1a0],ymm13;
      movdqa.y [accTile + 0x1c0],ymm14; 
      movdqa.y [accTile + 0x1e0],ymm15; 
    }
  }

  accumulator.computedAccumulation = 1;
  return true;
}
369 | 
// Recomputes nnue[0]'s accumulator from scratch: for each perspective, the
// feature-transformer biases plus the weight rows of every active feature.
// Marks the accumulator as computed on exit.
refresh_accumulator :: (chess: *Chess, nnue: [3] *NNUEdata) {
  accumulator := *(nnue[0].accumulator);
  activeIndices: [2] IndexList;
  activeIndices[0].size = 0;
  activeIndices[1].size = 0;
  append_active_indices(chess, activeIndices);

  biases := *ft_biases[0];
  // c selects the perspective (0 and 1).
  for c: 0..1 {

    acc := *accumulator.accumulation[c][0];
    // Start from the 256 s16 biases (16 x 32 bytes). The vector operands
    // declared here are reused by the #asm blocks below.
    #asm AVX, AVX2 {
      movdqa.y ymm0:  vec, [biases + 0x000];
      movdqa.y ymm1:  vec, [biases + 0x020];
      movdqa.y ymm2:  vec, [biases + 0x040];
      movdqa.y ymm3:  vec, [biases + 0x060];
      movdqa.y ymm4:  vec, [biases + 0x080];
      movdqa.y ymm5:  vec, [biases + 0x0a0];
      movdqa.y ymm6:  vec, [biases + 0x0c0];
      movdqa.y ymm7:  vec, [biases + 0x0e0];
      movdqa.y ymm8:  vec, [biases + 0x100];
      movdqa.y ymm9:  vec, [biases + 0x120];
      movdqa.y ymm10: vec, [biases + 0x140];
      movdqa.y ymm11: vec, [biases + 0x160];
      movdqa.y ymm12: vec, [biases + 0x180];
      movdqa.y ymm13: vec, [biases + 0x1a0];
      movdqa.y ymm14: vec, [biases + 0x1c0];
      movdqa.y ymm15: vec, [biases + 0x1e0];
    }

    // Add the weight row (kHalfDimensions s16 values) of each active feature.
    for k: 0..activeIndices[c].size-1 {
      index  := kHalfDimensions * activeIndices[c].values[k];
      tile_add := *ft_weights[index];
      #asm AVX, AVX2 {
        paddw.y ymm0,  ymm0,  [tile_add + 0x000];
        paddw.y ymm1,  ymm1,  [tile_add + 0x020];
        paddw.y ymm2,  ymm2,  [tile_add + 0x040];
        paddw.y ymm3,  ymm3,  [tile_add + 0x060];
        paddw.y ymm4,  ymm4,  [tile_add + 0x080];
        paddw.y ymm5,  ymm5,  [tile_add + 0x0a0];
        paddw.y ymm6,  ymm6,  [tile_add + 0x0c0];
        paddw.y ymm7,  ymm7,  [tile_add + 0x0e0];
        paddw.y ymm8,  ymm8,  [tile_add + 0x100];
        paddw.y ymm9,  ymm9,  [tile_add + 0x120];
        paddw.y ymm10, ymm10, [tile_add + 0x140];
        paddw.y ymm11, ymm11, [tile_add + 0x160];
        paddw.y ymm12, ymm12, [tile_add + 0x180];
        paddw.y ymm13, ymm13, [tile_add + 0x1a0];
        paddw.y ymm14, ymm14, [tile_add + 0x1c0];
        paddw.y ymm15, ymm15, [tile_add + 0x1e0];
      }
    }
    // Store the finished vector for this perspective.
    #asm AVX, AVX2 {
      movdqa.y [acc + 0x000],ymm0;
      movdqa.y [acc + 0x020],ymm1;
      movdqa.y [acc + 0x040],ymm2;
      movdqa.y [acc + 0x060],ymm3;
      movdqa.y [acc + 0x080],ymm4;
      movdqa.y [acc + 0x0a0],ymm5;
      movdqa.y [acc + 0x0c0],ymm6;
      movdqa.y [acc + 0x0e0],ymm7;
      movdqa.y [acc + 0x100],ymm8;
      movdqa.y [acc + 0x120],ymm9;
      movdqa.y [acc + 0x140],ymm10;
      movdqa.y [acc + 0x160],ymm11;
      movdqa.y [acc + 0x180],ymm12; 
      movdqa.y [acc + 0x1a0],ymm13;
      movdqa.y [acc + 0x1c0],ymm14; 
      movdqa.y [acc + 0x1e0],ymm15; 
    }
  }

  accumulator.computedAccumulation = 1;
}
444 | 
// Fills active[0] with the features seen relative to the white king and
// active[1] with those seen relative to the black king.
append_active_indices :: (chess: *Chess, active: [] IndexList) {
  WHITE_VIEW : s32 : 0;
  BLACK_VIEW : s32 : 1;
  half_kp_append_active_indices(chess, chess.w_king, WHITE_VIEW, *active[WHITE_VIEW]);
  half_kp_append_active_indices(chess, chess.b_king, BLACK_VIEW, *active[BLACK_VIEW]);
}
449 | 
// Works out, per perspective, how nnue[0]'s feature set differs from the base
// accumulator chosen by update_accumulator.
//   reset[c]   - true when the perspective's own king piece (code 1 for
//                perspective 0, code 7 for perspective 1) is in slot 0 of a
//                replayed DirtyPiece; added[c] then receives every active feature.
//   removed[c] / added[c] - otherwise, the features switched off / on.
// When nnue[1] is itself not computed, the base is two plies back and the
// changes of both nnue[0] and nnue[1] are replayed.
append_changed_indices :: (chess: *Chess, nnue: [3] *NNUEdata, removed: [] IndexList, added: [] IndexList, reset: [] bool) {
  dp  := *nnue[0].dirtyPiece;
  dp2 := *nnue[1].dirtyPiece;
  one_ply_back := nnue[1].accumulator.computedAccumulation != 0;

  for side: 0..1 {
    c := cast(s32) side;
    king := ifx side == 0 then chess.w_king else chess.b_king;
    king_piece := cast(s32) (ifx side == 0 then 1 else 7);

    king_moved := dp.pc[0] == king_piece;
    if !one_ply_back && !king_moved {
      king_moved = dp2.pc[0] == king_piece;
    }
    reset[side] = king_moved;

    if king_moved {
      half_kp_append_active_indices(chess, king, c, *added[side]);
    } else {
      ksq := cast(s32) bsf(king);
      half_kp_append_changed_indices(ksq, c, dp, *removed[side], *added[side]);
      if !one_ply_back {
        half_kp_append_changed_indices(ksq, c, dp2, *removed[side], *added[side]);
      }
    }
  }
}
500 | 
// Appends to `active` one feature index for every non-king piece on the
// board, as seen from perspective `c` with that perspective's king on the
// lowest set bit of `king`.
half_kp_append_active_indices :: (chess: *Chess, king: u64, c: s32, active: *IndexList) {
  king_square := cast(s32) bsf(king);
  // Each king square owns a block of PS_END feature indices.
  king_base := orient(c, king_square) * PS_END;

  // All occupied squares except the two kings.
  remaining := chess.occupied ^ (chess.w_king | chess.b_king);
  while remaining {
    sq := cast(s32) bsf(remaining);
    remaining &= remaining - 1;  // clear the lowest set bit
    piece := cast(s32) chess.pieces[sq];
    active.values[active.size] = make_index(xx c, sq, piece, king_base);
    active.size += 1;
  }
}
515 | 
// Bit-scan-forward: index of the lowest set bit of `value`.
// The result variable starts at 0; the x86 `bsf` instruction does not define
// its destination for a zero input, so callers should pass a non-zero value.
bsf :: (value: u64) -> int #expand {
  lowest_bit: int = 0;
  #asm { bsf.q lowest_bit, value; }
  return lowest_bit;
}
521 | 
// Translates the entries of `dp` into feature changes for perspective `c`
// with that perspective's king on `ksq`. Entries whose piece code is 1 or 7
// are skipped. A `from` or `to` value of 64 emits no index for that side of
// the change.
half_kp_append_changed_indices :: (ksq: s32, c: s32, dp: DirtyPiece, removed: *IndexList, added: *IndexList) {
  SQUARE_ABSENT :: 64;
  king_base := orient(c, ksq) * PS_END;

  for slot: 0..dp.dirtyNum-1 {
    piece := dp.pc[slot];
    if piece == 1 || piece == 7 continue;

    source := dp.from[slot];
    target := dp.to[slot];

    if source != SQUARE_ABSENT {
      removed.values[removed.size] = make_index(c, source, piece, king_base);
      removed.size += 1;
    }

    if target != SQUARE_ABSENT {
      added.values[added.size] = make_index(c, target, piece, king_base);
      added.size += 1;
    }
  }
}
541 | 
// Feature index = oriented square + per-piece base offset + king block
// offset (`ksq` is expected to be already multiplied by PS_END).
make_index :: (c: s32, s: s32, pc: s32, ksq: s32) -> s32 #expand {
  oriented_square := orient(c, s);
  piece_base := PieceToIndex[c][pc];
  return oriented_square + piece_base + ksq;
}
545 | 
// Maps square `s` into the view of perspective `c`: unchanged for c == 0,
// otherwise mirrored by flipping all six index bits (s ^ 0x3F).
orient :: (c: s32, s: s32) -> s32 {
  return ifx c == 0 then s else s ^ 0x3F;
}
553 | 
// Base offsets of each piece kind inside one king-square feature block.
// Every kind spans 64 squares; the offsets start at 1, and PS_END (641) is
// the size of a whole block (king-square offsets are multiples of PS_END).
PS_W_PAWN   ::  1;
PS_B_PAWN   ::  1*64 + 1;
PS_W_KNIGHT ::  2*64 + 1;
PS_B_KNIGHT ::  3*64 + 1;
PS_W_BISHOP ::  4*64 + 1;
PS_B_BISHOP ::  5*64 + 1;
PS_W_ROOK   ::  6*64 + 1;
PS_B_ROOK   ::  7*64 + 1;
PS_W_QUEEN  ::  8*64 + 1;
PS_B_QUEEN  ::  9*64 + 1;
PS_END      :: 10*64 + 1;
565 | 
// PieceToIndex[perspective][piece code] -> base offset used by make_index.
// Codes 2..6 map to the white queen, rook, bishop, knight and pawn offsets in
// row 0, and codes 8..12 to the black ones; row 1 swaps the two colours.
// Codes 0, 1, 7 and 13 map to 0 (codes 1 and 7 are filtered out by the
// callers in this file before indexing).
PieceToIndex: [2][14] s32 = .[ 
  s32.[0, 0, PS_W_QUEEN, PS_W_ROOK, PS_W_BISHOP, PS_W_KNIGHT, PS_W_PAWN,
       0, PS_B_QUEEN, PS_B_ROOK, PS_B_BISHOP, PS_B_KNIGHT, PS_B_PAWN, 0],
  s32.[ 0, 0, PS_B_QUEEN, PS_B_ROOK, PS_B_BISHOP, PS_B_KNIGHT, PS_B_PAWN,
       0, PS_W_QUEEN, PS_W_ROOK, PS_W_BISHOP, PS_W_KNIGHT, PS_W_PAWN, 0]
];
572 | 
// Produces the first-layer input from the accumulator of nnue[0].
//   output   - receives 2 * 256 s8 activations; the side to move (chess.turn)
//              comes first, then the other perspective.
//   out_mask - receives one bit per activation, set where the activation > 0
//              (8 u32 words per perspective).
// The accumulator is updated incrementally when possible, otherwise refreshed.
// Both pointers are written with aligned 256-bit stores / advanced in place.
transform :: (chess: *Chess, nnue: [3] *NNUEdata, output: *s8, out_mask: *u32) {
  if !update_accumulator(chess, nnue) then
    refresh_accumulator(chess, nnue);
  accumulation: [][256] s16 = nnue[0].accumulator.accumulation;

  turn := chess.turn;
  for p: 0..1 {
    tile := *accumulation[turn][0];
    #asm AVX, AVX2 {
      val: gpr;
      pxor.y     zeroes: vec, zeroes, zeroes;
      // Pack pairs of 16 x s16 vectors into 32 x s8 with signed saturation.
      movdqa.y   ymm0: vec,  [tile + 0x000];
      packsswb.y ymm0, ymm0, [tile + 0x020];
      movdqa.y   ymm1: vec,  [tile + 0x040];
      packsswb.y ymm1, ymm1, [tile + 0x060];
      movdqa.y   ymm2: vec,  [tile + 0x080];
      packsswb.y ymm2, ymm2, [tile + 0x0a0];
      movdqa.y   ymm3: vec,  [tile + 0x0c0];
      packsswb.y ymm3, ymm3, [tile + 0x0e0];
      movdqa.y   ymm4: vec,  [tile + 0x100];
      packsswb.y ymm4, ymm4, [tile + 0x120];
      movdqa.y   ymm5: vec,  [tile + 0x140];
      packsswb.y ymm5, ymm5, [tile + 0x160];
      movdqa.y   ymm6: vec,  [tile + 0x180];
      packsswb.y ymm6, ymm6, [tile + 0x1a0];
      movdqa.y   ymm7: vec,  [tile + 0x1c0];
      packsswb.y ymm7, ymm7, [tile + 0x1e0];

      // Store the 256 packed activations of this perspective.
      movdqa.y   [output + 0x000], ymm0;
      movdqa.y   [output + 0x020], ymm1;
      movdqa.y   [output + 0x040], ymm2;
      movdqa.y   [output + 0x060], ymm3;
      movdqa.y   [output + 0x080], ymm4;
      movdqa.y   [output + 0x0a0], ymm5;
      movdqa.y   [output + 0x0c0], ymm6;
      movdqa.y   [output + 0x0e0], ymm7;
      // Bytes > 0 become 0xFF, everything else 0x00 ...
      pcmpgtb.y  ymm0, ymm0, zeroes; 
      pcmpgtb.y  ymm1, ymm1, zeroes; 
      pcmpgtb.y  ymm2, ymm2, zeroes; 
      pcmpgtb.y  ymm3, ymm3, zeroes; 
      pcmpgtb.y  ymm4, ymm4, zeroes; 
      pcmpgtb.y  ymm5, ymm5, zeroes; 
      pcmpgtb.y  ymm6, ymm6, zeroes; 
      pcmpgtb.y  ymm7, ymm7, zeroes; 
      // ... and the top bit of every byte is gathered into a 32-bit mask word.
      pmovmskb   val, ymm0;
      mov.d      [out_mask + 0x00], val;
      pmovmskb   val, ymm1;
      mov.d      [out_mask + 0x04], val;
      pmovmskb   val, ymm2;
      mov.d      [out_mask + 0x08], val;
      pmovmskb   val, ymm3;
      mov.d      [out_mask + 0x0c], val;
      pmovmskb   val, ymm4;
      mov.d      [out_mask + 0x10], val;
      pmovmskb   val, ymm5;
      mov.d      [out_mask + 0x14], val;
      pmovmskb   val, ymm6;
      mov.d      [out_mask + 0x18], val;
      pmovmskb   val, ymm7;
      mov.d      [out_mask + 0x1c], val;

      // Advance both cursors to the second perspective's half
      // (256 activation bytes, 32 mask bytes).
      add        output, 0x100;
      add        out_mask, 0x20;
    }

    // Second pass handles the other perspective.
    turn ^= 1;
  }
}
641 | 
// Sparse affine layer with 32 outputs: output = pack(biases + sum over active
// inputs of input[i] * weights_row[i]) >> 6.
//   input / in_mask   - s8 activations and a bitmask with one bit per input;
//                       only inputs whose bit is set are visited.
//   weights           - one 32-byte row (32 s8 weights) per input, indexed by
//                       input position.
//   biases            - 32 s32 values, read with aligned 256-bit loads.
//   pack8_and_calc_mask
//       true  : in_mask spans inDims bits (walked 64 at a time); the output is
//               stored as saturated s8 and out_mask[0] gets one bit per output
//               that is > 0.
//       false : only the first 64 bits of in_mask are used; the output is
//               clamped below at 0 before being stored and out_mask is unused.
// outDims is not read by the body.
affine_txfm :: (input: *s8, output: *s8, inDims: u32, outDims: u32, biases: *s32, weights: *s8, in_mask: *u32, out_mask: *u32, pack8_and_calc_mask: bool) #expand {

  // mask2_t = u64
  // Advances `idx` to the next set bit of the input mask. Returns false when
  // no active input is left. Uses and updates v / idx / offset declared below.
  next_idx :: () -> bool #expand {
    #if pack8_and_calc_mask{
      // Current 64-bit word exhausted: step to the next one.
      while v == 0 {
        offset += 8 * size_of(u64);
        if offset >= inDims then
          return false;
        v = << cast(*u64)((cast(*s8)in_mask) + (offset/8));
      }

      idx = offset + bsf(v);
      v &= v - 1;  // clear the bit just consumed
      return true;
    } else {
      if v == 0 then {
        return false;
      }

      idx = bsf(v);
      v &= v - 1;  // clear the bit just consumed
      return true;
    }
  }


  // Four accumulators of 8 x s32, initialised with the 32 biases.
  #asm AVX, AVX2 {
    movdqa.y out_0: vec, [biases + 0];
    movdqa.y out_1: vec, [biases + 32];
    movdqa.y out_2: vec, [biases + 64];
    movdqa.y out_3: vec, [biases + 96];
    pxor.y   kZero: vec, kZero, kZero;
  }

  // translated from => memcpy(&v, inMask, sizeof(mask2_t));
  v := << cast(*u64)in_mask;
  idx: int = 0;
  offset: int = 0;
  // Each iteration consumes up to two active inputs.
  while offset < inDims {
    if !next_idx() break;
    weights_data := *(cast(*m256)weights)[idx];
    #asm AVX, AVX2 {
      // Load the weight row of the first active input; `second` is only
      // declared here and filled in below.
      movdqa.y first: vec, [weights_data];
      second: vec;
    }
    // Low byte of `factor` = first input value, high byte = second input value.
    factor: s16 = input[idx];
    if next_idx() {
      weights_data := *(cast(*m256)weights)[idx];
      val: s16 = cast(s16) input[idx];
      factor |= val << 8;
      #asm AVX, AVX2 {
        movdqa.y second, [weights_data];
      }
    } else {
      // No second input: a zero weight row makes its contribution vanish.
      #asm AVX, AVX2 {
        pxor.y second, second, second;
      }
    }

    #asm AVX, AVX2 {
      // __m256i mul = _mm256_set1_epi16(factor), prod, signs;
      // __m256i prod = _mm256_maddubs_epi16(mul, _mm256_unpacklo_epi8(first, second));
      // __m256i signs = _mm256_cmpgt_epi16(kZero, prod);
      // out_0 = _mm256_add_epi32(out_0, _mm256_unpacklo_epi16(prod, signs));
      // out_1 = _mm256_add_epi32(out_1, _mm256_unpackhi_epi16(prod, signs));
      // prod = _mm256_maddubs_epi16(mul, _mm256_unpackhi_epi8(first, second));
      // signs = _mm256_cmpgt_epi16(kZero, prod);
      // out_2 = _mm256_add_epi32(out_2, _mm256_unpacklo_epi16(prod, signs));
      // out_3 = _mm256_add_epi32(out_3, _mm256_unpackhi_epi16(prod, signs));

      // Interleaving first/second weight bytes lets pmaddubsw compute
      // in0*w0 + in1*w1 per output as s16; the compare/unpack pair then
      // sign-extends those sums to s32 before accumulation.
      movd mul: vec, factor;
      pbroadcastw.y mul, mul; 
      punpcklbw.y prod: vec, first, second;
      pmaddubsw.y prod, mul, prod;
      pcmpgtw.y signs: vec, kZero, prod;
      punpcklwd.y xmm1: vec, prod, signs;
      paddd.y out_0, out_0, xmm1;
      punpckhwd.y xmm1, prod, signs;
      paddd.y out_1, out_1, xmm1;
      punpckhbw.y xmm1, first, second;
      pmaddubsw.y prod, mul, xmm1;
      pcmpgtw.y signs, kZero, prod;
      punpcklwd.y xmm1, prod, signs;
      paddd.y out_2, out_2, xmm1;
      punpckhwd.y xmm1, prod, signs;
      paddd.y out_3, out_3, xmm1;
    }
  }

  #asm AVX, AVX2 {
    // __m256i out16_0 = _mm256_srai_epi16(_mm256_packs_epi32(out_0, out_1), SHIFT);
    // __m256i out16_1 = _mm256_srai_epi16(_mm256_packs_epi32(out_2, out_3), SHIFT);
    // __m256i *outVec = (__m256i *)output;
    // outVec[0] = _mm256_packs_epi16(out16_0, out16_1);

    // s32 -> s16 (saturating), arithmetic shift right by 6, s16 -> s8 (saturating).
    packssdw.y out_0, out_0, out_1;
    packssdw.y out_1, out_2, out_3;
    psraw.y    out_0, out_0, 6;
    psraw.y    out_1, out_1, 6;
    packsswb.y out_0, out_0, out_1;
  }

  #if pack8_and_calc_mask then {
    #asm AVX, AVX2 {
      // outMask[0] = _mm256_movemask_epi8(_mm256_cmpgt_epi8(outVec[0], kZero));
      movdqa.y   [output], out_0;
      pcmpgtb.y out_0, out_0, kZero;
      pmovmskb  reg: gpr, out_0;
      mov.d     [out_mask], reg;
    }
  } else {
    #asm AVX, AVX2 {
      // outVec[0] = _mm256_max_epi8(outVec[0], kZero);
      pmaxsb.y out_0, out_0, kZero;
      movdqa.y [output], out_0;
    }
  }
}
762 | 
// Output layer: returns biases + dot(input[0..31], weights[0..31]).
//   input   - 32 activations, treated as unsigned bytes by pmaddubsw;
//             must be 32-byte aligned (aligned 256-bit load).
//   weights - 32 signed s8 weights, read as a 256-bit memory operand.
//   biases  - scalar bias added to the sum.
affine_propagate :: (input: *s8, biases: s32, weights: *s8) -> s32 #expand {
  eax: s32 = ---;
  #asm AVX, AVX2 {
    // __m256i prod = _mm256_maddubs_epi16(iv[0], row[0]);
    // prod = _mm256_madd_epi16(prod, _mm256_set1_epi16(1));
    // __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(prod), _mm256_extracti128_si256(prod, 1));
    // sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x1b));
    // return _mm_cvtsi128_si32(sum) + _mm_extract_epi32(sum, 1) + biases[0];

    mov eax, 1;
    movdqa.y    prod: vec,  [input];
    pmaddubsw.y prod, prod, [weights];
    movd        xmm0: vec, eax;
    // The broadcast must fill all 256 bits: the pmaddwd below is 256-bit, and
    // a 128-bit VEX broadcast zeroes the upper lane, which would multiply the
    // upper 16 products by 0 and drop them from the sum.
    pbroadcastw.y xmm0, xmm0;
    pmaddwd.y   prod, prod, xmm0;

    // Horizontal sum of the eight s32 partial sums.
    extracti128 xmm0, prod, 1;
    paddd.x     sum: vec, prod, xmm0;
    pshufd      xmm0, sum, 0x1b;
    paddd.x     sum, sum, xmm0;
    movd        eax, sum;
    pextrd      val: gpr, sum, 1;
    add         eax, val;
    add         eax, biases;
  }

  return eax;
}
793 | 
794 | #import "Basic";
795 | #import "File";
796 | 
797 | 
798 | 
799 | 
800 | 


--------------------------------------------------------------------------------
/nnue_cpu.jai:
--------------------------------------------------------------------------------
  1 | #run {
  2 |   nnue_default :: "resources/nn-04cf2b4ed1da.nnue";
  3 |   if nnue_init(nnue_default) {
  4 |     print("NNUE % initialized\n", nnue_default);
  5 |   } else {
  6 |     assert(false, "Error. Neural Network is not initialized.\n"); 
  7 |   }
  8 | }
  9 | 
// Empty on purpose: this backend has no runtime start-up work because the
// #run block at the top of the file calls nnue_init during compilation.
nnue_startup :: () #expand {} // initialization is done at compile time.
 11 | 
 12 | nnue_init :: (file_name: string) -> bool {
 13 | 
 14 |   verify_file :: (buffer: [] u8) -> bool {
 15 |     if buffer.count != 21022697 then
 16 |       return false;
 17 |     d := buffer.data;
 18 |     if <<cast(*u32)d != NnueVersion then
 19 |       return false;
 20 |     if <<cast(*u32)(d+4) != 0x3e5aa6ee then
 21 |       return false;
 22 |     if <<cast(*u32)(d+8) != 177 then
 23 |       return false;
 24 |     if <<cast(*u32)(d + TransformerStart) != 0x5d69d7b8 then
 25 |       return false;
 26 |     if <<cast(*u32)(d + NetworkStart) != 0x63337156 then
 27 |       return false;
 28 |     return true;
 29 |   }
 30 | 
 31 |   read_hidden_weights :: (weight: []s8, dims: int, d: *s8) -> *s8 {
 32 | 
 33 |     wt_idx :: (r: int, c: int, dims: int) -> int {
 34 |       return c * 32 + r;
 35 |     }
 36 |     i := 0;
 37 |     for r: 0..31 {
 38 |       for c: 0..dims-1 {
 39 |         index := wt_idx(r, c, dims);
 40 |         weight[index] = <<d;
 41 |         d += 1;
 42 |       }
 43 |     }
 44 | 
 45 |     return d;
 46 |   }
 47 | 
 48 |   read_output_weights :: (weight: []s8, data: *s8) {
 49 |     for i: 0..31 {
 50 |       weight[i] = << data;
 51 |       data += 1;
 52 |     }
 53 |   }
 54 | 
 55 |   init_weights :: (buffer: [] u8) {
 56 |     data := cast(*s8) (buffer.data + TransformerStart + 4);
 57 | 
 58 |     // Read transformer
 59 |     for i: 0..(kHalfDimensions-1) {
 60 |       ft_biases[i] = <<cast, no_check (*s16)(data);
 61 |       data += 2;
 62 |     }
 63 | 
 64 |     for i: 0..(kHalfDimensions*FtInDims)-1 {
 65 |       ft_weights[i] = <<cast, no_check(*s16)(data);
 66 |       data += 2;
 67 |     }
 68 | 
 69 |     // Read network
 70 |     data += 4;
 71 |     for i: 0..31 {
 72 |       hidden1_biases[i] = <<cast, no_check(*s32)(data);
 73 |       data += 4;
 74 |     }
 75 | 
 76 |     data = read_hidden_weights(hidden1_weights, 512, data);
 77 | 
 78 |     for i: 0..31 {
 79 |       hidden2_biases[i] = <<cast, no_check(*s32)(data);
 80 |       data += 4;
 81 |     }
 82 | 
 83 |     data = read_hidden_weights(hidden2_weights, 32, data);
 84 |     
 85 |     for i: 0..0 {
 86 |       output_biases[i] = <<cast(*s32)(data);
 87 |       data += 4;
 88 |     }
 89 | 
 90 |     read_output_weights(output_weights, data);
 91 | 
 92 |   }
 93 | 
 94 | 
 95 |   file, success :=  file_open(file_name);
 96 |   if !success {
 97 |     return false;
 98 |   }
 99 |   length :=  file_length(file);
100 |   buffer := NewArray(length, u8);
101 |   defer {
102 |     array_free(buffer);
103 |     file_close(*file);
104 |   }
105 | 
106 |   if !file_read(file, buffer.data, length) {
107 |     return false;
108 |   }
109 | 
110 |   // verify that the file is correct.
111 |   if !verify_file(buffer) then
112 |     return false;
113 | 
114 |   init_weights(buffer);
115 |   return true;
116 | 
117 | }
118 | 
// Evaluates the position at the current ply of `chess` with the network.
// Gathers pointers to the NNUE data of the current ply and of up to two
// earlier plies (slots stay null when the game has fewer plies than that) and
// forwards them to nnue_evaluate_pos.
nnue_evaluate :: (chess: *ChessGame) -> int {
  history: [3] *NNUEdata;
  for slot: 0..2 {
    history[slot] = null;
  }

  // history[0] is the current ply, history[1] one ply back, history[2] two back.
  for back: 0..2 {
    if chess.ply < back break;
    history[back] = *chess.nnue[chess.ply - back];
  }

  using chess.chess;
  return nnue_evaluate_pos(chess, history);
}
134 | 
// Evaluates a stand-alone board that has no ply history.
// A temporary NNUEdata marked "not computed" is the only history entry, so
// update_accumulator finds no earlier accumulator to build on.
nnue_evaluate_board :: (chess: Chess) -> int {
  scratch: NNUEdata;
  scratch.accumulator.computedAccumulation = 0;

  history: [3] *NNUEdata;
  history[0] = *scratch;
  history[1] = null;
  history[2] = null;

  return nnue_evaluate_pos(*chess, history);
}
144 | 
// Pieces changed by the move that led to a position.
DirtyPiece :: struct {
  dirtyNum: s32;      // NOTE(review): presumably the number of valid entries below — confirm
  pc      : [3] s32;  // piece code per entry; pc[0] equal to 1 or 7 triggers a perspective reset in append_changed_indices
  from    : [3] s32;  // NOTE(review): presumably the source square per entry — confirm
  to      : [3] s32;  // NOTE(review): presumably the destination square per entry — confirm
}
151 | 
// Output of the feature transformer for one position.
Accumulator :: struct {
  // Reserves 1088 bytes (17 * 64). `#place padding` makes the members below
  // overlay this storage instead of following it.
  padding: [1088] u8;
  #place padding;

  // One vector of 256 s16 values per perspective (index 0 and 1).
  accumulation: [2][256] s16 #align 64;
  // Non-zero once `accumulation` has been computed for this position.
  computedAccumulation: s32;
} 
159 | 
// Per-ply NNUE state: the accumulator plus the piece changes of the move that
// produced this ply.
NNUEdata :: struct {
  // Reserves 1152 bytes (18 * 64); the members below overlay it via #place.
  padding: [1152] u8;
  #place padding;

  accumulator: Accumulator;
  dirtyPiece: DirtyPiece;
} 
167 | 
168 | #scope_file
// All network parameters, filled in by init_weights inside nnue_init.
NNUE_Model :: struct {
  // features:
  // Feature transformer: one bias vector plus one row of kHalfDimensions s16
  // weights per input feature (rows start at kHalfDimensions * index).
  ft_biases:  [kHalfDimensions] s16 #align 64;
  ft_weights: [kHalfDimensions*FtInDims] s16 #align 64;

  // weights:
  // The two hidden layers are stored transposed by read_hidden_weights
  // (index = input * 32 + output).
  hidden1_weights: [64*512] s8 #align 64;
  hidden2_weights: [64*32]  s8 #align 64;
  output_weights:  [1*32]   s8 #align 64;

  // biases:
  hidden1_biases: [32] s32 #align 64;
  hidden2_biases: [32] s32 #align 64;
  output_biases : [1]  s32 #align 64;
}
184 | 
// The single global model instance, written by nnue_init from the #run block
// at the top of the file.
// NOTE(review): #no_reset presumably keeps those compile-time values in the built program — confirm.
#no_reset nnue_model: NNUE_Model #align 64;
// Exposes the model's members (ft_biases, hidden1_weights, ...) as plain names.
using nnue_model;
187 | 
// dimensions
kHalfDimensions :: 256;           // s16 values per perspective in the accumulator
FtInDims :: 64*PS_END; // 64 * 641
FtOutDims :: kHalfDimensions*2;   // both perspectives concatenated
NnueVersion : u32 : 0x7AF32F16;   // expected first u32 of the file (see verify_file)
// Three u32 header fields followed by 177 bytes; verify_file checks that the
// third u32 equals 177.
TransformerStart :: 3*4 + 177;
// Transformer section: 4 bytes, then 256 s16 biases, then 256*64*641 s16 weights.
NetworkStart :: TransformerStart+4 + 2*256 + 2*256*64*641;
195 | 
// Fixed-capacity list of feature indices.
IndexList :: struct {
  size: s32;          // number of used entries in `values`
  values: [30] s32;   // feature indices; capacity is 30 entries
}
200 | 
// Runs the full network on `chess` and returns the scaled evaluation.
// Pipeline: feature transform -> 512x32 affine layer -> 32x32 affine layer ->
// 32x1 output layer, with the raw output divided by FV_SCALE.
// `nnue` holds the NNUE data of the current ply and up to two previous plies.
nnue_evaluate_pos :: (chess: *Chess, nnue: [3] *NNUEdata) -> s32 {
  FV_SCALE :: 16;

  // Activation masks handed from one layer to the next.
  input_mask:   [FtOutDims / (8 * size_of(u32)) ] u32 #align 8;
  hidden1_mask: [8 / size_of(u32)] u32 #align 8;

  // Activation buffers of the three stages.
  input: [FtOutDims] s8 #align 16;
  hidden1_out: [32] s8  #align 16;
  hidden2_out: [32] s8  #align 16; 

  transform(chess, nnue, *input[0], *input_mask[0]);

  affine_txfm(*input[0], *hidden1_out[0], FtOutDims, 32,
              *hidden1_biases[0], *hidden1_weights[0],
              *input_mask[0], *hidden1_mask[0], true);

  affine_txfm(*hidden1_out[0], *hidden2_out[0], 32, 32,
              *hidden2_biases[0], *hidden2_weights[0],
              *hidden1_mask[0], null, false);

  raw_score := inline affine_propagate(*hidden2_out[0], output_biases[0], *output_weights[0]);
  return raw_score / FV_SCALE;
}
214 | 
// Tries to bring nnue[0]'s accumulator up to date incrementally from an
// earlier ply.
// Returns true when nnue[0].accumulator is valid on exit (it was already
// computed, or it was updated here). Returns false when neither nnue[1] nor
// nnue[2] provides a computed accumulator to start from.
update_accumulator :: (chess: *Chess, nnue: [3] *NNUEdata) -> bool {
  accumulator := *nnue[0].accumulator;
  if accumulator.computedAccumulation return true;

  // Use the most recent earlier ply whose accumulator is computed; a null
  // entry is skipped.
  prevAcc: *Accumulator = null;
  base_found := false;
  for back: 1..2 {
    if !nnue[back] continue;
    prevAcc = *nnue[back].accumulator;
    if prevAcc.computedAccumulation {
      base_found = true;
      break;
    }
  }
  if !base_found return false;

  removed_indices: [2] IndexList;
  added_indices: [2] IndexList;
  reset: [2] bool;
  for side: 0..1 {
    removed_indices[side].size = 0;
    added_indices[side].size = 0;
  }
  append_changed_indices(chess, nnue, removed_indices, added_indices, reset);

  ROW_BYTES :: kHalfDimensions * size_of(s16);

  for c: 0..1 {
    lanes := accumulator.accumulation[c].data;

    // On reset start from the biases (added_indices then lists every active
    // feature); otherwise start from the base accumulator.
    start := ifx reset[c] then ft_biases.data else prevAcc.accumulation[c].data;
    memcpy(lanes, start, ROW_BYTES);

    if !reset[c] {
      // Difference calculation for the deactivated features
      for k: 0..removed_indices[c].size-1 {
        row_start := kHalfDimensions * removed_indices[c].values[k];
        for j: 0..kHalfDimensions-1 {
          lanes[j] -= ft_weights[row_start + j];
        }
      }
    }

    // Difference calculation for the activated features
    for k: 0..added_indices[c].size-1 {
      row_start := kHalfDimensions * added_indices[c].values[k];
      for j: 0..kHalfDimensions-1 {
        lanes[j] += ft_weights[row_start + j];
      }
    }
  }

  accumulator.computedAccumulation = 1;
  return true;
}
267 | 
// Recomputes nnue[0]'s accumulator from scratch: for each perspective, the
// feature-transformer biases plus the weight rows of every active feature.
// Marks the accumulator as computed on exit.
refresh_accumulator :: (chess: *Chess, nnue: [3] *NNUEdata) {
  accumulator := *(nnue[0].accumulator);

  activeIndices: [2] IndexList;
  for side: 0..1 {
    activeIndices[side].size = 0;
  }
  append_active_indices(chess, activeIndices);

  ROW_BYTES :: kHalfDimensions * size_of(s16);

  for c: 0..1 {
    lanes := accumulator.accumulation[c].data;
    memcpy(lanes, ft_biases.data, ROW_BYTES);

    for k: 0..activeIndices[c].size-1 {
      row_start := kHalfDimensions * activeIndices[c].values[k];
      for j: 0..kHalfDimensions-1 {
        lanes[j] += ft_weights[row_start + j];
      }
    }
  }

  accumulator.computedAccumulation = 1;
}
286 | 
// Fills active[0] with the features seen relative to the white king and
// active[1] with those seen relative to the black king.
append_active_indices :: (chess: *Chess, active: [] IndexList) {
  WHITE_VIEW : s32 : 0;
  BLACK_VIEW : s32 : 1;
  half_kp_append_active_indices(chess, chess.w_king, WHITE_VIEW, *active[WHITE_VIEW]);
  half_kp_append_active_indices(chess, chess.b_king, BLACK_VIEW, *active[BLACK_VIEW]);
}
291 | 
292 | // Gathers the feature changes that bring nnue[0]'s accumulator up to date. When
293 | // nnue[1] is already computed only nnue[0]'s dirty pieces are replayed, otherwise
294 | // those of nnue[0] and nnue[1] both are. reset[c] becomes true when the first dirty
295 | // piece of a replayed ply is perspective c's king (code 1 / 7); added[c] then
296 | // receives the full active feature list instead of a delta.
297 | append_changed_indices :: (chess: *Chess, nnue: [3] *NNUEdata, removed: [] IndexList, added: [] IndexList, reset: [] bool) {
298 |   newest := *nnue[0].dirtyPiece;
299 |   older  := *nnue[1].dirtyPiece;   // only read when two plies have to be replayed
300 |   single_ply := nnue[1].accumulator.computedAccumulation != 0;
301 |   for view: 0..1 {
302 |     side: s32 = xx view;
303 |     king := ifx side == 0 then chess.w_king else chess.b_king;
304 |     king_code: s32 = 1;
305 |     if side != 0  king_code = 7;
306 |     ksq := cast(s32) bsf(king);
307 |     reset[side] = newest.pc[0] == king_code || (!single_ply && older.pc[0] == king_code);
308 |     if reset[side] {
309 |       half_kp_append_active_indices(chess, king, side, *added[side]);
310 |       continue;
311 |     }
312 |     half_kp_append_changed_indices(ksq, side, newest, *removed[side], *added[side]);
313 |     if !single_ply  half_kp_append_changed_indices(ksq, side, older, *removed[side], *added[side]);
314 |   }
315 | }
342 | 
343 | // Appends one feature index per non-king piece on the board to `active`, as seen
344 | // from perspective `c` whose king bitboard is `king`.
345 | half_kp_append_active_indices :: (chess: *Chess, king: u64, c: s32, active: *IndexList) {
346 |   king_sq := cast(s32) bsf(king);
347 |   king_bucket := orient(c, king_sq) * PS_END;
348 |   remaining := chess.occupied ^ (chess.w_king | chess.b_king);   // drop both kings
349 |   while remaining != 0 {
350 |     square := cast(s32) bsf(remaining);
351 |     remaining &= remaining - 1;   // clear the lowest set bit
352 |     piece := cast(s32) chess.pieces[square];
353 |     active.values[active.size] = make_index(xx c, square, piece, king_bucket);
354 |     active.size += 1;
355 |   }
356 | }
357 | 
358 | bsf :: (value: u64) -> int #expand { // Macro wrapper around the x86 `bsf` (bit scan forward) instruction.
359 |   result: int = 0;
360 |   #asm { bsf.q result, value; } // 64-bit form: index of the lowest set bit of `value`; NOTE(review): the ISA leaves the destination undefined for value == 0
361 |   return result;
362 | }
363 | 
364 | // Turns the piece moves recorded in `dp` into feature deltas for perspective `c`
365 | // with king square `ksq`. Kings (codes 1, 7) and from/to squares equal to 64 are skipped.
366 | half_kp_append_changed_indices :: (ksq: s32, c: s32, dp: DirtyPiece, removed: *IndexList, added: *IndexList) {
367 |   king_bucket := orient(c, ksq) * PS_END;
368 |   for slot: 0..dp.dirtyNum-1 {
369 |     piece := dp.pc[slot];
370 |     if piece == 1 || piece == 7  continue;
371 |     origin := dp.from[slot];
372 |     if origin != 64 {
373 |       removed.values[removed.size] = make_index(c, origin, piece, king_bucket);
374 |       removed.size += 1;
375 |     }
376 |     target := dp.to[slot];
377 |     if target != 64 {
378 |       added.values[added.size] = make_index(c, target, piece, king_bucket);
379 |       added.size += 1;
380 |     }
381 |   }
382 | }
383 | 
384 | make_index :: (c: s32, s: s32, pc: s32, ksq: s32) -> s32 #expand { // feature index = king bucket + piece plane + oriented square
385 |   return ksq + PieceToIndex[c][pc] + orient(c, s);
386 | }
387 | 
388 | // Returns square `s` unchanged for perspective 0 and `s ^ 0x3F` for any other
389 | // perspective (for squares 0..63 that is 63 - s, a 180 degree board rotation).
390 | orient :: (c: s32, s: s32) -> s32 #expand {
391 |   mask: s32 = 0x3F;
392 |   if c == 0  mask = 0;
393 |   return s ^ mask;
394 | }
395 | 
396 | PS_W_PAWN   ::  1; // start of each piece type's block of 64 square slots; index 0 is left unused
397 | PS_B_PAWN   ::  1*64 + 1;
398 | PS_W_KNIGHT ::  2*64 + 1;
399 | PS_B_KNIGHT ::  3*64 + 1;
400 | PS_W_BISHOP ::  4*64 + 1;
401 | PS_B_BISHOP ::  5*64 + 1;
402 | PS_W_ROOK   ::  6*64 + 1;
403 | PS_B_ROOK   ::  7*64 + 1;
404 | PS_W_QUEEN  ::  8*64 + 1;
405 | PS_B_QUEEN  ::  9*64 + 1;
406 | PS_END      :: 10*64 + 1; // 641: one past the last slot; also the per-king-square stride (ksq * PS_END)
407 | 
408 | PieceToIndex: [2][14] s32 = .[ // [perspective][piece code] -> base offset used by make_index; codes 0, 1, 7 and 13 map to 0
409 |   s32.[0, 0, PS_W_QUEEN, PS_W_ROOK, PS_W_BISHOP, PS_W_KNIGHT, PS_W_PAWN, // perspective 0: codes 2..6
410 |        0, PS_B_QUEEN, PS_B_ROOK, PS_B_BISHOP, PS_B_KNIGHT, PS_B_PAWN, 0], // perspective 0: codes 8..12
411 |   s32.[ 0, 0, PS_B_QUEEN, PS_B_ROOK, PS_B_BISHOP, PS_B_KNIGHT, PS_B_PAWN, // perspective 1: same codes with the W/B planes swapped
412 |        0, PS_W_QUEEN, PS_W_ROOK, PS_W_BISHOP, PS_W_KNIGHT, PS_W_PAWN, 0]
413 | ];
414 | 
415 | // Brings nnue[0]'s accumulator up to date (incrementally when possible, otherwise
416 | // via a full refresh) and writes it to `output` clamped to 0..127: first
417 | // accumulation[chess.turn], then the other perspective. `out_mask` is not used here.
418 | transform :: (chess: *Chess, nnue: [3] *NNUEdata, output: *s8, out_mask: *u32) {
419 |   if !update_accumulator(chess, nnue)  refresh_accumulator(chess, nnue);
420 |   accumulation: [][256] s16 = nnue[0].accumulator.accumulation;
421 |   side := chess.turn;
422 |   for half: 0..1 {
423 |     dest := output + half * kHalfDimensions;
424 |     for i: 0..kHalfDimensions-1 {
425 |       value: s16 = accumulation[side][i];
426 |       dest[i] = cast(s8) clamp(value, 0, 127);
427 |     }
428 |     side ^= 1;
429 |   }
430 | 
431 | // Scalar dense layer with a clipped output: starts from `biases`, adds
432 | // input[idx] * weights[outDims*idx + i] for every non-zero input, then stores each
433 | // sum shifted right by 6 and clamped to 0..127 as s8 in `output`. The in_mask,
434 | // out_mask and pack8_and_calc_mask parameters are not used by this version.
435 | affine_txfm :: (input: *s8, output: *void, inDims: u32, $outDims: u32, biases: *s32, weights: *s8, in_mask: *u32, out_mask: *u32, pack8_and_calc_mask: bool) {
436 |   sums: [outDims] s32;
437 |   for n: 0..outDims-1  sums[n] = biases[n];
438 | 
439 |   for idx: 0..inDims-1 {
440 |     factor: s32 = input[idx];
441 |     if factor == 0  continue;   // a zero input contributes nothing
442 |     row := weights + outDims * idx;
443 |     for n: 0..outDims-1  sums[n] += factor * row[n];
444 |   }
445 | 
446 |   result := cast(*s8) output;
447 |   for n: 0..outDims-1  result[n] = cast(s8) clamp(sums[n] >> 6, 0, 127);
448 | }
452 | 
453 | // Output neuron: returns `biases` plus the dot product of the first 32 entries
454 | // of `weights` and `input`, accumulated in 32-bit integers.
455 | affine_propagate :: (input: *s8, biases: s32, weights: *s8) -> s32 {
456 |   total: s32 = biases;
457 |   for lane: 0..31  total += cast(s32) weights[lane] * cast(s32) input[lane];
458 |   return total;
459 | }
460 | 
461 | #import "Basic";
462 | #import "File";
463 | 


--------------------------------------------------------------------------------
/nnue_probe.jai:
--------------------------------------------------------------------------------
 1 | // These are handcrafted bindings for Daniel Shawul's NNUE-Probe Library.
 2 | // NNUE-Probe can be used to get an optimized SIMD Matrix Multiplication
 3 | // for computer architectures that the current Jai Chess Engine does not
 4 | // support with the inline assembly.
 5 | // e.g. AVX512, MMX, ARM NEON, etc.
 6 | // This library is completely optional, and is not a necessary component
 7 | // of the Chess Engine.
 8 | 
 9 | nnue_probe :: #library "libnnueprobe"; // external library that the #foreign procedures below are bound to
10 | 
11 | nnue_evaluate :: (player: s32, pieces: *s32, squares: *s32) -> s32 #foreign nnue_probe; // evaluation from the piece/square lists alone
12 | nnue_evaluate_incremental :: (player: s32, pieces: *s32, squares: *s32, nnue: **NNUEdata) -> s32 #foreign nnue_probe; // same, plus an array of NNUEdata pointers (see the nnue_evaluate overload below)
13 | nnue_init :: (file_name: *u8) -> bool #foreign nnue_probe; // loads a network file; NOTE(review): confirm the C prototype really returns a value (upstream declares it void)
14 | 
15 | nnue_evaluate_board :: (chess: *Chess) -> int { // Non-incremental evaluation of `chess` through the foreign nnue_evaluate.
16 |   pieces, squares := initialize_piece_list(chess); // lists terminated by a 0 piece code
17 |   return nnue_evaluate(xx chess.turn, *pieces[0], *squares[0]);
18 | }
19 | 
20 | // Incremental evaluation through NNUE-Probe: passes the piece list together with
21 | // the NNUEdata of the current ply and of up to two preceding plies (entries stay
22 | // null when the game is not that deep yet) to nnue_evaluate_incremental.
23 | nnue_evaluate :: (chess: *ChessGame) -> int {
24 |   pieces, squares := initialize_piece_list(chess);
25 |   history: [3] *NNUEdata;   // default-initialised, i.e. all null
26 |   for back: 0..2 {
27 |     if chess.ply < back  break;
28 |     history[back] = chess.nnue.data + chess.ply - back;
29 |   }
30 | 
31 |   return nnue_evaluate_incremental(xx chess.turn, *pieces[0], *squares[0], *history[0]);
32 | }
35 | 
36 | // Builds the piece and square lists handed to NNUE-Probe: slot 0 holds the white
37 | // king, slot 1 the black king, the remaining pieces follow, and a piece code of 0
38 | // terminates the list ([33] = at most 32 pieces plus the terminator).
39 | initialize_piece_list :: (chess: *Chess) -> [33] s32, [33] s32 {
40 |   pieces : [33] s32;
41 |   squares: [33] s32;
42 |   pieces[0]  = xx Piece.W_KING;
43 |   squares[0] = xx bit_scan_forward(chess.w_king);
44 |   pieces[1]  = xx Piece.B_KING;   // fixed: slot 1 describes the black king, not a second white king
45 |   squares[1] = xx bit_scan_forward(chess.b_king);
46 | 
47 |   index := 2;
48 |   occupied := chess.occupied & ~(chess.w_king | chess.b_king);   // kings are already listed
49 |   while occupied {
50 |     sq := cast(s32) bit_scan_forward(occupied);
51 |     pieces[index]  = cast(s32) chess.pieces[sq];
52 |     squares[index] = sq;
53 |     index += 1;
54 |     occupied &= occupied - 1;   // clear the lowest set bit
55 |   }
56 |   pieces[index]  = 0;   // list terminator
57 |   squares[index] = 0;
58 |   return pieces, squares;
59 | }
60 | 
61 | nnue_startup :: () #expand { // Macro: loads the default network through the foreign nnue_init when the expanded code runs.
62 |   #import "File_Utilities";
63 | 
64 |   nnue_default :: "resources/nn-04cf2b4ed1da.nnue";
65 |   if !file_exists(nnue_default) {
66 |     print("Error. % does not exist", nnue_default);
67 |     `return; // backticked: returns from the procedure this macro was expanded into
68 |   }
69 |   nnue_init(nnue_default); // NOTE(review): the result is not checked
70 | }
71 | 
72 | DirtyPiece :: struct { // Record of piece changes; NOTE(review): presumably mirrors the C library's DirtyPiece layout - confirm
73 |   dirtyNum: s32; // number of used entries in the arrays below
74 |   pc      : [3] s32; // piece codes
75 |   from    : [3] s32; // source squares
76 |   to      : [3] s32; // destination squares
77 | }
78 | 
79 | Accumulator :: struct { // NOTE(review): size and layout presumably have to match the C library's Accumulator - confirm
80 |   padding: [1088] u8; // reserves 1088 bytes for the struct
81 |   #place padding; // the members below overlay `padding`, starting at its offset
82 |   accumulation: [2][256] s16 #align 64; // two rows of 256 s16 values
83 |   computedAccumulation: s32;
84 | } 
85 | 
86 | NNUEdata :: struct { // Per-ply data passed (by pointer) to nnue_evaluate_incremental.
87 |   padding: [1152] u8; // reserves 1152 bytes for the struct
88 |   #place padding; // the members below overlay `padding`, starting at its offset
89 |   accumulator: Accumulator;
90 |   dirtyPiece: DirtyPiece;
91 | } 
92 | 


--------------------------------------------------------------------------------
/nnue_sse.jai:
--------------------------------------------------------------------------------
  1 | #run { // Executed by the compiler: loads the network file while the program is being built.
  2 |   nnue_default :: "resources/nn-04cf2b4ed1da.nnue";
  3 |   if nnue_init(nnue_default) {
  4 |     print("NNUE % initialized\n", nnue_default);
  5 |   } else {
  6 |     assert(false, "Error. Neural Network is not initialized.\n"); // a missing or invalid network file fails the #run
  7 |   }
  8 | }
  9 | 
 10 | nnue_startup :: () #expand {} // Intentionally empty: initialization is done at compile time by the #run block above.
 11 | 
 12 | // Loads the network stored in `file_name` into the global model arrays. Returns
 13 | // false when the file cannot be opened or read, or when its size, version or
 14 | // section tags do not match the one supported network layout.
 15 | nnue_init :: (file_name: string) -> bool {
 16 |   // Validates the exact file size and the magic numbers at their fixed offsets.
 17 |   verify_file :: (buffer: [] u8) -> bool {
 18 |     if buffer.count != 21022697 then
 19 |       return false;
 20 |     d := buffer.data;
 21 |     if <<cast(*u32)d != NnueVersion then
 22 |       return false;
 23 |     if <<cast(*u32)(d+4) != 0x3e5aa6ee then
 24 |       return false;
 25 |     if <<cast(*u32)(d+8) != 177 then
 26 |       return false;
 27 |     if <<cast(*u32)(d + TransformerStart) != 0x5d69d7b8 then
 28 |       return false;
 29 |     if <<cast(*u32)(d + NetworkStart) != 0x63337156 then
 30 |       return false;
 31 |     return true;
 32 |   }
 33 | 
 34 |   // Copies a 32 x dims weight block from `d` into `weight`, transposing it so that
 35 |   // the 32 weights of one input column are contiguous. Returns the advanced pointer.
 36 |   read_hidden_weights :: (weight: []s8, dims: int, d: *s8) -> *s8 {
 37 |     wt_idx :: (r: int, c: int, dims: int) -> int {
 38 |       return c * 32 + r;
 39 |     }
 40 | 
 41 |     for r: 0..31 {
 42 |       for c: 0..dims-1 {
 43 |         weight[wt_idx(r, c, dims)] = <<d;
 44 |         d += 1;
 45 |       }
 46 |     }
 47 |     return d;
 48 |   }
 49 | 
 50 |   // Copies the 32 weights of the output neuron.
 51 |   read_output_weights :: (weight: []s8, data: *s8) {
 52 |     for i: 0..31 {
 53 |       weight[i] = << data;
 54 |       data += 1;
 55 |     }
 56 |   }
 57 | 
 58 |   // Walks the buffer section by section; the read order below is the file layout.
 59 |   init_weights :: (buffer: [] u8) {
 60 |     data := cast(*s8) (buffer.data + TransformerStart + 4);   // skip the 4-byte tag checked in verify_file
 61 | 
 62 |     // Read transformer
 63 |     for i: 0..(kHalfDimensions-1) {
 64 |       ft_biases[i] = <<cast, no_check (*s16)(data);
 65 |       data += 2;
 66 |     }
 67 |     for i: 0..(kHalfDimensions*FtInDims)-1 {
 68 |       ft_weights[i] = <<cast, no_check(*s16)(data);
 69 |       data += 2;
 70 |     }
 71 | 
 72 |     // Read network
 73 |     data += 4;   // skip the 4-byte tag at NetworkStart
 74 |     for i: 0..31 {
 75 |       hidden1_biases[i] = <<cast, no_check(*s32)(data);
 76 |       data += 4;
 77 |     }
 78 |     data = read_hidden_weights(hidden1_weights, 512, data);
 79 | 
 80 |     for i: 0..31 {
 81 |       hidden2_biases[i] = <<cast, no_check(*s32)(data);
 82 |       data += 4;
 83 |     }
 84 |     data = read_hidden_weights(hidden2_weights, 32, data);
 85 | 
 86 |     for i: 0..0 {
 87 |       output_biases[i] = <<cast(*s32)(data);
 88 |       data += 4;
 89 |     }
 90 |     read_output_weights(output_weights, data);
 91 |   }
 92 | 
 93 |   file, success :=  file_open(file_name);
 94 |   if !success {
 95 |     return false;
 96 |   }
 97 |   length :=  file_length(file);
 98 |   buffer := NewArray(length, u8);
 99 |   defer {   // runs on every return below
100 |     array_free(buffer);
101 |     file_close(*file);
102 |   }
103 | 
104 |   if !file_read(file, buffer.data, length) {
105 |     return false;
106 |   }
107 | 
108 |   // verify that the file is correct.
109 |   if !verify_file(buffer) then
110 |     return false;
111 | 
112 |   init_weights(buffer);
113 |   return true;
114 | }
117 | 
118 | // Evaluates the current position of `chess` with incremental accumulator updates:
119 | // hands nnue_evaluate_pos the NNUEdata of the current ply and of up to two
120 | // preceding plies (entries stay null when the game is not that deep yet).
121 | nnue_evaluate :: (chess: *ChessGame) -> int {
122 |   history: [3] *NNUEdata;   // default-initialised, i.e. all null
123 |   for back: 0..2 {
124 |     if chess.ply < back  break;
125 |     history[back] = *chess.nnue[chess.ply - back];
126 |   }
127 | 
128 |   using chess.chess;
129 |   return nnue_evaluate_pos(chess, history);
130 | }
133 | 
134 | // One-off evaluation of a bare board: uses a fresh, not-yet-computed NNUEdata and
135 | // no history, so update_accumulator reports failure and the accumulator is refreshed.
136 | nnue_evaluate_board :: (chess: Chess) -> int {
137 |   scratch: NNUEdata #align 32;
138 |   scratch.accumulator.computedAccumulation = 0;
139 |   slots: [3] *NNUEdata;   // slots[1] and slots[2] stay null
140 |   slots[0] = *scratch;
141 |   return nnue_evaluate_pos(*chess, slots);
142 | }
143 | 
144 | DirtyPiece :: struct { // Piece changes of one ply, consumed by half_kp_append_changed_indices.
145 |   dirtyNum: s32; // number of used entries in the arrays below
146 |   pc      : [3] s32; // piece codes (1 and 7 are the kings)
147 |   from    : [3] s32; // source squares; 64 is skipped when building the removed list
148 |   to      : [3] s32; // destination squares; 64 is skipped when building the added list
149 | }
150 | 
151 | Accumulator :: struct { // Feature-transformer output for one position.
152 |   padding: [1088] u8; // reserves 1088 bytes for the struct
153 |   #place padding; // the members below overlay `padding`, starting at its offset
154 | 
155 |   accumulation: [2][256] s16 #align 64; // [perspective][kHalfDimensions]
156 |   computedAccumulation: s32; // set to 1 once `accumulation` has been filled in
157 | } 
158 | 
159 | NNUEdata :: struct { // Per-ply evaluation state: accumulator plus the piece changes of that ply.
160 |   padding: [1152] u8; // reserves 1152 bytes for the struct
161 |   #place padding; // the members below overlay `padding`, starting at its offset
162 | 
163 |   accumulator: Accumulator;
164 |   dirtyPiece: DirtyPiece;
165 | } 
166 | 
167 | #scope_file
168 | NNUE_Model :: struct { // All network parameters, filled in by nnue_init.
169 |   // features:
170 |   ft_biases:  [kHalfDimensions] s16 #align 64;
171 |   ft_weights: [kHalfDimensions*FtInDims] s16 #align 64; // kHalfDimensions consecutive weights per feature index
172 | 
173 |   // weights:
174 |   hidden1_weights: [64*512] s8 #align 64; // nnue_init writes the first 32*512 entries
175 |   hidden2_weights: [64*32]  s8 #align 64; // nnue_init writes the first 32*32 entries
176 |   output_weights:  [1*32]   s8 #align 64;
177 | 
178 |   // biases:
179 |   hidden1_biases: [32] s32 #align 64;
180 |   hidden2_biases: [32] s32 #align 64;
181 |   output_biases : [1]  s32 #align 64;
182 | }
183 | 
184 | #no_reset nnue_model: NNUE_Model #align 64; // #no_reset: keeps the values written at compile time (by the #run block) in the built program
185 | using nnue_model; // lets the rest of the file refer to ft_biases, hidden1_weights, ... directly
186 | 
187 | // dimensions
188 | kHalfDimensions :: 256; // accumulator width per perspective
189 | FtInDims :: 64*PS_END; // 64 * 641
190 | FtOutDims :: kHalfDimensions*2; // both perspectives concatenated
191 | NnueVersion : u32 : 0x7AF32F16; // expected first u32 of the network file
192 | TransformerStart :: 3*4 + 177; // byte offset of the transformer section: three u32 header fields plus 177 further bytes
193 | NetworkStart :: TransformerStart+4 + 2*256 + 2*256*64*641; // byte offset of the network section: past the 4-byte tag, the s16 biases and the s16 weights
194 | 
195 | Position :: struct { // NOTE(review): not referenced by the visible part of this file; fields match the arguments of the NNUE-Probe style API
196 |   player: s32;
197 |   pieces: *s32;
198 |   squares: *s32;
199 |   nnue: [3] *NNUEdata;
200 | }
201 | 
202 | IndexList :: struct { // Small fixed-capacity list of feature indices.
203 |   size: s32; // number of valid entries in `values`
204 |   values: [30] s32; // appended to without a capacity check
205 | }
206 | 
207 | // Forward pass: transform -> two 32-wide affine layers -> output neuron; returns the sum divided by FV_SCALE.
208 | nnue_evaluate_pos :: (chess: *Chess, nnue: [3] *NNUEdata) -> s32 {
209 |   FV_SCALE :: 16;
210 |   input_mask:   [FtOutDims / (8 * size_of(u32)) ] u32 #align 8;
211 |   hidden1_mask: [8 / size_of(u32)] u32 #align 8;
212 |   features: [FtOutDims] s8 #align 16;
213 |   layer1:   [32] s8  #align 16;
214 |   layer2:   [32] s8  #align 16;
215 |   transform(chess, nnue, *features[0], *input_mask[0]);
216 |   affine_txfm(*features[0], *layer1[0], FtOutDims, *hidden1_biases[0], *hidden1_weights[0]);
217 |   affine_txfm(*layer1[0], *layer2[0], 32, *hidden2_biases[0], *hidden2_weights[0]);
218 |   raw := inline affine_propagate(*layer2[0], output_biases[0], *output_weights[0]);
219 |   return raw / FV_SCALE; }
220 | 
221 | update_accumulator :: (chess: *Chess, nnue: [3] *NNUEdata) -> bool { // Incrementally derives nnue[0]'s accumulator from nnue[1] or nnue[2]; returns false when neither can be used.
222 | 
223 |   acc_if :: inline (prevAcc: **Accumulator, nnue: *NNUEdata) -> bool { // true = `nnue` is unusable (null or not computed); otherwise *prevAcc points at its accumulator
224 |     if !nnue then
225 |       return true;
226 |     prevAcc.* = *nnue.accumulator;
227 |     return !prevAcc.*.computedAccumulation;
228 |   }
229 | 
230 |   accumulator := *nnue[0].accumulator;
231 |   if accumulator.computedAccumulation then // nothing to do
232 |     return true;
233 |   prevAcc: *Accumulator = null;
234 |   if acc_if(*prevAcc, nnue[1]) && acc_if(*prevAcc, nnue[2]) then // nnue[2] is only tried when nnue[1] is unusable
235 |     return false;
236 |   removed_indices: [2] IndexList;
237 |   added_indices: [2] IndexList;
238 |   reset: [2] bool;
239 |   removed_indices[0].size = 0;
240 |   removed_indices[1].size = 0;
241 |   added_indices[0].size = 0;
242 |   added_indices[1].size = 0;
243 |   append_changed_indices(chess, nnue, removed_indices, added_indices, reset);
244 | 
245 |   for c: 0..1 {
246 |     accindex := 0;
247 |     while accindex < 256 { // two tiles of 128 s16 values (256 bytes) per perspective
248 |       acc_tile := *accumulator.accumulation[c][accindex];
249 |       r := reset[c] == true;
250 |       copy_tile := ifx r then *ft_biases[accindex] else *prevAcc.accumulation[c][accindex]; // start from the biases on a reset, else from the previous accumulator
251 |       #asm SSE { // load the 256-byte source tile into xmm0..xmm15 (aligned loads)
252 |         movdqa.x xmm0: vec,  [copy_tile + 0x00];
253 |         movdqa.x xmm1: vec,  [copy_tile + 0x10];
254 |         movdqa.x xmm2: vec,  [copy_tile + 0x20];
255 |         movdqa.x xmm3: vec,  [copy_tile + 0x30];
256 |         movdqa.x xmm4: vec,  [copy_tile + 0x40];
257 |         movdqa.x xmm5: vec,  [copy_tile + 0x50];
258 |         movdqa.x xmm6: vec,  [copy_tile + 0x60];
259 |         movdqa.x xmm7: vec,  [copy_tile + 0x70];
260 |         movdqa.x xmm8: vec,  [copy_tile + 0x80];
261 |         movdqa.x xmm9: vec,  [copy_tile + 0x90];
262 |         movdqa.x xmm10: vec, [copy_tile + 0xa0];
263 |         movdqa.x xmm11: vec, [copy_tile + 0xb0];
264 |         movdqa.x xmm12: vec, [copy_tile + 0xc0];
265 |         movdqa.x xmm13: vec, [copy_tile + 0xd0];
266 |         movdqa.x xmm14: vec, [copy_tile + 0xe0];
267 |         movdqa.x xmm15: vec, [copy_tile + 0xf0];
268 |       }
269 | 
270 |       if r == false then {
271 |         // Difference calculation for the deactivated features
272 |         for k: 0..removed_indices[c].size-1 {
273 |           index := removed_indices[c].values[k] * kHalfDimensions;
274 |           sub_tile := *ft_weights[index + accindex]; // this feature's weights for the current tile
275 |           #asm SSE { // 16-bit lane-wise subtract from the registers loaded above
276 |             psubw.x xmm0,  [sub_tile + 0x00];
277 |             psubw.x xmm1,  [sub_tile + 0x10];
278 |             psubw.x xmm2,  [sub_tile + 0x20];
279 |             psubw.x xmm3,  [sub_tile + 0x30];
280 |             psubw.x xmm4,  [sub_tile + 0x40];
281 |             psubw.x xmm5,  [sub_tile + 0x50];
282 |             psubw.x xmm6,  [sub_tile + 0x60];
283 |             psubw.x xmm7,  [sub_tile + 0x70];
284 |             psubw.x xmm8,  [sub_tile + 0x80];
285 |             psubw.x xmm9,  [sub_tile + 0x90];
286 |             psubw.x xmm10, [sub_tile + 0xa0];
287 |             psubw.x xmm11, [sub_tile + 0xb0];
288 |             psubw.x xmm12, [sub_tile + 0xc0];
289 |             psubw.x xmm13, [sub_tile + 0xd0];
290 |             psubw.x xmm14, [sub_tile + 0xe0];
291 |             psubw.x xmm15, [sub_tile + 0xf0];
292 |           }
293 |         }
294 |       }
295 |      
296 |       // Difference calculation for the activated features
297 |       for k: 0..added_indices[c].size-1 {
298 |         index := added_indices[c].values[k] * kHalfDimensions;
299 |         add_tile := *ft_weights[index + accindex]; // this feature's weights for the current tile
300 |         #asm SSE { // 16-bit lane-wise add into the same registers
301 |           paddw.x xmm0,  [add_tile + 0x00];
302 |           paddw.x xmm1,  [add_tile + 0x10];
303 |           paddw.x xmm2,  [add_tile + 0x20];
304 |           paddw.x xmm3,  [add_tile + 0x30];
305 |           paddw.x xmm4,  [add_tile + 0x40];
306 |           paddw.x xmm5,  [add_tile + 0x50];
307 |           paddw.x xmm6,  [add_tile + 0x60];
308 |           paddw.x xmm7,  [add_tile + 0x70];
309 |           paddw.x xmm8,  [add_tile + 0x80];
310 |           paddw.x xmm9,  [add_tile + 0x90];
311 |           paddw.x xmm10, [add_tile + 0xa0];
312 |           paddw.x xmm11, [add_tile + 0xb0];
313 |           paddw.x xmm12, [add_tile + 0xc0];
314 |           paddw.x xmm13, [add_tile + 0xd0];
315 |           paddw.x xmm14, [add_tile + 0xe0];
316 |           paddw.x xmm15, [add_tile + 0xf0];
317 |         }
318 |       }
319 |      
320 |       #asm SSE { // store the finished tile into nnue[0]'s accumulator
321 |         movdqa.x [acc_tile + 0x00], xmm0;
322 |         movdqa.x [acc_tile + 0x10], xmm1;
323 |         movdqa.x [acc_tile + 0x20], xmm2;
324 |         movdqa.x [acc_tile + 0x30], xmm3;
325 |         movdqa.x [acc_tile + 0x40], xmm4;
326 |         movdqa.x [acc_tile + 0x50], xmm5;
327 |         movdqa.x [acc_tile + 0x60], xmm6;
328 |         movdqa.x [acc_tile + 0x70], xmm7;
329 |         movdqa.x [acc_tile + 0x80], xmm8;
330 |         movdqa.x [acc_tile + 0x90], xmm9;
331 |         movdqa.x [acc_tile + 0xa0], xmm10;
332 |         movdqa.x [acc_tile + 0xb0], xmm11;
333 |         movdqa.x [acc_tile + 0xc0], xmm12;
334 |         movdqa.x [acc_tile + 0xd0], xmm13;
335 |         movdqa.x [acc_tile + 0xe0], xmm14;
336 |         movdqa.x [acc_tile + 0xf0], xmm15; 
337 |       }
338 |       accindex += 128; // 16 registers x 8 s16 lanes
339 |     }
340 |   }
341 |   accumulator.computedAccumulation = 1;
342 |   return true;
343 | }
344 | 
345 | refresh_accumulator :: (chess: *Chess, nnue: [3] *NNUEdata) { // Rebuilds nnue[0]'s accumulator from scratch: biases plus the weights of every active feature.
346 |   accumulator := *(nnue[0].accumulator);
347 |   activeIndices: [2] IndexList;
348 |   activeIndices[0].size = 0;
349 |   activeIndices[1].size = 0;
350 |   append_active_indices(chess, activeIndices);
351 | 
352 |   for c: 0..1 {
353 |     accindex := 0;
354 |     while accindex < 256 { // two tiles of 128 s16 values (256 bytes) per perspective
355 |       tile := *ft_biases[accindex];
356 |       acc_tile := *accumulator.accumulation[c][accindex];
357 |       #asm SSE { // load the bias tile into xmm0..xmm15 (aligned loads)
358 |         movdqa.x xmm0: vec,  [tile + 0x00];
359 |         movdqa.x xmm1: vec,  [tile + 0x10];
360 |         movdqa.x xmm2: vec,  [tile + 0x20];
361 |         movdqa.x xmm3: vec,  [tile + 0x30];
362 |         movdqa.x xmm4: vec,  [tile + 0x40];
363 |         movdqa.x xmm5: vec,  [tile + 0x50];
364 |         movdqa.x xmm6: vec,  [tile + 0x60];
365 |         movdqa.x xmm7: vec,  [tile + 0x70];
366 |         movdqa.x xmm8: vec,  [tile + 0x80];
367 |         movdqa.x xmm9: vec,  [tile + 0x90];
368 |         movdqa.x xmm10: vec, [tile + 0xa0];
369 |         movdqa.x xmm11: vec, [tile + 0xb0];
370 |         movdqa.x xmm12: vec, [tile + 0xc0];
371 |         movdqa.x xmm13: vec, [tile + 0xd0];
372 |         movdqa.x xmm14: vec, [tile + 0xe0];
373 |         movdqa.x xmm15: vec, [tile + 0xf0];
374 |       }
375 |       for k: 0..activeIndices[c].size-1 {
376 |         index := activeIndices[c].values[k];
377 |         offset := kHalfDimensions * index;
378 |         add_tile := *ft_weights[offset + accindex]; // this feature's weights for the current tile
379 |         #asm SSE { // 16-bit lane-wise add into the registers loaded above
380 |           paddw.x xmm0,  [add_tile + 0x00];
381 |           paddw.x xmm1,  [add_tile + 0x10];
382 |           paddw.x xmm2,  [add_tile + 0x20];
383 |           paddw.x xmm3,  [add_tile + 0x30];
384 |           paddw.x xmm4,  [add_tile + 0x40];
385 |           paddw.x xmm5,  [add_tile + 0x50];
386 |           paddw.x xmm6,  [add_tile + 0x60];
387 |           paddw.x xmm7,  [add_tile + 0x70];
388 |           paddw.x xmm8,  [add_tile + 0x80];
389 |           paddw.x xmm9,  [add_tile + 0x90];
390 |           paddw.x xmm10, [add_tile + 0xa0];
391 |           paddw.x xmm11, [add_tile + 0xb0];
392 |           paddw.x xmm12, [add_tile + 0xc0];
393 |           paddw.x xmm13, [add_tile + 0xd0];
394 |           paddw.x xmm14, [add_tile + 0xe0];
395 |           paddw.x xmm15, [add_tile + 0xf0];
396 |         }
397 |       }
398 |       #asm SSE { // store the finished tile into the accumulator
399 |         movdqa.x [acc_tile + 0x00], xmm0;
400 |         movdqa.x [acc_tile + 0x10], xmm1;
401 |         movdqa.x [acc_tile + 0x20], xmm2;
402 |         movdqa.x [acc_tile + 0x30], xmm3;
403 |         movdqa.x [acc_tile + 0x40], xmm4;
404 |         movdqa.x [acc_tile + 0x50], xmm5;
405 |         movdqa.x [acc_tile + 0x60], xmm6;
406 |         movdqa.x [acc_tile + 0x70], xmm7;
407 |         movdqa.x [acc_tile + 0x80], xmm8;
408 |         movdqa.x [acc_tile + 0x90], xmm9;
409 |         movdqa.x [acc_tile + 0xa0], xmm10;
410 |         movdqa.x [acc_tile + 0xb0], xmm11;
411 |         movdqa.x [acc_tile + 0xc0], xmm12;
412 |         movdqa.x [acc_tile + 0xd0], xmm13;
413 |         movdqa.x [acc_tile + 0xe0], xmm14;
414 |         movdqa.x [acc_tile + 0xf0], xmm15; 
415 |       }
416 |       accindex += 128; // 16 registers x 8 s16 lanes
417 |     }
418 |   }
419 |   accumulator.computedAccumulation = 1;
420 | }
421 | 
422 | append_active_indices :: (chess: *Chess, active: []IndexList) { // Appends the active features of both perspectives to `active`.
423 |   half_kp_append_active_indices(chess, chess.w_king, 0, *active[0]); // perspective 0 is keyed on chess.w_king
424 |   half_kp_append_active_indices(chess, chess.b_king, 1, *active[1]); // perspective 1 is keyed on chess.b_king
425 | }
426 | 
427 | // Gathers the feature changes that bring nnue[0]'s accumulator up to date. When
428 | // nnue[1] is already computed only nnue[0]'s dirty pieces are replayed, otherwise
429 | // those of nnue[0] and nnue[1] both are. reset[c] becomes true when the first dirty
430 | // piece of a replayed ply is perspective c's king (code 1 / 7); added[c] then
431 | // receives the full active feature list instead of a delta.
432 | append_changed_indices :: (chess: *Chess, nnue: [3] *NNUEdata, removed: [] IndexList, added:[] IndexList, reset: [] bool) {
433 |   newest := *nnue[0].dirtyPiece;
434 |   older  := *nnue[1].dirtyPiece;   // only read when two plies have to be replayed
435 |   single_ply := nnue[1].accumulator.computedAccumulation != 0;
436 |   for view: 0..1 {
437 |     side: s32 = xx view;
438 |     king := ifx side == 0 then chess.w_king else chess.b_king;
439 |     king_code: s32 = 1;
440 |     if side != 0  king_code = 7;
441 |     ksq := cast(s32) bsf(king);
442 |     reset[side] = newest.pc[0] == king_code || (!single_ply && older.pc[0] == king_code);
443 |     if reset[side] {
444 |       half_kp_append_active_indices(chess, king, side, *added[side]);
445 |       continue;
446 |     }
447 |     half_kp_append_changed_indices(ksq, side, newest, *removed[side], *added[side]);
448 |     if !single_ply  half_kp_append_changed_indices(ksq, side, older, *removed[side], *added[side]);
449 |   }
450 | }
477 | 
478 | // Appends one feature index per non-king piece on the board to `active`, as seen
479 | // from perspective `c` whose king bitboard is `king`.
480 | half_kp_append_active_indices :: (chess: *Chess, king: u64, c: s32, active: *IndexList) {
481 |   king_sq := cast(s32) bsf(king);
482 |   king_bucket := orient(c, king_sq) * PS_END;
483 |   remaining := chess.occupied ^ (chess.w_king | chess.b_king);   // drop both kings
484 |   while remaining != 0 {
485 |     square := cast(s32) bsf(remaining);
486 |     remaining &= remaining - 1;   // clear the lowest set bit
487 |     piece := cast(s32) chess.pieces[square];
488 |     active.values[active.size] = make_index(xx c, square, piece, king_bucket);
489 |     active.size += 1;
490 |   }
491 | }
492 | 
493 | bsf :: (value: u64) -> int #expand { // Macro wrapper around the x86 `bsf` (bit scan forward) instruction.
494 |   result: int = 0;
495 |   #asm { bsf.q result, value; } // 64-bit form: index of the lowest set bit of `value`; NOTE(review): the ISA leaves the destination undefined for value == 0
496 |   return result;
497 | }
498 | 
499 | // Turns the piece moves recorded in `dp` into feature deltas for perspective `c`
500 | // with king square `ksq`. Kings (codes 1, 7) and from/to squares equal to 64 are skipped.
501 | half_kp_append_changed_indices :: (ksq: s32, c: s32, dp: DirtyPiece, removed: *IndexList, added: *IndexList) {
502 |   king_bucket := orient(c, ksq) * PS_END;
503 |   for slot: 0..dp.dirtyNum-1 {
504 |     piece := dp.pc[slot];
505 |     if piece == 1 || piece == 7  continue;
506 |     origin := dp.from[slot];
507 |     if origin != 64 {
508 |       removed.values[removed.size] = make_index(c, origin, piece, king_bucket);
509 |       removed.size += 1;
510 |     }
511 |     target := dp.to[slot];
512 |     if target != 64 {
513 |       added.values[added.size] = make_index(c, target, piece, king_bucket);
514 |       added.size += 1;
515 |     }
516 |   }
517 | }
518 | 
519 | make_index :: (c: s32, s: s32, pc: s32, ksq: s32) -> s32 #expand { // feature index = king bucket + piece plane + oriented square
520 |   return ksq + PieceToIndex[c][pc] + orient(c, s);
521 | }
522 | 
523 | // Returns square `s` unchanged for perspective 0 and `s ^ 0x3F` for any other
524 | // perspective (for squares 0..63 that is 63 - s, a 180 degree board rotation).
525 | orient :: (c: s32, s: s32) -> s32 #expand {
526 |   mask: s32 = 0x3F;
527 |   if c == 0  mask = 0;
528 |   return s ^ mask;
529 | }
530 | 
531 | PS_W_PAWN   ::  1; // start of each piece type's block of 64 square slots; index 0 is left unused
532 | PS_B_PAWN   ::  1*64 + 1;
533 | PS_W_KNIGHT ::  2*64 + 1;
534 | PS_B_KNIGHT ::  3*64 + 1;
535 | PS_W_BISHOP ::  4*64 + 1;
536 | PS_B_BISHOP ::  5*64 + 1;
537 | PS_W_ROOK   ::  6*64 + 1;
538 | PS_B_ROOK   ::  7*64 + 1;
539 | PS_W_QUEEN  ::  8*64 + 1;
540 | PS_B_QUEEN  ::  9*64 + 1;
541 | PS_END      :: 10*64 + 1; // 641: one past the last slot; also the per-king-square stride (ksq * PS_END) and a factor of FtInDims
542 | 
543 | PieceToIndex: [2][14] s32 = .[ // [perspective][piece code] -> base offset used by make_index; codes 0, 1, 7 and 13 map to 0
544 |   s32.[0, 0, PS_W_QUEEN, PS_W_ROOK, PS_W_BISHOP, PS_W_KNIGHT, PS_W_PAWN, // perspective 0: codes 2..6
545 |        0, PS_B_QUEEN, PS_B_ROOK, PS_B_BISHOP, PS_B_KNIGHT, PS_B_PAWN, 0], // perspective 0: codes 8..12
546 |   s32.[ 0, 0, PS_B_QUEEN, PS_B_ROOK, PS_B_BISHOP, PS_B_KNIGHT, PS_B_PAWN, // perspective 1: same codes with the W/B planes swapped
547 |        0, PS_W_QUEEN, PS_W_ROOK, PS_W_BISHOP, PS_W_KNIGHT, PS_W_PAWN, 0]
548 | ];
549 | 
550 | transform :: (chess: *Chess, nnue: [3] *NNUEdata, output: *s8, out_mask: *u32) { // Updates nnue[0]'s accumulator and packs it to s8 in `output`; `out_mask` is not written here.
551 |   if !update_accumulator(chess, nnue) then
552 |     refresh_accumulator(chess, nnue); // incremental update not possible: rebuild from scratch
553 |   accumulation: [][256] s16 = nnue[0].accumulator.accumulation;
554 | 
555 |   // 128 bit xmm register
556 |   // 8 bit data
557 |   // 128 / 8 => 16 numbers at a time.
558 |   // 256 numbers total / 16 numbers per SIMD = 16 times.
559 |   data  := *output[0];
560 |   turn := chess.turn;
561 |   for p: 0..1 { // accumulation[chess.turn] is written first, then the other perspective
562 |     accindex := 0;
563 |     while accindex < 256 { // 128 s16 values in, 128 s8 values out per iteration
564 |       accum := *accumulation[turn][accindex];
565 |       #asm SSE { // NOTE(review): packsswb saturates to -128..127, so negatives are not clamped to 0 here (the scalar version clamps to 0..127) - confirm affine_txfm handles that
566 |         xmm0: vec; xmm1: vec; xmm2: vec; xmm3: vec;
567 |         xmm4: vec; xmm5: vec; xmm6: vec; xmm7: vec;
568 |         movaps.x   xmm0, [accum + 0x00];
569 |         packsswb.x xmm0, [accum + 0x10];
570 |         movaps.x   xmm1, [accum + 0x20];
571 |         packsswb.x xmm1, [accum + 0x30];
572 |         movaps.x   xmm2, [accum + 0x40];
573 |         packsswb.x xmm2, [accum + 0x50];
574 |         movaps.x   xmm3, [accum + 0x60];
575 |         packsswb.x xmm3, [accum + 0x70];
576 |         movaps.x   xmm4, [accum + 0x80];
577 |         packsswb.x xmm4, [accum + 0x90];
578 |         movaps.x   xmm5, [accum + 0xa0];
579 |         packsswb.x xmm5, [accum + 0xb0];
580 |         movaps.x   xmm6, [accum + 0xc0];
581 |         packsswb.x xmm6, [accum + 0xd0];
582 |         movaps.x   xmm7, [accum + 0xe0];
583 |         packsswb.x xmm7, [accum + 0xf0];
584 |         movups.x   [data + 0x00], xmm0;
585 |         movups.x   [data + 0x10], xmm1;
586 |         movups.x   [data + 0x20], xmm2;
587 |         movups.x   [data + 0x30], xmm3;
588 |         movups.x   [data + 0x40], xmm4;
589 |         movups.x   [data + 0x50], xmm5;
590 |         movups.x   [data + 0x60], xmm6;
591 |         movups.x   [data + 0x70], xmm7;
592 |         add data, 0x80;
593 |       }
594 |       accindex += 128;
595 |     }
596 | 
597 |     turn ^= 1; // switch to the other perspective
598 |   }
599 | }
600 | 
601 | affine_txfm :: (input: *s8, output: *void, inDims: u32, biases: *s32, weights: *s8) #expand {
602 |   // Sparse affine layer with 32 outputs: start from 32 biases, add input[i] * weights[i*32 .. i*32+31] for every input byte > 0, then saturate each sum to 16 bits, shift right by 6, clamp to 0..127 and store it as a byte in `output`.
603 |   // GCC -O3 "optimized" output
604 |   // terrible scrambled eggs output, but faster than CPU w/o SIMD
605 |   // the SSE code is a bit difficult to translate.
606 |   sse_simd :: (factor: s32, tmp: *s32, weights: *s8) #expand {  // tmp[0..31] += factor * weights[0..31], the s8 weights being sign-extended to 32 bits first
607 |     edx := factor;
608 |     rsi := tmp;
609 |     rdi := weights;
610 |     #asm {
611 |       movdqa.x    xmm1:, [rdi];  // weights[0..15]; movdqa needs [rdi] to be 16-byte aligned
612 |       pxor.x      xmm6:, xmm6;  // xmm6 = 0
613 |       pxor.x      xmm7:, xmm7;  // xmm7 = 0
614 |       movd        xmm5:,  edx;
615 |       movdqa.x    xmm2:, xmm6;
616 |       movdqa.x    xmm9:, xmm7;
617 |       pshufd.x    xmm0:, xmm5, 0;  // xmm0 = factor in all four 32-bit lanes
618 |       pcmpgtb.x   xmm2, xmm1;  // byte-wise sign mask (0 > weight)
619 |       movdqa.x    xmm3:, xmm1;
620 |       movdqa.x    xmm5, xmm0;
621 |       psrlq.x     xmm5, 32;  // factor in the even lanes only, used for the odd-lane products
622 |       movdqa.x    xmm10:, xmm7;
623 |       punpcklbw.x xmm3, xmm2;  // weights 0..7 sign-extended to 16 bits
624 |       punpckhbw.x xmm1, xmm2;  // weights 8..15 sign-extended to 16 bits
625 |       pcmpgtw.x   xmm9, xmm3;  // word-wise sign masks for the next widening step
626 |       pcmpgtw.x   xmm10, xmm1;
627 |       movdqa.x    xmm2, xmm3;
628 |       punpckhwd.x xmm2, xmm9;  // weights 4..7 sign-extended to 32 bits
629 |       movdqa.x    xmm4:, xmm2;
630 |       psrlq.x     xmm2, 32;
631 |       pmuludq.x   xmm4, xmm0;  // pmuludq on even and odd lanes + pshufd 8 + punpckldq = low 32 bits of each 32x32 product
632 |       pmuludq.x   xmm2, xmm5;
633 |       pshufd.x    xmm4, xmm4, 8;
634 |       pshufd.x    xmm2, xmm2, 8;
635 |       punpckldq.x xmm4, xmm2;
636 |       movdqu.x    xmm2, [rsi+16];
637 |       paddd.x     xmm4, xmm2;
638 |       movdqa.x    xmm2, xmm1;
639 |       punpckhwd.x xmm1, xmm10;
640 |       punpcklwd.x xmm2, xmm10;
641 |       movdqa.x    xmm10, xmm1;
642 |       movups.x    [rsi+16], xmm4;  // tmp[4..7] updated
643 |       movdqa.x    xmm8:, xmm2;
644 |       psrlq.x     xmm2, 32;
645 |       pmuludq.x   xmm8, xmm0;
646 |       pmuludq.x   xmm2, xmm5;
647 |       pmuludq.x   xmm10, xmm0;
648 |       pshufd      xmm2, xmm2, 8;
649 |       pshufd      xmm8, xmm8, 8;
650 |       punpckldq.x xmm8, xmm2;
651 |       movdqu.x    xmm2, [rsi+32];
652 |       paddd.x     xmm8, xmm2;
653 |       movdqa.x    xmm2, xmm1;
654 |       pshufd      xmm1, xmm10, 8;
655 |       psrlq       xmm2, 32;
656 |       movups.x    [rsi+32], xmm8;  // tmp[8..11] updated
657 |       movdqa.x    xmm8, xmm7;  // xmm8 = 0 again (xmm7 is still zero here)
658 |       pmuludq xmm2, xmm5;
659 |       pshufd  xmm2, xmm2, 8;
660 |       punpckldq       xmm1, xmm2;
661 |       movdqu.x  xmm2, [rsi+48];
662 |       paddd   xmm2, xmm1;
663 |       movdqa  xmm1, xmm3;
664 |       punpcklwd       xmm1, xmm9;
665 |       movups.x [rsi+48], xmm2;  // tmp[12..15] updated
666 |       movdqa  xmm3, xmm1;
667 |       psrlq   xmm1, 32;
668 |       pmuludq xmm3, xmm0;
669 |       pmuludq xmm1, xmm5;
670 |       pshufd  xmm3, xmm3, 8;
671 |       pshufd  xmm1, xmm1, 8;
672 |       punpckldq       xmm3, xmm1;
673 |       movdqu.x  xmm1, [rsi];
674 |       paddd.x   xmm1, xmm3;
675 |       movdqu.x  xmm3, [rsi+80];
676 |       movups.x  [rsi], xmm1;  // tmp[0..3] updated
677 |       movdqa.x  xmm1, [rdi+16];  // weights[16..31]: the same widen / multiply / accumulate sequence follows
678 |       pcmpgtb xmm6, xmm1;
679 |       movdqa  xmm2, xmm6;
680 |       movdqa  xmm6, xmm1;
681 |       punpcklbw       xmm6, xmm2;
682 |       punpckhbw       xmm1, xmm2;
683 |       pcmpgtw xmm8, xmm6;
684 |       pcmpgtw xmm7, xmm1;
685 |       movdqa  xmm2, xmm6;
686 |       punpckhwd       xmm2, xmm8;
687 |       movdqa  xmm4, xmm2;
688 |       psrlq   xmm2, 32;
689 |       pmuludq xmm4, xmm0;
690 |       pmuludq xmm2, xmm5;
691 |       pshufd  xmm4, xmm4, 8;
692 |       pshufd  xmm2, xmm2, 8;
693 |       punpckldq       xmm4, xmm2;
694 |       movdqa  xmm2, xmm1;
695 |       punpckhwd       xmm1, xmm7;
696 |       punpcklwd       xmm2, xmm7;
697 |       paddd   xmm4, xmm3;
698 |       movdqa  xmm7, xmm1;
699 |       movdqa  xmm3, xmm2;
700 |       psrlq   xmm2, 32;
701 |       movups.x  [rsi+80], xmm4;  // tmp[20..23] updated
702 |       pmuludq xmm3, xmm0;
703 |       pmuludq xmm2, xmm5;
704 |       pmuludq xmm7, xmm0;
705 |       pshufd  xmm2, xmm2, 8;
706 |       pshufd  xmm3, xmm3, 8;
707 |       punpckldq       xmm3, xmm2;
708 |       movdqu.x  xmm2, [rsi+96];
709 |       paddd   xmm3, xmm2;
710 |       movdqa  xmm2, xmm1;
711 |       pshufd  xmm1, xmm7, 8;
712 |       psrlq   xmm2, 32;
713 |       movups.x  [rsi+96], xmm3;  // tmp[24..27] updated
714 |       pmuludq xmm2, xmm5;
715 |       pshufd  xmm2, xmm2, 8;
716 |       punpckldq       xmm1, xmm2;
717 |       movdqu.x  xmm2, [rsi+112];
718 |       paddd   xmm2, xmm1;
719 |       movdqa  xmm1, xmm6;
720 |       movdqu.x  xmm6, [rsi+64];
721 |       punpcklwd       xmm1, xmm8;
722 |       movups  [rsi+112], xmm2;  // tmp[28..31] updated
723 |       pmuludq xmm0, xmm1;
724 |       psrlq   xmm1, 32;
725 |       pmuludq xmm1, xmm5;
726 |       pshufd  xmm0, xmm0, 8;
727 |       pshufd  xmm1, xmm1, 8;
728 |       punpckldq       xmm0, xmm1;
729 |       paddd   xmm0, xmm6;
730 |       movups  [rsi+64], xmm0;  // tmp[16..19] updated
731 |     }
732 |   }
733 | 
734 | 
735 |   tmp: [32] s32;  // 32-bit accumulators, one per output
736 |   memcpy(*tmp[0], *biases[0], size_of(s32) * 32);  // accumulators start from the 32 biases
737 | 
738 |   offset := 0;
739 |   mask: u32 = 0;
740 |   input_pointer := input;
741 |   #asm SSE {
742 |     pxor.x zeroes: vec, zeroes;  // all-zero vector used by the "> 0" compare below
743 |   }
744 | 
745 |   while offset < inDims {
746 |     // input: 16 bytes per pass; bit i of mask is set when input byte (offset + i) is > 0 (signed compare)
747 |     #asm SSE, SSE2 {
748 |       movups.x   xmm0: vec, [input_pointer];
749 |       pcmpgtb.x  xmm0, zeroes;
750 |       pmovmskb.x mask, xmm0;
751 |       add input_pointer, 16;
752 |     }
753 | 
754 |     while mask {
755 |       idx: int;
756 |       #asm SSE {
757 |         bsf idx, mask;  // position of the lowest set bit
758 |         add idx, offset;  // -> absolute input index
759 |       }
760 |       factor: s32 = input[idx];
761 |       index := idx << 5;  // idx * 32.
762 |       sse_simd(factor, *tmp[0], *weights[index]);
763 |       mask &= mask - 1;  // clear the lowest set bit
764 |     }
765 | 
766 |     offset += 16;
767 |   }
768 | 
769 | 
770 |   #asm SSE2 {
771 |     mov.d    reg: gpr, 0x00_7f_00_7f;
772 |     movd     xmm_127: vec, reg;
773 |     pshufd.x xmm_127, xmm_127, 0;  // every 16-bit lane = 127
774 |     pxor.x   xmm_000: vec, xmm_000;  // zero vector
775 |   }
776 | 
777 |   outVec := output;
778 |   tmp_data := tmp.data;
779 |   for #v2 < 0..7 {  // range 0..7: eight passes, four accumulators per pass
780 |     #asm SSE {
781 |       movups.x   xmm_relu: vec, [tmp_data];
782 |       packssdw.x xmm_relu, xmm_000;  // 4 x s32 -> s16 with signed saturation
783 |       psraw.x    xmm_relu, 6;  // arithmetic shift right by 6
784 |       pmaxsw.x   xmm_relu, xmm_000;  // clamp below at 0
785 |       pminsw.x   xmm_relu, xmm_127;  // clamp above at 127
786 |       packsswb.x xmm_relu, xmm_000;  // -> 4 result bytes followed by zero bytes
787 |       movups.x   [outVec], xmm_relu;  // NOTE(review): stores 16 bytes while outVec advances by 4, so the last pass writes up to output+44 -- confirm the buffer has that much room
788 |       add        tmp_data, 0x10;
789 |       add        outVec,   0x04;
790 |     }
791 |   }
792 | }
793 | 
794 | affine_propagate :: (input: *s8, biases: s32, weights: *s8) -> s32 #expand {  // returns biases + the dot product of 32 input bytes with 32 weight bytes
795 |   eax: s32 = 0x0001_0001;  // two 16-bit ones; broadcast below as the pmaddwd multiplier
796 |   #asm SSE, SSE2, SSE3, SSE4_1 {
797 |     movups.x    xmm0: vec, [input];  // input bytes 0..15
798 |     movups.x    xmm1: vec, [input + 0x10];  // input bytes 16..31
799 |     pmaddubsw.x xmm0, [weights];  // input bytes taken as unsigned, weights as signed; adjacent products summed to s16 with saturation. NOTE(review): memory operand presumably 16-byte aligned -- confirm
800 |     pmaddubsw.x xmm1, [weights + 0x10];
801 |     movd        ones_xmm: vec, eax;
802 |     pshufd      ones_xmm, ones_xmm, 0x0;  // every 16-bit lane = 1
803 |     pmaddwd.x   xmm0, ones_xmm;  // multiply by 1 and add adjacent s16 pairs into s32 lanes
804 |     pmaddwd.x   xmm1, ones_xmm;
805 |     paddd.x     xmm0, xmm1;
806 |     pshufd      xmm1, xmm0, 0x1b;  // 0x1b reverses the four lanes
807 |     paddd.x     xmm0, xmm1;  // lane 0 = a0+a3, lane 1 = a1+a2
808 |     movd        eax, xmm0;
809 |     pextrd      val: gpr, xmm0, 1;
810 |     add         eax, val;  // complete horizontal sum
811 |     add         eax, biases;
812 |   }
813 | 
814 |   return eax;
815 | }
816 | 
817 | #import "Basic";
818 | #import "File";
819 | 
820 | 


--------------------------------------------------------------------------------
/resources/AnonymousPro.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieltan1517/chess-jai/cdb560927f4a9cc02ccd91480f24f47f59609114/resources/AnonymousPro.ttf


--------------------------------------------------------------------------------
/resources/capture.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieltan1517/chess-jai/cdb560927f4a9cc02ccd91480f24f47f59609114/resources/capture.wav


--------------------------------------------------------------------------------
/resources/chess_pieces.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieltan1517/chess-jai/cdb560927f4a9cc02ccd91480f24f47f59609114/resources/chess_pieces.png


--------------------------------------------------------------------------------
/resources/move.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieltan1517/chess-jai/cdb560927f4a9cc02ccd91480f24f47f59609114/resources/move.wav


--------------------------------------------------------------------------------
/resources/nn-04cf2b4ed1da.nnue:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieltan1517/chess-jai/cdb560927f4a9cc02ccd91480f24f47f59609114/resources/nn-04cf2b4ed1da.nnue


--------------------------------------------------------------------------------
/resources/settings_icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieltan1517/chess-jai/cdb560927f4a9cc02ccd91480f24f47f59609114/resources/settings_icon.png


--------------------------------------------------------------------------------
/search.jai:
--------------------------------------------------------------------------------
   1 | #import "Math";
   2 | #import "Basic";
   3 | #import "Thread";
   4 | 
   5 | lazy_smp_threads: Thread_Group;  // helper-thread group; set_threads (re)initialises it with search_thread as the work procedure
   6 | num_threads: int = 0;  // number of helper threads; set_threads stores num_wanted - 1 because the caller of uci_search searches too
   7 | chess_work: [] ChessGame;  // one ChessGame per helper thread, allocated in set_threads and filled in uci_search
   8 | 
   9 | free_threads :: () {
  10 |   if num_threads <= 0 return;  // no helper threads to shut down
  11 |   shutdown(*lazy_smp_threads);
  12 | }
  13 | 
  14 | uci_search :: (chess: *ChessGame) -> bestmove: Move16 {
  15 |   time_begin = seconds_since_init();
  16 |   nodes_searched = 0;
  17 |   // Give every helper its own copy of the position, but only on max difficulty and only
  18 |   // when helper threads exist. Count what is queued so the wait below cannot spin forever.
  19 |   work_remaining := 0;
  20 |   if difficulty == 8 && num_threads > 0 {
  21 |     for *chess_work {
  22 |       copy_chessgame(it, chess);
  23 |       add_work(*lazy_smp_threads, it);
  24 |       work_remaining += 1;
  25 |     }
  26 |     start(*lazy_smp_threads);  // start the lazy SMP threads!
  27 |   }
  28 | 
  29 |   bestmove := search(chess);  // the calling thread searches as well
  30 | 
  31 |   // make sure the queued work is finished before we report bestmove.
  32 |   while work_remaining > 0 {
  33 |     results := get_completed_work(*lazy_smp_threads);
  34 |     work_remaining -= results.count;
  35 |     reset_temporary_storage();
  36 |   }
  37 | 
  38 |   return bestmove;
  39 | }
  40 | 
  41 | set_threads :: (num_wanted: int) {
  42 |   // The thread that calls uci_search searches too, so only num_wanted - 1 helpers are created.
  43 |   // A request of 0 or less is clamped to "no helpers" instead of producing a negative count.
  44 |   helpers := max(num_wanted - 1, 0);
  45 |   if num_threads == helpers then
  46 |     return;
  47 |   if num_threads > 0 then shutdown(*lazy_smp_threads);
  48 | 
  49 |   // initialize the threads
  50 |   num_threads = helpers;
  51 |   init(*lazy_smp_threads, cast(s32)num_threads, search_thread, false);
  52 |   lazy_smp_threads.logging = false;
  53 | 
  54 |   // release the per-helper games of the previous configuration
  55 |   for *chess_work {
  56 |     free_chess_game(it);
  57 |   }
  58 |   if chess_work.count
  59 |     array_free(chess_work);
  60 | 
  61 |   chess_work = NewArray(num_threads, ChessGame, alignment=64);  // initialize the workers
  62 |   for *chess_work {
  63 |     it.main_thread = false;
  64 |     initialize_chess_game_memory(it);
  65 |   }
  66 | }
  67 | 
  68 | set_multi_pv :: (number: int) {
  69 |   multi_pv = max(number, 1);  // at least one line: with 0 or less the loop in search() would never analyse a move
  70 | }
  71 | 
  72 | set_difficulty :: (number: int) {
  73 |   difficulty = max(1, min(number, 8));  // keep within 1..8: levels below 8 index 7-entry tables with difficulty-1
  74 | }
  75 | 
  76 | difficulty: int = 8;  // 8 = full strength (helper threads, best move only); other values index the per-level tables with difficulty-1
  77 | 
  78 | search_thread :: (group: *Thread_Group, thread: *Thread, work: *void) -> Thread_Continue_Status {
  79 |   // Thread_Group work procedure: `work` is the ChessGame queued by uci_search; the move search() returns is discarded.
  80 |   search(cast(*ChessGame) work);
  81 |   return .CONTINUE;
  82 | }
  83 | 
  84 | // main search function. iterative deepening.
  85 | search :: (chess: *ChessGame) -> bestmove: Move16 {
  86 |   // Iterative deepening driver. For each depth it searches num_multi_pv lines through negamax_root inside an aspiration window, widening the window and retrying the same line after a fail low/high. The loop ends on stop(), the node budget or the depth limit; pick_move chooses the move to return.
  87 |   print_stats :: (depth: int, line: int) #expand {
  88 |     if chess.main_thread == false return;  // only the main thread prints info lines
  89 | 
  90 |     builder: String_Builder;
  91 |     builder.allocator = temp;
  92 |     t_ms := cast(int)(`time_taken*1000.0); // convert secs to ms
  93 |     nps  := cast(int)(nodes_searched/(`time_taken+0.0001));  // +0.0001 avoids dividing by zero
  94 |     print_to_builder(*builder, "info depth % seldepth % ", depth, chess.maxply);
  95 |     if score >= -INF+20 && score <= INF-20 then {
  96 |       print_to_builder(*builder, "nodes % time % score cp % nps % multipv % pv", nodes_searched, t_ms, score, nps, line);
  97 |     } else {
  98 |       mate := INF - abs(score);  // distance to mate in plies
  99 |       mate = (mate/2) + (mate & 1);  // plies -> full moves, rounding up
 100 |       if score < 0 then mate = -mate;  // UCI: the mate distance is negative when the engine is the side being mated
 101 |       print_to_builder(*builder, "nodes % time % score mate % nps % multipv % pv", nodes_searched, t_ms, mate, nps, line);
 102 |     }
 103 |     count := chess.history.pv_table[0].move_count - 1;
 104 |     for i: 0..count {
 105 |       move := chess.history.pv_table[0][i];
 106 |       if move == Move16.Quiet break;  // do not print out NULL moves.
 107 |       append(*builder, #char " ");
 108 |       append(*builder, move);
 109 |     }
 110 |     str := builder_to_string(*builder,, allocator=__temporary_allocator);
 111 |     print("%1%2", str, NEWLINE);
 112 |   }
 113 |   clear_history(chess);
 114 |   if chess.maxdepth == -1 then
 115 |     chess.maxdepth = S64_MAX; // set maxdepth arbitrarily high to make it loop infinitely.
 116 |   if chess.maxnodes == -1 then
 117 |     chess.maxnodes = S64_MAX; // set maxnodes arbitrarily high to make it loop infinitely.
 118 |   if chess.movetime == -1 then
 119 |     chess.movetime = S64_MAX; // set movetime arbitrarily high to make it loop infinitely.
 120 |   // this is basically a fancy wrapper around negamax, negamax does all the work, this just sets everything up.
 121 |   score : int; 
 122 |   alpha: int = -INF+1;
 123 |   beta:  int =  INF-1;
 124 |   delta := 50;  // aspiration half-width; it grows after every fail low/high and is not reset between depths
 125 |   cur_depth := 1;
 126 | 
 127 |   maxdepth := set_maxdepth(chess.maxdepth);
 128 |   maxnodes := chess.maxnodes;
 129 | 
 130 |   root_moves: MoveQueue;  // best move of each MultiPV line found at the current depth
 131 |   best_score := -INF;
 132 |   best_move  := Move16.Quiet;
 133 | 
 134 |   num_multi_pv := get_num_multipv(chess);
 135 | 
 136 |   while outer_loop := cur_depth <= maxdepth { 
 137 |     line := 1;
 138 |     root_moves.count = 0;
 139 |     while line <= num_multi_pv {
 140 |       if cur_depth <= 3 then {  // shallow iterations always use the full window
 141 |         alpha = -INF;
 142 |         beta = INF;
 143 |       }
 144 |       reset_temporary_storage();
 145 |       chess.ply = 0;
 146 |       chess.node_state = NodeState.NULL | NodeState.SSE;
 147 |       chess.maxply = 0; 
 148 |       chess.depth = cur_depth;
 149 |       score = negamax_root(chess, *root_moves, line, cur_depth, alpha, beta, chess.fifty);
 150 |       time_taken := seconds_since_init() - time_begin;  // read by print_stats through `time_taken
 151 |       if stop() || nodes_searched >= maxnodes then 
 152 |         break outer_loop;
 153 |  
 154 |       if score <= alpha then {
 155 |         // fail low: the true score is at most `score`, i.e. an upper bound. Widen downwards and retry this line.
 156 |         if chess.main_thread == true print("info depth %1 upperbound %2%3", cur_depth, score, NEWLINE);
 157 |         alpha = max(score-delta, -INF);
 158 |         delta += delta + delta / 5;
 159 |         continue;
 160 |       }
 161 |  
 162 |       if score >= beta then {
 163 |         // fail high: the true score is at least `score`, i.e. a lower bound. Widen upwards and retry this line.
 164 |         if chess.main_thread == true print("info depth %1 lowerbound %2%3", cur_depth, score, NEWLINE);
 165 |         beta = min(score+delta, INF);
 166 |         delta += delta + delta / 5;
 167 |         continue;
 168 |       }
 169 |  
 170 |       alpha = max(score - delta, -INF);  // exact score: centre the next window on it
 171 |       beta  = min(score + delta, INF);
 172 |  
 173 |       chess.score = cast, no_check(s16)score;
 174 | 
 175 |       if line == 1 {
 176 |         best_score = cast(s16) score;
 177 |         best_move  = get_bestmove(chess);
 178 |       } else {
 179 |         if score > best_score then {
 180 |           best_score = cast(s16) score;
 181 |           best_move  = get_bestmove(chess);
 182 |         }
 183 |       }
 184 | 
 185 |       print_stats(cur_depth, line);
 186 |       line += 1;
 187 |     }
 188 | 
 189 |     cur_depth += 1;
 190 |   }
 191 | 
 192 |   search_age += 1;
 193 |   return pick_move(best_move, best_score, num_multi_pv, *root_moves);
 194 | 
 195 | }
 196 | 
 197 | count_moves :: (chess: *Chess) -> int {
 198 |   generated: Moves(true);  // scratch list filled by generate_moves; only its length is reported
 199 |   generate_moves(chess, *generated);
 200 |   return generated.count;
 201 | }
 202 | 
 203 | set_maxdepth :: (maxdepth: int) -> int {
 204 |   // Depth limit for the iterative-deepening loop, chosen from the global difficulty.
 205 |   DEPTH_DIFFICULTY :: int.[1, 2, 3, 4, 5, 6, 9];
 206 |   // Every level other than 8 gets its fixed table depth; the caller's maxdepth is not consulted.
 207 |   if difficulty != 8 return DEPTH_DIFFICULTY[difficulty-1];
 208 |   // Level 8: honour the requested depth, capped at 1000.
 209 |   return min(maxdepth, 1000);
 210 | }
 211 | 
 212 | pick_move :: (bestmove: Move16, topscore: s32, nummultipv: int, moves: *MoveQueue) -> Move16 {
 213 |   // Chooses the move to play. Difficulty 8 returns bestmove unchanged; lower levels add a random bonus to each root move's stored score and may pick a weaker move.
 214 |   ERROR :: int.[512, 256, 175, 128, 75, 50, 20];  // size of the random bonus range per level
 215 |   BLUNDERING :: u64.[50, 25, 25, 10, 10, 5, 2];  // percent chance per level of widening that range by 256
 216 | 
 217 |   if difficulty == 8 then {
 218 |     // only looking for best move.
 219 |     return bestmove;
 220 |   } else {
 221 | 
 222 |     // pick randomly between the different suboptimal moves.
 223 |     // used to vary the computer difficulty and create
 224 |     // "different difficulty levels" for the
 225 |     // chess engine.
 226 |     // the greater the margin_error, the greater the chance of the computer making a 
 227 |     // mistake.
 228 | 
 229 |     margin_error := ERROR[difficulty-1];
 230 |     print("info string difficulty %1%2", difficulty, NEWLINE);
 231 | 
 232 |     if (random_u64(*move_rng) % 100) < BLUNDERING[difficulty-1] {
 233 |       // give the computer a random chance to blunder, increase error margin.
 234 |       margin_error += 256;
 235 |       print("info string blunder move%1", NEWLINE);
 236 |     }
 237 | 
 238 |     move := bestmove;
 239 |     i := 0;
 240 |     while i < moves.count {
 241 |       score: int = moves.array[i].priority;
 242 |       margin: int = cast(int)(random_u64(*move_rng) % cast(u64)margin_error);  // reduce in u64 first: casting the raw u64 to int overflows (or goes negative) for half of all values
 243 |       score += margin;
 244 |       if score > topscore {  // the last move whose boosted score beats topscore wins
 245 |         move = moves.array[i].move;
 246 |       }
 247 |       i += 1;
 248 |     }
 249 | 
 250 |     return move;
 251 |   }
 252 | }
 253 | 
 254 | move_rng: PRNG = ---;  // left uninitialised here; initialize_move_randomness sets its seed, pick_move draws from it
 255 | 
 256 | get_num_multipv :: (chess: *Chess) -> int {
 257 |   // Number of PV lines search() analyses per depth: the configured multi_pv, raised to the
 258 |   // per-level minimum below difficulty 8, and never more than the moves available.
 259 |   LINES :: int.[8, 5, 4, 3, 3, 3, 2, 1];
 260 |   available := count_moves(chess);
 261 |   wanted := multi_pv;
 262 |   if difficulty != 8 then
 263 |     wanted = max(LINES[difficulty-1], wanted);
 264 | 
 265 |   wanted = min(available, wanted);
 266 |   return wanted;
 267 | }
 268 | 
 269 | initialize_move_randomness :: () {
 270 |   stamp := current_time_consensus();  // time value read through its .low and .high parts
 271 |   move_rng.seed = (cast, no_check(u64)stamp.low) ^ (cast, no_check(u64)stamp.high);  // fold both parts into the seed
 272 | }
 273 | 
 274 | negamax_root :: (chess: *ChessGame, root_moves: *MoveQueue, multipv: int, depth: int, alpha: int, beta: int, fifty: int) -> int {
 275 |   // Root node of the principal-variation search. Moves already listed in root_moves (earlier MultiPV lines) are skipped; when a move raises alpha it is appended to root_moves with its score. Returns the best score found (fail-soft).
 276 |   root_multipv :: (move: Move16, moves: *MoveQueue) -> bool {  // true when `move` is already in `moves`
 277 |     i := 0;
 278 |     while i<moves.count {
 279 |       if move == moves.array[i].move then
 280 |         return true;
 281 |       i += 1;
 282 |     }
 283 |     return false;
 284 |   }
 285 | 
 286 |   ply := chess.ply;
 287 | 
 288 |   // tt probe. hash_move, ttmove and ply are read by the :pv move iterator through backticked names.
 289 |   tthit, ttentry := tt_probe_negamax(chess, ply, chess.hash, alpha, beta, depth);
 290 |   ttmove := chess.history.pv_table[ply][0];
 291 |   hash_move := ifx tthit then ttentry.ttmove else Move16.Quiet;
 292 | 
 293 |   defer {  // on every exit: clear the child ply's PV line and killer moves
 294 |     using chess.history;
 295 |     memset(*pv_table[ply+1], 0, size_of(PV_Line));
 296 |     killer_moves[ply+1][1] = 0;
 297 |     killer_moves[ply+1][0] = 0;
 298 |   }
 299 | 
 300 |   array_add(*chess.eval, INF);
 301 |   defer pop(*chess.eval);
 302 | 
 303 |   ep := chess.en_passant;  // restored after every unmake_move
 304 |   hash_flag := TFLAGS.ALPHA;
 305 |   cap_moves: [32] Move16;  // tried captures/promotions, handed to the history updates
 306 |   quiet_moves: [64] Move16;  // tried quiet moves, handed to the history updates
 307 |   num_captures := 0;
 308 |   num_quiets := 0;
 309 |   moves_searched := 0;
 310 | 
 311 |   tt_capture: bool = false;
 312 |   tt_quiet: bool   = false;
 313 | 
 314 |   move_count := 0;
 315 |   bmove := Move16.Quiet;
 316 |   pto := -1;
 317 |   best_score: int = -INF;
 318 |   for :pv move, move_score : chess {
 319 |     move_count += 1;
 320 |     if root_multipv(move, root_moves) {
 321 |       continue;
 322 |     }
 323 | 
 324 |     flags, from, to := decode_move16(move);
 325 |     ext := 0;
 326 |     if flags <= Move16.Double_Pawn_Push {
 327 |       // add quiet move; once the fixed-size buffer is full, further moves are simply not recorded.
 328 |       if num_quiets < quiet_moves.count then quiet_moves[num_quiets] = move;
 329 |       num_quiets = min(num_quiets + 1, quiet_moves.count);
 330 |     } else {
 331 |       // add capture move (same overflow guard).
 332 |       if num_captures < cap_moves.count then cap_moves[num_captures] = move;
 333 |       num_captures = min(num_captures + 1, cap_moves.count);
 334 |     }
 335 |      
 336 |     // make move.
 337 |     next_fifty := fifty_move(chess, move, fifty);
 338 |     cap, castling := make_move(chess, move);
 339 |     eval := 0;
 340 |     if moves_searched == 0 {
 341 |       eval = -negamax_pv(chess, depth+ext-1, -beta, -alpha, next_fifty);
 342 |     } else {
 343 |       if in_check(chess) {
 344 |         // do not reduce depth on check.
 345 |         eval = -negamax_zw(chess, depth, -alpha-1, -alpha, next_fifty);
 346 |       } else {
 347 |         R := 0;
 348 |         if flags <= .Double_Pawn_Push {
 349 |           R -= (move_count >> 4);  // NOTE(review): R becomes negative, so `depth - R - 1` searches late quiet moves deeper, not shallower -- confirm this is intended
 350 |         }
 351 |         eval = -negamax_zw(chess, depth - R - 1, -alpha-1, -alpha, next_fifty);
 352 |       }
 353 | 
 354 |       // search the move again that has failed to be proved to be bad with normal alpha beta score bounds
 355 |       if eval > alpha then {
 356 |         eval = -negamax_pv(chess, depth-1, -beta, -alpha, next_fifty);
 357 |       }
 358 |     }
 359 | 
 360 |     unmake_move(chess, move, cap, castling);
 361 |     chess.en_passant = ep;
 362 |     if eval > best_score then {
 363 |       best_score = eval;
 364 |       if best_score > alpha {
 365 |         using chess.history;
 366 |         hash_flag = TFLAGS.EXACT;
 367 |         bmove = move;
 368 |         pv_table[ply][0] = move;  // new PV = this move followed by the child's PV line
 369 |         move_count := pv_table[ply+1].move_count;
 370 |         memcpy(*pv_table[ply][1], *pv_table[ply+1][0], move_count * size_of(Move16));
 371 |         pv_table[ply].move_count = move_count + 1;
 372 |         alpha = best_score;
 373 |      
 374 |         if best_score >= beta then {
 375 |           // non-captures == Quiet|Pawn Push Moves
 376 |           if flags <= Move16.Double_Pawn_Push then {
 377 |             update_quiet(chess, move, depth, quiet_moves, num_quiets, cap_moves, num_captures);
 378 |           }
 379 |           tt_store(chess.hash, best_score, TFLAGS.BETA, depth, move);
 380 |           return best_score;
 381 |         } 
 382 |       } 
 383 |     }
 384 |     moves_searched += 1;
 385 |   }
 386 | 
 387 |   if move_count == 0 then  // no move generated: checkmate or stalemate
 388 |     return ifx in_check(chess) then -INF+ply else 0+ply;
 389 | 
 390 |   if hash_flag == TFLAGS.EXACT && get_move16_flag(bmove) > Move16.Double_Pawn_Push then 
 391 |     update_tactics(chess, bmove, depth, cap_moves, num_captures, quiet_moves, num_quiets);
 392 | 
 393 |   tt_store(chess.hash, best_score, hash_flag, depth, bmove);
 394 |   if bmove != Move16.Quiet then append(root_moves, cast(s16)best_score, bmove);  // on a fail low bmove is still the null move: keep it out of root_moves so pick_move can never return it
 395 |   return best_score;
 396 | }
 397 | 
 398 | negamax_pv :: (chess: *ChessGame, depth: int, alpha: int, beta: int, fifty: int) -> int {  // full-window (PV) node: searches the position to `depth` within [alpha, beta] and returns the best score found (fail-soft)
 399 |   ply := chess.ply;
 400 |   if fifty >= 100 then  // fifty-move rule
 401 |     return 0+ply;
 402 | 
 403 |   // mate distance pruning
 404 |   alpha = max(alpha,-INF+ply);
 405 |   beta  = min(beta,  INF-ply);
 406 |   if alpha >= beta then {
 407 |     return alpha;
 408 |   }
 409 | 
 410 |   if is_draw(chess, fifty) then
 411 |     return 0+ply;
 412 | 
 413 |   if depth <= 0 then  // horizon reached: continue with the quiescence search
 414 |     return quiescene(chess, alpha, beta, fifty);
 415 | 
 416 |   // cannot trust that tt_entry will remain the same.
 417 |   // it can be overwritten by accident due to hash collisions
 418 |   // esp. during multi-threading. need to copy out the ttentry.
 419 | 
 420 |   // tt probe. hash_move, ttmove and ply are read by the :pv move iterator through backticked names.
 421 |   tthit, ttentry := tt_probe_negamax(chess, ply, chess.hash, alpha, beta, depth);
 422 |   ttmove := chess.history.pv_table[ply][0];
 423 |   hash_move := ifx tthit then ttentry.ttmove else Move16.Quiet;
 424 | 
 425 |   defer {  // on every exit: clear the child ply's PV line and killer moves
 426 |     using chess.history;
 427 |     memset(*pv_table[ply+1], 0, size_of(PV_Line));
 428 |     killer_moves[ply+1][1] = 0;
 429 |     killer_moves[ply+1][0] = 0;
 430 |   }
 431 | 
 432 |   // internal iterative deepening.
 433 |   if depth > 8 && ttmove == 0 && hash_move == 0 then 
 434 |     depth -= 1;
 435 | 
 436 |   // NOTE: we make eval really high,
 437 |   // so we will look at checks much more
 438 |   // carefully, when improving=1, more branches to look at.
 439 |   eval := INF;
 440 |   if tthit == true then
 441 |     eval = ttentry.score;
 442 |   else if !in_check(chess) then
 443 |     eval = cast(s16) evaluate(chess, fifty);
 444 | 
 445 |   array_add(*chess.eval, eval);
 446 |   defer pop(*chess.eval);
 447 | 
 448 |   ep := chess.en_passant;  // restored after every unmake_move
 449 |   hash_flag := TFLAGS.ALPHA;
 450 |   cap_moves: [32] Move16;  // tried captures/promotions, handed to the history updates
 451 |   quiet_moves: [64] Move16;  // tried quiet moves, handed to the history updates
 452 |   num_captures := 0;
 453 |   num_quiets := 0;
 454 |   moves_searched := 0;
 455 | 
 456 |   improving := get_improving(chess);
 457 |   late_move_prune := (3 + (depth*depth)) / (2-improving);  // NOTE(review): presumes get_improving returns 0 or 1; a value of 2 would divide by zero -- confirm
 458 | 
 459 |   tt_capture: bool = false;
 460 |   tt_quiet: bool   = false;
 461 |   prev_to :: (chess: ChessGame) -> int {  // destination square of the last move played, or -1 at ply 0
 462 |     if chess.ply >= 1 {
 463 |       count := chess.moves.count;
 464 |       prev_move1 := chess.moves[count-1].mov16;
 465 |       to := get_move16_to(prev_move1);
 466 |       return to;
 467 |     }
 468 |     return -1;
 469 |   }
 470 | 
 471 |   move_count := 0;
 472 |   bmove := Move16.Quiet;
 473 |   pto := prev_to(chess);
 474 | 
 475 |   best_score: int = -INF;
 476 |   score_margin := -80_000 * depth;
 477 |   for :pv move, move_score : chess {
 478 |     move_count += 1;
 479 |     if (chess.node_state & NodeState.SSE) == 0 && ply == chess.exply && move == chess.excluded_move then {
 480 |       continue;  // inside a singular-extension verification search: skip the excluded move
 481 |     }
 482 | 
 483 |     flags, from, to := decode_move16(move);
 484 |     pfrom := piece_at(chess, from);
 485 |     att := piece_at(chess, from);
 486 |     vic := piece_at(chess, to);
 487 | 
 488 |     // pruning of late moves; only once the root iteration depth exceeds 6 and one move has been searched
 489 |     if chess.depth>6 && moves_searched>0 {
 490 |       // Late Move Pruning. Search the Root Plies Exhaustively
 491 |       if flags <= Move16.Double_Pawn_Push && (moves_searched > late_move_prune) then
 492 |         break;
 493 |       if flags <= Move16.Double_Pawn_Push && move_score < score_margin then
 494 |         break;
 495 |       if depth < 6 && flags == Move16.Capture && PVALUE[att] > PVALUE[vic] && see(chess,move, -250*depth)  then
 496 |         continue;
 497 |       else if depth < 6 && flags <= Move16.Double_Pawn_Push && see(chess,move, -90*depth*depth) then
 498 |         continue;
 499 |     }
 500 | 
 501 |     ext := 0;
 502 |     if chess.node_state & NodeState.SSE && depth>=7 && ply < chess.depth && move == hash_move && tthit && cast(int)ttentry.depth >= depth-3 
 503 |        && ttentry.flag == TFLAGS.ALPHA && abs(alpha) < (INF-50) {
 504 |       if to == pto && flags >= Move16.Capture {
 505 |         // recapture extension.
 506 |         ext = 1;
 507 |         tt_capture = true;
 508 |         tt_quiet   = false;
 509 |       } else {
 510 |         // singular extension:
 511 |         // if one move is better than all the rest, then we consider this singular
 512 |         // "singular" is determined by checking all other moves at a shallow depth on a nullwindow
 513 |         singular_beta := ttentry.score - depth;
 514 |         chess.excluded_move = move;
 515 |         chess.exply = ply;
 516 |         chess.node_state ^= NodeState.SSE;  // clear the flag so the verification search cannot start another one
 517 |         score := negamax_zw(chess, depth/2-1, singular_beta-1, singular_beta, fifty);
 518 |         chess.node_state ^= NodeState.SSE;
 519 |         chess.excluded_move = Move16.Quiet;
 520 |         chess.exply = INF;
 521 |         if score < singular_beta then {
 522 |           ext = 1;
 523 |           tt_capture = flags > Move16.Double_Pawn_Push;
 524 |           tt_quiet   = flags <= Move16.Double_Pawn_Push;
 525 |         } else if singular_beta >= beta then {
 526 |           return singular_beta;
 527 |         } else if ttentry.score >= beta then {
 528 |           ext = -2;
 529 |         } else if ttentry.score <= alpha && ttentry.score <= score {
 530 |           ext = -1;
 531 |         }
 532 |       }
 533 |     }
 534 | 
 535 |     if flags <= Move16.Double_Pawn_Push {
 536 |       // add quiet move; once the fixed-size buffer is full, further moves are simply not recorded.
 537 |       if num_quiets < quiet_moves.count then quiet_moves[num_quiets] = move;
 538 |       num_quiets = min(num_quiets + 1, quiet_moves.count);
 539 |     } else {
 540 |       // add capture move (same overflow guard).
 541 |       if num_captures < cap_moves.count then cap_moves[num_captures] = move;
 542 |       num_captures = min(num_captures + 1, cap_moves.count);
 543 |     }
 544 |      
 545 |     // make move.
 546 |     next_fifty := fifty_move(chess, move, fifty);
 547 |     cap, castling := make_move(chess, move);
 548 |     eval := 0;
 549 |     if ext==1 || moves_searched == 0 {
 550 |       eval = -negamax_pv(chess, depth+ext-1, -beta, -alpha, next_fifty);
 551 |     } else {
 552 |       if in_check(chess) {
 553 |         // do not reduce depth on check.
 554 |         eval = -negamax_zw(chess, depth, -alpha-1, -alpha, next_fifty);
 555 |       } else if flags > Move16.Double_Pawn_Push {
 556 |         // captures. promotions.
 557 |         R := 1 - 4 * move_score / (abs(move_score) + 24576);
 558 |         R = max(0, R);
 559 |         eval = -negamax_zw(chess, depth+ext-R-1, -alpha-1, -alpha, next_fifty);
 560 |       } else if num_quiets > 3 && flags <= Move16.Double_Pawn_Push {
 561 |         // Late Move Reduction when not in check and not giving check.
 562 |         R := LateMoveReduction[min(depth,63)][min(moves_searched,63)];
 563 |         R -= improving;
 564 |         if chess.probcut then {
 565 |           R += 2;
 566 |         }
 567 | 
 568 |         R -= 1 + 15 / (3 + depth);
 569 | 
 570 |         // increase reduction if capture.
 571 |         if tt_capture then {
 572 |           R += 1;
 573 |         }
 574 | 
 575 |         // decrease reduction if quiet.
 576 |         if tt_quiet then {
 577 |           R -= 2;
 578 |         }
 579 | 
 580 |         R -= move_score / 40_000;
 581 |         R = max(0, R);
 582 |         eval = -negamax_zw(chess, depth+ext-R-1, -alpha-1, -alpha, next_fifty);
 583 |       } else {
 584 |         // do normal search.
 585 |         eval = -negamax_zw(chess, depth+ext-1, -alpha-1, -alpha, next_fifty);
 586 |       }
 587 | 
 588 |       // search the move again that has failed to be proved to be bad with normal alpha beta score bounds
 589 |       if eval > alpha then 
 590 |         eval = -negamax_pv(chess, depth+ext-1, -beta, -alpha, next_fifty);
 591 |     }
 592 | 
 593 |     unmake_move(chess, move, cap, castling);
 594 |     chess.en_passant = ep;
 595 |     if eval > best_score then {
 596 |       best_score = eval;
 597 |       bmove = move;
 598 |       if best_score > alpha {
 599 |         using chess.history;
 600 |         hash_flag = TFLAGS.EXACT;
 601 |         pv_table[ply][0] = move;  // new PV = this move followed by the child's PV line
 602 |         move_count := pv_table[ply+1].move_count;
 603 |         memcpy(*pv_table[ply][1], *pv_table[ply+1][0], move_count * size_of(Move16));
 604 |         pv_table[ply].move_count = move_count + 1;
 605 |         alpha = eval;
 606 |      
 607 |         if best_score >= beta then {
 608 |           // non-captures == Quiet|Pawn Push Moves
 609 |           if flags <= Move16.Double_Pawn_Push 
 610 |             update_quiet(chess, move, depth, quiet_moves, num_quiets, cap_moves, num_captures);
 611 |           tt_store(chess.hash, best_score, TFLAGS.BETA, depth, move);
 612 |           return best_score;
 613 |         } 
 614 |       } 
 615 |     }
 616 |     moves_searched += 1;
 617 |   }
 618 | 
 619 |   if move_count == 0 then  // no move generated: checkmate or stalemate
 620 |     return ifx in_check(chess) then -INF+ply else 0+ply;
 621 | 
 622 |   if hash_flag == TFLAGS.EXACT && get_move16_flag(bmove) > Move16.Double_Pawn_Push then 
 623 |     update_tactics(chess, bmove, depth, cap_moves, num_captures, quiet_moves, num_quiets);
 624 | 
 625 |   tt_store(chess.hash, best_score, hash_flag, depth, bmove);
 626 |   return best_score;
 627 | 
 628 | }
 629 | 
 630 | update_tactics :: (chess: *ChessGame, move: Move16, depth: int, cap_moves: []Move16, num_captures: int, quiet_moves: []Move16, num_quiets: int) {
 631 |   // History update for a best move that is not a quiet move: every other tried capture and every
 632 |   // tried quiet move receives a negative increment, the best move itself a positive one.
 633 |   ply := chess.ply;
 634 |   bonus : s32 = min(cast(s32)(depth*depth), 128);  // depth squared, capped at 128
 635 |   pc1, sq1 := get_prev_move(chess, 1);
 636 |   pc2, sq2 := get_prev_move(chess, 2);
 637 |   pc4, sq4 := get_prev_move(chess, 4);
 638 |   using chess.history;
 639 | 
 640 |   // negative increment for each tried capture that is not the best move.
 641 |   for idx: 0..num_captures-1 {
 642 |     tried := cap_moves[idx];
 643 |     if tried == move continue;
 644 |     mover := piece_at(chess, get_move16_from(tried));
 645 |     dst := get_move16_to(tried);
 646 |     target := norm(piece_at(chess, dst));
 647 |     incr_history(*cap_history[mover][dst][target], -bonus);
 648 |   }
 649 | 
 650 |   // decrement all quiet moves which < alpha.
 651 |   for idx: 0..num_quiets-1 {
 652 |     tried := quiet_moves[idx];
 653 |     mover := piece_at(chess, get_move16_from(tried));
 654 |     dst := get_move16_to(tried);
 655 |     incr_history(*history_moves[mover][dst], -bonus);
 656 |     incr_history(*counter_history[pc1][sq1][mover][dst], -bonus);
 657 |     incr_history(*counter_history[pc2][sq2][mover][dst], -bonus);
 658 |     incr_history(*counter_history[pc4][sq4][mover][dst], -bonus);
 659 |   }
 660 | 
 661 |   // positive increment for the best move itself.
 662 |   mover := piece_at(chess, get_move16_from(move));
 663 |   dst := get_move16_to(move);
 664 |   target := norm(piece_at(chess, dst));
 665 |   incr_history(*cap_history[mover][dst][target], bonus);
 666 | }
 667 | 
 668 | pv :: (chess: *ChessGame, body: Code, f: For_Flags) #expand {
 669 |   // main principle variation search move picker.
 670 |   stage: enum {PV; CAPTURES; KILLERS; QUIETS; END; } = .PV;
 671 |   move_queue: MoveQueue;
 672 |   unsafe: u64 = 0;
 673 |   check: u64 = 0;
 674 |   turn := chess.turn;
 675 |   while outer_loop := !stop() {
 676 |     while move_queue.count <= 0 {
 677 |       if stage == {
 678 |       case .PV;
 679 |         stage = .CAPTURES;
 680 |         if is_legal(chess, `hash_move) then {
 681 |           append(*move_queue, 0x7FFF, `hash_move);
 682 |         } else if is_legal(chess, `ttmove) {
 683 |           append(*move_queue, 0x7FFF, `ttmove);
 684 |         }
 685 |       case .CAPTURES;
 686 |         stage = .KILLERS;
 687 |         unsafe, check = generate_attacks(chess);
 688 |         generate_tactics(unsafe, check, chess, *move_queue);
 689 |         filter_capture_pv(*move_queue, `ttmove, `hash_move);
 690 |         sort_captures(*move_queue, chess);
 691 |       case .KILLERS;
 692 |         using chess.history;
 693 |         stage = .QUIETS;
 694 |         move := killer_moves[`ply][0];
 695 |         if is_legal(chess, move) {
 696 |           append(*move_queue, 0x7FFF, move);
 697 |         }
 698 | 
 699 |         move = killer_moves[`ply][1];
 700 |         if is_legal(chess, move) {
 701 |           append(*move_queue, 0x7FFF-1, move);
 702 |         } 
 703 |       case .QUIETS;
 704 |         using chess.history;
 705 |         stage = .END;
 706 |         generate_quiets(unsafe, check, chess, *move_queue);
 707 |         kill1 := killer_moves[`ply][0];
 708 |         kill2 := killer_moves[`ply][1];
 709 |         filter_quiet_pv(*move_queue, `ttmove, `hash_move, kill1, kill2);
 710 |         sort_quiets(*move_queue, chess);
 711 |       case .END;
 712 |         break outer_loop; // terminate loop
 713 |       }
 714 |     }
 715 |     `it, priority := pop(*move_queue);
 716 |     `it_index := cast(int)priority;
 717 |     #insert body;
 718 |   }
 719 | }
 720 | 
 721 | pvcut :: (chess: *ChessGame, body: Code, f: For_Flags) #expand {
 722 |   // main principle variation search move picker.
 723 |   stage: enum {PV; CAPTURES; KILLERS; QUIETS; END; } = .PV;
 724 |   move_queue: MoveQueue;
 725 |   unsafe: u64 = 0;
 726 |   check: u64 = 0;
 727 |   turn := chess.turn;
 728 |   while outer_loop := !stop() {
 729 |     while move_queue.count <= 0 {
 730 |       if stage == {
 731 |       case .PV;
 732 |         stage = .CAPTURES;
 733 |         if is_legal(chess, `hash_move) then {
 734 |           append(*move_queue, 0x7FFF, `hash_move);
 735 |         } else if is_legal(chess, `ttmove) {
 736 |           append(*move_queue, 0x7FFF, `ttmove);
 737 |         }
 738 |       case .CAPTURES;
 739 |         stage = .END;
 740 |         unsafe, check = generate_attacks(chess);
 741 |         generate_tactics(unsafe, check, chess, *move_queue);
 742 |         filter_capture_pv(*move_queue, `ttmove, `hash_move);
 743 |         sort_captures(*move_queue, chess);
 744 |       case .END;
 745 |         break outer_loop; // terminate loop
 746 |       }
 747 |     }
 748 |     `it, priority := pop(*move_queue);
 749 |     `it_index := cast(int)priority;
 750 |     #insert body;
 751 |   }
 752 | }
 753 | 
 754 | filter_capture_pv :: (move_queue: *MoveQueue, pv: Move16, hash: Move16) {
 755 |   i := 0;
 756 |   while i < move_queue.count {
 757 |     move := move_queue.array[i].move;
 758 |     if move == pv || move == hash then {
 759 |       last := move_queue.count - 1;
 760 |       move_queue.array[i] = move_queue.array[last];
 761 |       move_queue.count -= 1;
 762 |     } else {
 763 |       i += 1;
 764 |     }
 765 |   }
 766 | }
 767 | 
 768 | filter_quiet_pv :: (move_queue: *MoveQueue, pv: Move16, hash: Move16, kill1: Move16, kill2: Move16) {
 769 |   i := 0;
 770 |   while i < move_queue.count {
 771 |     move := move_queue.array[i].move;
 772 |     if move == pv || move == hash || move == kill1 || move == kill2 then {
 773 |       last := move_queue.count - 1;
 774 |       move_queue.array[i] = move_queue.array[last];
 775 |       move_queue.count -= 1;
 776 |     } else {
 777 |       i += 1;
 778 |     }
 779 |   }
 780 | }
 781 | 
 782 | sort_captures :: (move_queue: *MoveQueue, chess: *ChessGame) {
 783 | 
 784 |   score_capture :: (flag: Move16, from: int, to: int, chess: *ChessGame) -> s32 {
 785 |     using chess.history;
 786 |     if flag == {
 787 |     case .Capture;
 788 |       vic := norm(piece_at(chess, to));
 789 |       att := piece_at(chess, from);
 790 |       return mvv_lva(vic, att) + cap_history[att][to][vic];
 791 |     case .Ep_Capture;
 792 |       att := piece_at(chess, from);
 793 |       return mvv_lva(.W_PAWN, .W_PAWN) + cap_history[att][to][0];
 794 |     case .Knight_Promotion;
 795 |       att := piece_at(chess, from);
 796 |       return cap_history[att][to][0];
 797 |     case .Bishop_Promotion;
 798 |       att := piece_at(chess, from);
 799 |       return cap_history[att][to][0];
 800 |     case .Rook_Promotion;
 801 |       att := piece_at(chess, from);
 802 |       return cap_history[att][to][0];
 803 |     case .Queen_Promotion;
 804 |       att := piece_at(chess, from);
 805 |       return 20000 + cap_history[att][to][0];
 806 |     case .Knight_Promotion_Capture;
 807 |       vic := norm(piece_at(chess, to));
 808 |       att := piece_at(chess, from);
 809 |       return cap_history[att][to][vic];
 810 |     case .Bishop_Promotion_Capture;
 811 |       vic := norm(piece_at(chess, to));
 812 |       att := piece_at(chess, from);
 813 |       return cap_history[att][to][vic];
 814 |     case .Rook_Promotion_Capture;
 815 |       vic := norm(piece_at(chess, to));
 816 |       att := piece_at(chess, from);
 817 |       return cap_history[att][to][vic];
 818 |     case .Queen_Promotion_Capture;
 819 |       vic := norm(piece_at(chess, to));
 820 |       att := piece_at(chess, from);
 821 |       return cap_history[att][to][vic];
 822 |     case .King_Castle;
 823 |       att := piece_at(chess, from);
 824 |       return cap_history[att][to][0];
 825 |     case .Queen_Castle;
 826 |       att := piece_at(chess, from);
 827 |       return cap_history[att][to][0];
 828 |     }
 829 | 
 830 |     assert(false);
 831 |     return 0;
 832 |   }
 833 | 
 834 |   for i: 0..move_queue.count-1 {
 835 |      move := move_queue.array[i].move;
 836 |      flags, from, to := decode_move16(move);
 837 |      priority := score_capture(flags, from, to, chess);
 838 |      move_queue.array[i].priority = priority;
 839 |   }
 840 |   construct_heap(move_queue);
 841 | }
 842 | 
 843 | sort_quiets :: (move_queue: *MoveQueue, chess: *ChessGame) {
 844 | 
 845 |   score_quiets :: (flag: Move16, from: int, to: int, chess: *ChessGame, counter: [][64] s32, follow1: [][64] s32, follow2: [][64] s32) -> s32 {
 846 |     piece := piece_at(chess, from);
 847 |     return chess.history.history_moves[piece][to] + (counter[piece][to] * 2) + follow1[piece][to] + (follow2[piece][to] / 2);
 848 |   }
 849 | 
 850 | 
 851 |   counter: [][64] s32 = get_countermove_history(chess, 1);
 852 |   follow1: [][64] s32 = get_countermove_history(chess, 2);
 853 |   follow2: [][64] s32 = get_countermove_history(chess, 4);
 854 | 
 855 |   for i: 0..move_queue.count-1 {
 856 |      move := move_queue.array[i].move;
 857 |      flags, from, to := decode_move16(move);
 858 |      priority := score_quiets(flags, from, to, chess, counter, follow1, follow2);
 859 |      move_queue.array[i].priority = priority;
 860 |   }
 861 |   construct_heap(move_queue);
 862 | 
 863 | }
 864 | 
 865 | negamax_zw :: (chess: *ChessGame, depth: int, alpha: int, beta: int, fifty: int) -> int {
 866 |   read_input(chess.main_thread, nodes_searched, chess.maxnodes, time_begin, chess.movetime);
 867 |   ply := chess.ply;
 868 |   if ply && fifty >= 100 then
 869 |     return 0+ply;
 870 | 
 871 |   if is_draw(chess, fifty) then
 872 |     return 0+ply;
 873 | 
 874 |   // mate distance pruning
 875 |   alpha = max(alpha,-INF+ply);
 876 |   beta  = min(beta,  INF-ply);
 877 |   if alpha >= beta then {
 878 |     return alpha;
 879 |   }
 880 | 
 881 |   // cannot trust that tt_entry will remain the same.
 882 |   // it can be overwritten by accident due to hash collisions
 883 |   // esp. during multi-threading. need to copy out the ttentry.
 884 | 
 885 |   // tt probe.
 886 |   tthit, ttentry := tt_probe_negamax(chess, ply, chess.hash, alpha, beta, depth);
 887 |   tt_prune(tthit, ttentry, ply, false, alpha, beta, depth);
 888 | 
 889 |   if depth <= 0 then
 890 |     return quiescene(chess, alpha, beta, fifty);
 891 | 
 892 |   ttmove := chess.history.pv_table[ply][0];
 893 |   hash_move := ifx tthit then ttentry.ttmove else Move16.Quiet;
 894 | 
 895 |   defer {
 896 |     using chess.history;
 897 |     memset(*pv_table[ply+1], 0, size_of(PV_Line));
 898 |     killer_moves[ply+1][1] = 0;
 899 |     killer_moves[ply+1][0] = 0;
 900 |   }
 901 | 
 902 |   // NOTE: we make eval really high,
 903 |   // so we will look at checks much more
 904 |   // carefully, when improving=1, more branches to look at.
 905 |   array_add(*chess.eval, INF);
 906 |   defer pop(*chess.eval);
 907 |   if !in_check(chess) {
 908 |     eval := ifx tthit then ttentry.score else evaluate(chess, fifty);
 909 |     chess.eval[chess.eval.count-1] = cast(s16) eval;
 910 |     improving := get_improving(chess);
 911 | 
 912 |     // reverse futility pruning. stockfish has margin at 214*depth
 913 |     margin := 125 *(depth-improving);
 914 |     if depth < 11 && (eval-margin) >= beta then {
 915 |       return eval;
 916 |     }
 917 | 
 918 |     // razoring.
 919 |     if depth <= 5 && eval + (200*depth) <= alpha {
 920 |       score := quiescene(chess, alpha, beta, fifty);
 921 |       if score <= alpha then {
 922 |         return score;
 923 |       }
 924 |     }
 925 | 
 926 |     // null move pruning w/ verification at higher depths.
 927 |     if depth > 3 && eval >= beta then {
 928 |       R := 4 + depth/3 + min((eval-beta) / 256, 3);
 929 |       ep := make_null_move(chess);
 930 |       score := -negamax_zw(chess, depth-R-1, -beta, -beta + 1, 0);
 931 |       unmake_null_move(chess, ep);
 932 | 
 933 |       // verification at higher depths.
 934 |       if depth>12 || zugzwang(chess) then {
 935 |         score = negamax_zw(chess, depth-R-1, beta-1, beta, 0);
 936 |       }
 937 |     
 938 |       if score >= beta then {
 939 |         return score;
 940 |       }
 941 |     }
 942 | 
 943 |     // prob cut.
 944 |     prob_beta := beta + 125 - 46 * improving;
 945 |     if depth > 5 && abs(beta) < 20_000 && !(tthit && ttentry.depth >= depth-3 && ttentry.score != 0 && ttentry.score < prob_beta) then {
 946 |       chess.probcut += 1;
 947 |       defer chess.probcut -= 1;
 948 |       ep := chess.en_passant;
 949 |       for :pvcut move, move_score: chess {
 950 |         flags, from, to := decode_move16(move);
 951 |         pfrom := piece_at(chess, from);
 952 |        
 953 |         // make move.
 954 |         cap, castling := make_move(chess, move);
 955 |         score := -quiescene(chess, -prob_beta, -prob_beta + 1, 0);
 956 |         if score >= prob_beta then
 957 |           score = -negamax_zw(chess, depth-4, -prob_beta, -prob_beta + 1, 0);
 958 |     
 959 |         // unmake move.
 960 |         unmake_move(chess, move, cap, castling);
 961 |         chess.en_passant = ep;
 962 |     
 963 |         if score >= prob_beta then {
 964 |           tt_store(chess.hash, score, TFLAGS.BETA, depth-4, move);
 965 |           return score;
 966 |         }
 967 |       }
 968 |     }
 969 |   }
 970 | 
 971 |   ep := chess.en_passant;
 972 |   cap_moves: [32] Move16;
 973 |   quiet_moves: [64] Move16;
 974 |   num_captures := 0;
 975 |   num_quiets := 0;
 976 |   moves_searched := 0;
 977 |   prev_to :: (chess: ChessGame) -> int {
 978 |     if chess.ply >= 1 {
 979 |       count := chess.moves.count;
 980 |       prev_move1 := chess.moves[count-1].mov16;
 981 |       to := get_move16_to(prev_move1);
 982 |       return to;
 983 |     }
 984 |     return -1;
 985 |   }
 986 | 
 987 | 
 988 | 
 989 |   improving := get_improving(chess);
 990 |   late_move_prune := (3 + (depth*depth)) / (2-improving);
 991 |   tt_capture: bool = false;
 992 |   tt_quiet: bool   = false;
 993 |   move_count := 0;
 994 |   bmove := Move16.Quiet;
 995 |   pto := prev_to(chess);
 996 |   best_score: int = -INF;
 997 |   score_margin := -80_000 * depth;
 998 |   for :pv move, move_score : chess {
 999 |     move_count += 1;
1000 |     if (chess.node_state & NodeState.SSE) == 0 && ply == chess.exply && move == chess.excluded_move then {
1001 |       continue;
1002 |     }
1003 | 
1004 |     flags, from, to := decode_move16(move);
1005 |     pfrom := piece_at(chess, from);
1006 |     att := piece_at(chess, from);
1007 |     vic := piece_at(chess, to);
1008 | 
1009 |     // piece
1010 |     if chess.depth>6 && moves_searched>0 {
1011 |       // Late Move Pruning. Search the Root Plies Exhuastively
1012 |       if flags <= Move16.Double_Pawn_Push && (moves_searched > late_move_prune) then
1013 |         break;
1014 |       if flags <= Move16.Double_Pawn_Push && move_score < score_margin then
1015 |         break;
1016 |       if depth < 6 && flags == Move16.Capture && PVALUE[att] > PVALUE[vic] && see(chess, move, -250*depth) then
1017 |         continue;
1018 |       else if depth < 6 && flags <= Move16.Double_Pawn_Push && see(chess, move, -90*depth*depth) then
1019 |         continue;
1020 |     }
1021 | 
1022 |     ext := 0;
1023 |     if chess.node_state & NodeState.SSE && depth>=7 && ply < chess.depth && move == hash_move && tthit && cast(int)ttentry.depth >= depth-3 
1024 |        && ttentry.flag == TFLAGS.ALPHA {
1025 |       if to == pto && flags >= Move16.Capture {
1026 |         // recapture extension.
1027 |         ext = 1;
1028 |         tt_capture = true;
1029 |         tt_quiet   = false;
1030 |       } else {
1031 |         // singular extension:
1032 |         // if one move is better than all the rest, then we consider this singular
1033 |         // singular is determined by checking all other moves at a shallow depth on a nullwindow
1034 |         singular_beta := ttentry.score - depth;
1035 |         chess.excluded_move = move;
1036 |         chess.exply = ply;
1037 |         chess.node_state ^= NodeState.SSE;
1038 |         score := negamax_zw(chess, depth/2-1, singular_beta-1, singular_beta, fifty);
1039 |         chess.node_state ^= NodeState.SSE;
1040 |         chess.excluded_move = Move16.Quiet;
1041 |         chess.exply = INF;
1042 |         if score < singular_beta then {
1043 |           ext = 1;
1044 |           tt_capture = flags > Move16.Double_Pawn_Push;
1045 |           tt_quiet   = flags <= Move16.Double_Pawn_Push;
1046 |         } else if singular_beta >= beta then {
1047 |           return singular_beta;
1048 |         } else if ttentry.score >= beta then {
1049 |           ext = -2;
1050 |         } else if ttentry.score <= alpha && ttentry.score <= score {
1051 |           ext = -1;
1052 |         }
1053 |       }
1054 |     }
1055 | 
1056 |     if flags <= Move16.Double_Pawn_Push {
1057 |       // add quiet move.
1058 |       quiet_moves[num_quiets] = move;
1059 |       num_quiets += 1;
1060 |     } else {
1061 |       // add capture move.
1062 |       cap_moves[num_captures] = move;
1063 |       num_captures += 1;
1064 |     }
1065 |      
1066 |     // make move.
1067 |     next_fifty := fifty_move(chess, move, fifty);
1068 |     cap, castling := make_move(chess, move);
1069 |     eval := 0;
1070 |     if ext==1 || moves_searched == 0 {
1071 |       eval = -negamax_zw(chess, depth+ext-1, -beta, -alpha, next_fifty);
1072 |     } else {
1073 |       if in_check(chess) {
1074 |         // do not reduce depth on check.
1075 |         eval = -negamax_zw(chess, depth, -alpha-1, -alpha, next_fifty);
1076 |       } else if flags > Move16.Double_Pawn_Push {
1077 |         // captures. promotions.
1078 |         R := 1 - 4 * move_score / (abs(move_score) + 24576);
1079 |         R = max(0, R);
1080 |         eval = -negamax_zw(chess, depth+ext-R-1, -alpha-1, -alpha, next_fifty);
1081 |       } else if ply > 0 && num_quiets > 3 && flags <= Move16.Double_Pawn_Push {
1082 |         // Late Move Reduction when not in check and not giving check.
1083 |         R := LateMoveReduction[min(depth,63)][min(moves_searched,63)];
1084 |         R -= improving;
1085 |         if chess.probcut then {
1086 |           R += 2;
1087 |         }
1088 | 
1089 |         // increase reduction if capture.
1090 |         if tt_capture then {
1091 |           R += 1;
1092 |         }
1093 | 
1094 |         // decrease reduction if quiet.
1095 |         if tt_quiet then {
1096 |           R -= 2;
1097 |         }
1098 | 
1099 |         R -= move_score / 40_000;
1100 |         R = max(0, R);
1101 |         eval = -negamax_zw(chess, depth+ext-R-1, -alpha-1, -alpha, next_fifty);
1102 |       } else {
1103 |         // do normal search.
1104 |         eval = -negamax_zw(chess, depth+ext-1, -alpha-1, -alpha, next_fifty);
1105 |       }
1106 |     }
1107 | 
1108 |     unmake_move(chess, move, cap, castling);
1109 |     chess.en_passant = ep;
1110 |     if eval >= best_score {
1111 |       best_score = eval;
1112 |       bmove = move;
1113 |       if best_score >= beta then {
1114 |         using chess.history;
1115 |         pv_table[ply][0] = move;
1116 |         move_count := pv_table[ply+1].move_count;
1117 |         memcpy(*pv_table[ply][1], *pv_table[ply+1][0], move_count * size_of(Move16));
1118 |         pv_table[ply].move_count = move_count + 1;
1119 |      
1120 |         // non-captures == Quiet|Pawn Push Moves
1121 |         if flags <= Move16.Double_Pawn_Push 
1122 |           update_quiet(chess, move, depth, quiet_moves, num_quiets, cap_moves, num_captures);
1123 |         tt_store(chess.hash, best_score, TFLAGS.BETA, depth, move);
1124 |         return best_score;
1125 |       } 
1126 |     }
1127 | 
1128 |     moves_searched += 1;
1129 |   }
1130 | 
1131 |   if move_count == 0 then
1132 |     return ifx in_check(chess) then -INF+ply else 0+ply;
1133 |   tt_store(chess.hash, best_score, .ALPHA, depth, bmove);
1134 |   return best_score;
1135 | }
1136 | 
// History update performed when a quiet move causes a beta cutoff.
//
//   move          the quiet move being rewarded.
//   depth         remaining depth of the node; the bonus is depth*depth, capped at 512.
//   quiet_moves   quiet moves tried at the node (first num_quiets entries are valid).
//   cap_moves     tactical moves tried at the node (first num_captures entries are valid).
//
// The other tried quiet moves and all tried tactical moves are penalized,
// `move` is rewarded, and `move` becomes the first killer of the current ply.
// The counter-move tables for 1, 2 and 4 plies back are weighted 2x, 1x and 1/2x.
update_quiet :: (chess: *ChessGame, move: Move16, depth: int, quiet_moves: []Move16, num_quiets: int, cap_moves: []Move16, num_captures: int) {
  bonus : s32 = min(cast(s32) (depth*depth), 512);
  prev1, to1 := get_prev_move(chess, 1);
  prev2, to2 := get_prev_move(chess, 2);
  prev4, to4 := get_prev_move(chess, 4);

  using chess.history;

  // Penalize the quiet moves that were tried before the cutoff move.
  for idx: 0..num_quiets-1 {
    tried := quiet_moves[idx];
    if tried == move then continue;

    mover  := piece_at(chess, get_move16_from(tried));
    target := get_move16_to(tried);
    incr_history(*history_moves[mover][target], -bonus);
    incr_history(*counter_history[prev1][to1][mover][target], -bonus*2);
    incr_history(*counter_history[prev2][to2][mover][target], -bonus);
    incr_history(*counter_history[prev4][to4][mover][target], -bonus/2);
  }

  // Penalize every tactical move that was tried.
  for idx: 0..num_captures-1 {
    tried    := cap_moves[idx];
    attacker := piece_at(chess, get_move16_from(tried));
    target   := get_move16_to(tried);
    victim   := norm(piece_at(chess, target));
    incr_history(*cap_history[attacker][target][victim], -bonus);
  }

  // Reward the cutoff move.
  best_piece  := piece_at(chess, get_move16_from(move));
  best_target := get_move16_to(move);
  incr_history(*history_moves[best_piece][best_target], bonus);
  incr_history(*counter_history[prev1][to1][best_piece][best_target], bonus*2);
  incr_history(*counter_history[prev2][to2][best_piece][best_target], bonus);
  incr_history(*counter_history[prev4][to4][best_piece][best_target], bonus/2);

  // Shift the killers unless the move already is the first killer.
  ply := chess.ply;
  if killer_moves[ply][0] != move {
    killer_moves[ply][1] = killer_moves[ply][0];
    killer_moves[ply][0] = move;
  }
}
1181 | 
// Bounded history update:
//
//   entry += 64*incr - entry*abs(incr)/1024
//
// The second term pulls the entry back toward zero in proportion to the
// size of the update, which keeps it inside [-65536, 65536] for both
// bonuses (incr > 0) and penalties (incr < 0). Without abs() a penalty
// would push an already negative entry further away from the bound with
// every call.
incr_history :: (data: *s32, incr: s32) #expand {
  <<data += (64*incr) - (<<data)*abs(incr) / 1024;
}
1185 | 
// Maps a piece value at or above Piece.B_KING down by 7 and leaves every
// other value untouched. Callers use the result to index history tables.
// (Presumably this folds black pieces onto their white counterparts;
// confirm against the Piece enum.)
norm :: (p: Piece) -> Piece #expand {
  if p < Piece.B_KING
    return p;
  p -= 7;
  return p;
}
1191 | 
// Transposition-table probe for negamax nodes.
//
// The probe is skipped, and (false, empty entry) returned, only when the
// NodeState.SSE bit is clear and `ply` equals chess.exply, i.e. at the node
// whose move is being excluded by a singular-extension verification search.
// The table is looked up with chess.hash; the `hash` parameter is not read.
tt_probe_negamax :: (chess: ChessGame, ply: int, hash: u64, alpha: int, beta: int, depth: int) -> bool, TTData {
  tthit: bool = false;
  ttentry: TTData;

  sse_bit_set := (chess.node_state & NodeState.SSE) != 0;
  if sse_bit_set || ply != chess.exply {
    tthit, ttentry = tt_probe(chess.hash, alpha, beta, depth);
  }

  return tthit, ttentry;
}
1206 | 
// Looks up `hash` in the global transposition table.
//
// Returns (true, data) when the slot's stored key, xor-ed with its payload
// bits (low byte masked off), matches `hash`; in that case the entry's age
// is refreshed to search_age and written back. Returns (false, data of
// whatever occupies the slot) otherwise.
// `alpha`, `beta` and `depth` are accepted but not used by the lookup.
tt_probe :: (hash: u64, alpha: int, beta: int, depth: int) -> bool, TTData {
  slot := hash % cast,no_check (u64) ttable.count;
  prefetch(*ttable[slot], Prefetch_Hint.T0);

  // Work on a local copy of the slot.
  found := ttable[slot];
  if (found.hash ^ (found.padding & ~0xFF)) != hash then
    return false, found.data;

  // change the tt age.
  found.age = search_age;
  memcpy(*ttable[slot], *found, size_of(TTEntry));
  return true, found.data;
}
1219 | 
// Transposition-table cutoff, expanded inline in the caller.
//
// When the probe hit, the node is not a PV node, it is not the root and
// the stored depth is sufficient, the stored score is returned *from the
// calling procedure* if its bound type allows it:
//   EXACT                      always
//   ALPHA (upper bound)        when score <= alpha
//   BETA  (lower bound)        when score >= beta
// Scores within 20 of +/-INF are replaced by INF-ply / -INF+ply first.
tt_prune :: (tthit: bool, ttentry: TTData, ply: int, pv_node: bool, alpha: int, beta: int, depth: int) #expand {
  usable := tthit && !pv_node && ply>0 && ttentry.depth >= depth;
  if usable {
    score: int = ttentry.score;
    if score > INF-20 {
      score = INF-ply;
    } else if score < -INF+20 {
      score = -INF+ply;
    }

    if ttentry.flag == {
    case .EXACT;
      `return score;
    case .ALPHA;
      if score <= alpha then
        `return score;
    case .BETA;
      if score >= beta then
        `return score;
    }
  }

}
1242 | 
// Stores a search result in the global transposition table.
//
// The slot is overwritten only when the new depth is greater than the
// stored one, or when the stored entry carries a different search_age.
// The key is written as hash xor payload bits (low byte masked off), the
// same form tt_probe checks against.
tt_store :: (hash: u64, score: int, flag: TFLAGS, depth: int, ttmove: Move16) {
  slot := hash % cast,no_check(u64) ttable.count;
  prefetch(*ttable[slot], Prefetch_Hint.T0);

  // The payload must be complete before the key is derived from it.
  fresh: TTEntry;
  fresh.score  = cast (s16) score;
  fresh.flag   = flag;
  fresh.depth  = cast, trunc(u8) depth;
  fresh.ttmove = ttmove;
  fresh.age    = search_age;
  fresh.hash   = hash ^ (fresh.padding & ~0xFF);

  existing := *ttable[slot];
  if depth <= existing.depth && existing.age == search_age then
    return;

  memcpy(existing, *fresh, size_of(TTEntry));
}
1258 | 
// perft results are found here: https://www.chessprogramming.org/Perft_Results
// more perft results: http://www.rocechess.ch/perft.html
//
// Runs perft on a fixed list of positions and prints, for every depth,
// whether the node count matches the expected value.
perft_all :: () {
  // Sets up `chess` from `fen` ("startpos" selects the initial position)
  // and runs perft for depths 1..expected.count; expected[i] is the node
  // count for depth i+1.
  setup_perft_run :: ($fen: string, expected: ..int) #expand {
    print("Testing FEN=[%1]%2", fen, NEWLINE);
    if equal(fen, "startpos") {
      chess_startpos(*chess);
    } else {
      chess_fen(*chess, fen);
    }

    for exp, depth: expected {
      perft_run(depth+1, exp);
    }
    print("------------------------------------%1%1", NEWLINE);
  }

  // Runs one timed perft on `chess` and reports success or failure.
  perft_run :: (depth: int, expected: int) #expand {
    time_taken := seconds_since_init();
    nodes := perft(*chess, depth);
    time_taken = seconds_since_init() - time_taken;
    if nodes == expected {
      print("perft successful at the depth of: %1, %2 nodes found, time taken: %3%4", depth, nodes, time_taken, NEWLINE);
    } else {
      // End the line with NEWLINE like every other message here.
      print("FAILED at the depth of: %1, expected: %2, nodes found: %3, time taken: %4%5", depth, expected, nodes, time_taken, NEWLINE);
    }
  }


  chess: Chess;
  setup_perft_run("startpos", 
                  20, 
                  400, 
                  8_902, 
                  197_281, 
                  4_865_609,
                  119_060_324);
  // position 1: initial position
  setup_perft_run(fen="rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1",
                  20, 
                  400, 
                  8_902, 
                  197_281, 
                  4_865_609);

  // position 2: kiwipete by peter mckenzie
  setup_perft_run(fen="r3k2r/p1ppqpb1/bn2pnp1/3PN3/1p2P3/2N2Q1p/PPPBBPPP/R3K2R w KQkq -",
                  48,
                  2_039,
                  97_862,
                  4_085_603,
                  193_690_690);
                  //8_031_647_685);

  // position 3: en passant bugs
  setup_perft_run(fen="8/2p5/3p4/KP5r/1R3p1k/8/4P1P1/8 w - -",
                  14,
                  191,
                  2_812,
                  43_238,
                  674_624,
                  11_030_083,
                  178_633_661);
                  //8_031_647_685);  // <-- this takes way too long...

  // position 4: castling
  setup_perft_run("r3k2r/Pppp1ppp/1b3nbN/nP6/BBP1P3/q4N2/Pp1P2PP/R2Q1RK1 w kq - 0 1",
                  6,
                  264,
                  9_467,
                  422_333,
                  15_833_292,
                  706_045_033);

  // position 4: castling (mirrored)
  setup_perft_run("r2q1rk1/pP1p2pp/Q4n2/bbp1p3/Np6/1B3NBn/pPPP1PPP/R3K2R b KQ - 0 1",
                  6,
                  264,
                  9_467,
                  422_333,
                  15_833_292,
                  706_045_033);

  // position 5: TalkChess discussed. rnbq1k1r/pp1Pbppp/2p5/8/2B5/8/PPP1NnPP/RNBQK2R w KQ - 1 8  
  setup_perft_run("rnbq1k1r/pp1Pbppp/2p5/8/2B5/8/PPP1NnPP/RNBQK2R w KQ - 1 8",
                  44,
                  1_486,
                  62_379,
                  2_103_487,
                  89_941_194);

  // position 6: Steven Edwards weirdness
  setup_perft_run("r4rk1/1pp1qppp/p1np1n2/2b1p1B1/2B1P1b1/P1NP1N2/1PP1QPPP/R4RK1 w - - 0 10",
                  46,
                  2_079,
                  89_890,
                  3_894_594,
                  164_075_551);

  // position 7: promotion bugs
  setup_perft_run("n1n5/PPPk4/8/8/8/8/4Kppp/5N1N b - - 0 1",
                  24,
                  496,
                  9_483,
                  182_838,
                  3_605_103);

  print("All perft tests complete!%1", NEWLINE);
}
1368 | 
// "Divide" variant of perft: prints the perft node count below every move
// of the current position, then the number of moves, the total node count,
// the elapsed time and nodes per second.
// Returns the total node count, or 1 when depth <= 0.
perft_divide :: (chess: *Chess, depth: int) -> int {
  started := seconds_since_init();
  if depth <= 0 then return 1;

  moves: Moves;
  generate_moves(chess, *moves);

  total_nodes := 0;
  root_moves  := 0;

  // make/unmake does not restore en passant by itself, so it is saved here.
  saved_ep := chess.en_passant;
  for idx: 0..moves.count-1 {
    move := moves.array[idx];

    cap, flags, hash := make_move(chess, move);
    subtree := perft(chess, depth-1);
    _, from, to := decode_move16(move);
    print("%1%2: %3 nodes%4", cast(serialized_bb)from, cast(serialized_bb)to, subtree, NEWLINE);
    total_nodes += subtree;
    unmake_move(chess, move, cap, flags, hash);
    chess.en_passant = saved_ep;

    root_moves += 1;
  }

  print("moves from here: %1%2%2", root_moves, NEWLINE);
  elapsed := seconds_since_init() - started;
  print("Total %1 nodes%3Time taken: %2%3", total_nodes, elapsed, NEWLINE);
  print("NPS: %1%2", cast(float)total_nodes / elapsed, NEWLINE);
  return total_nodes;
}
1399 | 
INF :: S16_MAX; // Alias: search "infinity". Mate scores are written as +/-INF adjusted by ply, and tt_store casts scores to s16.
1401 | 
1402 | #import "Machine_X64";
1403 | // search capture moves only, we do this to avoid horizon effect
1404 | // see: https://www.chessprogramming.org/Quiescence_Search
1405 | quiescene :: (chess: *ChessGame, alpha: int, beta: int, fifty: int = 0) -> int {
1406 |   pv :: (chess: *ChessGame, body: Code, f: For_Flags) #expand {
1407 |     stage: enum {PV; CAPTURES; QUIETS; END; } = .PV;
1408 | 
1409 |     move_queue: MoveQueue(32);
1410 |     unsafe: u64 = 0;
1411 |     check: u64 = 0;
1412 |     turn := chess.turn;
1413 |     while outer_loop := !stop() {
1414 |       while move_queue.count <= 0 {
1415 |         if stage == {
1416 |         case .PV;
1417 |           stage = .CAPTURES;
1418 |           if is_legal(chess, `ttmove) then {
1419 |             append(*move_queue, 0x7FFF, `ttmove);
1420 |           }
1421 |           if `hashmove != `ttmove && is_legal(chess, `hashmove) then {
1422 |             append(*move_queue, 0x7FFF-1, `hashmove);
1423 |           }
1424 |         case .CAPTURES;
1425 |           unsafe, check = generate_attacks(chess);
1426 |           if check == 0 then {
1427 |             stage = .END;
1428 |           } else {
1429 |             stage = .QUIETS;
1430 |           }
1431 |           generate_tactics(unsafe, check, chess, *move_queue);
1432 |           filter_capture_pv(*move_queue, `ttmove, `hashmove);
1433 |           sort_captures(*move_queue, chess);
1434 |         case .QUIETS;
1435 |           stage = .END;
1436 |           generate_quiets(unsafe, check, chess, *move_queue);
1437 |           filter_capture_pv(*move_queue, `ttmove, `hashmove);
1438 |           sort_quiets(*move_queue, chess);
1439 |         case .END;
1440 |           break outer_loop; // terminate loop
1441 |         }
1442 |       }
1443 |       `it, `it_index := pop(*move_queue);
1444 |       #insert body;
1445 |     }
1446 |   }
1447 | 
1448 |   can_delta_prune :: (using chess: *Chess) -> bool #expand {
1449 |     w := w_queen | w_rook | w_bishop | w_knight;
1450 |     b := b_queen | b_rook | b_bishop | b_knight;
1451 |     return w != 0 || b != 0;
1452 |   }
1453 | 
1454 |   is_promoting_pawn :: inline (using chess: *Chess) -> bool {
1455 |     return (w_pawn & cast(u64)rank_7) != 0 || (b_pawn & cast(u64)rank_2) != 0; 
1456 |   }
1457 | 
1458 |   // a bad capture => true, a good capture => false
1459 |   bad_capture :: (chess: *Chess, move: Move16, from: int, to: int) -> bool {
1460 |     pfrom := cast(int) piece_at(chess, from);
1461 |     pto   := cast(int) piece_at(chess, to);
1462 |     if PVALUE[pfrom] <= PVALUE[pto] then {
1463 |       return false;
1464 |     }
1465 | 
1466 |     return see(chess, move, 0);
1467 |   }
1468 |   ply := chess.ply;
1469 |   pv_node := alpha != beta-1;
1470 |   tthit, ttentry := tt_probe(chess.hash, alpha, beta, 0);
1471 |   tt_prune(tthit, ttentry, ply, pv_node, alpha, beta, 0);
1472 |   best_move := Move16.Quiet;
1473 | 
1474 |   ttmove  := chess.history.pv_table[ply][0];
1475 |   if get_move16_flag(ttmove) < Move16.Capture || !is_legal(chess, ttmove) then {
1476 |     ttmove = 0;
1477 |   } 
1478 | 
1479 |   hashmove := Move16.Quiet;
1480 |   if tthit && ttmove == 0 && get_move16_flag(ttentry.ttmove) >= Move16.Capture then {
1481 |     hashmove = ttentry.ttmove;
1482 |   }
1483 | 
1484 |   defer {
1485 |     using chess.history;
1486 |     memset(*pv_table[ply+1], 0, size_of(PV_Line));
1487 |     killer_moves[ply+1][1] = 0;
1488 |     killer_moves[ply+1][0] = 0;
1489 |   }
1490 | 
1491 |   best_score: int = -INF;
1492 |   hash_flag := TFLAGS.ALPHA;
1493 |   pos_check := in_check(chess);
1494 |   if !pos_check {
1495 |     // no standing pat while in check 
1496 |     best_score = evaluate(chess, fifty);
1497 |     if best_score >= beta then {
1498 |       return best_score;
1499 |     }
1500 | 
1501 |     // delta pruning:
1502 |     if can_delta_prune(chess) {
1503 |       if best_score < (alpha - 975) then {
1504 |         return best_score; 
1505 |       }
1506 |     }
1507 | 
1508 |     if alpha < best_score then {
1509 |       // hash_flag = TFLAGS.EXACT;
1510 |       alpha = best_score; 
1511 |     }
1512 |   }
1513 | 
1514 |   // search captures only, captures negate the fifty move rule
1515 |   ep := chess.en_passant;
1516 |   move_count := 0;
1517 |   for :pv move, move_score: chess {
1518 |     move_count += 1;
1519 |     type, from, to := decode_move16(move);
1520 | 
1521 |     // skip bad captures w/ SEE algorithm.
1522 |     if pos_check == false && bad_capture(chess, move, from, to) then
1523 |       continue;
1524 | 
1525 |     cap, flags := make_move(chess, move);
1526 |     next_fifty := fifty_move(chess, move, fifty);
1527 |     score := -quiescene(chess, -beta, -alpha, next_fifty);
1528 |     unmake_move(chess, move, cap, flags);
1529 |     chess.en_passant = ep;
1530 |     if score >= best_score {
1531 |       best_score = score;
1532 |       if best_score > alpha then {
1533 |         hash_flag = TFLAGS.EXACT;
1534 |         best_move = move;
1535 |         alpha = best_score;
1536 |      
1537 |         using chess.history;
1538 |         pv_table[ply][0] = move;
1539 |         move_count := pv_table[ply+1].move_count;
1540 |         memcpy(*pv_table[ply][1], *pv_table[ply+1][0], move_count * size_of(Move16));
1541 |         pv_table[ply].move_count = move_count + 1;
1542 |         if best_score >= beta then {
1543 |           tt_store(chess.hash, best_score, TFLAGS.BETA, 0, move);
1544 |           return best_score;
1545 |         }
1546 |      
1547 |       }
1548 |     }
1549 | 
1550 |   }
1551 | 
1552 |   if move_count == 0 && pos_check then {
1553 |     return -INF+ply;
1554 |   }
1555 | 
1556 |   tt_store(chess.hash, best_score, hash_flag, 0, best_move);
1557 |   return best_score;
1558 | 
1559 | }
1560 | 
1561 | PV_Line :: struct {
     |   // One principal-variation line. The first `move_count` slots of `argmove`
     |   // hold the moves; the quiescence search builds a line by storing its own move
     |   // in slot 0 and copying the child ply's line behind it.
1562 |   move_count: u16;
1563 |   argmove: [79] Move16;
1564 | }
1565 | 
1566 | operator [] :: (using p: PV_Line, index: int) -> Move16 {
     |   // Read access: line[index] is the move stored in that slot.
1567 |   return argmove[index];
1568 | }
1569 | 
1570 | operator []= :: (using p: *PV_Line, index: int, move: Move16) #expand {
     |   // Write access: line[index] = move. Does not touch move_count.
1571 |   argmove[index] = move;
1572 | }
1573 | 
1574 | operator *[] :: (p: *PV_Line, index: int) -> *Move16 #expand {
     |   // Address of a slot, used as source/destination for memcpy of line tails.
1575 |   return *p.argmove[index];
1576 | }
1577 | 
1578 | operator [] :: (using m: Moves, index: int) -> Move16 {
     |   // Read access to the index-th generated move of a Moves list.
1579 |   return array[index];
1580 | }
1581 | 
1582 | time_begin: float64;
1583 | 
1584 | // performance test
1585 | // walks the move generation tree of strictly legal moves to count the leaf nodes
     | // Returns the number of leaf positions reachable from `chess` in exactly `depth` plies.
1586 | perft :: (chess: *Chess, depth: int) -> int {
     |   // for-expansion used by `for :gen_moves mov: chess` below: runs the loop body
     |   // once for every tactical move, then once for every quiet move. `it` is the
     |   // move, `it_index` a running counter across both batches.
1587 |   gen_moves :: (chess: *Chess, body: Code, f: For_Flags) #expand {
1588 |     moves: Moves;
1589 |     unsafe, check := generate_attacks(chess);
1590 |     generate_tactics(unsafe, check, chess, *moves);
1591 |     i := 0;
1592 |     `it := Move16.Quiet;
1593 |     `it_index := 0;
1594 |     while i < moves.count {
1595 |       it = moves[i];
1596 |       #insert body;
1597 |       i += 1;
1598 |       it_index += 1;
1599 |     }
     |     // reuse the same buffer for the quiet moves
1600 |     moves.count = 0;
1601 |     memset(*moves, 0, size_of(Moves));
1602 |     generate_quiets(unsafe, check, chess, *moves);
1603 |     i = 0;
1604 |     while i < moves.count {
1605 |       it = moves[i];
1606 |       #insert body;
1607 |       i += 1;
1608 |       it_index += 1;
1609 |     }
1610 |   }
1611 |   if depth == 0 then return 1;
1612 | 
     |   // at the last ply only the number of moves matters, so use the counting-only
     |   // Moves(true) variant, which has no storage array.
1613 |   if depth == 1 then {
1614 |     moves: Moves(true);
1615 |     generate_moves(chess, *moves);
1616 |     return moves.count;
1617 |   }
1618 | 
1619 |   num_nodes := 0;
     |   // the en passant state is restored by hand after every unmake_move
1620 |   ep := chess.en_passant;
1621 |   for :gen_moves mov: chess {
1622 |     cap, castling, hash := make_move(chess, mov);
1623 | 
1624 |     num_nodes += perft(chess, depth-1);
1625 | 
1626 |     unmake_move(chess, mov, cap, castling, hash);
1627 |     chess.en_passant = ep;
1628 |   }
1629 |   return num_nodes;
1630 | 
1631 | }
1632 | 
1633 | #no_reset LateMoveReduction: [64][64] int;
     | // Filled at compile time: entry [i][j] = truncate(log(i) * log(j) * 0.5) for
     | // i, j in 1..63; row 0 and column 0 are never written and stay zero.
     | // NOTE(review): the two indices are presumably search depth and move number -- confirm at the use site.
1634 | #run {
1635 |   for i: 1..63 {
1636 |     for j: 1..63 {
1637 |       LMR := log(cast(float)i) * log(cast(float)j) * 0.5;
1638 |       LateMoveReduction[i][j] = cast (int) LMR;
1639 |     }
1640 |   }
1641 | }
1642 | 
1643 | // most valuable victim, least valuable attacker.
1644 | // see: https://www.chessprogramming.org/MVV-LVA
     | // Returns victim_value - attacker_value/100 + 10000: the victim dominates the
     | // score and, among captures of the same victim, the cheaper attacker ranks higher.
1645 | mvv_lva :: (victim: Piece, attacker: Piece) -> score: s32 {
     |   // Ordering value per piece code, built at compile time. Both colours use the
     |   // same values so the ordering does not depend on the side to move.
1646 |   piece_score :: #run -> [13] s32 {
1647 |     using Piece;
1648 |     array: [13] s32;
1649 |     array[cast(int)NONE     ] = 0;
1650 |     array[cast(int)W_KING   ] = 9000;
1651 |     array[cast(int)W_QUEEN  ] = 9000;
1652 |     array[cast(int)W_ROOK   ] = 5000;
1653 |     array[cast(int)W_BISHOP ] = 3500;
1654 |     array[cast(int)W_KNIGHT ] = 3250;
1655 |     array[cast(int)W_PAWN   ] = 1000;
1656 | 
1657 |     array[cast(int)B_KING   ] = 9000;
1658 |     array[cast(int)B_QUEEN  ] = 9000;
1659 |     array[cast(int)B_ROOK   ] = 5000;
1660 |     array[cast(int)B_BISHOP ] = 3500;
     |     // was 1000 (the pawn value), which ranked black knights as victims/attackers
     |     // like pawns; it has to mirror W_KNIGHT.
1661 |     array[cast(int)B_KNIGHT ] = 3250;
1662 |     array[cast(int)B_PAWN   ] = 1000;
1663 |     return array;
1664 |   }
1665 | 
1666 |   v := cast(int) victim;
1667 |   a := cast(int) attacker;
1668 |   vs := piece_score[v] - (piece_score[a]/100) + 10000;
1669 |   return vs;
1670 | }
1671 | 
1672 | nodes_searched := 0;
1673 | 
1674 | History :: struct {
     |   // Per-search move-ordering state and principal variations.
     |   // pv_table and killer_moves are indexed by ply (see the quiescence search, which
     |   // resets entry ply+1 of both when it returns). counter_history is indexed first
     |   // by [previous piece][previous destination square] (see get_countermove_history).
     |   // NOTE(review): index meaning of cap_history / history_moves is not visible here;
     |   // presumably [piece][square](/[captured type]) -- confirm against the search code.
1675 |   pv_table:        [128] PV_Line;
1676 |   cap_history:     [13][64][7] s32;
1677 |   history_moves:   [13][64] s32;
1678 |   killer_moves:    [128][2] Move16;
1679 |   counter_history: [13][64][13][64] s32;
1680 | }
1681 | 
1682 | TTEntry :: struct {
     |   // One transposition-table slot: the position hash plus the stored search data.
     |   // The union overlays TTData with a u64 so the payload can also be handled as a
     |   // single 64-bit word.
1683 |   hash: u64;
1684 |   union {
1685 |     using data: TTData;
1686 |     padding: u64;
1687 |   }
1688 | }
1689 | 
1690 | ttable: [] TTEntry;
1691 | 
1692 | init_ttable :: (size: int = 16_000_000) {
     |   // (Re)allocates the global transposition table so that it occupies roughly
     |   // `size` bytes, releasing any previously allocated table first.
     |   // Every entry starts out zeroed.
1693 |   if ttable.count > 0 then
1694 |     array_free(ttable);
     |   // Never create an empty table: element 0 is addressed right below, and a
     |   // zero-length table can neither be indexed nor used as a modulus for hashes.
1695 |   num_entries := max(size / size_of(TTEntry), 1);
1696 |   ttable = NewArray(num_entries, TTEntry);
1697 |   memset(ttable.data, 0, size_of(TTEntry)*num_entries);
1698 | }
1699 | 
1700 | TTData :: struct {
     |   // Payload of a transposition-table entry (overlaid with a u64 inside TTEntry).
1701 |   age   : u8; // note: we put age as the first byte to make it easier to mask.
1702 |   flag  : TFLAGS; // kind of bound the score represents
1703 |   depth : u8;     // depth stored with the entry (the quiescence search stores 0)
1704 |   ttmove: Move16; // move stored with the entry
1705 |   score : s16;
1706 | }
1707 | 
1708 | search_age : u8 = 0;
1709 | 
1710 | TFLAGS :: enum u8 { EXACT; ALPHA; BETA; } // as stored by quiescence: ALPHA = no move raised alpha, EXACT = a move raised alpha, BETA = score >= beta cutoff
1711 | 
1712 | Clear_Hash :: () {
     |   // Zeroes every entry of the global transposition table.
     |   // Does nothing when the table has not been allocated yet, instead of taking the
     |   // address of element 0 of an empty array.
     |   if ttable.count == 0 return;
1713 |   memset(ttable.data, 0, size_of(TTEntry)*ttable.count);
1714 | }
1715 | 
1716 | Moves :: struct(perft1 := false) {
     |   // Flat move list filled by add_move. With perft1 == true the storage array is
     |   // compiled out and only `count` is maintained (used by perft at depth 1).
     |   // NOTE(review): add_move does not check `count` against the 64-slot capacity.
1717 |   count: s32;
1718 |   #if perft1 == false {
1719 |     array: [64] Move16;
1720 |   }
1721 | }
1722 | 
1723 | add_move :: (moves: *Moves, from: int, to: int, flags: Move16) #expand {
1724 |   // Appends the encoded move to the list. The counting-only variant
1725 |   // (perft1 == true) has no array, so only the counter advances.
1726 |   #if moves.perft1 == false {
1727 |     moves.array[moves.count] = to_move16(from, to, flags);
1728 |   }
1729 |   moves.count += 1;
1730 | }
1730 | 
1731 | add_move :: (queue: *MoveQueue, from: int, to: int, flags: Move16) #expand {
1732 |   // Queue flavour of add_move: the move enters with priority 0.
1733 |   append(queue, 0, to_move16(from, to, flags));
1734 | }
1735 | 
1736 | // static exchange evaluation.
     | // Plays out the capture sequence on the destination square of `move`, each side
     | // always recapturing with its least valuable attacker, and returns
     | // `gain[0] < threshold` -- i.e. true when the exchange comes out below the
     | // threshold (bad_capture treats true as "bad capture").
     | // NOTE(review): the early exit below returns false when the victim alone is worth
     | // less than `threshold`, which is the opposite verdict of the final comparison, and
     | // gain[0] already has `threshold` subtracted before it is compared with `threshold`
     | // again. Neither matters for the only call visible here (threshold == 0) -- confirm
     | // before calling with a non-zero threshold.
1737 | see :: (chess: *Chess, move: Move16, threshold: int) -> bool {
     |   // All pieces of both colours that attack square `sq` given occupancy `occ`.
1738 |   attacks_to :: (occ: u64, sq: int) -> u64 #expand {
1739 |     to := cast,no_check(u64) (1 << sq);
1740 |     atts : u64 = 0;
1741 |     atts |= pawn_captures(to, Turn.BLACK) & chess.w_pawn;
1742 |     atts |= pawn_captures(to, Turn.WHITE) & chess.b_pawn;
1743 |     atts |= knight_moves(to) & (chess.w_knight|chess.b_knight);
1744 |     atts |= bishop_moves(sq, occ) & (chess.w_bishop|chess.b_bishop|chess.w_queen|chess.b_queen);
1745 |     atts |= rook_moves(sq, occ) & (chess.w_rook|chess.b_rook|chess.w_queen|chess.b_queen);
1746 |     atts |= king_moves(to) & (chess.w_king|chess.b_king);
1747 |     return atts;
1748 |   }
1749 | 
     |   // Sliders that attack `to` under the reduced occupancy `occ`; restricting the
     |   // result to `occ` drops pieces that were already removed from the exchange.
1750 |   consider_x_rays :: (occ: u64, to: int, from_set: u64, bishops: u64, rooks: u64) -> u64 #expand {
1751 |     atts : u64 = 0;
1752 |     atts |= bishop_moves(to, occ) & bishops;
1753 |     atts |= rook_moves(to, occ) & rooks;
1754 |     return atts & occ;
1755 |   }
1756 | 
     |   // Picks the cheapest piece of side `turn` inside `attadef`. Returns a one-bit
     |   // bitboard for it plus its piece code, or 0 / Piece.NONE when there is none.
     |   // The code is always the white variant; it is only used to index PVALUE, where
     |   // both colours carry the same value.
1757 |   get_least_valuable_piece :: (attadef: u64, turn: Turn)-> bitboard: u64, piece: int #expand {
1758 | 
1759 |     least_valuable_piece :: (attadef: u64, $turn: Turn) -> bitboard: u64, piece: int #expand {
1760 |       subset: u64 = 0;
1761 |       pawns := get_pawn(chess, turn);
1762 |       subset = attadef & pawns;
1763 |       if subset then
1764 |         return get_bit(subset), xx Piece.W_PAWN;
1765 | 
1766 |       knights := get_knight(chess, turn);
1767 |       subset = attadef & knights;
1768 |       if subset then
1769 |         return get_bit(subset), xx Piece.W_KNIGHT;
1770 | 
1771 |       bishops := get_bishop(chess, turn);
1772 |       subset = attadef & bishops;
1773 |       if subset then
1774 |         return get_bit(subset), xx Piece.W_BISHOP;
1775 | 
1776 |       rooks := get_rook(chess, turn);
1777 |       subset = attadef & rooks;
1778 |       if subset then
1779 |         return get_bit(subset), xx Piece.W_ROOK;
1780 | 
1781 |       queens := get_queen(chess, turn);
1782 |       subset = attadef & queens;
1783 |       if subset then
1784 |         return get_bit(subset), xx Piece.W_QUEEN;
1785 | 
1786 |       kings := get_king(chess, turn);
1787 |       subset = attadef & kings;
1788 |       if subset then
1789 |         return get_bit(subset), xx Piece.W_KING;
1790 | 
1791 |       return 0, xx Piece.NONE;
1792 |     }
     |     // `turn` is a runtime value but the helper wants a compile-time one, hence the branch
1793 |     if turn == Turn.WHITE {
1794 |       bitboard, piece := least_valuable_piece(attadef, Turn.WHITE);
1795 |       return bitboard, piece;
1796 |     } else {
1797 |       bitboard, piece := least_valuable_piece(attadef, Turn.BLACK);
1798 |       return bitboard, piece;
1799 |     }
1800 |   }
1801 | 
     |   // Isolates the lowest set bit: bits & -bits.
1802 |   get_bit :: (bits: u64) -> u64 {
1803 |     b: int = cast, no_check(int) bits;
1804 |     b = -b;
1805 |     return bits & (cast, no_check(u64)b);
1806 |   }
1807 | 
1808 |   _, from, to := decode_move16(move);
1809 |   target := cast(int) piece_at(chess, to);
1810 |   apiece := cast(int) piece_at(chess, from);
1811 | 
     |   // gain[d] = speculative material balance after d captures, from the viewpoint
     |   // of the side that made capture d.
     |   // NOTE(review): the loop below has no explicit bound on d versus the 16 slots -- confirm this is enough.
1812 |   gain: [16] s16;
1813 |   d := 0;
     |   // removing one of these piece types from the square's lines may uncover a slider behind it
1814 |   may_x_ray: u64 = chess.w_pawn | chess.w_bishop | chess.w_rook | chess.w_queen
1815 |                  | chess.b_pawn | chess.b_bishop | chess.b_rook | chess.b_queen;
1816 |   from_set := cast,no_check(u64) 1 << from;
1817 |   occupied := chess.occupied;
1818 |   attadef  := attacks_to(occupied, to);
1819 |   gain[d] = PVALUE[target] - cast(s16)threshold;
1820 |   if gain[d] < 0 then
1821 |     return false;
1822 | 
1823 |   turn := chess.turn;
1824 | 
1825 |   bishops := (chess.w_bishop|chess.b_bishop|chess.w_queen|chess.b_queen);
1826 |   rooks   := (chess.w_rook|chess.b_rook|chess.w_queen|chess.b_queen);
1827 | 
1828 |   while from_set != 0 {
1829 |     d += 1; // next depth and side
1830 |     turn ^= 1;
1831 |     gain[d] = PVALUE[apiece] - gain[d-1]; // speculative store, if defended
1832 | 
1833 |     // prune.
1834 |     if max(-gain[d-1], gain[d]) < 0 then
1835 |       break; 
1836 | 
1837 |     attadef  ^= from_set; // reset bit in set to traverse
1838 |     occupied ^= from_set; // reset bit in temporary occupancy (for x-Rays)
1839 |     if from_set & may_x_ray then
1840 |       attadef |= consider_x_rays(occupied, to, from_set, bishops, rooks);
1841 | 
1842 |     from_set, apiece = get_least_valuable_piece(attadef, turn);
1843 |   }
1844 | 
     |   // fold the speculative gains back to the root, walking i from d-1 down to 1:
     |   // at every level the side to move may decline to continue the exchange.
1845 |   for #v2 < i: 1..d-1 {
1846 |     gain[i-1]= -max(-gain[i-1], gain[i]);
1847 |   }
1848 | 
1849 |   return gain[0] < threshold;
1850 | 
1851 | }
1852 | 
1853 | PVALUE :: #run -> [13] s16 {
     |   // Compile-time table of material values indexed by Piece code; both colours
     |   // share the same values. Used by bad_capture and see.
1854 |   using Piece;
1855 |   array: [13] s16;
1856 |   array[cast(int)NONE     ] = 0;
1857 |   array[cast(int)W_KING   ] = 10000;
1858 |   array[cast(int)W_QUEEN  ] = 1000;
1859 |   array[cast(int)W_ROOK   ] = 500;
1860 |   array[cast(int)W_BISHOP ] = 300;
1861 |   array[cast(int)W_KNIGHT ] = 300;
1862 |   array[cast(int)W_PAWN   ] = 100;
1863 | 
1864 |   array[cast(int)B_KING   ] = 10000;
1865 |   array[cast(int)B_QUEEN  ] = 1000;
1866 |   array[cast(int)B_ROOK   ] = 500;
1867 |   array[cast(int)B_BISHOP ] = 300;
1868 |   array[cast(int)B_KNIGHT ] = 300;
1869 |   array[cast(int)B_PAWN   ] = 100;
1870 |   return array;
1871 | }
1872 | 
1873 | get_bestmove :: (c: *ChessGame) -> Move16 #expand {
     |   // The best move is the first move of the principal variation stored for ply 0.
1874 |   return c.history.pv_table[0][0];
1875 | }
1876 | 
1877 | copy_chessgame :: (dest: *ChessGame, src: *ChessGame) {
     |   // Copies the board, the search limits/counters and the per-game move, hash and
     |   // eval lists from `src` into `dest`. The history tables, `nnue`, `node_state`,
     |   // `main_thread` and `score` of `dest` are left untouched.
1878 |   memcpy(*dest.chess, *src.chess, size_of(Chess));
1879 |   // skip history, since that is zero.
1880 |   dest.ply = src.ply;
1881 |   dest.maxply = src.maxply;
1882 |   dest.depth = src.depth;
1883 |   dest.maxnodes = src.maxnodes;
1884 |   dest.movetime = src.movetime;
1885 |   dest.maxdepth = src.maxdepth;
1886 |   dest.excluded_move = src.excluded_move;
1887 |   dest.exply = src.exply;
1888 |   dest.probcut = src.probcut;
1889 | 
1890 |   // don't copy NNUEdata or node_state.
1891 |   array_copy(*dest.moves, src.moves);
1892 |   array_copy(*dest.rtable, src.rtable);
1893 |   array_copy(*dest.eval, src.eval);
1894 |   dest.fifty = src.fifty;
1895 | }
1896 | 
1897 | ChessGame :: struct {
     |   // A board plus everything one search needs: heuristics, limits and the
     |   // per-ply stacks maintained by make_move / unmake_move.
1898 |   #as using chess: Chess; // the board itself; a *ChessGame can be used where a *Chess is expected
1899 |   history: History;
1900 |   ply: int;      // incremented by make_move / make_null_move, decremented on unmake
1901 |   maxply: int;   // highest ply reached so far
1902 |   depth: int;
1903 |   maxnodes: int; // search limits as set by go_search; -1 when the GUI gave none
1904 |   movetime: int;
1905 |   maxdepth: int;
1906 |   excluded_move: Move16;
1907 |   exply: int;
1908 |   probcut: s8 = 0;
1909 |   main_thread := false;
1910 |   fifty: int;    // fifty-move counter at the search root (set by go_search)
1911 |   score: s16 = 0;
1912 | 
1913 |   nnue: [] NNUEdata;  // one slot per ply; make_move prepares slot ply+1
1914 |   node_state: NodeState;
1915 |   moves: [..] Move32; // moves made so far, pushed by make_move
1916 |   rtable: [..] u64;   // position hashes, pushed alongside `moves`; scanned by is_draw
1917 |   eval: [..] s16;
1918 | } 
1919 | 
1920 | free_chess_game :: (chess: *ChessGame) {
     |   // Releases the arrays owned by the game and then the ChessGame itself, so
     |   // `chess` must point at a heap-allocated ChessGame and is invalid afterwards.
1921 |   array_free(chess.nnue);
1922 |   array_free(chess.moves);
1923 |   array_free(chess.rtable);
1924 |   array_free(chess.eval);
1925 |   free(chess);
1926 | }
1927 | 
1928 | clear_history :: (chess: *ChessGame) {
1929 |   // History holds nothing but the PV table, capture/quiet history, killer moves
1930 |   // and counter-move history, so zeroing the whole struct resets all of them.
1931 |   memset(*chess.history, 0, size_of(History));
1932 | }
1936 | 
1937 | NodeState :: enum_flags u8 {
     |   // Bit flags for ChessGame.node_state.
     |   // NOTE(review): meaning of the flags is not visible in this part of the file -- confirm at the use sites.
1938 |   NULL;
1939 |   SSE;
1940 | }
1941 | 
1942 | Move32 :: struct {
     |   // A played move together with the piece that made it (entry of ChessGame.moves).
1943 |   mov16: Move16;
1944 |   piece: Piece;
1945 | }
1946 | 
1947 | clear :: (using c: *ChessGame) {
     |   // Resets the per-game move, hash and eval lists; ply and history are not touched.
1948 |   array_reset(*moves);
1949 |   array_reset(*rtable);
1950 |   array_reset(*eval);
1951 | }
1952 | 
1953 | // init chess game
     | // Reserves room for MAX entries in the move/hash/eval lists and allocates the
     | // NNUE stack with MAX slots, requesting 64-byte alignment.
1954 | initialize_chess_game_memory :: (using c: *ChessGame) {
1955 |   MAX :: 500;
1956 |   array_reserve(*moves, MAX);
1957 |   array_reserve(*rtable, MAX);
1958 |   array_reserve(*eval, MAX);
1959 |   nnue = NewArray(MAX, NNUEdata, alignment=64);
1960 |   //assert((cast(s64)nnue.data % 64) == 0);
1961 | }
1962 | 
1963 | is_draw :: (using,except(fifty) c: *ChessGame, fifty: int) -> bool {
     |   // Returns true when the position is drawn by insufficient material or by
     |   // repetition. `fifty` is how many of the most recent rtable entries to scan.
     | 
     |   // insufficient material: bare kings, or three pieces with a bishop or knight on the board
1964 |   count_p := popcount(c.occupied);
1965 |   if count_p <= 3 then {
1966 |     if count_p == 2 then {
1967 |       return true;
1968 |     }
1969 |     if (w_bishop|w_knight|b_bishop|b_knight) & c.occupied then {
1970 |       return true;
1971 |     }
1972 |   }
1973 | 
     |   // repetition: walk the hash history backwards; the second entry equal to the
     |   // current hash makes it a draw.
1974 |   hash_val := c.hash;
1975 |   repeat_count := 0;
1976 |   index := rtable.count-1;
     |   // stop at the start of the stored history as well: `fifty` may be larger than
     |   // the number of hashes recorded, and rtable must not be read at index -1.
1977 |   while fifty > 0 && index >= 0 {
1978 |     if rtable[index] == hash_val then {
1979 |       repeat_count += 1;
1980 |       if repeat_count >= 2 then
1981 |         return true;
1982 |     }
1983 |     index -= 1;
1984 |     fifty -= 1;
1985 |   }
1986 | 
1987 |   return false;
1988 | }
1989 | 
1990 | make_move :: (c: *ChessGame, move: Move16) -> Piece, Castling {
     |   // Plays `move` on the game: counts the node, records the NNUE piece changes for
     |   // the next ply, makes the move on the board, pushes the move and the hash
     |   // returned by the board onto the game stacks and advances the ply.
     |   // Returns the first two results of the board-level make_move, which the caller
     |   // must hand back to unmake_move.
1991 | 
     |   // Fills nnue[ply+1].dirtyPiece with the pieces that move, vanish (to = 64) or
     |   // appear (from = 64) and marks that ply's accumulator as not computed.
     |   // Must run before the board changes because it reads the pieces via piece_at.
     |   // Reads `flags`, `from` and `to` from the caller's scope.
1992 |   make_move_nnue :: (using c: *ChessGame, move: Move16) #expand {
1993 |     dp := *nnue[ply+1].dirtyPiece;
1994 |     nnue[ply+1].accumulator.computedAccumulation = 0;
1995 |     dp.dirtyNum = 1;
1996 |     // remove captured piece.
1997 |     if flags == Move16.Capture || flags >= Move16.Knight_Promotion_Capture {
1998 |       dp.dirtyNum = 2;
1999 |       dp.pc[1] = xx piece_at(c, to);
2000 |       dp.from[1] = xx to;
2001 |       dp.to[1] = 64;
2002 |     } else if flags == Move16.Ep_Capture {
2003 |       epto := bit_scan_forward(ifx c.turn==Turn.WHITE chess.en_passant>>8 else chess.en_passant<<8);
2004 |       dp.dirtyNum = 2;
2005 |       dp.pc[1] = xx piece_at(c, epto);
2006 |       dp.from[1] = xx epto;
2007 |       dp.to[1] = 64;
2008 |     }
2009 | 
2010 |     dp.pc[0] = xx piece_at(c, from);
2011 |     dp.from[0] = xx from;
2012 |     dp.to[0] = xx to;
2013 | 
     |     // promotion: the pawn disappears and the promoted piece appears on `to`
2014 |     if flags >= Move16.Knight_Promotion then {
2015 |       pic : s32 = xx Piece.NONE;
2016 |       if flags == Move16.Knight_Promotion || flags == Move16.Knight_Promotion_Capture {
2017 |         pic = xx (ifx turn == Turn.WHITE Piece.W_KNIGHT else Piece.B_KNIGHT);
2018 |       } else if flags == Move16.Bishop_Promotion || flags == Move16.Bishop_Promotion_Capture {
2019 |         pic = xx (ifx turn == Turn.WHITE Piece.W_BISHOP else Piece.B_BISHOP);
2020 |       } else if flags == Move16.Rook_Promotion || flags == Move16.Rook_Promotion_Capture {
2021 |         pic = xx (ifx turn == Turn.WHITE Piece.W_ROOK else Piece.B_ROOK);
2022 |       } else if flags == Move16.Queen_Promotion || flags == Move16.Queen_Promotion_Capture {
2023 |         pic = xx (ifx turn == Turn.WHITE Piece.W_QUEEN else Piece.B_QUEEN);
2024 |       }
2025 | 
2026 |       dp.to[0] = 64;
2027 |       dp.pc[dp.dirtyNum] = pic;
2028 |       dp.from[dp.dirtyNum] = 64;
2029 |       dp.to[dp.dirtyNum] = xx to;
2030 |       dp.dirtyNum += 1;
2031 |     }
2032 | 
     |     // castling: the rook is the second piece that moves
2033 |     if flags == Move16.King_Castle {
2034 |       pic: s32 = xx (ifx turn == Turn.WHITE Piece.W_ROOK else Piece.B_ROOK);
2035 |       from_castle: s32 = xx (ifx turn == Turn.WHITE serialized_bb.h1 else serialized_bb.h8);
2036 |       to_castle: s32 = xx (ifx turn == Turn.WHITE serialized_bb.f1 else serialized_bb.f8);
2037 |       dp.dirtyNum = 2;
2038 |       dp.pc[1] = pic;
2039 |       dp.from[1] = from_castle;
2040 |       dp.to[1] = to_castle;
2041 |     }
2042 | 
2043 |     if flags == Move16.Queen_Castle {
2044 |       pic : s32 = xx (ifx turn == Turn.WHITE Piece.W_ROOK else Piece.B_ROOK);
2045 |       from_castle: s32 = xx (ifx turn == Turn.WHITE serialized_bb.a1 else serialized_bb.a8);
2046 |       to_castle: s32 = xx (ifx turn == Turn.WHITE serialized_bb.d1 else serialized_bb.d8);
2047 |       dp.dirtyNum = 2;
2048 |       dp.pc[1] = pic;
2049 |       dp.from[1] = from_castle;
2050 |       dp.to[1] = to_castle;
2051 |     }
2052 |   }
2053 |   fetch_and_add(*nodes_searched);
2054 |   flags, from, to := decode_move16(move);
2055 |   piece := c.pieces[from];
2056 |   move32: Move32;
2057 |   move32.mov16 = move;
2058 |   move32.piece = piece;
2059 |   make_move_nnue(c, move);
2060 | 
     |   // (an unused `key := hash % ttable.count` used to follow the next line; it had
     |   // no effect except dividing by zero when the transposition table is not allocated)
2061 |   cap, castling, hash := make_move(*c.chess, move);
2063 |   array_add(*c.moves, move32);
2064 |   array_add(*c.rtable, hash);
2065 | 
2066 |   c.ply += 1;
2067 |   c.maxply = max(c.maxply, c.ply);
2068 |   return cap, castling;
2069 | }
2070 | 
2071 | unmake_move :: (c: *ChessGame, move: Move16, cap: Piece, castling: Castling) {
     |   // Reverses make_move: pops the move and hash stacks, hands the popped hash to
     |   // the board-level unmake_move and steps the ply back. `cap` and `castling` are
     |   // the values make_move returned for this move.
2072 |   pop(*c.moves);
2073 |   hash := pop(*c.rtable);
2074 |   unmake_move(*c.chess, move, cap, castling, hash);
2075 |   c.ply -= 1;
2076 | }
2077 | 
2078 | get_prev_move :: (chess: *ChessGame, num: int) -> piece: int, to: int {
2079 |   // Moving piece and destination square of the move played `num` plies ago;
2080 |   // 0, 0 when the current ply is smaller than `num`.
2081 |   if chess.ply >= num {
2082 |     entry := chess.moves[chess.moves.count - num];
2083 |     piece, to := piece_to(entry);
2084 |     return piece, to;
2085 |   }
2086 |   return 0, 0;
2087 | }
2086 | 
2087 | get_improving :: (using c: *ChessGame) -> int {
     |   // Returns 1 when the newest stored evaluation is higher than the one two
     |   // entries earlier (the same side's previous entry), otherwise 0.
     |   // The eval.count check keeps the look-back inside the list; `ply > 1` alone
     |   // does not guarantee that three evaluations have been recorded.
2088 |   if ply > 1 && eval.count >= 3 && eval[eval.count-1] > eval[eval.count-3] then
2089 |     return 1;
2090 |   else
2091 |     return 0;
2092 | }
2093 | 
2094 | piece_to :: (mov: Move32) -> int, int #expand {
     |   // Splits a recorded move into (moving piece as int, destination square).
2095 |   to := get_move16_to(mov.mov16);
2096 |   return cast(int)mov.piece, to;
2097 | }
2098 | 
2099 | make_null_move :: (c: *ChessGame) -> ep: u64 {
     |   // Passes the turn: makes a null move on the board, pushes a zeroed Move32 and
     |   // the returned hash onto the game stacks and advances the ply.
     |   // Returns the value the caller must hand back to unmake_null_move.
2100 | 
     |   // No piece changes: the next ply starts from a copy of the current accumulator
     |   // with an empty dirty-piece list.
2101 |   make_null_move_nnue :: (using c: *ChessGame) #expand {
2102 |     memcpy(*c.nnue[c.ply+1].accumulator, *c.nnue[c.ply].accumulator, size_of(Accumulator));
2103 |     dp := *c.nnue[c.ply+1].dirtyPiece;
2104 |     dp.dirtyNum = 0;
2105 |   }
2106 | 
2107 |   ep, hash := make_null_move(*c.chess);
2108 |   null_move: Move32 = Move32.{0, 0};
2109 |   array_add(*c.moves, null_move);
2110 |   array_add(*c.rtable, hash);
2111 |   make_null_move_nnue(c);
2112 |   c.ply += 1;
2113 |   c.maxply = max(c.maxply, c.ply);
2114 |   return ep;
2115 | 
2116 | }
2117 | 
2118 | unmake_null_move :: (c: *ChessGame, ep: u64) {
     |   // Reverses make_null_move; `ep` is the value make_null_move returned.
2119 |   pop(*c.moves);
2120 |   hash := pop(*c.rtable);
2121 |   unmake_null_move(*c.chess, ep, hash);
2122 |   c.ply -= 1;
2123 | }
2124 | 
2125 | get_countermove_history :: (chess: *ChessGame, num: int) -> [][64] s32 {
     |   // Selects the counter-move history sub-table keyed by the move played `num`
     |   // plies ago (its piece and destination square). When no such move exists
     |   // get_prev_move yields 0, 0 and the [0][0] sub-table is returned.
2126 |   using chess.history;
2127 |   prev, to := get_prev_move(chess, num);
2128 |   return counter_history[prev][to];
2129 | }
2130 | 
2131 | // fetch and add.
     | // Increments the integer behind `val` by one; the previous value is not returned.
     | // On x64 the increment is an atomic read-modify-write, so several search threads
     | // can bump the same counter (e.g. nodes_searched) without losing updates.
2132 | fetch_and_add :: (val: *int) #expand {
2133 |   #if CPU == .X64 {
     |     // xadd on its own is not atomic across cores; the lock prefix makes it so.
2134 |     #asm {
2135 |       mov incr: gpr, 1;
2136 |       lock_xadd.q [val], incr;
2137 |     }
2138 |   } else {
2139 |     // unknown cpu architecture. just default to simple incrementing of number.
2140 |     // this doesn't support multi-threading...but it should be okay-ish...
2141 |     val.* += 1;
2142 |   }
2143 | }
2144 | 
2145 | multi_pv: int = 1;
2146 | 
2147 | heapify :: (queue: *MoveQueue, index: int) {
2148 |   // Sift-down for the max-heap kept in queue.array[0 .. queue.count-1]: the entry
2149 |   // at `index` sinks until neither child has a strictly higher priority.
2150 |   heap: [] Queue_Pair = queue.array;
2151 |   size := queue.count;
2152 |   node := index;
2153 |   while true {
2154 |     left  := 2 * node + 1;
2155 |     right := left + 1;
2156 | 
2157 |     top := node;
2158 |     if left  < size && heap[left].priority  > heap[top].priority then top = left;
2159 |     if right < size && heap[right].priority > heap[top].priority then top = right;
2160 |     if top == node then return;
2161 | 
2162 |     displaced := heap[node];
2163 |     heap[node] = heap[top];
2164 |     heap[top]  = displaced;
2165 |     node = top;
2166 |   }
2167 | }
2172 | 
2173 | pop :: (queue: *MoveQueue) -> move: Move16, priority: s32 {
2174 |   // Removes and returns the root of the heap (highest priority). The last entry
2175 |   // takes the root's place and is sifted down. The queue must not be empty.
2176 |   top := queue.array[0];
2177 |   queue.count -= 1;
2178 |   queue.array[0] = queue.array[queue.count];
2179 |   heapify(queue, 0);
2180 |   return top.move, top.priority;
2181 | }
2181 | 
2182 | construct_heap :: (queue: *MoveQueue) {
     |   // Turns the first queue.count entries of the queue into a max-heap by sifting
     |   // down every node that can have a child, from the last such node up to the root.
2183 | 
     |   // Start from the last parent of the *used* part of the array. The capacity
     |   // (queue.array.count) was used here before, which sifted every slot of the
     |   // upper half of the buffer even for a handful of moves; those extra calls were
     |   // no-ops because heapify bounds itself by queue.count.
2184 |   count := queue.count/2 - 1;
2185 |   for #v2 < index: 0..count {
2186 |     heapify(queue, index);
2187 |   }
2188 | 
2189 | }
2190 | 
2191 | MoveQueue :: struct(SZ: int = 128) {
     |   // Fixed-capacity priority queue of moves. Entries [0, count) are in use;
     |   // construct_heap / heapify / pop treat them as a binary max-heap on priority.
2192 |   count: int;
2193 |   array: [SZ] Queue_Pair;
2194 | }
2195 | 
2196 | append :: (queue: *MoveQueue, priority: s16, move: Move16) {
2197 |   // Stores the pair in the next free slot. No heap order is restored and the
2198 |   // capacity is not checked here.
2199 |   slot := *queue.array[queue.count];
2200 |   slot.priority = priority;
2201 |   slot.move = move;
2202 |   queue.count += 1;
2203 | }
2202 | 
2203 | Queue_Pair :: struct {
     |   // One MoveQueue entry: a move and the ordering priority it is sorted by.
2204 |   priority: s32;
2205 |   move: Move16;
2206 | }
2207 | 


--------------------------------------------------------------------------------
/uci.jai:
--------------------------------------------------------------------------------
  1 | main :: () {
    |   // UCI front end: initialises the engine, then reads commands line by line and
    |   // dispatches them until "quit" arrives. Each command is tested by its own `if`,
    |   // so unrecognised input is silently ignored.
  2 | 
    |   // ChessGame wrappers around the board-level setup functions: they additionally
    |   // mark the NNUE accumulator of the current ply as not computed.
  3 |   chess_startpos :: (chessgame: *ChessGame) #expand {
  4 |     chess_startpos(*chessgame.chess);
  5 |     chessgame.nnue[chessgame.ply].accumulator.computedAccumulation = 0;
  6 |   }
  7 | 
  8 |   chess_fen :: (chessgame: *ChessGame, fen_string: string) -> bool #expand {
  9 |     chessgame.nnue[chessgame.ply].accumulator.computedAccumulation = 0;
 10 |     return chess_fen(*chessgame.chess, fen_string);
 11 |   }
 12 | 
 13 |   nnue_startup();
 14 |   init_global_bitboards();
 15 |   initialize_move_randomness();
 16 |   init_ttable();
 17 |   initialize_chess_game_memory(*chess);
 18 |   chess.main_thread = true;
 19 |   chess_startpos(*chess);
    |   // fifty-move counter of the current root position, updated by "position"
 20 |   fifty := 0;
 21 | 
 22 |   for :getline input: os {
 23 |     reset_temporary_storage();
 24 |     if equal(input, "quit") {
 25 |       free_threads();
 26 |       return;
 27 |     }
 28 | 
    |     // identification plus the list of supported options
 29 |     if equal(input, "uci") {
 30 |       print(uci_response);
 31 |       print("option name Clear Hash type button%1", NEWLINE);
 32 |       print("option name Hash type spin default 16 min 1 max 2000%1", NEWLINE);
 33 |       print("option name Threads type spin default 1 min 1 max 512%1", NEWLINE);
 34 |       print("option name MultiPV type spin default 1 min 1 max 100%1", NEWLINE);
 35 |       print("option name Difficulty type spin default 8 min 1 max 8%1", NEWLINE);
 36 |       print("uciok%1", NEWLINE);
 37 |     }
 38 |     
 39 |     if equal(input, "isready") {
 40 |       print("readyok%", NEWLINE);
 41 |     } 
 42 | 
 43 |     if equal(input, "ucinewgame") {
 44 |       Clear_Hash();
 45 |       chess_startpos(*chess);
 46 |     } 
 47 | 
    |     // non-standard debugging commands
 48 |     if equal(input, "perft_all") {
 49 |       perft_all();
 50 |     } 
 51 | 
 52 |     if equal(input, "eval") {
 53 |       eval := uci_evaluate(*chess);
 54 |       print_chess(*chess);
 55 |       push_allocator(temp);
 56 |       str := to_fen_string(*chess);
 57 |       print("FEN=[%1]%2", str, NEWLINE);
 58 |       print("Evaluate = %1 cp%2%2", eval, NEWLINE);
 59 |     }
 60 | 
 61 |     if begins_with(input, "position ") {
 62 |       fifty = parse_position(input, *chess);
 63 |     }
 64 | 
 65 |     if begins_with(input, "go") {
 66 |       go_search(input, *chess, fifty);
 67 |     }
 68 | 
 69 |     if begins_with(input, "setoption ") {
 70 |       set_option(input);
 71 |     }
 72 |   }
 73 | }
 74 | 
 75 | #scope_file
 76 | os: OS;
 77 | 
 78 | parse_position :: (line: string, chess: *ChessGame) -> fifty: int {
    |   // Handles "position [fen <fen> | startpos] [moves <m1> <m2> ...]": sets up the
    |   // position and plays the listed moves on `chess`.
    |   // Returns the number of consecutive quiet non-pawn moves at the end of the move
    |   // list; 0 when the input is not understood or the FEN is invalid (the start
    |   // position is set up in the latter case).
    |   // A malformed move token stops the move list with an "info string" message; the
    |   // moves before it stay applied.
 79 |   clear(chess);
 82 |   fifty := 0;
 83 |   input := advance(line, 9);
 84 |   input = eat_spaces(input);
 85 |   if begins_with(input, "fen ") {
 86 |     input = advance(input, 4);
 87 |     input = eat_spaces(input);
 88 |     index :=  find_index_from_left(input, "moves");
 89 |     fen_string := ifx index != -1 then slice(input, 0, index) else input;
 90 |     if !chess_fen(chess, fen_string) {
 91 |       print("invalid fen %1%2", fen_string, NEWLINE);
 92 |       chess_startpos(chess);
 93 |       return 0;
 94 |     }
 95 | 
 96 |     if index != -1 {
 97 |       input = advance(input, index);
 98 |       input = advance(input, 5);
 99 |       input = eat_spaces(input);
100 |     } else {
101 |       input = advance(input, input.count);
102 |     }
103 | 
104 |   } else if begins_with(input, "startpos") {
105 |     input = advance(input, 8);
106 |     input = eat_spaces(input);
107 |     chess_startpos(chess);
108 |     index := find_index_from_left(input, "moves");
109 |     if index != -1 {
110 |       input = advance(input, index);
111 |       input = advance(input, 5);
112 |       input = eat_spaces(input);
113 |     } else {
114 |       input = advance(input, input.count);
115 |     }
116 |   } else {
117 |     // we don't know what the input is, so return
118 |     return 0;
119 |   }
120 | 
121 |   // we setup the position, now just to parse the moves
122 |   while input {
123 |     token := parse_token(*input);
    |     // a move is "<file><rank><file><rank>" plus an optional promotion letter;
    |     // anything shorter cannot be indexed below.
    |     if token.count < 4 {
    |       print("info string error invalid move [%1]%2", token, NEWLINE);
    |       break;
    |     }
124 |     x1 := cast(int)(token[0] - #char "a");
125 |     x2 := cast(int)(token[1] - #char "1");
126 |     y1 := cast(int)(token[2] - #char "a");
127 |     y2 := cast(int)(token[3] - #char "1");
    |     // reject coordinates outside a..h / 1..8 before they become square indices
    |     if x1 < 0 || x1 > 7 || x2 < 0 || x2 > 7 || y1 < 0 || y1 > 7 || y2 < 0 || y2 > 7 {
    |       print("info string error invalid move [%1]%2", token, NEWLINE);
    |       break;
    |     }
128 |     from := x2*8 + x1;
129 |     to   := y2*8 + y1;
130 | 
131 |     promote := ifx token.count == 5 then token[4] else 0;
132 |     move_type := get_move16_flag(chess, from, to, promote);
133 |     move := to_move16(from, to, move_type);
134 |     make_move(chess, move);
    |     // after the move the moved piece stands on `to`: only a quiet move by a
    |     // non-pawn keeps the counter running.
135 |     if (piece_at(chess,to) != Piece.W_PAWN && piece_at(chess,to) != Piece.B_PAWN) && move_type == Move16.Quiet {
136 |       fifty += 1;
137 |     } else {
138 |       fifty = 0;
139 |     }
140 |   }
141 | 
142 |   return fifty;
143 | }
144 | 
145 | go_search :: (line: string, chess: *ChessGame, fifty: int) {
146 |   // Handles the UCI "go" command. "go perft <n>" runs perft_divide; otherwise the
147 |   // keyword/value pairs are read, the limits are stored on `chess`, the search
148 |   // runs and "bestmove" is printed.
149 |   // Recognised keywords, each followed by one integer: depth, nodes, movetime,
150 |   // wtime, btime, winc, binc, movestogo. Clock values are only taken for the side
151 |   // to move. An unknown keyword or an unparsable value is reported as an
152 |   // "info string error" naming the offending keyword, and no search is started.
153 |   KEYWORDS :: string.["depth", "nodes", "movetime", "wtime", "btime", "winc", "binc", "movestogo"];
154 | 
155 |   parse_token(*line); // skip "go"
156 |   token := parse_token(*line);
157 |   if equal(token, "perft") {
158 |     depth, TF := parse_int(*line);
159 |     if !TF return;
160 |     perft_divide(chess, depth);
161 |     return;
162 |   }
163 | 
164 |   // -1 means "limit not given"
165 |   depth := -1;
166 |   nodes := -1;
167 |   movetime := -1;
168 |   movestogo := -1;
169 | 
170 |   time := -1;
171 |   incr := 0;
172 | 
173 |   // the loop runs while text follows the current token, so a trailing keyword
174 |   // without a value (e.g. "go infinite") is never examined.
175 |   while line {
176 |     known := false;
177 |     for KEYWORDS {
178 |       if it == token {
179 |         known = true;
180 |         break;
181 |       }
182 |     }
183 |     if !known {
184 |       // report the keyword itself (the remaining line used to be printed here)
185 |       print("info string error unable to parse [%1]%2", token, NEWLINE);
186 |       return;
187 |     }
188 | 
189 |     value, ok := parse_int(*line);
190 |     if !ok {
191 |       print("info string error unable to parse %1%2", token, NEWLINE);
192 |       return;
193 |     }
194 | 
195 |     if token == {
196 |     case "depth";     depth = value;
197 |     case "nodes";     nodes = value;
198 |     case "movetime";  movetime = value;
199 |     case "movestogo"; movestogo = value;
200 |     case "wtime";     if chess.turn == .WHITE then time = value;
201 |     case "btime";     if chess.turn == .BLACK then time = value;
202 |     case "winc";      if chess.turn == .WHITE then incr = value;
203 |     case "binc";      if chess.turn == .BLACK then incr = value;
204 |     }
205 | 
206 |     token = parse_token(*line);
207 |     token = eat_spaces(token);
208 |   }
209 | 
210 |   // a clock without an explicit movetime: let time management pick the budget
211 |   if time > -1 && movetime == -1 then {
212 |     movetime = time_management(time, incr, movestogo, chess.ply);
213 |   }
214 | 
215 |   chess.maxnodes = nodes;
216 |   chess.movetime = movetime;
217 |   chess.maxdepth = depth;
218 |   chess.fifty = fifty;
219 |   mov := uci_search(chess);
220 | 
221 |   push_allocator(temp);
222 |   str := to_string(mov);
223 |   print("bestmove %1%2", str, NEWLINE);
224 | }
258 | 
259 | // Handles the UCI "setoption name <id> [value <x>]" command.
260 | // Recognised ids, matched exactly: "Clear Hash", "Hash", "MultiPV", "Threads", "Difficulty".
261 | // Any problem is reported through an "info string error" line and nothing is applied.
262 | set_option :: (line: string) {
263 |   // The first token is thrown away (presumably the "setoption" word itself).
264 |   parse_token(*line);
265 |
266 |   // The second token has to be the literal "name".
267 |   keyword, ok := parse_token(*line);
268 |   if !ok || !equal(keyword, "name") {
269 |     print("info string error. unable to parse setoption%", NEWLINE);
270 |     return;
271 |   }
272 |
273 |   // Text in front of " value " is the option id; text behind it is the argument.
274 |   remainder := eat_spaces(line);
275 |   found, name, argument := split_from_left(remainder, " value ");
276 |
277 |   if name == {
278 |   case "Clear Hash";
279 |     // This option takes no argument at all.
280 |     if argument {
281 |       print("info string error. Clear Hash cannot be assigned a value%", NEWLINE);
282 |       return;
283 |     }
284 |
285 |     print("info string Transposition Table Cleared%", NEWLINE);
286 |     Clear_Hash();
287 |
288 |   case "Hash";
289 |     // Accepted range: 1..2000. The number is scaled by one million for init_ttable.
290 |     hash_size, parsed := parse_int(*argument);
291 |     if !parsed || hash_size < 1 || hash_size > 2000 {
292 |       print("info string error. Invalid Hash Value%", NEWLINE);
293 |       return;
294 |     }
295 |
296 |     init_ttable(hash_size * 1_000_000);
297 |
298 |   case "MultiPV";
299 |     // Accepted range: 1..100.
300 |     pv_count, parsed := parse_int(*argument);
301 |     if !parsed || pv_count < 1 || pv_count > 100 {
302 |       print("info string error. Invalid MultiPV Value%", NEWLINE);
303 |       return;
304 |     }
305 |
306 |     set_multi_pv(pv_count);
307 |
308 |   case "Threads";
309 |     // Accepted range: 1..512.
310 |     thread_count, parsed := parse_int(*argument);
311 |     if !parsed || thread_count < 1 || thread_count > 512 {
312 |       print("info string error. Invalid Thread Value%", NEWLINE);
313 |       return;
314 |     }
315 |
316 |     set_threads(thread_count);
317 |
318 |   case "Difficulty";
319 |     // Accepted range: 1..8.
320 |     level, parsed := parse_int(*argument);
321 |     if !parsed || level < 1 || level > 8 {
322 |       print("info string error. Invalid Difficulty Value%", NEWLINE);
323 |       return;
324 |     }
325 |
326 |     set_difficulty(level);
327 |
328 |   case;
329 |     // Not an option this procedure handles.
330 |     print("info string error. invalid name: [%1]%2", name, NEWLINE);
331 |   }
332 | }
333 | 
334 | // Decides the movetime for one move. time, incr and the result are in milliseconds.
335 | //   time      - time left on the clock
336 | //   movestogo - moves until the next time control; -1 (or anything below 1) means unknown
337 | //   ply       - current ply; used to estimate the moves left when movestogo is unknown
338 | time_management :: (time: int, incr: int, movestogo: int, ply: int) -> movetime: int {
339 |   OVERHEAD :: 75; // held back from every budget
340 |
341 |   // Unknown movestogo: spread the clock over 60 - ply moves, but never fewer than 20.
342 |   // Requiring movestogo > 0 also keeps "movestogo 0" from dividing by zero below.
343 |   div := max(60 - ply, 20);
344 |   if movestogo > 0 then div = movestogo;
345 |
346 |   // An increment larger than the remaining clock is ignored.
347 |   bonus := ifx incr > time then 0 else incr;
348 |
349 |   budget := time / div - OVERHEAD;
350 |   if budget < 0 {
351 |     // The clock share does not cover the overhead: fall back to the increment alone,
352 |     // replacing a negative remainder by one millisecond.
353 |     budget = 0;
354 |     bonus -= OVERHEAD;
355 |     if bonus < 0 then bonus = 1;
356 |   }
357 |
358 |   return budget + bonus;
359 | }
360 | 
361 | // Global game state read and written by the command handling code above, aligned to 64 bytes.
362 | chess: ChessGame #align 64;
363 | 
364 | 
365 | 
366 | 
367 | 
368 | 
369 | 


--------------------------------------------------------------------------------
/windows.jai:
--------------------------------------------------------------------------------
  1 | // contains windows OS specific code.
  2 |
  3 | // Suffix of executable file names on Windows.
  4 | EXE :: ".exe";
  5 | NEWLINE :: "\r\n"; // terminator the UCI code appends to the lines it prints
  6 |
  7 | // Empty placeholder type; getline takes a pointer to it as its first parameter.
  8 | OS :: struct {
  9 |   none: void;
 10 | }
 11 | 
 12 | // Defines the AI uci message loop as a for-expansion. Each iteration exposes one input line
 13 | // as `it`, without its terminator. Lines may end in "\r\n" or in a bare "\n", and one read may
 14 | // deliver several of them. `break` in the loop body leaves the whole message loop.
 15 | getline :: (os: *OS, loop_body: Code, flags: For_Flags) #expand {
 16 |   stdin = GetStdHandle(STD_INPUT_HANDLE);
 17 |   `it_index := 0; // declared for the loop body; never advanced
 18 |   while outer := true {
 19 |     stopping = false;
 20 |     memset(buffera.data, 0, size_of(type_of(buffera)));
 21 |     memset(bufferb.data, 0, size_of(type_of(bufferb)));
 22 |     bytes_read: u32;
 23 |     if !ReadFile(stdin, buffera.data, buffera.count, *bytes_read, null) then {
 24 |       sleep_milliseconds(25); // failed read: wait a little, then try again
 25 |       continue;
 26 |     }
 27 |
 28 |     messages := to_string(buffera.data, cast(int)bytes_read);
 29 |     while messages {
 30 |       // Split on "\n" so both terminator styles separate lines; a "\r" in front of it is dropped.
 31 |       found, `it, rest := split_from_left(messages, "\n");
 32 |       if ends_with(it, "\r") then it.count -= 1;
 33 |
 34 |       #insert (break=break outer) loop_body;
 35 |       messages = rest;
 36 |       if !found break;
 37 |     }
 38 |   }
 39 | }
 40 | 
 41 | // Expands in the caller and makes the caller return 0 as soon as the search has to stop:
 42 | // the stop flag is set, `nodes` reached `maxnodes`, more than `movetime` milliseconds passed
 43 | // since `time_begin`, or a "stop" line arrived on stdin. "isready" is answered, "quit" exits.
 44 | read_input :: (main_thread: bool, nodes: int, maxnodes: int, time_begin: float64, movetime: int) #expand {
 45 |   if stopping == true then
 46 |     `return 0;
 47 |
 48 |   // Everything below only runs when the low 13 bits of `nodes` are all set.
 49 |   if (nodes & 8191) == 8191 {
 50 |     if nodes >= maxnodes {
 51 |       stopping = true;
 52 |       `return 0;
 53 |     }
 54 |     // Only the main thread watches the clock and stdin.
 55 |     if main_thread == false
 56 |       return;
 57 |
 58 |     now := seconds_since_init();
 59 |     elapsed_ms: int = xx (1000.0 * (now - time_begin));
 60 |     if elapsed_ms > movetime {
 61 |       stopping = true;
 62 |       `return 0;
 63 |     }
 64 |
 65 |     bytes_read: u32;
 66 |     bytes_available: u32;
 67 |     success := PeekNamedPipe(stdin, null, 0, null, *bytes_available, null) != 0;
 68 |     if success && bytes_available {
 69 |       ReadFile(stdin, bufferb.data, cast(u32) bufferb.count, *bytes_read, null);
 70 |       str := to_string(bufferb.data, bytes_read);
 71 |       while str {
 72 |         // Test one line at a time, with its "\r\n" or "\n" terminator removed.
 73 |         found, msg, rest := split_from_left(str, "\n");
 74 |         if ends_with(msg, "\r") then msg.count -= 1;
 75 |         if equal(msg, "isready") {
 76 |           print("readyok\r\n");
 77 |         }
 78 |         if equal(msg, "stop") {
 79 |           stopping = true;
 80 |           `return 0;
 81 |         }
 82 |         if equal(msg, "quit") {
 83 |           exit(0);
 84 |         }
 85 |         str = rest;
 86 |         if !found break;
 87 |       }
 88 |     }
 89 |   }
 90 | }
 91 | // Reports the current value of the stop flag.
 92 | stop :: () -> bool #expand {
 93 |   return stopping;
 94 | }
 95 | 
 96 | #scope_file
 97 | buffera: [4096] u8; // filled by the ReadFile call in getline
 98 | bufferb: [4096] u8; // filled by the ReadFile call in read_input; also zeroed by getline
 99 | stopping: bool = false; // stop flag: set by read_input, cleared by getline, reported by stop
100 | // Standard input handle; assigned in getline from GetStdHandle(STD_INPUT_HANDLE).
101 | stdin: HANDLE;
102 | 
103 | 
104 | #import "Windows";
105 | #import "Windows_Utf8";
106 | #import "Basic";
107 | #import "String";
108 | #import "System"; // For get_path_of_running_executable.
109 | 


--------------------------------------------------------------------------------