/*
 * Repository layout:
 *   ├── Makefile
 *   ├── bitwise.h
 *   ├── LICENSE
 *   ├── main.c
 *   └── impl.h
 *
 * /Makefile:
 *   all:
 *       # wow what a fancy makefile
 *       gcc main.c
 */

/* ---- /bitwise.h: fixed-width integer aliases and small bit helpers ---- */

/* The header names were lost when this dump was extracted; these two are the
 * ones required by the identifiers used below (bool, uintN_t, int64_t). */
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;

typedef int64_t s64;

/**
 * Returns a 64-bit value with bits [lo, hi) set and every other bit clear.
 *
 * @param lo index of the lowest set bit (inclusive); values below 0 are
 *           treated as 0.
 * @param hi index one past the highest set bit; values above 64 are treated
 *           as 64.
 * @return the mask, or 0 when the range is empty (hi <= lo).
 */
static inline u64 mask(int lo, int hi) {
    if (lo < 0) {
        lo = 0;
    }
    if (hi > 64) {
        hi = 64;
    }
    if (hi <= lo) {
        return 0;
    }

    int width = hi - lo;

    // A shift by the full width of the type is undefined, so the all-ones
    // case is spelled out instead of computed as (1 << 64) - 1.
    u64 low_bits = (width >= 64) ? ~(u64) 0 : (((u64) 1 << width) - 1);
    return low_bits << lo;
}

/**
 * Returns bit n of x.
 *
 * @param n bit index; indices outside 0..63 name no bit and yield false.
 */
static inline bool bit(u64 x, int n) {
    if (n < 0 || n >= 64) {
        return false;
    }
    return (x >> n) & 1;
}

/**
 * Sign-extends a from_size-bit value to to_size bits.
 *
 * If bit (from_size - 1) of value is set, bits [from_size, to_size) are set
 * as well; otherwise value is returned unchanged. Bits at or above to_size
 * are never modified.
 */
static inline u64 sign_extend(u64 value, int from_size, int to_size) {
    bool negative = bit(value, from_size - 1);
    if (negative) {
        value |= mask(from_size, to_size);
    }
    return value;
}

/**
 * Arithmetic shift right of a size-bit two's-complement value.
 *
 * @param value the operand; bit (size - 1) is its sign bit.
 * @param shift number of bit positions to shift; values <= 0 leave the
 *              operand unshifted, values >= 64 leave only copies of the sign.
 * @param size  width of the operand in bits.
 * @return the shifted value, truncated to the low size bits.
 */
static inline u64 asr(u64 value, int shift, int size) {
    u64 extended = sign_extend(value, size, 64);
    bool negative = bit(extended, 63);
    u64 shifted;

    // Done on the unsigned representation so the result does not depend on
    // how the compiler implements >> on negative signed integers.
    if (shift <= 0) {
        shifted = extended;
    } else if (shift >= 64) {
        shifted = negative ? ~(u64) 0 : 0;
    } else {
        shifted = extended >> shift;
        if (negative) {
            shifted |= mask(64 - shift, 64);
        }
    }

    return shifted & mask(0, size);
}

/*
 * /LICENSE:
 *
 * Copyright (c) 2024 zaydlang
 *
 * This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software.
 */
If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 8 | 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 9 | 3. This notice may not be removed or altered from any source distribution. 10 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | 2 | #define PC_BUILD 1 3 | #include "impl.h" 4 | #undef printf 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | // generate interesting random numbers 11 | unsigned int get_rand_32() { 12 | retry: 13 | switch (rand() & 0b1111) { 14 | case 0: return rand() & 0xFF; 15 | case 1: return rand() & 0xFFFF; 16 | case 2: return rand() & 0xFFFFFF; 17 | case 3: return 0xFFFFFFFF - (rand() & 0xFF); 18 | case 4: return 0xFFFFFFFF - (rand() & 0xFFFF); 19 | case 5: return 0xFFFFFFFF - (rand() & 0xFFFFFF); 20 | case 6: return 0xAAAAAAAA & (rand() & 0xFFFF) | ((rand() & 0xFFFF) << 16); 21 | case 7: return 0x55555555 & (rand() & 0xFFFF) | ((rand() & 0xFFFF) << 16); 22 | case 8: return 0; 23 | case 9: return ((rand() & 0xFFFF) | ((rand() & 0xFFFF) << 16)); 24 | 25 | default: 26 | goto retry; 27 | // return (rand() & 0xFFFF) | ((rand() & 0xFFFF) << 16); 28 | } 29 | } 30 | 31 | bool guess_mul_zero(uint32_t x) { 32 | // the carry flag's behavior for short mul when the multiplicand is 0: 33 | // used for some crude testing of the mul instruction, before doing full 34 | // out fuzzing on a GBA 35 | 36 | if (x >> 8 == 0xFFFFFF) { 37 | unsigned int masked_x = x & 0xFF; 38 | if (masked_x >= 0xC0) return false; 39 | return (masked_x & 0x55) != 0; 40 | } 41 | 42 | if (x >> 16 == 0xFFFF) { 43 | unsigned int masked_x = x & 0xFFFF; 44 | if (masked_x >= 0xC000) return false; 45 | return (masked_x & 0x5555) != 0; 46 | } 47 | 48 | if (x >> 24 == 0xFF) { 49 | unsigned int masked_x = x & 
0xFFFFFF; 50 | if (masked_x >= 0xC00000) return false; 51 | return (masked_x & 0x555555) != 0; 52 | } 53 | 54 | else { 55 | return (x >> 30) == 2; 56 | } 57 | 58 | } 59 | 60 | int main() { 61 | srand(time(NULL)); 62 | 63 | printf("Running mul carry regression tests...\n"); 64 | for (int i = 0; i < 256; i++) { 65 | unsigned int multiplicand = 0; 66 | unsigned int multiplier = 0xFFFFFFFF - i; 67 | 68 | unsigned int guess = mul(multiplicand, multiplier).carry; 69 | unsigned int actual = guess_mul_zero(multiplier); 70 | 71 | if (guess != actual) { 72 | printf("[MUL CARRY REGRESSION] Failed: %llx * %llx = %llx, got %llx\n", multiplicand, multiplier, actual, guess); 73 | return 1; 74 | } else { 75 | printf("[MUL CARRY REGRESSION] Passed: %llx * %llx = %llx, got %llx\n", multiplicand, multiplier, actual, guess); 76 | } 77 | } 78 | 79 | printf("Running mul regression tests...\n"); 80 | for (int i = 0; i < 10000; i++) { 81 | unsigned int multiplicand = get_rand_32(); 82 | unsigned int multiplier = get_rand_32(); 83 | 84 | unsigned int guess = mul(multiplicand, multiplier).output; 85 | unsigned int actual = (unsigned int) multiplicand * (unsigned int) multiplier; 86 | 87 | if (guess != actual) { 88 | printf("[MUL REGRESSION] Failed: %llx * %llx = %llx, got %llx\n", multiplicand, multiplier, actual, guess); 89 | return 1; 90 | } else { 91 | printf("[MUL REGRESSION] Passed: %llx * %llx = %llx, got %llx\n", multiplicand, multiplier, actual, guess); 92 | } 93 | } 94 | 95 | printf("Running mla regression tests...\n"); 96 | for (int i = 0; i < 10000; i++) { 97 | unsigned int multiplicand = get_rand_32(); 98 | unsigned int multiplier = get_rand_32(); 99 | unsigned int accumulate = get_rand_32(); 100 | 101 | unsigned int guess = mla(multiplicand, multiplier, accumulate).output; 102 | unsigned int actual = (unsigned int) multiplicand * (unsigned int) multiplier + (unsigned int) accumulate; 103 | 104 | if (guess != actual) { 105 | printf("[MLA REGRESSION] Failed: %llx * %llx + %llx = 
%llx, got %llx\n", multiplicand, multiplier, accumulate, actual, guess); 106 | return 1; 107 | } 108 | } 109 | 110 | printf("Running umull regression tests...\n"); 111 | for (int i = 0; i < 10000; i++) { 112 | unsigned int multiplicand = get_rand_32(); 113 | unsigned int multiplier = get_rand_32(); 114 | unsigned int accumulate = get_rand_32(); 115 | 116 | unsigned long long guess = umull(multiplicand, multiplier).output; 117 | unsigned long long actual = (unsigned long long) multiplicand * (unsigned long long) multiplier; 118 | 119 | if (guess != actual) { 120 | printf("[UMULL REGRESSION] Failed #%d: %llx * %llx = %llx, got %llx\n", i, multiplicand, multiplier, actual, guess); 121 | return 1; 122 | } else { 123 | printf("[UMULL REGRESSION] Passed #%d: %llx * %llx = %llx, got %llx\n", i, multiplicand, multiplier, actual, guess); 124 | } 125 | } 126 | 127 | printf("Running umlal regression tests...\n"); 128 | for (int i = 0; i < 10000; i++) { 129 | unsigned int multiplicand = get_rand_32(); 130 | unsigned int multiplier = get_rand_32(); 131 | unsigned int accumulate = get_rand_32(); 132 | unsigned int accumulate2 = get_rand_32(); 133 | 134 | unsigned long long guess = umlal(accumulate, accumulate2, multiplicand, multiplier).output; 135 | unsigned long long actual_acc = (unsigned long long) accumulate + ((unsigned long long) accumulate2 << 32); 136 | unsigned long long actual = 137 | (unsigned long long) multiplicand * (unsigned long long) multiplier + actual_acc; 138 | 139 | if (guess != actual) { 140 | printf("[UMLAL REGRESSION] Failed: %llx * %llx + %llx = %llx, got %llx\n", multiplicand, multiplier, actual_acc, actual, guess); 141 | return 1; 142 | } else { 143 | printf("[UMLAL REGRESSION] Passed: %llx * %llx + %llx = %llx, got %llx\n", multiplicand, multiplier, actual_acc, actual, guess); 144 | } 145 | } 146 | 147 | printf("Running smull regression tests...\n"); 148 | for (int i = 0; i < 10000; i++) { 149 | unsigned int multiplicand = get_rand_32(); 150 | 
unsigned int multiplier = get_rand_32(); 151 | 152 | long long guess = smull(multiplicand, multiplier).output; 153 | long long actual = (long long) (int) multiplicand * (long long) (int) multiplier; 154 | 155 | if (guess != actual) { 156 | printf("[SMULL REGRESSION] Failed: %llx * %llx = %llx, got %llx\n", multiplicand, multiplier, actual, guess); 157 | return 1; 158 | } else { 159 | printf("[SMULL REGRESSION] Passed: %llx * %llx = %llx, got %llx\n", multiplicand, multiplier, actual, guess); 160 | } 161 | } 162 | 163 | printf("Running smlal regression tests...\n"); 164 | for (int i = 0; i < 10000; i++) { 165 | unsigned int multiplicand = get_rand_32(); 166 | unsigned int multiplier = get_rand_32(); 167 | unsigned int accumulate = get_rand_32(); 168 | unsigned int accumulate2 = get_rand_32(); 169 | 170 | long long guess = smlal(accumulate, accumulate2, multiplicand, multiplier).output; 171 | long long actual_acc = (long long) accumulate + ((long long) accumulate2 << 32); 172 | long long actual = (long long) (int) multiplicand * (long long) (int) multiplier + actual_acc; 173 | 174 | if (guess != actual) { 175 | printf("[SMLAL REGRESSION] Failed: %llx * %llx + %llx = %llx, got %llx\n", multiplicand, multiplier, actual_acc, actual, guess); 176 | return 1; 177 | } else { 178 | printf("[SMLAL REGRESSION] Passed: %llx * %llx + %llx = %llx, got %llx\n", multiplicand, multiplier, actual_acc, actual, guess); 179 | } 180 | } 181 | 182 | printf("All tests passed!\n"); 183 | } -------------------------------------------------------------------------------- /impl.h: -------------------------------------------------------------------------------- 1 | #include "bitwise.h" 2 | #include 3 | #include 4 | 5 | // dont define PC_BUILD if you want to use this file in a GBA rom. 6 | #ifdef PC_BUILD 7 | #include 8 | #define printf(...) printf(__VA_ARGS__) 9 | #else 10 | #define printf(...) 11 | #endif 12 | 13 | // realistically this can only be a 3-bit value. 
14 | typedef u8 BoothChunk; 15 | 16 | struct BoothRecodingOutput { 17 | u64 recoded_output; 18 | bool carry; 19 | }; 20 | 21 | struct RecodedMultiplicands { 22 | struct BoothRecodingOutput m[4]; 23 | }; 24 | 25 | struct BoothRecodingOutput booth_recode(u64 input, BoothChunk booth_chunk) { 26 | struct BoothRecodingOutput output; 27 | switch (booth_chunk) { 28 | case 0: output = (struct BoothRecodingOutput) { 0, 0 }; break; 29 | case 1: output = (struct BoothRecodingOutput) { input, 0 }; break; 30 | case 2: output = (struct BoothRecodingOutput) { input, 0 }; break; 31 | case 3: output = (struct BoothRecodingOutput) { 2 * input, 0 }; break; 32 | case 4: output = (struct BoothRecodingOutput) { ~(2 * input), 1 }; break; 33 | case 5: output = (struct BoothRecodingOutput) { ~input, 1 }; break; 34 | case 6: output = (struct BoothRecodingOutput) { ~input, 1 }; break; 35 | case 7: output = (struct BoothRecodingOutput) { 0, 0 }; break; 36 | } 37 | 38 | output.recoded_output &= 0x3FFFFFFFFULL; 39 | return output; 40 | } 41 | 42 | struct CSAOutput { 43 | u64 output; 44 | u64 carry; 45 | }; 46 | 47 | struct CSAOutput perform_csa(u64 a, u64 b, u64 c) { 48 | u64 output = a ^ b ^ c; 49 | u64 carry = (a & b) | (b & c) | (c & a); 50 | return (struct CSAOutput) { output, carry }; 51 | } 52 | 53 | // contains the current high 31 bits of the acc. this is shifted by 2 after each CSA. 
54 | u64 acc_shift_register = 0; 55 | 56 | struct CSAOutput perform_csa_array(u64 partial_sum, u64 partial_carry, struct RecodedMultiplicands addends) { 57 | struct CSAOutput csa_output = { partial_sum, partial_carry }; 58 | struct CSAOutput final_csa_output = { 0, 0 }; 59 | 60 | for (int i = 0; i < 4; i++) { 61 | csa_output.output &= 0x1FFFFFFFFULL; 62 | csa_output.carry &= 0x1FFFFFFFFULL; 63 | 64 | struct CSAOutput result = perform_csa(csa_output.output, addends.m[i].recoded_output & 0x1FFFFFFFFULL, csa_output.carry); 65 | 66 | // Inject the carry caused by booth recoding 67 | result.carry <<= 1; 68 | result.carry |= addends.m[i].carry; 69 | 70 | // Take the bottom two bits and inject them into the final output. 71 | // The value of the bottom two bits will not be changed by future 72 | // addends, because those addends must be at least 4 times as big 73 | // as the current addend. By directly injecting these two bits, the 74 | // hardware saves some space on the chip. 75 | final_csa_output.output |= (result.output & 3) << (2 * i); 76 | final_csa_output.carry |= (result.carry & 3) << (2 * i); 77 | 78 | // The next CSA will only operate on the upper bits - as explained 79 | // in the previous comment. 80 | result.output >>= 2; 81 | result.carry >>= 2; 82 | 83 | // Perform the magic described in the tables for the handling of TransH 84 | // and High. acc_shift_register contains the upper 31 bits of the acc 85 | // in its lower bits. 
86 | u64 magic = bit(acc_shift_register, 0) + !bit(csa_output.carry, 32) + !bit(addends.m[i].recoded_output, 33); 87 | result.output |= magic << 31; 88 | result.carry |= (u64) !bit(acc_shift_register, 1) << 32; 89 | acc_shift_register >>= 2; 90 | 91 | csa_output = result; 92 | } 93 | 94 | final_csa_output.output |= csa_output.output << 8; 95 | final_csa_output.carry |= csa_output.carry << 8; 96 | 97 | return final_csa_output; 98 | } 99 | 100 | struct RecodedMultiplicands get_recoded_multiplicands(u64 multiplicand, u64 multiplier) { 101 | struct RecodedMultiplicands recoded_multiplicands; 102 | 103 | for (int i = 0; i < 4; i++) { 104 | recoded_multiplicands.m[i] = booth_recode(multiplicand, (multiplier >> (2 * i)) & 0b111); 105 | } 106 | 107 | return recoded_multiplicands; 108 | } 109 | 110 | struct CSAOutput perform_one_cycle_of_booths_mutliplication(struct CSAOutput previous_output, u64 multiplicand, u64 multiplier) { 111 | struct RecodedMultiplicands recoded_multiplicands = get_recoded_multiplicands(multiplicand, multiplier); 112 | return perform_csa_array(previous_output.output, previous_output.carry, recoded_multiplicands); 113 | } 114 | 115 | enum MultiplicationFlavor { 116 | SHORT, 117 | LONG_SIGNED, 118 | LONG_UNSIGNED, 119 | }; 120 | 121 | bool is_long(enum MultiplicationFlavor flavor) { 122 | return flavor == LONG_SIGNED || flavor == LONG_UNSIGNED; 123 | } 124 | 125 | bool is_signed(enum MultiplicationFlavor flavor) { 126 | return flavor == LONG_SIGNED || flavor == SHORT; 127 | } 128 | 129 | bool should_terminate(u64 multiplier, enum MultiplicationFlavor flavor) { 130 | if (is_signed(flavor)) { 131 | return multiplier == 0x1FFFFFFFF || multiplier == 0; 132 | } else { 133 | return multiplier == 0; 134 | } 135 | } 136 | 137 | struct AdderOutput { 138 | u32 output; 139 | bool carry; 140 | }; 141 | 142 | struct AdderOutput adder(u32 a, u32 b, bool carry) { 143 | u32 output = a + b + carry; 144 | u64 real_output = (u64) a + (u64) b + (u64) carry; 145 | return 
(struct AdderOutput) { output, output != real_output }; 146 | } 147 | 148 | struct MultiplicationOutput { 149 | u64 output; 150 | bool carry; 151 | }; 152 | 153 | struct u128 { 154 | u64 lo; 155 | u64 hi; 156 | }; 157 | 158 | struct u128 u128_ror(struct u128 input, int shift) { 159 | return (struct u128) { 160 | (input.lo >> shift) | (input.hi << (64 - shift)), 161 | (input.hi >> shift) | (input.lo << (64 - shift)), 162 | }; 163 | } 164 | 165 | struct MultiplicationOutput booths_multiplication(enum MultiplicationFlavor flavor, u64 multiplicand, u64 multiplier, u64 accumulator) { 166 | struct CSAOutput csa_output = { 0, 0 }; 167 | 168 | bool alu_carry_in = multiplier & 1; 169 | 170 | if (is_signed(flavor)) { 171 | multiplier = sign_extend(multiplier, 32, 34); 172 | } else { 173 | multiplier = multiplier & 0x1FFFFFFFFull; 174 | } 175 | 176 | if (is_signed(flavor)) { 177 | multiplicand = sign_extend(multiplicand, 32, 34); 178 | } else { 179 | multiplicand = multiplicand & 0x1FFFFFFFFull; 180 | } 181 | 182 | csa_output.carry = (multiplier & 1) ? 
~(multiplicand) : 0; 183 | csa_output.output = accumulator; 184 | acc_shift_register = accumulator >> 34; 185 | 186 | struct u128 partial_sum = { 0, 0 }; 187 | struct u128 partial_carry = { 0, 0 }; 188 | partial_sum.lo = csa_output.output & 1; 189 | partial_carry.lo = csa_output.carry & 1; 190 | 191 | csa_output.output >>= 1; 192 | csa_output.carry >>= 1; 193 | partial_sum = u128_ror(partial_sum, 1); 194 | partial_carry = u128_ror(partial_carry, 1); 195 | 196 | int num_iterations = 0; 197 | do { 198 | csa_output = perform_one_cycle_of_booths_mutliplication(csa_output, multiplicand, multiplier); 199 | 200 | partial_sum.lo |= csa_output.output & 0xFF; 201 | partial_carry.lo |= csa_output.carry & 0xFF; 202 | 203 | csa_output.output >>= 8; 204 | csa_output.carry >>= 8; 205 | 206 | partial_sum = u128_ror(partial_sum, 8); 207 | partial_carry = u128_ror(partial_carry, 8); 208 | 209 | multiplier = asr(multiplier, 8, 33); 210 | num_iterations++; 211 | } while (!should_terminate(multiplier, flavor)); 212 | partial_sum.lo |= csa_output.output; 213 | partial_carry.lo |= csa_output.carry; 214 | 215 | // we have ror'd partial_sum and partial_carry by 8 * num_iterations + 1 216 | // we now need to ror backwards, i tried my best to mimic the table, but 217 | // i'm off by one for whatever reason. 
218 | int correction_ror; 219 | if (num_iterations == 1) correction_ror = 23; 220 | if (num_iterations == 2) correction_ror = 15; 221 | if (num_iterations == 3) correction_ror = 7; 222 | if (num_iterations == 4) correction_ror = 31; 223 | 224 | partial_sum = u128_ror(partial_sum, correction_ror); 225 | partial_carry = u128_ror(partial_carry, correction_ror); 226 | 227 | if (is_long(flavor)) { 228 | if (num_iterations == 4) { 229 | struct AdderOutput adder_output_lo = 230 | adder(partial_sum.hi, partial_carry.hi, alu_carry_in); 231 | struct AdderOutput adder_output_hi = 232 | adder(partial_sum.hi >> 32, partial_carry.hi >> 32, 233 | adder_output_lo.carry); 234 | 235 | return (struct MultiplicationOutput) { 236 | ((u64) adder_output_hi.output << 32) | adder_output_lo.output, 237 | (partial_carry.hi >> 63) & 1 238 | }; 239 | } else { 240 | struct AdderOutput adder_output_lo = 241 | adder(partial_sum.hi >> 32, partial_carry.hi >> 32, alu_carry_in); 242 | 243 | int shift_amount = 1 + 8 * num_iterations; 244 | 245 | // why this is needed is unknown, but the multiplication doesn't work 246 | // without it 247 | shift_amount++; 248 | 249 | partial_carry.lo = sign_extend(partial_carry.lo, shift_amount, 64); 250 | partial_sum.lo |= acc_shift_register << (shift_amount); 251 | 252 | struct AdderOutput adder_output_hi = 253 | adder(partial_sum.lo, partial_carry.lo, adder_output_lo.carry); 254 | return (struct MultiplicationOutput) { 255 | ((u64) adder_output_hi.output << 32) | adder_output_lo.output, 256 | (partial_carry.hi >> 63) & 1 257 | }; 258 | } 259 | } else { 260 | if (num_iterations == 4) { 261 | struct AdderOutput adder_output = 262 | adder(partial_sum.hi, partial_carry.hi, alu_carry_in); 263 | return (struct MultiplicationOutput) { 264 | adder_output.output, 265 | (partial_carry.hi >> 31) & 1 266 | }; 267 | } else { 268 | struct AdderOutput adder_output = 269 | adder(partial_sum.hi >> 32, partial_carry.hi >> 32, alu_carry_in); 270 | return (struct 
MultiplicationOutput) { 271 | adder_output.output, 272 | (partial_carry.hi >> 63) & 1 273 | }; 274 | } 275 | } 276 | } 277 | 278 | struct MultiplicationOutput mul(u32 rm, u32 rs) { 279 | return booths_multiplication(SHORT, rm, rs, 0); 280 | } 281 | 282 | struct MultiplicationOutput mla(u32 rm, u32 rs, u32 rn) { 283 | return booths_multiplication(SHORT, rm, rs, rn); 284 | } 285 | 286 | struct MultiplicationOutput umull(u32 rm, u32 rs) { 287 | return booths_multiplication(LONG_UNSIGNED, rm, rs, 0); 288 | } 289 | 290 | struct MultiplicationOutput umlal(u32 rdlo, u32 rdhi, u32 rm, u32 rs) { 291 | return booths_multiplication(LONG_UNSIGNED, rm, rs, ((u64) rdhi << 32) | (u64) rdlo); 292 | } 293 | 294 | struct MultiplicationOutput smull(u32 rm, u32 rs) { 295 | return booths_multiplication(LONG_SIGNED, rm, rs, 0); 296 | } 297 | 298 | struct MultiplicationOutput smlal(u32 rdlo, u32 rdhi, u32 rm, u32 rs) { 299 | return booths_multiplication(LONG_SIGNED, rm, rs, (u64) rdhi << 32 | (u64) rdlo); 300 | } --------------------------------------------------------------------------------