├── LICENSE.md ├── Makefile ├── README.md ├── array.c ├── array.h ├── eval.c ├── eval.h ├── float32.c ├── float32.h ├── float64.c ├── float64.h ├── kernel32.c ├── kernel32.h ├── main.c ├── real32.c ├── real32.h ├── soft.c ├── soft.h ├── types.h ├── uint128.c └── uint128.h /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Dale Weiler 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SRCS := $(wildcard *.c) 2 | OBJS := $(SRCS:.c=.o) 3 | 4 | CFLAGS := -Wall 5 | CFLAGS += -Wextra 6 | CFLAGS += -O2 7 | CFLAGS += -g 8 | 9 | all: fpinspect 10 | 11 | fpinspect: $(OBJS) 12 | $(CC) -o $@ $^ $(CFLAGS) 13 | 14 | clean: 15 | rm -f $(OBJS) fpinspect 16 | 17 | .PHONY: clean -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Floating point expression inspector 2 | 3 | The following tool lets you inspect the computational flow of a floating point 4 | expression, seeing where rounding occurs, when exceptions are triggered, 5 | when precision may be lost, when special values propagate, when error 6 | accumulates, and other floating point headaches. 7 | 8 | # Example 9 | ``` 10 | [fpinspect]# ./fpinspect "sqrt(45.0*e+phi)/pi" 11 | Exception: 0 (1 roundings) INEXACT (45.000000 * e) 12 | Trace (1 operations) MUL 13 | 14 | Exception: 0 (1 roundings) INEXACT phi 15 | Trace (1 operations) MUL 16 | 17 | Exception: 0 (2 roundings) INEXACT ((45.000000 * e) + phi) 18 | Exception: 1 (2 roundings) INEXACT ((45.000000 * e) + phi) 19 | Trace (2 operations) MUL ADD 20 | 21 | Exception: 0 (3 roundings) INEXACT sqrt(((45.000000 * e) + phi)) 22 | Exception: 1 (3 roundings) INEXACT sqrt(((45.000000 * e) + phi)) 23 | Exception: 2 (3 roundings) INEXACT sqrt(((45.000000 * e) + phi)) 24 | Trace (3 operations) MUL ADD ADD 25 | 26 | Exception: 0 (3 roundings) INEXACT pi 27 | Exception: 1 (3 roundings) INEXACT pi 28 | Exception: 2 (3 roundings) INEXACT pi 29 | Trace (3 operations) MUL ADD ADD 30 | 31 | Exception: 0 (4 roundings) INEXACT (sqrt(((45.000000 * e) + phi)) / pi) 32 | Exception: 1 (4 roundings) INEXACT (sqrt(((45.000000 * e) + phi)) / pi) 33 | Exception: 2 (4 roundings) 
INEXACT (sqrt(((45.000000 * e) + phi)) / pi) 34 | Exception: 3 (4 roundings) INEXACT (sqrt(((45.000000 * e) + phi)) / pi) 35 | Trace (4 operations) MUL ADD ADD DIV 36 | 37 | (sqrt(((45.000000 * e) + phi)) / pi) 38 | ans: 3.54370117187500 39 | err: 0.00000126456894 40 | ``` 41 | 42 | As you can see, the expression `sqrt(45.0*e+phi)/pi` produces a lot of output. 43 | Each empty-line-separated region is a subexpression which triggered an exception. 44 | In this case, because `45 * e` is an inexact value, the inexact exception is 45 | presented first. Here you can see that such an expression involved `1 operations` 46 | in total, and in this case the operation is just a `MUL`. We can also see that the 47 | resulting expression, because it's inexact, incurred one rounding. 48 | 49 | Following down the exception list, we can see that the exception propagated 50 | to `phi` in a `MUL` (which is also an inexact value), and continued, with each 51 | new inexact subexpression resulting in several roundings. Since kernels like 52 | `sqrt` might themselves use operations like `add`, we also see the final group 53 | of exceptions contains an additional `ADD` in its trace. 54 | 55 | The final result of the expression is given in `ans:` and below that you will 56 | find the accumulative error `err:` of evaluating that expression. In this case 57 | this function is exact to five mantissa digits of precision, out of a total of 58 | seven, which means this expression has ~0.71 ULP of error. 59 | 60 | # Documentation 61 | Run the program with no expression or `-h` to see the options. 62 | 63 | Here are some constants and functions available for use in expressions. 
64 | ### Constants 65 | * e 66 | * pi 67 | * phi 68 | 69 | ### Functions 70 | * floor 71 | * ceil 72 | * trunc 73 | * sqrt 74 | * abs 75 | * min 76 | * max 77 | * copysign 78 | 79 | # How it works 80 | This program implements IEEE-754 floating point completely in software, emulating 81 | all rounding modes, exceptions, and tininess detection methods which can be 82 | configured when evaluating an expression. With the exception of transcendental 83 | functions, all floating point computation is also accurate to <= 1 ULP of error. 84 | 85 | Currently there is support for 32-bit single-precision floating-point 86 | `float32.{h,c}` and 64-bit double-precision floating-point `float64.{h,c}`, as 87 | double-precision is necessary for 32-bit single-precision kernels 88 | `kernel32.{h,c}` to produce correctly rounded and truncated results 89 | to <= 1 ULP of error. 90 | 91 | 64-bit double-precision floating-point makes use of 128-bit modular arithmetic 92 | implemented in `uint128.{h,c}`. 93 | 94 | Accumulative error accounting is handled by `real32.{h,c}` and `real64.{h,c}` 95 | for single-precision and double-precision floating-point, respectively. 96 | 97 | > NOTE: 98 | > 99 | > There are currently no 64-bit kernels, as that would require either 80-bit 100 | extended-precision floating-point, or 128-bit quadruple-precision floating-point 101 | to be implemented in software to have the precision necessary to produce 102 | correctly rounded and truncated results to <= 1 ULP of error. -------------------------------------------------------------------------------- /array.c: -------------------------------------------------------------------------------- 1 | #include // malloc, realloc, free. 
2 | 3 | #include "array.h" 4 | 5 | Bool array_grow(void **array, Size expand, Size type_size) { 6 | Array *meta = array_meta(*array); 7 | Size count = 0; 8 | void *data = NULL; 9 | 10 | if (*array) { 11 | count = 2 * meta->capacity + expand; 12 | data = realloc(meta, type_size * count + sizeof *meta); 13 | if (!data) { 14 | return false; 15 | } 16 | } else { 17 | count = expand + 1; 18 | data = malloc(type_size * count + sizeof *meta); 19 | if (!data) { 20 | return false; 21 | } 22 | ((Array*)data)->size = 0; 23 | } 24 | 25 | meta = (Array*)data; 26 | meta->capacity = count; 27 | 28 | *array = meta + 1; 29 | 30 | return true; 31 | } 32 | 33 | void array_delete(void *array) { 34 | free(array_meta(array)); 35 | } -------------------------------------------------------------------------------- /array.h: -------------------------------------------------------------------------------- 1 | #ifndef ARRAY_H 2 | #define ARRAY_H 3 | #include "types.h" 4 | 5 | typedef struct Array Array; 6 | 7 | struct Array { 8 | Size size; 9 | Size capacity; 10 | }; 11 | 12 | #define ARRAY(T) T* 13 | 14 | #define array_meta(array) \ 15 | ((Array*)(((Uint8*)(array)) - sizeof(Array))) 16 | 17 | // grow [array] by [expand] elements. 18 | #define array_try_grow(array, expand) \ 19 | (((!(array) || array_meta(array)->size + (expand) >= array_meta(array)->capacity)) \ 20 | ? array_grow(((void **)&(array)), (expand), sizeof(*(array))) \ 21 | : true) 22 | 23 | // push [value] into [array] 24 | #define array_push(array, value) \ 25 | (array_try_grow((array), 1) \ 26 | ? ((array)[array_meta(array)->size++] = (value), true) \ 27 | : false) 28 | 29 | // free [array] 30 | #define array_free(array) \ 31 | ((void)((array) ? (array_delete((void*)(array)), (array) = 0) : 0)) 32 | 33 | // size of [array] 34 | #define array_size(array) \ 35 | ((array) ? 
array_meta(array)->size : 0) 36 | 37 | Bool array_grow(void**, Size, Size); 38 | void array_delete(void*); 39 | 40 | #endif // ARRAY_H -------------------------------------------------------------------------------- /eval.c: -------------------------------------------------------------------------------- 1 | #include // calloc, free 2 | #include // strchr 3 | #include // fprintf, stderr 4 | 5 | #include "eval.h" 6 | 7 | typedef struct Parser Parser; 8 | typedef struct Expression Expression; 9 | 10 | struct Expression { 11 | enum { 12 | EXPR_VALUE, 13 | EXPR_CONST, 14 | EXPR_FUNC1, EXPR_FUNC2, 15 | EXPR_EQ, EXPR_LTE, EXPR_LT, 16 | EXPR_NE, EXPR_GTE, EXPR_GT, 17 | EXPR_ADD, EXPR_SUB, EXPR_MUL, EXPR_DIV, 18 | EXPR_LAST 19 | } type; 20 | Real32 value; 21 | union { 22 | Size constant; 23 | enum { 24 | // EXPR_FUNC1 25 | FUNC_FLOOR, 26 | FUNC_CEIL, 27 | FUNC_TRUNC, 28 | FUNC_SQRT, 29 | FUNC_ABS, 30 | // EXPR_FUNC2 31 | FUNC_MIN, 32 | FUNC_MAX, 33 | FUNC_COPYSIGN 34 | } func; 35 | }; 36 | Expression* params[2]; 37 | }; 38 | 39 | static const struct { 40 | const char *identifier; 41 | const Real32 value; 42 | } CONSTANTS[] = { 43 | { "e", {{LIT32(0x402df854)}, {0}} }, 44 | { "pi", {{LIT32(0x40490fdb)}, {0}} }, 45 | { "phi", {{LIT32(0x3fcf1bbd)}, {0}} }, 46 | { "fmin", {{LIT32(0x00800000)}, {0}} }, // FLT_MIN 47 | { "fmax", {{LIT32(0x7f7fffff)}, {0}} }, // FLT_MAX 48 | }; 49 | 50 | static const struct { 51 | const char *match; 52 | Uint32 func; 53 | } FUNCS1[] = { 54 | { "floor", FUNC_FLOOR }, 55 | { "ceil", FUNC_CEIL }, 56 | { "trunc", FUNC_TRUNC }, 57 | { "sqrt", FUNC_SQRT }, 58 | { "abs", FUNC_ABS } 59 | }; 60 | 61 | static const struct { 62 | const char *match; 63 | Uint32 func; 64 | } FUNCS2[] = { 65 | { "min", FUNC_MIN }, 66 | { "max", FUNC_MAX }, 67 | { "copysign", FUNC_COPYSIGN } 68 | }; 69 | 70 | #define ARRAY_COUNT(x) \ 71 | (sizeof (x) / sizeof (*(x))) 72 | 73 | static const char* func1_name(Uint32 func) { 74 | for (Size i = 0; i < ARRAY_COUNT(FUNCS1); i++) { 75 
| if (FUNCS1[i].func == func) { 76 | return FUNCS1[i].match; 77 | } 78 | } 79 | return NULL; 80 | } 81 | 82 | static const char *func2_name(Uint32 func) { 83 | for (Size i = 0; i < ARRAY_COUNT(FUNCS2); i++) { 84 | if (FUNCS2[i].func == func) { 85 | return FUNCS2[i].match; 86 | } 87 | } 88 | return NULL; 89 | } 90 | 91 | // This is cheating for now until we implement an accurate strtof, strtod, etc. 92 | static Real32 real32_from_string(const char *string, char **next) { 93 | union { float f; Float32 s; } u = {strtof(string, next)}; 94 | return (Real32){u.s, {0}}; 95 | } 96 | 97 | static Bool is_identifier(int ch) { 98 | return ((unsigned)ch - '0' <= 9u) 99 | || ((unsigned)ch - 'a' <= 25u) 100 | || ((unsigned)ch - 'A' <= 25u) 101 | || ch == '_'; 102 | } 103 | 104 | static bool match(const char *s, const char *prefix) { 105 | Size i = 0; 106 | for (; prefix[i]; i++) { 107 | if (prefix[i] != s[i]) { 108 | return false; 109 | } 110 | } 111 | return !is_identifier(s[i]); // Should be terminated identifier. 
112 | } 113 | 114 | struct Parser { 115 | Sint32 level; 116 | char *s; 117 | }; 118 | 119 | #define ALU(fp, op) \ 120 | fprintf(fp, "("); \ 121 | expr_print(fp, expression->params[0]); \ 122 | fprintf(fp, " %s ", op); \ 123 | expr_print(fp, expression->params[1]); \ 124 | fprintf(fp, ")"); \ 125 | break 126 | 127 | void expr_print(FILE *fp, Expression *expression) { 128 | switch (expression->type) { 129 | case EXPR_VALUE: 130 | fprintf(fp, "%f", float32_cast(expression->value.value)); 131 | break; 132 | case EXPR_CONST: 133 | fprintf(fp, "%s", CONSTANTS[expression->constant].identifier); 134 | break; 135 | case EXPR_FUNC1: 136 | fprintf(fp, "%s(", func1_name(expression->func)); 137 | expr_print(fp, expression->params[0]); 138 | fprintf(fp, ")"); 139 | break; 140 | case EXPR_FUNC2: 141 | fprintf(fp, "%s(", func2_name(expression->func)); 142 | expr_print(fp, expression->params[0]); 143 | fprintf(fp, ", "); 144 | expr_print(fp, expression->params[1]); 145 | fprintf(fp, ")"); 146 | break; 147 | case EXPR_ADD: ALU(fp, "+"); 148 | case EXPR_SUB: ALU(fp, "-"); 149 | case EXPR_MUL: ALU(fp, "*"); 150 | case EXPR_DIV: ALU(fp, "/"); 151 | default: 152 | break; 153 | } 154 | } 155 | 156 | static Real32 eval_func1_32(Context *ctx, Uint32 func, Real32 a) { 157 | switch (func) { 158 | case FUNC_FLOOR: 159 | return real32_floor(ctx, a); 160 | case FUNC_CEIL: 161 | return real32_ceil(ctx, a); 162 | case FUNC_TRUNC: 163 | return real32_trunc(ctx, a); 164 | case FUNC_SQRT: 165 | return real32_sqrt(ctx, a); 166 | case FUNC_ABS: 167 | return real32_abs(ctx, a); 168 | } 169 | return (Real32){FLOAT32_ZERO, {0}}; 170 | } 171 | 172 | static Real32 eval_func2_32(Context *ctx, Uint32 func, Real32 a, Real32 b) { 173 | switch (func) { 174 | case FUNC_MIN: 175 | return real32_min(ctx, a, b); 176 | case FUNC_MAX: 177 | return real32_max(ctx, a, b); 178 | case FUNC_COPYSIGN: 179 | return real32_copysign(ctx, a, b); 180 | } 181 | return (Real32){FLOAT32_ZERO, {0}}; 182 | } 183 | 184 | Real32 
expr_eval32(Context *ctx, Expression *expression) { 185 | if (!expression) { 186 | return REAL32_ZERO; 187 | } 188 | 189 | Real32 a = expr_eval32(ctx, expression->params[0]); 190 | Real32 b = expr_eval32(ctx, expression->params[1]); 191 | 192 | Real32 result = REAL32_ZERO; 193 | 194 | switch (expression->type) { 195 | /****/ case EXPR_VALUE: result = expression->value; 196 | break; case EXPR_CONST: result = CONSTANTS[expression->constant].value; 197 | break; case EXPR_FUNC1: result = eval_func1_32(ctx, expression->func, a); 198 | break; case EXPR_FUNC2: result = eval_func2_32(ctx, expression->func, a, b); 199 | break; case EXPR_EQ: result = real32_eq(ctx, a, b); 200 | break; case EXPR_LTE: result = real32_lte(ctx, a, b); 201 | break; case EXPR_LT: result = real32_lt(ctx, a, b); 202 | break; case EXPR_NE: result = real32_ne(ctx, a, b); 203 | break; case EXPR_GTE: result = real32_gte(ctx, a, b); 204 | break; case EXPR_GT: result = real32_gt(ctx, a, b); 205 | break; case EXPR_ADD: result = real32_add(ctx, a, b); 206 | break; case EXPR_SUB: result = real32_sub(ctx, a, b); 207 | break; case EXPR_MUL: result = real32_mul(ctx, a, b); 208 | break; case EXPR_DIV: result = real32_div(ctx, a, b); 209 | break; case EXPR_LAST: // Empty. 210 | break; 211 | } 212 | 213 | Size n_operations = array_size(ctx->operations); 214 | Size n_exceptions = array_size(ctx->exceptions); 215 | 216 | for (Size i = 0; i < n_exceptions; i++) { 217 | Exception exception = ctx->exceptions[i]; 218 | fprintf(stderr, "Exception: %zu (%zu roundings) ", i, ctx->roundings); 219 | Bool flag = false; 220 | if (exception & EXCEPTION_INVALID) { 221 | fprintf(stderr, "%sINVALID", flag ? "|" : ""), flag = true; 222 | } 223 | if (exception & EXCEPTION_INFINITE) { 224 | fprintf(stderr, "%sINFINITE", flag ? "|" : ""), flag = true; 225 | } 226 | if (exception & EXCEPTION_OVERFLOW) { 227 | fprintf(stderr, "%sOVERFLOW", flag ? 
"|" : ""), flag = true; 228 | } 229 | if (exception & EXCEPTION_UNDERFLOW) { 230 | fprintf(stderr, "%sUNDERFLOW", flag ? "|" : ""), flag = true; 231 | } 232 | if (exception & EXCEPTION_INEXACT) { 233 | fprintf(stderr, "%sINEXACT", flag ? "|" : ""), flag = true; 234 | } 235 | fprintf(stderr, " "); 236 | expr_print(stderr, expression); 237 | fprintf(stderr, "\n"); 238 | } 239 | if (n_operations && n_exceptions) { 240 | fprintf(stderr, " Trace (%zu operations) ", n_operations); 241 | Bool hit = false; 242 | for (Size i = 0; i < n_operations; i++) { 243 | Operation operation = ctx->operations[i]; 244 | switch (operation) { 245 | case OPERATION_ADD: fprintf(stderr, "%sADD", hit ? " " : ""), hit = true; break; 246 | case OPERATION_SUB: fprintf(stderr, "%sSUB", hit ? " " : ""), hit = true; break; 247 | case OPERATION_MUL: fprintf(stderr, "%sMUL", hit ? " " : ""), hit = true; break; 248 | case OPERATION_DIV: fprintf(stderr, "%sDIV", hit ? " " : ""), hit = true; break; 249 | } 250 | } 251 | fprintf(stderr, "\n"); 252 | fprintf(stderr, "\n"); 253 | } 254 | 255 | return result; 256 | } 257 | 258 | static Expression *create(int type, Expression *e0, Expression *e1) { 259 | Expression *e = calloc(1, sizeof *e); 260 | if (!e) { 261 | return NULL; 262 | } 263 | e->type = type; 264 | e->value = REAL32_ONE; 265 | e->params[0] = e0; 266 | e->params[1] = e1; 267 | return e; 268 | } 269 | 270 | static Bool parse_expr(Expression **e, Parser *p); 271 | static Bool parse_primary(Expression **e, Parser *p, Flag sign) { 272 | Expression *d = calloc(1, sizeof *d); 273 | if (!d) { 274 | return false; 275 | } 276 | 277 | char *next = p->s; 278 | char *s0 = p->s; 279 | d->value = real32_from_string(sign ? 
p->s - 1 : p->s, &next); 280 | if (next != p->s) { 281 | d->type = EXPR_VALUE; 282 | p->s = next; 283 | *e = d; 284 | return true; 285 | } 286 | 287 | d->value = REAL32_ONE; 288 | 289 | for (Size i = 0; i < sizeof CONSTANTS / sizeof *CONSTANTS; i++) { 290 | if (!match(p->s, CONSTANTS[i].identifier)) { 291 | continue; 292 | } 293 | p->s += strlen(CONSTANTS[i].identifier); 294 | d->type = EXPR_CONST; 295 | d->constant = i; 296 | *e = d; 297 | return true; 298 | } 299 | 300 | p->s = strchr(p->s, '('); 301 | if (!p->s) { 302 | fprintf(stderr, "Undefined constant or missing '(' in '%s'\n", s0); 303 | p->s = next; 304 | expr_free(d); 305 | return false; 306 | } 307 | 308 | p->s++; // '(' 309 | if (*next == '(') { 310 | expr_free(d); 311 | if (!parse_expr(&d, p)) { 312 | return false; 313 | } 314 | if (*p->s != ')') { 315 | fprintf(stderr, "Missing ')' in '%s'\n", s0); 316 | expr_free(d); 317 | return false; 318 | } 319 | p->s++; // ')' 320 | *e = d; 321 | return true; 322 | } 323 | if (!parse_expr(&d->params[0], p)) { 324 | expr_free(d); 325 | return false; 326 | } 327 | if (*p->s == ',') { 328 | p->s++; // ',' 329 | parse_expr(&d->params[1], p); // ignore? 
330 | } 331 | if (*p->s != ')') { 332 | fprintf(stderr, "Missing ')' or too many arguments in '%s'\n", s0); 333 | expr_free(d); 334 | return false; 335 | } 336 | p->s++; // ')' 337 | 338 | for (Size i = 0; i < ARRAY_COUNT(FUNCS1); i++) { 339 | if (match(next, FUNCS1[i].match)) { 340 | d->type = EXPR_FUNC1; 341 | d->func = FUNCS1[i].func; 342 | *e = d; 343 | return true; 344 | } 345 | } 346 | 347 | for (Size i = 0; i < ARRAY_COUNT(FUNCS2); i++) { 348 | if (match(next, FUNCS2[i].match)) { 349 | d->type = EXPR_FUNC2; 350 | d->func = FUNCS2[i].func; 351 | *e = d; 352 | return true; 353 | } 354 | } 355 | 356 | fprintf(stderr, "Unknown identifier '%s'", s0); 357 | expr_free(d); 358 | 359 | return true; 360 | } 361 | 362 | static Bool parse_top(Expression **e, Parser *p) { 363 | Flag sign = false; 364 | if (*p->s == '+') p->s++; // skip unary '+' 365 | else if (*p->s == '-') p->s++, sign = true; // skip unary '-' 366 | return parse_primary(e, p, sign); 367 | } 368 | 369 | static Bool parse_factor(Expression **e, Parser *p) { 370 | Expression *e0; 371 | if (!parse_top(&e0, p)) { 372 | return false; 373 | } 374 | // TODO(dweiler): Handle other operations here. 375 | *e = e0; 376 | return true; 377 | } 378 | 379 | static Bool parse_term(Expression **e, Parser *p) { 380 | Expression *e0, *e1, *e2; 381 | if (!parse_factor(&e0, p)) { 382 | return false; 383 | } 384 | while (*p->s == '*' || *p->s == '/') { 385 | int ch = *p->s++; 386 | e1 = e0; 387 | if (!parse_factor(&e2, p)) { 388 | expr_free(e1); 389 | return false; 390 | } 391 | e0 = create(ch == '*' ? 
EXPR_MUL : EXPR_DIV, e1, e2); 392 | if (!e0) { 393 | expr_free(e1); 394 | expr_free(e2); 395 | return false; 396 | } 397 | } 398 | *e = e0; 399 | return true; 400 | } 401 | 402 | static Bool parse_subexpr(Expression **e, Parser *p) { 403 | Expression *e0, *e1, *e2; 404 | if (!parse_term(&e0, p)) { 405 | return false; 406 | } 407 | while (*p->s == '+' || *p->s == '-') { 408 | int ch = *p->s++; 409 | e1 = e0; 410 | if (!parse_term(&e2, p)) { 411 | expr_free(e1); 412 | return false; 413 | } 414 | e0 = create(ch == '+' ? EXPR_ADD : EXPR_SUB, e1, e2); 415 | if (!e0) { 416 | expr_free(e1); 417 | expr_free(e2); 418 | return false; 419 | } 420 | } 421 | *e = e0; 422 | return true; 423 | } 424 | 425 | static Bool parse_expr(Expression **e, Parser *p) { 426 | Expression *e0, *e1, *e2; 427 | if (!parse_subexpr(&e0, p)) { 428 | return false; 429 | } 430 | while (*p->s == ';') { 431 | p->s++; 432 | e1 = e0; 433 | if (!parse_subexpr(&e2, p)) { 434 | expr_free(e1); 435 | return false; 436 | } 437 | e0 = create(EXPR_LAST, e1, e2); 438 | if (!e0) { 439 | expr_free(e1); 440 | expr_free(e2); 441 | return false; 442 | } 443 | } 444 | *e = e0; 445 | return true; 446 | } 447 | 448 | static Bool parse_verify(Expression *expression) { 449 | if (!expression) { 450 | return false; 451 | } 452 | switch (expression->type) { 453 | case EXPR_VALUE: // fallthrough 454 | case EXPR_CONST: 455 | return true; 456 | case EXPR_FUNC1: 457 | return parse_verify(expression->params[0]) && !expression->params[1]; 458 | default: 459 | return parse_verify(expression->params[0]) && parse_verify(expression->params[1]); 460 | } 461 | } 462 | 463 | Bool expr_parse(Expression **expression, const char *string) { 464 | Parser p = { 0 }; 465 | char *w = malloc(strlen(string) + 1); 466 | char *wp = w; 467 | const char *s0 = string; 468 | 469 | if (!w) { 470 | return false; 471 | } 472 | 473 | while (*string) { 474 | if (*string != ' ') { 475 | *wp++ = *string; 476 | } 477 | string++; 478 | } 479 | *wp++ = '\0'; 480 | 
481 | p.s = w; 482 | 483 | Expression *e = NULL; 484 | if (!parse_expr(&e, &p)) { 485 | free(w); 486 | return false; 487 | } 488 | 489 | if (*p.s) { 490 | expr_free(e); 491 | free(w); 492 | fprintf(stderr, "Unexpected end of expression '%s'\n", s0); 493 | return false; 494 | } 495 | 496 | if (!parse_verify(e)) { 497 | expr_free(e); 498 | free(w); 499 | return false; 500 | } 501 | 502 | free(w); 503 | 504 | *expression = e; 505 | return true; 506 | } 507 | 508 | void expr_free(Expression *expression) { 509 | if (expression) { 510 | expr_free(expression->params[0]); 511 | expr_free(expression->params[1]); 512 | free(expression); 513 | } 514 | } -------------------------------------------------------------------------------- /eval.h: -------------------------------------------------------------------------------- 1 | #ifndef EVAL_H 2 | #define EVAL_H 3 | #include "real32.h" 4 | 5 | typedef struct Expression Expression; 6 | 7 | Bool expr_parse(Expression**, const char*); 8 | Real32 expr_eval32(Context*, Expression*); 9 | void expr_free(Expression*); 10 | void expr_print(FILE*, Expression*); 11 | 12 | #endif // EVAL_H -------------------------------------------------------------------------------- /float32.c: -------------------------------------------------------------------------------- 1 | #include "float32.h" 2 | 3 | // Count leading zero bits. 4 | static inline Sint8 count_leading_zeros_u32(Uint32 a) { 5 | return a == 0 ? 32 : __builtin_clz(a); 6 | } 7 | 8 | // Take two single-precision float values, one which must be NaN, and produce 9 | // the correct NaN result, taking care to raise an invalid exception when either 10 | // is a signaling NaN. 
11 | static Float32 float32_propagate_nan(Context *ctx, Float32 a, Float32 b) { 12 | const Flag a_is_nan = float32_is_nan(a); 13 | const Flag a_is_snan = float32_is_snan(a); 14 | const Flag b_is_nan = float32_is_nan(b); 15 | const Flag b_is_snan = float32_is_snan(b); 16 | a.bits |= LIT32(0x00400000); 17 | b.bits |= LIT32(0x00400000); 18 | if (a_is_snan | b_is_snan) { 19 | context_raise(ctx, EXCEPTION_INVALID); 20 | } 21 | if (a_is_nan) { 22 | return (a_is_snan & b_is_nan) ? b : a; 23 | } 24 | return b; 25 | } 26 | 27 | CanonicalNaN float32_to_canonical_nan(Context* ctx, Float32 a) { 28 | if (float32_is_snan(a)) { 29 | context_raise(ctx, EXCEPTION_INVALID); 30 | } 31 | CanonicalNaN nan; 32 | nan.sign = a.bits >> 31; 33 | nan.lo = 0; 34 | nan.hi = (Uint64)a.bits << 41; 35 | return nan; 36 | } 37 | 38 | Float32 float32_round_and_pack(Context *ctx, Flag sign, Sint32 exp, Uint32 sig) { 39 | const Round rounding_mode = ctx->round; 40 | const Flag round_nearest_even = rounding_mode == ROUND_NEAREST_EVEN; 41 | Sint8 round_increment = 0x40; 42 | if (!round_nearest_even) { 43 | if (rounding_mode == ROUND_TO_ZERO) { 44 | round_increment = 0; 45 | } else { 46 | round_increment = 0x7f; 47 | if (sign) { 48 | if (rounding_mode == ROUND_UP) { 49 | round_increment = 0; 50 | } 51 | } else { 52 | if (rounding_mode == ROUND_DOWN) { 53 | round_increment = 0; 54 | } 55 | } 56 | } 57 | } 58 | 59 | Sint8 round_bits = sig & 0x7f; 60 | 61 | if (round_bits) { 62 | ctx->roundings++; 63 | } 64 | 65 | if (0xfd <= (Uint16)exp) { 66 | if ((0xfd < exp) || ((exp == 0xfd) && ((Sint32)(sig + round_increment) < 0))) { 67 | context_raise(ctx, EXCEPTION_OVERFLOW | EXCEPTION_INEXACT); 68 | const Float32 pack = float32_pack(sign, 0xff, 0); 69 | return (Float32){pack.bits - (round_increment == 0 ? 
// Add the magnitudes of [a] and [b] and give the result the sign [sign].
//
// Called by float32_add when both operands have the same sign and by
// float32_sub when they differ. An exponent of 0xff marks infinity (zero
// fraction) or NaN (non-zero fraction); NaNs go through
// float32_propagate_nan, infinities are returned as-is. Finite results are
// rounded and packed by float32_round_and_pack, except for two
// zero-exponent operands, whose sum is packed directly.
static Float32 float32_add_sig(Context *ctx, Float32 a, Float32 b, Flag sign) {
	Sint16 a_exp = float32_exp(a);
	Sint16 b_exp = float32_exp(b);
	// Fractions are moved up 6 bits, leaving low-order room below them.
	Uint32 a_sig = float32_fract(a) << 6;
	Uint32 b_sig = float32_fract(b) << 6;
	Sint16 exp_diff = a_exp - b_exp;

	Sint16 exp;
	Uint32 sig;
	if (0 < exp_diff) {
		// [a] has the larger exponent.
		if (a_exp == 0xff) {
			// NaN if the fraction is non-zero, otherwise infinity.
			return a_sig ? float32_propagate_nan(ctx, a, b) : a;
		}
		if (b_exp == 0) {
			// Zero exponent field: no implicit bit is added for [b] and the
			// alignment distance is one less.
			exp_diff--;
		} else {
			// Make the implicit leading one of [b] explicit.
			b_sig |= LIT32(0x20000000);
		}
		// Align [b] to [a]'s exponent.
		// NOTE(review): rshr32 is defined outside this view; presumably a
		// right shift that keeps track of shifted-out bits. Confirm in soft.h.
		b_sig = rshr32(b_sig, exp_diff);
		exp = a_exp;
	} else if (exp_diff < 0) {
		// [b] has the larger exponent; mirror image of the case above.
		if (b_exp == 0xff) {
			if (b_sig) {
				return float32_propagate_nan(ctx, a, b);
			}
			// Infinity carrying the requested result sign.
			return float32_pack(sign, 0xff, 0);
		}
		if (a_exp == 0) {
			exp_diff++;
		} else {
			a_sig |= LIT32(0x20000000);
		}
		a_sig = rshr32(a_sig, -exp_diff);
		exp = b_exp;
	} else {
		// Equal exponents: no alignment needed.
		if (a_exp == 0xff) {
			return (a_sig | b_sig) ? float32_propagate_nan(ctx, a, b) : a;
		}
		if (a_exp == 0) {
			// Both exponent fields are zero: add the fractions and pack
			// directly, undoing the 6-bit shift. No rounding call is made.
			return float32_pack(sign, 0, (a_sig + b_sig) >> 6);
		}
		// 0x40000000 is the sum of the two implicit ones (2 * 0x20000000).
		sig = LIT32(0x40000000) + a_sig + b_sig;
		exp = a_exp;
		goto round_and_pack;
	}
	// Unequal exponents: add the implicit one that was not set above, then
	// try the sum shifted up one place with a correspondingly smaller exponent.
	a_sig |= LIT32(0x20000000);
	sig = (a_sig + b_sig) << 1;
	exp--;
	if ((Sint32)sig < 0) {
		// The shifted sum reached bit 31; use the unshifted sum instead and
		// restore the exponent.
		sig = a_sig + b_sig;
		exp++;
	}
round_and_pack:
	return float32_round_and_pack(ctx, sign, exp, sig);
}
165 | Sint16 exp; 166 | Uint32 sig; 167 | if (0 < exp_diff) { 168 | goto a_exp_bigger; 169 | } 170 | if (exp_diff < 0) { 171 | goto b_exp_bigger; 172 | } 173 | if (a_exp == 0xff) { 174 | if (a_sig | b_sig) { 175 | return float32_propagate_nan(ctx, a, b); 176 | } 177 | context_raise(ctx, EXCEPTION_INVALID); 178 | return FLOAT32_NAN; 179 | } 180 | if (a_exp == 0) { 181 | a_exp = 1; 182 | b_exp = 1; 183 | } 184 | if (b_sig < a_sig) { 185 | goto a_bigger; 186 | } 187 | if (a_sig < b_sig) { 188 | goto b_bigger; 189 | } 190 | return float32_pack(ctx->round == ROUND_DOWN, 0, 0); 191 | b_exp_bigger: 192 | if (b_exp == 0xff) { 193 | return b_sig 194 | ? float32_propagate_nan(ctx, a, b) 195 | : float32_pack(sign ^ 1, 0xff, 0); 196 | } 197 | if (a_exp == 0) { 198 | exp_diff++; 199 | } else { 200 | a_sig |= LIT32(0x40000000); 201 | } 202 | a_sig = rshr32(a_sig, -exp_diff); 203 | b_sig |= LIT32(0x40000000); 204 | b_bigger: 205 | sig = b_sig - a_sig; 206 | exp = b_exp; 207 | sign ^= 1; 208 | goto normalize_round_and_pack; 209 | a_exp_bigger: 210 | if (a_exp == 0xff) { 211 | return a_sig ? float32_propagate_nan(ctx, a, b) : a; 212 | } 213 | if (b_exp == 0) { 214 | exp_diff--; 215 | } else { 216 | b_sig |= LIT32(0x40000000); 217 | } 218 | b_sig = rshr32(b_sig, exp_diff); 219 | a_sig |= LIT32(0x40000000); 220 | a_bigger: 221 | sig = a_sig - b_sig; 222 | exp = a_exp; 223 | normalize_round_and_pack: 224 | exp--; 225 | return float32_normalize_round_and_pack(ctx, sign, exp, sig); 226 | } 227 | 228 | Float32 float32_add(Context *ctx, Float32 a, Float32 b) { 229 | array_push(ctx->operations, OPERATION_ADD); 230 | const Flag a_sign = float32_sign(a); 231 | const Flag b_sign = float32_sign(b); 232 | return a_sign == b_sign 233 | ? 
float32_add_sig(ctx, a, b, a_sign) 234 | : float32_sub_sig(ctx, a, b, a_sign); 235 | } 236 | 237 | Float32 float32_sub(Context *ctx, Float32 a, Float32 b) { 238 | array_push(ctx->operations, OPERATION_SUB); 239 | const Flag a_sign = float32_sign(a); 240 | const Flag b_sign = float32_sign(b); 241 | return a_sign == b_sign 242 | ? float32_sub_sig(ctx, a, b, a_sign) 243 | : float32_add_sig(ctx, a, b, a_sign); 244 | } 245 | 246 | Float32 float32_mul(Context *ctx, Float32 a, Float32 b) { 247 | array_push(ctx->operations, OPERATION_MUL); 248 | Sint16 a_exp = float32_exp(a); 249 | Sint16 b_exp = float32_exp(b); 250 | Uint32 a_sig = float32_fract(a); 251 | Uint32 b_sig = float32_fract(b); 252 | const Flag a_sign = float32_sign(a); 253 | const Flag b_sign = float32_sign(b); 254 | const Flag sign = a_sign ^ b_sign; 255 | Uint32 mag_bits = 0; 256 | if (a_exp == 0xff) { 257 | if (a_sig || (b_exp == 0xff && b_sig)) goto propagate_nan; 258 | mag_bits = b_exp | b_sig; 259 | goto infinity; 260 | } 261 | if (b_exp == 0xff) { 262 | if (b_sig) goto propagate_nan; 263 | mag_bits = a_exp | a_sig; 264 | goto infinity; 265 | } 266 | if (a_exp == 0) { 267 | if (a_sig == 0) goto zero; 268 | const Normal32 n = float32_normalize_subnormal(a_sig); 269 | a_exp = n.exp; 270 | a_sig = n.sig; 271 | } 272 | if (b_exp == 0) { 273 | if (b_sig == 0) goto zero; 274 | const Normal32 n = float32_normalize_subnormal(b_sig); 275 | b_exp = n.exp; 276 | b_sig = n.sig; 277 | } 278 | Sint16 exp = a_exp + b_exp - 0x7f; 279 | a_sig = (a_sig | LIT32(0x00800000)) << 7; 280 | b_sig = (b_sig | LIT32(0x00800000)) << 8; 281 | 282 | // Compute with 64-bit mul, truncate to 32-bit. 
// Compute a / b. Records OPERATION_DIV in the context's operation list.
//
// Special cases, decided from the exponent and fraction fields:
//   NaN operand          -> float32_propagate_nan
//   inf / inf, 0 / 0     -> EXCEPTION_INVALID, returns FLOAT32_NAN
//   inf / finite         -> infinity
//   finite / inf, 0 / x  -> zero
//   non-zero / 0         -> EXCEPTION_INFINITE, returns infinity
// Infinity and zero results carry the XOR of the operand signs. All other
// results go through float32_round_and_pack.
Float32 float32_div(Context *ctx, Float32 a, Float32 b) {
	array_push(ctx->operations, OPERATION_DIV);
	Sint16 a_exp = float32_exp(a);
	Sint16 b_exp = float32_exp(b);
	Uint32 a_sig = float32_fract(a);
	Uint32 b_sig = float32_fract(b);
	const Flag a_sign = float32_sign(a);
	const Flag b_sign = float32_sign(b);
	const Flag sign = a_sign ^ b_sign;
	if (a_exp == 0xff) {
		// [a] is NaN (non-zero fraction) or infinity.
		if (a_sig) goto propagate_nan;
		if (b_exp == 0xff) {
			if (b_sig) goto propagate_nan;
			goto invalid; // inf / inf
		}
		goto infinity; // inf / finite
	}
	if (b_exp == 0xff) {
		if (b_sig) goto propagate_nan;
		goto zero; // finite / inf
	}
	if (b_exp == 0) {
		if (b_sig == 0) {
			// Division by zero: 0 / 0 is invalid, anything else is infinite.
			if ((a_exp | a_sig) == 0) goto invalid;
			context_raise(ctx, EXCEPTION_INFINITE);
			goto infinity;
		}
		// Zero exponent field with non-zero fraction: normalize so the
		// code below can treat it like any other operand.
		const Normal32 n = float32_normalize_subnormal(b_sig);
		b_exp = n.exp;
		b_sig = n.sig;
	}
	if (a_exp == 0) {
		if (a_sig == 0) goto zero; // 0 / non-zero
		const Normal32 n = float32_normalize_subnormal(a_sig);
		a_exp = n.exp;
		a_sig = n.sig;
	}
	Sint16 exp = a_exp - b_exp + 0x7e;
	// Make the implicit leading one explicit in both significands.
	a_sig = (a_sig | LIT32(0x00800000));
	b_sig = (b_sig | LIT32(0x00800000));
	// Use 64-bit divide for 32-bit significand. The dividend is pre-shifted
	// one extra place, with the exponent lowered to match, when it is smaller
	// than the divisor.
	Uint64 a_sig_64;
	if (a_sig < b_sig) {
		exp--;
		a_sig_64 = (Uint64)a_sig << 31;
	} else {
		a_sig_64 = (Uint64)a_sig << 30;
	}
	Uint32 sig = a_sig_64 / b_sig;
	if (!(sig & 0x3f)) {
		// Low six quotient bits are all zero: set bit 0 if the division left
		// a remainder, so float32_round_and_pack (which inspects the low
		// seven bits) does not treat the quotient as exact or as a tie.
		sig |= ((Uint64)b_sig * sig != a_sig_64);
	}
	return float32_round_and_pack(ctx, sign, exp, sig);
propagate_nan:
	return float32_propagate_nan(ctx, a, b);
invalid:
	context_raise(ctx, EXCEPTION_INVALID);
	return FLOAT32_NAN;
infinity:
	return float32_pack(sign, 0xff, 0);
zero:
	return float32_pack(sign, 0, 0);
}
(Uint32)((a.bits | b.bits) << 1) != 0; 412 | } 413 | 414 | return a.bits != b.bits && (a_sign ^ (a.bits< b.bits)); 415 | } 416 | 417 | // The others are implemented with a not on the flag. IEEE 754 requires 418 | // these identities be held, so this is safe. 419 | // a != b => !(a == b) 420 | Flag float32_ne(Context *ctx, Float32 a, Float32 b) { 421 | return !float32_eq(ctx, a, b); 422 | } 423 | 424 | // a >= b => !(a < b) 425 | Flag float32_gte(Context *ctx, Float32 a, Float32 b) { 426 | return !float32_lt(ctx, a, b); 427 | } 428 | 429 | // a > b => !(a <= b) 430 | Flag float32_gt(Context *ctx, Float32 a, Float32 b) { 431 | return !float32_lte(ctx, a, b); 432 | } 433 | 434 | Float32 float32_from_sint32(Context *ctx, Sint32 a) { 435 | if (a == 0) { 436 | return (Float32){0}; 437 | } 438 | if (a == (Sint32)0x80000000) { 439 | return float32_pack(1, 0x9e, 0); 440 | } 441 | const Flag sign = a < 0; 442 | return float32_normalize_round_and_pack(ctx, sign, 0x9c, sign ? -a : a); 443 | } -------------------------------------------------------------------------------- /float32.h: -------------------------------------------------------------------------------- 1 | #ifndef SOFT32_H 2 | #define SOFT32_H 3 | #include "soft.h" 4 | 5 | static inline Uint32 float32_fract(Float32 a) { 6 | return a.bits & LIT32(0x007FFFFF); 7 | } 8 | 9 | static inline Sint16 float32_exp(Float32 a) { 10 | return (a.bits >> 23) & 0xff; 11 | } 12 | 13 | static inline Flag float32_sign(Float32 a) { 14 | return a.bits >> 31; 15 | } 16 | 17 | static inline Flag float32_is_nan(Float32 a) { 18 | return LIT32(0xFF000000) << (Uint32)(a.bits << 1); 19 | } 20 | 21 | static inline Flag float32_is_snan(Float32 a) { 22 | return ((a.bits >> 22) & 0x1ff) == 0x1fe && (a.bits & LIT32(0x003FFFFF)); 23 | } 24 | 25 | static inline Flag float32_is_any_nan(Float32 a) { 26 | return (a.bits & LIT32(0x7fffffff)) > LIT32(0x7f800000); 27 | } 28 | 29 | // Pack sign, exponent, and significant into single-precision float. 
30 | static inline Float32 float32_pack(Flag sign, Sint16 exp, Uint32 sig) { 31 | return (Float32){(((Uint32)sign) << 31) + (((Uint32)exp) << 23) + sig}; 32 | } 33 | 34 | // Common constants. 35 | #define FLOAT32_NAN (Float32){LIT32(0xffffffff)} // NaN 36 | #define FLOAT32_EPSILON (Float32){LIT32(0x34000000)} // 0x0.000002p0 37 | #define FLOAT32_ZERO (Float32){LIT32(0x00000000)} // 0.0 38 | #define FLOAT32_HALF (Float32){LIT32(0x3f000000)} // 0.5 39 | #define FLOAT32_ONE (Float32){LIT32(0x3f800000)} // 1.0 40 | #define FLOAT32_MINUS_ONE (Float32){LIT32(0xbf800000)} // -1.0 41 | 42 | // Conversion of float32 NaN to CanonicalNaN format. 43 | CanonicalNaN float32_to_canonical_nan(Context*, Float32); 44 | 45 | // Normalize a subnormal. 46 | Normal32 float32_normalize_subnormal(Uint32 sig); 47 | 48 | // Build a float from sign, exponent, and significant with correct rounding. 49 | Float32 float32_round_and_pack(Context *ctx, Flag sign, Sint32 exp, Uint32 sig); 50 | 51 | // Arithmetic functions. 52 | Float32 float32_add(Context*, Float32, Float32); // a + b 53 | Float32 float32_sub(Context*, Float32, Float32); // a - b 54 | Float32 float32_mul(Context*, Float32, Float32); // a * b 55 | Float32 float32_div(Context*, Float32, Float32); // a / b 56 | 57 | // Relational functions. 58 | Flag float32_eq(Context*, Float32, Float32); // a == b 59 | Flag float32_lte(Context*, Float32, Float32); // a <= b 60 | Flag float32_lt(Context*, Float32, Float32); // a < b 61 | Flag float32_ne(Context*, Float32, Float32); // a != b 62 | Flag float32_gte(Context*, Float32, Float32); // a >= b 63 | Flag float32_gt(Context*, Float32, Float32); // a > b 64 | 65 | // Conversion functions. 66 | Float32 float32_from_sint32(Context *ctx, Sint32 x); 67 | 68 | // Needed temporarily for printing. 
69 | static inline float float32_cast(Float32 x) { 70 | union { Float32 s; float h; } u = {x}; 71 | return u.h; 72 | } 73 | 74 | #endif // FLOAT32_H -------------------------------------------------------------------------------- /float64.c: -------------------------------------------------------------------------------- 1 | #include "float64.h" 2 | #include "uint128.h" 3 | 4 | // Count leading zero bits. 5 | static inline Sint8 count_leading_zeros_u64(Uint64 a) { 6 | return a == 0 ? 64 : __builtin_clzl(a); 7 | } 8 | 9 | // Take two double-precision float values, one which must be NaN, and produce 10 | // the correct NaN result, taking care to raise an invalid exception when either 11 | // is a signaling NaN. 12 | static Float64 float64_propagate_nan(Context *ctx, Float64 a, Float64 b) { 13 | const Flag a_is_nan = float64_is_nan(a); 14 | const Flag a_is_snan = float64_is_snan(a); 15 | const Flag b_is_nan = float64_is_nan(b); 16 | const Flag b_is_snan = float64_is_snan(b); 17 | a.bits |= LIT64(0x0008000000000000); 18 | b.bits |= LIT64(0x0008000000000000); 19 | if (a_is_snan | b_is_snan) { 20 | context_raise(ctx, EXCEPTION_INVALID); 21 | } 22 | if (a_is_nan) { 23 | return (a_is_snan & b_is_nan) ? 
b : a; 24 | } 25 | return b; 26 | } 27 | 28 | CanonicalNaN float64_to_canonical_nan(Context* ctx, Float64 a) { 29 | if (float64_is_snan(a)) { 30 | context_raise(ctx, EXCEPTION_INVALID); 31 | } 32 | CanonicalNaN nan; 33 | nan.sign = a.bits >> 63; 34 | nan.lo = 0; 35 | nan.hi = a.bits << 12; 36 | return nan; 37 | } 38 | 39 | Float64 float64_round_and_pack(Context *ctx, Flag sign, Sint32 exp, Uint64 sig) { 40 | const Round rounding_mode = ctx->round; 41 | const Flag round_nearest_even = rounding_mode == ROUND_NEAREST_EVEN; 42 | Sint16 round_increment = 0x200; 43 | if (!round_nearest_even) { 44 | if (rounding_mode == ROUND_TO_ZERO) { 45 | round_increment = 0; 46 | } else { 47 | round_increment = 0x3ff; 48 | if (sign) { 49 | if (rounding_mode == ROUND_UP) { 50 | round_increment = 0; 51 | } 52 | } else { 53 | if (rounding_mode == ROUND_DOWN) { 54 | round_increment = 0; 55 | } 56 | } 57 | } 58 | } 59 | 60 | Sint16 round_bits = sig & 0x3ff; 61 | 62 | if (round_bits) { 63 | ctx->roundings++; 64 | } 65 | 66 | if (0x7fd <= (Uint16)exp) { 67 | if ((0x7fd < exp) || ((exp == 0x7fd) && ((Sint64)(sig + round_increment) < 0))) { 68 | context_raise(ctx, EXCEPTION_OVERFLOW | EXCEPTION_INEXACT); 69 | const Float64 pack = float64_pack(sign, 0x7ff, 0); 70 | return (Float64){pack.bits - (round_increment == 0 ? 0 : 1)}; 71 | } 72 | if (exp < 0) { 73 | const Flag is_tiny = (ctx->tininess == TININESS_BEFORE_ROUNDING) 74 | || (exp < -1) 75 | || (sig + round_increment < LIT64(0x8000000000000000)); 76 | sig = rshr64(sig, -exp); 77 | exp = 0; 78 | round_bits = sig & 0x3ff; 79 | if (is_tiny && round_bits) { 80 | context_raise(ctx, EXCEPTION_UNDERFLOW); 81 | } 82 | } 83 | } 84 | if (round_bits) { 85 | context_raise(ctx, EXCEPTION_INEXACT); 86 | } 87 | sig = (sig + round_increment) >> 10; 88 | sig &= ~(((round_bits ^ 0x200) == 0) & round_nearest_even); 89 | return float64_pack(sign, sig == 0 ? 
0 : exp, sig); 90 | } 91 | 92 | static inline Float64 float64_normalize_round_and_pack(Context *ctx, Flag sign, Sint16 exp, Uint64 sig) { 93 | const Sint8 shift = count_leading_zeros_u64(sig) - 1; 94 | return float64_round_and_pack(ctx, sign, exp - shift, sig << shift); 95 | } 96 | 97 | Normal64 float64_normalize_subnormal(Uint64 sig) { 98 | const Sint8 shift = count_leading_zeros_u64(sig) - 11; 99 | return (Normal64){sig << shift, 1 - shift}; 100 | } 101 | 102 | static Float64 float64_add_sig(Context *ctx, Float64 a, Float64 b, Flag sign) { 103 | Sint16 a_exp = float64_exp(a); 104 | Sint16 b_exp = float64_exp(b); 105 | Uint64 a_sig = float64_fract(a) << 9; 106 | Uint64 b_sig = float64_fract(b) << 9; 107 | Sint16 exp_diff = a_exp - b_exp; 108 | 109 | Sint16 exp; 110 | Uint64 sig; 111 | if (0 < exp_diff) { 112 | if (a_exp == 0x7ff) { 113 | return a_sig ? float64_propagate_nan(ctx, a, b) : a; 114 | } 115 | if (b_exp == 0) { 116 | exp_diff--; 117 | } else { 118 | b_sig |= LIT64(0x2000000000000000); 119 | } 120 | b_sig = rshr64(b_sig, exp_diff); 121 | exp = a_exp; 122 | } else if (exp_diff < 0) { 123 | if (b_exp == 0x7ff) { 124 | return b_sig 125 | ? float64_propagate_nan(ctx, a, b) 126 | : float64_pack(sign, 0x7ff, 0); 127 | } 128 | if (a_exp == 0) { 129 | exp_diff++; 130 | } else { 131 | a_sig |= LIT64(0x2000000000000000); 132 | } 133 | a_sig = rshr64(a_sig, -exp_diff); 134 | exp = b_exp; 135 | } else { 136 | if (a_exp == 0x7ff) { 137 | return (a_sig | b_sig) ? 
float64_propagate_nan(ctx, a, b) : a; 138 | } 139 | if (a_exp == 0) { 140 | return float64_pack(sign, 0, (a_sig + b_sig) >> 9); 141 | } 142 | sig = LIT64(0x4000000000000000) + a_sig + b_sig; 143 | exp = a_exp; 144 | goto round_and_pack; 145 | } 146 | a_sig |= LIT64(0x2000000000000000); 147 | sig = (a_sig + b_sig) << 1; 148 | exp--; 149 | if ((Sint64)sig < 0) { 150 | sig = a_sig + b_sig; 151 | exp++; 152 | } 153 | round_and_pack: 154 | return float64_round_and_pack(ctx, sign, exp, sig); 155 | } 156 | 157 | static Float64 float64_sub_sig(Context *ctx, Float64 a, Float64 b, Flag sign) { 158 | Sint16 a_exp = float64_exp(a); 159 | Sint16 b_exp = float64_exp(b); 160 | Uint64 a_sig = float64_fract(a) << 10; 161 | Uint64 b_sig = float64_fract(b) << 10; 162 | Sint16 exp_diff = a_exp - b_exp; 163 | 164 | // Needed because goto crosses initialization. 165 | Sint16 exp; 166 | Uint64 sig; 167 | if (0 < exp_diff) { 168 | goto a_exp_bigger; 169 | } 170 | if (exp_diff < 0) { 171 | goto b_exp_bigger; 172 | } 173 | if (a_exp == 0x7ff) { 174 | if (a_sig | b_sig) { 175 | return float64_propagate_nan(ctx, a, b); 176 | } 177 | context_raise(ctx, EXCEPTION_INVALID); 178 | return FLOAT64_NAN; 179 | } 180 | if (a_exp == 0) { 181 | a_exp = 1; 182 | b_exp = 1; 183 | } 184 | if (b_sig < a_sig) { 185 | goto a_bigger; 186 | } 187 | if (a_sig < b_sig) { 188 | goto b_bigger; 189 | } 190 | return float64_pack(ctx->round == ROUND_DOWN, 0, 0); 191 | b_exp_bigger: 192 | if (b_exp == 0x7ff) { 193 | return b_sig 194 | ? float64_propagate_nan(ctx, a, b) 195 | : float64_pack(sign ^ 1, 0xff, 0); 196 | } 197 | if (a_exp == 0) { 198 | exp_diff++; 199 | } else { 200 | a_sig |= LIT64(0x4000000000000000); 201 | } 202 | a_sig = rshr64(a_sig, -exp_diff); 203 | b_sig |= LIT64(0x4000000000000000); 204 | b_bigger: 205 | sig = b_sig - a_sig; 206 | exp = b_exp; 207 | sign ^= 1; 208 | goto normalize_round_and_pack; 209 | a_exp_bigger: 210 | if (a_exp == 0x7ff) { 211 | return a_sig ? 
float64_propagate_nan(ctx, a, b) : a; 212 | } 213 | if (b_exp == 0) { 214 | exp_diff--; 215 | } else { 216 | b_sig |= LIT64(0x4000000000000000); 217 | } 218 | b_sig = rshr64(b_sig, exp_diff); 219 | a_sig |= LIT64(0x4000000000000000); 220 | a_bigger: 221 | sig = a_sig - b_sig; 222 | exp = a_exp; 223 | normalize_round_and_pack: 224 | exp--; 225 | return float64_normalize_round_and_pack(ctx, sign, exp, sig); 226 | } 227 | 228 | Float64 float64_add(Context *ctx, Float64 a, Float64 b) { 229 | const Flag a_sign = float64_sign(a); 230 | const Flag b_sign = float64_sign(b); 231 | return a_sign == b_sign 232 | ? float64_add_sig(ctx, a, b, a_sign) 233 | : float64_sub_sig(ctx, a, b, b_sign); 234 | } 235 | 236 | Float64 float64_sub(Context *ctx, Float64 a, Float64 b) { 237 | const Flag a_sign = float64_sign(a); 238 | const Flag b_sign = float64_sign(b); 239 | return a_sign == b_sign 240 | ? float64_sub_sig(ctx, a, b, a_sign) 241 | : float64_add_sig(ctx, a, b, a_sign); 242 | } 243 | 244 | Float64 float64_mul(Context *ctx, Float64 a, Float64 b) { 245 | Sint16 a_exp = float64_exp(a); 246 | Sint16 b_exp = float64_exp(b); 247 | Uint64 a_sig = float64_fract(a); 248 | Uint64 b_sig = float64_fract(b); 249 | Flag a_sign = float64_sign(a); 250 | Flag b_sign = float64_sign(b); 251 | Flag sign = a_sign ^ b_sign; 252 | if (a_exp == 0x7ff) { 253 | if (a_sig || (b_exp == 0x7ff && b_sig)) { 254 | return float64_propagate_nan(ctx, a, b); 255 | } 256 | if ((b_exp | b_sig) == 0) { 257 | context_raise(ctx, EXCEPTION_INVALID); 258 | return FLOAT64_NAN; 259 | } 260 | return float64_pack(sign, 0x7ff, 0); 261 | } 262 | if (b_exp == 0x7ff) { 263 | if (b_sig) { 264 | return float64_propagate_nan(ctx, a, b); 265 | } 266 | if ((a_exp | a_sig) == 0) { 267 | context_raise(ctx, EXCEPTION_INVALID); 268 | return FLOAT64_NAN; 269 | } 270 | return float64_pack(sign, 0x7ff, 0); 271 | } 272 | if (a_exp == 0) { 273 | if (a_sig == 0) { 274 | return float64_pack(sign, 0, 0); 275 | } 276 | const Normal64 n = 
float64_normalize_subnormal(a_sig); 277 | a_exp = n.exp; 278 | a_sig = n.sig; 279 | } 280 | if (b_exp == 0) { 281 | if (b_sig == 0) { 282 | return float64_pack(sign, 0, 0); 283 | const Normal64 n = float64_normalize_subnormal(b_sig); 284 | b_exp = n.exp; 285 | b_sig = n.sig; 286 | } 287 | } 288 | Sint16 exp = a_exp + b_exp - 0x3ff; 289 | a_sig = (a_sig | LIT64(0x0010000000000000)) << 10; 290 | b_sig = (b_sig | LIT64(0x0010000000000000)) << 11; 291 | 292 | // Compute with 128-bit mul, truncate to 64-bit. 293 | Uint128 mul = uint128_mul64x64(a_sig, b_sig); 294 | mul.z0 |= mul.z1 != 0; 295 | if (0 <= (Sint64)(mul.z0 << 1)) { 296 | mul.z0 <<= 1; 297 | exp--; 298 | } 299 | return float64_round_and_pack(ctx, sign, exp, mul.z0); 300 | } 301 | 302 | Float64 float64_div(Context *ctx, Float64 a, Float64 b) { 303 | Sint16 a_exp = float64_exp(a); 304 | Sint16 b_exp = float64_exp(b); 305 | Uint64 a_sig = float64_fract(a); 306 | Uint64 b_sig = float64_fract(b); 307 | Flag a_sign = float64_sign(a); 308 | Flag b_sign = float64_sign(b); 309 | Flag sign = a_sign ^ b_sign; 310 | if (a_exp == 0x7ff) { 311 | if (a_sig) { 312 | return float64_propagate_nan(ctx, a, b); 313 | } 314 | if (b_exp == 0x7ff) { 315 | if (b_sig) { 316 | return float64_propagate_nan(ctx, a, b); 317 | } 318 | context_raise(ctx, EXCEPTION_INVALID); 319 | return FLOAT64_NAN; 320 | } 321 | return float64_pack(sign, 0xff, 0); 322 | } 323 | if (b_exp == 0x7ff) { 324 | return b_sig 325 | ? 
float64_propagate_nan(ctx, a, b) 326 | : float64_pack(sign, 0, 0); 327 | } 328 | if (b_exp == 0) { 329 | if (b_sig == 0) { 330 | if ((a_exp | a_sig) == 0) { 331 | context_raise(ctx, EXCEPTION_INVALID); 332 | return FLOAT64_NAN; 333 | } 334 | context_raise(ctx, EXCEPTION_INFINITE); 335 | return float64_pack(sign, 0xff, 0); 336 | } 337 | const Normal64 n = float64_normalize_subnormal(b_sig); 338 | b_exp = n.exp; 339 | b_sig = n.sig; 340 | } 341 | if (a_exp == 0) { 342 | if (a_sig == 0) { 343 | return float64_pack(sign, 0, 0); 344 | } 345 | const Normal64 n = float64_normalize_subnormal(a_sig); 346 | a_exp = n.exp; 347 | a_sig = n.sig; 348 | } 349 | Sint16 exp = a_exp - b_exp + 0x7d; 350 | a_sig = (a_sig | LIT64(0x0010000000000000)) << 10; 351 | b_sig = (b_sig | LIT64(0x0010000000000000)) << 11; 352 | if (b_sig <= a_sig + a_sig) { 353 | a_sig >>= 1; 354 | exp++; 355 | } 356 | 357 | Uint64 sig = uint128_div128x64((Uint128){a_sig, 0}, b_sig); 358 | if ((sig & 0x1ff) <= 2) { 359 | Uint128 term = uint128_mul64x64(b_sig, sig); 360 | Uint128 rem = uint128_sub((Uint128){a_sig, 0}, term); 361 | while ((Sint64)rem.z0 < 0) { 362 | sig--; 363 | rem = uint128_add(rem, (Uint128){0, b_sig}); 364 | } 365 | sig |= rem.z1 != 0; 366 | } 367 | 368 | return float64_round_and_pack(ctx, sign, exp, sig); 369 | } -------------------------------------------------------------------------------- /float64.h: -------------------------------------------------------------------------------- 1 | #ifndef SOFT64_H 2 | #define SOFT64_H 3 | #include "soft.h" 4 | 5 | static inline Uint64 float64_fract(Float64 a) { 6 | return a.bits & LIT64(0x000FFFFFFFFFFFFF); 7 | } 8 | 9 | static inline Sint16 float64_exp(Float64 a) { 10 | return (a.bits >> 52) & 0x7ff; 11 | } 12 | 13 | static inline Flag float64_sign(Float64 a) { 14 | return a.bits >> 63; 15 | } 16 | 17 | static inline Flag float64_is_nan(Float64 a) { 18 | return LIT64(0xFFE0000000000000) < (Uint64)(a.bits << 1); 19 | } 20 | 21 | static inline Flag 
float64_is_snan(Float64 a) { 22 | return (((a.bits >> 51) & 0xfff) == 0xffe) 23 | && (a.bits & LIT64(0x0007ffffffffffff)); 24 | } 25 | 26 | // Pack sign, exponent, and significant into double-precision float. 27 | static inline Float64 float64_pack(Flag sign, Sint16 exp, Uint64 sig) { 28 | return (Float64){(((Uint64)sign) << 63) + (((Uint64)exp) << 52) + sig}; 29 | } 30 | 31 | // Common constants. 32 | static const Float64 FLOAT64_NAN = {LIT64(0xffffffffffffffff)}; 33 | static const Float64 FLOAT64_ZERO = {0}; // 0x0p+0 34 | 35 | // Conversion of float32 NaN to CanonicalNaN format. 36 | CanonicalNaN float64_to_canonical_nan(Context*, Float64); 37 | 38 | // Normalize subnormal. 39 | Normal64 float64_normalize_subnormal(Uint64 sig); 40 | 41 | // Build a float64 from sign, exponent, and significant with correct rounding. 42 | Float64 float64_round_and_pack(Context *ctx, Flag sign, Sint32 exp, Uint64 sig); 43 | 44 | // Arithmetic functions. 45 | Float64 float64_add(Context*, Float64, Float64); // a + b 46 | Float64 float64_sub(Context*, Float64, Float64); // a - b 47 | Float64 float64_mul(Context*, Float64, Float64); // a * b 48 | Float64 float64_div(Context*, Float64, Float64); // a / b 49 | 50 | // Needed temporarily for printing. 51 | static inline double float64_cast(Float64 x) { 52 | union { Float64 s; double h; } u = {x}; 53 | return u.h; 54 | } 55 | 56 | #endif // FLOAT64_H -------------------------------------------------------------------------------- /kernel32.c: -------------------------------------------------------------------------------- 1 | #include "kernel32.h" 2 | 3 | static const Float32 HUGE = {LIT32(0x7b800000)}; // 0x1p120f 4 | // When the result of evaluating something is not used the compiler will attempt 5 | // to remove that dead code, even though in this case we want the evaluation 6 | // of some expressions to happen to trigger exceptions. 
7 | static inline void float32_force_eval(Float32 x) { 8 | volatile Float32 y; 9 | y = x; 10 | (void)y; // Mark as used. 11 | } 12 | 13 | Float32 float32_floor(Context *ctx, Float32 x) { 14 | const Sint16 e = float32_exp(x) - 0x7f; 15 | if (e >= 23) { 16 | return x; 17 | } 18 | if (e >= 0) { 19 | const Uint32 m = LIT32(0x007fffff) >> e; 20 | if ((x.bits & m) == 0) { 21 | return x; 22 | } 23 | float32_force_eval(float32_add(ctx, x, HUGE)); 24 | if (x.bits >> 31) { 25 | x.bits += m; 26 | } 27 | x.bits &= ~m; 28 | } else { 29 | float32_force_eval(float32_add(ctx, x, HUGE)); 30 | if (x.bits >> 31 == 0) { 31 | x.bits = 0; 32 | } else if (x.bits << 1) { 33 | x.bits = LIT32(0xbf800000); // -1.0 34 | } 35 | } 36 | return x; 37 | } 38 | 39 | Float32 float32_ceil(Context *ctx, Float32 x) { 40 | const Sint16 e = float32_exp(x) - 0x7f; 41 | if (e >= 23) { 42 | return x; 43 | } 44 | if (e >= 0) { 45 | const Uint32 m = LIT32(0x007fffff) >> e; 46 | if ((x.bits & m) == 0) { 47 | return x; 48 | } 49 | float32_force_eval(float32_add(ctx, x, HUGE)); 50 | if (x.bits >> 31 == 0) { 51 | x.bits += m; 52 | } 53 | x.bits &= ~m; 54 | } else { 55 | float32_force_eval(float32_add(ctx, x, HUGE)); 56 | if (x.bits >> 31) { 57 | x.bits = LIT32(0x80000000); // -0.0 58 | } else if (x.bits << 1) { 59 | x.bits = LIT32(0x3f800000); // 1.0 60 | } 61 | } 62 | return x; 63 | } 64 | 65 | Float32 float32_trunc(Context *ctx, Float32 x) { 66 | Sint16 e = float32_exp(x) - 0x7f + 9; 67 | if (e >= 23 + 9) { 68 | return x; 69 | } 70 | if (e < 9) { 71 | e = 1; 72 | } 73 | const Uint32 m = -1u >> e; 74 | if ((x.bits & m) == 0) { 75 | return x; 76 | } 77 | float32_force_eval(float32_add(ctx, x, HUGE)); 78 | x.bits &= ~m; 79 | return x; 80 | } 81 | 82 | // 32-bit multiplication without truncation. 
83 | static inline Uint32 mul32(Uint32 a, Uint32 b) { 84 | return (Uint64)a*b >> 32; 85 | } 86 | 87 | // Computes (x-x) / (x-x) to correctly raise an invalid exception and compute 88 | // correct exceptional value of NaN, sNaN, +Inf, or -Inf for given x. 89 | static Float32 float32_invalid(Context *ctx, Float32 x) { 90 | const Float32 sub = float32_sub(ctx, x, x); 91 | return float32_div(ctx, sub, sub); 92 | } 93 | 94 | Float32 float32_sqrt(Context *ctx, Float32 x) { 95 | // if x in [1,2): i = (Sint32)(64*x); 96 | // if x in [2,4): i = (Sint32)(32*x-64); 97 | // TABLE[i]*2^-16 is estimating 1/sqrt(x) with small relative error: 98 | // |TABLE[i]*0x1p-16*sqrt(x) - 1| < -0x1.fdp-9 < 2^-8 99 | static const Uint16 TABLE[128] = { 100 | 0xb451, 0xb2f0, 0xb196, 0xb044, 0xaef9, 0xadb6, 0xac79, 0xab43, 101 | 0xaa14, 0xa8eb, 0xa7c8, 0xa6aa, 0xa592, 0xa480, 0xa373, 0xa26b, 102 | 0xa168, 0xa06a, 0x9f70, 0x9e7b, 0x9d8a, 0x9c9d, 0x9bb5, 0x9ad1, 103 | 0x99f0, 0x9913, 0x983a, 0x9765, 0x9693, 0x95c4, 0x94f8, 0x9430, 104 | 0x936b, 0x92a9, 0x91ea, 0x912e, 0x9075, 0x8fbe, 0x8f0a, 0x8e59, 105 | 0x8daa, 0x8cfe, 0x8c54, 0x8bac, 0x8b07, 0x8a64, 0x89c4, 0x8925, 106 | 0x8889, 0x87ee, 0x8756, 0x86c0, 0x862b, 0x8599, 0x8508, 0x8479, 107 | 0x83ec, 0x8361, 0x82d8, 0x8250, 0x81c9, 0x8145, 0x80c2, 0x8040, 108 | 0xff02, 0xfd0e, 0xfb25, 0xf947, 0xf773, 0xf5aa, 0xf3ea, 0xf234, 109 | 0xf087, 0xeee3, 0xed47, 0xebb3, 0xea27, 0xe8a3, 0xe727, 0xe5b2, 110 | 0xe443, 0xe2dc, 0xe17a, 0xe020, 0xdecb, 0xdd7d, 0xdc34, 0xdaf1, 111 | 0xd9b3, 0xd87b, 0xd748, 0xd61a, 0xd4f1, 0xd3cd, 0xd2ad, 0xd192, 112 | 0xd07b, 0xcf69, 0xce5b, 0xcd51, 0xcc4a, 0xcb48, 0xca4a, 0xc94f, 113 | 0xc858, 0xc764, 0xc674, 0xc587, 0xc49d, 0xc3b7, 0xc2d4, 0xc1f4, 114 | 0xc116, 0xc03c, 0xbf65, 0xbe90, 0xbdbe, 0xbcef, 0xbc23, 0xbb59, 115 | 0xba91, 0xb9cc, 0xb90a, 0xb84a, 0xb78c, 0xb6d0, 0xb617, 0xb560, 116 | }; 117 | 118 | Uint32 ix = x.bits; 119 | 120 | if (ix - 0x00800000 >= 0x7f800000 - 0x00800000) { 121 | // x < 0x1p-126, inf, or nan. 
122 | if (ix * 2 == 0) { 123 | return x; 124 | } 125 | if (ix == LIT32(0x7f800000)) { 126 | return x; 127 | } 128 | if (ix > LIT32(0x7f800000)) { 129 | return float32_invalid(ctx, x); 130 | } 131 | // is subnormal, normalize it. 132 | const Float32 n = float32_mul(ctx, x, (Float32){LIT32(0x4b000000)}); // 0x1p23f 133 | ix = n.bits; 134 | ix -= 23 << 23; 135 | } 136 | 137 | // x = 4^e m; with int e and m in [1, 4). 138 | Uint32 even = ix & LIT32(0x00800000); 139 | Uint32 m1 = (ix << 8) | LIT32(0x80000000); 140 | Uint32 m0 = (ix << 7) & LIT32(0x7fffffff); 141 | Uint32 m = even ? m0 : m1; 142 | 143 | // 2^e is exponent part. 144 | Uint32 ey = ix >> 1; 145 | ey += LIT32(0x3f800000) >> 1; 146 | ey &= LIT32(0x7f800000); 147 | 148 | // Compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 iterations. 149 | static const Uint32 THREE = LIT32(0xc0000000); 150 | const Uint32 i = (ix >> 17) % 128; 151 | Uint32 r, s, d, u; 152 | r = (Uint32)TABLE[i] << 16; 153 | // |r*sqrt(m) - 1| < 0x1p-8 154 | s = mul32(m, r); 155 | // |s/sqrt(m) - 1| < 0x1p-8 156 | d = mul32(s, r); 157 | u = THREE - d; 158 | r = mul32(r, u) << 1; 159 | // |r*sqrt(m) - 1| < 0x1.7bp-16 160 | s = mul32(s, u) << 1; 161 | // |s/sqrt(m) - 1| < 0x1.7bp-16 162 | d = mul32(s, r); 163 | u = THREE - d; 164 | s = mul32(s, u); 165 | // -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 166 | s = (s - 1) >> 6; 167 | // s < sqrt(m) < s + 0x1.08p-23 168 | 169 | // Compute nearest rounded result. 170 | const Uint32 d0 = (m << 16) - s*s; 171 | const Uint32 d1 = s - d0; 172 | const Uint32 d2 = d1 + s + 1; 173 | s += d1 >> 31; 174 | s &= LIT32(0x007fffff); 175 | s |= ey; 176 | 177 | const Float32 y = {s}; 178 | 179 | // Handle rounding and inexact exceptions. 180 | const Float32 t = {(d2 == 0 ? 
0 : LIT32(0x01000000)) | ((d1 ^ d2) & LIT32(0x80000000))}; 181 | 182 | return float32_add(ctx, y, t); 183 | } 184 | 185 | Float32 float32_abs(Context *ctx, Float32 x) { 186 | (void)ctx; 187 | x.bits &= 0x7fffffff; 188 | return x; 189 | } 190 | 191 | Float32 float32_copysign(Context *ctx, Float32 x, Float32 y) { 192 | (void)ctx; 193 | x.bits &= LIT32(0x7fffffff); // abs 194 | x.bits |= y.bits & LIT32(0x80000000); // copy sign bit 195 | return x; 196 | } 197 | 198 | Float32 float32_max(Context *ctx, Float32 x, Float32 y) { 199 | if (float32_is_any_nan(x)) { 200 | return y; 201 | } 202 | if (float32_is_any_nan(y)) { 203 | return x; 204 | } 205 | 206 | // Handle signed zeros. 207 | const Flag sign_x = float32_sign(x); 208 | const Flag sign_y = float32_sign(y); 209 | if (sign_x != sign_y) { 210 | return sign_x ? y : x; 211 | } 212 | 213 | // IEEE makes it clear min and max should both use lt relational operation. 214 | return float32_lt(ctx, x, y) ? y : x; 215 | } 216 | 217 | Float32 float32_min(Context *ctx, Float32 x, Float32 y) { 218 | if (float32_is_any_nan(x)) { 219 | return y; 220 | } 221 | if (float32_is_any_nan(y)) { 222 | return x; 223 | } 224 | 225 | // Handle signed zeros. 226 | const Flag sign_x = float32_sign(x); 227 | const Flag sign_y = float32_sign(y); 228 | if (sign_x != sign_y) { 229 | return sign_x ? x : y; 230 | } 231 | 232 | return float32_lt(ctx, x, y) ? 
x : y; 233 | } -------------------------------------------------------------------------------- /kernel32.h: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL32_H 2 | #define KERNEL32_H 3 | #include "real32.h" 4 | 5 | Float32 float32_floor(Context*, Float32); 6 | Float32 float32_ceil(Context*, Float32); 7 | Float32 float32_trunc(Context*, Float32); 8 | Float32 float32_sqrt(Context*, Float32); 9 | Float32 float32_abs(Context*, Float32); 10 | Float32 float32_copysign(Context*, Float32, Float32); 11 | Float32 float32_max(Context*, Float32, Float32); 12 | Float32 float32_min(Context*, Float32, Float32); 13 | 14 | #endif -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | #include // printf 2 | #include // DBL_DIG 3 | #include // atoi 4 | 5 | #include "eval.h" 6 | 7 | static int usage(const char *app) { 8 | fprintf(stderr, "%s [OPTION]... [EXPRESSION]\n", app); 9 | fprintf(stderr, "-r rounding mode\n"); 10 | fprintf(stderr, " 0 - nearest even [default]\n"); 11 | fprintf(stderr, " 1 - to zero\n"); 12 | fprintf(stderr, " 2 - down\n"); 13 | fprintf(stderr, " 3 - up\n"); 14 | fprintf(stderr, "-t tininess detection mode\n"); 15 | fprintf(stderr, " 0 - before rounding [default]\n"); 16 | fprintf(stderr, " 1 - after rounding\n"); 17 | return 1; 18 | } 19 | 20 | int main(int argc, char **argv) { 21 | argc--; 22 | argv++; 23 | if (argc == 0) { 24 | return usage(argv[-1]); 25 | } 26 | 27 | Context c; 28 | c.round = ROUND_NEAREST_EVEN; 29 | c.tininess = TININESS_BEFORE_ROUNDING; 30 | context_init(&c); 31 | 32 | // Parse some command line options. 
33 | if (argv[0][0] == '-') { 34 | if (argv[0][1] == 'r') { 35 | int round = atoi(argv[1]); 36 | if (round < 0 || round > 3) { 37 | return usage(argv[-1]); 38 | } 39 | argv += 2; // skip -r %d 40 | argc -= 2; 41 | c.round = round; 42 | } else if (argv[0][1] == 't') { 43 | int tiny = atoi(argv[1]); 44 | if (tiny < 0 || tiny > 1) { 45 | return usage(argv[-1]); 46 | } 47 | argv += 2; // skip -t %d 48 | argc -= 2; 49 | c.tininess = tiny; 50 | } else { 51 | return usage(argv[-1]); 52 | } 53 | } 54 | 55 | if (argc == 0) { 56 | return usage(argv[-1]); 57 | } 58 | 59 | Expression *e; 60 | if (!expr_parse(&e, argv[0])) { 61 | return 2; 62 | } 63 | 64 | const Real32 result = expr_eval32(&c, e); 65 | expr_print(stdout, e); 66 | printf("\n\tans: %.*f\n\terr: %.*f\n", 67 | DBL_DIG - 1, float32_cast(result.value), 68 | DBL_DIG - 1, float32_cast(result.eps)); 69 | expr_free(e); 70 | 71 | context_free(&c); 72 | 73 | return 0; 74 | } -------------------------------------------------------------------------------- /real32.c: -------------------------------------------------------------------------------- 1 | #include "real32.h" 2 | #include "kernel32.h" 3 | 4 | // When calculating error we don't want to muddy the value context. Use a copy 5 | // of it with the same rounding and tininess mode ignoring everything else. 
6 | static inline Context eps_ctx(const Context *ctx) { 7 | Context c; 8 | context_copy(&c, ctx); 9 | return c; 10 | } 11 | 12 | Real32 real32_add(Context *ctx, Real32 a, Real32 b) { 13 | Context ec = eps_ctx(ctx); 14 | Real32 r; 15 | r.value = float32_add(ctx, a.value, b.value); 16 | r.eps = 17 | float32_add( 18 | &ec, 19 | // err(a) + err(b) 20 | float32_add(&ec, a.eps, b.eps), 21 | // EPSILON * abs(value) 22 | float32_mul(&ec, FLOAT32_EPSILON, float32_abs(&ec, r.value))); 23 | return r; 24 | } 25 | 26 | Real32 real32_sub(Context *ctx, Real32 a, Real32 b) { 27 | Context ec = eps_ctx(ctx); 28 | Real32 r; 29 | r.value = float32_sub(ctx, a.value, b.value); 30 | r.eps = 31 | float32_add( 32 | &ec, 33 | // err(a) + err(b) 34 | float32_add(&ec, a.eps, b.eps), 35 | // EPSILON * abs(value) 36 | float32_mul(&ec, FLOAT32_EPSILON, float32_abs(&ec, r.value))); 37 | return r; 38 | } 39 | 40 | Real32 real32_mul(Context *ctx, Real32 a, Real32 b) { 41 | Context ec = eps_ctx(ctx); 42 | Real32 r; 43 | r.value = float32_mul(ctx, a.value, b.value); 44 | r.eps = float32_add( 45 | &ec, 46 | float32_add( 47 | &ec, 48 | float32_add( 49 | &ec, 50 | // err(a) * abs(b) 51 | float32_mul(&ec, a.eps, float32_abs(&ec, b.value)), 52 | // err(b) * abs(a) 53 | float32_mul(&ec, b.eps, float32_abs(&ec, a.value))), 54 | // err(a) * err(b) 55 | float32_mul(&ec, a.eps, b.eps)), 56 | // EPSILON * abs(value) 57 | float32_mul(&ec, FLOAT32_EPSILON, float32_abs(&ec, r.value))); 58 | return r; 59 | } 60 | 61 | // Calculating division error is non-trivial when the divisor is inaccurate, 62 | // use the following to recover inaccuracies for inaccurate divisor 63 | // r^2(-x) - r*x + 0 = 0 64 | Real32 real32_div(Context *ctx, Real32 a, Real32 b) { 65 | Context ec = eps_ctx(ctx); 66 | Real32 r; 67 | r.value = float32_div(ctx, a.value, b.value); 68 | 69 | const Float32 abs_b = float32_abs(&ec, b.value); 70 | const Float32 abs_r = float32_abs(&ec, r.value); 71 | Float32 e = 72 | float32_div( 73 | &ec, 74 | 
float32_add( 75 | &ec, 76 | a.eps, 77 | // abs(r) * eps(b) 78 | float32_mul(&ec, abs_r, b.eps)), 79 | abs_b); 80 | 81 | // Use more accurate for inaccurate divisors. 82 | static const Float32 EPS = {LIT32(0x3c23d70a)}; // 0.01f 83 | if (float32_gt(&ec, b.eps, float32_mul(&ec, EPS, abs_b))) { 84 | const Float32 r = float32_div(&ec, b.eps, b.value); 85 | // e = e * (1 + (1 + r) * r) 86 | e = float32_mul( 87 | &ec, 88 | e, 89 | // 1 + (1 + r) * r 90 | float32_add( 91 | &ec, 92 | float32_from_sint32(&ec, 1), 93 | // (1 + r) * r 94 | float32_mul( 95 | &ec, 96 | // 1 + r 97 | float32_add( 98 | &ec, 99 | float32_from_sint32(&ec, 1), 100 | r), 101 | r))); 102 | } 103 | 104 | r.eps = 105 | // e + (EPSILON * abs(value)) 106 | float32_add( 107 | &ec, 108 | e, 109 | // EPSILON * abs(value) 110 | float32_mul(&ec, FLOAT32_EPSILON, float32_abs(&ec, r.value))); 111 | 112 | return r; 113 | } 114 | 115 | Real32 real32_sqrt(Context *ctx, Real32 x) { 116 | Context ec = eps_ctx(ctx); 117 | 118 | // Calculate error. 119 | Float32 d; 120 | // Assume non-negative input. 121 | if (float32_gte(&ec, x.value, FLOAT32_ZERO)) { 122 | const Float32 r = float32_sqrt(&ec, x.value); 123 | // if x > 10.0 * err(x) 124 | const Float32 err = float32_mul(&ec, float32_from_sint32(&ec, 10), x.eps); 125 | if (float32_gt(&ec, x.value, err)) { 126 | // 0.5 * (err(x) / r) 127 | d = float32_mul(&ec, FLOAT32_HALF, float32_div(&ec, x.eps, r)); 128 | } else { 129 | // if x > err(x) 130 | if (float32_gt(&ec, x.value, x.eps)) { 131 | // r - sqrt(x - err(x)) 132 | d = float32_sub(&ec, r, float32_sqrt(&ec, float32_sub(&ec, x.value, x.eps))); 133 | } else { 134 | // max(r, sqrt(x + err(x)) - r) 135 | d = float32_max(&ec, r, float32_sub(&ec, float32_sqrt(&ec, float32_add(&ec, x.value, x.eps)), r)); 136 | } 137 | } 138 | // d += EPSILON * abs(r) 139 | d = float32_add(&ec, d, float32_mul(&ec, FLOAT32_EPSILON, float32_abs(&ec, r))); 140 | } else { 141 | // Assume negative input. 
142 | if (float32_lt(&ec, x.value, float32_mul(&ec, x.eps, FLOAT32_MINUS_ONE))) { 143 | d = FLOAT32_NAN; 144 | } else { 145 | // Assume zero input. 146 | d = float32_sqrt(&ec, x.eps); 147 | } 148 | } 149 | 150 | return (Real32){float32_sqrt(ctx, x.value), d}; 151 | } 152 | 153 | // Operations that cannot generate error. 154 | #define REAL32_WRAP1_NO_ERROR(name) \ 155 | Real32 real32_ ## name(Context *ctx, Real32 a) { \ 156 | return (Real32){float32_ ## name(ctx, a.value), {0}}; \ 157 | } 158 | 159 | #define REAL32_WRAP2_NO_ERROR(name) \ 160 | Real32 real32_ ## name(Context *ctx, Real32 a, Real32 b) { \ 161 | return (Real32){float32_ ## name(ctx, a.value, b.value), {0}}; \ 162 | } 163 | 164 | #define REAL32_WRAP_RELATION_NO_ERROR(name) \ 165 | Real32 real32_ ## name(Context *ctx, Real32 a, Real32 b) { \ 166 | return float32_ ## name(ctx, a.value, b.value) ? REAL32_ONE : REAL32_ZERO; \ 167 | } 168 | 169 | REAL32_WRAP1_NO_ERROR(floor) 170 | REAL32_WRAP1_NO_ERROR(ceil) 171 | REAL32_WRAP1_NO_ERROR(trunc) 172 | 173 | REAL32_WRAP1_NO_ERROR(abs) 174 | 175 | REAL32_WRAP2_NO_ERROR(copysign) 176 | REAL32_WRAP2_NO_ERROR(max) 177 | REAL32_WRAP2_NO_ERROR(min) 178 | 179 | REAL32_WRAP_RELATION_NO_ERROR(eq) 180 | REAL32_WRAP_RELATION_NO_ERROR(lte) 181 | REAL32_WRAP_RELATION_NO_ERROR(lt) 182 | REAL32_WRAP_RELATION_NO_ERROR(ne) 183 | REAL32_WRAP_RELATION_NO_ERROR(gte) 184 | REAL32_WRAP_RELATION_NO_ERROR(gt) -------------------------------------------------------------------------------- /real32.h: -------------------------------------------------------------------------------- 1 | #ifndef REAL32_H 2 | #define REAL32_H 3 | #include "float32.h" 4 | #include "kernel32.h" 5 | 6 | // Accumulative error accounting. 7 | // 8 | // The idea here is arithmetic results of soft float will always be close to 9 | // the correct value +- 0.5 * EPSILON * value. 
10 | // 11 | // That is: 12 | // err(a+b) = err(a) + err(b) + EPSILON * abs(a+b) 13 | // 14 | // The error result of an elementary floating-point operation does not exceed 15 | // and is close to abs(result) * EPSILON. 16 | typedef struct Real32 Real32; 17 | 18 | struct Real32 { 19 | Float32 value; 20 | Float32 eps; 21 | }; 22 | 23 | // Cannot use FLOAT32_ZERO here as it would be a non-const initializer in C. 24 | #define REAL32_NAN (Real32){FLOAT32_NAN, {0}} // NaN 25 | #define REAL32_EPSILON (Real32){FLOAT32_EPSILON, {0}} // 0x0.000002p0 26 | #define REAL32_ZERO (Real32){FLOAT32_ZERO, {0}} // 0.0 27 | #define REAL32_HALF (Real32){FLOAT32_HALF, {0}} // 0.5 28 | #define REAL32_ONE (Real32){FLOAT32_ONE, {0}} // 1.0 29 | #define REAL32_MINUS_ONE (Real32){FLOAT32_MINUS_ONE, {0}} // -1.0 30 | 31 | Real32 real32_add(Context *ctx, Real32 a, Real32 b); 32 | Real32 real32_sub(Context *ctx, Real32 a, Real32 b); 33 | Real32 real32_mul(Context *ctx, Real32 a, Real32 b); 34 | Real32 real32_div(Context *ctx, Real32 a, Real32 b); 35 | 36 | #define REAL32_WRAP1_NO_ERROR(name) \ 37 | Real32 real32_ ## name(Context *ctx, Real32 a) 38 | #define REAL32_WRAP2_NO_ERROR(name) \ 39 | Real32 real32_ ## name(Context *ctx, Real32 a, Real32 b) 40 | #define REAL32_WRAP_RELATION_NO_ERROR(name) \ 41 | Real32 real32_ ## name(Context *ctx, Real32 a, Real32 b) 42 | 43 | // Operations that cannot produce errors. 44 | // 1. Truncation. 45 | REAL32_WRAP1_NO_ERROR(floor); 46 | REAL32_WRAP1_NO_ERROR(ceil); 47 | REAL32_WRAP1_NO_ERROR(trunc); 48 | // 2. Absolute. 49 | REAL32_WRAP1_NO_ERROR(abs); 50 | // 3. Sign bit inspection. 51 | REAL32_WRAP2_NO_ERROR(copysign); 52 | REAL32_WRAP2_NO_ERROR(max); 53 | REAL32_WRAP2_NO_ERROR(min); 54 | // 4. Relational operators. 
55 | REAL32_WRAP_RELATION_NO_ERROR(eq); 56 | REAL32_WRAP_RELATION_NO_ERROR(lte); 57 | REAL32_WRAP_RELATION_NO_ERROR(lt); 58 | REAL32_WRAP_RELATION_NO_ERROR(ne); 59 | REAL32_WRAP_RELATION_NO_ERROR(gte); 60 | REAL32_WRAP_RELATION_NO_ERROR(gt); 61 | 62 | #undef REAL32_WRAP_RELATION_NO_ERROR 63 | #undef REAL32_WRAP2_NO_ERROR 64 | #undef REAL32_WRAP1_NO_ERROR 65 | 66 | Real32 real32_sqrt(Context*, Real32); 67 | 68 | #endif // ERROR_H -------------------------------------------------------------------------------- /soft.c: -------------------------------------------------------------------------------- 1 | #include "float32.h" 2 | #include "float64.h" 3 | 4 | void context_init(Context* context) { 5 | context->exceptions = NULL; 6 | context->operations = NULL; 7 | context->roundings = 0; 8 | } 9 | 10 | void context_free(Context* context) { 11 | array_free(context->exceptions); 12 | array_free(context->operations); 13 | } 14 | 15 | void context_copy(Context* dst, const Context *src) { 16 | context_init(dst); 17 | dst->round = src->round; 18 | dst->tininess = src->tininess; 19 | } 20 | 21 | bool context_raise(Context *context, Exception exception) { 22 | return array_push(context->exceptions, exception); 23 | } 24 | 25 | static Float32 canonical_nan_to_float32(CanonicalNaN nan) { 26 | return (Float32){(((Uint32)nan.sign) << 31) | LIT32(0x7FC00000) | (nan.hi >> 41)}; 27 | } 28 | 29 | static Float64 canonical_nan_to_float64(CanonicalNaN nan) { 30 | return (Float64){(((Uint64)nan.sign) << 63) | LIT64(0x7FF8000000000000) | (nan.hi >> 12)}; 31 | } 32 | 33 | Float32 float64_to_float32(Context *ctx, Float64 a) { 34 | Uint64 a_sig = float64_fract(a); 35 | Sint16 a_exp = float64_exp(a); 36 | Flag a_sign = float64_sign(a); 37 | if (a_exp == 0x7ff) { 38 | return a_sig 39 | ? 
canonical_nan_to_float32(float64_to_canonical_nan(ctx, a)) 40 | : float32_pack(a_sign, 0xff, 0); 41 | } 42 | a_sig = rshr64(a_sig, 22); 43 | Uint32 sig = a_sig; 44 | if (a_exp || sig) { 45 | sig |= LIT32(0x40000000); 46 | a_exp -= 0x381; 47 | } 48 | return float32_round_and_pack(ctx, a_sign, a_exp, sig); 49 | } 50 | 51 | Float64 float32_to_float64(Context *ctx, Float32 a) { 52 | Uint32 a_sig = float32_fract(a); 53 | Sint16 a_exp = float32_exp(a); 54 | Flag a_sign = float32_sign(a); 55 | if (a_exp == 0xff) { 56 | return a_sig 57 | ? canonical_nan_to_float64(float32_to_canonical_nan(ctx, a)) 58 | : float64_pack(a_sign, 0x7ff, 0); 59 | } 60 | if (a_exp == 0) { 61 | if (a_sig == 0) { 62 | return float64_pack(a_sign, 0, 0); 63 | } 64 | Normal32 normal = float32_normalize_subnormal(a_sig); 65 | a_exp = normal.exp; 66 | a_sig = normal.sig; 67 | a_exp--; 68 | } 69 | return float64_pack(a_sign, a_exp + 0x380, (Uint64)a_sig << 29); 70 | } -------------------------------------------------------------------------------- /soft.h: -------------------------------------------------------------------------------- 1 | #ifndef SOFT_H 2 | #define SOFT_H 3 | #include "array.h" 4 | 5 | typedef Sint8 Flag; 6 | 7 | typedef enum Round Round; 8 | typedef enum Exception Exception; 9 | typedef enum Tininess Tininess; 10 | typedef enum Operation Operation; 11 | 12 | typedef struct Context Context; 13 | 14 | typedef struct Float32 Float32; 15 | typedef struct Float64 Float64; 16 | typedef struct Normal32 Normal32; 17 | typedef struct Normal64 Normal64; 18 | 19 | typedef struct CanonicalNaN CanonicalNaN; 20 | 21 | struct Float32 { 22 | Uint32 bits; 23 | }; 24 | 25 | struct Float64 { 26 | Uint64 bits; 27 | }; 28 | 29 | struct Normal32 { 30 | Uint32 sig; 31 | Sint16 exp; 32 | }; 33 | 34 | struct Normal64 { 35 | Uint64 sig; 36 | Sint16 exp; 37 | }; 38 | 39 | // Canonical NaN format for conversion between NaNs in different precisions. 
40 | struct CanonicalNaN { 41 | Flag sign; 42 | Uint64 hi; 43 | Uint64 lo; 44 | }; 45 | 46 | enum Round { 47 | ROUND_NEAREST_EVEN, 48 | ROUND_TO_ZERO, 49 | ROUND_DOWN, 50 | ROUND_UP 51 | }; 52 | 53 | enum Exception { 54 | EXCEPTION_INEXACT = 1 << 0, 55 | EXCEPTION_UNDERFLOW = 1 << 1, 56 | EXCEPTION_OVERFLOW = 1 << 2, 57 | EXCEPTION_INFINITE = 1 << 3, 58 | EXCEPTION_INVALID = 1 << 4 59 | }; 60 | 61 | enum Tininess { 62 | TININESS_AFTER_ROUNDING, 63 | TININESS_BEFORE_ROUNDING 64 | }; 65 | 66 | enum Operation { 67 | OPERATION_ADD, 68 | OPERATION_SUB, 69 | OPERATION_MUL, 70 | OPERATION_DIV 71 | }; 72 | 73 | struct Context { 74 | Round round; 75 | Size roundings; 76 | ARRAY(Exception) exceptions; ///< Array of flags of triggered exceptions. 77 | ARRAY(Operation) operations; ///< Array of all operations carried out 78 | Tininess tininess; 79 | }; 80 | 81 | void context_init(Context* context); 82 | void context_free(Context* context); 83 | void context_copy(Context* dst, const Context *src); 84 | bool context_raise(Context *context, Exception exception); 85 | 86 | // Special right shifts where the least significant bit of result is set when 87 | // any non-zero bits are shifted off. 88 | static inline Uint32 rshr32(Uint32 a, Sint16 count) { 89 | if (count == 0) { 90 | return a; 91 | } else if (count < 32) { 92 | return (a >> count) | ((a << ((-count) & 31)) != 0); 93 | } 94 | return a != 0 ? 1 : 0; 95 | } 96 | 97 | static inline Uint64 rshr64(Uint64 a, Sint16 count) { 98 | if (count == 0) { 99 | return a; 100 | } else if (count < 64) { 101 | return (a >> count) | ((a << ((-count) & 63)) != 0); 102 | } 103 | return a != 0 ? 
1 : 0; 104 | } 105 | 106 | Float32 float64_to_float32(Context*, Float64); 107 | Float64 float32_to_float64(Context*, Float32); 108 | 109 | #endif // SOFT_H -------------------------------------------------------------------------------- /types.h: -------------------------------------------------------------------------------- 1 | #ifndef TYPES_H 2 | #define TYPES_H 3 | #include // u?int{8,16,32,64}_t 4 | #include // bool, true, false 5 | #include // size_t 6 | 7 | typedef uint8_t Uint8; 8 | typedef uint16_t Uint16; 9 | typedef uint32_t Uint32; 10 | typedef uint64_t Uint64; 11 | 12 | typedef int8_t Sint8; 13 | typedef int16_t Sint16; 14 | typedef int32_t Sint32; 15 | typedef int64_t Sint64; 16 | 17 | typedef bool Bool; 18 | 19 | typedef size_t Size; 20 | 21 | #define LIT32(x) ((Uint32)x ## ul) 22 | #define LIT64(x) ((Uint64)x ## ull) 23 | 24 | #endif // TYPES_H -------------------------------------------------------------------------------- /uint128.c: -------------------------------------------------------------------------------- 1 | #include "uint128.h" 2 | 3 | Uint128 uint128_mul64x64(Uint64 a, Uint64 b) { 4 | const Uint32 al = a; 5 | const Uint32 ah = a >> 32; 6 | const Uint32 bl = b; 7 | const Uint32 bh = b >> 32; 8 | Uint64 z0, z1; 9 | Uint64 ma, mb; 10 | z1 = (Uint64)al * bl; 11 | ma = (Uint64)al * bh; 12 | mb = (Uint64)ah * bl; 13 | z0 = (Uint64)ah * bh; 14 | ma += mb; 15 | z0 += ((Uint64)(ma < mb) << 32) + (ma >> 32); 16 | ma <<= 32; 17 | z1 += ma; 18 | z0 += z1 < ma; 19 | return (Uint128){z0, z1}; 20 | } 21 | 22 | Uint64 uint128_div128x64(Uint128 a, Uint64 b) { 23 | if (b <= a.z0) { 24 | return LIT64(0xFFFFFFFFFFFFFFFF); 25 | } 26 | 27 | Uint64 b0 = b >> 32; 28 | Uint64 b1; 29 | 30 | Uint64 z = (b0 << 32 <= a.z0) 31 | ? 
LIT64(0xFFFFFFFF00000000) : (a.z0 / b0) << 32; 32 | 33 | Uint128 mul = uint128_mul64x64(b, z); 34 | Uint128 rem = uint128_sub(a, mul); 35 | 36 | while ((Sint64)rem.z0 < 0) { 37 | z -= LIT64(0x100000000); 38 | b1 = b << 32; 39 | rem = uint128_add(rem, (Uint128){b0, b1}); 40 | } 41 | rem.z0 = (rem.z0 << 32) | (rem.z1 >> 32); 42 | 43 | z |= (b0 << 32 <= rem.z0) ? LIT32(0xffffffff) : rem.z0 / b0; 44 | 45 | return z; 46 | } -------------------------------------------------------------------------------- /uint128.h: -------------------------------------------------------------------------------- 1 | #ifndef UINT128_H 2 | #define UINT128_H 3 | #include "types.h" 4 | 5 | typedef struct Uint128 Uint128; 6 | 7 | struct Uint128 { 8 | Uint64 z0; 9 | Uint64 z1; 10 | }; 11 | 12 | // Multiplies two 64-bit integers to obtain a 128-bit product. 13 | Uint128 uint128_mul64x64(Uint64 a, Uint64 b); 14 | 15 | // Calculate approximation to the 64-bit integer quotient obtained by dividing 16 | // 64-bit b into the 128-bit a. The divisor b must be at least 2^63. 17 | Uint64 uint128_div128x64(Uint128 a, Uint64 b); 18 | 19 | // Subtraction is modulo 2^128 20 | static inline Uint128 uint128_sub(Uint128 a, Uint128 b) { 21 | const Uint64 z1 = a.z1 - b.z1; 22 | return (Uint128){a.z0 - b.z0 - z1, z1}; 23 | } 24 | 25 | // Addition is modulo 2^128 26 | static inline Uint128 uint128_add(Uint128 a, Uint128 b) { 27 | const Uint64 z1 = a.z1 + b.z1; 28 | return (Uint128){a.z0 + b.z0 + (z1 < a.z1), z1}; 29 | } 30 | 31 | #endif // UINT128_H --------------------------------------------------------------------------------