├── LICENSE.md
├── Makefile
├── README.md
├── array.c
├── array.h
├── eval.c
├── eval.h
├── float32.c
├── float32.h
├── float64.c
├── float64.h
├── kernel32.c
├── kernel32.h
├── main.c
├── real32.c
├── real32.h
├── soft.c
├── soft.h
├── types.h
├── uint128.c
└── uint128.h


/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2021 Dale Weiler
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | SRCS := $(wildcard *.c)
 2 | OBJS := $(SRCS:.c=.o)
 3 | DEPS := $(OBJS:.o=.d)
 4 | 
 5 | # -MMD -MP writes a .d file next to each object so header edits trigger rebuilds.
 6 | CFLAGS := -Wall -Wextra -O2 -g
 7 | CPPFLAGS += -MMD -MP
 8 | 
 9 | all: fpinspect
10 | 
11 | fpinspect: $(OBJS)
12 | 	$(CC) -o $@ $^ $(CFLAGS)
13 | 
14 | clean:
15 | 	rm -f $(OBJS) $(DEPS) fpinspect
16 | -include $(DEPS)
17 | .PHONY: all clean


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Floating point expression inspector
  2 | 
  3 | The following tool lets you inspect the computational flow of a floating point
  4 | expression, seeing where rounding occurs, when exceptions are triggered,
  5 | when precision may be lost, when special values propagate, when error
  6 | accumulates, and other floating point headaches.
  7 | 
  8 | # Example
  9 | ```
 10 | [fpinspect]# ./fpinspect "sqrt(45.0*e+phi)/pi"
 11 | Exception: 0 (1 roundings) INEXACT (45.000000 * e)
 12 |   Trace (1 operations) MUL
 13 | 
 14 | Exception: 0 (1 roundings) INEXACT phi
 15 |   Trace (1 operations) MUL
 16 | 
 17 | Exception: 0 (2 roundings) INEXACT ((45.000000 * e) + phi)
 18 | Exception: 1 (2 roundings) INEXACT ((45.000000 * e) + phi)
 19 |   Trace (2 operations) MUL ADD
 20 | 
 21 | Exception: 0 (3 roundings) INEXACT sqrt(((45.000000 * e) + phi))
 22 | Exception: 1 (3 roundings) INEXACT sqrt(((45.000000 * e) + phi))
 23 | Exception: 2 (3 roundings) INEXACT sqrt(((45.000000 * e) + phi))
 24 |   Trace (3 operations) MUL ADD ADD
 25 | 
 26 | Exception: 0 (3 roundings) INEXACT pi
 27 | Exception: 1 (3 roundings) INEXACT pi
 28 | Exception: 2 (3 roundings) INEXACT pi
 29 |   Trace (3 operations) MUL ADD ADD
 30 | 
 31 | Exception: 0 (4 roundings) INEXACT (sqrt(((45.000000 * e) + phi)) / pi)
 32 | Exception: 1 (4 roundings) INEXACT (sqrt(((45.000000 * e) + phi)) / pi)
 33 | Exception: 2 (4 roundings) INEXACT (sqrt(((45.000000 * e) + phi)) / pi)
 34 | Exception: 3 (4 roundings) INEXACT (sqrt(((45.000000 * e) + phi)) / pi)
 35 |   Trace (4 operations) MUL ADD ADD DIV
 36 | 
 37 | (sqrt(((45.000000 * e) + phi)) / pi)
 38 |         ans: 3.54370117187500
 39 |         err: 0.00000126456894
 40 | ```
 41 | 
 42 | As you can see, the expression `sqrt(45.0*e+phi)/pi` produces a lot of output.
 43 | Each empty-line-separated region is a subexpression which triggered an exception.
 44 | In this case, because `45 * e` is an inexact value, the inexact exception is
 45 | presented first. Here you can see that such an expression involved `1 operations`
 46 | in total, and in this case the operation is just a `MUL`. We can also see that the
 47 | resulting expression, because it's inexact, incurred one rounding.
 48 | 
 49 | Following down the exception list, we can see that the exception propagated
 50 | to `phi` in a `MUL` (which is also an inexact value), and continued, with each
 51 | new inexact subexpression resulting in several roundings. Since kernels like
 52 | `sqrt` might themselves use operations like `add`, we also see the final group
 53 | of exceptions contains an additional `ADD` in its trace.
 54 | 
 55 | The final result of the expression is given in `ans:` and below that you will
 56 | find the accumulative error `err:` of evaluating that expression, in this case
 57 | this function is exact to five mantissa digits of precision, out of a total of
 58 | seven, which means this expression has ~0.71 ULP of error.
 59 | 
 60 | # Documentation
 61 | Run the program with no expression or `-h` to see the options.
 62 | 
 63 | Here are some constants and functions available for use in expressions.
 64 | ### Constants
 65 |   * e
 66 |   * pi
 67 |   * phi
 68 | 
 69 | ### Functions
 70 |   * floor
 71 |   * ceil
 72 |   * trunc
 73 |   * sqrt
 74 |   * abs
 75 |   * min
 76 |   * max
 77 |   * copysign
 78 | 
 79 | # How it works
 80 | This program implements IEEE-754 floating point completely in software, emulating
 81 | all rounding modes, exceptions, and tininess detection methods which can be
 82 | configured when evaluating an expression. With the exception of transcendental
 83 | functions, all floating point computation is also accurate to <= 1 ULP of error.
 84 | 
 85 | Currently there is support for 32-bit single-precision floating-point
 86 | `float32.{h,c}` and 64-bit double-precision floating-point `float64.{h,c}`, as
 87 | double-precision is necessary for 32-bit single-precision kernels
 88 | `kernel32.{h,c}` to produce correctly rounded and truncated results
 89 | to <= 1 ULP of error.
 90 | 
 91 | 64-bit double-precision floating-point makes use of 128-bit modular arithmetic
 92 | implemented in `uint128.{h,c}`
 93 | 
 94 | Accumulative error accounting is handled by `real32.{h,c}` and `real64.{h,c}`
 95 | for single-precision and double-precision floating-point, respectively.
 96 | 
 97 | > NOTE:
 98 | >
 99 | > There are currently no 64-bit kernels, as that would require either 80-bit
100 | > extended-precision floating-point, or 128-bit quadruple-precision floating-point
101 | > to be implemented in software to have the precision necessary to produce
102 | > correctly rounded and truncated results to <= 1 ULP of error.


--------------------------------------------------------------------------------
/array.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h> // malloc, realloc, free.
 2 | 
 3 | #include "array.h"
 4 | 
 5 | // Grow [*array] so it can hold [expand] more elements of [type_size] bytes.
 6 | // Returns false, leaving [*array] untouched, when the allocation fails.
 7 | Bool array_grow(void **array, Size expand, Size type_size) {
 8 |   Array *meta = NULL;
 9 |   Size count = 0;
10 | 
11 |   if (*array) {
12 |     // Only derive the header from a live array; offsetting NULL is undefined.
13 |     Array *old = array_meta(*array);
14 |     count = 2 * old->capacity + expand;
15 |     meta = realloc(old, type_size * count + sizeof *meta);
16 |     if (!meta) {
17 |       return false;
18 |     }
19 |   } else {
20 |     count = expand + 1;
21 |     meta = malloc(type_size * count + sizeof *meta);
22 |     if (!meta) {
23 |       return false;
24 |     }
25 |     meta->size = 0; // a fresh array starts empty
26 |   }
27 | 
28 |   meta->capacity = count;
29 |   *array = meta + 1; // the elements live directly after the header
30 |   return true;
31 | }
32 | 
33 | void array_delete(void *array) { // frees [array] together with its header
34 |   if (array) free(array_meta(array)); // NULL has no header to step back to
35 | }


--------------------------------------------------------------------------------
/array.h:
--------------------------------------------------------------------------------
 1 | #ifndef ARRAY_H
 2 | #define ARRAY_H
 3 | #include "types.h"
 4 | 
 5 | typedef struct Array Array;
 6 | // Bookkeeping header that array_grow places directly in front of the elements.
 7 | struct Array {
 8 |   Size size; // elements currently stored; array_push increments it
 9 |   Size capacity; // element slots requested by the last array_grow
10 | };
11 | 
12 | #define ARRAY(T) T* // a dynamic array is handled as a plain pointer to its elements
13 | // step back from the element pointer to the header stored in front of it
14 | #define array_meta(array) \
15 |   ((Array*)(((Uint8*)(array)) - sizeof(Array)))
16 | 
17 | // grow [array] by [expand] elements if it is NULL or size + expand reaches capacity; false only when allocation fails.
18 | #define array_try_grow(array, expand) \
19 |   (((!(array) || array_meta(array)->size + (expand) >= array_meta(array)->capacity)) \
20 |     ? array_grow(((void **)&(array)), (expand), sizeof(*(array))) \
21 |     : true)
22 | 
23 | // push [value] into [array]; yields true, or false when growing failed. [array] is evaluated more than once.
24 | #define array_push(array, value) \
25 |   (array_try_grow((array), 1) \
26 |     ? ((array)[array_meta(array)->size++] = (value), true) \
27 |     : false)
28 | 
29 | // free [array] and reset it to 0; a NULL [array] is left alone.
30 | #define array_free(array) \
31 |   ((void)((array) ? (array_delete((void*)(array)), (array) = 0) : 0))
32 | 
33 | // size of [array]; a NULL [array] has size 0.
34 | #define array_size(array) \
35 |   ((array) ? array_meta(array)->size : 0)
36 | 
37 | Bool array_grow(void**, Size, Size); // (array address, extra elements, element size in bytes)
38 | void array_delete(void*); // releases the allocation behind an element pointer
39 | 
40 | #endif // ARRAY_H


--------------------------------------------------------------------------------
/eval.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h> // calloc, free
  2 | #include <string.h> // strchr
  3 | #include <stdio.h> // fprintf, stderr
  4 | 
  5 | #include "eval.h"
  6 | 
  7 | typedef struct Parser Parser;
  8 | typedef struct Expression Expression;
  9 | 
 10 | struct Expression { // one node of the parsed expression tree
 11 |   enum {
 12 |     EXPR_VALUE, // numeric literal held in [value]
 13 |     EXPR_CONST, // named constant; [constant] indexes CONSTANTS
 14 |     EXPR_FUNC1, EXPR_FUNC2, // builtin call taking one / two arguments
 15 |     EXPR_EQ, EXPR_LTE, EXPR_LT, // comparisons: evaluated by expr_eval32, but no parser rule in this file creates them
 16 |     EXPR_NE, EXPR_GTE, EXPR_GT,
 17 |     EXPR_ADD, EXPR_SUB, EXPR_MUL, EXPR_DIV, // binary arithmetic on params[0] and params[1]
 18 |     EXPR_LAST // node parse_expr builds for "a;b"; expr_eval32 gives it the value zero
 19 |   } type;
 20 |   Real32 value; // the literal for EXPR_VALUE; REAL32_ONE on nodes built any other way
 21 |   union {
 22 |     Size constant; // EXPR_CONST: index into CONSTANTS
 23 |     enum {
 24 |       // EXPR_FUNC1
 25 |       FUNC_FLOOR,
 26 |       FUNC_CEIL,
 27 |       FUNC_TRUNC,
 28 |       FUNC_SQRT,
 29 |       FUNC_ABS,
 30 |       // EXPR_FUNC2
 31 |       FUNC_MIN,
 32 |       FUNC_MAX,
 33 |       FUNC_COPYSIGN
 34 |     } func; // EXPR_FUNC1 / EXPR_FUNC2: which builtin to apply
 35 |   };
 36 |   Expression* params[2]; // operand subtrees; slots a node does not use stay NULL
 37 | };
 38 | 
 39 | static const struct { // constants that can be written by name inside an expression
 40 |   const char *identifier; // spelling that parse_primary matches
 41 |   const Real32 value; // single-precision bit pattern; second member zeroed (presumably the error term - confirm in real32.h)
 42 | } CONSTANTS[] = {
 43 |   { "e",    {{LIT32(0x402df854)}, {0}} }, // Euler's number, ~2.7182817
 44 |   { "pi",   {{LIT32(0x40490fdb)}, {0}} }, // ~3.1415927
 45 |   { "phi",  {{LIT32(0x3fcf1bbd)}, {0}} }, // golden ratio, ~1.618034
 46 |   { "fmin", {{LIT32(0x00800000)}, {0}} }, // FLT_MIN
 47 |   { "fmax", {{LIT32(0x7f7fffff)}, {0}} }, // FLT_MAX
 48 | };
 49 | 
 50 | static const struct { // one-argument builtins
 51 |   const char *match; // name as written in an expression
 52 |   Uint32 func; // FUNC_* id stored in Expression.func
 53 | } FUNCS1[] = {
 54 |   { "floor", FUNC_FLOOR },
 55 |   { "ceil",  FUNC_CEIL  },
 56 |   { "trunc", FUNC_TRUNC },
 57 |   { "sqrt",  FUNC_SQRT  },
 58 |   { "abs",   FUNC_ABS   }
 59 | };
 60 | 
 61 | static const struct { // two-argument builtins
 62 |   const char *match; // name as written in an expression
 63 |   Uint32 func; // FUNC_* id stored in Expression.func
 64 | } FUNCS2[] = {
 65 |   { "min",      FUNC_MIN      },
 66 |   { "max",      FUNC_MAX      },
 67 |   { "copysign", FUNC_COPYSIGN }
 68 | };
 69 | 
 70 | #define ARRAY_COUNT(x) \
 71 |   (sizeof (x) / sizeof (*(x))) // element count of a true array; meaningless for a pointer
 72 | 
 73 | static const char* func1_name(Uint32 func) {
 74 |   // Reverse lookup in the unary table; NULL when [func] is not listed there.
 75 |   const char *name = NULL;
 76 |   for (Size i = 0; !name && i < ARRAY_COUNT(FUNCS1); i++) {
 77 |     name = FUNCS1[i].func == func ? FUNCS1[i].match : NULL;
 78 |   }
 79 |   return name;
 80 | }
 81 | 
 82 | static const char *func2_name(Uint32 func) {
 83 |   // Linear scan of the binary table; NULL when [func] is not listed there.
 84 |   Size i = 0;
 85 |   while (i < ARRAY_COUNT(FUNCS2) && FUNCS2[i].func != func) {
 86 |     i++;
 87 |   }
 88 |   return i < ARRAY_COUNT(FUNCS2) ? FUNCS2[i].match : NULL;
 89 | }
 90 | 
 91 | // Stop-gap: defers to the host strtof until an accurate software parser exists.
 92 | static Real32 real32_from_string(const char *string, char **next) {
 93 |   union { float f; Float32 s; } pun = { .f = strtof(string, next) }; // reuse the host float's bits
 94 |   return (Real32){pun.s, {0}};
 95 | }
 96 | 
 97 | static Bool is_identifier(int ch) { // true for the characters [0-9A-Za-z_]
 98 |   const unsigned c = (unsigned)ch;
 99 |   if (ch == '_') return true;
100 |   if (c - '0' <= 9u) return true; // unsigned wrap-around rejects anything below '0'
101 |   return (c - 'a' <= 25u) || (c - 'A' <= 25u);
102 | }
103 | 
104 | static bool match(const char *s, const char *prefix) {
105 |   // Walk both strings in lockstep until [prefix] is used up.
106 |   while (*prefix) {
107 |     if (*prefix++ != *s++) {
108 |       return false;
109 |     }
110 |   }
111 |   return !is_identifier(*s); // Must not run on into a longer identifier.
112 | }
113 | 
114 | struct Parser { // cursor shared by the recursive-descent routines below
115 |   Sint32 level; // NOTE(review): only zero-initialised in this file, never read - confirm it is still needed
116 |   char *s; // next unconsumed character of the space-stripped input
117 | };
118 | 
119 | #define ALU(fp, op) \
120 |     fprintf(fp, "("); \
121 |     expr_print(fp, expression->params[0]); \
122 |     fprintf(fp, " %s ", op); \
123 |     expr_print(fp, expression->params[1]); \
124 |     fprintf(fp, ")"); \
125 |     break // prints "(lhs op rhs)" for the [expression] in scope; the use site supplies the ';'
126 | 
127 | void expr_print(FILE *fp, Expression *expression) {
128 |   // Writes [expression] to [fp] as infix text; kinds without a case print nothing.
129 |   switch (expression->type) {
130 |   case EXPR_VALUE:
131 |     fprintf(fp, "%f", float32_cast(expression->value.value));
132 |     return;
133 |   case EXPR_CONST:
134 |     fprintf(fp, "%s", CONSTANTS[expression->constant].identifier);
135 |     return;
136 |   case EXPR_FUNC1: // fallthrough: both call arities share one printer
137 |   case EXPR_FUNC2: {
138 |     const Bool unary = expression->type == EXPR_FUNC1;
139 |     fprintf(fp, "%s(", unary ? func1_name(expression->func) : func2_name(expression->func));
140 |     expr_print(fp, expression->params[0]);
141 |     if (!unary) {
142 |       fprintf(fp, ", ");
143 |       expr_print(fp, expression->params[1]);
144 |     }
145 |     fprintf(fp, ")");
146 |     return;
147 |   }
148 |   case EXPR_ADD: ALU(fp, "+");
149 |   case EXPR_SUB: ALU(fp, "-");
150 |   case EXPR_MUL: ALU(fp, "*");
151 |   case EXPR_DIV: ALU(fp, "/");
152 |   default: return;
153 |   }
154 | }
155 | 
156 | static Real32 eval_func1_32(Context *ctx, Uint32 func, Real32 a) {
157 |   // Dispatch a one-argument builtin; an unrecognised id evaluates to zero.
158 |   if (func == FUNC_FLOOR) {
159 |     return real32_floor(ctx, a);
160 |   } else if (func == FUNC_CEIL) {
161 |     return real32_ceil(ctx, a);
162 |   } else if (func == FUNC_TRUNC) {
163 |     return real32_trunc(ctx, a);
164 |   } else if (func == FUNC_SQRT) {
165 |     return real32_sqrt(ctx, a);
166 |   } else if (func == FUNC_ABS) {
167 |     return real32_abs(ctx, a);
168 |   }
169 |   return (Real32){FLOAT32_ZERO, {0}};
170 | }
171 | 
172 | static Real32 eval_func2_32(Context *ctx, Uint32 func, Real32 a, Real32 b) {
173 |   // Dispatch a two-argument builtin; an unrecognised id evaluates to zero.
174 |   if (func == FUNC_MIN) {
175 |     return real32_min(ctx, a, b);
176 |   } else if (func == FUNC_MAX) {
177 |     return real32_max(ctx, a, b);
178 |   } else if (func == FUNC_COPYSIGN) {
179 |     return real32_copysign(ctx, a, b);
180 |   }
181 |   return (Real32){FLOAT32_ZERO, {0}};
182 | }
183 | 
184 | Real32 expr_eval32(Context *ctx, Expression *expression) { // evaluates the tree bottom-up, reporting to stderr
185 |   if (!expression) {
186 |     return REAL32_ZERO;
187 |   }
188 |   // Operands first (params[0], then params[1]); an absent operand evaluates to zero.
189 |   Real32 a = expr_eval32(ctx, expression->params[0]);
190 |   Real32 b = expr_eval32(ctx, expression->params[1]);
191 | 
192 |   Real32 result = REAL32_ZERO;
193 |   // Apply this node's own operation to the operand results.
194 |   switch (expression->type) {
195 |   /****/ case EXPR_VALUE: result = expression->value;
196 |   break; case EXPR_CONST: result = CONSTANTS[expression->constant].value;
197 |   break; case EXPR_FUNC1: result = eval_func1_32(ctx, expression->func, a);
198 |   break; case EXPR_FUNC2: result = eval_func2_32(ctx, expression->func, a, b);
199 |   break; case EXPR_EQ:    result = real32_eq(ctx, a, b);
200 |   break; case EXPR_LTE:   result = real32_lte(ctx, a, b);
201 |   break; case EXPR_LT:    result = real32_lt(ctx, a, b);
202 |   break; case EXPR_NE:    result = real32_ne(ctx, a, b);
203 |   break; case EXPR_GTE:   result = real32_gte(ctx, a, b);
204 |   break; case EXPR_GT:    result = real32_gt(ctx, a, b);
205 |   break; case EXPR_ADD:   result = real32_add(ctx, a, b);
206 |   break; case EXPR_SUB:   result = real32_sub(ctx, a, b);
207 |   break; case EXPR_MUL:   result = real32_mul(ctx, a, b);
208 |   break; case EXPR_DIV:   result = real32_div(ctx, a, b);
209 |   break; case EXPR_LAST:  // Empty.
210 |   break;
211 |   }
212 |   // Everything recorded in [ctx] so far is reported against this subexpression.
213 |   Size n_operations = array_size(ctx->operations);
214 |   Size n_exceptions = array_size(ctx->exceptions);
215 | 
216 |   for (Size i = 0; i < n_exceptions; i++) {
217 |     Exception exception = ctx->exceptions[i];
218 |     fprintf(stderr, "Exception: %zu (%zu roundings) ", i, ctx->roundings);
219 |     Bool flag = false; // becomes true once a name is printed, so later names get a '|' separator
220 |     if (exception & EXCEPTION_INVALID) {
221 |       fprintf(stderr, "%sINVALID", flag ? "|" : ""), flag = true;
222 |     }
223 |     if (exception & EXCEPTION_INFINITE) {
224 |       fprintf(stderr, "%sINFINITE", flag ? "|" : ""), flag = true;
225 |     }
226 |     if (exception & EXCEPTION_OVERFLOW) {
227 |       fprintf(stderr, "%sOVERFLOW", flag ? "|" : ""), flag = true;
228 |     }
229 |     if (exception & EXCEPTION_UNDERFLOW) {
230 |       fprintf(stderr, "%sUNDERFLOW", flag ? "|" : ""), flag = true;
231 |     }
232 |     if (exception & EXCEPTION_INEXACT) {
233 |       fprintf(stderr, "%sINEXACT", flag ? "|" : ""), flag = true;
234 |     }
235 |     fprintf(stderr, " ");
236 |     expr_print(stderr, expression);
237 |     fprintf(stderr, "\n");
238 |   }
239 |   if (n_operations && n_exceptions) { // the trace is shown only when an exception was listed
240 |     fprintf(stderr, "  Trace (%zu operations) ", n_operations);
241 |     Bool hit = false; // same separator trick as [flag], with a space
242 |     for (Size i = 0; i < n_operations; i++) {
243 |       Operation operation = ctx->operations[i];
244 |       switch (operation) {
245 |       case OPERATION_ADD: fprintf(stderr, "%sADD", hit ? " " : ""), hit = true; break;
246 |       case OPERATION_SUB: fprintf(stderr, "%sSUB", hit ? " " : ""), hit = true; break;
247 |       case OPERATION_MUL: fprintf(stderr, "%sMUL", hit ? " " : ""), hit = true; break;
248 |       case OPERATION_DIV: fprintf(stderr, "%sDIV", hit ? " " : ""), hit = true; break;
249 |       }
250 |     }
251 |     fprintf(stderr, "\n");
252 |     fprintf(stderr, "\n");
253 |   }
254 | 
255 |   return result;
256 | }
257 | 
258 | static Expression *create(int type, Expression *e0, Expression *e1) {
259 |   // Allocate a zeroed node of kind [type] with operands [e0] and [e1].
260 |   Expression *node = calloc(1, sizeof *node);
261 |   if (node) {
262 |     node->type = type;
263 |     node->value = REAL32_ONE;
264 |     node->params[0] = e0;
265 |     node->params[1] = e1;
266 |   }
267 |   return node; // NULL when calloc fails
268 | }
269 | 
270 | static Bool parse_expr(Expression **e, Parser *p);
271 | // Parse one primary at [p->s]: a numeric literal, a named constant, a
272 | // parenthesised expression, or a call to a one- or two-argument builtin.
273 | // [sign] is set when the caller consumed a unary '-' just before [p->s].
274 | // Returns true with the node in [*e], or false with [*e] left unwritten.
275 | static Bool parse_primary(Expression **e, Parser *p, Flag sign) {
276 |   Expression *d = calloc(1, sizeof *d);
277 |   if (!d) {
278 |     return false;
279 |   }
280 |   char *next = p->s;
281 |   char *s0 = p->s;
282 |   // Re-include the consumed '-' so a literal is converted together with its sign.
283 |   // NOTE(review): if that '-' precedes a non-literal, nothing is consumed and
284 |   // [next] lands on the '-', so the branch below yields a literal 0 and rewinds:
285 |   // "-pi" becomes (0 - pi), but "2*-pi" becomes ((2*0) - pi).
286 |   d->value = real32_from_string(sign ? p->s - 1 : p->s, &next);
287 |   if (next != p->s) {
288 |     d->type = EXPR_VALUE;
289 |     p->s = next;
290 |     *e = d;
291 |     return true;
292 |   }
293 |   d->value = REAL32_ONE;
294 |   for (Size i = 0; i < ARRAY_COUNT(CONSTANTS); i++) {
295 |     if (!match(p->s, CONSTANTS[i].identifier)) {
296 |       continue;
297 |     }
298 |     p->s += strlen(CONSTANTS[i].identifier);
299 |     d->type = EXPR_CONST;
300 |     d->constant = i;
301 |     *e = d;
302 |     return true;
303 |   }
304 |   p->s = strchr(p->s, '(');
305 |   if (!p->s) {
306 |     fprintf(stderr, "Undefined constant or missing '(' in '%s'\n", s0);
307 |     p->s = next;
308 |     expr_free(d);
309 |     return false;
310 |   }
311 |   p->s++; // '('
312 |   if (*next == '(') { // the primary itself opens with '(': plain grouping
313 |     expr_free(d);
314 |     if (!parse_expr(&d, p)) {
315 |       return false;
316 |     }
317 |     if (*p->s != ')') {
318 |       fprintf(stderr, "Missing ')' in '%s'\n", s0);
319 |       expr_free(d);
320 |       return false;
321 |     }
322 |     p->s++; // ')'
323 |     *e = d;
324 |     return true;
325 |   }
326 |   if (!parse_expr(&d->params[0], p)) {
327 |     expr_free(d);
328 |     return false;
329 |   }
330 |   if (*p->s == ',') {
331 |     p->s++; // ','
332 |     parse_expr(&d->params[1], p); // on failure params[1] simply stays NULL
333 |   }
334 |   if (*p->s != ')') {
335 |     fprintf(stderr, "Missing ')' or too many arguments in '%s'\n", s0);
336 |     expr_free(d);
337 |     return false;
338 |   }
339 |   p->s++; // ')'
340 |   // [next] still points at the identifier that preceded the '('.
341 |   for (Size i = 0; i < ARRAY_COUNT(FUNCS1); i++) {
342 |     if (match(next, FUNCS1[i].match)) {
343 |       d->type = EXPR_FUNC1;
344 |       d->func = FUNCS1[i].func;
345 |       *e = d;
346 |       return true;
347 |     }
348 |   }
349 |   for (Size i = 0; i < ARRAY_COUNT(FUNCS2); i++) {
350 |     if (match(next, FUNCS2[i].match)) {
351 |       d->type = EXPR_FUNC2;
352 |       d->func = FUNCS2[i].func;
353 |       *e = d;
354 |       return true;
355 |     }
356 |   }
357 |   fprintf(stderr, "Unknown identifier '%s'\n", s0);
358 |   expr_free(d);
359 |   return false; // [*e] was never written, so this must not report success
360 | }
361 | 
362 | static Bool parse_top(Expression **e, Parser *p) {
363 |   // Consume one optional unary sign, remembering only whether it was '-'.
364 |   const char lead = *p->s;
365 |   if (lead == '+' || lead == '-') p->s++;
366 |   return parse_primary(e, p, lead == '-');
367 | }
368 | 
369 | static Bool parse_factor(Expression **e, Parser *p) {
370 |   Expression *top; // forwarded to [*e] only when parse_top succeeds
371 |   if (parse_top(&top, p)) {
372 |     // TODO(dweiler): Handle other operations here.
373 |     *e = top;
374 |     return true;
375 |   }
376 |   return false;
377 | }
378 | 
379 | static Bool parse_term(Expression **e, Parser *p) {
380 |   Expression *acc, *rhs, *node; // term := factor (('*' | '/') factor)*
381 |   if (!parse_factor(&acc, p)) {
382 |     return false;
383 |   }
384 |   for (char op = *p->s; op == '*' || op == '/'; op = *p->s) {
385 |     p->s++; // consume the operator
386 |     if (!parse_factor(&rhs, p)) {
387 |       expr_free(acc);
388 |       return false;
389 |     }
390 |     node = create(op == '*' ? EXPR_MUL : EXPR_DIV, acc, rhs);
391 |     if (!node) {
392 |       expr_free(acc);
393 |       expr_free(rhs);
394 |       return false;
395 |     }
396 |     acc = node; // left-associative: the result is the next left operand
397 |   }
398 |   *e = acc;
399 |   return true;
400 | }
401 | 
402 | static Bool parse_subexpr(Expression **e, Parser *p) {
403 |   Expression *sum, *addend, *joined; // subexpr := term (('+' | '-') term)*
404 |   if (!parse_term(&sum, p)) {
405 |     return false;
406 |   }
407 |   while (*p->s == '+' || *p->s == '-') {
408 |     const int kind = *p->s++ == '+' ? EXPR_ADD : EXPR_SUB;
409 |     if (!parse_term(&addend, p)) {
410 |       expr_free(sum);
411 |       return false;
412 |     }
413 |     joined = create(kind, sum, addend);
414 |     if (!joined) {
415 |       expr_free(sum);
416 |       expr_free(addend);
417 |       return false;
418 |     }
419 |     sum = joined; // left-associative: the result is the next left operand
420 |   }
421 |   *e = sum;
422 |   return true;
423 | }
424 | 
425 | static Bool parse_expr(Expression **e, Parser *p) {
426 |   // expr := subexpr (';' subexpr)*, each ';' pairing both sides in an EXPR_LAST.
427 |   Expression *head, *tail, *pair;
428 |   if (!parse_subexpr(&head, p)) {
429 |     return false;
430 |   }
431 |   for (; *p->s == ';'; head = pair) {
432 |     p->s++;
433 |     if (!parse_subexpr(&tail, p)) {
434 |       expr_free(head);
435 |       return false;
436 |     }
437 |     pair = create(EXPR_LAST, head, tail);
438 |     if (!pair) {
439 |       expr_free(head);
440 |       expr_free(tail);
441 |       return false;
442 |     }
443 |   }
444 |   *e = head;
445 |   return true;
446 | }
447 | 
448 | static Bool parse_verify(Expression *expression) {
449 |   // Check that every node carries the number of operands its kind needs.
450 |   if (!expression) {
451 |     return false;
452 |   }
453 |   if (expression->type == EXPR_VALUE || expression->type == EXPR_CONST) {
454 |     return true; // leaves take no operands
455 |   }
456 |   if (!parse_verify(expression->params[0])) {
457 |     return false;
458 |   }
459 |   const Bool unary = expression->type == EXPR_FUNC1; // must not have a second operand
460 |   return unary ? !expression->params[1] : parse_verify(expression->params[1]);
461 | }
462 | 
463 | Bool expr_parse(Expression **expression, const char *string) {
464 |   // Parses [string] into an expression tree. On success the tree is stored in
465 |   // [*expression] and true is returned; when allocation, parsing or
466 |   // verification fails, [*expression] is left untouched and false is
467 |   // returned.
468 |   Parser parser = { 0 };
469 |   Expression *root = NULL;
470 |   Bool ok = false;
471 | 
472 |   // The grammar has no notion of blanks, so parse a copy with every ' ' removed.
473 |   char *const compact = malloc(strlen(string) + 1);
474 |   if (!compact) {
475 |     return false;
476 |   }
477 |   char *out = compact;
478 |   for (const char *in = string; *in; in++) {
479 |     if (*in != ' ') {
480 |       *out++ = *in;
481 |     }
482 |   }
483 |   *out = '\0';
484 | 
485 |   parser.s = compact;
486 |   if (parse_expr(&root, &parser)) {
487 |     if (*parser.s) {
488 |       // The parser stopped before reaching the end of the input.
489 |       expr_free(root);
490 |       fprintf(stderr, "Unexpected end of expression '%s'\n", string);
491 |     } else if (!parse_verify(root)) {
492 |       expr_free(root);
493 |     } else {
494 |       ok = true;
495 |     }
496 |   }
497 | 
498 |   // The tree holds no pointers into the scratch copy, so it can go now.
499 |   free(compact);
500 | 
501 |   if (ok) {
502 |     *expression = root;
503 |   }
504 | 
505 |   return ok;
506 | }
507 | 
508 | void expr_free(Expression *expression) {
509 |   // Releases the node and, recursively, both operand subtrees.
510 |   if (!expression) return; // NULL marks an absent operand
511 |   expr_free(expression->params[0]);
512 |   expr_free(expression->params[1]);
513 |   free(expression);
514 | }


--------------------------------------------------------------------------------
/eval.h:
--------------------------------------------------------------------------------
 1 | #ifndef EVAL_H
 2 | #define EVAL_H
 3 | #include <stdio.h> // FILE, named by expr_print below
 4 | #include "real32.h"
 5 | 
 6 | typedef struct Expression Expression; // opaque here; defined in eval.c
 7 | 
 8 | Bool expr_parse(Expression**, const char*); // build a tree from text; false on failure
 9 | Real32 expr_eval32(Context*, Expression*); // evaluate a tree, reporting exceptions on stderr
10 | void expr_free(Expression*); // release a tree; NULL is accepted
11 | void expr_print(FILE*, Expression*); // write a tree as infix text
12 | #endif // EVAL_H


--------------------------------------------------------------------------------
/float32.c:
--------------------------------------------------------------------------------
  1 | #include "float32.h"
  2 | 
  3 | // Number of zero bits above the highest set bit; 32 when no bit is set.
  4 | static inline Sint8 count_leading_zeros_u32(Uint32 a) {
  5 |   return a ? __builtin_clz(a) : 32; // the builtin is not defined for a zero argument
  6 | }
  7 | 
  8 | // Pick the NaN to return when at least one of [a] and [b] is a NaN. Both
  9 | // candidates are quieted, and an invalid exception is raised when either
 10 | // input was a signaling NaN.
 11 | static Float32 float32_propagate_nan(Context *ctx, Float32 a, Float32 b) {
 12 |   const Flag a_nan = float32_is_nan(a); // classify before the quiet bit is set below
 13 |   const Flag a_signaling = float32_is_snan(a);
 14 |   const Flag b_nan = float32_is_nan(b);
 15 |   const Flag b_signaling = float32_is_snan(b);
 16 |   if (a_signaling | b_signaling) {
 17 |     context_raise(ctx, EXCEPTION_INVALID);
 18 |   }
 19 |   a.bits |= LIT32(0x00400000);
 20 |   b.bits |= LIT32(0x00400000);
 21 |   if (a_nan && !(a_signaling & b_nan)) {
 22 |     return a; // a is a NaN and is not a signaling one competing with a NaN b
 23 |   }
 24 |   return b;
 25 | }
 26 | 
 27 | CanonicalNaN float32_to_canonical_nan(Context* ctx, Float32 a) {
 28 |   if (float32_is_snan(a)) { // a signaling input is reported as invalid first
 29 |     context_raise(ctx, EXCEPTION_INVALID);
 30 |   }
 31 |   return (CanonicalNaN){
 32 |     .sign = a.bits >> 31,
 33 |     .lo = 0,
 34 |     .hi = (Uint64)a.bits << 41, // moves bit 22 of [a] up to bit 63 of hi
 35 |   };
 36 | }
 37 | 
 38 | Float32 float32_round_and_pack(Context *ctx, Flag sign, Sint32 exp, Uint32 sig) { // round [sig] per ctx->round, then pack
 39 |   const Round rounding_mode = ctx->round;
 40 |   const Flag round_nearest_even = rounding_mode == ROUND_NEAREST_EVEN;
 41 |   Sint8 round_increment = 0x40; // half of the 7 discarded bits: round to nearest
 42 |   if (!round_nearest_even) {
 43 |     if (rounding_mode == ROUND_TO_ZERO) {
 44 |       round_increment = 0; // truncate
 45 |     } else {
 46 |       round_increment = 0x7f; // directed: any discarded bit bumps the magnitude
 47 |       if (sign) {
 48 |         if (rounding_mode == ROUND_UP) {
 49 |           round_increment = 0; // rounding a negative value up truncates its magnitude
 50 |         }
 51 |       } else {
 52 |         if (rounding_mode == ROUND_DOWN) {
 53 |           round_increment = 0; // rounding a positive value down truncates its magnitude
 54 |         }
 55 |       }
 56 |     }
 57 |   }
 58 | 
 59 |   Sint8 round_bits = sig & 0x7f; // the 7 low bits dropped by the final >> 7
 60 | 
 61 |   if (round_bits) {
 62 |     ctx->roundings++; // bookkeeping: this result could not be represented exactly
 63 |   }
 64 |   // Negative exponents pass the test below too: the Uint16 cast wraps them upwards.
 65 |   if (0xfd <= (Uint16)exp) {
 66 |     if ((0xfd < exp) || ((exp == 0xfd) && ((Sint32)(sig + round_increment) < 0))) {
 67 |       context_raise(ctx, EXCEPTION_OVERFLOW | EXCEPTION_INEXACT);
 68 |       const Float32 pack = float32_pack(sign, 0xff, 0);
 69 |       return (Float32){pack.bits - (round_increment == 0 ? 1 : 0)}; // truncating modes stop at the largest finite value, others overflow to infinity
 70 |     }
 71 |     if (exp < 0) { // too small for a normal number: denormalise
 72 |       const Flag is_tiny = (ctx->tininess == TININESS_BEFORE_ROUNDING)
 73 |         || (exp < -1)
 74 |         || (sig + round_increment < LIT32(0x80000000));
 75 |       sig = rshr32(sig, -exp); // NOTE(review): presumably a right shift that keeps lost bits sticky - confirm in soft.h
 76 |       exp = 0;
 77 |       round_bits = sig & 0x7f;
 78 |       if (is_tiny && round_bits) {
 79 |         context_raise(ctx, EXCEPTION_UNDERFLOW);
 80 |       }
 81 |     }
 82 |   }
 83 |   if (round_bits) {
 84 |     context_raise(ctx, EXCEPTION_INEXACT);
 85 |   }
 86 |   sig = (sig + round_increment) >> 7; // apply the increment, then drop the rounding bits
 87 |   sig &= ~(((round_bits ^ 0x40) == 0) & round_nearest_even); // exact tie under nearest-even: clear the low bit
 88 |   return float32_pack(sign, sig == 0 ? 0 : exp, sig); // a zero significand is packed with a zero exponent
 89 | }
 90 | 
 91 | static inline Float32 float32_normalize_round_and_pack(Context *ctx, Flag sign, Sint16 exp, Uint32 sig) {
 92 |   const Sint8 lead = count_leading_zeros_u32(sig) - 1; // left shift that puts the top set bit at bit 30
 93 |   return float32_round_and_pack(ctx, sign, exp - lead, sig << lead); // the exponent drops by the same amount
 94 | }
 95 | 
 96 | Normal32 float32_normalize_subnormal(Uint32 sig) {
 97 |   const Sint8 lead = count_leading_zeros_u32(sig) - 8; // left shift that puts the top set bit at bit 23
 98 |   return (Normal32){sig << lead, 1 - lead}; // second member (presumably the exponent) shrinks to match
 99 | }
100 | 
101 | static Float32 float32_add_sig(Context *ctx, Float32 a, Float32 b, Flag sign) { // adds the magnitudes; the result carries [sign]
102 |   Sint16 a_exp = float32_exp(a);
103 |   Sint16 b_exp = float32_exp(b);
104 |   Uint32 a_sig = float32_fract(a) << 6; // six spare low bits for alignment and rounding
105 |   Uint32 b_sig = float32_fract(b) << 6;
106 |   Sint16 exp_diff = a_exp - b_exp;
107 | 
108 |   Sint16 exp;
109 |   Uint32 sig;
110 |   if (0 < exp_diff) { // a has the larger exponent: align b to it
111 |     if (a_exp == 0xff) { // a is an infinity or a NaN
112 |       return a_sig ? float32_propagate_nan(ctx, a, b) : a;
113 |     }
114 |     if (b_exp == 0) {
115 |       exp_diff--; // exponent field 0: no implicit bit, and one place less to shift
116 |     } else {
117 |       b_sig |= LIT32(0x20000000); // implicit leading one (bit 23, moved up by the << 6)
118 |     }
119 |     b_sig = rshr32(b_sig, exp_diff);
120 |     exp = a_exp;
121 |   } else if (exp_diff < 0) { // b has the larger exponent: align a to it
122 |     if (b_exp == 0xff) { // b is an infinity or a NaN
123 |       if (b_sig) {
124 |         return float32_propagate_nan(ctx, a, b);
125 |       }
126 |       return float32_pack(sign, 0xff, 0); // infinity carrying the result sign
127 |     }
128 |     if (a_exp == 0) {
129 |       exp_diff++; // exponent field 0: no implicit bit, and one place less to shift
130 |     } else {
131 |       a_sig |= LIT32(0x20000000);
132 |     }
133 |     a_sig = rshr32(a_sig, -exp_diff);
134 |     exp = b_exp;
135 |   } else { // equal exponents: no alignment needed
136 |     if (a_exp == 0xff) {
137 |       return (a_sig | b_sig) ? float32_propagate_nan(ctx, a, b) : a;
138 |     }
139 |     if (a_exp == 0) { // both exponent fields are 0: the raw fractions are added and packed without rounding
140 |       return float32_pack(sign, 0, (a_sig + b_sig) >> 6);
141 |     }
142 |     sig = LIT32(0x40000000) + a_sig + b_sig; // the two implicit ones add up to bit 30
143 |     exp = a_exp;
144 |     goto round_and_pack;
145 |   }
146 |   a_sig |= LIT32(0x20000000); // supplies the implicit one of whichever operand had the larger exponent
147 |   sig = (a_sig + b_sig) << 1; // try the sum one position higher...
148 |   exp--;
149 |   if ((Sint32)sig < 0) { // ...and fall back to the unshifted sum if that reaches bit 31
150 |     sig = a_sig + b_sig;
151 |     exp++;
152 |   }
153 | round_and_pack:
154 |   return float32_round_and_pack(ctx, sign, exp, sig);
155 | }
156 | 
157 | static Float32 float32_sub_sig(Context *ctx, Float32 a, Float32 b, Flag sign) { // subtracts the magnitudes; [sign] is a's sign
158 |   Sint16 a_exp = float32_exp(a);
159 |   Sint16 b_exp = float32_exp(b);
160 |   Uint32 a_sig = float32_fract(a) << 7; // seven spare low bits for alignment and rounding
161 |   Uint32 b_sig = float32_fract(b) << 7;
162 |   Sint16 exp_diff = a_exp - b_exp;
163 | 
164 |   // Needed because goto crosses initialization.
165 |   Sint16 exp;
166 |   Uint32 sig;
167 |   if (0 < exp_diff) {
168 |     goto a_exp_bigger;
169 |   }
170 |   if (exp_diff < 0) {
171 |     goto b_exp_bigger;
172 |   }
173 |   if (a_exp == 0xff) { // equal exponents, both infinity or NaN
174 |     if (a_sig | b_sig) {
175 |       return float32_propagate_nan(ctx, a, b);
176 |     }
177 |     context_raise(ctx, EXCEPTION_INVALID); // infinity minus infinity
178 |     return FLOAT32_NAN;
179 |   }
180 |   if (a_exp == 0) { // both exponent fields are 0: continue with exponent 1
181 |     a_exp = 1;
182 |     b_exp = 1;
183 |   }
184 |   if (b_sig < a_sig) {
185 |     goto a_bigger;
186 |   }
187 |   if (a_sig < b_sig) {
188 |     goto b_bigger;
189 |   }
190 |   return float32_pack(ctx->round == ROUND_DOWN, 0, 0); // exact cancellation: zero, negative only under ROUND_DOWN
191 | b_exp_bigger:
192 |   if (b_exp == 0xff) {
193 |     return b_sig
194 |       ? float32_propagate_nan(ctx, a, b)
195 |       : float32_pack(sign ^ 1, 0xff, 0); // b is infinite: infinity of the flipped sign
196 |   }
197 |   if (a_exp == 0) {
198 |     exp_diff++; // exponent field 0: no implicit bit, and one place less to shift
199 |   } else {
200 |     a_sig |= LIT32(0x40000000); // implicit leading one (bit 23, moved up by the << 7)
201 |   }
202 |   a_sig = rshr32(a_sig, -exp_diff);
203 |   b_sig |=  LIT32(0x40000000);
204 | b_bigger:
205 |   sig = b_sig - a_sig;
206 |   exp = b_exp;
207 |   sign ^= 1; // b has the larger magnitude, so the result takes the opposite sign
208 |   goto normalize_round_and_pack;
209 | a_exp_bigger:
210 |   if (a_exp == 0xff) {
211 |     return a_sig ? float32_propagate_nan(ctx, a, b) : a;
212 |   }
213 |   if (b_exp == 0) {
214 |     exp_diff--; // exponent field 0: no implicit bit, and one place less to shift
215 |   } else {
216 |     b_sig |=  LIT32(0x40000000);
217 |   }
218 |   b_sig = rshr32(b_sig, exp_diff);
219 |   a_sig |=  LIT32(0x40000000);
220 | a_bigger:
221 |   sig = a_sig - b_sig;
222 |   exp = a_exp;
223 | normalize_round_and_pack:
224 |   exp--; // NOTE(review): presumably matches the bit position the normalising helper shifts to - confirm
225 |   return float32_normalize_round_and_pack(ctx, sign, exp, sig);
226 | }
227 | 
228 | Float32 float32_add(Context *ctx, Float32 a, Float32 b) {
229 |   array_push(ctx->operations, OPERATION_ADD);
230 |   const Flag a_sign = float32_sign(a);
231 |   const Flag b_sign = float32_sign(b);
232 |   return a_sign == b_sign
233 |     ? float32_add_sig(ctx, a, b, a_sign)
234 |     : float32_sub_sig(ctx, a, b, a_sign);
235 | }
236 | 
237 | Float32 float32_sub(Context *ctx, Float32 a, Float32 b) {
238 |   array_push(ctx->operations, OPERATION_SUB);
239 |   const Flag a_sign = float32_sign(a);
240 |   const Flag b_sign = float32_sign(b);
241 |   return a_sign == b_sign
242 |     ? float32_sub_sig(ctx, a, b, a_sign)
243 |     : float32_add_sig(ctx, a, b, a_sign);
244 | }
245 | 
246 | Float32 float32_mul(Context *ctx, Float32 a, Float32 b) {
247 |   array_push(ctx->operations, OPERATION_MUL);
248 |   Sint16 a_exp = float32_exp(a);
249 |   Sint16 b_exp = float32_exp(b);
250 |   Uint32 a_sig = float32_fract(a);
251 |   Uint32 b_sig = float32_fract(b);
252 |   const Flag a_sign = float32_sign(a);
253 |   const Flag b_sign = float32_sign(b);
254 |   const Flag sign = a_sign ^ b_sign;
255 |   Uint32 mag_bits = 0;
256 |   if (a_exp == 0xff) {
257 |     if (a_sig || (b_exp == 0xff && b_sig)) goto propagate_nan;
258 |     mag_bits = b_exp | b_sig;
259 |     goto infinity;
260 |   }
261 |   if (b_exp == 0xff) {
262 |     if (b_sig) goto propagate_nan;
263 |     mag_bits = a_exp | a_sig;
264 |     goto infinity;
265 |   }
266 |   if (a_exp == 0) {
267 |     if (a_sig == 0) goto zero;
268 |     const Normal32 n = float32_normalize_subnormal(a_sig);
269 |     a_exp = n.exp;
270 |     a_sig = n.sig;
271 |   }
272 |   if (b_exp == 0) {
273 |     if (b_sig == 0) goto zero;
274 |     const Normal32 n = float32_normalize_subnormal(b_sig);
275 |     b_exp = n.exp;
276 |     b_sig = n.sig;
277 |   }
278 |   Sint16 exp = a_exp + b_exp - 0x7f;
279 |   a_sig = (a_sig | LIT32(0x00800000)) << 7;
280 |   b_sig = (b_sig | LIT32(0x00800000)) << 8;
281 | 
282 |   // Compute with 64-bit mul, truncate to 32-bit.
283 |   Uint32 sig = rshr64((Uint64)a_sig * b_sig, 32);
284 |   if (sig < LIT32(0x40000000)) {
285 |     exp--;
286 |     sig <<= 1;
287 |   }
288 |   return float32_round_and_pack(ctx, sign, exp, sig);
289 | propagate_nan:
290 |   return float32_propagate_nan(ctx, a, b);
291 | infinity:
292 |   if (!mag_bits) {
293 |     context_raise(ctx, EXCEPTION_INVALID);
294 |     return FLOAT32_NAN;
295 |   } else {
296 |     return float32_pack(sign, 0xff, 0);
297 |   }
298 | zero:
299 |   return float32_pack(sign, 0, 0);
300 | }
301 | 
302 | Float32 float32_div(Context *ctx, Float32 a, Float32 b) {
303 |   array_push(ctx->operations, OPERATION_DIV);
304 |   Sint16 a_exp = float32_exp(a);
305 |   Sint16 b_exp = float32_exp(b);
306 |   Uint32 a_sig = float32_fract(a);
307 |   Uint32 b_sig = float32_fract(b);
308 |   const Flag a_sign = float32_sign(a);
309 |   const Flag b_sign = float32_sign(b);
310 |   const Flag sign = a_sign ^ b_sign;
311 |   if (a_exp == 0xff) {
312 |     if (a_sig) goto propagate_nan;
313 |     if (b_exp == 0xff) {
314 |       if (b_sig) goto propagate_nan;
315 |       goto invalid;
316 |     }
317 |     goto infinity;
318 |   }
319 |   if (b_exp == 0xff) {
320 |     if (b_sig) goto propagate_nan;
321 |     goto zero;
322 |   }
323 |   if (b_exp == 0) {
324 |     if (b_sig == 0) {
325 |       if ((a_exp | a_sig) == 0) goto invalid;
326 |       context_raise(ctx, EXCEPTION_INFINITE);
327 |       goto infinity;
328 |     }
329 |     const Normal32 n = float32_normalize_subnormal(b_sig);
330 |     b_exp = n.exp;
331 |     b_sig = n.sig;
332 |   }
333 |   if (a_exp == 0) {
334 |     if (a_sig == 0) goto zero;
335 |     const Normal32 n = float32_normalize_subnormal(a_sig);
336 |     a_exp = n.exp;
337 |     a_sig = n.sig;
338 |   }
339 |   Sint16 exp = a_exp - b_exp + 0x7e;
340 |   a_sig = (a_sig | LIT32(0x00800000));
341 |   b_sig = (b_sig | LIT32(0x00800000));
342 |   // Use 64-bit divide for 32-bit significand.
343 |   Uint64 a_sig_64;
344 |   if (a_sig < b_sig) {
345 |     exp--;
346 |     a_sig_64 = (Uint64)a_sig << 31;
347 |   } else {
348 |     a_sig_64 = (Uint64)a_sig << 30;
349 |   }
350 |   Uint32 sig = a_sig_64 / b_sig;
351 |   if (!(sig & 0x3f)) {
352 |     sig |= ((Uint64)b_sig * sig != a_sig_64);
353 |   }
354 |   return float32_round_and_pack(ctx, sign, exp, sig);
355 | propagate_nan:
356 |   return float32_propagate_nan(ctx, a, b);
357 | invalid:
358 |   context_raise(ctx, EXCEPTION_INVALID);
359 |   return FLOAT32_NAN;
360 | infinity:
361 |   return float32_pack(sign, 0xff, 0);
362 | zero:
363 |   return float32_pack(sign, 0, 0);
364 | }
365 | 
366 | // a == b
367 | Flag float32_eq(Context *ctx, Float32 a, Float32 b) {
368 |   if ((float32_exp(a) == 0xff && float32_fract(a)) ||
369 |       (float32_exp(b) == 0xff && float32_fract(b)))
370 |   {
371 |     if (float32_is_snan(a) || float32_is_snan(b)) {
372 |       context_raise(ctx, EXCEPTION_INVALID);
373 |     }
374 |     return 0;
375 |   }
376 |   return a.bits == b.bits || (Uint32)((a.bits | b.bits) << 1) == 0;
377 | }
378 | 
379 | // a <= b
380 | Flag float32_lte(Context *ctx, Float32 a, Float32 b) {
381 |   if ((float32_exp(a) == 0xff && float32_fract(a)) ||
382 |       (float32_exp(b) == 0xff && float32_fract(b)))
383 |   {
384 |     context_raise(ctx, EXCEPTION_INVALID);
385 |     return 0;
386 |   }
387 | 
388 |   const Flag a_sign = float32_sign(a);
389 |   const Flag b_sign = float32_sign(b);
390 | 
391 |   if (a_sign != b_sign) {
392 |     return a_sign || (Uint32)((a.bits | b.bits) << 1) == 0;
393 |   }
394 | 
395 |   return a.bits == b.bits || (a_sign ^ (a.bits < b.bits));
396 | }
397 | 
398 | // a < b
399 | Flag float32_lt(Context *ctx, Float32 a, Float32 b) {
400 |   if ((float32_exp(a) == 0xff && float32_fract(a)) ||
401 |       (float32_exp(b) == 0xff && float32_fract(b)))
402 |   {
403 |     context_raise(ctx, EXCEPTION_INVALID);
404 |     return 0;
405 |   }
406 | 
407 |   const Flag a_sign = float32_sign(a);
408 |   const Flag b_sign = float32_sign(b);
409 | 
410 |   if (a_sign != b_sign) {
411 |     return a_sign && (Uint32)((a.bits | b.bits) << 1) != 0;
412 |   }
413 | 
414 |   return a.bits != b.bits && (a_sign ^ (a.bits< b.bits));
415 | }
416 | 
417 | // The others are implemented with a not on the flag. IEEE 754 requires
418 | // these identities be held, so this is safe.
419 | // a != b => !(a == b)
420 | Flag float32_ne(Context *ctx, Float32 a, Float32 b) {
421 |   return !float32_eq(ctx, a, b);
422 | }
423 | 
424 | // a >= b => !(a < b)
425 | Flag float32_gte(Context *ctx, Float32 a, Float32 b) {
426 |   return !float32_lt(ctx, a, b);
427 | }
428 | 
429 | // a > b  => !(a <= b)
430 | Flag float32_gt(Context *ctx, Float32 a, Float32 b) {
431 |   return !float32_lte(ctx, a, b);
432 | }
433 | 
434 | Float32 float32_from_sint32(Context *ctx, Sint32 a) {
435 |   if (a == 0) {
436 |     return (Float32){0};
437 |   }
438 |   if (a == (Sint32)0x80000000) {
439 |     return float32_pack(1, 0x9e, 0);
440 |   }
441 |   const Flag sign = a < 0;
442 |   return float32_normalize_round_and_pack(ctx, sign, 0x9c, sign ? -a : a);
443 | }


--------------------------------------------------------------------------------
/float32.h:
--------------------------------------------------------------------------------
 1 | #ifndef SOFT32_H
 2 | #define SOFT32_H
 3 | #include "soft.h"
 4 | 
 5 | static inline Uint32 float32_fract(Float32 a) {
 6 |   return a.bits & LIT32(0x007FFFFF);
 7 | }
 8 | 
 9 | static inline Sint16 float32_exp(Float32 a) {
10 |   return (a.bits >> 23) & 0xff;
11 | }
12 | 
13 | static inline Flag float32_sign(Float32 a) {
14 |   return a.bits >> 31;
15 | }
16 | 
17 | static inline Flag float32_is_nan(Float32 a) {
18 |   return LIT32(0xFF000000) << (Uint32)(a.bits << 1);
19 | }
20 | 
21 | static inline Flag float32_is_snan(Float32 a) {
22 |   return ((a.bits >> 22) & 0x1ff) == 0x1fe && (a.bits & LIT32(0x003FFFFF));
23 | }
24 | 
25 | static inline Flag float32_is_any_nan(Float32 a) {
26 |   return (a.bits & LIT32(0x7fffffff)) > LIT32(0x7f800000);
27 | }
28 | 
29 | // Pack sign, exponent, and significant into single-precision float.
30 | static inline Float32 float32_pack(Flag sign, Sint16 exp, Uint32 sig) {
31 |   return (Float32){(((Uint32)sign) << 31) + (((Uint32)exp) << 23) + sig};
32 | }
33 | 
// Common constants.
// Each macro expands to a Float32 compound literal holding the raw bit
// pattern of the value named in the trailing comment.
#define FLOAT32_NAN        (Float32){LIT32(0xffffffff)} //  NaN
#define FLOAT32_EPSILON    (Float32){LIT32(0x34000000)} //  0x0.000002p0
#define FLOAT32_ZERO       (Float32){LIT32(0x00000000)} //  0.0
#define FLOAT32_HALF       (Float32){LIT32(0x3f000000)} //  0.5
#define FLOAT32_ONE        (Float32){LIT32(0x3f800000)} //  1.0
#define FLOAT32_MINUS_ONE  (Float32){LIT32(0xbf800000)} // -1.0
41 | 
42 | // Conversion of float32 NaN to CanonicalNaN format.
43 | CanonicalNaN float32_to_canonical_nan(Context*, Float32);
44 | 
45 | // Normalize a subnormal.
46 | Normal32 float32_normalize_subnormal(Uint32 sig);
47 | 
// Build a float from sign, exponent, and significand with correct rounding.
49 | Float32 float32_round_and_pack(Context *ctx, Flag sign, Sint32 exp, Uint32 sig);
50 | 
51 | // Arithmetic functions.
52 | Float32 float32_add(Context*, Float32, Float32); // a + b
53 | Float32 float32_sub(Context*, Float32, Float32); // a - b
54 | Float32 float32_mul(Context*, Float32, Float32); // a * b
55 | Float32 float32_div(Context*, Float32, Float32); // a / b
56 | 
57 | // Relational functions.
58 | Flag float32_eq(Context*, Float32, Float32); // a == b
59 | Flag float32_lte(Context*, Float32, Float32); // a <= b
60 | Flag float32_lt(Context*, Float32, Float32); // a < b
61 | Flag float32_ne(Context*, Float32, Float32); // a != b
62 | Flag float32_gte(Context*, Float32, Float32); // a >= b
63 | Flag float32_gt(Context*, Float32, Float32); // a > b
64 | 
65 | // Conversion functions.
66 | Float32 float32_from_sint32(Context *ctx, Sint32 x);
67 | 
68 | // Needed temporarily for printing.
69 | static inline float float32_cast(Float32 x) {
70 |   union { Float32 s; float h; } u = {x};
71 |   return u.h;
72 | }
73 | 
#endif // SOFT32_H


--------------------------------------------------------------------------------
/float64.c:
--------------------------------------------------------------------------------
  1 | #include "float64.h"
  2 | #include "uint128.h"
  3 | 
  4 | // Count leading zero bits.
  5 | static inline Sint8 count_leading_zeros_u64(Uint64 a) {
  6 |   return a == 0 ? 64 : __builtin_clzl(a);
  7 | }
  8 | 
  9 | // Take two double-precision float values, one which must be NaN, and produce
 10 | // the correct NaN result, taking care to raise an invalid exception when either
 11 | // is a signaling NaN.
 12 | static Float64 float64_propagate_nan(Context *ctx, Float64 a, Float64 b) {
 13 |   const Flag a_is_nan = float64_is_nan(a);
 14 |   const Flag a_is_snan = float64_is_snan(a);
 15 |   const Flag b_is_nan = float64_is_nan(b);
 16 |   const Flag b_is_snan = float64_is_snan(b);
 17 |   a.bits |= LIT64(0x0008000000000000);
 18 |   b.bits |= LIT64(0x0008000000000000);
 19 |   if (a_is_snan | b_is_snan) {
 20 |     context_raise(ctx, EXCEPTION_INVALID);
 21 |   }
 22 |   if (a_is_nan) {
 23 |     return (a_is_snan & b_is_nan) ? b : a;
 24 |   }
 25 |   return b;
 26 | }
 27 | 
 28 | CanonicalNaN float64_to_canonical_nan(Context* ctx, Float64 a) {
 29 |   if (float64_is_snan(a)) {
 30 |     context_raise(ctx, EXCEPTION_INVALID);
 31 |   }
 32 |   CanonicalNaN nan;
 33 |   nan.sign = a.bits >> 63;
 34 |   nan.lo = 0;
 35 |   nan.hi = a.bits << 12; 
 36 |   return nan;
 37 | }
 38 | 
 39 | Float64 float64_round_and_pack(Context *ctx, Flag sign, Sint32 exp, Uint64 sig) {
 40 |   const Round rounding_mode = ctx->round;
 41 |   const Flag round_nearest_even = rounding_mode == ROUND_NEAREST_EVEN;
 42 |   Sint16 round_increment = 0x200;
 43 |   if (!round_nearest_even) {
 44 |     if (rounding_mode == ROUND_TO_ZERO) {
 45 |       round_increment = 0;
 46 |     } else {
 47 |       round_increment = 0x3ff;
 48 |       if (sign) {
 49 |         if (rounding_mode == ROUND_UP) {
 50 |           round_increment = 0;
 51 |         }
 52 |       } else {
 53 |         if (rounding_mode == ROUND_DOWN) {
 54 |           round_increment = 0;
 55 |         }
 56 |       }
 57 |     }
 58 |   }
 59 | 
 60 |   Sint16 round_bits = sig & 0x3ff;
 61 | 
 62 |   if (round_bits) {
 63 |     ctx->roundings++;
 64 |   }
 65 |   
 66 |   if (0x7fd <= (Uint16)exp) {
 67 |     if ((0x7fd < exp) || ((exp == 0x7fd) && ((Sint64)(sig + round_increment) < 0))) {
 68 |       context_raise(ctx, EXCEPTION_OVERFLOW | EXCEPTION_INEXACT);
 69 |       const Float64 pack = float64_pack(sign, 0x7ff, 0);
 70 |       return (Float64){pack.bits - (round_increment == 0 ? 0 : 1)};
 71 |     }
 72 |     if (exp < 0) {
 73 |       const Flag is_tiny = (ctx->tininess == TININESS_BEFORE_ROUNDING)
 74 |         || (exp < -1)
 75 |         || (sig + round_increment < LIT64(0x8000000000000000));
 76 |       sig = rshr64(sig, -exp);
 77 |       exp = 0;
 78 |       round_bits = sig & 0x3ff;
 79 |       if (is_tiny && round_bits) {
 80 |         context_raise(ctx, EXCEPTION_UNDERFLOW);
 81 |       }
 82 |     }
 83 |   }
 84 |   if (round_bits) {
 85 |     context_raise(ctx, EXCEPTION_INEXACT);
 86 |   }
 87 |   sig = (sig + round_increment) >> 10;
 88 |   sig &= ~(((round_bits ^ 0x200) == 0) & round_nearest_even);
 89 |   return float64_pack(sign, sig == 0 ? 0 : exp, sig);
 90 | }
 91 | 
 92 | static inline Float64 float64_normalize_round_and_pack(Context *ctx, Flag sign, Sint16 exp, Uint64 sig) {
 93 |   const Sint8 shift = count_leading_zeros_u64(sig) - 1;
 94 |   return float64_round_and_pack(ctx, sign, exp - shift, sig << shift);
 95 | }
 96 | 
 97 | Normal64 float64_normalize_subnormal(Uint64 sig) {
 98 |   const Sint8 shift = count_leading_zeros_u64(sig) - 11;
 99 |   return (Normal64){sig << shift, 1 - shift};
100 | }
101 | 
102 | static Float64 float64_add_sig(Context *ctx, Float64 a, Float64 b, Flag sign) {
103 |   Sint16 a_exp = float64_exp(a);
104 |   Sint16 b_exp = float64_exp(b);
105 |   Uint64 a_sig = float64_fract(a) << 9;
106 |   Uint64 b_sig = float64_fract(b) << 9;
107 |   Sint16 exp_diff = a_exp - b_exp;
108 | 
109 |   Sint16 exp;
110 |   Uint64 sig;
111 |   if (0 < exp_diff) {
112 |     if (a_exp == 0x7ff) {
113 |       return a_sig ? float64_propagate_nan(ctx, a, b) : a;
114 |     }
115 |     if (b_exp == 0) {
116 |       exp_diff--;
117 |     } else {
118 |       b_sig |= LIT64(0x2000000000000000);
119 |     }
120 |     b_sig = rshr64(b_sig, exp_diff);
121 |     exp = a_exp;
122 |   } else if (exp_diff < 0) {
123 |     if (b_exp == 0x7ff) {
124 |       return b_sig
125 |         ? float64_propagate_nan(ctx, a, b)
126 |         : float64_pack(sign, 0x7ff, 0);
127 |     }
128 |     if (a_exp == 0) {
129 |       exp_diff++;
130 |     } else {
131 |       a_sig |= LIT64(0x2000000000000000);
132 |     }
133 |     a_sig = rshr64(a_sig, -exp_diff);
134 |     exp = b_exp;
135 |   } else {
136 |     if (a_exp == 0x7ff) {
137 |       return (a_sig | b_sig) ? float64_propagate_nan(ctx, a, b) : a;
138 |     }
139 |     if (a_exp == 0) {
140 |       return float64_pack(sign, 0, (a_sig + b_sig) >> 9);
141 |     }
142 |     sig = LIT64(0x4000000000000000) + a_sig + b_sig;
143 |     exp = a_exp;
144 |     goto round_and_pack;
145 |   }
146 |   a_sig |= LIT64(0x2000000000000000);
147 |   sig = (a_sig + b_sig) << 1;
148 |   exp--;
149 |   if ((Sint64)sig < 0) {
150 |     sig = a_sig + b_sig;
151 |     exp++;
152 |   }
153 | round_and_pack:
154 |   return float64_round_and_pack(ctx, sign, exp, sig);
155 | }
156 | 
157 | static Float64 float64_sub_sig(Context *ctx, Float64 a, Float64 b, Flag sign) {
158 |   Sint16 a_exp = float64_exp(a);
159 |   Sint16 b_exp = float64_exp(b);
160 |   Uint64 a_sig = float64_fract(a) << 10;
161 |   Uint64 b_sig = float64_fract(b) << 10;
162 |   Sint16 exp_diff = a_exp - b_exp;
163 | 
164 |   // Needed because goto crosses initialization.
165 |   Sint16 exp;
166 |   Uint64 sig;
167 |   if (0 < exp_diff) {
168 |     goto a_exp_bigger;
169 |   }
170 |   if (exp_diff < 0) {
171 |     goto b_exp_bigger;
172 |   }
173 |   if (a_exp == 0x7ff) {
174 |     if (a_sig | b_sig) {
175 |       return float64_propagate_nan(ctx, a, b);
176 |     }
177 |     context_raise(ctx, EXCEPTION_INVALID);
178 |     return FLOAT64_NAN;
179 |   }
180 |   if (a_exp == 0) {
181 |     a_exp = 1;
182 |     b_exp = 1;
183 |   }
184 |   if (b_sig < a_sig) {
185 |     goto a_bigger;
186 |   }
187 |   if (a_sig < b_sig) {
188 |     goto b_bigger;
189 |   }
190 |   return float64_pack(ctx->round == ROUND_DOWN, 0, 0);
191 | b_exp_bigger:
192 |   if (b_exp == 0x7ff) {
193 |     return b_sig
194 |       ? float64_propagate_nan(ctx, a, b)
195 |       : float64_pack(sign ^ 1, 0xff, 0);
196 |   }
197 |   if (a_exp == 0) {
198 |     exp_diff++;
199 |   } else {
200 |     a_sig |= LIT64(0x4000000000000000);
201 |   }
202 |   a_sig = rshr64(a_sig, -exp_diff);
203 |   b_sig |= LIT64(0x4000000000000000);
204 | b_bigger:
205 |   sig = b_sig - a_sig;
206 |   exp = b_exp;
207 |   sign ^= 1;
208 |   goto normalize_round_and_pack;
209 | a_exp_bigger:
210 |   if (a_exp == 0x7ff) {
211 |     return a_sig ? float64_propagate_nan(ctx, a, b) : a;
212 |   }
213 |   if (b_exp == 0) {
214 |     exp_diff--;
215 |   } else {
216 |     b_sig |= LIT64(0x4000000000000000);
217 |   }
218 |   b_sig = rshr64(b_sig, exp_diff);
219 |   a_sig |= LIT64(0x4000000000000000);
220 | a_bigger:
221 |   sig = a_sig - b_sig;
222 |   exp = a_exp;
223 | normalize_round_and_pack:
224 |   exp--;
225 |   return float64_normalize_round_and_pack(ctx, sign, exp, sig);
226 | }
227 | 
228 | Float64 float64_add(Context *ctx, Float64 a, Float64 b) {
229 |   const Flag a_sign = float64_sign(a);
230 |   const Flag b_sign = float64_sign(b);
231 |   return a_sign == b_sign
232 |     ? float64_add_sig(ctx, a, b, a_sign)
233 |     : float64_sub_sig(ctx, a, b, b_sign);
234 | }
235 | 
236 | Float64 float64_sub(Context *ctx, Float64 a, Float64 b) {
237 |   const Flag a_sign = float64_sign(a);
238 |   const Flag b_sign = float64_sign(b);
239 |   return a_sign == b_sign
240 |     ? float64_sub_sig(ctx, a, b, a_sign)
241 |     : float64_add_sig(ctx, a, b, a_sign);
242 | }
243 | 
244 | Float64 float64_mul(Context *ctx, Float64 a, Float64 b) {
245 |   Sint16 a_exp = float64_exp(a);
246 |   Sint16 b_exp = float64_exp(b);
247 |   Uint64 a_sig = float64_fract(a);
248 |   Uint64 b_sig = float64_fract(b);
249 |   Flag a_sign = float64_sign(a);
250 |   Flag b_sign = float64_sign(b);
251 |   Flag sign = a_sign ^ b_sign;
252 |   if (a_exp == 0x7ff) {
253 |     if (a_sig || (b_exp == 0x7ff && b_sig)) {
254 |       return float64_propagate_nan(ctx, a, b);
255 |     }
256 |     if ((b_exp | b_sig) == 0) {
257 |       context_raise(ctx, EXCEPTION_INVALID);
258 |       return FLOAT64_NAN;
259 |     }
260 |     return float64_pack(sign, 0x7ff, 0);
261 |   }
262 |   if (b_exp == 0x7ff) {
263 |     if (b_sig) {
264 |       return float64_propagate_nan(ctx, a, b);
265 |     }
266 |     if ((a_exp | a_sig) == 0) {
267 |       context_raise(ctx, EXCEPTION_INVALID);
268 |       return FLOAT64_NAN;
269 |     }
270 |     return float64_pack(sign, 0x7ff, 0);
271 |   }
272 |   if (a_exp == 0) {
273 |     if (a_sig == 0) {
274 |       return float64_pack(sign, 0, 0);
275 |     }
276 |     const Normal64 n = float64_normalize_subnormal(a_sig);
277 |     a_exp = n.exp;
278 |     a_sig = n.sig;
279 |   }
280 |   if (b_exp == 0) {
281 |     if (b_sig == 0) {
282 |       return float64_pack(sign, 0, 0);
283 |       const Normal64 n = float64_normalize_subnormal(b_sig);
284 |       b_exp = n.exp;
285 |       b_sig = n.sig;
286 |     }
287 |   }
288 |   Sint16 exp = a_exp + b_exp - 0x3ff;
289 |   a_sig = (a_sig | LIT64(0x0010000000000000)) << 10;
290 |   b_sig = (b_sig | LIT64(0x0010000000000000)) << 11;
291 | 
292 |   // Compute with 128-bit mul, truncate to 64-bit.
293 |   Uint128 mul = uint128_mul64x64(a_sig, b_sig);
294 |   mul.z0 |= mul.z1 != 0;
295 |   if (0 <= (Sint64)(mul.z0 << 1)) {
296 |     mul.z0 <<= 1;
297 |     exp--;
298 |   }
299 |   return float64_round_and_pack(ctx, sign, exp, mul.z0);
300 | }
301 | 
302 | Float64 float64_div(Context *ctx, Float64 a, Float64 b) {
303 |   Sint16 a_exp = float64_exp(a);
304 |   Sint16 b_exp = float64_exp(b);
305 |   Uint64 a_sig = float64_fract(a);
306 |   Uint64 b_sig = float64_fract(b);
307 |   Flag a_sign = float64_sign(a);
308 |   Flag b_sign = float64_sign(b);
309 |   Flag sign = a_sign ^ b_sign;
310 |   if (a_exp == 0x7ff) {
311 |     if (a_sig) {
312 |       return float64_propagate_nan(ctx, a, b);
313 |     }
314 |     if (b_exp == 0x7ff) {
315 |       if (b_sig) {
316 |         return float64_propagate_nan(ctx, a, b);
317 |       }
318 |       context_raise(ctx, EXCEPTION_INVALID);
319 |       return FLOAT64_NAN;
320 |     }
321 |     return float64_pack(sign, 0xff, 0);
322 |   }
323 |   if (b_exp == 0x7ff) {
324 |     return b_sig
325 |       ? float64_propagate_nan(ctx, a, b)
326 |       : float64_pack(sign, 0, 0);
327 |   }
328 |   if (b_exp == 0) {
329 |     if (b_sig == 0) {
330 |       if ((a_exp | a_sig) == 0) {
331 |         context_raise(ctx, EXCEPTION_INVALID);
332 |         return FLOAT64_NAN;
333 |       }
334 |       context_raise(ctx, EXCEPTION_INFINITE);
335 |       return float64_pack(sign, 0xff, 0);
336 |     }
337 |     const Normal64 n = float64_normalize_subnormal(b_sig);
338 |     b_exp = n.exp;
339 |     b_sig = n.sig;
340 |   }
341 |   if (a_exp == 0) {
342 |     if (a_sig == 0) {
343 |       return float64_pack(sign, 0, 0);
344 |     }
345 |     const Normal64 n = float64_normalize_subnormal(a_sig);
346 |     a_exp = n.exp;
347 |     a_sig = n.sig;
348 |   }
349 |   Sint16 exp = a_exp - b_exp + 0x7d;
350 |   a_sig = (a_sig | LIT64(0x0010000000000000)) << 10;
351 |   b_sig = (b_sig | LIT64(0x0010000000000000)) << 11;
352 |   if (b_sig <= a_sig + a_sig) {
353 |     a_sig >>= 1;
354 |     exp++;
355 |   }
356 | 
357 |   Uint64 sig = uint128_div128x64((Uint128){a_sig, 0}, b_sig);
358 |   if ((sig & 0x1ff) <= 2) {
359 |     Uint128 term = uint128_mul64x64(b_sig, sig);
360 |     Uint128 rem = uint128_sub((Uint128){a_sig, 0}, term);
361 |     while ((Sint64)rem.z0 < 0) {
362 |       sig--;
363 |       rem = uint128_add(rem, (Uint128){0, b_sig});
364 |     }
365 |     sig |= rem.z1 != 0;
366 |   }
367 | 
368 |   return float64_round_and_pack(ctx, sign, exp, sig);
369 | }


--------------------------------------------------------------------------------
/float64.h:
--------------------------------------------------------------------------------
 1 | #ifndef SOFT64_H
 2 | #define SOFT64_H
 3 | #include "soft.h"
 4 | 
 5 | static inline Uint64 float64_fract(Float64 a) {
 6 |   return a.bits & LIT64(0x000FFFFFFFFFFFFF);
 7 | }
 8 | 
 9 | static inline Sint16 float64_exp(Float64 a) {
10 |   return (a.bits >> 52) & 0x7ff;
11 | }
12 | 
13 | static inline Flag float64_sign(Float64 a) {
14 |   return a.bits >> 63;
15 | }
16 | 
17 | static inline Flag float64_is_nan(Float64 a) {
18 |   return LIT64(0xFFE0000000000000) < (Uint64)(a.bits << 1);
19 | }
20 | 
21 | static inline Flag float64_is_snan(Float64 a) {
22 |   return (((a.bits >> 51) & 0xfff) == 0xffe)
23 |     && (a.bits & LIT64(0x0007ffffffffffff));
24 | }
25 | 
26 | // Pack sign, exponent, and significant into double-precision float.
27 | static inline Float64 float64_pack(Flag sign, Sint16 exp, Uint64 sig) {
28 |   return (Float64){(((Uint64)sign) << 63) + (((Uint64)exp) << 52) + sig};
29 | }
30 | 
// Common constants.
// Each constant is initialized from the raw 64-bit pattern of the value.
static const Float64 FLOAT64_NAN = {LIT64(0xffffffffffffffff)};
static const Float64 FLOAT64_ZERO = {0}; // 0x0p+0
34 | 
// Conversion of float64 NaN to CanonicalNaN format.
36 | CanonicalNaN float64_to_canonical_nan(Context*, Float64);
37 | 
38 | // Normalize subnormal.
39 | Normal64 float64_normalize_subnormal(Uint64 sig);
40 | 
// Build a float64 from sign, exponent, and significand with correct rounding.
42 | Float64 float64_round_and_pack(Context *ctx, Flag sign, Sint32 exp, Uint64 sig);
43 | 
44 | // Arithmetic functions.
45 | Float64 float64_add(Context*, Float64, Float64); // a + b
46 | Float64 float64_sub(Context*, Float64, Float64); // a - b
47 | Float64 float64_mul(Context*, Float64, Float64); // a * b
48 | Float64 float64_div(Context*, Float64, Float64); // a / b
49 | 
50 | // Needed temporarily for printing.
51 | static inline double float64_cast(Float64 x) {
52 |   union { Float64 s; double h; } u = {x};
53 |   return u.h;
54 | }
55 | 
#endif // SOFT64_H


--------------------------------------------------------------------------------
/kernel32.c:
--------------------------------------------------------------------------------
  1 | #include "kernel32.h"
  2 | 
  3 | static const Float32 HUGE = {LIT32(0x7b800000)}; // 0x1p120f
  4 | // When the result of evaluating something is not used the compiler will attempt
  5 | // to remove that dead code, even though in this case we want the evaluation
  6 | // of some expressions to happen to trigger exceptions.
  7 | static inline void float32_force_eval(Float32 x) {
  8 |   volatile Float32 y;
  9 |   y = x;
 10 |   (void)y; // Mark as used.
 11 | }
 12 | 
 13 | Float32 float32_floor(Context *ctx, Float32 x) {
 14 |   const Sint16 e = float32_exp(x) - 0x7f;
 15 |   if (e >= 23) {
 16 |     return x;
 17 |   }
 18 |   if (e >= 0) {
 19 |     const Uint32 m = LIT32(0x007fffff) >> e;
 20 |     if ((x.bits & m) == 0) {
 21 |       return x;
 22 |     }
 23 |     float32_force_eval(float32_add(ctx, x, HUGE)); 
 24 |     if (x.bits >> 31) {
 25 |       x.bits += m;
 26 |     }
 27 |     x.bits &= ~m;
 28 |   } else {
 29 |     float32_force_eval(float32_add(ctx, x, HUGE));
 30 |     if (x.bits >> 31 == 0) {
 31 |       x.bits = 0;
 32 |     } else if (x.bits << 1) {
 33 |       x.bits = LIT32(0xbf800000); // -1.0
 34 |     }
 35 |   }
 36 |   return x;
 37 | }
 38 | 
 39 | Float32 float32_ceil(Context *ctx, Float32 x) {
 40 |   const Sint16 e = float32_exp(x) - 0x7f;
 41 |   if (e >= 23) {
 42 |     return x;
 43 |   }
 44 |   if (e >= 0) {
 45 |     const Uint32 m = LIT32(0x007fffff) >> e;
 46 |     if ((x.bits & m) == 0) {
 47 |       return x;
 48 |     }
 49 |     float32_force_eval(float32_add(ctx, x, HUGE));
 50 |     if (x.bits >> 31 == 0) {
 51 |       x.bits += m;
 52 |     }
 53 |     x.bits &= ~m;
 54 |   } else {
 55 |     float32_force_eval(float32_add(ctx, x, HUGE));
 56 |     if (x.bits >> 31) {
 57 |       x.bits = LIT32(0x80000000); // -0.0
 58 |     } else if (x.bits << 1) {
 59 |       x.bits = LIT32(0x3f800000); // 1.0
 60 |     }
 61 |   }
 62 |   return x;
 63 | }
 64 | 
 65 | Float32 float32_trunc(Context *ctx, Float32 x) {
 66 |   Sint16 e = float32_exp(x) - 0x7f + 9;
 67 |   if (e >= 23 + 9) {
 68 |     return x;
 69 |   }
 70 |   if (e < 9) {
 71 |     e = 1;
 72 |   }
 73 |   const Uint32 m = -1u >> e;
 74 |   if ((x.bits & m) == 0) {
 75 |     return x;
 76 |   }
 77 |   float32_force_eval(float32_add(ctx, x, HUGE));
 78 |   x.bits &= ~m;
 79 |   return x;
 80 | }
 81 | 
 82 | // 32-bit multiplication without truncation.
 83 | static inline Uint32 mul32(Uint32 a, Uint32 b) {
 84 |   return (Uint64)a*b >> 32;
 85 | }
 86 | 
 87 | // Computes (x-x) / (x-x) to correctly raise an invalid exception and compute
 88 | // correct exceptional value of NaN, sNaN, +Inf, or -Inf for given x.
 89 | static Float32 float32_invalid(Context *ctx, Float32 x) {
 90 |   const Float32 sub = float32_sub(ctx, x, x);
 91 |   return float32_div(ctx, sub, sub);
 92 | }
 93 | 
// Square root of x.
//
// Zeros and +infinity are returned unchanged; any other value whose bit
// pattern is above +infinity (NaNs and everything with the sign bit set) goes
// through float32_invalid. Subnormals are first scaled by 0x1p23f. The result
// is built from a table-seeded estimate of 1/sqrt(m) refined in fixed-point,
// and a final float32_add applies rounding and exception handling.
Float32 float32_sqrt(Context *ctx, Float32 x) {
  // if x in [1,2): i = (Sint32)(64*x);
  // if x in [2,4): i = (Sint32)(32*x-64);
  // TABLE[i]*2^-16 is estimating 1/sqrt(x) with small relative error:
  // |TABLE[i]*0x1p-16*sqrt(x) - 1| < -0x1.fdp-9 < 2^-8
  static const Uint16 TABLE[128] = {
    0xb451, 0xb2f0, 0xb196, 0xb044, 0xaef9, 0xadb6, 0xac79, 0xab43,
    0xaa14, 0xa8eb, 0xa7c8, 0xa6aa, 0xa592, 0xa480, 0xa373, 0xa26b,
    0xa168, 0xa06a, 0x9f70, 0x9e7b, 0x9d8a, 0x9c9d, 0x9bb5, 0x9ad1,
    0x99f0, 0x9913, 0x983a, 0x9765, 0x9693, 0x95c4, 0x94f8, 0x9430,
    0x936b, 0x92a9, 0x91ea, 0x912e, 0x9075, 0x8fbe, 0x8f0a, 0x8e59,
    0x8daa, 0x8cfe, 0x8c54, 0x8bac, 0x8b07, 0x8a64, 0x89c4, 0x8925,
    0x8889, 0x87ee, 0x8756, 0x86c0, 0x862b, 0x8599, 0x8508, 0x8479,
    0x83ec, 0x8361, 0x82d8, 0x8250, 0x81c9, 0x8145, 0x80c2, 0x8040,
    0xff02, 0xfd0e, 0xfb25, 0xf947, 0xf773, 0xf5aa, 0xf3ea, 0xf234,
    0xf087, 0xeee3, 0xed47, 0xebb3, 0xea27, 0xe8a3, 0xe727, 0xe5b2,
    0xe443, 0xe2dc, 0xe17a, 0xe020, 0xdecb, 0xdd7d, 0xdc34, 0xdaf1,
    0xd9b3, 0xd87b, 0xd748, 0xd61a, 0xd4f1, 0xd3cd, 0xd2ad, 0xd192,
    0xd07b, 0xcf69, 0xce5b, 0xcd51, 0xcc4a, 0xcb48, 0xca4a, 0xc94f,
    0xc858, 0xc764, 0xc674, 0xc587, 0xc49d, 0xc3b7, 0xc2d4, 0xc1f4,
    0xc116, 0xc03c, 0xbf65, 0xbe90, 0xbdbe, 0xbcef, 0xbc23, 0xbb59,
    0xba91, 0xb9cc, 0xb90a, 0xb84a, 0xb78c, 0xb6d0, 0xb617, 0xb560,
  };

  Uint32 ix = x.bits;

  // Single unsigned range test: true for everything outside the positive
  // normal range (zeros, subnormals, infinities, NaNs, negative values).
  if (ix - 0x00800000 >= 0x7f800000 - 0x00800000) {
    // x < 0x1p-126, inf, or nan.
    if (ix * 2 == 0) {
      // +0 or -0.
      return x;
    }
    if (ix == LIT32(0x7f800000)) {
      // +infinity.
      return x;
    }
    if (ix > LIT32(0x7f800000)) {
      return float32_invalid(ctx, x);
    }
    // is subnormal, normalize it.
    const Float32 n = float32_mul(ctx, x, (Float32){LIT32(0x4b000000)}); // 0x1p23f
    ix = n.bits;
    // Undo the 2^23 scaling in the exponent field.
    ix -= 23 << 23;
  }

  // x = 4^e m; with int e and m in [1, 4).
  Uint32 even = ix & LIT32(0x00800000);
  Uint32 m1 = (ix << 8) | LIT32(0x80000000);
  Uint32 m0 = (ix << 7) & LIT32(0x7fffffff);
  Uint32 m = even ? m0 : m1;

  // 2^e is exponent part.
  Uint32 ey = ix >> 1;
  ey += LIT32(0x3f800000) >> 1;
  ey &= LIT32(0x7f800000);

  // Compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 iterations.
  static const Uint32 THREE = LIT32(0xc0000000);
  // Table index: lowest exponent bit followed by the top six fraction bits.
  const Uint32 i = (ix >> 17) % 128;
  Uint32 r, s, d, u;
  r = (Uint32)TABLE[i] << 16;
  // |r*sqrt(m) - 1| < 0x1p-8
  s = mul32(m, r);
  // |s/sqrt(m) - 1| < 0x1p-8
  d = mul32(s, r);
  u = THREE - d;
  r = mul32(r, u) << 1;
  // |r*sqrt(m) - 1| < 0x1.7bp-16
  s = mul32(s, u) << 1;
  // |s/sqrt(m) - 1| < 0x1.7bp-16
  d = mul32(s, r);
  u = THREE - d;
  s = mul32(s, u);
  // -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31
  s = (s - 1) >> 6;
  // s < sqrt(m) < s + 0x1.08p-23

  // Compute nearest rounded result.
  const Uint32 d0 = (m << 16) - s*s;
  const Uint32 d1 = s - d0;
  const Uint32 d2 = d1 + s + 1;
  s += d1 >> 31;
  // Keep the fraction bits and merge in the result exponent.
  s &= LIT32(0x007fffff);
  s |= ey;

  const Float32 y = {s};

  // Handle rounding and inexact exceptions.
  // t is zero when d2 == 0, otherwise a tiny value whose sign comes from
  // d1 ^ d2; adding it to y goes through float32_add's rounding.
  const Float32 t = {(d2 == 0 ? 0 : LIT32(0x01000000)) | ((d1 ^ d2) & LIT32(0x80000000))};

  return float32_add(ctx, y, t);
}
184 | 
185 | Float32 float32_abs(Context *ctx, Float32 x) {
186 |   (void)ctx;
187 |   x.bits &= 0x7fffffff;
188 |   return x;
189 | }
190 | 
191 | Float32 float32_copysign(Context *ctx, Float32 x, Float32 y) {
192 |   (void)ctx;
193 |   x.bits &= LIT32(0x7fffffff); // abs
194 |   x.bits |= y.bits & LIT32(0x80000000); // copy sign bit
195 |   return x;
196 | }
197 | 
198 | Float32 float32_max(Context *ctx, Float32 x, Float32 y) {
199 |   if (float32_is_any_nan(x)) {
200 |     return y;
201 |   }
202 |   if (float32_is_any_nan(y)) {
203 |     return x;
204 |   }
205 | 
206 |   // Handle signed zeros.
207 |   const Flag sign_x = float32_sign(x);
208 |   const Flag sign_y = float32_sign(y);
209 |   if (sign_x != sign_y) {
210 |     return sign_x ? y : x;
211 |   }
212 | 
213 |   // IEEE makes it clear min and max should both use lt relational operation.
214 |   return float32_lt(ctx, x, y) ? y : x;
215 | }
216 | 
217 | Float32 float32_min(Context *ctx, Float32 x, Float32 y) {
218 |   if (float32_is_any_nan(x)) {
219 |     return y;
220 |   }
221 |   if (float32_is_any_nan(y)) {
222 |     return x;
223 |   }
224 | 
225 |   // Handle signed zeros.
226 |   const Flag sign_x = float32_sign(x);
227 |   const Flag sign_y = float32_sign(y);
228 |   if (sign_x != sign_y) {
229 |     return sign_x ? x : y;
230 |   }
231 | 
232 |   return float32_lt(ctx, x, y) ? x : y;
233 | }


--------------------------------------------------------------------------------
/kernel32.h:
--------------------------------------------------------------------------------
#ifndef KERNEL32_H
#define KERNEL32_H
// NOTE(review): real32.h includes this header as well, so the two headers
// include each other; the include guards stop the recursion.
#include "real32.h"

// Single-precision kernel functions. Each takes the soft-float Context as its
// first argument.

// Rounding to an integral value (floor, ceil, trunc) and square root.
Float32 float32_floor(Context*, Float32);
Float32 float32_ceil(Context*, Float32);
Float32 float32_trunc(Context*, Float32);
Float32 float32_sqrt(Context*, Float32);
// Clears the sign bit of the operand.
Float32 float32_abs(Context*, Float32);
// Magnitude of the first operand with the sign bit of the second.
Float32 float32_copysign(Context*, Float32, Float32);
// Maximum / minimum of two operands. When one operand is NaN the other
// operand is returned.
Float32 float32_max(Context*, Float32, Float32);
Float32 float32_min(Context*, Float32, Float32);

#endif


--------------------------------------------------------------------------------
/main.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h> // printf
 2 | #include <float.h> // DBL_DIG
 3 | #include <stdlib.h> // atoi
 4 | 
 5 | #include "eval.h"
 6 | 
 7 | static int usage(const char *app) {
 8 |   fprintf(stderr, "%s [OPTION]... [EXPRESSION]\n", app);
 9 |   fprintf(stderr, "-r   rounding mode\n");
10 |   fprintf(stderr, "      0 - nearest even [default]\n");
11 |   fprintf(stderr, "      1 - to zero\n");
12 |   fprintf(stderr, "      2 - down\n");
13 |   fprintf(stderr, "      3 - up\n");
14 |   fprintf(stderr, "-t   tininess detection mode\n");
15 |   fprintf(stderr, "      0 - before rounding [default]\n");
16 |   fprintf(stderr, "      1 - after rounding\n");
17 |   return 1;
18 | }
19 | 
20 | int main(int argc, char **argv) {
21 |   argc--;
22 |   argv++;
23 |   if (argc == 0) {
24 |     return usage(argv[-1]);
25 |   }
26 | 
27 |   Context c;
28 |   c.round = ROUND_NEAREST_EVEN;
29 |   c.tininess = TININESS_BEFORE_ROUNDING;
30 |   context_init(&c);
31 | 
32 |   // Parse some command line options.
33 |   if (argv[0][0] == '-') {
34 |     if (argv[0][1] == 'r') {
35 |       int round = atoi(argv[1]);
36 |       if (round < 0 || round > 3) {
37 |         return usage(argv[-1]);
38 |       }
39 |       argv += 2; // skip -r %d
40 |       argc -= 2;
41 |       c.round = round;
42 |     } else if (argv[0][1] == 't') {
43 |       int tiny = atoi(argv[1]);
44 |       if (tiny < 0 || tiny > 1) {
45 |         return usage(argv[-1]);
46 |       }
47 |       argv += 2; // skip -t %d
48 |       argc -= 2;
49 |       c.tininess = tiny;
50 |     } else {
51 |       return usage(argv[-1]);
52 |     }
53 |   }
54 | 
55 |   if (argc == 0) {
56 |     return usage(argv[-1]);
57 |   }
58 | 
59 |   Expression *e;
60 |   if (!expr_parse(&e, argv[0])) {
61 |     return 2;
62 |   }
63 | 
64 |   const Real32 result = expr_eval32(&c, e);
65 |   expr_print(stdout, e);
66 |   printf("\n\tans: %.*f\n\terr: %.*f\n",
67 |     DBL_DIG - 1, float32_cast(result.value),
68 |     DBL_DIG - 1, float32_cast(result.eps));
69 |   expr_free(e);
70 | 
71 |   context_free(&c);
72 | 
73 |   return 0;
74 | }


--------------------------------------------------------------------------------
/real32.c:
--------------------------------------------------------------------------------
  1 | #include "real32.h"
  2 | #include "kernel32.h"
  3 | 
  4 | // When calculating error we don't want to muddy the value context. Use a copy
  5 | // of it with the same rounding and tininess mode ignoring everything else.
  6 | static inline Context eps_ctx(const Context *ctx) {
  7 |   Context c;
  8 |   context_copy(&c, ctx);
  9 |   return c;
 10 | }
 11 | 
 12 | Real32 real32_add(Context *ctx, Real32 a, Real32 b) {
 13 |   Context ec = eps_ctx(ctx);
 14 |   Real32 r;
 15 |   r.value = float32_add(ctx, a.value, b.value);
 16 |   r.eps = 
 17 |     float32_add(
 18 |       &ec,
 19 |       // err(a) + err(b)
 20 |       float32_add(&ec, a.eps, b.eps),
 21 |       // EPSILON * abs(value)
 22 |       float32_mul(&ec, FLOAT32_EPSILON, float32_abs(&ec, r.value)));
 23 |   return r;
 24 | }
 25 | 
 26 | Real32 real32_sub(Context *ctx, Real32 a, Real32 b) {
 27 |   Context ec = eps_ctx(ctx);
 28 |   Real32 r;
 29 |   r.value = float32_sub(ctx, a.value, b.value);
 30 |   r.eps = 
 31 |     float32_add(
 32 |       &ec,
 33 |       // err(a) + err(b)
 34 |       float32_add(&ec, a.eps, b.eps),
 35 |       // EPSILON * abs(value)
 36 |       float32_mul(&ec, FLOAT32_EPSILON, float32_abs(&ec, r.value)));
 37 |   return r;
 38 | }
 39 | 
 40 | Real32 real32_mul(Context *ctx, Real32 a, Real32 b) {
 41 |   Context ec = eps_ctx(ctx);
 42 |   Real32 r;
 43 |   r.value = float32_mul(ctx, a.value, b.value);
 44 |   r.eps = float32_add(
 45 |     &ec,
 46 |     float32_add(
 47 |       &ec,
 48 |       float32_add(
 49 |         &ec,
 50 |         // err(a) * abs(b)
 51 |         float32_mul(&ec, a.eps, float32_abs(&ec, b.value)),
 52 |         // err(b) * abs(a)
 53 |         float32_mul(&ec, b.eps, float32_abs(&ec, a.value))),
 54 |       // err(a) * err(b)
 55 |       float32_mul(&ec, a.eps, b.eps)),
 56 |     // EPSILON * abs(value)
 57 |     float32_mul(&ec, FLOAT32_EPSILON, float32_abs(&ec, r.value)));
 58 |   return r;
 59 | }
 60 | 
 61 | // Calculating division error is non-trivial when the divisor is inaccurate,
 62 | // use the following to recover inaccuracies for inaccurate divisor
 63 | // r^2(-x) - r*x + 0 = 0
 64 | Real32 real32_div(Context *ctx, Real32 a, Real32 b) {
 65 |   Context ec = eps_ctx(ctx);
 66 |   Real32 r;
 67 |   r.value = float32_div(ctx, a.value, b.value);
 68 |   
 69 |   const Float32 abs_b = float32_abs(&ec, b.value);
 70 |   const Float32 abs_r = float32_abs(&ec, r.value);
 71 |   Float32 e = 
 72 |     float32_div(
 73 |       &ec,
 74 |       float32_add(
 75 |         &ec,
 76 |         a.eps,
 77 |         // abs(r) * eps(b)
 78 |         float32_mul(&ec, abs_r, b.eps)),
 79 |       abs_b);
 80 |   
 81 |   // Use more accurate for inaccurate divisors.
 82 |   static const Float32 EPS = {LIT32(0x3c23d70a)}; // 0.01f
 83 |   if (float32_gt(&ec, b.eps, float32_mul(&ec, EPS, abs_b))) {
 84 |     const Float32 r = float32_div(&ec, b.eps, b.value);
 85 |     // e = e * (1 + (1 + r) * r)
 86 |     e = float32_mul(
 87 |       &ec,
 88 |       e,
 89 |       // 1 + (1 + r) * r
 90 |       float32_add(
 91 |         &ec,
 92 |         float32_from_sint32(&ec, 1),
 93 |         // (1 + r) * r
 94 |         float32_mul(
 95 |           &ec,
 96 |           // 1 + r
 97 |           float32_add(
 98 |             &ec,
 99 |             float32_from_sint32(&ec, 1),
100 |             r),
101 |           r)));
102 |   }
103 | 
104 |   r.eps = 
105 |     // e + (EPSILON * abs(value))
106 |     float32_add(
107 |       &ec,
108 |       e,
109 |       // EPSILON * abs(value)
110 |       float32_mul(&ec, FLOAT32_EPSILON, float32_abs(&ec, r.value)));
111 |   
112 |   return r;
113 | }
114 | 
115 | Real32 real32_sqrt(Context *ctx, Real32 x) {
116 |   Context ec = eps_ctx(ctx);
117 | 
118 |   // Calculate error.
119 |   Float32 d;
120 |   // Assume non-negative input.
121 |   if (float32_gte(&ec, x.value, FLOAT32_ZERO)) {
122 |     const Float32 r = float32_sqrt(&ec, x.value);
123 |     // if x > 10.0 * err(x)
124 |     const Float32 err = float32_mul(&ec, float32_from_sint32(&ec, 10), x.eps);
125 |     if (float32_gt(&ec, x.value, err)) {
126 |       // 0.5 * (err(x) / r)
127 |       d = float32_mul(&ec, FLOAT32_HALF, float32_div(&ec, x.eps, r));
128 |     } else {
129 |       // if x > err(x)
130 |       if (float32_gt(&ec, x.value, x.eps)) {
131 |         // r - sqrt(x - err(x))
132 |         d = float32_sub(&ec, r, float32_sqrt(&ec, float32_sub(&ec, x.value, x.eps)));
133 |       } else {
134 |         // max(r, sqrt(x + err(x)) - r)
135 |         d = float32_max(&ec, r, float32_sub(&ec, float32_sqrt(&ec, float32_add(&ec, x.value, x.eps)), r));
136 |       }
137 |     }
138 |     // d += EPSILON * abs(r)
139 |     d = float32_add(&ec, d, float32_mul(&ec, FLOAT32_EPSILON, float32_abs(&ec, r)));
140 |   } else {
141 |     // Assume negative input.
142 |     if (float32_lt(&ec, x.value, float32_mul(&ec, x.eps, FLOAT32_MINUS_ONE))) {
143 |       d = FLOAT32_NAN;
144 |     } else {
145 |       // Assume zero input.
146 |       d = float32_sqrt(&ec, x.eps);
147 |     }
148 |   }
149 | 
150 |   return (Real32){float32_sqrt(ctx, x.value), d};
151 | }
152 | 
// Operations that cannot generate error.
//
// Each macro below defines a real32_<name> function that forwards the value
// members of its operands to float32_<name>.

// Unary wrapper: the result carries float32_<name>(a.value) with a zero error
// term.
#define REAL32_WRAP1_NO_ERROR(name) \
  Real32 real32_ ## name(Context *ctx, Real32 a) { \
    return (Real32){float32_ ## name(ctx, a.value), {0}}; \
  }

// Binary wrapper: the result carries float32_<name>(a.value, b.value) with a
// zero error term. The operands' own error terms are not consulted.
#define REAL32_WRAP2_NO_ERROR(name) \
  Real32 real32_ ## name(Context *ctx, Real32 a, Real32 b) { \
    return (Real32){float32_ ## name(ctx, a.value, b.value), {0}}; \
  }

// Relational wrapper: yields REAL32_ONE when float32_<name> holds for the two
// values and REAL32_ZERO otherwise.
#define REAL32_WRAP_RELATION_NO_ERROR(name) \
  Real32 real32_ ## name(Context *ctx, Real32 a, Real32 b) { \
    return float32_ ## name(ctx, a.value, b.value) ? REAL32_ONE : REAL32_ZERO; \
  }

// Truncation.
REAL32_WRAP1_NO_ERROR(floor)
REAL32_WRAP1_NO_ERROR(ceil)
REAL32_WRAP1_NO_ERROR(trunc)

// Absolute value.
REAL32_WRAP1_NO_ERROR(abs)

// Sign handling and selection.
REAL32_WRAP2_NO_ERROR(copysign)
REAL32_WRAP2_NO_ERROR(max)
REAL32_WRAP2_NO_ERROR(min)

// Relational operators.
REAL32_WRAP_RELATION_NO_ERROR(eq)
REAL32_WRAP_RELATION_NO_ERROR(lte)
REAL32_WRAP_RELATION_NO_ERROR(lt)
REAL32_WRAP_RELATION_NO_ERROR(ne)
REAL32_WRAP_RELATION_NO_ERROR(gte)
REAL32_WRAP_RELATION_NO_ERROR(gt)


--------------------------------------------------------------------------------
/real32.h:
--------------------------------------------------------------------------------
#ifndef REAL32_H
#define REAL32_H
#include "float32.h"
#include "kernel32.h"

// Accumulative error accounting.
//
// The idea here is arithmetic results of soft float will always be close to
// the correct value +- 0.5 * EPSILON * value.
//
// That is:
//   err(a+b) = err(a) + err(b) + EPSILON * abs(a+b)
//
// The error result of an elementary floating-point operation does not exceed
// and is close to abs(result) * EPSILON.
typedef struct Real32 Real32;

// A single-precision value paired with its accumulated error bound.
struct Real32 {
  Float32 value; // The computed value.
  Float32 eps;   // The error term accumulated while computing the value.
};

// Constants with a zero error term. The eps member is spelled {0} here.
// Cannot use FLOAT32_ZERO here as it would be a non-const initializer in C.
#define REAL32_NAN        (Real32){FLOAT32_NAN,       {0}} //  NaN
#define REAL32_EPSILON    (Real32){FLOAT32_EPSILON,   {0}} //  0x0.000002p0
#define REAL32_ZERO       (Real32){FLOAT32_ZERO,      {0}} //  0.0
#define REAL32_HALF       (Real32){FLOAT32_HALF,      {0}} //  0.5
#define REAL32_ONE        (Real32){FLOAT32_ONE,       {0}} //  1.0
#define REAL32_MINUS_ONE  (Real32){FLOAT32_MINUS_ONE, {0}} // -1.0

// Arithmetic with error propagation. The first argument is the context the
// value is computed in.
Real32 real32_add(Context *ctx, Real32 a, Real32 b);
Real32 real32_sub(Context *ctx, Real32 a, Real32 b);
Real32 real32_mul(Context *ctx, Real32 a, Real32 b);
Real32 real32_div(Context *ctx, Real32 a, Real32 b);

// Helper macros that expand to the prototype of real32_<name>. They are only
// used for the declarations below and are undefined again afterwards.
#define REAL32_WRAP1_NO_ERROR(name) \
  Real32 real32_ ## name(Context *ctx, Real32 a)
#define REAL32_WRAP2_NO_ERROR(name) \
  Real32 real32_ ## name(Context *ctx, Real32 a, Real32 b)
#define REAL32_WRAP_RELATION_NO_ERROR(name) \
  Real32 real32_ ## name(Context *ctx, Real32 a, Real32 b)

// Operations that cannot produce errors.
// 1. Truncation.
REAL32_WRAP1_NO_ERROR(floor);
REAL32_WRAP1_NO_ERROR(ceil);
REAL32_WRAP1_NO_ERROR(trunc);
// 2. Absolute.
REAL32_WRAP1_NO_ERROR(abs);
// 3. Sign bit inspection.
REAL32_WRAP2_NO_ERROR(copysign);
REAL32_WRAP2_NO_ERROR(max);
REAL32_WRAP2_NO_ERROR(min);
// 4. Relational operators.
REAL32_WRAP_RELATION_NO_ERROR(eq);
REAL32_WRAP_RELATION_NO_ERROR(lte);
REAL32_WRAP_RELATION_NO_ERROR(lt);
REAL32_WRAP_RELATION_NO_ERROR(ne);
REAL32_WRAP_RELATION_NO_ERROR(gte);
REAL32_WRAP_RELATION_NO_ERROR(gt);

#undef REAL32_WRAP_RELATION_NO_ERROR
#undef REAL32_WRAP2_NO_ERROR
#undef REAL32_WRAP1_NO_ERROR

// Square root with error propagation.
Real32 real32_sqrt(Context*, Real32);

#endif // REAL32_H


--------------------------------------------------------------------------------
/soft.c:
--------------------------------------------------------------------------------
 1 | #include "float32.h"
 2 | #include "float64.h"
 3 | 
 4 | void context_init(Context* context) {
 5 |   context->exceptions = NULL;
 6 |   context->operations = NULL;
 7 |   context->roundings = 0;
 8 | }
 9 | 
10 | void context_free(Context* context) {
11 |   array_free(context->exceptions);
12 |   array_free(context->operations);
13 | }
14 | 
15 | void context_copy(Context* dst, const Context *src) {
16 |   context_init(dst);
17 |   dst->round = src->round;
18 |   dst->tininess = src->tininess;
19 | }
20 | 
21 | bool context_raise(Context *context, Exception exception) {
22 |   return array_push(context->exceptions, exception);
23 | }
24 | 
25 | static Float32 canonical_nan_to_float32(CanonicalNaN nan) {
26 |   return (Float32){(((Uint32)nan.sign) << 31) | LIT32(0x7FC00000) | (nan.hi >> 41)};
27 | }
28 | 
29 | static Float64 canonical_nan_to_float64(CanonicalNaN nan) {
30 |   return (Float64){(((Uint64)nan.sign) << 63) | LIT64(0x7FF8000000000000) | (nan.hi >> 12)};
31 | }
32 | 
33 | Float32 float64_to_float32(Context *ctx, Float64 a) {
34 |   Uint64 a_sig = float64_fract(a);
35 |   Sint16 a_exp = float64_exp(a);
36 |   Flag a_sign = float64_sign(a);
37 |   if (a_exp == 0x7ff) {
38 |     return a_sig 
39 |       ? canonical_nan_to_float32(float64_to_canonical_nan(ctx, a))
40 |       : float32_pack(a_sign, 0xff, 0);
41 |   }
42 |   a_sig = rshr64(a_sig, 22);
43 |   Uint32 sig = a_sig;
44 |   if (a_exp || sig) {
45 |     sig |= LIT32(0x40000000);
46 |     a_exp -= 0x381;
47 |   }
48 |   return float32_round_and_pack(ctx, a_sign, a_exp, sig);
49 | }
50 | 
51 | Float64 float32_to_float64(Context *ctx, Float32 a) {
52 |   Uint32 a_sig = float32_fract(a);
53 |   Sint16 a_exp = float32_exp(a);
54 |   Flag a_sign = float32_sign(a);
55 |   if (a_exp == 0xff) {
56 |     return a_sig 
57 |       ? canonical_nan_to_float64(float32_to_canonical_nan(ctx, a))
58 |       : float64_pack(a_sign, 0x7ff, 0);
59 |   }
60 |   if (a_exp == 0) {
61 |     if (a_sig == 0) {
62 |       return float64_pack(a_sign, 0, 0);
63 |     }
64 |     Normal32 normal = float32_normalize_subnormal(a_sig);
65 |     a_exp = normal.exp;
66 |     a_sig = normal.sig;
67 |     a_exp--;
68 |   }
69 |   return float64_pack(a_sign, a_exp + 0x380, (Uint64)a_sig << 29);
70 | }


--------------------------------------------------------------------------------
/soft.h:
--------------------------------------------------------------------------------
#ifndef SOFT_H
#define SOFT_H
#include "array.h"

// Small integer used for sign bits and similar yes/no values.
typedef Sint8 Flag;

typedef enum Round Round;
typedef enum Exception Exception;
typedef enum Tininess Tininess;
typedef enum Operation Operation;

typedef struct Context Context;

typedef struct Float32 Float32;
typedef struct Float64 Float64;
typedef struct Normal32 Normal32;
typedef struct Normal64 Normal64;

typedef struct CanonicalNaN CanonicalNaN;

// Raw bit pattern of a single-precision value.
struct Float32 {
  Uint32 bits;
};

// Raw bit pattern of a double-precision value.
struct Float64 {
  Uint64 bits;
};

// Significand and exponent pair, as produced when normalizing a
// single-precision subnormal.
struct Normal32 {
  Uint32 sig;
  Sint16 exp;
};

// Significand and exponent pair for double precision.
struct Normal64 {
  Uint64 sig;
  Sint16 exp;
};

// Canonical NaN format for conversion between NaNs in different precisions.
struct CanonicalNaN {
  Flag sign;
  Uint64 hi;
  Uint64 lo;
};

// Rounding modes. The enumerator values are 0..3 in declaration order.
enum Round {
  ROUND_NEAREST_EVEN,
  ROUND_TO_ZERO,
  ROUND_DOWN,
  ROUND_UP
};

// Exceptions an operation can raise; each enumerator is a distinct bit.
enum Exception {
  EXCEPTION_INEXACT    = 1 << 0,
  EXCEPTION_UNDERFLOW  = 1 << 1,
  EXCEPTION_OVERFLOW   = 1 << 2,
  EXCEPTION_INFINITE   = 1 << 3,
  EXCEPTION_INVALID    = 1 << 4
};

// Tininess detection modes. Note the values: AFTER_ROUNDING is 0 and
// BEFORE_ROUNDING is 1, so a user-facing number must be mapped explicitly
// when it is ordered the other way round.
enum Tininess {
  TININESS_AFTER_ROUNDING,
  TININESS_BEFORE_ROUNDING
};

// Kinds of arithmetic operation recorded in a context.
enum Operation {
  OPERATION_ADD,
  OPERATION_SUB,
  OPERATION_MUL,
  OPERATION_DIV
};

// State threaded through every soft-float operation.
struct Context {
  Round round; ///< Rounding mode.
  Size roundings; ///< Zeroed by context_init; presumably counts roundings - TODO confirm.
  ARRAY(Exception) exceptions; ///< Array of flags of triggered exceptions.
  ARRAY(Operation) operations; ///< Array of all operations carried out.
  Tininess tininess; ///< Tininess detection mode.
};

// Resets the record arrays and rounding counter; leaves the modes untouched.
void context_init(Context* context);
// Releases the record arrays.
void context_free(Context* context);
// Initializes dst and copies only the rounding and tininess modes from src.
void context_copy(Context* dst, const Context *src);
// Appends exception to the context's exception array.
bool context_raise(Context *context, Exception exception);
 85 | 
 86 | // Special right shifts where the least significant bit of result is set when
 87 | // any non-zero bits are shifted off.
 88 | static inline Uint32 rshr32(Uint32 a, Sint16 count) {
 89 |   if (count == 0) {
 90 |     return a;
 91 |   } else if (count < 32) {
 92 |     return (a >> count) | ((a << ((-count) & 31)) != 0);
 93 |   }
 94 |   return a != 0 ? 1 : 0;
 95 | }
 96 | 
 97 | static inline Uint64 rshr64(Uint64 a, Sint16 count) {
 98 |   if (count == 0) {
 99 |     return a;
100 |   } else if (count < 64) {
101 |     return (a >> count) | ((a << ((-count) & 63)) != 0);
102 |   }
103 |   return a != 0 ? 1 : 0;
104 | }
105 | 
// Conversions between the two precisions. NaNs are converted through the
// canonical NaN form.
Float32 float64_to_float32(Context*, Float64);
Float64 float32_to_float64(Context*, Float32);

#endif // SOFT_H


--------------------------------------------------------------------------------
/types.h:
--------------------------------------------------------------------------------
// Project-wide aliases for the fixed-width integer, boolean and size types,
// plus helpers for writing fixed-width literals.
#ifndef TYPES_H
#define TYPES_H
#include <stdint.h> // u?int{8,16,32,64}_t
#include <stdbool.h> // bool, true, false
#include <stddef.h> // size_t

// Unsigned fixed-width integers.
typedef uint8_t Uint8;
typedef uint16_t Uint16;
typedef uint32_t Uint32;
typedef uint64_t Uint64;

// Signed fixed-width integers.
typedef int8_t Sint8;
typedef int16_t Sint16;
typedef int32_t Sint32;
typedef int64_t Sint64;

typedef bool Bool;

typedef size_t Size;

// Literal helpers: append the ul / ull suffix to the constant x and cast the
// result to the 32-bit / 64-bit unsigned type. x must be a bare integer
// constant because the suffix is token-pasted onto it.
#define LIT32(x) ((Uint32)x ## ul)
#define LIT64(x) ((Uint64)x ## ull)

#endif // TYPES_H


--------------------------------------------------------------------------------
/uint128.c:
--------------------------------------------------------------------------------
 1 | #include "uint128.h"
 2 | 
 3 | Uint128 uint128_mul64x64(Uint64 a, Uint64 b) {
 4 |   const Uint32 al = a;
 5 |   const Uint32 ah = a >> 32;
 6 |   const Uint32 bl = b;
 7 |   const Uint32 bh = b >> 32;
 8 |   Uint64 z0, z1;
 9 |   Uint64 ma, mb;
10 |   z1 = (Uint64)al * bl;
11 |   ma = (Uint64)al * bh;
12 |   mb = (Uint64)ah * bl;
13 |   z0 = (Uint64)ah * bh;
14 |   ma += mb;
15 |   z0 += ((Uint64)(ma < mb) << 32) + (ma >> 32);
16 |   ma <<= 32;
17 |   z1 += ma;
18 |   z0 += z1 < ma;
19 |   return (Uint128){z0, z1};
20 | }
21 | 
22 | Uint64 uint128_div128x64(Uint128 a, Uint64 b) {
23 |   if (b <= a.z0) {
24 |     return LIT64(0xFFFFFFFFFFFFFFFF);
25 |   }
26 | 
27 |   Uint64 b0 = b >> 32;
28 |   Uint64 b1;
29 | 
30 |   Uint64 z = (b0 << 32 <= a.z0)
31 |     ? LIT64(0xFFFFFFFF00000000) : (a.z0 / b0) << 32;
32 | 
33 |   Uint128 mul = uint128_mul64x64(b, z);
34 |   Uint128 rem = uint128_sub(a, mul);
35 | 
36 |   while ((Sint64)rem.z0 < 0) {
37 |     z -= LIT64(0x100000000);
38 |     b1 = b << 32;
39 |     rem = uint128_add(rem, (Uint128){b0, b1});
40 |   }
41 |   rem.z0 = (rem.z0 << 32) | (rem.z1 >> 32);
42 | 
43 |   z |= (b0 << 32 <= rem.z0) ? LIT32(0xffffffff) : rem.z0 / b0;
44 | 
45 |   return z;
46 | }


--------------------------------------------------------------------------------
/uint128.h:
--------------------------------------------------------------------------------
#ifndef UINT128_H
#define UINT128_H
#include "types.h"

typedef struct Uint128 Uint128;

// 128-bit unsigned integer stored as two 64-bit words.
struct Uint128 {
  Uint64 z0; // High 64 bits.
  Uint64 z1; // Low 64 bits.
};

// Multiplies two 64-bit integers to obtain a 128-bit product.
Uint128 uint128_mul64x64(Uint64 a, Uint64 b);

// Calculate approximation to the 64-bit integer quotient obtained by dividing
// 64-bit b into the 128-bit a. The divisor b must be at least 2^63.
// Returns all ones when b <= a.z0.
Uint64 uint128_div128x64(Uint128 a, Uint64 b);
18 | 
19 | // Subtraction is modulo 2^128
20 | static inline Uint128 uint128_sub(Uint128 a, Uint128 b) {
21 |   const Uint64 z1 = a.z1 - b.z1;
22 |   return (Uint128){a.z0 - b.z0 - z1, z1};
23 | }
24 | 
25 | // Addition is modulo 2^128
26 | static inline Uint128 uint128_add(Uint128 a, Uint128 b) {
27 |   const Uint64 z1 = a.z1 + b.z1;
28 |   return (Uint128){a.z0 + b.z0 + (z1 < a.z1), z1};
29 | }
30 | 
31 | #endif // UINT128_H


--------------------------------------------------------------------------------