├── include ├── fp16.h └── fp16 │ ├── macros.h │ ├── bitcasts.h │ └── fp16.h ├── .gitignore ├── cmake ├── DownloadGoogleTest.cmake └── DownloadGoogleBenchmark.cmake ├── test ├── tables.h ├── bitcasts.cc ├── alt-to-fp32-bits.cc ├── ieee-to-fp32-bits.cc ├── alt-to-fp32-value.cc ├── ieee-to-fp32-value.cc ├── alt-from-fp32-value.cc └── ieee-from-fp32-value.cc ├── LICENSE ├── bench ├── to-alt-array.cc ├── alt-element.cc ├── from-alt-array.cc ├── ieee-element.cc ├── to-ieee-array.cc └── from-ieee-array.cc ├── README.md ├── third-party ├── float16-compressor.h ├── THHalf.h ├── eigen-half.h └── npy-halffloat.h ├── .github └── workflows │ └── cmake.yml └── CMakeLists.txt /include/fp16.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #ifndef FP16_H 3 | #define FP16_H 4 | 5 | #include 6 | 7 | #endif /* FP16_H */ 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ninja files 2 | build.ninja 3 | 4 | # Build objects and artifacts 5 | deps/ 6 | build/ 7 | bin/ 8 | lib/ 9 | *.pyc 10 | *.pyo 11 | 12 | # System files 13 | .DS_Store 14 | .DS_Store? 
15 | ._* 16 | .Spotlight-V100 17 | .Trashes 18 | ehthumbs.db 19 | Thumbs.db 20 | -------------------------------------------------------------------------------- /cmake/DownloadGoogleTest.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.16 FATAL_ERROR) 2 | 3 | PROJECT(googletest-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(googletest 7 | URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip 8 | URL_HASH SHA256=1f357c27ca988c3f7c6b4bf68a9395005ac6761f034046e9dde0896e3aba00e4 9 | SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-source" 10 | BINARY_DIR "${CMAKE_BINARY_DIR}/googletest" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadGoogleBenchmark.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.16 FATAL_ERROR) 2 | 3 | PROJECT(googlebenchmark-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(googlebenchmark 7 | URL https://github.com/google/benchmark/archive/refs/tags/v1.8.4.zip 8 | URL_HASH SHA256=84c49c4c07074f36fbf8b4f182ed7d75191a6fa72756ab4a17848455499f4286 9 | SOURCE_DIR "${CMAKE_BINARY_DIR}/googlebenchmark-source" 10 | BINARY_DIR "${CMAKE_BINARY_DIR}/googlebenchmark" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /test/tables.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | namespace fp16 { 5 | /* FP32 conversion results for FP16 numbers in range [1.0h, 2.0h) */ 6 | extern const uint32_t normalizedValues[1024]; 7 | /* FP32 conversion results for FP16 numbers in range [0.0h, HALF_MIN) */ 8 | extern const 
uint32_t denormalizedValues[1024]; 9 | /* FP32 numbers such that FP16(fp32) < as_half(as_uint16(1.0h) | index) for fp32 < normalizedRanges[index] */ 10 | extern const uint32_t normalizedRanges[1024]; 11 | /* FP32 numbers such that FP16(fp32) < as_half(index) for fp32 < denormalizedRanges[index] */ 12 | extern const uint32_t denormalizedRanges[1024]; 13 | } 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Facebook Inc. 4 | Copyright (c) 2017 Georgia Institute of Technology 5 | Copyright 2019 Google LLC 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
12 | -------------------------------------------------------------------------------- /bench/to-alt-array.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | static void fp16_alt_from_fp32_value(benchmark::State& state) { 13 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 14 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 15 | 16 | std::vector fp32(state.range(0)); 17 | std::vector fp16(state.range(0)); 18 | std::generate(fp32.begin(), fp32.end(), std::ref(rng)); 19 | 20 | while (state.KeepRunning()) { 21 | float* input = fp32.data(); 22 | benchmark::DoNotOptimize(input); 23 | 24 | uint16_t* output = fp16.data(); 25 | const size_t n = state.range(0); 26 | for (size_t i = 0; i < n; i++) { 27 | output[i] = fp16_alt_from_fp32_value(input[i]); 28 | } 29 | 30 | benchmark::DoNotOptimize(output); 31 | } 32 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 33 | } 34 | BENCHMARK(fp16_alt_from_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20); 35 | 36 | BENCHMARK_MAIN(); 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FP16 2 | Header-only library for conversion to/from half-precision floating point formats 3 | 4 | ## Features 5 | 6 | - Supports IEEE and ARM alternative half-precision floating-point formats 7 | - Properly converts infinities and NaNs 8 | - Properly converts denormal numbers, even on systems without denormal support 9 | - Header-only library, no installation or build required 10 | - Compatible with C99 and C++11 11 | - Fully covered with unit tests and microbenchmarks 12 | 13 | ## Acknowledgements 14 | 15 | [![HPC Garage
logo](https://github.com/Maratyszcza/PeachPy/blob/master/logo/hpcgarage.png)](http://hpcgarage.org) 16 | [![Georgia Tech College of Computing logo](https://github.com/Maratyszcza/PeachPy/blob/master/logo/college-of-computing.gif)](http://www.cse.gatech.edu/) 17 | 18 | The library is developed by [Marat Dukhan](http://www.maratdukhan.com) of Georgia Tech. FP16 is a research project at [Richard Vuduc](http://vuduc.org)'s HPC Garage lab in the Georgia Institute of Technology, College of Computing, School of Computational Science and Engineering. 19 | 20 | This material is based upon work supported by the U.S. National Science Foundation (NSF) Award Number 1339745. Any opinions, findings and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect those of NSF. 21 | -------------------------------------------------------------------------------- /bench/alt-element.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | static inline uint16_t next_xorshift16(uint16_t x) { 6 | x ^= x >> 8; 7 | x ^= x << 9; 8 | x ^= x >> 5; 9 | return x; 10 | } 11 | 12 | static inline uint32_t next_xorshift32(uint32_t x) { 13 | x ^= x >> 13; 14 | x ^= x << 17; 15 | x ^= x >> 5; 16 | return x; 17 | } 18 | 19 | 20 | static void fp16_alt_to_fp32_bits(benchmark::State& state) { 21 | uint16_t fp16 = UINT16_C(0x7C00); 22 | while (state.KeepRunning()) { 23 | uint32_t fp32 = fp16_alt_to_fp32_bits(fp16); 24 | 25 | fp16 = next_xorshift16(fp16); 26 | benchmark::DoNotOptimize(fp32); 27 | } 28 | } 29 | BENCHMARK(fp16_alt_to_fp32_bits); 30 | 31 | static void fp16_alt_to_fp32_value(benchmark::State& state) { 32 | uint16_t fp16 = UINT16_C(0x7C00); 33 | while (state.KeepRunning()) { 34 | float fp32 = fp16_alt_to_fp32_value(fp16); 35 | 36 | fp16 = next_xorshift16(fp16); 37 | benchmark::DoNotOptimize(fp32); 38 | } 39 | } 40 | BENCHMARK(fp16_alt_to_fp32_value); 41 | 42 | static void 
fp16_alt_from_fp32_value(benchmark::State& state) { 43 | uint32_t fp32 = UINT32_C(0x7F800000); 44 | while (state.KeepRunning()) { 45 | uint16_t fp16 = fp16_alt_from_fp32_value(fp32_from_bits(fp32)); 46 | 47 | fp32 = next_xorshift32(fp32); 48 | benchmark::DoNotOptimize(fp16); 49 | } 50 | } 51 | BENCHMARK(fp16_alt_from_fp32_value); 52 | 53 | BENCHMARK_MAIN(); 54 | -------------------------------------------------------------------------------- /include/fp16/macros.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #ifndef FP16_MACROS_H 3 | #define FP16_MACROS_H 4 | 5 | #ifndef FP16_USE_NATIVE_CONVERSION 6 | #if (defined(__INTEL_COMPILER) || defined(__GNUC__)) && defined(__F16C__) 7 | #define FP16_USE_NATIVE_CONVERSION 1 8 | #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__) 9 | #define FP16_USE_NATIVE_CONVERSION 1 10 | #elif defined(_MSC_VER) && defined(_M_ARM64) 11 | #define FP16_USE_NATIVE_CONVERSION 1 12 | #elif defined(__GNUC__) && defined(__aarch64__) 13 | #define FP16_USE_NATIVE_CONVERSION 1 14 | #endif 15 | #if !defined(FP16_USE_NATIVE_CONVERSION) 16 | #define FP16_USE_NATIVE_CONVERSION 0 17 | #endif // !defined(FP16_USE_NATIVE_CONVERSION) 18 | #endif // !defined(FP16_USE_NATIVE_CONVERSION) 19 | 20 | #ifndef FP16_USE_FLOAT16_TYPE 21 | #if !defined(__clang__) && !defined(__INTEL_COMPILER) && defined(__GNUC__) && (__GNUC__ >= 12) 22 | #if defined(__F16C__) 23 | #define FP16_USE_FLOAT16_TYPE 1 24 | #endif 25 | #endif 26 | #if !defined(FP16_USE_FLOAT16_TYPE) 27 | #define FP16_USE_FLOAT16_TYPE 0 28 | #endif // !defined(FP16_USE_FLOAT16_TYPE) 29 | #endif // !defined(FP16_USE_FLOAT16_TYPE) 30 | 31 | #ifndef FP16_USE_FP16_TYPE 32 | #if defined(__clang__) 33 | #if defined(__F16C__) || defined(__aarch64__) 34 | #define FP16_USE_FP16_TYPE 1 35 | #endif 36 | #elif defined(__GNUC__) 37 | #if defined(__aarch64__) 38 | #define FP16_USE_FP16_TYPE 1 39 | #endif 40 | #endif 41 | #if
!defined(FP16_USE_FP16_TYPE) 42 | #define FP16_USE_FP16_TYPE 0 43 | #endif // !defined(FP16_USE_FP16_TYPE) 44 | #endif // !defined(FP16_USE_FP16_TYPE) 45 | 46 | #endif /* FP16_MACROS_H */ 47 | -------------------------------------------------------------------------------- /bench/from-alt-array.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | static void fp16_alt_to_fp32_bits(benchmark::State& state) { 13 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 14 | auto rng = std::bind(std::uniform_int_distribution(0, 0x7BFF), std::mt19937(seed)); 15 | 16 | std::vector fp16(state.range(0)); 17 | std::vector fp32(state.range(0)); 18 | std::generate(fp16.begin(), fp16.end(), 19 | [&rng]{ return fp16_alt_from_fp32_value(rng()); }); 20 | 21 | while (state.KeepRunning()) { 22 | uint16_t* input = fp16.data(); 23 | benchmark::DoNotOptimize(input); 24 | 25 | uint32_t* output = fp32.data(); 26 | const size_t n = state.range(0); 27 | for (size_t i = 0; i < n; i++) { 28 | output[i] = fp16_alt_to_fp32_bits(input[i]); 29 | } 30 | 31 | benchmark::DoNotOptimize(output); 32 | } 33 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 34 | } 35 | BENCHMARK(fp16_alt_to_fp32_bits)->RangeMultiplier(2)->Range(1<<10, 64<<20); 36 | 37 | static void fp16_alt_to_fp32_value(benchmark::State& state) { 38 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 39 | auto rng = std::bind(std::uniform_int_distribution(0, 0x7BFF), std::mt19937(seed)); 40 | 41 | std::vector fp16(state.range(0)); 42 | std::vector fp32(state.range(0)); 43 | std::generate(fp16.begin(), fp16.end(), 44 | [&rng]{ return fp16_alt_from_fp32_value(rng()); }); 45 | 46 | while (state.KeepRunning()) { 47 | uint16_t* input = fp16.data(); 48 | benchmark::DoNotOptimize(input); 49 | 50 | 
float* output = fp32.data(); 51 | const size_t n = state.range(0); 52 | for (size_t i = 0; i < n; i++) { 53 | output[i] = fp16_alt_to_fp32_value(input[i]); 54 | } 55 | 56 | benchmark::DoNotOptimize(output); 57 | } 58 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 59 | } 60 | BENCHMARK(fp16_alt_to_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20); 61 | 62 | BENCHMARK_MAIN(); 63 | -------------------------------------------------------------------------------- /third-party/float16-compressor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* 4 | * This code snippet posted by user Phernost on 5 | * https://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion 6 | * 7 | * compress and decompress methods are made "inline" for performance 8 | */ 9 | 10 | class Float16Compressor 11 | { 12 | union Bits 13 | { 14 | float f; 15 | int32_t si; 16 | uint32_t ui; 17 | }; 18 | 19 | static int const shift = 13; 20 | static int const shiftSign = 16; 21 | 22 | static int32_t const infN = 0x7F800000; // flt32 infinity 23 | static int32_t const maxN = 0x477FE000; // max flt16 normal as a flt32 24 | static int32_t const minN = 0x38800000; // min flt16 normal as a flt32 25 | static int32_t const signN = 0x80000000; // flt32 sign bit 26 | 27 | static int32_t const infC = infN >> shift; 28 | static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 29 | static int32_t const maxC = maxN >> shift; 30 | static int32_t const minC = minN >> shift; 31 | static int32_t const signC = signN >> shiftSign; // flt16 sign bit 32 | 33 | static int32_t const mulN = 0x52000000; // (1 << 23) / minN 34 | static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift)) 35 | 36 | static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted 37 | static int32_t const norC = 0x00400; // min flt32 normal down shifted 38 | 39 | static int32_t const 
maxD = infC - maxC - 1; 40 | static int32_t const minD = minC - subC - 1; 41 | 42 | public: 43 | 44 | inline static uint16_t compress(float value) 45 | { 46 | Bits v, s; 47 | v.f = value; 48 | uint32_t sign = v.si & signN; 49 | v.si ^= sign; 50 | sign >>= shiftSign; // logical shift 51 | s.si = mulN; 52 | s.si = s.f * v.f; // correct subnormals 53 | v.si ^= (s.si ^ v.si) & -(minN > v.si); 54 | v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); 55 | v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); 56 | v.ui >>= shift; // logical shift 57 | v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); 58 | v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); 59 | return v.ui | sign; 60 | } 61 | 62 | inline static float decompress(uint16_t value) 63 | { 64 | Bits v; 65 | v.ui = value; 66 | int32_t sign = v.si & signC; 67 | v.si ^= sign; 68 | sign <<= shiftSign; 69 | v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); 70 | v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); 71 | Bits s; 72 | s.si = mulC; 73 | s.f *= v.si; 74 | int32_t mask = -(norC > v.si); 75 | v.si <<= shift; 76 | v.si ^= (s.si ^ v.si) & mask; 77 | v.si |= sign; 78 | return v.f; 79 | } 80 | }; -------------------------------------------------------------------------------- /third-party/THHalf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This implementation is extracted from PyTorch: 3 | * Repo: github.com/pytorch/pytorch 4 | * File: torch/lib/TH/THHalf.c 5 | * Commit ID: 92481b59d31199df57420d4b14912348cc780d1d 6 | * Functions are made "static inline" for performance 7 | */ 8 | 9 | /* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 
*/ 10 | 11 | // Host functions for converting between FP32 and FP16 formats 12 | 13 | static inline void TH_halfbits2float(unsigned short* src, float* res) 14 | { 15 | unsigned h = *src; 16 | unsigned sign = ((h >> 15) & 1); 17 | unsigned exponent = ((h >> 10) & 0x1f); 18 | unsigned mantissa = ((h & 0x3ff) << 13); 19 | 20 | if (exponent == 0x1f) { /* NaN or Inf */ 21 | mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); 22 | exponent = 0xff; 23 | } else if (!exponent) { /* Denorm or Zero */ 24 | if (mantissa) { 25 | unsigned int msb; 26 | exponent = 0x71; 27 | do { 28 | msb = (mantissa & 0x400000); 29 | mantissa <<= 1; /* normalize */ 30 | --exponent; 31 | } while (!msb); 32 | mantissa &= 0x7fffff; /* 1.mantissa is implicit */ 33 | } 34 | } else { 35 | exponent += 0x70; 36 | } 37 | 38 | *(unsigned*)res = ((sign << 31) | (exponent << 23) | mantissa); 39 | } 40 | 41 | static inline void TH_float2halfbits(float* src, unsigned short* dest) 42 | { 43 | unsigned x = *(unsigned*)src; 44 | unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; 45 | unsigned sign, exponent, mantissa; 46 | 47 | // Get rid of +NaN/-NaN case first. 48 | if (u > 0x7f800000) { 49 | *dest = 0x7fffU; 50 | return ; 51 | } 52 | 53 | sign = ((x >> 16) & 0x8000); 54 | 55 | // Get rid of +Inf/-Inf, +0/-0. 56 | if (u > 0x477fefff) { 57 | *dest = sign | 0x7c00U; 58 | return; 59 | } 60 | if (u < 0x33000001) { 61 | *dest = (sign | 0x0000); 62 | return; 63 | } 64 | 65 | exponent = ((u >> 23) & 0xff); 66 | mantissa = (u & 0x7fffff); 67 | 68 | if (exponent > 0x70) { 69 | shift = 13; 70 | exponent -= 0x70; 71 | } else { 72 | shift = 0x7e - exponent; 73 | exponent = 0; 74 | mantissa |= 0x800000; 75 | } 76 | lsb = (1 << shift); 77 | lsb_s1 = (lsb >> 1); 78 | lsb_m1 = (lsb - 1); 79 | 80 | // Round to nearest even. 
81 | remainder = (mantissa & lsb_m1); 82 | mantissa >>= shift; 83 | if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { 84 | ++mantissa; 85 | if (!(mantissa & 0x3ff)) { 86 | ++exponent; 87 | mantissa = 0; 88 | } 89 | } 90 | 91 | *dest = (sign | (exponent << 10) | mantissa); 92 | } 93 | -------------------------------------------------------------------------------- /include/fp16/bitcasts.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #ifndef FP16_BITCASTS_H 3 | #define FP16_BITCASTS_H 4 | 5 | #if defined(__cplusplus) && (__cplusplus >= 201103L) 6 | #include 7 | #elif !defined(__OPENCL_VERSION__) 8 | #include 9 | #endif 10 | 11 | #if defined(__INTEL_COMPILER) || defined(_MSC_VER) && (_MSC_VER >= 1932) && (defined(_M_IX86) || defined(_M_X64)) 12 | #include 13 | #endif 14 | 15 | #if defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM) || defined(_M_ARM64)) 16 | #include 17 | #endif 18 | 19 | #if defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) 20 | #include 21 | #endif 22 | 23 | 24 | static inline float fp32_from_bits(uint32_t w) { 25 | #if defined(__OPENCL_VERSION__) 26 | return as_float(w); 27 | #elif defined(__CUDA_ARCH__) 28 | return __uint_as_float((unsigned int) w); 29 | #elif defined(__INTEL_COMPILER) || defined(_MSC_VER) && (_MSC_VER >= 1932) && (defined(_M_IX86) || defined(_M_X64)) 30 | return _castu32_f32(w); 31 | #elif defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM) || defined(_M_ARM64)) 32 | return _CopyFloatFromInt32((__int32) w); 33 | #else 34 | union { 35 | uint32_t as_bits; 36 | float as_value; 37 | } fp32 = { w }; 38 | return fp32.as_value; 39 | #endif 40 | } 41 | 42 | static inline uint32_t fp32_to_bits(float f) { 43 | #if defined(__OPENCL_VERSION__) 44 | return as_uint(f); 45 | #elif defined(__CUDA_ARCH__) 46 | return (uint32_t) __float_as_uint(f); 47 | #elif defined(__INTEL_COMPILER) || defined(_MSC_VER) && (_MSC_VER >= 1932) && 
(defined(_M_IX86) || defined(_M_X64)) 48 | return _castf32_u32(f); 49 | #elif defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM) || defined(_M_ARM64)) 50 | return (uint32_t) _CopyInt32FromFloat(f); 51 | #else 52 | union { 53 | float as_value; 54 | uint32_t as_bits; 55 | } fp32 = { f }; 56 | return fp32.as_bits; 57 | #endif 58 | } 59 | 60 | static inline double fp64_from_bits(uint64_t w) { 61 | #if defined(__OPENCL_VERSION__) 62 | return as_double(w); 63 | #elif defined(__CUDA_ARCH__) 64 | return __longlong_as_double((long long) w); 65 | #elif defined(__INTEL_COMPILER) || defined(_MSC_VER) && (_MSC_VER >= 1932) && (defined(_M_IX86) || defined(_M_X64)) 66 | return _castu64_f64(w); 67 | #elif defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM) || defined(_M_ARM64)) 68 | return _CopyDoubleFromInt64((__int64) w); 69 | #else 70 | union { 71 | uint64_t as_bits; 72 | double as_value; 73 | } fp64 = { w }; 74 | return fp64.as_value; 75 | #endif 76 | } 77 | 78 | static inline uint64_t fp64_to_bits(double f) { 79 | #if defined(__OPENCL_VERSION__) 80 | return as_ulong(f); 81 | #elif defined(__CUDA_ARCH__) 82 | return (uint64_t) __double_as_longlong(f); 83 | #elif defined(__INTEL_COMPILER) || defined(_MSC_VER) && (_MSC_VER >= 1932) && (defined(_M_IX86) || defined(_M_X64)) 84 | return _castf64_u64(f); 85 | #elif defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM) || defined(_M_ARM64)) 86 | return (uint64_t) _CopyInt64FromDouble(f); 87 | #else 88 | union { 89 | double as_value; 90 | uint64_t as_bits; 91 | } fp64 = { f }; 92 | return fp64.as_bits; 93 | #endif 94 | } 95 | 96 | #endif /* FP16_BITCASTS_H */ 97 | -------------------------------------------------------------------------------- /test/bitcasts.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | 10 | TEST(FP32_TO_BITS, positive) { 11 | for (uint32_t bits = UINT32_C(0x00000000); bits 
<= UINT32_C(0x7F800000); bits++) { 12 | float value; 13 | memcpy(&value, &bits, sizeof(value)); 14 | 15 | ASSERT_EQ(bits, fp32_to_bits(value)) << 16 | std::hex << std::uppercase << std::setfill('0') << 17 | "BITS = 0x" << std::setw(8) << bits << ", " << 18 | "BITCAST(VALUE) = 0x" << std::setw(8) << fp32_to_bits(value); 19 | } 20 | } 21 | 22 | TEST(FP32_TO_BITS, negative) { 23 | for (uint32_t bits = UINT32_C(0xFF800000); bits >= UINT32_C(0x80000000); bits--) { 24 | float value; 25 | memcpy(&value, &bits, sizeof(value)); 26 | 27 | ASSERT_EQ(bits, fp32_to_bits(value)) << 28 | std::hex << std::uppercase << std::setfill('0') << 29 | "BITS = 0x" << std::setw(8) << bits << ", " << 30 | "BITCAST(VALUE) = 0x" << std::setw(8) << fp32_to_bits(value); 31 | } 32 | } 33 | 34 | TEST(FP32_TO_BITS, nan) { 35 | for (uint32_t bits = UINT32_C(0x7F800001); bits <= UINT32_C(0x7FFFFFFF); bits++) { 36 | float value; 37 | memcpy(&value, &bits, sizeof(value)); 38 | 39 | ASSERT_GT(fp32_to_bits(value) & UINT32_C(0x7FFFFFFF), UINT32_C(0x7F800000)) << 40 | std::hex << std::uppercase << std::setfill('0') << 41 | "BITS = 0x" << std::setw(8) << bits << ", " << 42 | "BITCAST(VALUE) = 0x" << std::setw(8) << fp32_to_bits(value); 43 | } 44 | 45 | for (uint32_t bits = UINT32_C(0xFFFFFFFF); bits >= UINT32_C(0xFF800001); bits--) { 46 | float value; 47 | memcpy(&value, &bits, sizeof(value)); 48 | 49 | ASSERT_GT(fp32_to_bits(value) & UINT32_C(0x7FFFFFFF), UINT32_C(0x7F800000)) << 50 | std::hex << std::uppercase << std::setfill('0') << 51 | "BITS = 0x" << std::setw(8) << bits << ", " << 52 | "BITCAST(VALUE) = 0x" << std::setw(8) << fp32_to_bits(value); 53 | } 54 | } 55 | 56 | TEST(FP32_FROM_BITS, positive) { 57 | for (uint32_t bits = UINT32_C(0x00000000); bits <= UINT32_C(0x7F800000); bits++) { 58 | const float value = fp32_from_bits(bits); 59 | uint32_t bitcast; 60 | memcpy(&bitcast, &value, sizeof(bitcast)); 61 | 62 | ASSERT_EQ(bits, bitcast) << 63 | std::hex << std::uppercase << std::setfill('0') << 64 | 
"BITS = 0x" << std::setw(8) << bits << ", " << 65 | "VALUE = 0x" << std::setw(8) << bitcast; 66 | } 67 | } 68 | 69 | TEST(FP32_FROM_BITS, negative) { 70 | for (uint32_t bits = UINT32_C(0xFF800000); bits >= UINT32_C(0x80000000); bits--) { 71 | const float value = fp32_from_bits(bits); 72 | uint32_t bitcast; 73 | memcpy(&bitcast, &value, sizeof(bitcast)); 74 | 75 | ASSERT_EQ(bits, bitcast) << 76 | std::hex << std::uppercase << std::setfill('0') << 77 | "BITS = 0x" << std::setw(8) << bits << ", " << 78 | "VALUE = 0x" << std::setw(8) << bitcast; 79 | } 80 | } 81 | 82 | TEST(FP32_FROM_BITS, nan) { 83 | for (uint32_t bits = UINT32_C(0x7F800001); bits <= UINT32_C(0x7FFFFFFF); bits++) { 84 | const float value = fp32_from_bits(bits); 85 | 86 | ASSERT_TRUE(std::isnan(value)) << 87 | std::hex << std::uppercase << std::setfill('0') << 88 | "BITS = 0x" << std::setw(8) << bits; 89 | } 90 | 91 | for (uint32_t bits = UINT32_C(0xFFFFFFFF); bits >= UINT32_C(0xFF800001); bits--) { 92 | const float value = fp32_from_bits(bits); 93 | 94 | ASSERT_TRUE(std::isnan(value)) << 95 | std::hex << std::uppercase << std::setfill('0') << 96 | "BITS = 0x" << std::setw(8) << bits; 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /third-party/eigen-half.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This implementation is extracted from Eigen: 3 | * Repo: bitbucket.org/eigen/eigen 4 | * File: Eigen/src/Core/arch/CUDA/Half.h 5 | * Commit ID: 96e0f73a35de54f675d825bef5339b2f08e77eb4 6 | * 7 | * Removed a lot of redundant and cuda-specific code. 8 | */ 9 | 10 | #define EIGEN_STRONG_INLINE static inline 11 | #define EIGEN_DEVICE_FUNC 12 | 13 | // This file is part of Eigen, a lightweight C++ template library 14 | // for linear algebra. 15 | // 16 | // This Source Code Form is subject to the terms of the Mozilla 17 | // Public License v. 2.0. 
If a copy of the MPL was not distributed 18 | // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 19 | // 20 | // The conversion routines are Copyright (c) Fabian Giesen, 2016. 21 | // The original license follows: 22 | // 23 | // Copyright (c) Fabian Giesen, 2016 24 | // All rights reserved. 25 | // Redistribution and use in source and binary forms, with or without 26 | // modification, are permitted. 27 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 | // “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 | // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 | 39 | 40 | // Standard 16-bit float type, mostly useful for GPUs. Defines a new 41 | // type Eigen::half (inheriting from CUDA's __half struct) with 42 | // operator overloads such that it behaves basically as an arithmetic 43 | // type. It will be quite slow on CPUs (so it is recommended to stay 44 | // in fp32 for CPUs, except for simple parameter conversions, I/O 45 | // to disk and the likes), but fast on GPUs. 46 | 47 | 48 | #ifndef EIGEN_HALF_CUDA_H 49 | #define EIGEN_HALF_CUDA_H 50 | 51 | namespace Eigen { 52 | 53 | namespace half_impl { 54 | 55 | // Make our own __half definition that is similar to CUDA's. 
56 | struct __half { 57 | EIGEN_DEVICE_FUNC __half() : x(0) {} 58 | explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {} 59 | unsigned short x; 60 | }; 61 | 62 | EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); 63 | EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); 64 | EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h); 65 | 66 | // Conversion routines, including fallbacks for the host or older CUDA. 67 | // Note that newer Intel CPUs (Haswell or newer) have vectorized versions of 68 | // these in hardware. If we need more performance on older/other CPUs, they are 69 | // also possible to vectorize directly. 70 | 71 | EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { 72 | __half h; 73 | h.x = x; 74 | return h; 75 | } 76 | 77 | union FP32 { 78 | unsigned int u; 79 | float f; 80 | }; 81 | 82 | EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { 83 | #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 84 | return __float2half(ff); 85 | 86 | #elif defined(EIGEN_HAS_FP16_C) 87 | __half h; 88 | h.x = _cvtss_sh(ff, 0); 89 | return h; 90 | 91 | #else 92 | FP32 f; f.f = ff; 93 | 94 | const FP32 f32infty = { 255 << 23 }; 95 | const FP32 f16max = { (127 + 16) << 23 }; 96 | const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; 97 | unsigned int sign_mask = 0x80000000u; 98 | __half o; 99 | o.x = static_cast(0x0u); 100 | 101 | unsigned int sign = f.u & sign_mask; 102 | f.u ^= sign; 103 | 104 | // NOTE all the integer compares in this function can be safely 105 | // compiled into signed compares since all operands are below 106 | // 0x80000000. Important if you want fast straight SSE2 code 107 | // (since there's no unsigned PCMPGTD). 108 | 109 | if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) 110 | o.x = (f.u > f32infty.u) ? 
0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf 111 | } else { // (De)normalized number or zero 112 | if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero 113 | // use a magic value to align our 10 mantissa bits at the bottom of 114 | // the float. as long as FP addition is round-to-nearest-even this 115 | // just works. 116 | f.f += denorm_magic.f; 117 | 118 | // and one integer subtract of the bias later, we have our final float! 119 | o.x = static_cast(f.u - denorm_magic.u); 120 | } else { 121 | unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd 122 | 123 | // update exponent, rounding bias part 1 124 | f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; 125 | // rounding bias part 2 126 | f.u += mant_odd; 127 | // take the bits! 128 | o.x = static_cast(f.u >> 13); 129 | } 130 | } 131 | 132 | o.x |= static_cast(sign >> 16); 133 | return o; 134 | #endif 135 | } 136 | 137 | EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { 138 | #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 139 | return __half2float(h); 140 | 141 | #elif defined(EIGEN_HAS_FP16_C) 142 | return _cvtsh_ss(h.x); 143 | 144 | #else 145 | const FP32 magic = { 113 << 23 }; 146 | const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift 147 | FP32 o; 148 | 149 | o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits 150 | unsigned int exp = shifted_exp & o.u; // just the exponent 151 | o.u += (127 - 15) << 23; // exponent adjust 152 | 153 | // handle exponent special cases 154 | if (exp == shifted_exp) { // Inf/NaN? 155 | o.u += (128 - 16) << 23; // extra exp adjust 156 | } else if (exp == 0) { // Zero/Denormal? 
157 | o.u += 1 << 23; // extra exp adjust 158 | o.f -= magic.f; // renormalize 159 | } 160 | 161 | o.u |= (h.x & 0x8000) << 16; // sign bit 162 | return o.f; 163 | #endif 164 | } 165 | 166 | } // end namespace half_impl 167 | 168 | } // end namespace Eigen 169 | 170 | #endif // EIGEN_HALF_CUDA_H 171 | -------------------------------------------------------------------------------- /bench/ieee-element.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #ifdef FP16_COMPARATIVE_BENCHMARKS 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #endif 12 | 13 | static inline uint16_t next_xorshift16(uint16_t x) { 14 | x ^= x >> 8; 15 | x ^= x << 9; 16 | x ^= x >> 5; 17 | return x; 18 | } 19 | 20 | static inline uint32_t next_xorshift32(uint32_t x) { 21 | x ^= x >> 13; 22 | x ^= x << 17; 23 | x ^= x >> 5; 24 | return x; 25 | } 26 | 27 | 28 | /* Conversion from IEEE FP16 to IEEE FP32 */ 29 | 30 | static void fp16_ieee_to_fp32_bits(benchmark::State& state) { 31 | uint16_t fp16 = UINT16_C(0x7C00); 32 | while (state.KeepRunning()) { 33 | uint32_t fp32 = fp16_ieee_to_fp32_bits(fp16); 34 | 35 | fp16 = next_xorshift16(fp16); 36 | benchmark::DoNotOptimize(fp32); 37 | } 38 | } 39 | BENCHMARK(fp16_ieee_to_fp32_bits); 40 | 41 | static void fp16_ieee_to_fp32_value(benchmark::State& state) { 42 | uint16_t fp16 = UINT16_C(0x7C00); 43 | while (state.KeepRunning()) { 44 | float fp32 = fp16_ieee_to_fp32_value(fp16); 45 | 46 | fp16 = next_xorshift16(fp16); 47 | benchmark::DoNotOptimize(fp32); 48 | } 49 | } 50 | BENCHMARK(fp16_ieee_to_fp32_value); 51 | 52 | #ifdef FP16_COMPARATIVE_BENCHMARKS 53 | static void TH_halfbits2float(benchmark::State& state) { 54 | uint16_t fp16 = UINT16_C(0x7C00); 55 | while (state.KeepRunning()) { 56 | float fp32; 57 | TH_halfbits2float(&fp16, &fp32); 58 | 59 | fp16 = next_xorshift16(fp16); 60 | benchmark::DoNotOptimize(fp32); 61 | } 62 | } 63 | BENCHMARK(TH_halfbits2float); 
64 | 65 | static void npy_halfbits_to_floatbits(benchmark::State& state) { 66 | uint16_t fp16 = UINT16_C(0x7C00); 67 | while (state.KeepRunning()) { 68 | uint32_t fp32 = npy_halfbits_to_floatbits(fp16); 69 | 70 | fp16 = next_xorshift16(fp16); 71 | benchmark::DoNotOptimize(fp32); 72 | } 73 | } 74 | BENCHMARK(npy_halfbits_to_floatbits); 75 | 76 | static void Eigen_half_to_float(benchmark::State& state) { 77 | uint16_t fp16 = UINT16_C(0x7C00); 78 | while (state.KeepRunning()) { 79 | float fp32 = 80 | Eigen::half_impl::half_to_float( 81 | Eigen::half_impl::raw_uint16_to_half(fp16)); 82 | 83 | fp16 = next_xorshift16(fp16); 84 | benchmark::DoNotOptimize(fp32); 85 | } 86 | } 87 | BENCHMARK(Eigen_half_to_float); 88 | 89 | static void Float16Compressor_decompress(benchmark::State& state) { 90 | uint16_t fp16 = UINT16_C(0x7C00); 91 | while (state.KeepRunning()) { 92 | float fp32 = Float16Compressor::decompress(fp16); 93 | 94 | fp16 = next_xorshift16(fp16); 95 | benchmark::DoNotOptimize(fp32); 96 | } 97 | } 98 | BENCHMARK(Float16Compressor_decompress); 99 | 100 | static void half_float_detail_half2float_table(benchmark::State& state) { 101 | uint16_t fp16 = UINT16_C(0x7C00); 102 | while (state.KeepRunning()) { 103 | float fp32 = 104 | half_float::detail::half2float_impl(fp16, 105 | half_float::detail::true_type()); 106 | 107 | fp16 = next_xorshift16(fp16); 108 | benchmark::DoNotOptimize(fp32); 109 | } 110 | } 111 | BENCHMARK(half_float_detail_half2float_table); 112 | 113 | static void half_float_detail_half2float_branch(benchmark::State& state) { 114 | uint16_t fp16 = UINT16_C(0x7C00); 115 | while (state.KeepRunning()) { 116 | float fp32 = 117 | half_float::detail::half2float_impl(fp16, 118 | half_float::detail::false_type()); 119 | 120 | fp16 = next_xorshift16(fp16); 121 | benchmark::DoNotOptimize(fp32); 122 | } 123 | } 124 | BENCHMARK(half_float_detail_half2float_branch); 125 | #endif 126 | 127 | /* Conversion from IEEE FP32 to IEEE FP16 */ 128 | 129 | static void 
fp16_ieee_from_fp32_value(benchmark::State& state) { 130 | uint32_t fp32 = UINT32_C(0x7F800000); 131 | while (state.KeepRunning()) { 132 | uint16_t fp16 = fp16_ieee_from_fp32_value(fp32_from_bits(fp32)); 133 | 134 | fp32 = next_xorshift32(fp32); 135 | benchmark::DoNotOptimize(fp16); 136 | } 137 | } 138 | BENCHMARK(fp16_ieee_from_fp32_value); 139 | 140 | #ifdef FP16_COMPARATIVE_BENCHMARKS 141 | static void TH_float2halfbits(benchmark::State& state) { 142 | uint32_t fp32 = UINT32_C(0x7F800000); 143 | while (state.KeepRunning()) { 144 | uint16_t fp16; 145 | float fp32_value = fp32_from_bits(fp32); 146 | TH_float2halfbits(&fp32_value, &fp16); 147 | 148 | fp32 = next_xorshift32(fp32); 149 | benchmark::DoNotOptimize(fp16); 150 | } 151 | } 152 | BENCHMARK(TH_float2halfbits); 153 | 154 | static void npy_floatbits_to_halfbits(benchmark::State& state) { 155 | uint32_t fp32 = UINT32_C(0x7F800000); 156 | while (state.KeepRunning()) { 157 | uint16_t fp16 = npy_floatbits_to_halfbits(fp32); 158 | 159 | fp32 = next_xorshift32(fp32); 160 | benchmark::DoNotOptimize(fp16); 161 | } 162 | } 163 | BENCHMARK(npy_floatbits_to_halfbits); 164 | 165 | static void Eigen_float_to_half_rtne(benchmark::State& state) { 166 | uint32_t fp32 = UINT32_C(0x7F800000); 167 | while (state.KeepRunning()) { 168 | Eigen::half_impl::__half fp16 = 169 | Eigen::half_impl::float_to_half_rtne( 170 | fp32_from_bits(fp32)); 171 | 172 | fp32 = next_xorshift32(fp32); 173 | benchmark::DoNotOptimize(fp16); 174 | } 175 | } 176 | BENCHMARK(Eigen_float_to_half_rtne); 177 | 178 | static void Float16Compressor_compress(benchmark::State& state) { 179 | uint32_t fp32 = UINT32_C(0x7F800000); 180 | while (state.KeepRunning()) { 181 | uint16_t fp16 = Float16Compressor::compress(fp32_from_bits(fp32)); 182 | 183 | fp32 = next_xorshift32(fp32); 184 | benchmark::DoNotOptimize(fp16); 185 | } 186 | } 187 | BENCHMARK(Float16Compressor_compress); 188 | 189 | static void half_float_detail_float2half_table(benchmark::State& state) { 190 
| uint32_t fp32 = UINT32_C(0x7F800000); 191 | while (state.KeepRunning()) { 192 | uint16_t fp16 = 193 | half_float::detail::float2half_impl( 194 | fp32_from_bits(fp32), 195 | half_float::detail::true_type()); 196 | 197 | fp32 = next_xorshift32(fp32); 198 | benchmark::DoNotOptimize(fp16); 199 | } 200 | } 201 | BENCHMARK(half_float_detail_float2half_table); 202 | 203 | static void half_float_detail_float2half_branch(benchmark::State& state) { 204 | uint32_t fp32 = UINT32_C(0x7F800000); 205 | while (state.KeepRunning()) { 206 | uint16_t fp16 = 207 | half_float::detail::float2half_impl( 208 | fp32_from_bits(fp32), 209 | half_float::detail::false_type()); 210 | 211 | fp32 = next_xorshift32(fp32); 212 | benchmark::DoNotOptimize(fp16); 213 | } 214 | } 215 | BENCHMARK(half_float_detail_float2half_branch); 216 | #endif 217 | 218 | BENCHMARK_MAIN(); 219 | -------------------------------------------------------------------------------- /.github/workflows/cmake.yml: -------------------------------------------------------------------------------- 1 | name: CMake build 2 | on: 3 | push: 4 | paths: 5 | - '.github/**/*.yml' 6 | - 'CMakeLists.txt' 7 | - 'cmake/**' 8 | - '**.cc' 9 | - '**.h' 10 | concurrency: 11 | group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} 12 | cancel-in-progress: true 13 | jobs: 14 | cmake-linux-x86_64: 15 | runs-on: ubuntu-20.04 16 | timeout-minutes: 15 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Update apt 20 | run: sudo apt update 21 | - name: Install ninja 22 | run: sudo apt install ninja-build 23 | - name: Configure 24 | run: cmake -Bbuild -S. 
-G Ninja -DCMAKE_BUILD_TYPE=Release -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
      - name: Build
        run: cmake --build build --parallel
      - name: Test
        run: ctest --test-dir build --parallel --output-on-failure

  # Linux x86-64, built with -mf16c.
  cmake-linux-x86_64-f16c:
    runs-on: ubuntu-24.04 # required for gcc >= 12
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Update apt
        run: sudo apt update
      - name: Install ninja
        run: sudo apt install ninja-build
      - name: Configure
        run: cmake -Bbuild -S. -G Ninja -DCMAKE_BUILD_TYPE=Release -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
        env:
          CFLAGS: "-mf16c"
          CXXFLAGS: "-mf16c"
      - name: Build
        run: cmake --build build --parallel
      - name: Test
        run: ctest --test-dir build --parallel --output-on-failure

  # Linux 32-bit x86 (multilib toolchain), built with -mfpmath=387.
  cmake-linux-x86:
    runs-on: ubuntu-20.04
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Update apt
        run: sudo apt update
      - name: Install multilib gcc
        run: sudo apt install gcc-multilib g++-multilib
      - name: Install ninja
        run: sudo apt install ninja-build
      - name: Configure
        run: cmake -Bbuild -S. -G Ninja -DCMAKE_BUILD_TYPE=Release -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
        env:
          CFLAGS: "-m32 -mfpmath=387"
          CXXFLAGS: "-m32 -mfpmath=387"
          LDFLAGS: "-m32"
      - name: Build
        run: cmake --build build --parallel
      - name: Test
        run: ctest --test-dir build --parallel --output-on-failure

  # Linux 32-bit x86 (multilib toolchain), built with -mf16c.
  cmake-linux-x86-f16c:
    runs-on: ubuntu-20.04
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Update apt
        run: sudo apt update
      - name: Install multilib gcc
        run: sudo apt install gcc-multilib g++-multilib
      - name: Install ninja
        run: sudo apt install ninja-build
      - name: Configure
        run: cmake -Bbuild -S. -G Ninja -DCMAKE_BUILD_TYPE=Release -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
        env:
          CFLAGS: "-m32 -mf16c"
          CXXFLAGS: "-m32 -mf16c"
          LDFLAGS: "-m32"
      - name: Build
        run: cmake --build build --parallel
      - name: Test
        run: ctest --test-dir build --parallel --output-on-failure

  # macOS x86-64 via the Xcode generator.
  cmake-macos-x86_64:
    runs-on: macos-12
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Configure
        run: cmake -Bbuild -S. -G Xcode -DCMAKE_OSX_ARCHITECTURES=x86_64 -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
      - name: Build
        run: cmake --build build --config Release --parallel -- -quiet
      - name: Test
        run: ctest --test-dir build --build-config Release --parallel --output-on-failure

  # macOS x86-64 via the Xcode generator, built with -mf16c.
  cmake-macos-x86_64-f16c:
    runs-on: macos-12
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Configure
        run: cmake -Bbuild -S. -G Xcode -DCMAKE_OSX_ARCHITECTURES=x86_64 -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
        env:
          CFLAGS: "-mf16c"
          CXXFLAGS: "-mf16c"
      - name: Build
        run: cmake --build build --config Release --parallel -- -quiet
      - name: Test
        run: ctest --test-dir build --build-config Release --parallel --output-on-failure

  # macOS arm64 via the Xcode generator.
  cmake-macos-arm64:
    runs-on: macos-14
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Configure
        run: cmake -Bbuild -S. -G Xcode -DCMAKE_OSX_ARCHITECTURES=arm64 -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
      - name: Build
        run: cmake --build build --config Release --parallel -- -quiet
      - name: Test
        run: ctest --test-dir build --build-config Release --parallel --output-on-failure

  # Windows Win32, built with /arch:IA32.
  cmake-windows-x86:
    runs-on: windows-2019
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Configure
        run: cmake -Bbuild -S. -G "Visual Studio 16 2019" -A Win32 -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
        env:
          CFLAGS: "/arch:IA32"
          CXXFLAGS: "/arch:IA32"
      - name: Build
        run: cmake --build build --config Release --parallel
      - name: Test
        run: ctest --test-dir build --build-config Release --parallel --output-on-failure

  # Windows Win32, built with /arch:AVX2.
  cmake-windows-x86-avx2:
    runs-on: windows-2019
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Configure
        run: cmake -Bbuild -S. -G "Visual Studio 16 2019" -A Win32 -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
        env:
          CFLAGS: "/arch:AVX2"
          CXXFLAGS: "/arch:AVX2"
      - name: Build
        run: cmake --build build --config Release --parallel
      - name: Test
        run: ctest --test-dir build --build-config Release --parallel --output-on-failure

  # Windows x64, default compiler flags.
  cmake-windows-x64:
    runs-on: windows-2019
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Configure
        run: cmake -Bbuild -S. -G "Visual Studio 16 2019" -A x64 -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
      - name: Build
        run: cmake --build build --config Release --parallel
      - name: Test
        run: ctest --test-dir build --build-config Release --parallel --output-on-failure

  # Windows x64, built with /arch:AVX2.
  cmake-windows-x64-avx2:
    runs-on: windows-2019
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Configure
        run: cmake -Bbuild -S. -G "Visual Studio 16 2019" -A x64 -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON
        env:
          CFLAGS: "/arch:AVX2"
          CXXFLAGS: "/arch:AVX2"
      - name: Build
        run: cmake --build build --config Release --parallel
      - name: Test
        run: ctest --test-dir build --build-config Release --parallel --output-on-failure

  # Windows ARM64: this job has no Test step, only Configure and Build.
  cmake-windows-arm64:
    runs-on: windows-2019
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Configure
        run: cmake -Bbuild -S.
-G "Visual Studio 16 2019" -A ARM64 -DFP16_BUILD_COMPARATIVE_BENCHMARKS=ON 185 | - name: Build 186 | run: cmake --build build --config Release --parallel 187 | -------------------------------------------------------------------------------- /bench/to-ieee-array.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #ifdef FP16_COMPARATIVE_BENCHMARKS 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #endif 18 | 19 | 20 | static void fp16_ieee_from_fp32_value(benchmark::State& state) { 21 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 22 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 23 | 24 | std::vector fp32(state.range(0)); 25 | std::vector fp16(state.range(0)); 26 | std::generate(fp32.begin(), fp32.end(), std::ref(rng)); 27 | 28 | while (state.KeepRunning()) { 29 | float* input = fp32.data(); 30 | benchmark::DoNotOptimize(input); 31 | 32 | uint16_t* output = fp16.data(); 33 | const size_t n = state.range(0); 34 | for (size_t i = 0; i < n; i++) { 35 | output[i] = fp16_ieee_from_fp32_value(input[i]); 36 | } 37 | 38 | benchmark::DoNotOptimize(output); 39 | } 40 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 41 | } 42 | BENCHMARK(fp16_ieee_from_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20); 43 | 44 | #ifdef FP16_COMPARATIVE_BENCHMARKS 45 | static void TH_float2halfbits(benchmark::State& state) { 46 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 47 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 48 | 49 | std::vector fp32(state.range(0)); 50 | std::vector fp16(state.range(0)); 51 | std::generate(fp32.begin(), fp32.end(), std::ref(rng)); 52 | 53 | while (state.KeepRunning()) { 54 | float* input = 
fp32.data(); 55 | benchmark::DoNotOptimize(input); 56 | 57 | uint16_t* output = fp16.data(); 58 | const size_t n = state.range(0); 59 | for (size_t i = 0; i < n; i++) { 60 | TH_float2halfbits(&input[i], &output[i]); 61 | } 62 | 63 | benchmark::DoNotOptimize(output); 64 | } 65 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 66 | } 67 | BENCHMARK(TH_float2halfbits)->RangeMultiplier(2)->Range(1<<10, 64<<20); 68 | 69 | static void npy_floatbits_to_halfbits(benchmark::State& state) { 70 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 71 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 72 | 73 | std::vector fp32(state.range(0)); 74 | std::vector fp16(state.range(0)); 75 | std::generate(fp32.begin(), fp32.end(), std::ref(rng)); 76 | 77 | while (state.KeepRunning()) { 78 | float* input = fp32.data(); 79 | benchmark::DoNotOptimize(input); 80 | 81 | uint16_t* output = fp16.data(); 82 | const size_t n = state.range(0); 83 | for (size_t i = 0; i < n; i++) { 84 | output[i] = npy_floatbits_to_halfbits(fp32_to_bits(input[i])); 85 | } 86 | 87 | benchmark::DoNotOptimize(output); 88 | } 89 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 90 | } 91 | BENCHMARK(npy_floatbits_to_halfbits)->RangeMultiplier(2)->Range(1<<10, 64<<20); 92 | 93 | static void Eigen_float_to_half_rtne(benchmark::State& state) { 94 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 95 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 96 | 97 | std::vector fp32(state.range(0)); 98 | std::vector fp16(state.range(0)); 99 | std::generate(fp32.begin(), fp32.end(), std::ref(rng)); 100 | 101 | while (state.KeepRunning()) { 102 | float* input = fp32.data(); 103 | benchmark::DoNotOptimize(input); 104 | 105 | uint16_t* output = fp16.data(); 106 | const size_t n = state.range(0); 107 | for (size_t i 
= 0; i < n; i++) { 108 | output[i] = Eigen::half_impl::float_to_half_rtne(input[i]).x; 109 | } 110 | 111 | benchmark::DoNotOptimize(output); 112 | } 113 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 114 | } 115 | BENCHMARK(Eigen_float_to_half_rtne)->RangeMultiplier(2)->Range(1<<10, 64<<20); 116 | 117 | static void Float16Compressor_compress(benchmark::State& state) { 118 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 119 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 120 | 121 | std::vector fp32(state.range(0)); 122 | std::vector fp16(state.range(0)); 123 | std::generate(fp32.begin(), fp32.end(), std::ref(rng)); 124 | 125 | while (state.KeepRunning()) { 126 | float* input = fp32.data(); 127 | benchmark::DoNotOptimize(input); 128 | 129 | uint16_t* output = fp16.data(); 130 | const size_t n = state.range(0); 131 | for (size_t i = 0; i < n; i++) { 132 | output[i] = Float16Compressor::compress(input[i]); 133 | } 134 | 135 | benchmark::DoNotOptimize(output); 136 | } 137 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 138 | } 139 | BENCHMARK(Float16Compressor_compress)->RangeMultiplier(2)->Range(1<<10, 64<<20); 140 | 141 | static void half_float_detail_float2half_table(benchmark::State& state) { 142 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 143 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 144 | 145 | std::vector fp32(state.range(0)); 146 | std::vector fp16(state.range(0)); 147 | std::generate(fp32.begin(), fp32.end(), std::ref(rng)); 148 | 149 | while (state.KeepRunning()) { 150 | float* input = fp32.data(); 151 | benchmark::DoNotOptimize(input); 152 | 153 | uint16_t* output = fp16.data(); 154 | const size_t n = state.range(0); 155 | for (size_t i = 0; i < n; i++) { 156 | output[i] = 157 | half_float::detail::float2half_impl( 
158 | input[i], half_float::detail::true_type()); 159 | } 160 | 161 | benchmark::DoNotOptimize(output); 162 | } 163 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 164 | } 165 | BENCHMARK(half_float_detail_float2half_table)->RangeMultiplier(2)->Range(1<<10, 64<<20); 166 | 167 | static void half_float_detail_float2half_branch(benchmark::State& state) { 168 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 169 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 170 | 171 | std::vector fp32(state.range(0)); 172 | std::vector fp16(state.range(0)); 173 | std::generate(fp32.begin(), fp32.end(), std::ref(rng)); 174 | 175 | while (state.KeepRunning()) { 176 | float* input = fp32.data(); 177 | benchmark::DoNotOptimize(input); 178 | 179 | uint16_t* output = fp16.data(); 180 | const size_t n = state.range(0); 181 | for (size_t i = 0; i < n; i++) { 182 | output[i] = 183 | half_float::detail::float2half_impl( 184 | input[i], half_float::detail::false_type()); 185 | } 186 | 187 | benchmark::DoNotOptimize(output); 188 | } 189 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 190 | } 191 | BENCHMARK(half_float_detail_float2half_branch)->RangeMultiplier(2)->Range(1<<10, 64<<20); 192 | #endif 193 | 194 | BENCHMARK_MAIN(); 195 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.16 FATAL_ERROR) 2 | 3 | # ---[ Project 4 | PROJECT(FP16 C) 5 | 6 | # ---[ Options. 
OPTION(FP16_BUILD_TESTS "Build FP16 unit tests" ON)
OPTION(FP16_BUILD_BENCHMARKS "Build FP16 micro-benchmarks" ON)
OPTION(FP16_BUILD_COMPARATIVE_BENCHMARKS "Build FP16 micro-benchmarks comparing to alternatives" OFF)
OPTION(FP16_INSTALL_LIBRARY "Install the FP16 library headers" ON)

# ---[ CMake options
# The project is declared as C only; C++ is enabled on demand because only the
# tests and benchmarks are C++ sources.
IF(FP16_BUILD_TESTS OR FP16_BUILD_BENCHMARKS)
  ENABLE_LANGUAGE(CXX)
ENDIF()

IF(FP16_BUILD_TESTS)
  ENABLE_TESTING()
ENDIF()

# ---[ Download deps
# Each dependency is fetched at configure time by generating a small download
# project and building it, unless the caller supplies its source directory.
IF(FP16_BUILD_TESTS AND NOT DEFINED GOOGLETEST_SOURCE_DIR)
  MESSAGE(STATUS "Downloading Google Test to ${CMAKE_BINARY_DIR}/googletest-source (define GOOGLETEST_SOURCE_DIR to avoid it)")
  CONFIGURE_FILE(cmake/DownloadGoogleTest.cmake "${CMAKE_BINARY_DIR}/googletest-download/CMakeLists.txt")
  EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" .
    WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/googletest-download")
  EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" --build .
    WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/googletest-download")
  SET(GOOGLETEST_SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-source" CACHE STRING "Google Test source directory")
ENDIF()

IF(FP16_BUILD_BENCHMARKS AND NOT DEFINED GOOGLEBENCHMARK_SOURCE_DIR)
  MESSAGE(STATUS "Downloading Google Benchmark to ${CMAKE_BINARY_DIR}/googlebenchmark-source (define GOOGLEBENCHMARK_SOURCE_DIR to avoid it)")
  CONFIGURE_FILE(cmake/DownloadGoogleBenchmark.cmake "${CMAKE_BINARY_DIR}/googlebenchmark-download/CMakeLists.txt")
  EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" .
    WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/googlebenchmark-download")
  EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" --build .
    WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/googlebenchmark-download")
  SET(GOOGLEBENCHMARK_SOURCE_DIR "${CMAKE_BINARY_DIR}/googlebenchmark-source" CACHE STRING "Google Benchmark source directory")
ENDIF()

# ---[ FP16 library
# Header-only library, exposed as an INTERFACE target.
# NOTE(review): the two "$" arguments below look like generator expressions
# whose <...> part was lost in extraction (presumably a build-interface and an
# install-interface include path) -- restore them from the repository.
ADD_LIBRARY(fp16 INTERFACE)
TARGET_INCLUDE_DIRECTORIES(fp16 INTERFACE
  "$"
  "$")

IF(FP16_INSTALL_LIBRARY)
  INCLUDE(GNUInstallDirs)
  INSTALL(FILES include/fp16.h
    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
  INSTALL(FILES
      include/fp16/bitcasts.h
      include/fp16/fp16.h
      include/fp16/macros.h
    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/fp16")
ENDIF()

IF(FP16_BUILD_TESTS)
  # ---[ Build google test
  # Skipped when a parent project already provides a gtest target.
  IF(NOT TARGET gtest)
    SET(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
    ADD_SUBDIRECTORY(
      "${GOOGLETEST_SOURCE_DIR}"
      "${CMAKE_BINARY_DIR}/googletest")
  ENDIF()

  # ---[ Build FP16 unit tests
  # Every test is a C++11 executable linked against fp16, gtest and gtest_main
  # and registered with CTest under its name without the "-test" suffix.
  ADD_EXECUTABLE(ieee-to-fp32-bits-test test/ieee-to-fp32-bits.cc test/tables.cc)
  SET_TARGET_PROPERTIES(ieee-to-fp32-bits-test PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED YES
    CXX_EXTENSIONS YES)
  TARGET_INCLUDE_DIRECTORIES(ieee-to-fp32-bits-test PRIVATE test)
  TARGET_LINK_LIBRARIES(ieee-to-fp32-bits-test PRIVATE fp16 gtest gtest_main)
  ADD_TEST(NAME ieee-to-fp32-bits COMMAND ieee-to-fp32-bits-test)

  ADD_EXECUTABLE(ieee-to-fp32-value-test test/ieee-to-fp32-value.cc test/tables.cc)
  SET_TARGET_PROPERTIES(ieee-to-fp32-value-test PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED YES
    CXX_EXTENSIONS YES)
  TARGET_INCLUDE_DIRECTORIES(ieee-to-fp32-value-test PRIVATE test)
  TARGET_LINK_LIBRARIES(ieee-to-fp32-value-test PRIVATE fp16 gtest gtest_main)
  ADD_TEST(NAME ieee-to-fp32-value COMMAND ieee-to-fp32-value-test)

  ADD_EXECUTABLE(ieee-from-fp32-value-test test/ieee-from-fp32-value.cc test/tables.cc)
  SET_TARGET_PROPERTIES(ieee-from-fp32-value-test PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED YES
    CXX_EXTENSIONS YES)
  TARGET_INCLUDE_DIRECTORIES(ieee-from-fp32-value-test PRIVATE test)
  TARGET_LINK_LIBRARIES(ieee-from-fp32-value-test PRIVATE fp16 gtest gtest_main)
  ADD_TEST(NAME ieee-from-fp32-value COMMAND ieee-from-fp32-value-test)

  ADD_EXECUTABLE(alt-to-fp32-bits-test test/alt-to-fp32-bits.cc test/tables.cc)
  SET_TARGET_PROPERTIES(alt-to-fp32-bits-test PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED YES
    CXX_EXTENSIONS YES)
  TARGET_INCLUDE_DIRECTORIES(alt-to-fp32-bits-test PRIVATE test)
  TARGET_LINK_LIBRARIES(alt-to-fp32-bits-test PRIVATE fp16 gtest gtest_main)
  ADD_TEST(NAME alt-to-fp32-bits COMMAND alt-to-fp32-bits-test)

  ADD_EXECUTABLE(alt-to-fp32-value-test test/alt-to-fp32-value.cc test/tables.cc)
  SET_TARGET_PROPERTIES(alt-to-fp32-value-test PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED YES
    CXX_EXTENSIONS YES)
  TARGET_INCLUDE_DIRECTORIES(alt-to-fp32-value-test PRIVATE test)
  TARGET_LINK_LIBRARIES(alt-to-fp32-value-test PRIVATE fp16 gtest gtest_main)
  ADD_TEST(NAME alt-to-fp32-value COMMAND alt-to-fp32-value-test)

  ADD_EXECUTABLE(alt-from-fp32-value-test test/alt-from-fp32-value.cc test/tables.cc)
  SET_TARGET_PROPERTIES(alt-from-fp32-value-test PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED YES
    CXX_EXTENSIONS YES)
  TARGET_INCLUDE_DIRECTORIES(alt-from-fp32-value-test PRIVATE test)
  TARGET_LINK_LIBRARIES(alt-from-fp32-value-test PRIVATE fp16 gtest gtest_main)
  ADD_TEST(NAME alt-from-fp32-value COMMAND alt-from-fp32-value-test)

  # The bitcasts test does not use the reference tables.
  ADD_EXECUTABLE(bitcasts-test test/bitcasts.cc)
  SET_TARGET_PROPERTIES(bitcasts-test PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED YES
    CXX_EXTENSIONS YES)
  TARGET_LINK_LIBRARIES(bitcasts-test PRIVATE fp16 gtest gtest_main)
  ADD_TEST(NAME bitcasts COMMAND bitcasts-test)
ENDIF()

IF(FP16_BUILD_BENCHMARKS)
  # ---[ Build google benchmark
  # Skipped when a parent project already provides a benchmark target.
  IF(NOT TARGET benchmark)
    SET(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "")
    ADD_SUBDIRECTORY(
      "${GOOGLEBENCHMARK_SOURCE_DIR}"
      "${CMAKE_BINARY_DIR}/googlebenchmark")
  ENDIF()

  # ---[ Build FP16 benchmarks
  # The IEEE benchmarks also get the FP16_COMPARATIVE_BENCHMARKS definition and
  # the project root on their include path; the "alt" benchmarks do not.
  # NOTE(review): the value after "FP16_COMPARATIVE_BENCHMARKS=" looks like a
  # generator expression whose <...> part was lost in extraction. The benchmark
  # sources test the macro with #ifdef, so confirm the restored expression
  # leaves it undefined (not 0) when the option is OFF.
  ADD_EXECUTABLE(ieee-element-bench bench/ieee-element.cc)
  SET_TARGET_PROPERTIES(ieee-element-bench PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED YES
    CXX_EXTENSIONS YES)
  TARGET_COMPILE_DEFINITIONS(ieee-element-bench PRIVATE "FP16_COMPARATIVE_BENCHMARKS=$")
  TARGET_INCLUDE_DIRECTORIES(ieee-element-bench PRIVATE "${PROJECT_SOURCE_DIR}")
  TARGET_LINK_LIBRARIES(ieee-element-bench PRIVATE fp16 benchmark)

  ADD_EXECUTABLE(alt-element-bench bench/alt-element.cc)
  SET_TARGET_PROPERTIES(alt-element-bench PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED YES
    CXX_EXTENSIONS YES)
  TARGET_LINK_LIBRARIES(alt-element-bench PRIVATE fp16 benchmark)

  ADD_EXECUTABLE(from-ieee-array-bench bench/from-ieee-array.cc)
  SET_TARGET_PROPERTIES(from-ieee-array-bench PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED YES
    CXX_EXTENSIONS YES)
  TARGET_COMPILE_DEFINITIONS(from-ieee-array-bench PRIVATE "FP16_COMPARATIVE_BENCHMARKS=$")
  TARGET_INCLUDE_DIRECTORIES(from-ieee-array-bench PRIVATE "${PROJECT_SOURCE_DIR}")
  TARGET_LINK_LIBRARIES(from-ieee-array-bench PRIVATE fp16 benchmark)

  ADD_EXECUTABLE(from-alt-array-bench bench/from-alt-array.cc)
  SET_TARGET_PROPERTIES(from-alt-array-bench PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED YES
    CXX_EXTENSIONS YES)
  TARGET_LINK_LIBRARIES(from-alt-array-bench PRIVATE fp16 benchmark)

  ADD_EXECUTABLE(to-ieee-array-bench bench/to-ieee-array.cc)
  SET_TARGET_PROPERTIES(to-ieee-array-bench PROPERTIES
    CXX_STANDARD 11
CXX_STANDARD_REQUIRED YES 178 | CXX_EXTENSIONS YES) 179 | TARGET_COMPILE_DEFINITIONS(to-ieee-array-bench PRIVATE "FP16_COMPARATIVE_BENCHMARKS=$") 180 | TARGET_INCLUDE_DIRECTORIES(to-ieee-array-bench PRIVATE "${PROJECT_SOURCE_DIR}") 181 | TARGET_LINK_LIBRARIES(to-ieee-array-bench PRIVATE fp16 benchmark) 182 | 183 | ADD_EXECUTABLE(to-alt-array-bench bench/to-alt-array.cc) 184 | SET_TARGET_PROPERTIES(to-alt-array-bench PROPERTIES 185 | CXX_STANDARD 11 186 | CXX_STANDARD_REQUIRED YES 187 | CXX_EXTENSIONS YES) 188 | TARGET_LINK_LIBRARIES(to-alt-array-bench PRIVATE fp16 benchmark) 189 | ENDIF() 190 | -------------------------------------------------------------------------------- /bench/from-ieee-array.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #ifdef FP16_COMPARATIVE_BENCHMARKS 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #endif 18 | 19 | 20 | static void fp16_ieee_to_fp32_bits(benchmark::State& state) { 21 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 22 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 23 | 24 | std::vector fp16(state.range(0)); 25 | std::vector fp32(state.range(0)); 26 | std::generate(fp16.begin(), fp16.end(), 27 | [&rng]{ return fp16_ieee_from_fp32_value(rng()); }); 28 | 29 | while (state.KeepRunning()) { 30 | uint16_t* input = fp16.data(); 31 | benchmark::DoNotOptimize(input); 32 | 33 | uint32_t* output = fp32.data(); 34 | const size_t n = state.range(0); 35 | for (size_t i = 0; i < n; i++) { 36 | output[i] = fp16_ieee_to_fp32_bits(input[i]); 37 | } 38 | 39 | benchmark::DoNotOptimize(output); 40 | } 41 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 42 | } 43 | BENCHMARK(fp16_ieee_to_fp32_bits)->RangeMultiplier(2)->Range(1<<10, 64<<20); 44 | 45 | static 
void fp16_ieee_to_fp32_value(benchmark::State& state) { 46 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 47 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 48 | 49 | std::vector fp16(state.range(0)); 50 | std::vector fp32(state.range(0)); 51 | std::generate(fp16.begin(), fp16.end(), 52 | [&rng]{ return fp16_ieee_from_fp32_value(rng()); }); 53 | 54 | while (state.KeepRunning()) { 55 | uint16_t* input = fp16.data(); 56 | benchmark::DoNotOptimize(input); 57 | 58 | float* output = fp32.data(); 59 | const size_t n = state.range(0); 60 | for (size_t i = 0; i < n; i++) { 61 | output[i] = fp16_ieee_to_fp32_value(input[i]); 62 | } 63 | 64 | benchmark::DoNotOptimize(output); 65 | } 66 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 67 | } 68 | BENCHMARK(fp16_ieee_to_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20); 69 | 70 | #ifdef FP16_COMPARATIVE_BENCHMARKS 71 | static void TH_halfbits2float(benchmark::State& state) { 72 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 73 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 74 | 75 | std::vector fp16(state.range(0)); 76 | std::vector fp32(state.range(0)); 77 | std::generate(fp16.begin(), fp16.end(), 78 | [&rng]{ return fp16_ieee_from_fp32_value(rng()); }); 79 | 80 | while (state.KeepRunning()) { 81 | uint16_t* input = fp16.data(); 82 | benchmark::DoNotOptimize(input); 83 | 84 | float* output = fp32.data(); 85 | const size_t n = state.range(0); 86 | for (size_t i = 0; i < n; i++) { 87 | TH_halfbits2float(&input[i], &output[i]); 88 | } 89 | 90 | benchmark::DoNotOptimize(output); 91 | } 92 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 93 | } 94 | BENCHMARK(TH_halfbits2float)->RangeMultiplier(2)->Range(1<<10, 64<<20); 95 | 96 | static void npy_halfbits_to_floatbits(benchmark::State& state) { 97 
| const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 98 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 99 | 100 | std::vector fp16(state.range(0)); 101 | std::vector fp32(state.range(0)); 102 | std::generate(fp16.begin(), fp16.end(), 103 | [&rng]{ return fp16_ieee_from_fp32_value(rng()); }); 104 | 105 | while (state.KeepRunning()) { 106 | uint16_t* input = fp16.data(); 107 | benchmark::DoNotOptimize(input); 108 | 109 | uint32_t* output = fp32.data(); 110 | const size_t n = state.range(0); 111 | for (size_t i = 0; i < n; i++) { 112 | output[i] = npy_halfbits_to_floatbits(input[i]); 113 | } 114 | 115 | benchmark::DoNotOptimize(output); 116 | } 117 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 118 | } 119 | BENCHMARK(npy_halfbits_to_floatbits)->RangeMultiplier(2)->Range(1<<10, 64<<20); 120 | 121 | static void Eigen_half_to_float(benchmark::State& state) { 122 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 123 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 124 | 125 | std::vector fp16(state.range(0)); 126 | std::vector fp32(state.range(0)); 127 | std::generate(fp16.begin(), fp16.end(), 128 | [&rng]{ return fp16_ieee_from_fp32_value(rng()); }); 129 | 130 | while (state.KeepRunning()) { 131 | uint16_t* input = fp16.data(); 132 | benchmark::DoNotOptimize(input); 133 | 134 | float* output = fp32.data(); 135 | const size_t n = state.range(0); 136 | for (size_t i = 0; i < n; i++) { 137 | output[i] = 138 | Eigen::half_impl::half_to_float( 139 | Eigen::half_impl::raw_uint16_to_half(input[i])); 140 | } 141 | 142 | benchmark::DoNotOptimize(output); 143 | } 144 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 145 | } 146 | BENCHMARK(Eigen_half_to_float)->RangeMultiplier(2)->Range(1<<10, 64<<20); 147 | 148 | static void 
Float16Compressor_decompress(benchmark::State& state) { 149 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 150 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 151 | 152 | std::vector fp16(state.range(0)); 153 | std::vector fp32(state.range(0)); 154 | std::generate(fp16.begin(), fp16.end(), 155 | [&rng]{ return fp16_ieee_from_fp32_value(rng()); }); 156 | 157 | while (state.KeepRunning()) { 158 | uint16_t* input = fp16.data(); 159 | benchmark::DoNotOptimize(input); 160 | 161 | float* output = fp32.data(); 162 | const size_t n = state.range(0); 163 | for (size_t i = 0; i < n; i++) { 164 | output[i] = Float16Compressor::decompress(input[i]); 165 | } 166 | 167 | benchmark::DoNotOptimize(output); 168 | } 169 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 170 | } 171 | BENCHMARK(Float16Compressor_decompress)->RangeMultiplier(2)->Range(1<<10, 64<<20); 172 | 173 | static void half_float_detail_half2float_table(benchmark::State& state) { 174 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 175 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 176 | 177 | std::vector fp16(state.range(0)); 178 | std::vector fp32(state.range(0)); 179 | std::generate(fp16.begin(), fp16.end(), 180 | [&rng]{ return fp16_ieee_from_fp32_value(rng()); }); 181 | 182 | while (state.KeepRunning()) { 183 | uint16_t* input = fp16.data(); 184 | benchmark::DoNotOptimize(input); 185 | 186 | float* output = fp32.data(); 187 | const size_t n = state.range(0); 188 | for (size_t i = 0; i < n; i++) { 189 | output[i] = half_float::detail::half2float_impl(input[i], 190 | half_float::detail::true_type()); 191 | } 192 | 193 | benchmark::DoNotOptimize(output); 194 | } 195 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 196 | } 197 | 
BENCHMARK(half_float_detail_half2float_table)->RangeMultiplier(2)->Range(1<<10, 64<<20); 198 | 199 | static void half_float_detail_half2float_branch(benchmark::State& state) { 200 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 201 | auto rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), std::mt19937(seed)); 202 | 203 | std::vector fp16(state.range(0)); 204 | std::vector fp32(state.range(0)); 205 | std::generate(fp16.begin(), fp16.end(), 206 | [&rng]{ return fp16_ieee_from_fp32_value(rng()); }); 207 | 208 | while (state.KeepRunning()) { 209 | uint16_t* input = fp16.data(); 210 | benchmark::DoNotOptimize(input); 211 | 212 | float* output = fp32.data(); 213 | const size_t n = state.range(0); 214 | for (size_t i = 0; i < n; i++) { 215 | output[i] = half_float::detail::half2float_impl(input[i], 216 | half_float::detail::false_type()); 217 | } 218 | 219 | benchmark::DoNotOptimize(output); 220 | } 221 | state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0))); 222 | } 223 | BENCHMARK(half_float_detail_half2float_branch)->RangeMultiplier(2)->Range(1<<10, 64<<20); 224 | #endif 225 | 226 | BENCHMARK_MAIN(); 227 | -------------------------------------------------------------------------------- /third-party/npy-halffloat.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This implementation is extracted from numpy: 3 | * Repo: github.com/numpy/numpy 4 | * File: numpy/core/src/npymath/halffloat.c 5 | * Commit ID: 25c23f1d956104a072a95355ffaa7a38b53710b7 6 | * Functions are made "static inline" for performance, and 7 | * non-conversion functions are removed, and generation of 8 | * exceptions is disabled. 9 | */ 10 | 11 | #include 12 | typedef uint16_t npy_uint16; 13 | typedef uint32_t npy_uint32; 14 | typedef uint64_t npy_uint64; 15 | 16 | /* 17 | * This chooses between 'ties to even' and 'ties away from zero'. 
18 | */ 19 | #define NPY_HALF_ROUND_TIES_TO_EVEN 1 20 | /* 21 | * If these are 1, the conversions try to trigger underflow, 22 | * overflow, and invalid exceptions in the FP system when needed. 23 | */ 24 | #define NPY_HALF_GENERATE_OVERFLOW 0 25 | #define NPY_HALF_GENERATE_UNDERFLOW 0 26 | #define NPY_HALF_GENERATE_INVALID 0 27 | 28 | /* 29 | ******************************************************************** 30 | * BIT-LEVEL CONVERSIONS * 31 | ******************************************************************** 32 | */ 33 | 34 | static inline npy_uint16 npy_floatbits_to_halfbits(npy_uint32 f) 35 | { 36 | npy_uint32 f_exp, f_sig; 37 | npy_uint16 h_sgn, h_exp, h_sig; 38 | 39 | h_sgn = (npy_uint16) ((f&0x80000000u) >> 16); 40 | f_exp = (f&0x7f800000u); 41 | 42 | /* Exponent overflow/NaN converts to signed inf/NaN */ 43 | if (f_exp >= 0x47800000u) { 44 | if (f_exp == 0x7f800000u) { 45 | /* Inf or NaN */ 46 | f_sig = (f&0x007fffffu); 47 | if (f_sig != 0) { 48 | /* NaN - propagate the flag in the significand... */ 49 | npy_uint16 ret = (npy_uint16) (0x7c00u + (f_sig >> 13)); 50 | /* ...but make sure it stays a NaN */ 51 | if (ret == 0x7c00u) { 52 | ret++; 53 | } 54 | return h_sgn + ret; 55 | } else { 56 | /* signed inf */ 57 | return (npy_uint16) (h_sgn + 0x7c00u); 58 | } 59 | } else { 60 | /* overflow to signed inf */ 61 | #if NPY_HALF_GENERATE_OVERFLOW 62 | npy_set_floatstatus_overflow(); 63 | #endif 64 | return (npy_uint16) (h_sgn + 0x7c00u); 65 | } 66 | } 67 | 68 | /* Exponent underflow converts to a subnormal half or signed zero */ 69 | if (f_exp <= 0x38000000u) { 70 | /* 71 | * Signed zeros, subnormal floats, and floats with small 72 | * exponents all convert to signed zero halfs. 
73 | */ 74 | if (f_exp < 0x33000000u) { 75 | #if NPY_HALF_GENERATE_UNDERFLOW 76 | /* If f != 0, it underflowed to 0 */ 77 | if ((f&0x7fffffff) != 0) { 78 | npy_set_floatstatus_underflow(); 79 | } 80 | #endif 81 | return h_sgn; 82 | } 83 | /* Make the subnormal significand */ 84 | f_exp >>= 23; 85 | f_sig = (0x00800000u + (f&0x007fffffu)); 86 | #if NPY_HALF_GENERATE_UNDERFLOW 87 | /* If it's not exactly represented, it underflowed */ 88 | if ((f_sig&(((npy_uint32)1 << (126 - f_exp)) - 1)) != 0) { 89 | npy_set_floatstatus_underflow(); 90 | } 91 | #endif 92 | f_sig >>= (113 - f_exp); 93 | /* Handle rounding by adding 1 to the bit beyond half precision */ 94 | #if NPY_HALF_ROUND_TIES_TO_EVEN 95 | /* 96 | * If the last bit in the half significand is 0 (already even), and 97 | * the remaining bit pattern is 1000...0, then we do not add one 98 | * to the bit after the half significand. In all other cases, we do. 99 | */ 100 | if ((f_sig&0x00003fffu) != 0x00001000u) { 101 | f_sig += 0x00001000u; 102 | } 103 | #else 104 | f_sig += 0x00001000u; 105 | #endif 106 | h_sig = (npy_uint16) (f_sig >> 13); 107 | /* 108 | * If the rounding causes a bit to spill into h_exp, it will 109 | * increment h_exp from zero to one and h_sig will be zero. 110 | * This is the correct result. 111 | */ 112 | return (npy_uint16) (h_sgn + h_sig); 113 | } 114 | 115 | /* Regular case with no overflow or underflow */ 116 | h_exp = (npy_uint16) ((f_exp - 0x38000000u) >> 13); 117 | /* Handle rounding by adding 1 to the bit beyond half precision */ 118 | f_sig = (f&0x007fffffu); 119 | #if NPY_HALF_ROUND_TIES_TO_EVEN 120 | /* 121 | * If the last bit in the half significand is 0 (already even), and 122 | * the remaining bit pattern is 1000...0, then we do not add one 123 | * to the bit after the half significand. In all other cases, we do. 
124 | */ 125 | if ((f_sig&0x00003fffu) != 0x00001000u) { 126 | f_sig += 0x00001000u; 127 | } 128 | #else 129 | f_sig += 0x00001000u; 130 | #endif 131 | h_sig = (npy_uint16) (f_sig >> 13); 132 | /* 133 | * If the rounding causes a bit to spill into h_exp, it will 134 | * increment h_exp by one and h_sig will be zero. This is the 135 | * correct result. h_exp may increment to 15, at greatest, in 136 | * which case the result overflows to a signed inf. 137 | */ 138 | #if NPY_HALF_GENERATE_OVERFLOW 139 | h_sig += h_exp; 140 | if (h_sig == 0x7c00u) { 141 | npy_set_floatstatus_overflow(); 142 | } 143 | return h_sgn + h_sig; 144 | #else 145 | return h_sgn + h_exp + h_sig; 146 | #endif 147 | } 148 | 149 | static inline npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d) 150 | { 151 | npy_uint64 d_exp, d_sig; 152 | npy_uint16 h_sgn, h_exp, h_sig; 153 | 154 | h_sgn = (d&0x8000000000000000ULL) >> 48; 155 | d_exp = (d&0x7ff0000000000000ULL); 156 | 157 | /* Exponent overflow/NaN converts to signed inf/NaN */ 158 | if (d_exp >= 0x40f0000000000000ULL) { 159 | if (d_exp == 0x7ff0000000000000ULL) { 160 | /* Inf or NaN */ 161 | d_sig = (d&0x000fffffffffffffULL); 162 | if (d_sig != 0) { 163 | /* NaN - propagate the flag in the significand... */ 164 | npy_uint16 ret = (npy_uint16) (0x7c00u + (d_sig >> 42)); 165 | /* ...but make sure it stays a NaN */ 166 | if (ret == 0x7c00u) { 167 | ret++; 168 | } 169 | return h_sgn + ret; 170 | } else { 171 | /* signed inf */ 172 | return h_sgn + 0x7c00u; 173 | } 174 | } else { 175 | /* overflow to signed inf */ 176 | #if NPY_HALF_GENERATE_OVERFLOW 177 | npy_set_floatstatus_overflow(); 178 | #endif 179 | return h_sgn + 0x7c00u; 180 | } 181 | } 182 | 183 | /* Exponent underflow converts to subnormal half or signed zero */ 184 | if (d_exp <= 0x3f00000000000000ULL) { 185 | /* 186 | * Signed zeros, subnormal floats, and floats with small 187 | * exponents all convert to signed zero halfs. 
188 | */ 189 | if (d_exp < 0x3e60000000000000ULL) { 190 | #if NPY_HALF_GENERATE_UNDERFLOW 191 | /* If d != 0, it underflowed to 0 */ 192 | if ((d&0x7fffffffffffffffULL) != 0) { 193 | npy_set_floatstatus_underflow(); 194 | } 195 | #endif 196 | return h_sgn; 197 | } 198 | /* Make the subnormal significand */ 199 | d_exp >>= 52; 200 | d_sig = (0x0010000000000000ULL + (d&0x000fffffffffffffULL)); 201 | #if NPY_HALF_GENERATE_UNDERFLOW 202 | /* If it's not exactly represented, it underflowed */ 203 | if ((d_sig&(((npy_uint64)1 << (1051 - d_exp)) - 1)) != 0) { 204 | npy_set_floatstatus_underflow(); 205 | } 206 | #endif 207 | d_sig >>= (1009 - d_exp); 208 | /* Handle rounding by adding 1 to the bit beyond half precision */ 209 | #if NPY_HALF_ROUND_TIES_TO_EVEN 210 | /* 211 | * If the last bit in the half significand is 0 (already even), and 212 | * the remaining bit pattern is 1000...0, then we do not add one 213 | * to the bit after the half significand. In all other cases, we do. 214 | */ 215 | if ((d_sig&0x000007ffffffffffULL) != 0x0000020000000000ULL) { 216 | d_sig += 0x0000020000000000ULL; 217 | } 218 | #else 219 | d_sig += 0x0000020000000000ULL; 220 | #endif 221 | h_sig = (npy_uint16) (d_sig >> 42); 222 | /* 223 | * If the rounding causes a bit to spill into h_exp, it will 224 | * increment h_exp from zero to one and h_sig will be zero. 225 | * This is the correct result. 226 | */ 227 | return h_sgn + h_sig; 228 | } 229 | 230 | /* Regular case with no overflow or underflow */ 231 | h_exp = (npy_uint16) ((d_exp - 0x3f00000000000000ULL) >> 42); 232 | /* Handle rounding by adding 1 to the bit beyond half precision */ 233 | d_sig = (d&0x000fffffffffffffULL); 234 | #if NPY_HALF_ROUND_TIES_TO_EVEN 235 | /* 236 | * If the last bit in the half significand is 0 (already even), and 237 | * the remaining bit pattern is 1000...0, then we do not add one 238 | * to the bit after the half significand. In all other cases, we do. 
239 | */ 240 | if ((d_sig&0x000007ffffffffffULL) != 0x0000020000000000ULL) { 241 | d_sig += 0x0000020000000000ULL; 242 | } 243 | #else 244 | d_sig += 0x0000020000000000ULL; 245 | #endif 246 | h_sig = (npy_uint16) (d_sig >> 42); 247 | 248 | /* 249 | * If the rounding causes a bit to spill into h_exp, it will 250 | * increment h_exp by one and h_sig will be zero. This is the 251 | * correct result. h_exp may increment to 15, at greatest, in 252 | * which case the result overflows to a signed inf. 253 | */ 254 | #if NPY_HALF_GENERATE_OVERFLOW 255 | h_sig += h_exp; 256 | if (h_sig == 0x7c00u) { 257 | npy_set_floatstatus_overflow(); 258 | } 259 | return h_sgn + h_sig; 260 | #else 261 | return h_sgn + h_exp + h_sig; 262 | #endif 263 | } 264 | 265 | static inline npy_uint32 npy_halfbits_to_floatbits(npy_uint16 h) 266 | { 267 | npy_uint16 h_exp, h_sig; 268 | npy_uint32 f_sgn, f_exp, f_sig; 269 | 270 | h_exp = (h&0x7c00u); 271 | f_sgn = ((npy_uint32)h&0x8000u) << 16; 272 | switch (h_exp) { 273 | case 0x0000u: /* 0 or subnormal */ 274 | h_sig = (h&0x03ffu); 275 | /* Signed zero */ 276 | if (h_sig == 0) { 277 | return f_sgn; 278 | } 279 | /* Subnormal */ 280 | h_sig <<= 1; 281 | while ((h_sig&0x0400u) == 0) { 282 | h_sig <<= 1; 283 | h_exp++; 284 | } 285 | f_exp = ((npy_uint32)(127 - 15 - h_exp)) << 23; 286 | f_sig = ((npy_uint32)(h_sig&0x03ffu)) << 13; 287 | return f_sgn + f_exp + f_sig; 288 | case 0x7c00u: /* inf or NaN */ 289 | /* All-ones exponent and a copy of the significand */ 290 | return f_sgn + 0x7f800000u + (((npy_uint32)(h&0x03ffu)) << 13); 291 | default: /* normalized */ 292 | /* Just need to adjust the exponent and shift */ 293 | return f_sgn + (((npy_uint32)(h&0x7fffu) + 0x1c000u) << 13); 294 | } 295 | } 296 | 297 | static inline npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h) 298 | { 299 | npy_uint16 h_exp, h_sig; 300 | npy_uint64 d_sgn, d_exp, d_sig; 301 | 302 | h_exp = (h&0x7c00u); 303 | d_sgn = ((npy_uint64)h&0x8000u) << 48; 304 | switch (h_exp) { 305 | 
case 0x0000u: /* 0 or subnormal */ 306 | h_sig = (h&0x03ffu); 307 | /* Signed zero */ 308 | if (h_sig == 0) { 309 | return d_sgn; 310 | } 311 | /* Subnormal */ 312 | h_sig <<= 1; 313 | while ((h_sig&0x0400u) == 0) { 314 | h_sig <<= 1; 315 | h_exp++; 316 | } 317 | d_exp = ((npy_uint64)(1023 - 15 - h_exp)) << 52; 318 | d_sig = ((npy_uint64)(h_sig&0x03ffu)) << 42; 319 | return d_sgn + d_exp + d_sig; 320 | case 0x7c00u: /* inf or NaN */ 321 | /* All-ones exponent and a copy of the significand */ 322 | return d_sgn + 0x7ff0000000000000ULL + 323 | (((npy_uint64)(h&0x03ffu)) << 42); 324 | default: /* normalized */ 325 | /* Just need to adjust the exponent and shift */ 326 | return d_sgn + (((npy_uint64)(h&0x7fffu) + 0xfc000u) << 42); 327 | } 328 | } 329 | -------------------------------------------------------------------------------- /test/alt-to-fp32-bits.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | 11 | TEST(FP16_ALT_TO_FP32_BITS, normalized_powers_of_2) { 12 | const uint16_t min_po2_f16 = UINT16_C(0x0400); 13 | const uint16_t eighths_f16 = UINT16_C(0x3000); 14 | const uint16_t quarter_f16 = UINT16_C(0x3400); 15 | const uint16_t half_f16 = UINT16_C(0x3800); 16 | const uint16_t one_f16 = UINT16_C(0x3C00); 17 | const uint16_t two_f16 = UINT16_C(0x4000); 18 | const uint16_t four_f16 = UINT16_C(0x4400); 19 | const uint16_t eight_f16 = UINT16_C(0x4800); 20 | const uint16_t sixteen_f16 = UINT16_C(0x4C00); 21 | const uint16_t thirtytwo_f16 = UINT16_C(0x5000); 22 | const uint16_t sixtyfour_f16 = UINT16_C(0x5400); 23 | const uint16_t max_po2_f16 = UINT16_C(0x7C00); 24 | 25 | const uint32_t min_po2_f32 = UINT32_C(0x38800000); 26 | const uint32_t eighths_f32 = UINT32_C(0x3E000000); 27 | const uint32_t quarter_f32 = UINT32_C(0x3E800000); 28 | const uint32_t half_f32 = UINT32_C(0x3F000000); 29 | const uint32_t one_f32 = UINT32_C(0x3F800000); 30 | 
const uint32_t two_f32 = UINT32_C(0x40000000); 31 | const uint32_t four_f32 = UINT32_C(0x40800000); 32 | const uint32_t eight_f32 = UINT32_C(0x41000000); 33 | const uint32_t sixteen_f32 = UINT32_C(0x41800000); 34 | const uint32_t thirtytwo_f32 = UINT32_C(0x42000000); 35 | const uint32_t sixtyfour_f32 = UINT32_C(0x42800000); 36 | const uint32_t max_po2_f32 = UINT32_C(0x47800000); 37 | 38 | EXPECT_EQ(min_po2_f32, fp16_alt_to_fp32_bits(min_po2_f16)) << 39 | std::hex << std::uppercase << std::setfill('0') << 40 | "F16 = 0x" << std::setw(4) << min_po2_f16 << ", " << 41 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(min_po2_f16) << ", " << 42 | "F32 = 0x" << std::setw(8) << min_po2_f32; 43 | 44 | EXPECT_EQ(eighths_f32, fp16_alt_to_fp32_bits(eighths_f16)) << 45 | std::hex << std::uppercase << std::setfill('0') << 46 | "F16 = 0x" << std::setw(4) << eighths_f16 << ", " << 47 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(eighths_f16) << ", " << 48 | "F32 = 0x" << std::setw(8) << eighths_f32; 49 | 50 | EXPECT_EQ(quarter_f32, fp16_alt_to_fp32_bits(quarter_f16)) << 51 | std::hex << std::uppercase << std::setfill('0') << 52 | "F16 = 0x" << std::setw(4) << quarter_f16 << ", " << 53 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(quarter_f16) << ", " << 54 | "F32 = 0x" << std::setw(8) << quarter_f32; 55 | 56 | EXPECT_EQ(half_f32, fp16_alt_to_fp32_bits(half_f16)) << 57 | std::hex << std::uppercase << std::setfill('0') << 58 | "F16 = 0x" << std::setw(4) << half_f16 << ", " << 59 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(half_f16) << ", " << 60 | "F32 = 0x" << std::setw(8) << half_f32; 61 | 62 | EXPECT_EQ(one_f32, fp16_alt_to_fp32_bits(one_f16)) << 63 | std::hex << std::uppercase << std::setfill('0') << 64 | "F16 = 0x" << std::setw(4) << one_f16 << ", " << 65 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(one_f16) << ", " << 66 | "F32 = 0x" << std::setw(8) << one_f32; 67 | 68 | EXPECT_EQ(two_f32, 
fp16_alt_to_fp32_bits(two_f16)) << 69 | std::hex << std::uppercase << std::setfill('0') << 70 | "F16 = 0x" << std::setw(4) << two_f16 << ", " << 71 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(two_f16) << ", " << 72 | "F32 = 0x" << std::setw(8) << two_f32; 73 | 74 | EXPECT_EQ(four_f32, fp16_alt_to_fp32_bits(four_f16)) << 75 | std::hex << std::uppercase << std::setfill('0') << 76 | "F16 = 0x" << std::setw(4) << four_f16 << ", " << 77 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(four_f16) << ", " << 78 | "F32 = 0x" << std::setw(8) << four_f32; 79 | 80 | EXPECT_EQ(eight_f32, fp16_alt_to_fp32_bits(eight_f16)) << 81 | std::hex << std::uppercase << std::setfill('0') << 82 | "F16 = 0x" << std::setw(4) << eight_f16 << ", " << 83 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(eight_f16) << ", " << 84 | "F32 = 0x" << std::setw(8) << eight_f32; 85 | 86 | EXPECT_EQ(sixteen_f32, fp16_alt_to_fp32_bits(sixteen_f16)) << 87 | std::hex << std::uppercase << std::setfill('0') << 88 | "F16 = 0x" << std::setw(4) << sixteen_f16 << ", " << 89 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(sixteen_f16) << ", " << 90 | "F32 = 0x" << std::setw(8) << sixteen_f32; 91 | 92 | EXPECT_EQ(thirtytwo_f32, fp16_alt_to_fp32_bits(thirtytwo_f16)) << 93 | std::hex << std::uppercase << std::setfill('0') << 94 | "F16 = 0x" << std::setw(4) << thirtytwo_f16 << ", " << 95 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(thirtytwo_f16) << ", " << 96 | "F32 = 0x" << std::setw(8) << thirtytwo_f32; 97 | 98 | EXPECT_EQ(sixtyfour_f32, fp16_alt_to_fp32_bits(sixtyfour_f16)) << 99 | std::hex << std::uppercase << std::setfill('0') << 100 | "F16 = 0x" << std::setw(4) << sixtyfour_f16 << ", " << 101 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(sixtyfour_f16) << ", " << 102 | "F32 = 0x" << std::setw(8) << sixtyfour_f32; 103 | 104 | EXPECT_EQ(max_po2_f32, fp16_alt_to_fp32_bits(max_po2_f16)) << 105 | std::hex << std::uppercase << std::setfill('0') << 
106 | "F16 = 0x" << std::setw(4) << max_po2_f16 << ", " << 107 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(max_po2_f16) << ", " << 108 | "F32 = 0x" << std::setw(8) << max_po2_f32; 109 | } 110 | 111 | TEST(FP16_ALT_TO_FP32_BITS, denormalized_powers_of_2) { 112 | const uint16_t exp2_minus_15_f16 = UINT16_C(0x0200); 113 | const uint16_t exp2_minus_16_f16 = UINT16_C(0x0100); 114 | const uint16_t exp2_minus_17_f16 = UINT16_C(0x0080); 115 | const uint16_t exp2_minus_18_f16 = UINT16_C(0x0040); 116 | const uint16_t exp2_minus_19_f16 = UINT16_C(0x0020); 117 | const uint16_t exp2_minus_20_f16 = UINT16_C(0x0010); 118 | const uint16_t exp2_minus_21_f16 = UINT16_C(0x0008); 119 | const uint16_t exp2_minus_22_f16 = UINT16_C(0x0004); 120 | const uint16_t exp2_minus_23_f16 = UINT16_C(0x0002); 121 | const uint16_t exp2_minus_24_f16 = UINT16_C(0x0001); 122 | 123 | const uint32_t exp2_minus_15_f32 = UINT32_C(0x38000000); 124 | const uint32_t exp2_minus_16_f32 = UINT32_C(0x37800000); 125 | const uint32_t exp2_minus_17_f32 = UINT32_C(0x37000000); 126 | const uint32_t exp2_minus_18_f32 = UINT32_C(0x36800000); 127 | const uint32_t exp2_minus_19_f32 = UINT32_C(0x36000000); 128 | const uint32_t exp2_minus_20_f32 = UINT32_C(0x35800000); 129 | const uint32_t exp2_minus_21_f32 = UINT32_C(0x35000000); 130 | const uint32_t exp2_minus_22_f32 = UINT32_C(0x34800000); 131 | const uint32_t exp2_minus_23_f32 = UINT32_C(0x34000000); 132 | const uint32_t exp2_minus_24_f32 = UINT32_C(0x33800000); 133 | 134 | EXPECT_EQ(exp2_minus_15_f32, fp16_alt_to_fp32_bits(exp2_minus_15_f16)) << 135 | std::hex << std::uppercase << std::setfill('0') << 136 | "F16 = 0x" << std::setw(4) << exp2_minus_15_f16 << ", " << 137 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(exp2_minus_15_f16) << ", " << 138 | "F32 = 0x" << std::setw(8) << exp2_minus_15_f32; 139 | 140 | EXPECT_EQ(exp2_minus_16_f32, fp16_alt_to_fp32_bits(exp2_minus_16_f16)) << 141 | std::hex << std::uppercase << std::setfill('0') << 142 
| "F16 = 0x" << std::setw(4) << exp2_minus_16_f16 << ", " << 143 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(exp2_minus_16_f16) << ", " << 144 | "F32 = 0x" << std::setw(8) << exp2_minus_16_f32; 145 | 146 | EXPECT_EQ(exp2_minus_17_f32, fp16_alt_to_fp32_bits(exp2_minus_17_f16)) << 147 | std::hex << std::uppercase << std::setfill('0') << 148 | "F16 = 0x" << std::setw(4) << exp2_minus_17_f16 << ", " << 149 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(exp2_minus_17_f16) << ", " << 150 | "F32 = 0x" << std::setw(8) << exp2_minus_17_f32; 151 | 152 | EXPECT_EQ(exp2_minus_18_f32, fp16_alt_to_fp32_bits(exp2_minus_18_f16)) << 153 | std::hex << std::uppercase << std::setfill('0') << 154 | "F16 = 0x" << std::setw(4) << exp2_minus_18_f16 << ", " << 155 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(exp2_minus_18_f16) << ", " << 156 | "F32 = 0x" << std::setw(8) << exp2_minus_18_f32; 157 | 158 | EXPECT_EQ(exp2_minus_19_f32, fp16_alt_to_fp32_bits(exp2_minus_19_f16)) << 159 | std::hex << std::uppercase << std::setfill('0') << 160 | "F16 = 0x" << std::setw(4) << exp2_minus_19_f16 << ", " << 161 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(exp2_minus_19_f16) << ", " << 162 | "F32 = 0x" << std::setw(8) << exp2_minus_19_f32; 163 | 164 | EXPECT_EQ(exp2_minus_20_f32, fp16_alt_to_fp32_bits(exp2_minus_20_f16)) << 165 | std::hex << std::uppercase << std::setfill('0') << 166 | "F16 = 0x" << std::setw(4) << exp2_minus_20_f16 << ", " << 167 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(exp2_minus_20_f16) << ", " << 168 | "F32 = 0x" << std::setw(8) << exp2_minus_20_f32; 169 | 170 | EXPECT_EQ(exp2_minus_21_f32, fp16_alt_to_fp32_bits(exp2_minus_21_f16)) << 171 | std::hex << std::uppercase << std::setfill('0') << 172 | "F16 = 0x" << std::setw(4) << exp2_minus_21_f16 << ", " << 173 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(exp2_minus_21_f16) << ", " << 174 | "F32 = 0x" << std::setw(8) << exp2_minus_21_f32; 175 | 176 
| EXPECT_EQ(exp2_minus_22_f32, fp16_alt_to_fp32_bits(exp2_minus_22_f16)) << 177 | std::hex << std::uppercase << std::setfill('0') << 178 | "F16 = 0x" << std::setw(4) << exp2_minus_22_f16 << ", " << 179 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(exp2_minus_22_f16) << ", " << 180 | "F32 = 0x" << std::setw(8) << exp2_minus_22_f32; 181 | 182 | EXPECT_EQ(exp2_minus_23_f32, fp16_alt_to_fp32_bits(exp2_minus_23_f16)) << 183 | std::hex << std::uppercase << std::setfill('0') << 184 | "F16 = 0x" << std::setw(4) << exp2_minus_23_f16 << ", " << 185 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(exp2_minus_23_f16) << ", " << 186 | "F32 = 0x" << std::setw(8) << exp2_minus_23_f32; 187 | 188 | EXPECT_EQ(exp2_minus_24_f32, fp16_alt_to_fp32_bits(exp2_minus_24_f16)) << 189 | std::hex << std::uppercase << std::setfill('0') << 190 | "F16 = 0x" << std::setw(4) << exp2_minus_24_f16 << ", " << 191 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(exp2_minus_24_f16) << ", " << 192 | "F32 = 0x" << std::setw(8) << exp2_minus_24_f32; 193 | } 194 | 195 | TEST(FP16_ALT_TO_FP32_BITS, zero) { 196 | const uint16_t positive_zero_f16 = UINT16_C(0x0000); 197 | const uint16_t negative_zero_f16 = UINT16_C(0x8000); 198 | 199 | const uint32_t positive_zero_f32 = UINT32_C(0x00000000); 200 | const uint32_t negative_zero_f32 = UINT32_C(0x80000000); 201 | 202 | EXPECT_EQ(positive_zero_f32, fp16_alt_to_fp32_bits(positive_zero_f16)) << 203 | std::hex << std::uppercase << std::setfill('0') << 204 | "F16 = 0x" << std::setw(4) << positive_zero_f16 << ", " << 205 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(positive_zero_f16) << ", " << 206 | "F32 = 0x" << std::setw(8) << positive_zero_f32; 207 | 208 | EXPECT_EQ(negative_zero_f32, fp16_alt_to_fp32_bits(negative_zero_f16)) << 209 | std::hex << std::uppercase << std::setfill('0') << 210 | "F16 = 0x" << std::setw(4) << negative_zero_f16 << ", " << 211 | "F32(F16) = 0x" << std::setw(8) << 
fp16_alt_to_fp32_bits(negative_zero_f16) << ", " << 212 | "F32 = 0x" << std::setw(8) << negative_zero_f32; 213 | } 214 | 215 | TEST(FP16_ALT_TO_FP32_BITS, positive_normalized_values) { 216 | const uint32_t exponentBias = 15; 217 | for (int32_t e = -14; e <= 16; e++) { 218 | for (uint16_t h = 0; h < 0x0400; h++) { 219 | const uint16_t fp16 = h + ((uint16_t) (e + exponentBias) << 10); 220 | const uint32_t fp32 = fp16::normalizedValues[h] + ((uint32_t) e << 23); 221 | EXPECT_EQ(fp32, fp16_alt_to_fp32_bits(fp16)) << 222 | std::hex << std::uppercase << std::setfill('0') << 223 | "F16 = 0x" << std::setw(4) << fp16 << ", " << 224 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(fp16) << ", " << 225 | "F32 = 0x" << std::setw(8) << fp32; 226 | } 227 | } 228 | } 229 | 230 | TEST(FP16_ALT_TO_FP32_BITS, negative_normalized_values) { 231 | const uint32_t exponentBias = 15; 232 | for (int32_t e = -14; e <= 16; e++) { 233 | for (uint16_t h = 0; h < 0x0400; h++) { 234 | const uint16_t fp16 = (h + ((uint16_t) (e + exponentBias) << 10)) ^ UINT16_C(0x8000); 235 | const uint32_t fp32 = (fp16::normalizedValues[h] + ((uint32_t) e << 23)) ^ UINT32_C(0x80000000); 236 | EXPECT_EQ(fp32, fp16_alt_to_fp32_bits(fp16)) << 237 | std::hex << std::uppercase << std::setfill('0') << 238 | "F16 = 0x" << std::setw(4) << fp16 << ", " << 239 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(fp16) << ", " << 240 | "F32 = 0x" << std::setw(8) << fp32; 241 | } 242 | } 243 | } 244 | 245 | TEST(FP16_ALT_TO_FP32_BITS, positive_denormalized_values) { 246 | for (uint16_t h = 0; h < 0x0400; h++) { 247 | EXPECT_EQ(fp16::denormalizedValues[h], fp16_alt_to_fp32_bits(h)) << 248 | std::hex << std::uppercase << std::setfill('0') << 249 | "F16 = 0x" << std::setw(4) << h << ", " << 250 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(h) << ", " << 251 | "F32 = 0x" << std::setw(8) << fp16::denormalizedValues[h]; 252 | } 253 | } 254 | 255 | TEST(FP16_ALT_TO_FP32_BITS, 
negative_denormalized_values) { 256 | for (uint16_t h = 0; h < 0x0400; h++) { 257 | const uint16_t fp16 = h ^ UINT16_C(0x8000); 258 | const uint32_t fp32 = fp16::denormalizedValues[h] ^ UINT32_C(0x80000000); 259 | EXPECT_EQ(fp32, fp16_alt_to_fp32_bits(fp16)) << 260 | std::hex << std::uppercase << std::setfill('0') << 261 | "F16 = 0x" << std::setw(4) << fp16 << ", " << 262 | "F32(F16) = 0x" << std::setw(8) << fp16_alt_to_fp32_bits(fp16) << ", " << 263 | "F32 = 0x" << std::setw(8) << fp32; 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /test/ieee-to-fp32-bits.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | 11 | TEST(FP16_IEEE_TO_FP32_BITS, normalized_powers_of_2) { 12 | const uint16_t min_po2_f16 = UINT16_C(0x0400); 13 | const uint16_t eighths_f16 = UINT16_C(0x3000); 14 | const uint16_t quarter_f16 = UINT16_C(0x3400); 15 | const uint16_t half_f16 = UINT16_C(0x3800); 16 | const uint16_t one_f16 = UINT16_C(0x3C00); 17 | const uint16_t two_f16 = UINT16_C(0x4000); 18 | const uint16_t four_f16 = UINT16_C(0x4400); 19 | const uint16_t eight_f16 = UINT16_C(0x4800); 20 | const uint16_t sixteen_f16 = UINT16_C(0x4C00); 21 | const uint16_t thirtytwo_f16 = UINT16_C(0x5000); 22 | const uint16_t sixtyfour_f16 = UINT16_C(0x5400); 23 | const uint16_t max_po2_f16 = UINT16_C(0x7800); 24 | 25 | const uint32_t min_po2_f32 = UINT32_C(0x38800000); 26 | const uint32_t eighths_f32 = UINT32_C(0x3E000000); 27 | const uint32_t quarter_f32 = UINT32_C(0x3E800000); 28 | const uint32_t half_f32 = UINT32_C(0x3F000000); 29 | const uint32_t one_f32 = UINT32_C(0x3F800000); 30 | const uint32_t two_f32 = UINT32_C(0x40000000); 31 | const uint32_t four_f32 = UINT32_C(0x40800000); 32 | const uint32_t eight_f32 = UINT32_C(0x41000000); 33 | const uint32_t sixteen_f32 = UINT32_C(0x41800000); 34 | const uint32_t thirtytwo_f32 
= UINT32_C(0x42000000); 35 | const uint32_t sixtyfour_f32 = UINT32_C(0x42800000); 36 | const uint32_t max_po2_f32 = UINT32_C(0x47000000); 37 | 38 | EXPECT_EQ(min_po2_f32, fp16_ieee_to_fp32_bits(min_po2_f16)) << 39 | std::hex << std::uppercase << std::setfill('0') << 40 | "F16 = 0x" << std::setw(4) << min_po2_f16 << ", " << 41 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(min_po2_f16) << ", " << 42 | "F32 = 0x" << std::setw(8) << min_po2_f32; 43 | 44 | EXPECT_EQ(eighths_f32, fp16_ieee_to_fp32_bits(eighths_f16)) << 45 | std::hex << std::uppercase << std::setfill('0') << 46 | "F16 = 0x" << std::setw(4) << eighths_f16 << ", " << 47 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(eighths_f16) << ", " << 48 | "F32 = 0x" << std::setw(8) << eighths_f32; 49 | 50 | EXPECT_EQ(quarter_f32, fp16_ieee_to_fp32_bits(quarter_f16)) << 51 | std::hex << std::uppercase << std::setfill('0') << 52 | "F16 = 0x" << std::setw(4) << quarter_f16 << ", " << 53 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(quarter_f16) << ", " << 54 | "F32 = 0x" << std::setw(8) << quarter_f32; 55 | 56 | EXPECT_EQ(half_f32, fp16_ieee_to_fp32_bits(half_f16)) << 57 | std::hex << std::uppercase << std::setfill('0') << 58 | "F16 = 0x" << std::setw(4) << half_f16 << ", " << 59 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(half_f16) << ", " << 60 | "F32 = 0x" << std::setw(8) << half_f32; 61 | 62 | EXPECT_EQ(one_f32, fp16_ieee_to_fp32_bits(one_f16)) << 63 | std::hex << std::uppercase << std::setfill('0') << 64 | "F16 = 0x" << std::setw(4) << one_f16 << ", " << 65 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(one_f16) << ", " << 66 | "F32 = 0x" << std::setw(8) << one_f32; 67 | 68 | EXPECT_EQ(two_f32, fp16_ieee_to_fp32_bits(two_f16)) << 69 | std::hex << std::uppercase << std::setfill('0') << 70 | "F16 = 0x" << std::setw(4) << two_f16 << ", " << 71 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(two_f16) << ", " << 72 | "F32 = 0x" << 
std::setw(8) << two_f32; 73 | 74 | EXPECT_EQ(four_f32, fp16_ieee_to_fp32_bits(four_f16)) << 75 | std::hex << std::uppercase << std::setfill('0') << 76 | "F16 = 0x" << std::setw(4) << four_f16 << ", " << 77 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(four_f16) << ", " << 78 | "F32 = 0x" << std::setw(8) << four_f32; 79 | 80 | EXPECT_EQ(eight_f32, fp16_ieee_to_fp32_bits(eight_f16)) << 81 | std::hex << std::uppercase << std::setfill('0') << 82 | "F16 = 0x" << std::setw(4) << eight_f16 << ", " << 83 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(eight_f16) << ", " << 84 | "F32 = 0x" << std::setw(8) << eight_f32; 85 | 86 | EXPECT_EQ(sixteen_f32, fp16_ieee_to_fp32_bits(sixteen_f16)) << 87 | std::hex << std::uppercase << std::setfill('0') << 88 | "F16 = 0x" << std::setw(4) << sixteen_f16 << ", " << 89 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(sixteen_f16) << ", " << 90 | "F32 = 0x" << std::setw(8) << sixteen_f32; 91 | 92 | EXPECT_EQ(thirtytwo_f32, fp16_ieee_to_fp32_bits(thirtytwo_f16)) << 93 | std::hex << std::uppercase << std::setfill('0') << 94 | "F16 = 0x" << std::setw(4) << thirtytwo_f16 << ", " << 95 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(thirtytwo_f16) << ", " << 96 | "F32 = 0x" << std::setw(8) << thirtytwo_f32; 97 | 98 | EXPECT_EQ(sixtyfour_f32, fp16_ieee_to_fp32_bits(sixtyfour_f16)) << 99 | std::hex << std::uppercase << std::setfill('0') << 100 | "F16 = 0x" << std::setw(4) << sixtyfour_f16 << ", " << 101 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(sixtyfour_f16) << ", " << 102 | "F32 = 0x" << std::setw(8) << sixtyfour_f32; 103 | 104 | EXPECT_EQ(max_po2_f32, fp16_ieee_to_fp32_bits(max_po2_f16)) << 105 | std::hex << std::uppercase << std::setfill('0') << 106 | "F16 = 0x" << std::setw(4) << max_po2_f16 << ", " << 107 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(max_po2_f16) << ", " << 108 | "F32 = 0x" << std::setw(8) << max_po2_f32; 109 | } 110 | 111 | 
TEST(FP16_IEEE_TO_FP32_BITS, denormalized_powers_of_2) { 112 | const uint16_t exp2_minus_15_f16 = UINT16_C(0x0200); 113 | const uint16_t exp2_minus_16_f16 = UINT16_C(0x0100); 114 | const uint16_t exp2_minus_17_f16 = UINT16_C(0x0080); 115 | const uint16_t exp2_minus_18_f16 = UINT16_C(0x0040); 116 | const uint16_t exp2_minus_19_f16 = UINT16_C(0x0020); 117 | const uint16_t exp2_minus_20_f16 = UINT16_C(0x0010); 118 | const uint16_t exp2_minus_21_f16 = UINT16_C(0x0008); 119 | const uint16_t exp2_minus_22_f16 = UINT16_C(0x0004); 120 | const uint16_t exp2_minus_23_f16 = UINT16_C(0x0002); 121 | const uint16_t exp2_minus_24_f16 = UINT16_C(0x0001); 122 | 123 | const uint32_t exp2_minus_15_f32 = UINT32_C(0x38000000); 124 | const uint32_t exp2_minus_16_f32 = UINT32_C(0x37800000); 125 | const uint32_t exp2_minus_17_f32 = UINT32_C(0x37000000); 126 | const uint32_t exp2_minus_18_f32 = UINT32_C(0x36800000); 127 | const uint32_t exp2_minus_19_f32 = UINT32_C(0x36000000); 128 | const uint32_t exp2_minus_20_f32 = UINT32_C(0x35800000); 129 | const uint32_t exp2_minus_21_f32 = UINT32_C(0x35000000); 130 | const uint32_t exp2_minus_22_f32 = UINT32_C(0x34800000); 131 | const uint32_t exp2_minus_23_f32 = UINT32_C(0x34000000); 132 | const uint32_t exp2_minus_24_f32 = UINT32_C(0x33800000); 133 | 134 | EXPECT_EQ(exp2_minus_15_f32, fp16_ieee_to_fp32_bits(exp2_minus_15_f16)) << 135 | std::hex << std::uppercase << std::setfill('0') << 136 | "F16 = 0x" << std::setw(4) << exp2_minus_15_f16 << ", " << 137 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(exp2_minus_15_f16) << ", " << 138 | "F32 = 0x" << std::setw(8) << exp2_minus_15_f32; 139 | 140 | EXPECT_EQ(exp2_minus_16_f32, fp16_ieee_to_fp32_bits(exp2_minus_16_f16)) << 141 | std::hex << std::uppercase << std::setfill('0') << 142 | "F16 = 0x" << std::setw(4) << exp2_minus_16_f16 << ", " << 143 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(exp2_minus_16_f16) << ", " << 144 | "F32 = 0x" << std::setw(8) << exp2_minus_16_f32; 
145 | 146 | EXPECT_EQ(exp2_minus_17_f32, fp16_ieee_to_fp32_bits(exp2_minus_17_f16)) << 147 | std::hex << std::uppercase << std::setfill('0') << 148 | "F16 = 0x" << std::setw(4) << exp2_minus_17_f16 << ", " << 149 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(exp2_minus_17_f16) << ", " << 150 | "F32 = 0x" << std::setw(8) << exp2_minus_17_f32; 151 | 152 | EXPECT_EQ(exp2_minus_18_f32, fp16_ieee_to_fp32_bits(exp2_minus_18_f16)) << 153 | std::hex << std::uppercase << std::setfill('0') << 154 | "F16 = 0x" << std::setw(4) << exp2_minus_18_f16 << ", " << 155 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(exp2_minus_18_f16) << ", " << 156 | "F32 = 0x" << std::setw(8) << exp2_minus_18_f32; 157 | 158 | EXPECT_EQ(exp2_minus_19_f32, fp16_ieee_to_fp32_bits(exp2_minus_19_f16)) << 159 | std::hex << std::uppercase << std::setfill('0') << 160 | "F16 = 0x" << std::setw(4) << exp2_minus_19_f16 << ", " << 161 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(exp2_minus_19_f16) << ", " << 162 | "F32 = 0x" << std::setw(8) << exp2_minus_19_f32; 163 | 164 | EXPECT_EQ(exp2_minus_20_f32, fp16_ieee_to_fp32_bits(exp2_minus_20_f16)) << 165 | std::hex << std::uppercase << std::setfill('0') << 166 | "F16 = 0x" << std::setw(4) << exp2_minus_20_f16 << ", " << 167 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(exp2_minus_20_f16) << ", " << 168 | "F32 = 0x" << std::setw(8) << exp2_minus_20_f32; 169 | 170 | EXPECT_EQ(exp2_minus_21_f32, fp16_ieee_to_fp32_bits(exp2_minus_21_f16)) << 171 | std::hex << std::uppercase << std::setfill('0') << 172 | "F16 = 0x" << std::setw(4) << exp2_minus_21_f16 << ", " << 173 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(exp2_minus_21_f16) << ", " << 174 | "F32 = 0x" << std::setw(8) << exp2_minus_21_f32; 175 | 176 | EXPECT_EQ(exp2_minus_22_f32, fp16_ieee_to_fp32_bits(exp2_minus_22_f16)) << 177 | std::hex << std::uppercase << std::setfill('0') << 178 | "F16 = 0x" << std::setw(4) << exp2_minus_22_f16 << ", " << 
179 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(exp2_minus_22_f16) << ", " << 180 | "F32 = 0x" << std::setw(8) << exp2_minus_22_f32; 181 | 182 | EXPECT_EQ(exp2_minus_23_f32, fp16_ieee_to_fp32_bits(exp2_minus_23_f16)) << 183 | std::hex << std::uppercase << std::setfill('0') << 184 | "F16 = 0x" << std::setw(4) << exp2_minus_23_f16 << ", " << 185 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(exp2_minus_23_f16) << ", " << 186 | "F32 = 0x" << std::setw(8) << exp2_minus_23_f32; 187 | 188 | EXPECT_EQ(exp2_minus_24_f32, fp16_ieee_to_fp32_bits(exp2_minus_24_f16)) << 189 | std::hex << std::uppercase << std::setfill('0') << 190 | "F16 = 0x" << std::setw(4) << exp2_minus_24_f16 << ", " << 191 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(exp2_minus_24_f16) << ", " << 192 | "F32 = 0x" << std::setw(8) << exp2_minus_24_f32; 193 | } 194 | 195 | TEST(FP16_IEEE_TO_FP32_BITS, zero) { 196 | const uint16_t positive_zero_f16 = UINT16_C(0x0000); 197 | const uint16_t negative_zero_f16 = UINT16_C(0x8000); 198 | 199 | const uint32_t positive_zero_f32 = UINT32_C(0x00000000); 200 | const uint32_t negative_zero_f32 = UINT32_C(0x80000000); 201 | 202 | EXPECT_EQ(positive_zero_f32, fp16_ieee_to_fp32_bits(positive_zero_f16)) << 203 | std::hex << std::uppercase << std::setfill('0') << 204 | "F16 = 0x" << std::setw(4) << positive_zero_f16 << ", " << 205 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(positive_zero_f16) << ", " << 206 | "F32 = 0x" << std::setw(8) << positive_zero_f32; 207 | 208 | EXPECT_EQ(negative_zero_f32, fp16_ieee_to_fp32_bits(negative_zero_f16)) << 209 | std::hex << std::uppercase << std::setfill('0') << 210 | "F16 = 0x" << std::setw(4) << negative_zero_f16 << ", " << 211 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(negative_zero_f16) << ", " << 212 | "F32 = 0x" << std::setw(8) << negative_zero_f32; 213 | } 214 | 215 | TEST(FP16_IEEE_TO_FP32_BITS, infinity) { 216 | const uint16_t positive_infinity_f16 = 
UINT16_C(0x7C00); 217 | const uint16_t negative_infinity_f16 = UINT16_C(0xFC00); 218 | 219 | const uint32_t positive_infinity_f32 = UINT32_C(0x7F800000); 220 | const uint32_t negative_infinity_f32 = UINT32_C(0xFF800000); 221 | 222 | EXPECT_EQ(positive_infinity_f32, fp16_ieee_to_fp32_bits(positive_infinity_f16)) << 223 | std::hex << std::uppercase << std::setfill('0') << 224 | "F16 = 0x" << std::setw(4) << positive_infinity_f16 << ", " << 225 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(positive_infinity_f16) << ", " << 226 | "F32 = 0x" << std::setw(8) << positive_infinity_f32; 227 | 228 | EXPECT_EQ(negative_infinity_f32, fp16_ieee_to_fp32_bits(negative_infinity_f16)) << 229 | std::hex << std::uppercase << std::setfill('0') << 230 | "F16 = 0x" << std::setw(4) << negative_infinity_f16 << ", " << 231 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(negative_infinity_f16) << ", " << 232 | "F32 = 0x" << std::setw(8) << negative_infinity_f32; 233 | } 234 | /* Sweeps every F16 pattern 0x7C00 | m for m in [1, 0x03FF] (sign bit clear, exponent bits all set, non-zero mantissa) and checks that the converted F32 bits keep the sign bit clear, have the exponent field 0x7F800000 fully set, and carry a non-zero mantissa. */ 235 | TEST(FP16_IEEE_TO_FP32_BITS, positive_nan) { 236 | for (uint16_t m = UINT16_C(1); m < UINT16_C(0x0400); m++) { 237 | const uint16_t nan_f16 = UINT16_C(0x7C00) | m; 238 | const uint32_t nan_f32 = fp16_ieee_to_fp32_bits(nan_f16); 239 | 240 | /* Check sign: must stay clear; the zero is typed as uint32_t so both operands of the comparison are unsigned, matching the negative_nan test */ 241 | EXPECT_EQ(nan_f32 & UINT32_C(0x80000000), UINT32_C(0)) << 242 | std::hex << std::uppercase << std::setfill('0') << 243 | "F16 = 0x" << std::setw(4) << nan_f16 << ", " << 244 | "F32(F16) = 0x" << std::setw(8) << nan_f32; 245 | 246 | /* Check exponent: all eight F32 exponent bits must be set */ 247 | EXPECT_EQ(nan_f32 & UINT32_C(0x7F800000), UINT32_C(0x7F800000)) << 248 | std::hex << std::uppercase << std::setfill('0') << 249 | "F16 = 0x" << std::setw(4) << nan_f16 << ", " << 250 | "F32(F16) = 0x" << std::setw(8) << nan_f32; 251 | 252 | /* Check mantissa: must be non-zero, otherwise the pattern would encode infinity */ 253 | EXPECT_NE(nan_f32 & UINT32_C(0x007FFFFF), UINT32_C(0)) << 254 | std::hex << std::uppercase << std::setfill('0') << 255 | "F16 = 0x" << std::setw(4) << nan_f16 << ", " << 256 | "F32(F16) = 0x" << std::setw(8) << nan_f32; 
257 | } 258 | } 259 | 260 | TEST(FP16_IEEE_TO_FP32_BITS, negative_nan) { 261 | for (uint16_t m = UINT16_C(1); m < UINT16_C(0x0400); m++) { 262 | const uint16_t nan_f16 = UINT16_C(0xFC00) | m; 263 | const uint32_t nan_f32 = fp16_ieee_to_fp32_bits(nan_f16); 264 | 265 | /* Check sign */ 266 | EXPECT_NE(nan_f32 & UINT32_C(0x80000000), UINT32_C(0)) << 267 | std::hex << std::uppercase << std::setfill('0') << 268 | "F16 = 0x" << std::setw(4) << nan_f16 << ", " << 269 | "F32(F16) = 0x" << std::setw(8) << nan_f32; 270 | 271 | /* Check exponent */ 272 | EXPECT_EQ(nan_f32 & UINT32_C(0x7F800000), UINT32_C(0x7F800000)) << 273 | std::hex << std::uppercase << std::setfill('0') << 274 | "F16 = 0x" << std::setw(4) << nan_f16 << ", " << 275 | "F32(F16) = 0x" << std::setw(8) << nan_f32; 276 | 277 | /* Check mantissa */ 278 | EXPECT_NE(nan_f32 & UINT32_C(0x007FFFFF), UINT32_C(0)) << 279 | std::hex << std::uppercase << std::setfill('0') << 280 | "F16 = 0x" << std::setw(4) << nan_f16 << ", " << 281 | "F32(F16) = 0x" << std::setw(8) << nan_f32; 282 | } 283 | } 284 | 285 | TEST(FP16_IEEE_TO_FP32_BITS, positive_normalized_values) { 286 | const uint32_t exponentBias = 15; 287 | for (int32_t e = -14; e <= 15; e++) { 288 | for (uint16_t h = 0; h < 0x0400; h++) { 289 | const uint16_t fp16 = h + ((uint16_t) (e + exponentBias) << 10); 290 | const uint32_t fp32 = fp16::normalizedValues[h] + ((uint32_t) e << 23); 291 | EXPECT_EQ(fp32, fp16_ieee_to_fp32_bits(fp16)) << 292 | std::hex << std::uppercase << std::setfill('0') << 293 | "F16 = 0x" << std::setw(4) << fp16 << ", " << 294 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(fp16) << ", " << 295 | "F32 = 0x" << std::setw(8) << fp32; 296 | } 297 | } 298 | } 299 | 300 | TEST(FP16_IEEE_TO_FP32_BITS, negative_normalized_values) { 301 | const uint32_t exponentBias = 15; 302 | for (int32_t e = -14; e <= 15; e++) { 303 | for (uint16_t h = 0; h < 0x0400; h++) { 304 | const uint16_t fp16 = (h + ((uint16_t) (e + exponentBias) << 10)) ^ 
UINT16_C(0x8000); 305 | const uint32_t fp32 = (fp16::normalizedValues[h] + ((uint32_t) e << 23)) ^ UINT32_C(0x80000000); 306 | EXPECT_EQ(fp32, fp16_ieee_to_fp32_bits(fp16)) << 307 | std::hex << std::uppercase << std::setfill('0') << 308 | "F16 = 0x" << std::setw(4) << fp16 << ", " << 309 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(fp16) << ", " << 310 | "F32 = 0x" << std::setw(8) << fp32; 311 | } 312 | } 313 | } 314 | 315 | TEST(FP16_IEEE_TO_FP32_BITS, positive_denormalized_values) { 316 | for (uint16_t h = 0; h < 0x0400; h++) { 317 | EXPECT_EQ(fp16::denormalizedValues[h], fp16_ieee_to_fp32_bits(h)) << 318 | std::hex << std::uppercase << std::setfill('0') << 319 | "F16 = 0x" << std::setw(4) << h << ", " << 320 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(h) << ", " << 321 | "F32 = 0x" << std::setw(8) << fp16::denormalizedValues[h]; 322 | } 323 | } 324 | 325 | TEST(FP16_IEEE_TO_FP32_BITS, negative_denormalized_values) { 326 | for (uint16_t h = 0; h < 0x0400; h++) { 327 | const uint16_t fp16 = h ^ UINT16_C(0x8000); 328 | const uint32_t fp32 = fp16::denormalizedValues[h] ^ UINT32_C(0x80000000); 329 | EXPECT_EQ(fp32, fp16_ieee_to_fp32_bits(fp16)) << 330 | std::hex << std::uppercase << std::setfill('0') << 331 | "F16 = 0x" << std::setw(4) << fp16 << ", " << 332 | "F32(F16) = 0x" << std::setw(8) << fp16_ieee_to_fp32_bits(fp16) << ", " << 333 | "F32 = 0x" << std::setw(8) << fp32; 334 | } 335 | } 336 | -------------------------------------------------------------------------------- /test/alt-to-fp32-value.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | 12 | TEST(FP16_ALT_TO_FP32_VALUE, normalized_powers_of_2) { 13 | const uint16_t min_po2_f16 = UINT16_C(0x0400); 14 | const uint16_t eighths_f16 = UINT16_C(0x3000); 15 | const uint16_t quarter_f16 = UINT16_C(0x3400); 16 | const uint16_t half_f16 = 
UINT16_C(0x3800); 17 | const uint16_t one_f16 = UINT16_C(0x3C00); 18 | const uint16_t two_f16 = UINT16_C(0x4000); 19 | const uint16_t four_f16 = UINT16_C(0x4400); 20 | const uint16_t eight_f16 = UINT16_C(0x4800); 21 | const uint16_t sixteen_f16 = UINT16_C(0x4C00); 22 | const uint16_t thirtytwo_f16 = UINT16_C(0x5000); 23 | const uint16_t sixtyfour_f16 = UINT16_C(0x5400); 24 | const uint16_t max_po2_f16 = UINT16_C(0x7C00); 25 | 26 | const uint32_t min_po2_f32 = UINT32_C(0x38800000); 27 | const uint32_t eighths_f32 = UINT32_C(0x3E000000); 28 | const uint32_t quarter_f32 = UINT32_C(0x3E800000); 29 | const uint32_t half_f32 = UINT32_C(0x3F000000); 30 | const uint32_t one_f32 = UINT32_C(0x3F800000); 31 | const uint32_t two_f32 = UINT32_C(0x40000000); 32 | const uint32_t four_f32 = UINT32_C(0x40800000); 33 | const uint32_t eight_f32 = UINT32_C(0x41000000); 34 | const uint32_t sixteen_f32 = UINT32_C(0x41800000); 35 | const uint32_t thirtytwo_f32 = UINT32_C(0x42000000); 36 | const uint32_t sixtyfour_f32 = UINT32_C(0x42800000); 37 | const uint32_t max_po2_f32 = UINT32_C(0x47800000); 38 | 39 | const float min_po2_value = fp16_alt_to_fp32_value(min_po2_f16); 40 | uint32_t min_po2_bits; 41 | memcpy(&min_po2_bits, &min_po2_value, sizeof(min_po2_bits)); 42 | EXPECT_EQ(min_po2_f32, min_po2_bits) << 43 | std::hex << std::uppercase << std::setfill('0') << 44 | "F16 = 0x" << std::setw(4) << min_po2_f16 << ", " << 45 | "F32(F16) = 0x" << std::setw(8) << min_po2_bits << ", " << 46 | "F32 = 0x" << std::setw(8) << min_po2_f32; 47 | 48 | const float eighths_value = fp16_alt_to_fp32_value(eighths_f16); 49 | uint32_t eighths_bits; 50 | memcpy(&eighths_bits, &eighths_value, sizeof(eighths_bits)); 51 | EXPECT_EQ(eighths_f32, eighths_bits) << 52 | std::hex << std::uppercase << std::setfill('0') << 53 | "F16 = 0x" << std::setw(4) << eighths_f16 << ", " << 54 | "F32(F16) = 0x" << std::setw(8) << eighths_bits << ", " << 55 | "F32 = 0x" << std::setw(8) << eighths_f32; 56 | 57 | const float 
quarter_value = fp16_alt_to_fp32_value(quarter_f16); 58 | uint32_t quarter_bits; 59 | memcpy(&quarter_bits, &quarter_value, sizeof(quarter_bits)); 60 | EXPECT_EQ(quarter_f32, quarter_bits) << 61 | std::hex << std::uppercase << std::setfill('0') << 62 | "F16 = 0x" << std::setw(4) << quarter_f16 << ", " << 63 | "F32(F16) = 0x" << std::setw(8) << quarter_bits << ", " << 64 | "F32 = 0x" << std::setw(8) << quarter_f32; 65 | 66 | const float half_value = fp16_alt_to_fp32_value(half_f16); 67 | uint32_t half_bits; 68 | memcpy(&half_bits, &half_value, sizeof(half_bits)); 69 | EXPECT_EQ(half_f32, half_bits) << 70 | std::hex << std::uppercase << std::setfill('0') << 71 | "F16 = 0x" << std::setw(4) << half_f16 << ", " << 72 | "F32(F16) = 0x" << std::setw(8) << half_bits << ", " << 73 | "F32 = 0x" << std::setw(8) << half_f32; 74 | 75 | const float one_value = fp16_alt_to_fp32_value(one_f16); 76 | uint32_t one_bits; 77 | memcpy(&one_bits, &one_value, sizeof(one_bits)); 78 | EXPECT_EQ(one_f32, one_bits) << 79 | std::hex << std::uppercase << std::setfill('0') << 80 | "F16 = 0x" << std::setw(4) << one_f16 << ", " << 81 | "F32(F16) = 0x" << std::setw(8) << one_bits << ", " << 82 | "F32 = 0x" << std::setw(8) << one_f32; 83 | 84 | const float two_value = fp16_alt_to_fp32_value(two_f16); 85 | uint32_t two_bits; 86 | memcpy(&two_bits, &two_value, sizeof(two_bits)); 87 | EXPECT_EQ(two_f32, two_bits) << 88 | std::hex << std::uppercase << std::setfill('0') << 89 | "F16 = 0x" << std::setw(4) << two_f16 << ", " << 90 | "F32(F16) = 0x" << std::setw(8) << two_bits << ", " << 91 | "F32 = 0x" << std::setw(8) << two_f32; 92 | 93 | const float four_value = fp16_alt_to_fp32_value(four_f16); 94 | uint32_t four_bits; 95 | memcpy(&four_bits, &four_value, sizeof(four_bits)); 96 | EXPECT_EQ(four_f32, four_bits) << 97 | std::hex << std::uppercase << std::setfill('0') << 98 | "F16 = 0x" << std::setw(4) << four_f16 << ", " << 99 | "F32(F16) = 0x" << std::setw(8) << four_bits << ", " << 100 | "F32 = 0x" << 
std::setw(8) << four_f32; 101 | 102 | const float eight_value = fp16_alt_to_fp32_value(eight_f16); 103 | uint32_t eight_bits; 104 | memcpy(&eight_bits, &eight_value, sizeof(eight_bits)); 105 | EXPECT_EQ(eight_f32, eight_bits) << 106 | std::hex << std::uppercase << std::setfill('0') << 107 | "F16 = 0x" << std::setw(4) << eight_f16 << ", " << 108 | "F32(F16) = 0x" << std::setw(8) << eight_bits << ", " << 109 | "F32 = 0x" << std::setw(8) << eight_f32; 110 | 111 | const float sixteen_value = fp16_alt_to_fp32_value(sixteen_f16); 112 | uint32_t sixteen_bits; 113 | memcpy(&sixteen_bits, &sixteen_value, sizeof(sixteen_bits)); 114 | EXPECT_EQ(sixteen_f32, sixteen_bits) << 115 | std::hex << std::uppercase << std::setfill('0') << 116 | "F16 = 0x" << std::setw(4) << sixteen_f16 << ", " << 117 | "F32(F16) = 0x" << std::setw(8) << sixteen_bits << ", " << 118 | "F32 = 0x" << std::setw(8) << sixteen_f32; 119 | 120 | const float thirtytwo_value = fp16_alt_to_fp32_value(thirtytwo_f16); 121 | uint32_t thirtytwo_bits; 122 | memcpy(&thirtytwo_bits, &thirtytwo_value, sizeof(thirtytwo_bits)); 123 | EXPECT_EQ(thirtytwo_f32, thirtytwo_bits) << 124 | std::hex << std::uppercase << std::setfill('0') << 125 | "F16 = 0x" << std::setw(4) << thirtytwo_f16 << ", " << 126 | "F32(F16) = 0x" << std::setw(8) << thirtytwo_bits << ", " << 127 | "F32 = 0x" << std::setw(8) << thirtytwo_f32; 128 | 129 | const float sixtyfour_value = fp16_alt_to_fp32_value(sixtyfour_f16); 130 | uint32_t sixtyfour_bits; 131 | memcpy(&sixtyfour_bits, &sixtyfour_value, sizeof(sixtyfour_bits)); 132 | EXPECT_EQ(sixtyfour_f32, sixtyfour_bits) << 133 | std::hex << std::uppercase << std::setfill('0') << 134 | "F16 = 0x" << std::setw(4) << sixtyfour_f16 << ", " << 135 | "F32(F16) = 0x" << std::setw(8) << sixtyfour_bits << ", " << 136 | "F32 = 0x" << std::setw(8) << sixtyfour_f32; 137 | 138 | const float max_po2_value = fp16_alt_to_fp32_value(max_po2_f16); 139 | uint32_t max_po2_bits; 140 | memcpy(&max_po2_bits, &max_po2_value, 
sizeof(max_po2_bits)); 141 | EXPECT_EQ(max_po2_f32, max_po2_bits) << 142 | std::hex << std::uppercase << std::setfill('0') << 143 | "F16 = 0x" << std::setw(4) << max_po2_f16 << ", " << 144 | "F32(F16) = 0x" << std::setw(8) << max_po2_bits << ", " << 145 | "F32 = 0x" << std::setw(8) << max_po2_f32; 146 | } 147 | 148 | TEST(FP16_ALT_TO_FP32_VALUE, denormalized_powers_of_2) { 149 | const uint16_t exp2_minus_15_f16 = UINT16_C(0x0200); 150 | const uint16_t exp2_minus_16_f16 = UINT16_C(0x0100); 151 | const uint16_t exp2_minus_17_f16 = UINT16_C(0x0080); 152 | const uint16_t exp2_minus_18_f16 = UINT16_C(0x0040); 153 | const uint16_t exp2_minus_19_f16 = UINT16_C(0x0020); 154 | const uint16_t exp2_minus_20_f16 = UINT16_C(0x0010); 155 | const uint16_t exp2_minus_21_f16 = UINT16_C(0x0008); 156 | const uint16_t exp2_minus_22_f16 = UINT16_C(0x0004); 157 | const uint16_t exp2_minus_23_f16 = UINT16_C(0x0002); 158 | const uint16_t exp2_minus_24_f16 = UINT16_C(0x0001); 159 | 160 | const uint32_t exp2_minus_15_f32 = UINT32_C(0x38000000); 161 | const uint32_t exp2_minus_16_f32 = UINT32_C(0x37800000); 162 | const uint32_t exp2_minus_17_f32 = UINT32_C(0x37000000); 163 | const uint32_t exp2_minus_18_f32 = UINT32_C(0x36800000); 164 | const uint32_t exp2_minus_19_f32 = UINT32_C(0x36000000); 165 | const uint32_t exp2_minus_20_f32 = UINT32_C(0x35800000); 166 | const uint32_t exp2_minus_21_f32 = UINT32_C(0x35000000); 167 | const uint32_t exp2_minus_22_f32 = UINT32_C(0x34800000); 168 | const uint32_t exp2_minus_23_f32 = UINT32_C(0x34000000); 169 | const uint32_t exp2_minus_24_f32 = UINT32_C(0x33800000); 170 | 171 | const float exp2_minus_15_value = fp16_alt_to_fp32_value(exp2_minus_15_f16); 172 | uint32_t exp2_minus_15_bits; 173 | memcpy(&exp2_minus_15_bits, &exp2_minus_15_value, sizeof(exp2_minus_15_bits)); 174 | EXPECT_EQ(exp2_minus_15_f32, exp2_minus_15_bits) << 175 | std::hex << std::uppercase << std::setfill('0') << 176 | "F16 = 0x" << std::setw(4) << exp2_minus_15_f16 << ", " << 177 | 
"F32(F16) = 0x" << std::setw(8) << exp2_minus_15_bits << ", " << 178 | "F32 = 0x" << std::setw(8) << exp2_minus_15_f32; 179 | 180 | const float exp2_minus_16_value = fp16_alt_to_fp32_value(exp2_minus_16_f16); 181 | uint32_t exp2_minus_16_bits; 182 | memcpy(&exp2_minus_16_bits, &exp2_minus_16_value, sizeof(exp2_minus_16_bits)); 183 | EXPECT_EQ(exp2_minus_16_f32, exp2_minus_16_bits) << 184 | std::hex << std::uppercase << std::setfill('0') << 185 | "F16 = 0x" << std::setw(4) << exp2_minus_16_f16 << ", " << 186 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_16_bits << ", " << 187 | "F32 = 0x" << std::setw(8) << exp2_minus_16_f32; 188 | 189 | const float exp2_minus_17_value = fp16_alt_to_fp32_value(exp2_minus_17_f16); 190 | uint32_t exp2_minus_17_bits; 191 | memcpy(&exp2_minus_17_bits, &exp2_minus_17_value, sizeof(exp2_minus_17_bits)); 192 | EXPECT_EQ(exp2_minus_17_f32, exp2_minus_17_bits) << 193 | std::hex << std::uppercase << std::setfill('0') << 194 | "F16 = 0x" << std::setw(4) << exp2_minus_17_f16 << ", " << 195 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_17_bits << ", " << 196 | "F32 = 0x" << std::setw(8) << exp2_minus_17_f32; 197 | 198 | const float exp2_minus_18_value = fp16_alt_to_fp32_value(exp2_minus_18_f16); 199 | uint32_t exp2_minus_18_bits; 200 | memcpy(&exp2_minus_18_bits, &exp2_minus_18_value, sizeof(exp2_minus_18_bits)); 201 | EXPECT_EQ(exp2_minus_18_f32, exp2_minus_18_bits) << 202 | std::hex << std::uppercase << std::setfill('0') << 203 | "F16 = 0x" << std::setw(4) << exp2_minus_18_f16 << ", " << 204 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_18_bits << ", " << 205 | "F32 = 0x" << std::setw(8) << exp2_minus_18_f32; 206 | 207 | const float exp2_minus_19_value = fp16_alt_to_fp32_value(exp2_minus_19_f16); 208 | uint32_t exp2_minus_19_bits; 209 | memcpy(&exp2_minus_19_bits, &exp2_minus_19_value, sizeof(exp2_minus_19_bits)); 210 | EXPECT_EQ(exp2_minus_19_f32, exp2_minus_19_bits) << 211 | std::hex << std::uppercase << std::setfill('0') << 212 | "F16 
= 0x" << std::setw(4) << exp2_minus_19_f16 << ", " << 213 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_19_bits << ", " << 214 | "F32 = 0x" << std::setw(8) << exp2_minus_19_f32; 215 | 216 | const float exp2_minus_20_value = fp16_alt_to_fp32_value(exp2_minus_20_f16); 217 | uint32_t exp2_minus_20_bits; 218 | memcpy(&exp2_minus_20_bits, &exp2_minus_20_value, sizeof(exp2_minus_20_bits)); 219 | EXPECT_EQ(exp2_minus_20_f32, exp2_minus_20_bits) << 220 | std::hex << std::uppercase << std::setfill('0') << 221 | "F16 = 0x" << std::setw(4) << exp2_minus_20_f16 << ", " << 222 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_20_bits << ", " << 223 | "F32 = 0x" << std::setw(8) << exp2_minus_20_f32; 224 | 225 | const float exp2_minus_21_value = fp16_alt_to_fp32_value(exp2_minus_21_f16); 226 | uint32_t exp2_minus_21_bits; 227 | memcpy(&exp2_minus_21_bits, &exp2_minus_21_value, sizeof(exp2_minus_21_bits)); 228 | EXPECT_EQ(exp2_minus_21_f32, exp2_minus_21_bits) << 229 | std::hex << std::uppercase << std::setfill('0') << 230 | "F16 = 0x" << std::setw(4) << exp2_minus_21_f16 << ", " << 231 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_21_bits << ", " << 232 | "F32 = 0x" << std::setw(8) << exp2_minus_21_f32; 233 | 234 | const float exp2_minus_22_value = fp16_alt_to_fp32_value(exp2_minus_22_f16); 235 | uint32_t exp2_minus_22_bits; 236 | memcpy(&exp2_minus_22_bits, &exp2_minus_22_value, sizeof(exp2_minus_22_bits)); 237 | EXPECT_EQ(exp2_minus_22_f32, exp2_minus_22_bits) << 238 | std::hex << std::uppercase << std::setfill('0') << 239 | "F16 = 0x" << std::setw(4) << exp2_minus_22_f16 << ", " << 240 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_22_bits << ", " << 241 | "F32 = 0x" << std::setw(8) << exp2_minus_22_f32; 242 | 243 | const float exp2_minus_23_value = fp16_alt_to_fp32_value(exp2_minus_23_f16); 244 | uint32_t exp2_minus_23_bits; 245 | memcpy(&exp2_minus_23_bits, &exp2_minus_23_value, sizeof(exp2_minus_23_bits)); 246 | EXPECT_EQ(exp2_minus_23_f32, exp2_minus_23_bits) << 247 | 
std::hex << std::uppercase << std::setfill('0') << 248 | "F16 = 0x" << std::setw(4) << exp2_minus_23_f16 << ", " << 249 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_23_bits << ", " << 250 | "F32 = 0x" << std::setw(8) << exp2_minus_23_f32; 251 | 252 | const float exp2_minus_24_value = fp16_alt_to_fp32_value(exp2_minus_24_f16); 253 | uint32_t exp2_minus_24_bits; 254 | memcpy(&exp2_minus_24_bits, &exp2_minus_24_value, sizeof(exp2_minus_24_bits)); 255 | EXPECT_EQ(exp2_minus_24_f32, exp2_minus_24_bits) << 256 | std::hex << std::uppercase << std::setfill('0') << 257 | "F16 = 0x" << std::setw(4) << exp2_minus_24_f16 << ", " << 258 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_24_bits << ", " << 259 | "F32 = 0x" << std::setw(8) << exp2_minus_24_f32; 260 | } 261 | 262 | TEST(FP16_ALT_TO_FP32_VALUE, zero) { 263 | const uint16_t positive_zero_f16 = UINT16_C(0x0000); 264 | const uint16_t negative_zero_f16 = UINT16_C(0x8000); 265 | 266 | const uint32_t positive_zero_f32 = UINT32_C(0x00000000); 267 | const uint32_t negative_zero_f32 = UINT32_C(0x80000000); 268 | 269 | const float positive_zero_value = fp16_alt_to_fp32_value(positive_zero_f16); 270 | uint32_t positive_zero_bits; 271 | memcpy(&positive_zero_bits, &positive_zero_value, sizeof(positive_zero_bits)); 272 | EXPECT_EQ(positive_zero_f32, positive_zero_bits) << 273 | std::hex << std::uppercase << std::setfill('0') << 274 | "F16 = 0x" << std::setw(4) << positive_zero_f16 << ", " << 275 | "F32(F16) = 0x" << std::setw(8) << positive_zero_bits << ", " << 276 | "F32 = 0x" << std::setw(8) << positive_zero_f32; 277 | 278 | const float negative_zero_value = fp16_alt_to_fp32_value(negative_zero_f16); 279 | uint32_t negative_zero_bits; 280 | memcpy(&negative_zero_bits, &negative_zero_value, sizeof(negative_zero_bits)); 281 | EXPECT_EQ(negative_zero_f32, negative_zero_bits) << 282 | std::hex << std::uppercase << std::setfill('0') << 283 | "F16 = 0x" << std::setw(4) << negative_zero_f16 << ", " << 284 | "F32(F16) = 0x" << 
std::setw(8) << negative_zero_bits << ", " << 285 | "F32 = 0x" << std::setw(8) << negative_zero_f32; 286 | } 287 | 288 | TEST(FP16_ALT_TO_FP32_VALUE, positive_normalized_values) { 289 | const uint32_t exponentBias = 15; 290 | for (int32_t e = -14; e <= 16; e++) { 291 | for (uint16_t h = 0; h < 0x0400; h++) { 292 | const uint16_t fp16 = h + ((uint16_t) (e + exponentBias) << 10); 293 | const uint32_t fp32 = fp16::normalizedValues[h] + ((uint32_t) e << 23); 294 | const float value = fp16_alt_to_fp32_value(fp16); 295 | uint32_t bits; 296 | memcpy(&bits, &value, sizeof(bits)); 297 | EXPECT_EQ(fp32, bits) << 298 | std::hex << std::uppercase << std::setfill('0') << 299 | "F16 = 0x" << std::setw(4) << fp16 << ", " << 300 | "F32(F16) = 0x" << std::setw(8) << bits << ", " << 301 | "F32 = 0x" << std::setw(8) << fp32; 302 | } 303 | } 304 | } 305 | 306 | TEST(FP16_ALT_TO_FP32_VALUE, negative_normalized_values) { 307 | const uint32_t exponentBias = 15; 308 | for (int32_t e = -14; e <= 16; e++) { 309 | for (uint16_t h = 0; h < 0x0400; h++) { 310 | const uint16_t fp16 = (h + ((uint16_t) (e + exponentBias) << 10)) ^ UINT16_C(0x8000); 311 | const uint32_t fp32 = (fp16::normalizedValues[h] + ((uint32_t) e << 23)) ^ UINT32_C(0x80000000); 312 | const float value = fp16_alt_to_fp32_value(fp16); 313 | uint32_t bits; 314 | memcpy(&bits, &value, sizeof(bits)); 315 | EXPECT_EQ(fp32, bits) << 316 | std::hex << std::uppercase << std::setfill('0') << 317 | "F16 = 0x" << std::setw(4) << fp16 << ", " << 318 | "F32(F16) = 0x" << std::setw(8) << bits << ", " << 319 | "F32 = 0x" << std::setw(8) << fp32; 320 | } 321 | } 322 | } 323 | 324 | TEST(FP16_ALT_TO_FP32_VALUE, positive_denormalized_values) { 325 | for (uint16_t h = 0; h < 0x0400; h++) { 326 | const float value = fp16_alt_to_fp32_value(h); 327 | uint32_t bits; 328 | memcpy(&bits, &value, sizeof(bits)); 329 | EXPECT_EQ(fp16::denormalizedValues[h], bits) << 330 | std::hex << std::uppercase << std::setfill('0') << 331 | "F16 = 0x" << 
std::setw(4) << h << ", " << 332 | "F32(F16) = 0x" << std::setw(8) << bits << ", " << 333 | "F32 = 0x" << std::setw(8) << fp16::denormalizedValues[h]; 334 | } 335 | } 336 | 337 | TEST(FP16_ALT_TO_FP32_VALUE, negative_denormalized_values) { 338 | for (uint16_t h = 0; h < 0x0400; h++) { 339 | const uint16_t fp16 = h ^ UINT16_C(0x8000); 340 | const uint32_t fp32 = fp16::denormalizedValues[h] ^ UINT32_C(0x80000000); 341 | const float value = fp16_alt_to_fp32_value(fp16); 342 | uint32_t bits; 343 | memcpy(&bits, &value, sizeof(bits)); 344 | EXPECT_EQ(fp32, bits) << 345 | std::hex << std::uppercase << std::setfill('0') << 346 | "F16 = 0x" << std::setw(4) << fp16 << ", " << 347 | "F32(F16) = 0x" << std::setw(8) << bits << ", " << 348 | "F32 = 0x" << std::setw(8) << fp32; 349 | } 350 | } 351 | -------------------------------------------------------------------------------- /test/ieee-to-fp32-value.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | 12 | TEST(FP16_IEEE_TO_FP32_VALUE, normalized_powers_of_2) { 13 | const uint16_t min_po2_f16 = UINT16_C(0x0400); 14 | const uint16_t eighths_f16 = UINT16_C(0x3000); 15 | const uint16_t quarter_f16 = UINT16_C(0x3400); 16 | const uint16_t half_f16 = UINT16_C(0x3800); 17 | const uint16_t one_f16 = UINT16_C(0x3C00); 18 | const uint16_t two_f16 = UINT16_C(0x4000); 19 | const uint16_t four_f16 = UINT16_C(0x4400); 20 | const uint16_t eight_f16 = UINT16_C(0x4800); 21 | const uint16_t sixteen_f16 = UINT16_C(0x4C00); 22 | const uint16_t thirtytwo_f16 = UINT16_C(0x5000); 23 | const uint16_t sixtyfour_f16 = UINT16_C(0x5400); 24 | const uint16_t max_po2_f16 = UINT16_C(0x7800); 25 | 26 | const uint32_t min_po2_f32 = UINT32_C(0x38800000); 27 | const uint32_t eighths_f32 = UINT32_C(0x3E000000); 28 | const uint32_t quarter_f32 = UINT32_C(0x3E800000); 29 | const uint32_t half_f32 = UINT32_C(0x3F000000); 30 
| const uint32_t one_f32 = UINT32_C(0x3F800000); 31 | const uint32_t two_f32 = UINT32_C(0x40000000); 32 | const uint32_t four_f32 = UINT32_C(0x40800000); 33 | const uint32_t eight_f32 = UINT32_C(0x41000000); 34 | const uint32_t sixteen_f32 = UINT32_C(0x41800000); 35 | const uint32_t thirtytwo_f32 = UINT32_C(0x42000000); 36 | const uint32_t sixtyfour_f32 = UINT32_C(0x42800000); 37 | const uint32_t max_po2_f32 = UINT32_C(0x47000000); 38 | 39 | const float min_po2_value = fp16_ieee_to_fp32_value(min_po2_f16); 40 | uint32_t min_po2_bits; 41 | memcpy(&min_po2_bits, &min_po2_value, sizeof(min_po2_bits)); 42 | EXPECT_EQ(min_po2_f32, min_po2_bits) << 43 | std::hex << std::uppercase << std::setfill('0') << 44 | "F16 = 0x" << std::setw(4) << min_po2_f16 << ", " << 45 | "F32(F16) = 0x" << std::setw(8) << min_po2_bits << ", " << 46 | "F32 = 0x" << std::setw(8) << min_po2_f32; 47 | 48 | const float eighths_value = fp16_ieee_to_fp32_value(eighths_f16); 49 | uint32_t eighths_bits; 50 | memcpy(&eighths_bits, &eighths_value, sizeof(eighths_bits)); 51 | EXPECT_EQ(eighths_f32, eighths_bits) << 52 | std::hex << std::uppercase << std::setfill('0') << 53 | "F16 = 0x" << std::setw(4) << eighths_f16 << ", " << 54 | "F32(F16) = 0x" << std::setw(8) << eighths_bits << ", " << 55 | "F32 = 0x" << std::setw(8) << eighths_f32; 56 | 57 | const float quarter_value = fp16_ieee_to_fp32_value(quarter_f16); 58 | uint32_t quarter_bits; 59 | memcpy(&quarter_bits, &quarter_value, sizeof(quarter_bits)); 60 | EXPECT_EQ(quarter_f32, quarter_bits) << 61 | std::hex << std::uppercase << std::setfill('0') << 62 | "F16 = 0x" << std::setw(4) << quarter_f16 << ", " << 63 | "F32(F16) = 0x" << std::setw(8) << quarter_bits << ", " << 64 | "F32 = 0x" << std::setw(8) << quarter_f32; 65 | 66 | const float half_value = fp16_ieee_to_fp32_value(half_f16); 67 | uint32_t half_bits; 68 | memcpy(&half_bits, &half_value, sizeof(half_bits)); 69 | EXPECT_EQ(half_f32, half_bits) << 70 | std::hex << std::uppercase << 
std::setfill('0') << 71 | "F16 = 0x" << std::setw(4) << half_f16 << ", " << 72 | "F32(F16) = 0x" << std::setw(8) << half_bits << ", " << 73 | "F32 = 0x" << std::setw(8) << half_f32; 74 | 75 | const float one_value = fp16_ieee_to_fp32_value(one_f16); 76 | uint32_t one_bits; 77 | memcpy(&one_bits, &one_value, sizeof(one_bits)); 78 | EXPECT_EQ(one_f32, one_bits) << 79 | std::hex << std::uppercase << std::setfill('0') << 80 | "F16 = 0x" << std::setw(4) << one_f16 << ", " << 81 | "F32(F16) = 0x" << std::setw(8) << one_bits << ", " << 82 | "F32 = 0x" << std::setw(8) << one_f32; 83 | 84 | const float two_value = fp16_ieee_to_fp32_value(two_f16); 85 | uint32_t two_bits; 86 | memcpy(&two_bits, &two_value, sizeof(two_bits)); 87 | EXPECT_EQ(two_f32, two_bits) << 88 | std::hex << std::uppercase << std::setfill('0') << 89 | "F16 = 0x" << std::setw(4) << two_f16 << ", " << 90 | "F32(F16) = 0x" << std::setw(8) << two_bits << ", " << 91 | "F32 = 0x" << std::setw(8) << two_f32; 92 | 93 | const float four_value = fp16_ieee_to_fp32_value(four_f16); 94 | uint32_t four_bits; 95 | memcpy(&four_bits, &four_value, sizeof(four_bits)); 96 | EXPECT_EQ(four_f32, four_bits) << 97 | std::hex << std::uppercase << std::setfill('0') << 98 | "F16 = 0x" << std::setw(4) << four_f16 << ", " << 99 | "F32(F16) = 0x" << std::setw(8) << four_bits << ", " << 100 | "F32 = 0x" << std::setw(8) << four_f32; 101 | 102 | const float eight_value = fp16_ieee_to_fp32_value(eight_f16); 103 | uint32_t eight_bits; 104 | memcpy(&eight_bits, &eight_value, sizeof(eight_bits)); 105 | EXPECT_EQ(eight_f32, eight_bits) << 106 | std::hex << std::uppercase << std::setfill('0') << 107 | "F16 = 0x" << std::setw(4) << eight_f16 << ", " << 108 | "F32(F16) = 0x" << std::setw(8) << eight_bits << ", " << 109 | "F32 = 0x" << std::setw(8) << eight_f32; 110 | 111 | const float sixteen_value = fp16_ieee_to_fp32_value(sixteen_f16); 112 | uint32_t sixteen_bits; 113 | memcpy(&sixteen_bits, &sixteen_value, sizeof(sixteen_bits)); 114 | 
EXPECT_EQ(sixteen_f32, sixteen_bits) << 115 | std::hex << std::uppercase << std::setfill('0') << 116 | "F16 = 0x" << std::setw(4) << sixteen_f16 << ", " << 117 | "F32(F16) = 0x" << std::setw(8) << sixteen_bits << ", " << 118 | "F32 = 0x" << std::setw(8) << sixteen_f32; 119 | 120 | const float thirtytwo_value = fp16_ieee_to_fp32_value(thirtytwo_f16); 121 | uint32_t thirtytwo_bits; 122 | memcpy(&thirtytwo_bits, &thirtytwo_value, sizeof(thirtytwo_bits)); 123 | EXPECT_EQ(thirtytwo_f32, thirtytwo_bits) << 124 | std::hex << std::uppercase << std::setfill('0') << 125 | "F16 = 0x" << std::setw(4) << thirtytwo_f16 << ", " << 126 | "F32(F16) = 0x" << std::setw(8) << thirtytwo_bits << ", " << 127 | "F32 = 0x" << std::setw(8) << thirtytwo_f32; 128 | 129 | const float sixtyfour_value = fp16_ieee_to_fp32_value(sixtyfour_f16); 130 | uint32_t sixtyfour_bits; 131 | memcpy(&sixtyfour_bits, &sixtyfour_value, sizeof(sixtyfour_bits)); 132 | EXPECT_EQ(sixtyfour_f32, sixtyfour_bits) << 133 | std::hex << std::uppercase << std::setfill('0') << 134 | "F16 = 0x" << std::setw(4) << sixtyfour_f16 << ", " << 135 | "F32(F16) = 0x" << std::setw(8) << sixtyfour_bits << ", " << 136 | "F32 = 0x" << std::setw(8) << sixtyfour_f32; 137 | 138 | const float max_po2_value = fp16_ieee_to_fp32_value(max_po2_f16); 139 | uint32_t max_po2_bits; 140 | memcpy(&max_po2_bits, &max_po2_value, sizeof(max_po2_bits)); 141 | EXPECT_EQ(max_po2_f32, max_po2_bits) << 142 | std::hex << std::uppercase << std::setfill('0') << 143 | "F16 = 0x" << std::setw(4) << max_po2_f16 << ", " << 144 | "F32(F16) = 0x" << std::setw(8) << max_po2_bits << ", " << 145 | "F32 = 0x" << std::setw(8) << max_po2_f32; 146 | } 147 | 148 | TEST(FP16_IEEE_TO_FP32_VALUE, denormalized_powers_of_2) { 149 | const uint16_t exp2_minus_15_f16 = UINT16_C(0x0200); 150 | const uint16_t exp2_minus_16_f16 = UINT16_C(0x0100); 151 | const uint16_t exp2_minus_17_f16 = UINT16_C(0x0080); 152 | const uint16_t exp2_minus_18_f16 = UINT16_C(0x0040); 153 | const uint16_t 
exp2_minus_19_f16 = UINT16_C(0x0020); 154 | const uint16_t exp2_minus_20_f16 = UINT16_C(0x0010); 155 | const uint16_t exp2_minus_21_f16 = UINT16_C(0x0008); 156 | const uint16_t exp2_minus_22_f16 = UINT16_C(0x0004); 157 | const uint16_t exp2_minus_23_f16 = UINT16_C(0x0002); 158 | const uint16_t exp2_minus_24_f16 = UINT16_C(0x0001); 159 | 160 | const uint32_t exp2_minus_15_f32 = UINT32_C(0x38000000); 161 | const uint32_t exp2_minus_16_f32 = UINT32_C(0x37800000); 162 | const uint32_t exp2_minus_17_f32 = UINT32_C(0x37000000); 163 | const uint32_t exp2_minus_18_f32 = UINT32_C(0x36800000); 164 | const uint32_t exp2_minus_19_f32 = UINT32_C(0x36000000); 165 | const uint32_t exp2_minus_20_f32 = UINT32_C(0x35800000); 166 | const uint32_t exp2_minus_21_f32 = UINT32_C(0x35000000); 167 | const uint32_t exp2_minus_22_f32 = UINT32_C(0x34800000); 168 | const uint32_t exp2_minus_23_f32 = UINT32_C(0x34000000); 169 | const uint32_t exp2_minus_24_f32 = UINT32_C(0x33800000); 170 | 171 | const float exp2_minus_15_value = fp16_ieee_to_fp32_value(exp2_minus_15_f16); 172 | uint32_t exp2_minus_15_bits; 173 | memcpy(&exp2_minus_15_bits, &exp2_minus_15_value, sizeof(exp2_minus_15_bits)); 174 | EXPECT_EQ(exp2_minus_15_f32, exp2_minus_15_bits) << 175 | std::hex << std::uppercase << std::setfill('0') << 176 | "F16 = 0x" << std::setw(4) << exp2_minus_15_f16 << ", " << 177 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_15_bits << ", " << 178 | "F32 = 0x" << std::setw(8) << exp2_minus_15_f32; 179 | 180 | const float exp2_minus_16_value = fp16_ieee_to_fp32_value(exp2_minus_16_f16); 181 | uint32_t exp2_minus_16_bits; 182 | memcpy(&exp2_minus_16_bits, &exp2_minus_16_value, sizeof(exp2_minus_16_bits)); 183 | EXPECT_EQ(exp2_minus_16_f32, exp2_minus_16_bits) << 184 | std::hex << std::uppercase << std::setfill('0') << 185 | "F16 = 0x" << std::setw(4) << exp2_minus_16_f16 << ", " << 186 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_16_bits << ", " << 187 | "F32 = 0x" << std::setw(8) << 
exp2_minus_16_f32; 188 | 189 | const float exp2_minus_17_value = fp16_ieee_to_fp32_value(exp2_minus_17_f16); 190 | uint32_t exp2_minus_17_bits; 191 | memcpy(&exp2_minus_17_bits, &exp2_minus_17_value, sizeof(exp2_minus_17_bits)); 192 | EXPECT_EQ(exp2_minus_17_f32, exp2_minus_17_bits) << 193 | std::hex << std::uppercase << std::setfill('0') << 194 | "F16 = 0x" << std::setw(4) << exp2_minus_17_f16 << ", " << 195 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_17_bits << ", " << 196 | "F32 = 0x" << std::setw(8) << exp2_minus_17_f32; 197 | 198 | const float exp2_minus_18_value = fp16_ieee_to_fp32_value(exp2_minus_18_f16); 199 | uint32_t exp2_minus_18_bits; 200 | memcpy(&exp2_minus_18_bits, &exp2_minus_18_value, sizeof(exp2_minus_18_bits)); 201 | EXPECT_EQ(exp2_minus_18_f32, exp2_minus_18_bits) << 202 | std::hex << std::uppercase << std::setfill('0') << 203 | "F16 = 0x" << std::setw(4) << exp2_minus_18_f16 << ", " << 204 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_18_bits << ", " << 205 | "F32 = 0x" << std::setw(8) << exp2_minus_18_f32; 206 | 207 | const float exp2_minus_19_value = fp16_ieee_to_fp32_value(exp2_minus_19_f16); 208 | uint32_t exp2_minus_19_bits; 209 | memcpy(&exp2_minus_19_bits, &exp2_minus_19_value, sizeof(exp2_minus_19_bits)); 210 | EXPECT_EQ(exp2_minus_19_f32, exp2_minus_19_bits) << 211 | std::hex << std::uppercase << std::setfill('0') << 212 | "F16 = 0x" << std::setw(4) << exp2_minus_19_f16 << ", " << 213 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_19_bits << ", " << 214 | "F32 = 0x" << std::setw(8) << exp2_minus_19_f32; 215 | 216 | const float exp2_minus_20_value = fp16_ieee_to_fp32_value(exp2_minus_20_f16); 217 | uint32_t exp2_minus_20_bits; 218 | memcpy(&exp2_minus_20_bits, &exp2_minus_20_value, sizeof(exp2_minus_20_bits)); 219 | EXPECT_EQ(exp2_minus_20_f32, exp2_minus_20_bits) << 220 | std::hex << std::uppercase << std::setfill('0') << 221 | "F16 = 0x" << std::setw(4) << exp2_minus_20_f16 << ", " << 222 | "F32(F16) = 0x" << std::setw(8) << 
exp2_minus_20_bits << ", " << 223 | "F32 = 0x" << std::setw(8) << exp2_minus_20_f32; 224 | 225 | const float exp2_minus_21_value = fp16_ieee_to_fp32_value(exp2_minus_21_f16); 226 | uint32_t exp2_minus_21_bits; 227 | memcpy(&exp2_minus_21_bits, &exp2_minus_21_value, sizeof(exp2_minus_21_bits)); 228 | EXPECT_EQ(exp2_minus_21_f32, exp2_minus_21_bits) << 229 | std::hex << std::uppercase << std::setfill('0') << 230 | "F16 = 0x" << std::setw(4) << exp2_minus_21_f16 << ", " << 231 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_21_bits << ", " << 232 | "F32 = 0x" << std::setw(8) << exp2_minus_21_f32; 233 | 234 | const float exp2_minus_22_value = fp16_ieee_to_fp32_value(exp2_minus_22_f16); 235 | uint32_t exp2_minus_22_bits; 236 | memcpy(&exp2_minus_22_bits, &exp2_minus_22_value, sizeof(exp2_minus_22_bits)); 237 | EXPECT_EQ(exp2_minus_22_f32, exp2_minus_22_bits) << 238 | std::hex << std::uppercase << std::setfill('0') << 239 | "F16 = 0x" << std::setw(4) << exp2_minus_22_f16 << ", " << 240 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_22_bits << ", " << 241 | "F32 = 0x" << std::setw(8) << exp2_minus_22_f32; 242 | 243 | const float exp2_minus_23_value = fp16_ieee_to_fp32_value(exp2_minus_23_f16); 244 | uint32_t exp2_minus_23_bits; 245 | memcpy(&exp2_minus_23_bits, &exp2_minus_23_value, sizeof(exp2_minus_23_bits)); 246 | EXPECT_EQ(exp2_minus_23_f32, exp2_minus_23_bits) << 247 | std::hex << std::uppercase << std::setfill('0') << 248 | "F16 = 0x" << std::setw(4) << exp2_minus_23_f16 << ", " << 249 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_23_bits << ", " << 250 | "F32 = 0x" << std::setw(8) << exp2_minus_23_f32; 251 | 252 | const float exp2_minus_24_value = fp16_ieee_to_fp32_value(exp2_minus_24_f16); 253 | uint32_t exp2_minus_24_bits; 254 | memcpy(&exp2_minus_24_bits, &exp2_minus_24_value, sizeof(exp2_minus_24_bits)); 255 | EXPECT_EQ(exp2_minus_24_f32, exp2_minus_24_bits) << 256 | std::hex << std::uppercase << std::setfill('0') << 257 | "F16 = 0x" << std::setw(4) << 
exp2_minus_24_f16 << ", " << 258 | "F32(F16) = 0x" << std::setw(8) << exp2_minus_24_bits << ", " << 259 | "F32 = 0x" << std::setw(8) << exp2_minus_24_f32; 260 | } 261 | 262 | TEST(FP16_IEEE_TO_FP32_VALUE, zero) { 263 | const uint16_t positive_zero_f16 = UINT16_C(0x0000); 264 | const uint16_t negative_zero_f16 = UINT16_C(0x8000); 265 | 266 | const uint32_t positive_zero_f32 = UINT32_C(0x00000000); 267 | const uint32_t negative_zero_f32 = UINT32_C(0x80000000); 268 | 269 | const float positive_zero_value = fp16_ieee_to_fp32_value(positive_zero_f16); 270 | uint32_t positive_zero_bits; 271 | memcpy(&positive_zero_bits, &positive_zero_value, sizeof(positive_zero_bits)); 272 | EXPECT_EQ(positive_zero_f32, positive_zero_bits) << 273 | std::hex << std::uppercase << std::setfill('0') << 274 | "F16 = 0x" << std::setw(4) << positive_zero_f16 << ", " << 275 | "F32(F16) = 0x" << std::setw(8) << positive_zero_bits << ", " << 276 | "F32 = 0x" << std::setw(8) << positive_zero_f32; 277 | 278 | const float negative_zero_value = fp16_ieee_to_fp32_value(negative_zero_f16); 279 | uint32_t negative_zero_bits; 280 | memcpy(&negative_zero_bits, &negative_zero_value, sizeof(negative_zero_bits)); 281 | EXPECT_EQ(negative_zero_f32, negative_zero_bits) << 282 | std::hex << std::uppercase << std::setfill('0') << 283 | "F16 = 0x" << std::setw(4) << negative_zero_f16 << ", " << 284 | "F32(F16) = 0x" << std::setw(8) << negative_zero_bits << ", " << 285 | "F32 = 0x" << std::setw(8) << negative_zero_f32; 286 | } 287 | 288 | TEST(FP16_IEEE_TO_FP32_VALUE, infinity) { 289 | const uint16_t positive_infinity_f16 = UINT16_C(0x7C00); 290 | const uint16_t negative_infinity_f16 = UINT16_C(0xFC00); 291 | 292 | const uint32_t positive_infinity_f32 = UINT32_C(0x7F800000); 293 | const uint32_t negative_infinity_f32 = UINT32_C(0xFF800000); 294 | 295 | const float positive_infinity_value = fp16_ieee_to_fp32_value(positive_infinity_f16); 296 | uint32_t positive_infinity_bits; 297 | memcpy(&positive_infinity_bits, 
&positive_infinity_value, sizeof(positive_infinity_bits)); 298 | EXPECT_EQ(positive_infinity_f32, positive_infinity_bits) << 299 | std::hex << std::uppercase << std::setfill('0') << 300 | "F16 = 0x" << std::setw(4) << positive_infinity_f16 << ", " << 301 | "F32(F16) = 0x" << std::setw(8) << positive_infinity_bits << ", " << 302 | "F32 = 0x" << std::setw(8) << positive_infinity_f32; 303 | 304 | const float negative_infinity_value = fp16_ieee_to_fp32_value(negative_infinity_f16); 305 | uint32_t negative_infinity_bits; 306 | memcpy(&negative_infinity_bits, &negative_infinity_value, sizeof(negative_infinity_bits)); 307 | EXPECT_EQ(negative_infinity_f32, negative_infinity_bits) << 308 | std::hex << std::uppercase << std::setfill('0') << 309 | "F16 = 0x" << std::setw(4) << negative_infinity_f16 << ", " << 310 | "F32(F16) = 0x" << std::setw(8) << negative_infinity_bits << ", " << 311 | "F32 = 0x" << std::setw(8) << negative_infinity_f32; 312 | } 313 | 314 | TEST(FP16_IEEE_TO_FP32_VALUE, positive_nan) { 315 | using std::signbit; 316 | using std::isnan; 317 | for (uint16_t m = UINT16_C(1); m < UINT16_C(0x0400); m++) { 318 | const uint16_t nan_f16 = UINT16_C(0x7C00) | m; 319 | const float nan_f32 = fp16_ieee_to_fp32_value(nan_f16); 320 | uint32_t nan_bits; 321 | memcpy(&nan_bits, &nan_f32, sizeof(nan_bits)); 322 | 323 | /* Check if NaN */ 324 | EXPECT_TRUE(isnan(nan_f32)) << 325 | std::hex << std::uppercase << std::setfill('0') << 326 | "F16 = 0x" << std::setw(4) << nan_f16 << ", " << 327 | "F32(F16) = 0x" << std::setw(8) << nan_bits; 328 | 329 | /* Check sign */ 330 | EXPECT_EQ(signbit(nan_f32), 0) << 331 | std::hex << std::uppercase << std::setfill('0') << 332 | "F16 = 0x" << std::setw(4) << nan_f16 << ", " << 333 | "F32(F16) = 0x" << std::setw(8) << nan_bits; 334 | } 335 | } 336 | 337 | TEST(FP16_IEEE_TO_FP32_VALUE, negative_nan) { 338 | using std::signbit; 339 | using std::isnan; 340 | for (uint16_t m = UINT16_C(1); m < UINT16_C(0x0400); m++) { 341 | const uint16_t 
nan_f16 = UINT16_C(0xFC00) | m; 342 | const float nan_f32 = fp16_ieee_to_fp32_value(nan_f16); 343 | uint32_t nan_bits; 344 | memcpy(&nan_bits, &nan_f32, sizeof(nan_bits)); 345 | 346 | /* Check if NaN */ 347 | EXPECT_TRUE(isnan(nan_f32)) << 348 | std::hex << std::uppercase << std::setfill('0') << 349 | "F16 = 0x" << std::setw(4) << nan_f16 << ", " << 350 | "F32(F16) = 0x" << std::setw(8) << nan_bits; 351 | 352 | /* Check sign */ 353 | EXPECT_EQ(signbit(nan_f32), 1) << 354 | std::hex << std::uppercase << std::setfill('0') << 355 | "F16 = 0x" << std::setw(4) << nan_f16 << ", " << 356 | "F32(F16) = 0x" << std::setw(8) << nan_bits; 357 | } 358 | } 359 | 360 | TEST(FP16_IEEE_TO_FP32_VALUE, positive_normalized_values) { 361 | const uint32_t exponentBias = 15; 362 | for (int32_t e = -14; e <= 15; e++) { 363 | for (uint16_t h = 0; h < 0x0400; h++) { 364 | const uint16_t fp16 = h + ((uint16_t) (e + exponentBias) << 10); 365 | const uint32_t fp32 = fp16::normalizedValues[h] + ((uint32_t) e << 23); 366 | const float value = fp16_ieee_to_fp32_value(fp16); 367 | uint32_t bits; 368 | memcpy(&bits, &value, sizeof(bits)); 369 | EXPECT_EQ(fp32, bits) << 370 | std::hex << std::uppercase << std::setfill('0') << 371 | "F16 = 0x" << std::setw(4) << fp16 << ", " << 372 | "F32(F16) = 0x" << std::setw(8) << bits << ", " << 373 | "F32 = 0x" << std::setw(8) << fp32; 374 | } 375 | } 376 | } 377 | 378 | TEST(FP16_IEEE_TO_FP32_VALUE, negative_normalized_values) { 379 | const uint32_t exponentBias = 15; 380 | for (int32_t e = -14; e <= 15; e++) { 381 | for (uint16_t h = 0; h < 0x0400; h++) { 382 | const uint16_t fp16 = (h + ((uint16_t) (e + exponentBias) << 10)) ^ UINT16_C(0x8000); 383 | const uint32_t fp32 = (fp16::normalizedValues[h] + ((uint32_t) e << 23)) ^ UINT32_C(0x80000000); 384 | const float value = fp16_ieee_to_fp32_value(fp16); 385 | uint32_t bits; 386 | memcpy(&bits, &value, sizeof(bits)); 387 | EXPECT_EQ(fp32, bits) << 388 | std::hex << std::uppercase << std::setfill('0') << 389 | 
"F16 = 0x" << std::setw(4) << fp16 << ", " << 390 | "F32(F16) = 0x" << std::setw(8) << bits << ", " << 391 | "F32 = 0x" << std::setw(8) << fp32; 392 | } 393 | } 394 | } 395 | 396 | TEST(FP16_IEEE_TO_FP32_VALUE, positive_denormalized_values) { 397 | for (uint16_t h = 0; h < 0x0400; h++) { 398 | const float value = fp16_ieee_to_fp32_value(h); 399 | uint32_t bits; 400 | memcpy(&bits, &value, sizeof(bits)); 401 | EXPECT_EQ(fp16::denormalizedValues[h], bits) << 402 | std::hex << std::uppercase << std::setfill('0') << 403 | "F16 = 0x" << std::setw(4) << h << ", " << 404 | "F32(F16) = 0x" << std::setw(8) << bits << ", " << 405 | "F32 = 0x" << std::setw(8) << fp16::denormalizedValues[h]; 406 | } 407 | } 408 | 409 | TEST(FP16_IEEE_TO_FP32_VALUE, negative_denormalized_values) { 410 | for (uint16_t h = 0; h < 0x0400; h++) { 411 | const uint16_t fp16 = h ^ UINT16_C(0x8000); 412 | const uint32_t fp32 = fp16::denormalizedValues[h] ^ UINT32_C(0x80000000); 413 | const float value = fp16_ieee_to_fp32_value(fp16); 414 | uint32_t bits; 415 | memcpy(&bits, &value, sizeof(bits)); 416 | EXPECT_EQ(fp32, bits) << 417 | std::hex << std::uppercase << std::setfill('0') << 418 | "F16 = 0x" << std::setw(4) << fp16 << ", " << 419 | "F32(F16) = 0x" << std::setw(8) << bits << ", " << 420 | "F32 = 0x" << std::setw(8) << fp32; 421 | } 422 | } 423 | -------------------------------------------------------------------------------- /test/alt-from-fp32-value.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | 11 | 12 | TEST(FP16_ALT_FROM_FP32_VALUE, normalized_powers_of_2) { 13 | const uint16_t min_po2_f16 = UINT16_C(0x0400); 14 | const uint16_t eighths_f16 = UINT16_C(0x3000); 15 | const uint16_t quarter_f16 = UINT16_C(0x3400); 16 | const uint16_t half_f16 = UINT16_C(0x3800); 17 | const uint16_t one_f16 = UINT16_C(0x3C00); 18 | const uint16_t two_f16 = UINT16_C(0x4000); 19 | 
const uint16_t four_f16 = UINT16_C(0x4400); 20 | const uint16_t eight_f16 = UINT16_C(0x4800); 21 | const uint16_t sixteen_f16 = UINT16_C(0x4C00); 22 | const uint16_t thirtytwo_f16 = UINT16_C(0x5000); 23 | const uint16_t sixtyfour_f16 = UINT16_C(0x5400); 24 | const uint16_t max_po2_f16 = UINT16_C(0x7C00); 25 | 26 | const uint32_t min_po2_f32 = UINT32_C(0x38800000); 27 | const uint32_t eighths_f32 = UINT32_C(0x3E000000); 28 | const uint32_t quarter_f32 = UINT32_C(0x3E800000); 29 | const uint32_t half_f32 = UINT32_C(0x3F000000); 30 | const uint32_t one_f32 = UINT32_C(0x3F800000); 31 | const uint32_t two_f32 = UINT32_C(0x40000000); 32 | const uint32_t four_f32 = UINT32_C(0x40800000); 33 | const uint32_t eight_f32 = UINT32_C(0x41000000); 34 | const uint32_t sixteen_f32 = UINT32_C(0x41800000); 35 | const uint32_t thirtytwo_f32 = UINT32_C(0x42000000); 36 | const uint32_t sixtyfour_f32 = UINT32_C(0x42800000); 37 | const uint32_t max_po2_f32 = UINT32_C(0x47800000); 38 | 39 | float min_po2_value; 40 | memcpy(&min_po2_value, &min_po2_f32, sizeof(min_po2_value)); 41 | EXPECT_EQ(min_po2_f16, fp16_alt_from_fp32_value(min_po2_value)) << 42 | std::hex << std::uppercase << std::setfill('0') << 43 | "F32 = 0x" << std::setw(8) << min_po2_f32 << ", " << 44 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(min_po2_value) << ", " << 45 | "F16 = 0x" << std::setw(4) << min_po2_f16; 46 | 47 | float eighths_value; 48 | memcpy(&eighths_value, &eighths_f32, sizeof(eighths_value)); 49 | EXPECT_EQ(eighths_f16, fp16_alt_from_fp32_value(eighths_value)) << 50 | std::hex << std::uppercase << std::setfill('0') << 51 | "F32 = 0x" << std::setw(8) << eighths_f32 << ", " << 52 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(eighths_value) << ", " << 53 | "F16 = 0x" << std::setw(4) << eighths_f16; 54 | 55 | float quarter_value; 56 | memcpy(&quarter_value, &quarter_f32, sizeof(quarter_value)); 57 | EXPECT_EQ(quarter_f16, fp16_alt_from_fp32_value(quarter_value)) << 58 | std::hex 
<< std::uppercase << std::setfill('0') << 59 | "F32 = 0x" << std::setw(8) << quarter_f32 << ", " << 60 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(quarter_value) << ", " << 61 | "F16 = 0x" << std::setw(4) << quarter_f16; 62 | 63 | float half_value; 64 | memcpy(&half_value, &half_f32, sizeof(half_value)); 65 | EXPECT_EQ(half_f16, fp16_alt_from_fp32_value(half_value)) << 66 | std::hex << std::uppercase << std::setfill('0') << 67 | "F32 = 0x" << std::setw(8) << half_f32 << ", " << 68 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(half_value) << ", " << 69 | "F16 = 0x" << std::setw(4) << half_f16; 70 | 71 | float one_value; 72 | memcpy(&one_value, &one_f32, sizeof(one_value)); 73 | EXPECT_EQ(one_f16, fp16_alt_from_fp32_value(one_value)) << 74 | std::hex << std::uppercase << std::setfill('0') << 75 | "F32 = 0x" << std::setw(8) << one_f32 << ", " << 76 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(one_value) << ", " << 77 | "F16 = 0x" << std::setw(4) << one_f16; 78 | 79 | float two_value; 80 | memcpy(&two_value, &two_f32, sizeof(two_value)); 81 | EXPECT_EQ(two_f16, fp16_alt_from_fp32_value(two_value)) << 82 | std::hex << std::uppercase << std::setfill('0') << 83 | "F32 = 0x" << std::setw(8) << two_f32 << ", " << 84 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(two_value) << ", " << 85 | "F16 = 0x" << std::setw(4) << two_f16; 86 | 87 | float four_value; 88 | memcpy(&four_value, &four_f32, sizeof(four_value)); 89 | EXPECT_EQ(four_f16, fp16_alt_from_fp32_value(four_value)) << 90 | std::hex << std::uppercase << std::setfill('0') << 91 | "F32 = 0x" << std::setw(8) << four_f32 << ", " << 92 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(four_value) << ", " << 93 | "F16 = 0x" << std::setw(4) << four_f16; 94 | 95 | float eight_value; 96 | memcpy(&eight_value, &eight_f32, sizeof(eight_value)); 97 | EXPECT_EQ(eight_f16, fp16_alt_from_fp32_value(eight_value)) << 98 | std::hex << std::uppercase << 
std::setfill('0') << 99 | "F32 = 0x" << std::setw(8) << eight_f32 << ", " << 100 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(eight_value) << ", " << 101 | "F16 = 0x" << std::setw(4) << eight_f16; 102 | 103 | float sixteen_value; 104 | memcpy(&sixteen_value, &sixteen_f32, sizeof(sixteen_value)); 105 | EXPECT_EQ(sixteen_f16, fp16_alt_from_fp32_value(sixteen_value)) << 106 | std::hex << std::uppercase << std::setfill('0') << 107 | "F32 = 0x" << std::setw(8) << sixteen_f32 << ", " << 108 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(sixteen_value) << ", " << 109 | "F16 = 0x" << std::setw(4) << sixteen_f16; 110 | 111 | float thirtytwo_value; 112 | memcpy(&thirtytwo_value, &thirtytwo_f32, sizeof(thirtytwo_value)); 113 | EXPECT_EQ(thirtytwo_f16, fp16_alt_from_fp32_value(thirtytwo_value)) << 114 | std::hex << std::uppercase << std::setfill('0') << 115 | "F32 = 0x" << std::setw(8) << thirtytwo_f32 << ", " << 116 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(thirtytwo_value) << ", " << 117 | "F16 = 0x" << std::setw(4) << thirtytwo_f16; 118 | 119 | float sixtyfour_value; 120 | memcpy(&sixtyfour_value, &sixtyfour_f32, sizeof(sixtyfour_value)); 121 | EXPECT_EQ(sixtyfour_f16, fp16_alt_from_fp32_value(sixtyfour_value)) << 122 | std::hex << std::uppercase << std::setfill('0') << 123 | "F32 = 0x" << std::setw(8) << sixtyfour_f32 << ", " << 124 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(sixtyfour_value) << ", " << 125 | "F16 = 0x" << std::setw(4) << sixtyfour_f16; 126 | /* Largest power of 2 in this test, converted with the ALT routine under test */ 127 | float max_po2_value; 128 | memcpy(&max_po2_value, &max_po2_f32, sizeof(max_po2_value)); 129 | EXPECT_EQ(max_po2_f16, fp16_alt_from_fp32_value(max_po2_value)) << 130 | std::hex << std::uppercase << std::setfill('0') << 131 | "F32 = 0x" << std::setw(8) << max_po2_f32 << ", " << 132 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(max_po2_value) << ", " << 133 | "F16 = 0x" << std::setw(4) << max_po2_f16; 134 | } 135 | 136 |
TEST(FP16_ALT_FROM_FP32_VALUE, denormalized_powers_of_2) { 137 | const uint16_t exp2_minus_15_f16 = UINT16_C(0x0200); 138 | const uint16_t exp2_minus_16_f16 = UINT16_C(0x0100); 139 | const uint16_t exp2_minus_17_f16 = UINT16_C(0x0080); 140 | const uint16_t exp2_minus_18_f16 = UINT16_C(0x0040); 141 | const uint16_t exp2_minus_19_f16 = UINT16_C(0x0020); 142 | const uint16_t exp2_minus_20_f16 = UINT16_C(0x0010); 143 | const uint16_t exp2_minus_21_f16 = UINT16_C(0x0008); 144 | const uint16_t exp2_minus_22_f16 = UINT16_C(0x0004); 145 | const uint16_t exp2_minus_23_f16 = UINT16_C(0x0002); 146 | const uint16_t exp2_minus_24_f16 = UINT16_C(0x0001); 147 | const uint16_t exp2_minus_25_f16 = UINT16_C(0x0000); 148 | 149 | const uint32_t exp2_minus_15_f32 = UINT32_C(0x38000000); 150 | const uint32_t exp2_minus_16_f32 = UINT32_C(0x37800000); 151 | const uint32_t exp2_minus_17_f32 = UINT32_C(0x37000000); 152 | const uint32_t exp2_minus_18_f32 = UINT32_C(0x36800000); 153 | const uint32_t exp2_minus_19_f32 = UINT32_C(0x36000000); 154 | const uint32_t exp2_minus_20_f32 = UINT32_C(0x35800000); 155 | const uint32_t exp2_minus_21_f32 = UINT32_C(0x35000000); 156 | const uint32_t exp2_minus_22_f32 = UINT32_C(0x34800000); 157 | const uint32_t exp2_minus_23_f32 = UINT32_C(0x34000000); 158 | const uint32_t exp2_minus_24_f32 = UINT32_C(0x33800000); 159 | const uint32_t exp2_minus_25_f32 = UINT32_C(0x33000000); 160 | 161 | float exp2_minus_15_value; 162 | memcpy(&exp2_minus_15_value, &exp2_minus_15_f32, sizeof(exp2_minus_15_value)); 163 | EXPECT_EQ(exp2_minus_15_f16, fp16_alt_from_fp32_value(exp2_minus_15_value)) << 164 | std::hex << std::uppercase << std::setfill('0') << 165 | "F32 = 0x" << std::setw(8) << exp2_minus_15_f32 << ", " << 166 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(exp2_minus_15_value) << ", " << 167 | "F16 = 0x" << std::setw(4) << exp2_minus_15_f16; 168 | 169 | float exp2_minus_16_value; 170 | memcpy(&exp2_minus_16_value, &exp2_minus_16_f32, 
sizeof(exp2_minus_16_value)); 171 | EXPECT_EQ(exp2_minus_16_f16, fp16_alt_from_fp32_value(exp2_minus_16_value)) << 172 | std::hex << std::uppercase << std::setfill('0') << 173 | "F32 = 0x" << std::setw(8) << exp2_minus_16_f32 << ", " << 174 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(exp2_minus_16_value) << ", " << 175 | "F16 = 0x" << std::setw(4) << exp2_minus_16_f16; 176 | 177 | float exp2_minus_17_value; 178 | memcpy(&exp2_minus_17_value, &exp2_minus_17_f32, sizeof(exp2_minus_17_value)); 179 | EXPECT_EQ(exp2_minus_17_f16, fp16_alt_from_fp32_value(exp2_minus_17_value)) << 180 | std::hex << std::uppercase << std::setfill('0') << 181 | "F32 = 0x" << std::setw(8) << exp2_minus_17_f32 << ", " << 182 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(exp2_minus_17_value) << ", " << 183 | "F16 = 0x" << std::setw(4) << exp2_minus_17_f16; 184 | 185 | float exp2_minus_18_value; 186 | memcpy(&exp2_minus_18_value, &exp2_minus_18_f32, sizeof(exp2_minus_18_value)); 187 | EXPECT_EQ(exp2_minus_18_f16, fp16_alt_from_fp32_value(exp2_minus_18_value)) << 188 | std::hex << std::uppercase << std::setfill('0') << 189 | "F32 = 0x" << std::setw(8) << exp2_minus_18_f32 << ", " << 190 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(exp2_minus_18_value) << ", " << 191 | "F16 = 0x" << std::setw(4) << exp2_minus_18_f16; 192 | 193 | float exp2_minus_19_value; 194 | memcpy(&exp2_minus_19_value, &exp2_minus_19_f32, sizeof(exp2_minus_19_value)); 195 | EXPECT_EQ(exp2_minus_19_f16, fp16_alt_from_fp32_value(exp2_minus_19_value)) << 196 | std::hex << std::uppercase << std::setfill('0') << 197 | "F32 = 0x" << std::setw(8) << exp2_minus_19_f32 << ", " << 198 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(exp2_minus_19_value) << ", " << 199 | "F16 = 0x" << std::setw(4) << exp2_minus_19_f16; 200 | 201 | float exp2_minus_20_value; 202 | memcpy(&exp2_minus_20_value, &exp2_minus_20_f32, sizeof(exp2_minus_20_value)); 203 | 
EXPECT_EQ(exp2_minus_20_f16, fp16_alt_from_fp32_value(exp2_minus_20_value)) << 204 | std::hex << std::uppercase << std::setfill('0') << 205 | "F32 = 0x" << std::setw(8) << exp2_minus_20_f32 << ", " << 206 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(exp2_minus_20_value) << ", " << 207 | "F16 = 0x" << std::setw(4) << exp2_minus_20_f16; 208 | 209 | float exp2_minus_21_value; 210 | memcpy(&exp2_minus_21_value, &exp2_minus_21_f32, sizeof(exp2_minus_21_value)); 211 | EXPECT_EQ(exp2_minus_21_f16, fp16_alt_from_fp32_value(exp2_minus_21_value)) << 212 | std::hex << std::uppercase << std::setfill('0') << 213 | "F32 = 0x" << std::setw(8) << exp2_minus_21_f32 << ", " << 214 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(exp2_minus_21_value) << ", " << 215 | "F16 = 0x" << std::setw(4) << exp2_minus_21_f16; 216 | 217 | float exp2_minus_22_value; 218 | memcpy(&exp2_minus_22_value, &exp2_minus_22_f32, sizeof(exp2_minus_22_value)); 219 | EXPECT_EQ(exp2_minus_22_f16, fp16_alt_from_fp32_value(exp2_minus_22_value)) << 220 | std::hex << std::uppercase << std::setfill('0') << 221 | "F32 = 0x" << std::setw(8) << exp2_minus_22_f32 << ", " << 222 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(exp2_minus_22_value) << ", " << 223 | "F16 = 0x" << std::setw(4) << exp2_minus_22_f16; 224 | 225 | float exp2_minus_23_value; 226 | memcpy(&exp2_minus_23_value, &exp2_minus_23_f32, sizeof(exp2_minus_23_value)); 227 | EXPECT_EQ(exp2_minus_23_f16, fp16_alt_from_fp32_value(exp2_minus_23_value)) << 228 | std::hex << std::uppercase << std::setfill('0') << 229 | "F32 = 0x" << std::setw(8) << exp2_minus_23_f32 << ", " << 230 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(exp2_minus_23_value) << ", " << 231 | "F16 = 0x" << std::setw(4) << exp2_minus_23_f16; 232 | 233 | float exp2_minus_24_value; 234 | memcpy(&exp2_minus_24_value, &exp2_minus_24_f32, sizeof(exp2_minus_24_value)); 235 | EXPECT_EQ(exp2_minus_24_f16, 
fp16_alt_from_fp32_value(exp2_minus_24_value)) << 236 | std::hex << std::uppercase << std::setfill('0') << 237 | "F32 = 0x" << std::setw(8) << exp2_minus_24_f32 << ", " << 238 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(exp2_minus_24_value) << ", " << 239 | "F16 = 0x" << std::setw(4) << exp2_minus_24_f16; 240 | 241 | float exp2_minus_25_value; 242 | memcpy(&exp2_minus_25_value, &exp2_minus_25_f32, sizeof(exp2_minus_25_value)); 243 | EXPECT_EQ(exp2_minus_25_f16, fp16_alt_from_fp32_value(exp2_minus_25_value)) << 244 | std::hex << std::uppercase << std::setfill('0') << 245 | "F32 = 0x" << std::setw(8) << exp2_minus_25_f32 << ", " << 246 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(exp2_minus_25_value) << ", " << 247 | "F16 = 0x" << std::setw(4) << exp2_minus_25_f16; 248 | } 249 | 250 | TEST(FP16_ALT_FROM_FP32_VALUE, zero) { 251 | const uint16_t positive_zero_f16 = UINT16_C(0x0000); 252 | const uint16_t negative_zero_f16 = UINT16_C(0x8000); 253 | 254 | const uint32_t positive_zero_f32 = UINT32_C(0x00000000); 255 | const uint32_t negative_zero_f32 = UINT32_C(0x80000000); 256 | 257 | float positive_zero_value; 258 | memcpy(&positive_zero_value, &positive_zero_f32, sizeof(positive_zero_value)); 259 | EXPECT_EQ(positive_zero_f16, fp16_alt_from_fp32_value(positive_zero_value)) << 260 | std::hex << std::uppercase << std::setfill('0') << 261 | "F32 = 0x" << std::setw(8) << positive_zero_f32 << ", " << 262 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(positive_zero_value) << ", " << 263 | "F16 = 0x" << std::setw(4) << positive_zero_f16; 264 | 265 | float negative_zero_value; 266 | memcpy(&negative_zero_value, &negative_zero_f32, sizeof(negative_zero_value)); 267 | EXPECT_EQ(negative_zero_f16, fp16_alt_from_fp32_value(negative_zero_value)) << 268 | std::hex << std::uppercase << std::setfill('0') << 269 | "F32 = 0x" << std::setw(8) << negative_zero_f32 << ", " << 270 | "F16(F32) = 0x" << std::setw(4) << 
fp16_alt_from_fp32_value(negative_zero_value) << ", " << 271 | "F16 = 0x" << std::setw(4) << negative_zero_f16; 272 | } 273 | 274 | TEST(FP16_ALT_FROM_FP32_VALUE, infinity) { 275 | const uint16_t max_f16 = UINT16_C(0x7FFF); 276 | const uint16_t min_f16 = UINT16_C(0xFFFF); 277 | 278 | const uint32_t positive_infinity_f32 = UINT32_C(0x7F800000); 279 | const uint32_t negative_infinity_f32 = UINT32_C(0xFF800000); 280 | 281 | float positive_infinity_value; 282 | memcpy(&positive_infinity_value, &positive_infinity_f32, sizeof(positive_infinity_value)); 283 | EXPECT_EQ(max_f16, fp16_alt_from_fp32_value(positive_infinity_value)) << 284 | std::hex << std::uppercase << std::setfill('0') << 285 | "F32 = 0x" << std::setw(8) << positive_infinity_f32 << ", " << 286 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(positive_infinity_value) << ", " << 287 | "F16 = 0x" << std::setw(4) << max_f16; 288 | 289 | float negative_infinity_value; 290 | memcpy(&negative_infinity_value, &negative_infinity_f32, sizeof(negative_infinity_value)); 291 | EXPECT_EQ(min_f16, fp16_alt_from_fp32_value(negative_infinity_value)) << 292 | std::hex << std::uppercase << std::setfill('0') << 293 | "F32 = 0x" << std::setw(8) << negative_infinity_f32 << ", " << 294 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(negative_infinity_value) << ", " << 295 | "F16 = 0x" << std::setw(4) << min_f16; 296 | } 297 | 298 | TEST(FP16_ALT_FROM_FP32_VALUE, positive_nan) { 299 | for (uint32_t nan_f32 = UINT32_C(0x7FFFFFFF); nan_f32 > UINT32_C(0x7F800000); nan_f32--) { 300 | float nan_value; 301 | memcpy(&nan_value, &nan_f32, sizeof(nan_value)); 302 | const uint16_t nan_f16 = fp16_alt_from_fp32_value(nan_value); 303 | 304 | /* Check sign */ 305 | ASSERT_EQ(nan_f16 & UINT16_C(0x8000), 0) << 306 | std::hex << std::uppercase << std::setfill('0') << 307 | "F32 = 0x" << std::setw(8) << nan_f32 << ", " << 308 | "F16(F32) = 0x" << std::setw(4) << nan_f16; 309 | 310 | /* Check exponent */ 311 | 
ASSERT_EQ(nan_f16 & UINT16_C(0x7C00), UINT16_C(0x7C00)) << 312 | std::hex << std::uppercase << std::setfill('0') << 313 | "F32 = 0x" << std::setw(8) << nan_f32 << ", " << 314 | "F16(F32) = 0x" << std::setw(4) << nan_f16; 315 | 316 | /* Check mantissa */ 317 | ASSERT_NE(nan_f16 & UINT16_C(0x03FF), 0) << 318 | std::hex << std::uppercase << std::setfill('0') << 319 | "F32 = 0x" << std::setw(8) << nan_f32 << ", " << 320 | "F16(F32) = 0x" << std::setw(4) << nan_f16; 321 | } 322 | } 323 | 324 | TEST(FP16_ALT_FROM_FP32_VALUE, negative_nan) { 325 | for (uint32_t nan_f32 = UINT32_C(0xFFFFFFFF); nan_f32 > UINT32_C(0xFF800000); nan_f32--) { 326 | float nan_value; 327 | memcpy(&nan_value, &nan_f32, sizeof(nan_value)); 328 | const uint16_t nan_f16 = fp16_alt_from_fp32_value(nan_value); 329 | 330 | /* Check sign */ 331 | ASSERT_EQ(nan_f16 & UINT16_C(0x8000), UINT16_C(0x8000)) << 332 | std::hex << std::uppercase << std::setfill('0') << 333 | "F32 = 0x" << std::setw(8) << nan_f32 << ", " << 334 | "F16(F32) = 0x" << std::setw(4) << nan_f16; 335 | 336 | /* Check exponent */ 337 | ASSERT_EQ(nan_f16 & UINT16_C(0x7C00), UINT16_C(0x7C00)) << 338 | std::hex << std::uppercase << std::setfill('0') << 339 | "F32 = 0x" << std::setw(8) << nan_f32 << ", " << 340 | "F16(F32) = 0x" << std::setw(4) << nan_f16; 341 | 342 | /* Check mantissa */ 343 | ASSERT_NE(nan_f16 & UINT16_C(0x03FF), 0) << 344 | std::hex << std::uppercase << std::setfill('0') << 345 | "F32 = 0x" << std::setw(8) << nan_f32 << ", " << 346 | "F16(F32) = 0x" << std::setw(4) << nan_f16; 347 | } 348 | } 349 | 350 | TEST(FP16_ALT_FROM_FP32_VALUE, revertible) { 351 | /* Positive values */ 352 | for (uint16_t f16 = UINT16_C(0x0000); f16 <= UINT16_C(0x7FFF); f16++) { 353 | const float value_f32 = fp16_alt_to_fp32_value(f16); 354 | uint32_t bits_f32; 355 | memcpy(&bits_f32, &value_f32, sizeof(bits_f32)); 356 | 357 | ASSERT_EQ(f16, fp16_alt_from_fp32_value(value_f32)) << 358 | std::hex << std::uppercase << std::setfill('0') << 359 | "F16 = 
0x" << std::setw(4) << f16 << ", " << 360 | "F32(F16) = 0x" << std::setw(8) << bits_f32 << ", " << 361 | "F16(F32(F16)) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(value_f32); 362 | } 363 | 364 | /* Negative values */ 365 | for (uint16_t f16 = UINT16_C(0xFFFF); f16 >= UINT16_C(0x8000); f16--) { 366 | const float value_f32 = fp16_alt_to_fp32_value(f16); 367 | uint32_t bits_f32; 368 | memcpy(&bits_f32, &value_f32, sizeof(bits_f32)); 369 | 370 | ASSERT_EQ(f16, fp16_alt_from_fp32_value(value_f32)) << 371 | std::hex << std::uppercase << std::setfill('0') << 372 | "F16 = 0x" << std::setw(4) << f16 << ", " << 373 | "F32(F16) = 0x" << std::setw(8) << bits_f32 << ", " << 374 | "F16(F32(F16)) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(value_f32); 375 | } 376 | } 377 | 378 | TEST(FP16_ALT_FROM_FP32_VALUE, underflow) { 379 | const uint32_t min_nonzero_f32 = UINT32_C(0x33000001); 380 | const uint16_t zero_f16 = UINT16_C(0x0000); 381 | const uint16_t min_f16 = UINT16_C(0x0001); 382 | for (uint32_t bits = UINT32_C(0x00000001); bits < min_nonzero_f32; bits++) { 383 | float value; 384 | memcpy(&value, &bits, sizeof(value)); 385 | ASSERT_EQ(zero_f16, fp16_alt_from_fp32_value(value)) << 386 | std::hex << std::uppercase << std::setfill('0') << 387 | "F32 = 0x" << std::setw(8) << bits << ", " << 388 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(value) << ", " << 389 | "F16 = 0x" << std::setw(4) << zero_f16; 390 | } 391 | float min_nonzero_value; 392 | memcpy(&min_nonzero_value, &min_nonzero_f32, sizeof(min_nonzero_value)); 393 | ASSERT_EQ(min_f16, fp16_alt_from_fp32_value(min_nonzero_value)) << 394 | std::hex << std::uppercase << std::setfill('0') << 395 | "F32 = 0x" << std::setw(8) << min_nonzero_f32 << ", " << 396 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(min_nonzero_value) << ", " << 397 | "F16 = 0x" << std::setw(4) << min_f16; 398 | } 399 | 400 | TEST(FP16_ALT_FROM_FP32_VALUE, saturation) { 401 | const uint32_t max_f16_f32 = 
UINT32_C(0x47FFE000); 402 | const uint16_t max_f16 = UINT16_C(0x7FFF); 403 | const uint32_t positive_infinity_f32 = UINT32_C(0x7F800000); 404 | for (uint32_t bits = positive_infinity_f32; bits > max_f16_f32; bits--) { 405 | float value; 406 | memcpy(&value, &bits, sizeof(value)); 407 | ASSERT_EQ(max_f16, fp16_alt_from_fp32_value(value)) << 408 | std::hex << std::uppercase << std::setfill('0') << 409 | "F32 = 0x" << std::setw(8) << bits << ", " << 410 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(value) << ", " << 411 | "F16 = 0x" << std::setw(4) << max_f16; 412 | } 413 | } 414 | 415 | TEST(FP16_ALT_FROM_FP32_VALUE, positive_denormalized_values) { 416 | const uint32_t min_nonzero_f32 = UINT32_C(0x33000001); 417 | 418 | uint32_t f32_begin = min_nonzero_f32; 419 | for (uint16_t f16 = 0; f16 < UINT16_C(0x0400); f16++) { 420 | const uint32_t f32_end = fp16::denormalizedRanges[f16]; 421 | for (uint32_t f32 = f32_begin; f32 < f32_end; f32++) { 422 | float value; 423 | memcpy(&value, &f32, sizeof(value)); 424 | ASSERT_EQ(f16, fp16_alt_from_fp32_value(value)) << 425 | std::hex << std::uppercase << std::setfill('0') << 426 | "F32 = 0x" << std::setw(8) << f32 << ", " << 427 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(value) << ", " << 428 | "F16 = 0x" << std::setw(4) << f16; 429 | } 430 | f32_begin = f32_end; 431 | } 432 | } 433 | 434 | TEST(FP16_ALT_FROM_FP32_VALUE, negative_denormalized_values) { 435 | const uint32_t min_nonzero_f32 = UINT32_C(0x33000001); 436 | 437 | uint32_t f32_begin = min_nonzero_f32 | UINT32_C(0x80000000); 438 | for (uint16_t f16 = UINT16_C(0x8000); f16 < UINT16_C(0x8400); f16++) { 439 | const uint32_t f32_end = fp16::denormalizedRanges[f16 & UINT16_C(0x7FFF)] | UINT32_C(0x80000000); 440 | for (uint32_t f32 = f32_begin; f32 < f32_end; f32++) { 441 | float value; 442 | memcpy(&value, &f32, sizeof(value)); 443 | ASSERT_EQ(f16, fp16_alt_from_fp32_value(value)) << 444 | std::hex << std::uppercase << std::setfill('0') << 
445 | "F32 = 0x" << std::setw(8) << f32 << ", " << 446 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(value) << ", " << 447 | "F16 = 0x" << std::setw(4) << f16; 448 | } 449 | f32_begin = f32_end; 450 | } 451 | } 452 | 453 | TEST(FP16_ALT_FROM_FP32_VALUE, positive_normalized_values) { 454 | /* Minimum number that rounds to 1.0h when converted to half-precision */ 455 | const uint32_t min_one_f32 = UINT32_C(0x3F7FF000); 456 | const uint32_t e_bias = 15; 457 | 458 | for (int32_t e = -14; e <= 16; e++) { 459 | uint32_t f32_begin = min_one_f32 + (uint32_t(e) << 23); 460 | for (uint16_t f16 = uint16_t(e + e_bias) << 10; f16 < uint16_t(e + e_bias + 1) << 10; f16++) { 461 | const uint32_t f32_end = fp16::normalizedRanges[f16 & UINT16_C(0x3FF)] + (uint32_t(e) << 23); 462 | for (uint32_t f32 = f32_begin; f32 < f32_end; f32++) { 463 | float value; 464 | memcpy(&value, &f32, sizeof(value)); 465 | ASSERT_EQ(f16, fp16_alt_from_fp32_value(value)) << 466 | std::hex << std::uppercase << std::setfill('0') << 467 | "F32 = 0x" << std::setw(8) << f32 << ", " << 468 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(value) << ", " << 469 | "F16 = 0x" << std::setw(4) << f16; 470 | } 471 | f32_begin = f32_end; 472 | } 473 | } 474 | } 475 | 476 | TEST(FP16_ALT_FROM_FP32_VALUE, negative_normalized_values) { 477 | /* Minimum number that rounds to 1.0h when converted to half-precision */ 478 | const uint32_t min_one_f32 = UINT32_C(0x3F7FF000); 479 | const uint32_t e_bias = 15; 480 | 481 | for (int32_t e = -14; e <= 16; e++) { 482 | uint32_t f32_begin = (min_one_f32 | UINT32_C(0x80000000)) + (uint32_t(e) << 23); 483 | for (uint16_t f16 = (UINT16_C(0x8000) | (uint16_t(e + e_bias) << 10)); f16 < (UINT16_C(0x8000) | (uint16_t(e + e_bias + 1) << 10)); f16++) { 484 | const uint32_t f32_end = (fp16::normalizedRanges[f16 & UINT16_C(0x3FF)] | UINT32_C(0x80000000)) + (uint32_t(e) << 23); 485 | for (uint32_t f32 = f32_begin; f32 < f32_end; f32++) { 486 | float value; 487 | 
memcpy(&value, &f32, sizeof(value)); 488 | ASSERT_EQ(f16, fp16_alt_from_fp32_value(value)) << 489 | std::hex << std::uppercase << std::setfill('0') << 490 | "F32 = 0x" << std::setw(8) << f32 << ", " << 491 | "F16(F32) = 0x" << std::setw(4) << fp16_alt_from_fp32_value(value) << ", " << 492 | "F16 = 0x" << std::setw(4) << f16; 493 | } 494 | f32_begin = f32_end; 495 | } 496 | } 497 | } 498 | -------------------------------------------------------------------------------- /test/ieee-from-fp32-value.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | 11 | TEST(FP16_IEEE_FROM_FP32_VALUE, normalized_powers_of_2) { 12 | const uint16_t min_po2_f16 = UINT16_C(0x0400); 13 | const uint16_t eighths_f16 = UINT16_C(0x3000); 14 | const uint16_t quarter_f16 = UINT16_C(0x3400); 15 | const uint16_t half_f16 = UINT16_C(0x3800); 16 | const uint16_t one_f16 = UINT16_C(0x3C00); 17 | const uint16_t two_f16 = UINT16_C(0x4000); 18 | const uint16_t four_f16 = UINT16_C(0x4400); 19 | const uint16_t eight_f16 = UINT16_C(0x4800); 20 | const uint16_t sixteen_f16 = UINT16_C(0x4C00); 21 | const uint16_t thirtytwo_f16 = UINT16_C(0x5000); 22 | const uint16_t sixtyfour_f16 = UINT16_C(0x5400); 23 | const uint16_t max_po2_f16 = UINT16_C(0x7800); 24 | 25 | const uint32_t min_po2_f32 = UINT32_C(0x38800000); 26 | const uint32_t eighths_f32 = UINT32_C(0x3E000000); 27 | const uint32_t quarter_f32 = UINT32_C(0x3E800000); 28 | const uint32_t half_f32 = UINT32_C(0x3F000000); 29 | const uint32_t one_f32 = UINT32_C(0x3F800000); 30 | const uint32_t two_f32 = UINT32_C(0x40000000); 31 | const uint32_t four_f32 = UINT32_C(0x40800000); 32 | const uint32_t eight_f32 = UINT32_C(0x41000000); 33 | const uint32_t sixteen_f32 = UINT32_C(0x41800000); 34 | const uint32_t thirtytwo_f32 = UINT32_C(0x42000000); 35 | const uint32_t sixtyfour_f32 = UINT32_C(0x42800000); 36 | const uint32_t max_po2_f32 = 
UINT32_C(0x47000000); 37 | 38 | float min_po2_value; 39 | memcpy(&min_po2_value, &min_po2_f32, sizeof(min_po2_value)); 40 | EXPECT_EQ(min_po2_f16, fp16_ieee_from_fp32_value(min_po2_value)) << 41 | std::hex << std::uppercase << std::setfill('0') << 42 | "F32 = 0x" << std::setw(8) << min_po2_f32 << ", " << 43 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(min_po2_value) << ", " << 44 | "F16 = 0x" << std::setw(4) << min_po2_f16; 45 | 46 | float eighths_value; 47 | memcpy(&eighths_value, &eighths_f32, sizeof(eighths_value)); 48 | EXPECT_EQ(eighths_f16, fp16_ieee_from_fp32_value(eighths_value)) << 49 | std::hex << std::uppercase << std::setfill('0') << 50 | "F32 = 0x" << std::setw(8) << eighths_f32 << ", " << 51 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(eighths_value) << ", " << 52 | "F16 = 0x" << std::setw(4) << eighths_f16; 53 | 54 | float quarter_value; 55 | memcpy(&quarter_value, &quarter_f32, sizeof(quarter_value)); 56 | EXPECT_EQ(quarter_f16, fp16_ieee_from_fp32_value(quarter_value)) << 57 | std::hex << std::uppercase << std::setfill('0') << 58 | "F32 = 0x" << std::setw(8) << quarter_f32 << ", " << 59 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(quarter_value) << ", " << 60 | "F16 = 0x" << std::setw(4) << quarter_f16; 61 | 62 | float half_value; 63 | memcpy(&half_value, &half_f32, sizeof(half_value)); 64 | EXPECT_EQ(half_f16, fp16_ieee_from_fp32_value(half_value)) << 65 | std::hex << std::uppercase << std::setfill('0') << 66 | "F32 = 0x" << std::setw(8) << half_f32 << ", " << 67 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(half_value) << ", " << 68 | "F16 = 0x" << std::setw(4) << half_f16; 69 | 70 | float one_value; 71 | memcpy(&one_value, &one_f32, sizeof(one_value)); 72 | EXPECT_EQ(one_f16, fp16_ieee_from_fp32_value(one_value)) << 73 | std::hex << std::uppercase << std::setfill('0') << 74 | "F32 = 0x" << std::setw(8) << one_f32 << ", " << 75 | "F16(F32) = 0x" << std::setw(4) << 
fp16_ieee_from_fp32_value(one_value) << ", " << 76 | "F16 = 0x" << std::setw(4) << one_f16; 77 | 78 | float two_value; 79 | memcpy(&two_value, &two_f32, sizeof(two_value)); 80 | EXPECT_EQ(two_f16, fp16_ieee_from_fp32_value(two_value)) << 81 | std::hex << std::uppercase << std::setfill('0') << 82 | "F32 = 0x" << std::setw(8) << two_f32 << ", " << 83 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(two_value) << ", " << 84 | "F16 = 0x" << std::setw(4) << two_f16; 85 | 86 | float four_value; 87 | memcpy(&four_value, &four_f32, sizeof(four_value)); 88 | EXPECT_EQ(four_f16, fp16_ieee_from_fp32_value(four_value)) << 89 | std::hex << std::uppercase << std::setfill('0') << 90 | "F32 = 0x" << std::setw(8) << four_f32 << ", " << 91 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(four_value) << ", " << 92 | "F16 = 0x" << std::setw(4) << four_f16; 93 | 94 | float eight_value; 95 | memcpy(&eight_value, &eight_f32, sizeof(eight_value)); 96 | EXPECT_EQ(eight_f16, fp16_ieee_from_fp32_value(eight_value)) << 97 | std::hex << std::uppercase << std::setfill('0') << 98 | "F32 = 0x" << std::setw(8) << eight_f32 << ", " << 99 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(eight_value) << ", " << 100 | "F16 = 0x" << std::setw(4) << eight_f16; 101 | 102 | float sixteen_value; 103 | memcpy(&sixteen_value, &sixteen_f32, sizeof(sixteen_value)); 104 | EXPECT_EQ(sixteen_f16, fp16_ieee_from_fp32_value(sixteen_value)) << 105 | std::hex << std::uppercase << std::setfill('0') << 106 | "F32 = 0x" << std::setw(8) << sixteen_f32 << ", " << 107 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(sixteen_value) << ", " << 108 | "F16 = 0x" << std::setw(4) << sixteen_f16; 109 | 110 | float thirtytwo_value; 111 | memcpy(&thirtytwo_value, &thirtytwo_f32, sizeof(thirtytwo_value)); 112 | EXPECT_EQ(thirtytwo_f16, fp16_ieee_from_fp32_value(thirtytwo_value)) << 113 | std::hex << std::uppercase << std::setfill('0') << 114 | "F32 = 0x" << std::setw(8) << 
thirtytwo_f32 << ", " << 115 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(thirtytwo_value) << ", " << 116 | "F16 = 0x" << std::setw(4) << thirtytwo_f16; 117 | 118 | float sixtyfour_value; 119 | memcpy(&sixtyfour_value, &sixtyfour_f32, sizeof(sixtyfour_value)); 120 | EXPECT_EQ(sixtyfour_f16, fp16_ieee_from_fp32_value(sixtyfour_value)) << 121 | std::hex << std::uppercase << std::setfill('0') << 122 | "F32 = 0x" << std::setw(8) << sixtyfour_f32 << ", " << 123 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(sixtyfour_value) << ", " << 124 | "F16 = 0x" << std::setw(4) << sixtyfour_f16; 125 | 126 | float max_po2_value; 127 | memcpy(&max_po2_value, &max_po2_f32, sizeof(max_po2_value)); 128 | EXPECT_EQ(max_po2_f16, fp16_ieee_from_fp32_value(max_po2_value)) << 129 | std::hex << std::uppercase << std::setfill('0') << 130 | "F32 = 0x" << std::setw(8) << max_po2_f32 << ", " << 131 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(max_po2_value) << ", " << 132 | "F16 = 0x" << std::setw(4) << max_po2_f16; 133 | } 134 | 135 | TEST(FP16_IEEE_FROM_FP32_VALUE, denormalized_powers_of_2) { 136 | const uint16_t exp2_minus_15_f16 = UINT16_C(0x0200); 137 | const uint16_t exp2_minus_16_f16 = UINT16_C(0x0100); 138 | const uint16_t exp2_minus_17_f16 = UINT16_C(0x0080); 139 | const uint16_t exp2_minus_18_f16 = UINT16_C(0x0040); 140 | const uint16_t exp2_minus_19_f16 = UINT16_C(0x0020); 141 | const uint16_t exp2_minus_20_f16 = UINT16_C(0x0010); 142 | const uint16_t exp2_minus_21_f16 = UINT16_C(0x0008); 143 | const uint16_t exp2_minus_22_f16 = UINT16_C(0x0004); 144 | const uint16_t exp2_minus_23_f16 = UINT16_C(0x0002); 145 | const uint16_t exp2_minus_24_f16 = UINT16_C(0x0001); 146 | const uint16_t exp2_minus_25_f16 = UINT16_C(0x0000); 147 | 148 | const uint32_t exp2_minus_15_f32 = UINT32_C(0x38000000); 149 | const uint32_t exp2_minus_16_f32 = UINT32_C(0x37800000); 150 | const uint32_t exp2_minus_17_f32 = UINT32_C(0x37000000); 151 | const uint32_t 
exp2_minus_18_f32 = UINT32_C(0x36800000); 152 | const uint32_t exp2_minus_19_f32 = UINT32_C(0x36000000); 153 | const uint32_t exp2_minus_20_f32 = UINT32_C(0x35800000); 154 | const uint32_t exp2_minus_21_f32 = UINT32_C(0x35000000); 155 | const uint32_t exp2_minus_22_f32 = UINT32_C(0x34800000); 156 | const uint32_t exp2_minus_23_f32 = UINT32_C(0x34000000); 157 | const uint32_t exp2_minus_24_f32 = UINT32_C(0x33800000); 158 | const uint32_t exp2_minus_25_f32 = UINT32_C(0x33000000); 159 | 160 | float exp2_minus_15_value; 161 | memcpy(&exp2_minus_15_value, &exp2_minus_15_f32, sizeof(exp2_minus_15_value)); 162 | EXPECT_EQ(exp2_minus_15_f16, fp16_ieee_from_fp32_value(exp2_minus_15_value)) << 163 | std::hex << std::uppercase << std::setfill('0') << 164 | "F32 = 0x" << std::setw(8) << exp2_minus_15_f32 << ", " << 165 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(exp2_minus_15_value) << ", " << 166 | "F16 = 0x" << std::setw(4) << exp2_minus_15_f16; 167 | 168 | float exp2_minus_16_value; 169 | memcpy(&exp2_minus_16_value, &exp2_minus_16_f32, sizeof(exp2_minus_16_value)); 170 | EXPECT_EQ(exp2_minus_16_f16, fp16_ieee_from_fp32_value(exp2_minus_16_value)) << 171 | std::hex << std::uppercase << std::setfill('0') << 172 | "F32 = 0x" << std::setw(8) << exp2_minus_16_f32 << ", " << 173 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(exp2_minus_16_value) << ", " << 174 | "F16 = 0x" << std::setw(4) << exp2_minus_16_f16; 175 | 176 | float exp2_minus_17_value; 177 | memcpy(&exp2_minus_17_value, &exp2_minus_17_f32, sizeof(exp2_minus_17_value)); 178 | EXPECT_EQ(exp2_minus_17_f16, fp16_ieee_from_fp32_value(exp2_minus_17_value)) << 179 | std::hex << std::uppercase << std::setfill('0') << 180 | "F32 = 0x" << std::setw(8) << exp2_minus_17_f32 << ", " << 181 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(exp2_minus_17_value) << ", " << 182 | "F16 = 0x" << std::setw(4) << exp2_minus_17_f16; 183 | 184 | float exp2_minus_18_value; 185 | 
memcpy(&exp2_minus_18_value, &exp2_minus_18_f32, sizeof(exp2_minus_18_value)); 186 | EXPECT_EQ(exp2_minus_18_f16, fp16_ieee_from_fp32_value(exp2_minus_18_value)) << 187 | std::hex << std::uppercase << std::setfill('0') << 188 | "F32 = 0x" << std::setw(8) << exp2_minus_18_f32 << ", " << 189 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(exp2_minus_18_value) << ", " << 190 | "F16 = 0x" << std::setw(4) << exp2_minus_18_f16; 191 | 192 | float exp2_minus_19_value; 193 | memcpy(&exp2_minus_19_value, &exp2_minus_19_f32, sizeof(exp2_minus_19_value)); 194 | EXPECT_EQ(exp2_minus_19_f16, fp16_ieee_from_fp32_value(exp2_minus_19_value)) << 195 | std::hex << std::uppercase << std::setfill('0') << 196 | "F32 = 0x" << std::setw(8) << exp2_minus_19_f32 << ", " << 197 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(exp2_minus_19_value) << ", " << 198 | "F16 = 0x" << std::setw(4) << exp2_minus_19_f16; 199 | 200 | float exp2_minus_20_value; 201 | memcpy(&exp2_minus_20_value, &exp2_minus_20_f32, sizeof(exp2_minus_20_value)); 202 | EXPECT_EQ(exp2_minus_20_f16, fp16_ieee_from_fp32_value(exp2_minus_20_value)) << 203 | std::hex << std::uppercase << std::setfill('0') << 204 | "F32 = 0x" << std::setw(8) << exp2_minus_20_f32 << ", " << 205 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(exp2_minus_20_value) << ", " << 206 | "F16 = 0x" << std::setw(4) << exp2_minus_20_f16; 207 | 208 | float exp2_minus_21_value; 209 | memcpy(&exp2_minus_21_value, &exp2_minus_21_f32, sizeof(exp2_minus_21_value)); 210 | EXPECT_EQ(exp2_minus_21_f16, fp16_ieee_from_fp32_value(exp2_minus_21_value)) << 211 | std::hex << std::uppercase << std::setfill('0') << 212 | "F32 = 0x" << std::setw(8) << exp2_minus_21_f32 << ", " << 213 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(exp2_minus_21_value) << ", " << 214 | "F16 = 0x" << std::setw(4) << exp2_minus_21_f16; 215 | 216 | float exp2_minus_22_value; 217 | memcpy(&exp2_minus_22_value, &exp2_minus_22_f32, 
sizeof(exp2_minus_22_value)); 218 | EXPECT_EQ(exp2_minus_22_f16, fp16_ieee_from_fp32_value(exp2_minus_22_value)) << 219 | std::hex << std::uppercase << std::setfill('0') << 220 | "F32 = 0x" << std::setw(8) << exp2_minus_22_f32 << ", " << 221 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(exp2_minus_22_value) << ", " << 222 | "F16 = 0x" << std::setw(4) << exp2_minus_22_f16; 223 | 224 | float exp2_minus_23_value; 225 | memcpy(&exp2_minus_23_value, &exp2_minus_23_f32, sizeof(exp2_minus_23_value)); 226 | EXPECT_EQ(exp2_minus_23_f16, fp16_ieee_from_fp32_value(exp2_minus_23_value)) << 227 | std::hex << std::uppercase << std::setfill('0') << 228 | "F32 = 0x" << std::setw(8) << exp2_minus_23_f32 << ", " << 229 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(exp2_minus_23_value) << ", " << 230 | "F16 = 0x" << std::setw(4) << exp2_minus_23_f16; 231 | 232 | float exp2_minus_24_value; 233 | memcpy(&exp2_minus_24_value, &exp2_minus_24_f32, sizeof(exp2_minus_24_value)); 234 | EXPECT_EQ(exp2_minus_24_f16, fp16_ieee_from_fp32_value(exp2_minus_24_value)) << 235 | std::hex << std::uppercase << std::setfill('0') << 236 | "F32 = 0x" << std::setw(8) << exp2_minus_24_f32 << ", " << 237 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(exp2_minus_24_value) << ", " << 238 | "F16 = 0x" << std::setw(4) << exp2_minus_24_f16; 239 | 240 | float exp2_minus_25_value; 241 | memcpy(&exp2_minus_25_value, &exp2_minus_25_f32, sizeof(exp2_minus_25_value)); 242 | EXPECT_EQ(exp2_minus_25_f16, fp16_ieee_from_fp32_value(exp2_minus_25_value)) << 243 | std::hex << std::uppercase << std::setfill('0') << 244 | "F32 = 0x" << std::setw(8) << exp2_minus_25_f32 << ", " << 245 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(exp2_minus_25_value) << ", " << 246 | "F16 = 0x" << std::setw(4) << exp2_minus_25_f16; 247 | } 248 | 249 | TEST(FP16_IEEE_FROM_FP32_VALUE, zero) { 250 | const uint16_t positive_zero_f16 = UINT16_C(0x0000); 251 | const uint16_t 
negative_zero_f16 = UINT16_C(0x8000); 252 | 253 | const uint32_t positive_zero_f32 = UINT32_C(0x00000000); 254 | const uint32_t negative_zero_f32 = UINT32_C(0x80000000); 255 | 256 | float positive_zero_value; 257 | memcpy(&positive_zero_value, &positive_zero_f32, sizeof(positive_zero_value)); 258 | EXPECT_EQ(positive_zero_f16, fp16_ieee_from_fp32_value(positive_zero_value)) << 259 | std::hex << std::uppercase << std::setfill('0') << 260 | "F32 = 0x" << std::setw(8) << positive_zero_f32 << ", " << 261 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(positive_zero_value) << ", " << 262 | "F16 = 0x" << std::setw(4) << positive_zero_f16; 263 | 264 | float negative_zero_value; 265 | memcpy(&negative_zero_value, &negative_zero_f32, sizeof(negative_zero_value)); 266 | EXPECT_EQ(negative_zero_f16, fp16_ieee_from_fp32_value(negative_zero_value)) << 267 | std::hex << std::uppercase << std::setfill('0') << 268 | "F32 = 0x" << std::setw(8) << negative_zero_f32 << ", " << 269 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(negative_zero_value) << ", " << 270 | "F16 = 0x" << std::setw(4) << negative_zero_f16; 271 | } 272 | 273 | TEST(FP16_IEEE_FROM_FP32_VALUE, infinity) { 274 | const uint16_t positive_infinity_f16 = UINT16_C(0x7C00); 275 | const uint16_t negative_infinity_f16 = UINT16_C(0xFC00); 276 | 277 | const uint32_t positive_infinity_f32 = UINT32_C(0x7F800000); 278 | const uint32_t negative_infinity_f32 = UINT32_C(0xFF800000); 279 | 280 | float positive_infinity_value; 281 | memcpy(&positive_infinity_value, &positive_infinity_f32, sizeof(positive_infinity_value)); 282 | EXPECT_EQ(positive_infinity_f16, fp16_ieee_from_fp32_value(positive_infinity_value)) << 283 | std::hex << std::uppercase << std::setfill('0') << 284 | "F32 = 0x" << std::setw(8) << positive_infinity_f32 << ", " << 285 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(positive_infinity_value) << ", " << 286 | "F16 = 0x" << std::setw(4) << positive_infinity_f16; 287 | 
288 | float negative_infinity_value; 289 | memcpy(&negative_infinity_value, &negative_infinity_f32, sizeof(negative_infinity_value)); 290 | EXPECT_EQ(negative_infinity_f16, fp16_ieee_from_fp32_value(negative_infinity_value)) << 291 | std::hex << std::uppercase << std::setfill('0') << 292 | "F32 = 0x" << std::setw(8) << negative_infinity_f32 << ", " << 293 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(negative_infinity_value) << ", " << 294 | "F16 = 0x" << std::setw(4) << negative_infinity_f16; 295 | } 296 | 297 | TEST(FP16_IEEE_FROM_FP32_VALUE, positive_nan) { 298 | for (uint32_t nan_f32 = UINT32_C(0x7FFFFFFF); nan_f32 > UINT32_C(0x7F800000); nan_f32--) { 299 | float nan_value; 300 | memcpy(&nan_value, &nan_f32, sizeof(nan_value)); 301 | const uint16_t nan_f16 = fp16_ieee_from_fp32_value(nan_value); 302 | 303 | /* Check sign */ 304 | ASSERT_EQ(nan_f16 & UINT16_C(0x8000), 0) << 305 | std::hex << std::uppercase << std::setfill('0') << 306 | "F32 = 0x" << std::setw(8) << nan_f32 << ", " << 307 | "F16(F32) = 0x" << std::setw(4) << nan_f16; 308 | 309 | /* Check exponent */ 310 | ASSERT_EQ(nan_f16 & UINT16_C(0x7C00), UINT16_C(0x7C00)) << 311 | std::hex << std::uppercase << std::setfill('0') << 312 | "F32 = 0x" << std::setw(8) << nan_f32 << ", " << 313 | "F16(F32) = 0x" << std::setw(4) << nan_f16; 314 | 315 | /* Check mantissa */ 316 | ASSERT_NE(nan_f16 & UINT16_C(0x03FF), 0) << 317 | std::hex << std::uppercase << std::setfill('0') << 318 | "F32 = 0x" << std::setw(8) << nan_f32 << ", " << 319 | "F16(F32) = 0x" << std::setw(4) << nan_f16; 320 | } 321 | } 322 | 323 | TEST(FP16_IEEE_FROM_FP32_VALUE, negative_nan) { 324 | for (uint32_t nan_f32 = UINT32_C(0xFFFFFFFF); nan_f32 > UINT32_C(0xFF800000); nan_f32--) { 325 | float nan_value; 326 | memcpy(&nan_value, &nan_f32, sizeof(nan_value)); 327 | const uint16_t nan_f16 = fp16_ieee_from_fp32_value(nan_value); 328 | 329 | /* Check sign */ 330 | ASSERT_EQ(nan_f16 & UINT16_C(0x8000), UINT16_C(0x8000)) << 331 | 
std::hex << std::uppercase << std::setfill('0') << 332 | "F32 = 0x" << std::setw(8) << nan_f32 << ", " << 333 | "F16(F32) = 0x" << std::setw(4) << nan_f16; 334 | 335 | /* Check exponent */ 336 | ASSERT_EQ(nan_f16 & UINT16_C(0x7C00), UINT16_C(0x7C00)) << 337 | std::hex << std::uppercase << std::setfill('0') << 338 | "F32 = 0x" << std::setw(8) << nan_f32 << ", " << 339 | "F16(F32) = 0x" << std::setw(4) << nan_f16; 340 | 341 | /* Check mantissa */ 342 | ASSERT_NE(nan_f16 & UINT16_C(0x03FF), 0) << 343 | std::hex << std::uppercase << std::setfill('0') << 344 | "F32 = 0x" << std::setw(8) << nan_f32 << ", " << 345 | "F16(F32) = 0x" << std::setw(4) << nan_f16; 346 | } 347 | } 348 | 349 | TEST(FP16_IEEE_FROM_FP32_VALUE, revertible) { 350 | /* Positive values */ 351 | for (uint16_t f16 = UINT16_C(0x0000); f16 < UINT16_C(0x7C00); f16++) { 352 | const float value_f32 = fp16_ieee_to_fp32_value(f16); 353 | uint32_t bits_f32; 354 | memcpy(&bits_f32, &value_f32, sizeof(bits_f32)); 355 | 356 | ASSERT_EQ(f16, fp16_ieee_from_fp32_value(value_f32)) << 357 | std::hex << std::uppercase << std::setfill('0') << 358 | "F16 = 0x" << std::setw(4) << f16 << ", " << 359 | "F32(F16) = 0x" << std::setw(8) << bits_f32 << ", " << 360 | "F16(F32(F16)) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(value_f32); 361 | } 362 | 363 | /* Negative values */ 364 | for (uint16_t f16 = UINT16_C(0x8000); f16 < UINT16_C(0xFC00); f16++) { 365 | const float value_f32 = fp16_ieee_to_fp32_value(f16); 366 | uint32_t bits_f32; 367 | memcpy(&bits_f32, &value_f32, sizeof(bits_f32)); 368 | 369 | ASSERT_EQ(f16, fp16_ieee_from_fp32_value(value_f32)) << 370 | std::hex << std::uppercase << std::setfill('0') << 371 | "F16 = 0x" << std::setw(4) << f16 << ", " << 372 | "F32(F16) = 0x" << std::setw(8) << bits_f32 << ", " << 373 | "F16(F32(F16)) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(value_f32); 374 | } 375 | } 376 | 377 | TEST(FP16_IEEE_FROM_FP32_VALUE, underflow) { 378 | const uint32_t min_nonzero_f32 = 
UINT32_C(0x33000001); 379 | const uint16_t zero_f16 = UINT16_C(0x0000); 380 | const uint16_t min_f16 = UINT16_C(0x0001); 381 | for (uint32_t bits = UINT32_C(0x00000001); bits < min_nonzero_f32; bits++) { 382 | float value; 383 | memcpy(&value, &bits, sizeof(value)); 384 | ASSERT_EQ(zero_f16, fp16_ieee_from_fp32_value(value)) << 385 | std::hex << std::uppercase << std::setfill('0') << 386 | "F32 = 0x" << std::setw(8) << bits << ", " << 387 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(value) << ", " << 388 | "F16 = 0x" << std::setw(4) << zero_f16; 389 | } 390 | float min_nonzero_value; 391 | memcpy(&min_nonzero_value, &min_nonzero_f32, sizeof(min_nonzero_value)); 392 | ASSERT_EQ(min_f16, fp16_ieee_from_fp32_value(min_nonzero_value)) << 393 | std::hex << std::uppercase << std::setfill('0') << 394 | "F32 = 0x" << std::setw(8) << min_nonzero_f32 << ", " << 395 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(min_nonzero_value) << ", " << 396 | "F16 = 0x" << std::setw(4) << min_f16; 397 | } 398 | 399 | TEST(FP16_IEEE_FROM_FP32_VALUE, overflow) { 400 | const uint32_t max_finite_f32 = UINT32_C(0x477FEFFF); 401 | const uint16_t max_finite_f16 = UINT16_C(0x7BFF); 402 | const uint32_t positive_infinity_f32 = UINT32_C(0x7F800000); 403 | const uint16_t positive_infinity_f16 = UINT16_C(0x7C00); 404 | for (uint32_t bits = positive_infinity_f32; bits > max_finite_f32; bits--) { 405 | float value; 406 | memcpy(&value, &bits, sizeof(value)); 407 | ASSERT_EQ(positive_infinity_f16, fp16_ieee_from_fp32_value(value)) << 408 | std::hex << std::uppercase << std::setfill('0') << 409 | "F32 = 0x" << std::setw(8) << bits << ", " << 410 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(value) << ", " << 411 | "F16 = 0x" << std::setw(4) << positive_infinity_f16; 412 | } 413 | float max_finite_value; 414 | memcpy(&max_finite_value, &max_finite_f32, sizeof(max_finite_value)); 415 | ASSERT_EQ(max_finite_f16, fp16_ieee_from_fp32_value(max_finite_value)) 
<< 416 | std::hex << std::uppercase << std::setfill('0') << 417 | "F32 = 0x" << std::setw(8) << max_finite_f32 << ", " << 418 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(max_finite_value) << ", " << 419 | "F16 = 0x" << std::setw(4) << max_finite_f16; 420 | } 421 | 422 | TEST(FP16_IEEE_FROM_FP32_VALUE, positive_denormalized_values) { 423 | const uint32_t min_nonzero_f32 = UINT32_C(0x33000001); 424 | 425 | uint32_t f32_begin = min_nonzero_f32; 426 | for (uint16_t f16 = 0; f16 < UINT16_C(0x0400); f16++) { 427 | const uint32_t f32_end = fp16::denormalizedRanges[f16]; 428 | for (uint32_t f32 = f32_begin; f32 < f32_end; f32++) { 429 | float value; 430 | memcpy(&value, &f32, sizeof(value)); 431 | ASSERT_EQ(f16, fp16_ieee_from_fp32_value(value)) << 432 | std::hex << std::uppercase << std::setfill('0') << 433 | "F32 = 0x" << std::setw(8) << f32 << ", " << 434 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(value) << ", " << 435 | "F16 = 0x" << std::setw(4) << f16; 436 | } 437 | f32_begin = f32_end; 438 | } 439 | } 440 | 441 | TEST(FP16_IEEE_FROM_FP32_VALUE, negative_denormalized_values) { 442 | const uint32_t min_nonzero_f32 = UINT32_C(0x33000001); 443 | 444 | uint32_t f32_begin = min_nonzero_f32 | UINT32_C(0x80000000); 445 | for (uint16_t f16 = UINT16_C(0x8000); f16 < UINT16_C(0x8400); f16++) { 446 | const uint32_t f32_end = fp16::denormalizedRanges[f16 & UINT16_C(0x7FFF)] | UINT32_C(0x80000000); 447 | for (uint32_t f32 = f32_begin; f32 < f32_end; f32++) { 448 | float value; 449 | memcpy(&value, &f32, sizeof(value)); 450 | ASSERT_EQ(f16, fp16_ieee_from_fp32_value(value)) << 451 | std::hex << std::uppercase << std::setfill('0') << 452 | "F32 = 0x" << std::setw(8) << f32 << ", " << 453 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(value) << ", " << 454 | "F16 = 0x" << std::setw(4) << f16; 455 | } 456 | f32_begin = f32_end; 457 | } 458 | } 459 | 460 | TEST(FP16_IEEE_FROM_FP32_VALUE, positive_normalized_values) { 461 | /* 
Minimum number that rounds to 1.0h when converted to half-precision */ 462 | const uint32_t min_one_f32 = UINT32_C(0x3F7FF000); 463 | const uint32_t e_bias = 15; 464 | 465 | for (int32_t e = -14; e <= 15; e++) { 466 | uint32_t f32_begin = min_one_f32 + (uint32_t(e) << 23); 467 | for (uint16_t f16 = uint16_t(e + e_bias) << 10; f16 < uint16_t(e + e_bias + 1) << 10; f16++) { 468 | const uint32_t f32_end = fp16::normalizedRanges[f16 & UINT16_C(0x3FF)] + (uint32_t(e) << 23); 469 | for (uint32_t f32 = f32_begin; f32 < f32_end; f32++) { 470 | float value; 471 | memcpy(&value, &f32, sizeof(value)); 472 | ASSERT_EQ(f16, fp16_ieee_from_fp32_value(value)) << 473 | std::hex << std::uppercase << std::setfill('0') << 474 | "F32 = 0x" << std::setw(8) << f32 << ", " << 475 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(value) << ", " << 476 | "F16 = 0x" << std::setw(4) << f16; 477 | } 478 | f32_begin = f32_end; 479 | } 480 | } 481 | } 482 | 483 | TEST(FP16_IEEE_FROM_FP32_VALUE, negative_normalized_values) { 484 | /* Minimum number that rounds to 1.0h when converted to half-precision */ 485 | const uint32_t min_one_f32 = UINT32_C(0x3F7FF000); 486 | const uint32_t e_bias = 15; 487 | 488 | for (int32_t e = -14; e <= 15; e++) { 489 | uint32_t f32_begin = (min_one_f32 | UINT32_C(0x80000000)) + (uint32_t(e) << 23); 490 | for (uint16_t f16 = (UINT16_C(0x8000) | (uint16_t(e + e_bias) << 10)); f16 < (UINT16_C(0x8000) | (uint16_t(e + e_bias + 1) << 10)); f16++) { 491 | const uint32_t f32_end = (fp16::normalizedRanges[f16 & UINT16_C(0x3FF)] | UINT32_C(0x80000000)) + (uint32_t(e) << 23); 492 | for (uint32_t f32 = f32_begin; f32 < f32_end; f32++) { 493 | float value; 494 | memcpy(&value, &f32, sizeof(value)); 495 | ASSERT_EQ(f16, fp16_ieee_from_fp32_value(value)) << 496 | std::hex << std::uppercase << std::setfill('0') << 497 | "F32 = 0x" << std::setw(8) << f32 << ", " << 498 | "F16(F32) = 0x" << std::setw(4) << fp16_ieee_from_fp32_value(value) << ", " << 499 | "F16 = 0x" << 
std::setw(4) << f16; 500 | } 501 | f32_begin = f32_end; 502 | } 503 | } 504 | } 505 | -------------------------------------------------------------------------------- /include/fp16/fp16.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #ifndef FP16_FP16_H 3 | #define FP16_FP16_H 4 | 5 | #if defined(__cplusplus) && (__cplusplus >= 201103L) 6 | #include 7 | #include 8 | #elif !defined(__OPENCL_VERSION__) 9 | #include 10 | #include 11 | #endif 12 | 13 | #include 14 | #include 15 | 16 | #if defined(_MSC_VER) 17 | #include 18 | #endif 19 | #if defined(__F16C__) && FP16_USE_NATIVE_CONVERSION && !FP16_USE_FLOAT16_TYPE && !FP16_USE_FP16_TYPE 20 | #include 21 | #endif 22 | #if (defined(__aarch64__) || defined(_M_ARM64)) && FP16_USE_NATIVE_CONVERSION && !FP16_USE_FLOAT16_TYPE && !FP16_USE_FP16_TYPE 23 | #include 24 | #endif 25 | 26 | 27 | /* 28 | * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to 29 | * a 32-bit floating-point number in IEEE single-precision format, in bit representation. 30 | * 31 | * @note The implementation doesn't use any floating-point operations. 32 | */ 33 | static inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) { 34 | /* 35 | * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: 36 | * +---+-----+------------+-------------------+ 37 | * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| 38 | * +---+-----+------------+-------------------+ 39 | * Bits 31 26-30 16-25 0-15 40 | * 41 | * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. 
42 | */ 43 | const uint32_t w = (uint32_t) h << 16; 44 | /* 45 | * Extract the sign of the input number into the high bit of the 32-bit word: 46 | * 47 | * +---+----------------------------------+ 48 | * | S |0000000 00000000 00000000 00000000| 49 | * +---+----------------------------------+ 50 | * Bits 31 0-31 51 | */ 52 | const uint32_t sign = w & UINT32_C(0x80000000); 53 | /* 54 | * Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word: 55 | * 56 | * +---+-----+------------+-------------------+ 57 | * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| 58 | * +---+-----+------------+-------------------+ 59 | * Bits 30 27-31 17-26 0-16 60 | */ 61 | const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); 62 | /* 63 | * Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized. 64 | * If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one. 65 | * In this case renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note that if we shift 66 | * denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the 67 | * biased exponent into 1, and making mantissa normalized (i.e. without leading 1). 68 | */ 69 | #ifdef _MSC_VER 70 | unsigned long nonsign_bsr; 71 | _BitScanReverse(&nonsign_bsr, (unsigned long) nonsign); 72 | uint32_t renorm_shift = (uint32_t) nonsign_bsr ^ 31; 73 | #else 74 | uint32_t renorm_shift = __builtin_clz(nonsign); 75 | #endif 76 | renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; 77 | /* 78 | * Iff half-precision number has exponent of 15, the addition overflows it into bit 31, 79 | * and the subsequent shift turns the high 9 bits into 1. Thus 80 | * inf_nan_mask == 81 | * 0x7F800000 if the half-precision number had exponent of 15 (i.e. 
was NaN or infinity) 82 | * 0x00000000 otherwise 83 | */ 84 | const int32_t inf_nan_mask = ((int32_t) (nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); 85 | /* 86 | * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 into 1. Otherwise, bit 31 remains 0. 87 | * The signed shift right by 31 broadcasts bit 31 into all bits of the zero_mask. Thus 88 | * zero_mask == 89 | * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) 90 | * 0x00000000 otherwise 91 | */ 92 | const int32_t zero_mask = (int32_t) (nonsign - 1) >> 31; 93 | /* 94 | * 1. Shift nonsign left by renorm_shift to normalize it (if the input was denormal) 95 | * 2. Shift nonsign right by 3 so the exponent (5 bits originally) becomes an 8-bit field and 10-bit mantissa 96 | * shifts into the 10 high bits of the 23-bit mantissa of IEEE single-precision number. 97 | * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the different in exponent bias 98 | * (0x7F for single-precision number less 0xF for half-precision number). 99 | * 4. Subtract renorm_shift from the exponent (starting at bit 23) to account for renormalization. As renorm_shift 100 | * is less than 0x70, this can be combined with step 3. 101 | * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the input was NaN or infinity. 102 | * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent into zero if the input was zero. 103 | * 7. Combine with the sign of the input number. 104 | */ 105 | return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask); 106 | } 107 | 108 | /* 109 | * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to 110 | * a 32-bit floating-point number in IEEE single-precision format. 
111 | * 112 | * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) 113 | * floating-point operations and bitcasts between integer and floating-point variables. 114 | */ 115 | static inline float fp16_ieee_to_fp32_value(uint16_t h) { 116 | #if FP16_USE_NATIVE_CONVERSION 117 | #if FP16_USE_FLOAT16_TYPE 118 | union { 119 | uint16_t as_bits; 120 | _Float16 as_value; 121 | } fp16 = { h }; 122 | return (float) fp16.as_value; 123 | #elif FP16_USE_FP16_TYPE 124 | union { 125 | uint16_t as_bits; 126 | __fp16 as_value; 127 | } fp16 = { h }; 128 | return (float) fp16.as_value; 129 | #else 130 | #if (defined(__INTEL_COMPILER) || defined(__GNUC__)) && defined(__F16C__) 131 | return _cvtsh_ss((unsigned short) h); 132 | #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__) 133 | return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128((int) (unsigned int) h))); 134 | #elif defined(_M_ARM64) || defined(__aarch64__) 135 | return vgetq_lane_f32(vcvt_f32_f16(vreinterpret_f16_u16(vdup_n_u16(h))), 0); 136 | #else 137 | #error "Archtecture- or compiler-specific implementation required" 138 | #endif 139 | #endif 140 | #else 141 | /* 142 | * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: 143 | * +---+-----+------------+-------------------+ 144 | * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| 145 | * +---+-----+------------+-------------------+ 146 | * Bits 31 26-30 16-25 0-15 147 | * 148 | * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. 
149 | */ 150 | const uint32_t w = (uint32_t) h << 16; 151 | /* 152 | * Extract the sign of the input number into the high bit of the 32-bit word: 153 | * 154 | * +---+----------------------------------+ 155 | * | S |0000000 00000000 00000000 00000000| 156 | * +---+----------------------------------+ 157 | * Bits 31 0-31 158 | */ 159 | const uint32_t sign = w & UINT32_C(0x80000000); 160 | /* 161 | * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word: 162 | * 163 | * +-----+------------+---------------------+ 164 | * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000| 165 | * +-----+------------+---------------------+ 166 | * Bits 27-31 17-26 0-16 167 | */ 168 | const uint32_t two_w = w + w; 169 | 170 | /* 171 | * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent 172 | * of a single-precision floating-point number: 173 | * 174 | * S|Exponent | Mantissa 175 | * +-+---+-----+------------+----------------+ 176 | * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000| 177 | * +-+---+-----+------------+----------------+ 178 | * Bits | 23-31 | 0-22 179 | * 180 | * Next, there are some adjustments to the exponent: 181 | * - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision 182 | * formats (0x7F - 0xF = 0x70) 183 | * - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number. 184 | * Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent 185 | * of the single-precision output must be 0xFF (max possible value). We do this correction in two steps: 186 | * - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested 187 | * by the difference in the exponent bias (see above). 
188 | * - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of 189 | * exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias. 190 | * The floating-point multiplication hardware would ensure than Inf and NaN would retain their value on at least 191 | * partially IEEE754-compliant implementations. 192 | * 193 | * Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not 194 | * operate on denormal inputs, and do not produce denormal results. 195 | */ 196 | const uint32_t exp_offset = UINT32_C(0xE0) << 23; 197 | #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) 198 | const float exp_scale = 0x1.0p-112f; 199 | #else 200 | const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); 201 | #endif 202 | const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; 203 | 204 | /* 205 | * Convert denormalized half-precision inputs into single-precision results (always normalized). 206 | * Zero inputs are also handled here. 207 | * 208 | * In a denormalized number the biased exponent is zero, and mantissa has on-zero bits. 209 | * First, we shift mantissa into bits 0-9 of the 32-bit word. 210 | * 211 | * zeros | mantissa 212 | * +---------------------------+------------+ 213 | * |0000 0000 0000 0000 0000 00|MM MMMM MMMM| 214 | * +---------------------------+------------+ 215 | * Bits 10-31 0-9 216 | * 217 | * Now, remember that denormalized half-precision numbers are represented as: 218 | * FP16 = mantissa * 2**(-24). 219 | * The trick is to construct a normalized single-precision number with the same mantissa and thehalf-precision input 220 | * and with an exponent which would scale the corresponding mantissa bits to 2**(-24). 
221 | * A normalized single-precision floating-point number is represented as: 222 | * FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127) 223 | * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision 224 | * number causes a change of the constructud single-precision number by 2**(-24), i.e. the same ammount. 225 | * 226 | * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number 227 | * is zero, the constructed single-precision number has the value of 228 | * FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5 229 | * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of 230 | * the input half-precision number. 231 | */ 232 | const uint32_t magic_mask = UINT32_C(126) << 23; 233 | const float magic_bias = 0.5f; 234 | const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; 235 | 236 | /* 237 | * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the 238 | * input exponent. The variable two_w contains input exponent in bits 27-31, therefore if its smaller than 2**27, the 239 | * input is either a denormal number, or zero. 240 | * - Combine the result of conversion of exponent and mantissa with the sign of the input number. 241 | */ 242 | const uint32_t denormalized_cutoff = UINT32_C(1) << 27; 243 | const uint32_t result = sign | 244 | (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); 245 | return fp32_from_bits(result); 246 | #endif 247 | } 248 | 249 | /* 250 | * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in 251 | * IEEE half-precision format, in bit representation. 
252 | * 253 | * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) 254 | * floating-point operations and bitcasts between integer and floating-point variables. 255 | */ 256 | static inline uint16_t fp16_ieee_from_fp32_value(float f) { 257 | #if FP16_USE_NATIVE_CONVERSION 258 | #if FP16_USE_FLOAT16_TYPE 259 | union { 260 | _Float16 as_value; 261 | uint16_t as_bits; 262 | } fp16 = { (_Float16) f }; 263 | return fp16.as_bits; 264 | #elif FP16_USE_FP16_TYPE 265 | union { 266 | __fp16 as_value; 267 | uint16_t as_bits; 268 | } fp16 = { (__fp16) f }; 269 | return fp16.as_bits; 270 | #else 271 | #if (defined(__INTEL_COMPILER) || defined(__GNUC__)) && defined(__F16C__) 272 | return _cvtss_sh(f, _MM_FROUND_CUR_DIRECTION); 273 | #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__) 274 | return (uint16_t) _mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(f), _MM_FROUND_CUR_DIRECTION)); 275 | #elif defined(_M_ARM64) || defined(__aarch64__) 276 | return vget_lane_u16(vcvt_f16_f32(vdupq_n_f32(f)), 0); 277 | #else 278 | #error "Archtecture- or compiler-specific implementation required" 279 | #endif 280 | #endif 281 | #else 282 | #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) 283 | const float scale_to_inf = 0x1.0p+112f; 284 | const float scale_to_zero = 0x1.0p-110f; 285 | #else 286 | const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); 287 | const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); 288 | #endif 289 | #if defined(_MSC_VER) && defined(_M_IX86_FP) && (_M_IX86_FP == 0) || defined(__GNUC__) && defined(__FLT_EVAL_METHOD__) && (__FLT_EVAL_METHOD__ != 0) 290 | const volatile float saturated_f = fabsf(f) * scale_to_inf; 291 | #else 292 | const float saturated_f = fabsf(f) * scale_to_inf; 293 | #endif 294 | float base = saturated_f * scale_to_zero; 295 | 296 | const uint32_t w = fp32_to_bits(f); 297 | 
const uint32_t shl1_w = w + w; 298 | const uint32_t sign = w & UINT32_C(0x80000000); 299 | uint32_t bias = shl1_w & UINT32_C(0xFF000000); 300 | if (bias < UINT32_C(0x71000000)) { 301 | bias = UINT32_C(0x71000000); 302 | } 303 | 304 | base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; 305 | const uint32_t bits = fp32_to_bits(base); 306 | const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); 307 | const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); 308 | const uint32_t nonsign = exp_bits + mantissa_bits; 309 | return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); 310 | #endif 311 | } 312 | 313 | /* 314 | * Convert a 16-bit floating-point number in ARM alternative half-precision format, in bit representation, to 315 | * a 32-bit floating-point number in IEEE single-precision format, in bit representation. 316 | * 317 | * @note The implementation doesn't use any floating-point operations. 318 | */ 319 | static inline uint32_t fp16_alt_to_fp32_bits(uint16_t h) { 320 | /* 321 | * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: 322 | * +---+-----+------------+-------------------+ 323 | * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| 324 | * +---+-----+------------+-------------------+ 325 | * Bits 31 26-30 16-25 0-15 326 | * 327 | * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. 
328 | */ 329 | const uint32_t w = (uint32_t) h << 16; 330 | /* 331 | * Extract the sign of the input number into the high bit of the 32-bit word: 332 | * 333 | * +---+----------------------------------+ 334 | * | S |0000000 00000000 00000000 00000000| 335 | * +---+----------------------------------+ 336 | * Bits 31 0-31 337 | */ 338 | const uint32_t sign = w & UINT32_C(0x80000000); 339 | /* 340 | * Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word: 341 | * 342 | * +---+-----+------------+-------------------+ 343 | * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| 344 | * +---+-----+------------+-------------------+ 345 | * Bits 30 27-31 17-26 0-16 346 | */ 347 | const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); 348 | /* 349 | * Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized. 350 | * If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one. 351 | * In this case renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note that if we shift 352 | * denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the 353 | * biased exponent into 1, and making mantissa normalized (i.e. without leading 1). 354 | */ 355 | #ifdef _MSC_VER 356 | unsigned long nonsign_bsr; 357 | _BitScanReverse(&nonsign_bsr, (unsigned long) nonsign); 358 | uint32_t renorm_shift = (uint32_t) nonsign_bsr ^ 31; 359 | #else 360 | uint32_t renorm_shift = __builtin_clz(nonsign); 361 | #endif 362 | renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; 363 | /* 364 | * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 into 1. Otherwise, bit 31 remains 0. 365 | * The signed shift right by 31 broadcasts bit 31 into all bits of the zero_mask. 
Thus 366 | * zero_mask == 367 | * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) 368 | * 0x00000000 otherwise 369 | */ 370 | const int32_t zero_mask = (int32_t) (nonsign - 1) >> 31; 371 | /* 372 | * 1. Shift nonsign left by renorm_shift to normalize it (if the input was denormal) 373 | * 2. Shift nonsign right by 3 so the exponent (5 bits originally) becomes an 8-bit field and 10-bit mantissa 374 | * shifts into the 10 high bits of the 23-bit mantissa of IEEE single-precision number. 375 | * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the different in exponent bias 376 | * (0x7F for single-precision number less 0xF for half-precision number). 377 | * 4. Subtract renorm_shift from the exponent (starting at bit 23) to account for renormalization. As renorm_shift 378 | * is less than 0x70, this can be combined with step 3. 379 | * 5. Binary ANDNOT with zero_mask to turn the mantissa and exponent into zero if the input was zero. 380 | * 6. Combine with the sign of the input number. 381 | */ 382 | return sign | (((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) & ~zero_mask); 383 | } 384 | 385 | /* 386 | * Convert a 16-bit floating-point number in ARM alternative half-precision format, in bit representation, to 387 | * a 32-bit floating-point number in IEEE single-precision format. 388 | * 389 | * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) 390 | * floating-point operations and bitcasts between integer and floating-point variables. 
391 | */ 392 | static inline float fp16_alt_to_fp32_value(uint16_t h) { 393 | /* 394 | * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: 395 | * +---+-----+------------+-------------------+ 396 | * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| 397 | * +---+-----+------------+-------------------+ 398 | * Bits 31 26-30 16-25 0-15 399 | * 400 | * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. 401 | */ 402 | const uint32_t w = (uint32_t) h << 16; 403 | /* 404 | * Extract the sign of the input number into the high bit of the 32-bit word: 405 | * 406 | * +---+----------------------------------+ 407 | * | S |0000000 00000000 00000000 00000000| 408 | * +---+----------------------------------+ 409 | * Bits 31 0-31 410 | */ 411 | const uint32_t sign = w & UINT32_C(0x80000000); 412 | /* 413 | * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word: 414 | * 415 | * +-----+------------+---------------------+ 416 | * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000| 417 | * +-----+------------+---------------------+ 418 | * Bits 27-31 17-26 0-16 419 | */ 420 | const uint32_t two_w = w + w; 421 | 422 | /* 423 | * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent 424 | * of a single-precision floating-point number: 425 | * 426 | * S|Exponent | Mantissa 427 | * +-+---+-----+------------+----------------+ 428 | * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000| 429 | * +-+---+-----+------------+----------------+ 430 | * Bits | 23-31 | 0-22 431 | * 432 | * Next, the exponent is adjusted for the difference in exponent bias between single-precision and half-precision 433 | * formats (0x7F - 0xF = 0x70). 
This operation never overflows or generates non-finite values, as the largest 434 | * half-precision exponent is 0x1F and after the adjustment is can not exceed 0x8F < 0xFE (largest single-precision 435 | * exponent for non-finite values). 436 | * 437 | * Note that this operation does not handle denormal inputs (where biased exponent == 0). However, they also do not 438 | * operate on denormal inputs, and do not produce denormal results. 439 | */ 440 | const uint32_t exp_offset = UINT32_C(0x70) << 23; 441 | const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset); 442 | 443 | /* 444 | * Convert denormalized half-precision inputs into single-precision results (always normalized). 445 | * Zero inputs are also handled here. 446 | * 447 | * In a denormalized number the biased exponent is zero, and mantissa has on-zero bits. 448 | * First, we shift mantissa into bits 0-9 of the 32-bit word. 449 | * 450 | * zeros | mantissa 451 | * +---------------------------+------------+ 452 | * |0000 0000 0000 0000 0000 00|MM MMMM MMMM| 453 | * +---------------------------+------------+ 454 | * Bits 10-31 0-9 455 | * 456 | * Now, remember that denormalized half-precision numbers are represented as: 457 | * FP16 = mantissa * 2**(-24). 458 | * The trick is to construct a normalized single-precision number with the same mantissa and thehalf-precision input 459 | * and with an exponent which would scale the corresponding mantissa bits to 2**(-24). 460 | * A normalized single-precision floating-point number is represented as: 461 | * FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127) 462 | * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision 463 | * number causes a change of the constructud single-precision number by 2**(-24), i.e. the same ammount. 464 | * 465 | * The last step is to adjust the bias of the constructed single-precision number. 
When the input half-precision number 466 | * is zero, the constructed single-precision number has the value of 467 | * FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5 468 | * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of 469 | * the input half-precision number. 470 | */ 471 | const uint32_t magic_mask = UINT32_C(126) << 23; 472 | const float magic_bias = 0.5f; 473 | const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; 474 | 475 | /* 476 | * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the 477 | * input exponent. The variable two_w contains input exponent in bits 27-31, therefore if its smaller than 2**27, the 478 | * input is either a denormal number, or zero. 479 | * - Combine the result of conversion of exponent and mantissa with the sign of the input number. 480 | */ 481 | const uint32_t denormalized_cutoff = UINT32_C(1) << 27; 482 | const uint32_t result = sign | 483 | (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); 484 | return fp32_from_bits(result); 485 | } 486 | 487 | /* 488 | * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in 489 | * ARM alternative half-precision format, in bit representation. 490 | * 491 | * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) 492 | * floating-point operations and bitcasts between integer and floating-point variables. 493 | */ 494 | static inline uint16_t fp16_alt_from_fp32_value(float f) { 495 | const uint32_t w = fp32_to_bits(f); 496 | const uint32_t sign = w & UINT32_C(0x80000000); 497 | const uint32_t shl1_w = w + w; 498 | 499 | const uint32_t shl1_max_fp16_fp32 = UINT32_C(0x8FFFC000); 500 | const uint32_t shl1_base = shl1_w > shl1_max_fp16_fp32 ? 
shl1_max_fp16_fp32 : shl1_w; 501 | uint32_t shl1_bias = shl1_base & UINT32_C(0xFF000000); 502 | const uint32_t exp_difference = 23 - 10; 503 | const uint32_t shl1_bias_min = (127 - 1 - exp_difference) << 24; 504 | if (shl1_bias < shl1_bias_min) { 505 | shl1_bias = shl1_bias_min; 506 | } 507 | 508 | const float bias = fp32_from_bits((shl1_bias >> 1) + ((exp_difference + 2) << 23)); 509 | const float base = fp32_from_bits((shl1_base >> 1) + (2 << 23)) + bias; 510 | 511 | const uint32_t exp_f = fp32_to_bits(base) >> 13; 512 | return (sign >> 16) | ((exp_f & UINT32_C(0x00007C00)) + (fp32_to_bits(base) & UINT32_C(0x00000FFF))); 513 | } 514 | 515 | #endif /* FP16_FP16_H */ 516 | --------------------------------------------------------------------------------