diff --git a/AK/FloatingPoint.h b/AK/FloatingPoint.h new file mode 100644 index 00000000000..f0e881ba203 --- /dev/null +++ b/AK/FloatingPoint.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2022, Jelle Raaijmakers + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include +#include + +namespace AK { + +template +requires(S <= 1 && E >= 1 && M >= 1 && (S + E + M) <= 64) class FloatingPointBits final { +public: + static const size_t signbit = S; + static const size_t exponentbits = E; + static const size_t mantissabits = M; + + template + requires(IsIntegral&& IsUnsigned && sizeof(T) <= 8) constexpr FloatingPointBits(T bits) + : m_bits(bits) + { + } + + constexpr FloatingPointBits(double value) + : m_bits(bit_cast(value)) + { + } + + constexpr FloatingPointBits(float value) + : m_bits(bit_cast(value)) + { + } + + double as_double() const requires(S == 1 && E == 11 && M == 52) { return bit_cast(m_bits); } + float as_float() const requires(S == 1 && E == 8 && M == 23) { return bit_cast(static_cast(m_bits)); } + u64 bits() const { return m_bits; } + +private: + u64 m_bits; +}; + +typedef FloatingPointBits<1, 8, 23> SingleFloatingPointBits; +typedef FloatingPointBits<1, 11, 52> DoubleFloatingPointBits; + +/** + * Convert between two IEEE 754 floating point types in any arrangement of sign, exponent and mantissa bits. + */ +template +constexpr To float_to_float(From const input) +{ + constexpr u64 from_exponent_nonnumber = (1ull << From::exponentbits) - 1; + constexpr u64 from_exponent_bias = (1ull << (From::exponentbits - 1)) - 1; + constexpr u64 to_exponent_nonnumber = (1ull << To::exponentbits) - 1; + constexpr u64 to_exponent_bias = (1ull << (To::exponentbits - 1)) - 1; + constexpr u64 to_exponent_max = (1ull << To::exponentbits) - 2; + + // Deconstruct input bits to float components + u64 from_sign = (input.bits() >> (From::exponentbits + From::mantissabits)) & From::signbit; + u64 from_exponent = (input.bits() >> From::mantissabits) & ((1ull << From::exponentbits) - 1); + u64 from_mantissa = input.bits() & ((1ull << From::mantissabits) - 1); + + u64 to_sign = from_sign & To::signbit; + u64 to_exponent; + u64 to_mantissa; + auto target_value = [&to_sign, &to_exponent, &to_mantissa]() { + return To((to_sign << (To::exponentbits + To::mantissabits)) | (to_exponent << To::mantissabits) | to_mantissa); + }; + + auto shift_mantissa = [](u64 mantissa) -> u64 { + if constexpr (From::mantissabits < To::mantissabits) + return mantissa << (To::mantissabits - From::mantissabits); + else + return mantissa >> (From::mantissabits - To::mantissabits); + }; + + // If target is unsigned and source is negative, clamp to 0 or keep NaN + if constexpr (To::signbit == 0) { + if (from_sign == 1) { + if (from_exponent == from_exponent_nonnumber && from_mantissa > 0) { + to_exponent = to_exponent_nonnumber; + to_mantissa = 1; + } else { + to_exponent = 0; + to_mantissa = 0; + } + return target_value(); + } + } + + // If the source floating point is denormalized; + if (from_exponent == 0) { + // If the source mantissa is 0, the value is +/-0 + if (from_mantissa == 0) { + to_exponent = 0; + to_mantissa = 0; + return target_value(); + } + + // If the source has more exponent bits than the target, then the largest possible + // source mantissa still cannot be represented in the target denormalized value. + if constexpr (From::exponentbits > To::exponentbits) { + to_exponent = 0; + to_mantissa = 0; + return target_value(); + } + + // If the source and target have the same number of exponent bits, we only need to + // shift the mantissa. + if constexpr (From::exponentbits == To::exponentbits) { + to_exponent = 0; + to_mantissa = shift_mantissa(from_mantissa); + return target_value(); + } + + // The target has more exponent bits, so our denormalized value can be represented + // as a normalized value in the target floating point. Normalized values have an + // implicit leading 1, so we shift the mantissa left until we find our explicit + // leading 1 which is then dropped. + int adjust_exponent = -1; + to_mantissa = from_mantissa; + do { + ++adjust_exponent; + to_mantissa <<= 1; + } while ((to_mantissa & (1ull << From::mantissabits)) == 0); + to_exponent = to_exponent_bias - from_exponent_bias - adjust_exponent; + + // Drop the most significant bit from the mantissa + to_mantissa &= (1ull << From::mantissabits) - 1; + to_mantissa = shift_mantissa(to_mantissa); + return target_value(); + } + + // If the source is NaN or +/-Inf, keep it that way + if (from_exponent == from_exponent_nonnumber) { + to_exponent = to_exponent_nonnumber; + to_mantissa = (from_mantissa == 0) ? 0 : 1; + return target_value(); + } + + // Determine the target exponent + to_exponent = to_exponent_bias - from_exponent_bias + from_exponent; + + // If the calculated exponent exceeds the target's capacity, clamp both the exponent and the + // mantissa to their maximum values. + if (to_exponent > to_exponent_max) { + to_exponent = to_exponent_max; + to_mantissa = (1ull << To::mantissabits) - 1; + return target_value(); + } + + // If the new exponent is less than 1, we can only represent this value as a denormalized number + if (to_exponent < 1) { + to_exponent = 0; + + // Add a leading 1 and shift the mantissa right + int adjust_exponent = 1 - to_exponent_bias - from_exponent + from_exponent_bias; + to_mantissa = ((1ull << From::mantissabits) | from_mantissa) >> adjust_exponent; + to_mantissa = shift_mantissa(to_mantissa); + return target_value(); + } + + // New exponent fits; shift the mantissa to fit as well + to_mantissa = shift_mantissa(from_mantissa); + return target_value(); +} + +template +constexpr O convert_from_native_double(double input) { return float_to_float(DoubleFloatingPointBits(input)); } + +template +constexpr O convert_from_native_float(float input) { return float_to_float(SingleFloatingPointBits(input)); } + +template +constexpr double convert_to_native_double(I input) { return float_to_float(input).as_double(); } + +template +constexpr float convert_to_native_float(I input) { return float_to_float(input).as_float(); } + +} + +using AK::DoubleFloatingPointBits; +using AK::FloatingPointBits; +using AK::SingleFloatingPointBits; + +using AK::convert_from_native_double; +using AK::convert_from_native_float; +using AK::convert_to_native_double; +using AK::convert_to_native_float; +using AK::float_to_float; diff --git a/Tests/AK/CMakeLists.txt b/Tests/AK/CMakeLists.txt index 38b066fcbe1..ec48588c6fa 100644 --- a/Tests/AK/CMakeLists.txt +++ b/Tests/AK/CMakeLists.txt @@ -26,6 +26,7 @@ set(AK_TEST_SOURCES TestEnumBits.cpp TestFind.cpp TestFixedArray.cpp + TestFloatingPoint.cpp TestFormat.cpp TestGenericLexer.cpp TestHashFunctions.cpp diff --git a/Tests/AK/TestFloatingPoint.cpp b/Tests/AK/TestFloatingPoint.cpp new file mode 100644 index 00000000000..da6a2fe4cce --- /dev/null +++ b/Tests/AK/TestFloatingPoint.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2022, Jelle Raaijmakers + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include + +TEST_CASE(f16_1_5_10_to_native_float) +{ + auto within_approximate = [](u16 lhs, float rhs) -> bool { + auto f32_lhs = convert_to_native_float(FloatingPointBits<1, 5, 10>(lhs)); + return fabsf(f32_lhs - rhs) <= 0.00001f; + }; + + EXPECT(within_approximate(0x0000, 0.f)); + EXPECT(within_approximate(0x03FF, 0.000061f)); + EXPECT(within_approximate(0x3CEF, 1.23339f)); + EXPECT(within_approximate(0xBC00, -1.f)); + EXPECT(within_approximate(0xA266, -0.0125f)); + + float result; + result = convert_to_native_float(FloatingPointBits<1, 5, 10>(0xFC01u)); + EXPECT(isnan(result)); + + result = convert_to_native_float(FloatingPointBits<1, 5, 10>(0x7C00u)); + EXPECT(isinf(result)); +} + +TEST_CASE(float_to_double_roundtrips) +{ + auto roundtrip = [](float floatvalue1) { + auto doublevalue = convert_from_native_float(floatvalue1).as_double(); + auto floatbits = convert_from_native_double(doublevalue); + auto floatvalue2 = convert_to_native_float(floatbits); + + EXPECT_APPROXIMATE(floatvalue1, floatvalue2); + }; + + roundtrip(-1.0f); + roundtrip(-0.1f); + roundtrip(0.0f); + roundtrip(0.000001f); + roundtrip(0.1f); + roundtrip(1.0f); + roundtrip(3.141592f); + roundtrip(16777216.0f); + roundtrip(33554432.0f); + + roundtrip(1 / 0.0f); + roundtrip(1 / -0.0f); + roundtrip(0 / 0.0f); +} + +TEST_CASE(normalize_denormalize) +{ + // Go from denormalized float to normalized double + auto denormalized_float = 6.709679e-39f; + auto denormalized_float_bits = SingleFloatingPointBits(denormalized_float); + auto normalized_double = convert_to_native_double(denormalized_float_bits); + EXPECT_APPROXIMATE(denormalized_float, normalized_double); + + // Go back from normalized double to denormalized float + auto normalized_double_bits = DoubleFloatingPointBits(normalized_double); + auto reconstructed_denormalized_float = convert_to_native_float(normalized_double_bits); + EXPECT_APPROXIMATE(denormalized_float, reconstructed_denormalized_float); +} + +TEST_CASE(large_exponent) +{ + // Make sure we support at least 62 bits of exponent + auto large_exponent_float = convert_from_native_double>(1.0); + auto converted_double = convert_to_native_double(large_exponent_float); + EXPECT_APPROXIMATE(converted_double, 1.0); +} + +TEST_CASE(large_mantissa) +{ + // Make sure we support at least 62 bits of mantissa + auto large_exponent_float = convert_from_native_double>(1.0); + auto converted_double = convert_to_native_double(large_exponent_float); + EXPECT_APPROXIMATE(converted_double, 1.0); +}