fq/internal/mathx/float16.go

// Copyright (C) 2014 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mathx

import "unsafe"

// Float16 represents a 16-bit floating point number, containing a single sign bit, 5 exponent bits
// and 10 fractional bits. This corresponds to IEEE 754-2008 binary16 (or half precision float) type.
//
//	 MSB                                                                         LSB
//	╔════╦════╤════╤════╤════╤════╦════╤════╤════╤════╤════╤════╤════╤════╤════╤════╗
//	║Sign║ E₄ │ E₃ │ E₂ │ E₁ │ E₀ ║ F₉ │ F₈ │ F₇ │ F₆ │ F₅ │ F₄ │ F₃ │ F₂ │ F₁ │ F₀ ║
//	╚════╩════╧════╧════╧════╧════╩════╧════╧════╧════╧════╧════╧════╧════╧════╧════╝
//	Where E is the exponent bits and F is the fractional bits.
type Float16 uint16

const (
	float16ExpMask  Float16 = 0x7c00
	float16ExpBias  uint32  = 0xf
	float16ExpShift uint32  = 10
	float16FracMask Float16 = 0x03ff
	float16SignMask Float16 = 0x8000
	float32ExpMask  uint32  = 0x7f800000
	float32ExpBias  uint32  = 0x7f
	float32ExpShift uint32  = 23
	float32FracMask uint32  = 0x007fffff
)

// Float32 returns the Float16 value expanded to a float32. Infinities and NaNs are expanded as
// such.
func (f Float16) Float32() float32 {
	u32 := expandF16ToF32(f)
	ptr := unsafe.Pointer(&u32)
	f32 := *(*float32)(ptr)
	return f32
}

// IsNaN reports whether f is an “not-a-number” value.
func (f Float16) IsNaN() bool {
	return (f&float16ExpMask == float16ExpMask) && (f&float16FracMask != 0)
}

// IsInf reports whether f is an infinity, according to sign. If sign > 0, IsInf reports whether
// f is positive infinity. If sign < 0, IsInf reports whether f is negative infinity. If sign ==
// 0, IsInf reports whether f is either infinity.
func (f Float16) IsInf(sign int) bool {
	return ((f == float16ExpMask) && sign >= 0) ||
		(f == (float16SignMask|float16ExpMask) && sign <= 0)
}

// Float16NaN returns an “not-a-number” value.
func NewFloat16NaN() Float16 { return float16ExpMask | float16FracMask }

// Float16Inf returns positive infinity if sign >= 0, negative infinity if sign < 0.
func NewFloat16Inf(sign int) Float16 {
	if sign >= 0 {
		return float16ExpMask
	} else {
		return float16SignMask | float16ExpMask
	}
}

// NewFloat16 returns a Float16 encoding of a 32-bit floating point number. Infinities and NaNs
// are encoded as such. Very large and very small numbers get rounded to infinity and zero
// respectively.
func NewFloat16(f32 float32) Float16 {
	ptr := unsafe.Pointer(&f32)
	u32 := *(*uint32)(ptr)
	sign := Float16(u32>>16) & float16SignMask
	exp := (u32 & float32ExpMask) >> float32ExpShift
	frac := u32 & 0x7fffff
	if exp == 0xff {
		// NaN or Infinity
		if frac != 0 { // NaN
			frac = 0x3f
		}
		return sign | float16ExpMask | Float16(frac)
	}
	if exp+float16ExpBias <= float32ExpBias {
		// Exponent is too small to represent in a Float16 (or a zero). We need to output
		// denormalized numbers (possibly rounding very small numbers to zero).
		denorm := float32ExpBias - exp - 1
		frac += 1 << float32ExpShift
		frac >>= denorm
		return sign | Float16(frac)
	}
	if exp > float32ExpBias+float16ExpBias {
		// Number too large to represent in a Float16 => round to Infinity.
		return sign | float16ExpMask
	}
	// General case.
	return sign | Float16(((exp+float16ExpBias-float32ExpBias)<<float16ExpShift)|(frac>>13))
}
func expandF16ToF32(in Float16) uint32 {
	sign := uint32(in&float16SignMask) << 16
	frac := uint32(in&float16FracMask) << 13
	exp := uint32(in&float16ExpMask) >> float16ExpShift
	if exp == 0x1f {
		// NaN of Infinity
		return sign | float32ExpMask | frac
	}
	if exp == 0 {
		if frac == 0 {
			// Zero
			return sign
		}
		// Denormalized number. In a float32 it must be stored in a normalized form, so
		// we normalize it.
		exp++
		for frac&float32ExpMask == 0 {
			frac <<= 1
			exp--
		}
		frac &= float32FracMask
	}
	exp += (float32ExpBias - float16ExpBias)
	return sign | (exp << float32ExpShift) | frac
}