1
1
mirror of https://github.com/wader/fq.git synced 2024-11-25 05:55:43 +03:00
fq/internal/mathx/float16.go
Mattias Wadman bf7fa07c41 fq: Use go 1.20 and cleanup
Also rename *ex packages to *x
2024-04-01 19:14:10 +02:00

130 lines
4.7 KiB
Go

// Copyright (C) 2014 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mathx
import "unsafe"
// Float16 represents a 16-bit floating point number, containing a single sign bit, 5 exponent bits
// and 10 fractional bits. This corresponds to the IEEE 754-2008 binary16 (or half precision float)
// type.
//
// MSB                                                                           LSB
// ╔════╦════╤════╤════╤════╤════╦════╤════╤════╤════╤════╤════╤════╤════╤════╤════╗
// ║Sign║ E₄ │ E₃ │ E₂ │ E₁ │ E₀ ║ F₉ │ F₈ │ F₇ │ F₆ │ F₅ │ F₄ │ F₃ │ F₂ │ F₁ │ F₀ ║
// ╚════╩════╧════╧════╧════╧════╩════╧════╧════╧════╧════╧════╧════╧════╧════╧════╝
// Where E is the exponent bits and F is the fractional bits.
//
// The value is held as its raw bit pattern in a uint16; no arithmetic is defined on it directly.
type Float16 uint16
// Bit-layout constants for the 16-bit (Float16) and 32-bit (float32) encodings. The masks select
// a field in place, the shifts give the bit position of the exponent field, and the biases are
// the offsets added to the true exponent to obtain the stored exponent.
const (
// float16ExpMask selects the 5 exponent bits (bits 10-14) of a Float16.
float16ExpMask Float16 = 0x7c00
// float16ExpBias is the exponent bias of a Float16 (15).
float16ExpBias uint32 = 0xf
// float16ExpShift is the bit position of the lowest Float16 exponent bit.
float16ExpShift uint32 = 10
// float16FracMask selects the 10 fraction bits (bits 0-9) of a Float16.
float16FracMask Float16 = 0x03ff
// float16SignMask selects the sign bit (bit 15) of a Float16.
float16SignMask Float16 = 0x8000
// float32ExpMask selects the 8 exponent bits (bits 23-30) of a float32 bit pattern.
float32ExpMask uint32 = 0x7f800000
// float32ExpBias is the exponent bias of a float32 (127).
float32ExpBias uint32 = 0x7f
// float32ExpShift is the bit position of the lowest float32 exponent bit.
float32ExpShift uint32 = 23
// float32FracMask selects the 23 fraction bits (bits 0-22) of a float32 bit pattern.
float32FracMask uint32 = 0x007fffff
)
// Float32 widens f to a float32. The 16-bit pattern is first expanded to the equivalent 32-bit
// pattern by expandF16ToF32 and those bits are then reinterpreted as a float32, so infinities and
// NaNs remain infinities and NaNs.
func (f Float16) Float32() float32 {
	bits := expandF16ToF32(f)
	// Reinterpret the 32-bit pattern as a float32 without any numeric conversion.
	return *(*float32)(unsafe.Pointer(&bits))
}
// IsNaN reports whether f is a “not-a-number” value, i.e. all exponent bits are set and at
// least one fraction bit is set. The sign bit is ignored.
func (f Float16) IsNaN() bool {
	if f&float16ExpMask != float16ExpMask {
		// Exponent is not all ones: an ordinary finite number.
		return false
	}
	// All-ones exponent with a zero fraction would be an infinity instead.
	return f&float16FracMask != 0
}
// IsInf reports whether f is an infinity, according to sign:
//
//   - sign > 0: reports whether f is positive infinity.
//   - sign < 0: reports whether f is negative infinity.
//   - sign == 0: reports whether f is either infinity.
func (f Float16) IsInf(sign int) bool {
	switch f {
	case float16ExpMask:
		// Positive infinity: accepted unless the caller asked for the negative one only.
		return sign >= 0
	case float16SignMask | float16ExpMask:
		// Negative infinity: accepted unless the caller asked for the positive one only.
		return sign <= 0
	default:
		return false
	}
}
// NewFloat16NaN returns a “not-a-number” value: a positive-signed Float16 with every exponent
// bit and every fraction bit set.
func NewFloat16NaN() Float16 {
	return float16FracMask | float16ExpMask
}
// NewFloat16Inf returns positive infinity if sign >= 0, negative infinity if sign < 0.
func NewFloat16Inf(sign int) Float16 {
	if sign < 0 {
		// Negative infinity: sign bit set, all exponent bits set, zero fraction.
		return float16SignMask | float16ExpMask
	}
	// Positive infinity: all exponent bits set, zero fraction.
	return float16ExpMask
}
// NewFloat16 returns the Float16 encoding of the 32-bit floating point number f32.
//
// Infinities stay infinities and every NaN is encoded as a NaN with the fixed fraction 0x3f.
// Magnitudes too large for a Float16 become an infinity of the same sign. Magnitudes below the
// normalized Float16 range are stored as denormalized numbers, collapsing to a signed zero when
// nothing of the fraction survives. Fraction bits that do not fit are dropped (truncated), not
// rounded to nearest.
func NewFloat16(f32 float32) Float16 {
	// Reinterpret the float32 as its raw bit pattern and split it into fields.
	bits := *(*uint32)(unsafe.Pointer(&f32))
	signBit := Float16(bits>>16) & float16SignMask
	biasedExp := (bits & float32ExpMask) >> float32ExpShift
	mantissa := bits & float32FracMask

	switch {
	case biasedExp == 0xff:
		// All exponent bits set: infinity when the fraction is zero, NaN otherwise.
		if mantissa == 0 {
			return signBit | float16ExpMask
		}
		return signBit | float16ExpMask | 0x3f

	case biasedExp+float16ExpBias <= float32ExpBias:
		// Exponent is too small for a normalized Float16 (or the input is zero). Make the
		// implicit leading one explicit and shift it down into denormalized position; a
		// shift of 32 or more leaves zero.
		shift := float32ExpBias - biasedExp - 1
		return signBit | Float16((mantissa|1<<float32ExpShift)>>shift)

	case biasedExp > float32ExpBias+float16ExpBias:
		// Exponent is too large for a Float16: saturate to infinity.
		return signBit | float16ExpMask
	}

	// General case: re-bias the exponent and keep the top 10 fraction bits.
	rebiased := biasedExp + float16ExpBias - float32ExpBias
	return signBit | Float16((rebiased<<float16ExpShift)|(mantissa>>13))
}
// expandF16ToF32 converts the bit pattern of a Float16 into the bit pattern of the float32 that
// has the same value. Infinities and NaNs map to infinities and NaNs, zeros keep their sign, and
// denormalized Float16 values are normalized for the float32 encoding.
func expandF16ToF32(in Float16) uint32 {
	// Move the sign and fraction to their float32 positions and extract the stored exponent.
	signBit := uint32(in&float16SignMask) << 16
	mantissa := uint32(in&float16FracMask) << 13
	biasedExp := uint32(in&float16ExpMask) >> float16ExpShift

	switch biasedExp {
	case 0x1f:
		// NaN or Infinity: all exponent bits set, fraction carried over unchanged.
		return signBit | float32ExpMask | mantissa
	case 0:
		if mantissa == 0 {
			// Zero: only the sign survives.
			return signBit
		}
		// Denormalized number. In a float32 it must be stored in normalized form, so shift
		// the fraction left until its leading one reaches the implicit-one position (the
		// lowest exponent bit), lowering the exponent by one per shift. The exponent may
		// wrap below zero here; the unsigned re-bias below brings it back into range.
		biasedExp = 1
		for mantissa&float32ExpMask == 0 {
			mantissa <<= 1
			biasedExp--
		}
		// Drop the now-implicit leading one.
		mantissa &= float32FracMask
	}

	// Re-bias the exponent from the Float16 bias to the float32 bias and reassemble.
	biasedExp += float32ExpBias - float16ExpBias
	return signBit | (biasedExp << float32ExpShift) | mantissa
}