2022-01-11 23:24:54 +03:00
|
|
|
// Copyright (C) 2014 The Android Open Source Project
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
2022-08-05 01:20:14 +03:00
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
2022-01-11 23:24:54 +03:00
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
2024-04-01 19:39:45 +03:00
|
|
|
package mathx
|
2022-01-11 23:24:54 +03:00
|
|
|
|
|
|
|
import "unsafe"
|
|
|
|
|
|
|
|
// Float16 represents a 16-bit floating point number, containing a single sign bit, 5 exponent bits
|
|
|
|
// and 10 fractional bits. This corresponds to IEEE 754-2008 binary16 (or half precision float) type.
|
|
|
|
//
|
2022-08-05 01:20:14 +03:00
|
|
|
// MSB LSB
|
|
|
|
// ╔════╦════╤════╤════╤════╤════╦════╤════╤════╤════╤════╤════╤════╤════╤════╤════╗
|
|
|
|
// ║Sign║ E₄ │ E₃ │ E₂ │ E₁ │ E₀ ║ F₉ │ F₈ │ F₇ │ F₆ │ F₅ │ F₄ │ F₃ │ F₂ │ F₁ │ F₀ ║
|
|
|
|
// ╚════╩════╧════╧════╧════╧════╩════╧════╧════╧════╧════╧════╧════╧════╧════╧════╝
|
|
|
|
// Where E is the exponent bits and F is the fractional bits.
|
2022-01-11 23:24:54 +03:00
|
|
|
type Float16 uint16
|
|
|
|
|
|
|
|
const (
|
|
|
|
float16ExpMask Float16 = 0x7c00
|
|
|
|
float16ExpBias uint32 = 0xf
|
|
|
|
float16ExpShift uint32 = 10
|
|
|
|
float16FracMask Float16 = 0x03ff
|
|
|
|
float16SignMask Float16 = 0x8000
|
|
|
|
float32ExpMask uint32 = 0x7f800000
|
|
|
|
float32ExpBias uint32 = 0x7f
|
|
|
|
float32ExpShift uint32 = 23
|
|
|
|
float32FracMask uint32 = 0x007fffff
|
|
|
|
)
|
|
|
|
|
|
|
|
// Float32 returns the Float16 value expanded to a float32. Infinities and NaNs are expanded as
|
|
|
|
// such.
|
|
|
|
func (f Float16) Float32() float32 {
|
|
|
|
u32 := expandF16ToF32(f)
|
|
|
|
ptr := unsafe.Pointer(&u32)
|
|
|
|
f32 := *(*float32)(ptr)
|
|
|
|
return f32
|
|
|
|
}
|
|
|
|
|
|
|
|
// IsNaN reports whether f is an “not-a-number” value.
|
|
|
|
func (f Float16) IsNaN() bool {
|
|
|
|
return (f&float16ExpMask == float16ExpMask) && (f&float16FracMask != 0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// IsInf reports whether f is an infinity, according to sign. If sign > 0, IsInf reports whether
|
|
|
|
// f is positive infinity. If sign < 0, IsInf reports whether f is negative infinity. If sign ==
|
|
|
|
// 0, IsInf reports whether f is either infinity.
|
|
|
|
func (f Float16) IsInf(sign int) bool {
|
|
|
|
return ((f == float16ExpMask) && sign >= 0) ||
|
|
|
|
(f == (float16SignMask|float16ExpMask) && sign <= 0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Float16NaN returns an “not-a-number” value.
|
|
|
|
func NewFloat16NaN() Float16 { return float16ExpMask | float16FracMask }
|
|
|
|
|
|
|
|
// Float16Inf returns positive infinity if sign >= 0, negative infinity if sign < 0.
|
|
|
|
func NewFloat16Inf(sign int) Float16 {
|
|
|
|
if sign >= 0 {
|
|
|
|
return float16ExpMask
|
|
|
|
} else {
|
|
|
|
return float16SignMask | float16ExpMask
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewFloat16 returns a Float16 encoding of a 32-bit floating point number. Infinities and NaNs
|
|
|
|
// are encoded as such. Very large and very small numbers get rounded to infinity and zero
|
|
|
|
// respectively.
|
|
|
|
func NewFloat16(f32 float32) Float16 {
|
|
|
|
ptr := unsafe.Pointer(&f32)
|
|
|
|
u32 := *(*uint32)(ptr)
|
|
|
|
sign := Float16(u32>>16) & float16SignMask
|
|
|
|
exp := (u32 & float32ExpMask) >> float32ExpShift
|
|
|
|
frac := u32 & 0x7fffff
|
|
|
|
if exp == 0xff {
|
|
|
|
// NaN or Infinity
|
|
|
|
if frac != 0 { // NaN
|
|
|
|
frac = 0x3f
|
|
|
|
}
|
|
|
|
return sign | float16ExpMask | Float16(frac)
|
|
|
|
}
|
|
|
|
if exp+float16ExpBias <= float32ExpBias {
|
|
|
|
// Exponent is too small to represent in a Float16 (or a zero). We need to output
|
|
|
|
// denormalized numbers (possibly rounding very small numbers to zero).
|
|
|
|
denorm := float32ExpBias - exp - 1
|
|
|
|
frac += 1 << float32ExpShift
|
|
|
|
frac >>= denorm
|
|
|
|
return sign | Float16(frac)
|
|
|
|
}
|
|
|
|
if exp > float32ExpBias+float16ExpBias {
|
|
|
|
// Number too large to represent in a Float16 => round to Infinity.
|
|
|
|
return sign | float16ExpMask
|
|
|
|
}
|
|
|
|
// General case.
|
|
|
|
return sign | Float16(((exp+float16ExpBias-float32ExpBias)<<float16ExpShift)|(frac>>13))
|
|
|
|
}
|
|
|
|
func expandF16ToF32(in Float16) uint32 {
|
|
|
|
sign := uint32(in&float16SignMask) << 16
|
|
|
|
frac := uint32(in&float16FracMask) << 13
|
|
|
|
exp := uint32(in&float16ExpMask) >> float16ExpShift
|
|
|
|
if exp == 0x1f {
|
|
|
|
// NaN of Infinity
|
|
|
|
return sign | float32ExpMask | frac
|
|
|
|
}
|
|
|
|
if exp == 0 {
|
|
|
|
if frac == 0 {
|
|
|
|
// Zero
|
|
|
|
return sign
|
|
|
|
}
|
|
|
|
// Denormalized number. In a float32 it must be stored in a normalized form, so
|
|
|
|
// we normalize it.
|
|
|
|
exp++
|
|
|
|
for frac&float32ExpMask == 0 {
|
|
|
|
frac <<= 1
|
|
|
|
exp--
|
|
|
|
}
|
|
|
|
frac &= float32FracMask
|
|
|
|
}
|
|
|
|
exp += (float32ExpBias - float16ExpBias)
|
|
|
|
return sign | (exp << float32ExpShift) | frac
|
|
|
|
}
|