mirror of
https://github.com/wez/wezterm.git
synced 2024-11-23 23:21:08 +03:00
improve linear f32 -> srgb8 conversion
This uses a combination of lookup tables and simd to reduce the cpu utilization by about ~15% compared to the prior brute force implementation.
This commit is contained in:
parent
07fcc96f5a
commit
b0efba9300
@ -42,7 +42,7 @@ pub trait Texture2d {
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
mod avx2 {
|
||||
mod avx {
|
||||
use super::*;
|
||||
#[inline]
|
||||
fn align_lo(size: usize, align: usize) -> usize {
|
||||
@ -173,9 +173,9 @@ pub trait BitmapImage {
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if is_x86_feature_detected!("avx2") {
|
||||
if is_x86_feature_detected!("avx") {
|
||||
unsafe {
|
||||
avx2::fill_pixel(self.pixel_data_mut(), width * 4, width, height, color);
|
||||
avx::fill_pixel(self.pixel_data_mut(), width * 4, width, height, color);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@ -196,9 +196,9 @@ pub trait BitmapImage {
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if is_x86_feature_detected!("avx2") {
|
||||
if is_x86_feature_detected!("avx") {
|
||||
unsafe {
|
||||
avx2::fill_pixel(
|
||||
avx::fill_pixel(
|
||||
self.pixel_data_mut()
|
||||
.offset(4 * ((dest_y * dim_width) + dest_x) as isize),
|
||||
dim_width * 4,
|
||||
|
@ -3,6 +3,7 @@ use palette::{Blend, LinSrgb, LinSrgba, Srgb, Srgba};
|
||||
|
||||
lazy_static::lazy_static! {
|
||||
static ref SRGB_TO_F32_TABLE: [f32;256] = generate_srgb8_to_linear_f32_table();
|
||||
static ref F32_TO_U8_TABLE: [u32;104] = generate_linear_f32_to_srgb8_table();
|
||||
}
|
||||
|
||||
fn generate_srgb8_to_linear_f32_table() -> [f32; 256] {
|
||||
@ -19,6 +20,165 @@ fn generate_srgb8_to_linear_f32_table() -> [f32; 256] {
|
||||
table
|
||||
}
|
||||
|
||||
fn generate_linear_f32_to_srgb8_table() -> [u32; 104] {
|
||||
// My intent was to generate this array on the fly using the code that is commented
|
||||
// out below. It is based on this gist:
|
||||
// https://gist.github.com/rygorous/2203834
|
||||
// but for whatever reason, the rust translation yields different numbers.
|
||||
// I haven't had an opportunity to dig in to why that is, and I just wanted
|
||||
// to get things rolling, so we're in a slightly gross state for now.
|
||||
[
|
||||
0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d,
|
||||
0x00a1000d, 0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a,
|
||||
0x00f4001a, 0x0101001a, 0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033,
|
||||
0x018f0033, 0x01a80033, 0x01c20033, 0x01dc0067, 0x020f0067, 0x02430067, 0x02760067,
|
||||
0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067, 0x037800ce, 0x03df00ce, 0x044600ce,
|
||||
0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5, 0x06970158, 0x07420142,
|
||||
0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2, 0x0b0f01cb,
|
||||
0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
|
||||
0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0,
|
||||
0x182401af, 0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270,
|
||||
0x21520256, 0x227d0240, 0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367,
|
||||
0x2d1d0341, 0x2ebe031f, 0x304d0300, 0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5,
|
||||
0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401, 0x44c20798, 0x488e071e, 0x4c1c06b6,
|
||||
0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559, 0x5e0c0a23, 0x631c0980,
|
||||
0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
|
||||
]
|
||||
/*
|
||||
let numexp = 13;
|
||||
let mantissa_msb = 3;
|
||||
let nbuckets = numexp << mantissa_msb;
|
||||
let bucketsize = 1 << (23 - mantissa_msb);
|
||||
let mantshift = 12;
|
||||
|
||||
let mut table = [0;104];
|
||||
|
||||
let sum_aa = bucketsize as f64;
|
||||
let mut sum_ab = 0.0f64;
|
||||
let mut sum_bb = 0.0f64;
|
||||
|
||||
for i in 0..bucketsize {
|
||||
let j = (i >> mantshift) as f64;
|
||||
|
||||
sum_ab += j;
|
||||
sum_bb += j * j;
|
||||
}
|
||||
|
||||
let inv_det = 1.0 / (sum_aa * sum_bb - sum_ab * sum_ab);
|
||||
eprintln!("sum_ab={:e} sum_bb={:e} inv_det={:e}", sum_ab, sum_bb, inv_det);
|
||||
|
||||
for bucket in 0..nbuckets {
|
||||
let start = ((127 - numexp) << 23) + bucket*bucketsize;
|
||||
|
||||
let mut sum_a = 0.0;
|
||||
let mut sum_b = 0.0;
|
||||
|
||||
for i in 0..bucketsize {
|
||||
let j = i >> mantshift;
|
||||
|
||||
let val = linear_f32_to_srgbf32(f32::from_bits(start + i)) as f64 + 0.5;
|
||||
sum_a += val;
|
||||
sum_b += j as f64 * val;
|
||||
}
|
||||
|
||||
let solved_a = inv_det * (sum_bb*sum_a - sum_ab*sum_b);
|
||||
let solved_b = inv_det * (sum_aa*sum_b - sum_ab*sum_a);
|
||||
let scaled_a = solved_a * 65536.0 / 512.0;
|
||||
let scaled_b = solved_b * 65536.0;
|
||||
|
||||
let int_a = (scaled_a + 0.5) as u32;
|
||||
let int_b = (scaled_b + 0.5) as u32;
|
||||
|
||||
table[bucket as usize] = (int_a << 16) + int_b;
|
||||
}
|
||||
|
||||
table
|
||||
*/
|
||||
}
|
||||
|
||||
/*
|
||||
/// Convert from linear rgb in floating point form (0-1.0) to srgb in floating point (0-255.0)
|
||||
fn linear_f32_to_srgbf32(f: f32) -> f32 {
|
||||
if f <= 0.0031308 {
|
||||
f * 12.92
|
||||
} else {
|
||||
f.powf(1.0 / 2.4) * 1.055 - 0.055
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
const ALMOST_ONE: u32 = 0x3f7fffff;
|
||||
const MINVAL: u32 = (127 - 13) << 23;
|
||||
|
||||
fn linear_f32_to_srgb8_using_table(f: f32) -> u8 {
|
||||
let minval = f32::from_bits(MINVAL);
|
||||
let almost_one = f32::from_bits(ALMOST_ONE);
|
||||
|
||||
let f = if f < minval {
|
||||
minval
|
||||
} else if f > almost_one {
|
||||
almost_one
|
||||
} else {
|
||||
f
|
||||
};
|
||||
|
||||
let f_bits = f.to_bits();
|
||||
let tab = unsafe { *F32_TO_U8_TABLE.get_unchecked(((f_bits - MINVAL) >> 20) as usize) };
|
||||
let bias = (tab >> 16) << 9;
|
||||
let scale = tab & 0xffff;
|
||||
|
||||
let t = (f_bits >> 12) & 0xff;
|
||||
|
||||
((bias + scale * t) >> 16) as u8
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn linear_f32_to_srgb8_vec(s: LinSrgba) -> Color {
|
||||
use std::arch::x86_64::*;
|
||||
|
||||
unsafe fn i32_get(m: *const __m128i, idx: isize) -> i32 {
|
||||
let u: *const i32 = m as _;
|
||||
*u.offset(idx)
|
||||
}
|
||||
|
||||
unsafe {
|
||||
let clamp_min_4 = _mm_set1_epi32((127 - 13) << 23);
|
||||
let almost_one_4 = _mm_set1_epi32(0x3f7fffff);
|
||||
let mant_mask_4 = _mm_set1_epi32(0xff);
|
||||
let top_scale_4 = _mm_set1_epi32(0x02000000);
|
||||
|
||||
let f = _mm_set_ps(s.red, s.green, s.blue, s.alpha);
|
||||
|
||||
let clamped = _mm_min_ps(
|
||||
_mm_max_ps(f, _mm_castsi128_ps(clamp_min_4)),
|
||||
_mm_castsi128_ps(almost_one_4),
|
||||
);
|
||||
|
||||
let tabidx = _mm_srli_epi32(_mm_castps_si128(clamped), 20);
|
||||
|
||||
let tabval = _mm_set_epi32(
|
||||
*F32_TO_U8_TABLE.get_unchecked((i32_get(&tabidx, 0) - (127 - 13) * 8) as usize) as i32,
|
||||
*F32_TO_U8_TABLE.get_unchecked((i32_get(&tabidx, 1) - (127 - 13) * 8) as usize) as i32,
|
||||
*F32_TO_U8_TABLE.get_unchecked((i32_get(&tabidx, 2) - (127 - 13) * 8) as usize) as i32,
|
||||
*F32_TO_U8_TABLE.get_unchecked((i32_get(&tabidx, 3) - (127 - 13) * 8) as usize) as i32,
|
||||
);
|
||||
|
||||
let tabmult1 = _mm_srli_epi32(_mm_castps_si128(clamped), 12);
|
||||
let tabmult2 = _mm_and_si128(tabmult1, mant_mask_4);
|
||||
let tabmult3 = _mm_or_si128(tabmult2, top_scale_4);
|
||||
let tabprod = _mm_madd_epi16(tabval, tabmult3);
|
||||
let result = _mm_srli_epi32(tabprod, 16);
|
||||
|
||||
Color::rgba(
|
||||
i32_get(&result, 0) as u8,
|
||||
i32_get(&result, 1) as u8,
|
||||
i32_get(&result, 2) as u8,
|
||||
i32_get(&result, 3) as u8,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert from srgb in u8 0-255 to linear floating point rgb 0-1.0
|
||||
fn srgb8_to_linear_f32(val: u8) -> f32 {
|
||||
unsafe { *SRGB_TO_F32_TABLE.get_unchecked(val as usize) }
|
||||
}
|
||||
@ -27,6 +187,21 @@ fn srgb8_to_linear_f32(val: u8) -> f32 {
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct Color(pub u32);
|
||||
|
||||
impl From<LinSrgba> for Color {
|
||||
#[inline]
|
||||
fn from(s: LinSrgba) -> Color {
|
||||
if is_x86_feature_detected!("sse2") {
|
||||
linear_f32_to_srgb8_vec(s)
|
||||
} else {
|
||||
let r = linear_f32_to_srgb8_using_table(s.red);
|
||||
let g = linear_f32_to_srgb8_using_table(s.green);
|
||||
let b = linear_f32_to_srgb8_using_table(s.blue);
|
||||
let a = linear_f32_to_srgb8_using_table(s.alpha);
|
||||
Color::rgba(r, g, b, a)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Srgb> for Color {
|
||||
#[inline]
|
||||
fn from(s: Srgb) -> Color {
|
||||
@ -119,14 +294,14 @@ impl Color {
|
||||
&Operator::Over => {
|
||||
let src: LinSrgba = (*self).into();
|
||||
let dest: LinSrgba = dest.into();
|
||||
Srgba::from_linear(src.over(dest)).into()
|
||||
src.over(dest).into()
|
||||
}
|
||||
&Operator::Source => *self,
|
||||
&Operator::Multiply => {
|
||||
let src: LinSrgba = (*self).into();
|
||||
let dest: LinSrgba = dest.into();
|
||||
let result: Color = Srgba::from_linear(src.multiply(dest)).into();
|
||||
result.into()
|
||||
let result: Color = src.multiply(dest).into();
|
||||
result
|
||||
}
|
||||
&Operator::MultiplyThenOver(ref tint) => {
|
||||
// First multiply by the tint color. This colorizes the glyph.
|
||||
@ -139,7 +314,7 @@ impl Color {
|
||||
tinted.alpha = src.alpha;
|
||||
// Then blend the tinted glyph over the destination background
|
||||
let dest: LinSrgba = dest.into();
|
||||
Srgba::from_linear(tinted.over(dest)).into()
|
||||
tinted.over(dest).into()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user