1
1
mirror of https://github.com/wez/wezterm.git synced 2024-11-23 23:21:08 +03:00

improve linear f32 -> srgb8 conversion

This uses a combination of lookup tables and SIMD to reduce
CPU utilization by about 15% compared to the prior brute-force
implementation.
This commit is contained in:
Wez Furlong 2019-09-30 00:43:47 -07:00
parent 07fcc96f5a
commit b0efba9300
2 changed files with 184 additions and 9 deletions

View File

@ -42,7 +42,7 @@ pub trait Texture2d {
}
#[cfg(target_arch = "x86_64")]
mod avx2 {
mod avx {
use super::*;
#[inline]
fn align_lo(size: usize, align: usize) -> usize {
@ -173,9 +173,9 @@ pub trait BitmapImage {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
if is_x86_feature_detected!("avx") {
unsafe {
avx2::fill_pixel(self.pixel_data_mut(), width * 4, width, height, color);
avx::fill_pixel(self.pixel_data_mut(), width * 4, width, height, color);
}
return;
}
@ -196,9 +196,9 @@ pub trait BitmapImage {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
if is_x86_feature_detected!("avx") {
unsafe {
avx2::fill_pixel(
avx::fill_pixel(
self.pixel_data_mut()
.offset(4 * ((dest_y * dim_width) + dest_x) as isize),
dim_width * 4,

View File

@ -3,6 +3,7 @@ use palette::{Blend, LinSrgb, LinSrgba, Srgb, Srgba};
lazy_static::lazy_static! {
// 256-entry table indexed by an 8-bit srgb value; built once, on first use,
// by generate_srgb8_to_linear_f32_table().
static ref SRGB_TO_F32_TABLE: [f32;256] = generate_srgb8_to_linear_f32_table();
// 104-entry table used for the linear f32 -> srgb8 direction; built once, on
// first use, by generate_linear_f32_to_srgb8_table(). The lookup code reads the
// high 16 bits of each entry as a bias and the low 16 bits as a scale.
static ref F32_TO_U8_TABLE: [u32;104] = generate_linear_f32_to_srgb8_table();
}
fn generate_srgb8_to_linear_f32_table() -> [f32; 256] {
@ -19,6 +20,165 @@ fn generate_srgb8_to_linear_f32_table() -> [f32; 256] {
table
}
/// Returns the 104-entry lookup table used to convert a linear f32 channel
/// value to an 8-bit srgb value.
///
/// Each entry packs two 16-bit fields: the high half is a bias and the low
/// half is a scale; the lookup functions combine them with 8 bits taken from
/// the float's mantissa. 104 is 13 exponents times 8 buckets per exponent
/// (`numexp << mantissa_msb` in the disabled generator below).
fn generate_linear_f32_to_srgb8_table() -> [u32; 104] {
// My intent was to generate this array on the fly using the code that is commented
// out below. It is based on this gist:
// https://gist.github.com/rygorous/2203834
// but for whatever reason, the rust translation yields different numbers.
// I haven't had an opportunity to dig in to why that is, and I just wanted
// to get things rolling, so we're in a slightly gross state for now.
//
// NOTE(review): a likely cause of the mismatch is that the commented-out
// `linear_f32_to_srgbf32` helper returns values in 0-1.0, whereas the entries
// below (e.g. the last bias, 0x7c33 / 128 ~= 248) imply the fit was made on
// values scaled to 0-255 -- confirm against the gist before re-enabling.
[
0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d,
0x00a1000d, 0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a,
0x00f4001a, 0x0101001a, 0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033,
0x018f0033, 0x01a80033, 0x01c20033, 0x01dc0067, 0x020f0067, 0x02430067, 0x02760067,
0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067, 0x037800ce, 0x03df00ce, 0x044600ce,
0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5, 0x06970158, 0x07420142,
0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2, 0x0b0f01cb,
0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0,
0x182401af, 0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270,
0x21520256, 0x227d0240, 0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367,
0x2d1d0341, 0x2ebe031f, 0x304d0300, 0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5,
0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401, 0x44c20798, 0x488e071e, 0x4c1c06b6,
0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559, 0x5e0c0a23, 0x631c0980,
0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
]
/*
let numexp = 13;
let mantissa_msb = 3;
let nbuckets = numexp << mantissa_msb;
let bucketsize = 1 << (23 - mantissa_msb);
let mantshift = 12;
let mut table = [0;104];
let sum_aa = bucketsize as f64;
let mut sum_ab = 0.0f64;
let mut sum_bb = 0.0f64;
for i in 0..bucketsize {
let j = (i >> mantshift) as f64;
sum_ab += j;
sum_bb += j * j;
}
let inv_det = 1.0 / (sum_aa * sum_bb - sum_ab * sum_ab);
eprintln!("sum_ab={:e} sum_bb={:e} inv_det={:e}", sum_ab, sum_bb, inv_det);
for bucket in 0..nbuckets {
let start = ((127 - numexp) << 23) + bucket*bucketsize;
let mut sum_a = 0.0;
let mut sum_b = 0.0;
for i in 0..bucketsize {
let j = i >> mantshift;
let val = linear_f32_to_srgbf32(f32::from_bits(start + i)) as f64 + 0.5;
sum_a += val;
sum_b += j as f64 * val;
}
let solved_a = inv_det * (sum_bb*sum_a - sum_ab*sum_b);
let solved_b = inv_det * (sum_aa*sum_b - sum_ab*sum_a);
let scaled_a = solved_a * 65536.0 / 512.0;
let scaled_b = solved_b * 65536.0;
let int_a = (scaled_a + 0.5) as u32;
let int_b = (scaled_b + 0.5) as u32;
table[bucket as usize] = (int_a << 16) + int_b;
}
table
*/
}
/*
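/// Convert from linear rgb in floating point form (0-1.0) to srgb in floating point.
/// NOTE: as written this returns values in 0-1.0, not 0-255.0; scale the result
/// by 255.0 if the 0-255.0 range is required.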
fn linear_f32_to_srgbf32(f: f32) -> f32 {
if f <= 0.0031308 {
f * 12.92
} else {
f.powf(1.0 / 2.4) * 1.055 - 0.055
}
}
*/
/// Bit pattern of the largest `f32` strictly below 1.0 (biased exponent 126,
/// all mantissa bits set). Used as the upper clamp for the table lookup.
const ALMOST_ONE: u32 = 0x3f7f_ffff;
/// Bit pattern of 2^-13: biased exponent `127 - 13 = 114` with a zero
/// mantissa, i.e. `114 << 23`. Used as the lower clamp for the table lookup.
const MINVAL: u32 = 0x3900_0000;
/// Converts one linear floating point channel value to an 8-bit srgb value
/// using the piecewise-linear approximation stored in `F32_TO_U8_TABLE`.
///
/// The input is clamped to `[2^-13, 1.0)` before the lookup, so values at or
/// below the lower bound (including negatives and NaN) produce 0 and values
/// of 1.0 or more produce 255.
fn linear_f32_to_srgb8_using_table(f: f32) -> u8 {
    let minval = f32::from_bits(MINVAL);
    let almost_one = f32::from_bits(ALMOST_ONE);

    // `f32::max` returns the non-NaN operand, so a NaN input collapses to
    // `minval` here. A plain `<` / `>` comparison chain would let NaN through
    // and turn its bit pattern into an out-of-range table index below.
    let f = f.max(minval).min(almost_one);

    let f_bits = f.to_bits();
    // The top bits above bit 20 hold the exponent plus the 3 most significant
    // mantissa bits: 8 buckets per exponent, counted from the MINVAL exponent.
    let index = ((f_bits - MINVAL) >> 20) as usize;
    debug_assert!(index < 104);

    // SAFETY: `f` is a positive, finite float in `[minval, almost_one]`, and
    // positive floats order the same way as their bit patterns, so
    // `f_bits - MINVAL <= ALMOST_ONE - MINVAL` and `index <= 103`, which is
    // within the 104-entry table.
    let tab = unsafe { *F32_TO_U8_TABLE.get_unchecked(index) };

    // High half of the entry is the bias, low half is the per-step scale.
    let bias = (tab >> 16) << 9;
    let scale = tab & 0xffff;
    // The next 8 mantissa bits select the position inside the bucket.
    let t = (f_bits >> 12) & 0xff;

    ((bias + scale * t) >> 16) as u8
}
/// Converts all four channels of a linear floating point color to 8-bit srgb
/// values at once using SSE2, with the same table-driven approximation as
/// `linear_f32_to_srgb8_using_table`.
///
/// Lane layout: lane 0 (lowest) holds red, then green, blue and alpha. The
/// channel vector and the gathered table entries are both built with the
/// `_mm_setr_*` constructors so that every lane combines the table entry and
/// the mantissa bits of the *same* channel.
#[cfg(target_arch = "x86_64")]
fn linear_f32_to_srgb8_vec(s: LinSrgba) -> Color {
    use std::arch::x86_64::*;

    /// Views the four 32-bit lanes of `v` as an array; index 0 is the lowest lane.
    fn lanes(v: __m128i) -> [i32; 4] {
        // SAFETY: `__m128i` and `[i32; 4]` are both 16 bytes of plain integer
        // data, so every bit pattern is valid for either type.
        unsafe { std::mem::transmute(v) }
    }

    // Value of `bits >> 20` for the lower clamp: biased exponent (127 - 13)
    // with 8 buckets per exponent. Subtracting it rebases indices to 0.
    const FIRST_BUCKET: i32 = (127 - 13) * 8;

    // SAFETY: only SSE/SSE2 intrinsics are used, which are part of the x86_64
    // baseline. The table indices are in range because every lane is clamped
    // to `[2^-13, 1.0)` first, which bounds `(bits >> 20) - FIRST_BUCKET` to
    // `0..=103` for the 104-entry table.
    unsafe {
        let clamp_min_4 = _mm_set1_epi32((127 - 13) << 23);
        let almost_one_4 = _mm_set1_epi32(0x3f7fffff);
        let mant_mask_4 = _mm_set1_epi32(0xff);
        // 512 in the high 16 bits of each lane: the multiplier applied to the
        // bias half of a table entry by `_mm_madd_epi16`.
        let top_scale_4 = _mm_set1_epi32(0x02000000);

        // `_mm_setr_ps` fills lanes lowest-first, so lane 0 = red ... lane 3 = alpha.
        let f = _mm_setr_ps(s.red, s.green, s.blue, s.alpha);

        // `f` is the first operand of the max so that a NaN lane yields the
        // lower clamp (the SSE max/min return the second operand on NaN).
        let clamped = _mm_min_ps(
            _mm_max_ps(f, _mm_castsi128_ps(clamp_min_4)),
            _mm_castsi128_ps(almost_one_4),
        );

        let tabidx = lanes(_mm_srli_epi32(_mm_castps_si128(clamped), 20));

        // Gather the table entries lowest-lane-first so that lane N of
        // `tabval` belongs to the same channel as lane N of `clamped`.
        let tabval = _mm_setr_epi32(
            *F32_TO_U8_TABLE.get_unchecked((tabidx[0] - FIRST_BUCKET) as usize) as i32,
            *F32_TO_U8_TABLE.get_unchecked((tabidx[1] - FIRST_BUCKET) as usize) as i32,
            *F32_TO_U8_TABLE.get_unchecked((tabidx[2] - FIRST_BUCKET) as usize) as i32,
            *F32_TO_U8_TABLE.get_unchecked((tabidx[3] - FIRST_BUCKET) as usize) as i32,
        );

        // Per lane: low 16 bits = 8 mantissa bits (t), high 16 bits = 512.
        let tabmult1 = _mm_srli_epi32(_mm_castps_si128(clamped), 12);
        let tabmult2 = _mm_and_si128(tabmult1, mant_mask_4);
        let tabmult3 = _mm_or_si128(tabmult2, top_scale_4);

        // madd computes bias * 512 + scale * t in each 32-bit lane.
        let tabprod = _mm_madd_epi16(tabval, tabmult3);
        let result = lanes(_mm_srli_epi32(tabprod, 16));

        Color::rgba(
            result[0] as u8,
            result[1] as u8,
            result[2] as u8,
            result[3] as u8,
        )
    }
}
/// Looks up the linear floating point value (0-1.0) for an 8-bit srgb
/// channel value (0-255) in the precomputed `SRGB_TO_F32_TABLE`.
fn srgb8_to_linear_f32(val: u8) -> f32 {
    let slot = usize::from(val);
    // SAFETY: the table has 256 entries and a `u8` can never exceed 255.
    let entry = unsafe { SRGB_TO_F32_TABLE.get_unchecked(slot) };
    *entry
}
@ -27,6 +187,21 @@ fn srgb8_to_linear_f32(val: u8) -> f32 {
#[derive(Copy, Clone, Debug)]
pub struct Color(pub u32);
/// Converts a linear floating point color to an 8-bit srgb `Color`.
///
/// On x86_64 the SSE2 implementation is used when the CPU reports support
/// for it; every other case falls back to the scalar table lookup, one
/// channel at a time.
impl From<LinSrgba> for Color {
    #[inline]
    fn from(s: LinSrgba) -> Color {
        // Both the feature-detection macro and `linear_f32_to_srgb8_vec` only
        // exist on x86_64, so this path must be compiled out elsewhere.
        #[cfg(target_arch = "x86_64")]
        {
            if is_x86_feature_detected!("sse2") {
                return linear_f32_to_srgb8_vec(s);
            }
        }

        let r = linear_f32_to_srgb8_using_table(s.red);
        let g = linear_f32_to_srgb8_using_table(s.green);
        let b = linear_f32_to_srgb8_using_table(s.blue);
        let a = linear_f32_to_srgb8_using_table(s.alpha);
        Color::rgba(r, g, b, a)
    }
}
impl From<Srgb> for Color {
#[inline]
fn from(s: Srgb) -> Color {
@ -119,14 +294,14 @@ impl Color {
&Operator::Over => {
let src: LinSrgba = (*self).into();
let dest: LinSrgba = dest.into();
Srgba::from_linear(src.over(dest)).into()
src.over(dest).into()
}
&Operator::Source => *self,
&Operator::Multiply => {
let src: LinSrgba = (*self).into();
let dest: LinSrgba = dest.into();
let result: Color = Srgba::from_linear(src.multiply(dest)).into();
result.into()
let result: Color = src.multiply(dest).into();
result
}
&Operator::MultiplyThenOver(ref tint) => {
// First multiply by the tint color. This colorizes the glyph.
@ -139,7 +314,7 @@ impl Color {
tinted.alpha = src.alpha;
// Then blend the tinted glyph over the destination background
let dest: LinSrgba = dest.into();
Srgba::from_linear(tinted.over(dest)).into()
tinted.over(dest).into()
}
}
}