Merge pull request #191 from johannesvollmer/f16_batch_conversion
batched f16 conversion
johannesvollmer authored Jul 8, 2023
2 parents 0cb958c + 85a311d commit 3e0f6cd
Showing 7 changed files with 279 additions and 76 deletions.
17 changes: 8 additions & 9 deletions .github/workflows/rust.yml
@@ -47,13 +47,13 @@ jobs:
    steps:
      - uses: actions/checkout@v2

-     - name: Install or use cached foresterre/cargo-msrv
-       uses: baptiste0928/cargo-install@v1
-       with:
-         crate: cargo-msrv
+     - name: Install foresterre/cargo-msrv without cache (takes longer, but caching produces unexpected behaviour)
+       run: cargo install cargo-msrv

-     - name: Verify the Rustc version declared in `cargo.toml`
+     - name: Verify the Rustc version declared in `cargo.toml` without cache (takes longer, but caching produces unexpected behaviour)
        run: |
          rm -f Cargo.lock
          cargo update
          cargo-msrv verify
# github actions does not support big endian systems directly, but it does support QEMU.
@@ -82,13 +82,11 @@ jobs:
        run: sudo systemctl start docker

      - name: Cross-Compile project to mips-unknown-linux-gnu
-       run: |
-         cross build --target=mips-unknown-linux-gnu --verbose
+       run: cross build --target=mips-unknown-linux-gnu --verbose

        # https://github.com/cross-rs/cross#supported-targets
      - name: Cross-Run Tests in mips-unknown-linux-gnu using Qemu
-       run: |
-         cross test --target mips-unknown-linux-gnu --verbose
+       run: cross test --target mips-unknown-linux-gnu --verbose

wasm32:
runs-on: ubuntu-latest
@@ -109,3 +107,4 @@ jobs:

- name: Run tests without default features
run: cargo test --verbose --no-default-features

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -28,7 +28,7 @@ proc-macro = false

[dependencies]
lebe = "^0.5.2" # generic binary serialization
-half = ">=2.1.0, <2.3"        # 16 bit float pixel data type
+half = ">=2.1.0, <2.3"        # 16 bit float pixel data type
bit_field = "^0.10.1" # exr file version bit flags
miniz_oxide = "^0.7.1" # zip compression for pxr24
smallvec = "^1.7.0" # make cache-friendly allocations TODO profile if smallvec is really an improvement!
126 changes: 88 additions & 38 deletions benches/pixel_format_conversion.rs
@@ -8,62 +8,112 @@ use bencher::Bencher;
use std::fs;
use std::io::Cursor;
use exr::image::pixel_vec::PixelVec;
+use exr::io::Data;
+use exr::block::samples::FromNativeSample;
+
+const F32_ZIPS_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_zips.exr";
+const F32_UNCOMPRESSED_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_uncompressed.exr";
+const F16_UNCOMPRESSED_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_uncompressed_half.exr";
+const F16_ZIP_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_zip_half.exr";

/// Read an image from an in-memory buffer into its native f32 format
-fn read_image_rgba_f32_to_f32(bench: &mut Bencher) {
-    let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
-    bencher::black_box(&mut file);
-
-    bench.iter(||{
-        let image = exr::prelude::read()
-            .no_deep_data().largest_resolution_level()
-            .rgba_channels(PixelVec::<(f32,f32,f32,f32)>::constructor, PixelVec::set_pixel)
-            .all_layers().all_attributes()
-            .non_parallel()
-            .from_buffered(Cursor::new(file.as_slice())).unwrap();
-
-        bencher::black_box(image);
-    })
-}
-
-/// Read image and convert the samples to u32 (from native f32)
-fn read_image_rgba_f32_to_u32(bench: &mut Bencher) {
-    let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
-    bencher::black_box(&mut file);
-
-    bench.iter(||{
-        let image = exr::prelude::read()
-            .no_deep_data().largest_resolution_level()
-            .rgba_channels(PixelVec::<(u32,u32,u32,u32)>::constructor, PixelVec::set_pixel)
-            .all_layers().all_attributes()
-            .non_parallel()
-            .from_buffered(Cursor::new(file.as_slice())).unwrap();
-
-        bencher::black_box(image);
-    })
-}
-
-/// f16 is not natively supported by CPUs, which introduces unique performance pitfalls
-fn read_image_rgba_f32_to_f16(bench: &mut Bencher) {
-    let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
-    bencher::black_box(&mut file);
-
-    bench.iter(||{
-        let image = exr::prelude::read()
-            .no_deep_data().largest_resolution_level()
-            .rgba_channels(PixelVec::<(f16,f16,f16,f16)>::constructor, PixelVec::set_pixel)
-            .all_layers().all_attributes()
-            .non_parallel()
-            .from_buffered(Cursor::new(file.as_slice())).unwrap();
-
-        bencher::black_box(image);
-    })
-}
+fn read_f32_as_f32_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F32_UNCOMPRESSED_PATH, false);
+}
+
+/// Read image and convert the samples to u32 (from native f32)
+fn read_f32_as_u32_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<u32>(bench, F32_UNCOMPRESSED_PATH, false);
+}
+
+/// f16 is not natively supported by CPUs, which introduces unique performance pitfalls
+fn read_f32_as_f16_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F32_UNCOMPRESSED_PATH, false);
+}
+
+fn read_f16_as_f16_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F16_UNCOMPRESSED_PATH, false);
+}
+
+fn read_f16_as_f32_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F16_UNCOMPRESSED_PATH, false);
+}
+
+fn read_f16_as_u32_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<u32>(bench, F16_UNCOMPRESSED_PATH, false);
+}
+
+fn read_f32_as_f16_zips_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F32_ZIPS_PATH, false);
+}
+
+fn read_f16_as_f32_zip_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F16_ZIP_PATH, false);
+}
+
+fn read_f32_as_f16_zips_nthreads(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F32_ZIPS_PATH, true);
+}
+
+fn read_f16_as_f32_zip_nthreads(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F16_ZIP_PATH, true);
+}
+
+fn read_f32_as_f32_zips_nthreads(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F32_ZIPS_PATH, true);
+}
+
+fn read_f16_as_f16_zip_nthreads(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F16_ZIP_PATH, true);
+}
+
+fn read_f32_as_f32_zips_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F32_ZIPS_PATH, false);
+}
+
+fn read_f16_as_f16_zip_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F16_ZIP_PATH, false);
+}
+
+fn bench_read_image_rgba_as<T>(bench: &mut Bencher, path: &str, parallel: bool) {
+    let mut file = fs::read(path).unwrap();
+    bencher::black_box(&mut file);
+
+    bench.iter(||{
+        let image = read_file_from_memory_as::<f16>(file.as_slice(), parallel);
+        bencher::black_box(image);
+    })
+}
+
+fn read_file_from_memory_as<T>(file: &[u8], parallel: bool) -> RgbaImage<PixelVec<(T, T, T, T)>>
+    where T: FromNativeSample
+{
+    let read = exr::prelude::read()
+        .no_deep_data().largest_resolution_level()
+        .rgba_channels(PixelVec::<(T, T, T, T)>::constructor, PixelVec::set_pixel)
+        .first_valid_layer().all_attributes();
+
+    let read = if parallel { read } else { read.non_parallel() };
+    read.from_buffered(Cursor::new(file)).unwrap()
+}

benchmark_group!(pixel_format_conversion,
-    read_image_rgba_f32_to_f32,
-    read_image_rgba_f32_to_u32,
-    read_image_rgba_f32_to_f16,
+    read_f32_as_f32_uncompressed_1thread,
+    read_f32_as_u32_uncompressed_1thread,
+    read_f32_as_f16_uncompressed_1thread,
+    read_f32_as_f16_zips_1thread,
+    read_f32_as_f16_zips_nthreads,
+    read_f32_as_f32_zips_nthreads,
+    read_f32_as_f32_zips_1thread,
+
+    read_f16_as_f16_uncompressed_1thread,
+    read_f16_as_u32_uncompressed_1thread,
+    read_f16_as_f32_uncompressed_1thread,
+    read_f16_as_f32_zip_1thread,
+    read_f16_as_f32_zip_nthreads,
+    read_f16_as_f16_zip_nthreads,
+    read_f16_as_f16_zip_1thread,
);

benchmark_main!(pixel_format_conversion);
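
For orientation: the reader pipeline wrapped by read_file_from_memory_as above can also be used on its own. The following is a hedged sketch, not part of this diff; it only reuses the exr::prelude calls and the PixelVec constructor already shown in this file, plus one of the test-image paths from the constants above. It decodes an f16 file while requesting f32 pixels, which is the conversion these benchmarks measure.

use std::fs;
use std::io::Cursor;
use exr::prelude::*;
use exr::image::pixel_vec::PixelVec;

fn main() {
    // decode an f16 test image while converting every sample to f32 during reading
    let file = fs::read("tests/images/valid/custom/crowskull/crow_zip_half.exr").unwrap();

    let image = read()
        .no_deep_data().largest_resolution_level()
        .rgba_channels(PixelVec::<(f32, f32, f32, f32)>::constructor, PixelVec::set_pixel)
        .first_valid_layer().all_attributes()
        .from_buffered(Cursor::new(file.as_slice())).unwrap();

    // keep the decoded image alive so the decode is not optimized away
    std::hint::black_box(&image);
    println!("decoded crow_zip_half.exr");
}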
80 changes: 68 additions & 12 deletions src/block/samples.rs
@@ -1,6 +1,7 @@
//! Extract pixel samples from a block of pixel bytes.
use crate::prelude::*;
+use half::prelude::HalfFloatSliceExt;


/// A single red, green, blue, or alpha value.
@@ -112,6 +113,7 @@ impl From<Sample> for u32 { #[inline] fn from(s: Sample) -> Self { s.to_u32() }

/// Create an arbitrary sample type from one of the defined sample types.
/// Should be compiled to a no-op where the file contains the predicted sample type.
+/// The slice functions should be optimized into a `memcpy` where there is no conversion needed.
pub trait FromNativeSample: Sized + Copy + Default + 'static {

/// Create this sample from a f16, trying to represent the same numerical value
@@ -122,31 +124,85 @@ pub trait FromNativeSample: Sized + Copy + Default + 'static {

/// Create this sample from a u32, trying to represent the same numerical value
fn from_u32(value: u32) -> Self;

+    /// Convert all values from the slice into this type.
+    /// This function exists to allow the compiler to perform a vectorization optimization.
+    /// Note that this default implementation will **not** be vectorized by the compiler automatically.
+    /// For maximum performance you will need to override this function and implement it via
+    /// an explicit batched conversion such as [`convert_to_f32_slice`](https://docs.rs/half/2.3.1/half/slice/trait.HalfFloatSliceExt.html#tymethod.convert_to_f32_slice)
+    #[inline]
+    fn from_f16s(from: &[f16], to: &mut [Self]) {
+        assert_eq!(from.len(), to.len(), "slices must have the same length");
+        for (from, to) in from.iter().zip(to.iter_mut()) {
+            *to = Self::from_f16(*from);
+        }
+    }
+
+    /// Convert all values from the slice into this type.
+    /// This function exists to allow the compiler to perform a vectorization optimization.
+    /// Note that this default implementation will be vectorized by the compiler automatically.
+    #[inline]
+    fn from_f32s(from: &[f32], to: &mut [Self]) {
+        assert_eq!(from.len(), to.len(), "slices must have the same length");
+        for (from, to) in from.iter().zip(to.iter_mut()) {
+            *to = Self::from_f32(*from);
+        }
+    }
+
+    /// Convert all values from the slice into this type.
+    /// This function exists to allow the compiler to perform a vectorization optimization.
+    /// Note that this default implementation will be vectorized by the compiler automatically,
+    /// provided that the CPU supports the necessary conversion instructions.
+    /// For example, x86_64 lacks the instructions to convert `u32` to floats,
+    /// so this will inevitably be slow on x86_64.
+    #[inline]
+    fn from_u32s(from: &[u32], to: &mut [Self]) {
+        assert_eq!(from.len(), to.len(), "slices must have the same length");
+        for (from, to) in from.iter().zip(to.iter_mut()) {
+            *to = Self::from_u32(*from);
+        }
+    }
}

// TODO haven't i implemented this exact behaviour already somewhere else in this library...??
impl FromNativeSample for f32 {
-    fn from_f16(value: f16) -> Self { value.to_f32() }
-    fn from_f32(value: f32) -> Self { value } // this branch means that we never have to match every single sample if the file format matches the expected output
-    fn from_u32(value: u32) -> Self { value as f32 }
+    #[inline] fn from_f16(value: f16) -> Self { value.to_f32() }
+    #[inline] fn from_f32(value: f32) -> Self { value }
+    #[inline] fn from_u32(value: u32) -> Self { value as f32 }
+
+    // f16 is a custom type
+    // so the compiler can not automatically vectorize the conversion
+    // that's why we need to specialize this function
+    #[inline]
+    fn from_f16s(from: &[f16], to: &mut [Self]) {
+        from.convert_to_f32_slice(to);
+    }
}

impl FromNativeSample for u32 {
-    fn from_f16(value: f16) -> Self { value.to_f32() as u32 }
-    fn from_f32(value: f32) -> Self { value as u32 }
-    fn from_u32(value: u32) -> Self { value }
+    #[inline] fn from_f16(value: f16) -> Self { value.to_f32() as u32 }
+    #[inline] fn from_f32(value: f32) -> Self { value as u32 }
+    #[inline] fn from_u32(value: u32) -> Self { value }
}

impl FromNativeSample for f16 {
-    fn from_f16(value: f16) -> Self { value }
-    fn from_f32(value: f32) -> Self { f16::from_f32(value) }
-    fn from_u32(value: u32) -> Self { f16::from_f32(value as f32) }
+    #[inline] fn from_f16(value: f16) -> Self { value }
+    #[inline] fn from_f32(value: f32) -> Self { f16::from_f32(value) }
+    #[inline] fn from_u32(value: u32) -> Self { f16::from_f32(value as f32) }
+
+    // f16 is a custom type
+    // so the compiler can not automatically vectorize the conversion
+    // that's why we need to specialize this function
+    #[inline]
+    fn from_f32s(from: &[f32], to: &mut [Self]) {
+        to.convert_from_f32_slice(from)
+    }
}

impl FromNativeSample for Sample {
-    fn from_f16(value: f16) -> Self { Self::from(value) }
-    fn from_f32(value: f32) -> Self { Self::from(value) }
-    fn from_u32(value: u32) -> Self { Self::from(value) }
+    #[inline] fn from_f16(value: f16) -> Self { Self::from(value) }
+    #[inline] fn from_f32(value: f32) -> Self { Self::from(value) }
+    #[inline] fn from_u32(value: u32) -> Self { Self::from(value) }
}
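
The trait documentation above recommends overriding the slice methods with an explicit batched conversion. As a minimal, self-contained illustration of that batched call (independent of this crate; it only assumes the half dependency pinned in Cargo.toml above, and the buffer contents are invented for the example):

use half::f16;
use half::prelude::HalfFloatSliceExt;

fn main() {
    // a small buffer of half-precision samples
    let halfs: Vec<f16> = (0..8).map(|i| f16::from_f32(i as f32 * 0.5)).collect();
    let mut floats = vec![0.0_f32; halfs.len()];

    // convert the whole slice in one call instead of element by element;
    // the half crate may use hardware f16 conversion instructions where available
    halfs.convert_to_f32_slice(&mut floats);

    assert_eq!(floats[4], 2.0);
}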

