Merge pull request #191 from johannesvollmer/f16_batch_conversion
batched f16 conversion
johannesvollmer authored Jul 8, 2023
2 parents 0cb958c + 85a311d commit 3e0f6cd
Showing 7 changed files with 279 additions and 76 deletions.
17 changes: 8 additions & 9 deletions .github/workflows/rust.yml
@@ -47,13 +47,13 @@ jobs:
    steps:
      - uses: actions/checkout@v2

-     - name: Install or use cached foresterre/cargo-msrv
-       uses: baptiste0928/cargo-install@v1
-       with:
-         crate: cargo-msrv
+     - name: Install foresterre/cargo-msrv without cache (takes longer, but caching produces unexpected behaviour)
+       run: cargo install cargo-msrv

-     - name: Verify the Rustc version declared in `cargo.toml`
+     - name: Verify the Rustc version declared in `cargo.toml` without cache (takes longer, but caching produces unexpected behaviour)
        run: |
          rm -f Cargo.lock
          cargo update
          cargo-msrv verify
# github actions does not support big endian systems directly, but it does support QEMU.
@@ -82,13 +82,11 @@ jobs:
        run: sudo systemctl start docker

      - name: Cross-Compile project to mips-unknown-linux-gnu
-       run: |
-         cross build --target=mips-unknown-linux-gnu --verbose
+       run: cross build --target=mips-unknown-linux-gnu --verbose

        # https://github.com/cross-rs/cross#supported-targets
      - name: Cross-Run Tests in mips-unknown-linux-gnu using Qemu
-       run: |
-         cross test --target mips-unknown-linux-gnu --verbose
+       run: cross test --target mips-unknown-linux-gnu --verbose

wasm32:
runs-on: ubuntu-latest
@@ -109,3 +107,4 @@ jobs:

- name: Run tests without default features
run: cargo test --verbose --no-default-features

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -28,7 +28,7 @@ proc-macro = false

[dependencies]
lebe = "^0.5.2" # generic binary serialization
-half = ">=2.1.0, <2.3"        # 16 bit float pixel data type
+half = ">=2.1.0, <2.3"        # 16 bit float pixel data type
bit_field = "^0.10.1" # exr file version bit flags
miniz_oxide = "^0.7.1" # zip compression for pxr24
smallvec = "^1.7.0" # make cache-friendly allocations TODO profile if smallvec is really an improvement!
126 changes: 88 additions & 38 deletions benches/pixel_format_conversion.rs
@@ -8,62 +8,112 @@ use bencher::Bencher;
use std::fs;
use std::io::Cursor;
use exr::image::pixel_vec::PixelVec;
+use exr::io::Data;
+use exr::block::samples::FromNativeSample;
+
+const F32_ZIPS_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_zips.exr";
+const F32_UNCOMPRESSED_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_uncompressed.exr";
+const F16_UNCOMPRESSED_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_uncompressed_half.exr";
+const F16_ZIP_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_zip_half.exr";

/// Read an image from an in-memory buffer into its native f32 format
-fn read_image_rgba_f32_to_f32(bench: &mut Bencher) {
-    let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
-    bencher::black_box(&mut file);
-
-    bench.iter(||{
-        let image = exr::prelude::read()
-            .no_deep_data().largest_resolution_level()
-            .rgba_channels(PixelVec::<(f32,f32,f32,f32)>::constructor, PixelVec::set_pixel)
-            .all_layers().all_attributes()
-            .non_parallel()
-            .from_buffered(Cursor::new(file.as_slice())).unwrap();
-
-        bencher::black_box(image);
-    })
-}
-
-/// Read image and convert the samples to u32 (from native f32)
-fn read_image_rgba_f32_to_u32(bench: &mut Bencher) {
-    let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
-    bencher::black_box(&mut file);
-
-    bench.iter(||{
-        let image = exr::prelude::read()
-            .no_deep_data().largest_resolution_level()
-            .rgba_channels(PixelVec::<(u32,u32,u32,u32)>::constructor, PixelVec::set_pixel)
-            .all_layers().all_attributes()
-            .non_parallel()
-            .from_buffered(Cursor::new(file.as_slice())).unwrap();
-
-        bencher::black_box(image);
-    })
-}
-
-/// f16 is not natively supported by CPUs, which introduces unique performance pitfalls
-fn read_image_rgba_f32_to_f16(bench: &mut Bencher) {
-    let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
-    bencher::black_box(&mut file);
-
-    bench.iter(||{
-        let image = exr::prelude::read()
-            .no_deep_data().largest_resolution_level()
-            .rgba_channels(PixelVec::<(f16,f16,f16,f16)>::constructor, PixelVec::set_pixel)
-            .all_layers().all_attributes()
-            .non_parallel()
-            .from_buffered(Cursor::new(file.as_slice())).unwrap();
-
-        bencher::black_box(image);
-    })
-}
+fn read_f32_as_f32_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F32_UNCOMPRESSED_PATH, false);
+}
+
+/// Read image and convert the samples to u32 (from native f32)
+fn read_f32_as_u32_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<u32>(bench, F32_UNCOMPRESSED_PATH, false);
+}
+
+/// f16 is not natively supported by CPUs, which introduces unique performance pitfalls
+fn read_f32_as_f16_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F32_UNCOMPRESSED_PATH, false);
+}
+
+fn read_f16_as_f16_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F16_UNCOMPRESSED_PATH, false);
+}
+
+fn read_f16_as_f32_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F16_UNCOMPRESSED_PATH, false);
+}
+
+fn read_f16_as_u32_uncompressed_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<u32>(bench, F16_UNCOMPRESSED_PATH, false);
+}
+
+fn read_f32_as_f16_zips_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F32_ZIPS_PATH, false);
+}
+
+fn read_f16_as_f32_zip_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F16_ZIP_PATH, false);
+}
+
+fn read_f32_as_f16_zips_nthreads(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F32_ZIPS_PATH, true);
+}
+
+fn read_f16_as_f32_zip_nthreads(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F16_ZIP_PATH, true);
+}
+
+fn read_f32_as_f32_zips_nthreads(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F32_ZIPS_PATH, true);
+}
+
+fn read_f16_as_f16_zip_nthreads(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F16_ZIP_PATH, true);
+}
+
+fn read_f32_as_f32_zips_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f32>(bench, F32_ZIPS_PATH, false);
+}
+
+fn read_f16_as_f16_zip_1thread(bench: &mut Bencher) {
+    bench_read_image_rgba_as::<f16>(bench, F16_ZIP_PATH, false);
+}
+
+fn bench_read_image_rgba_as<T>(bench: &mut Bencher, path: &str, parallel: bool) {
+    let mut file = fs::read(path).unwrap();
+    bencher::black_box(&mut file);
+
+    bench.iter(||{
+        let image = read_file_from_memory_as::<f16>(file.as_slice(), parallel);
+        bencher::black_box(image);
+    })
+}
+
+fn read_file_from_memory_as<T>(file: &[u8], parallel: bool) -> RgbaImage<PixelVec<(T, T, T, T)>>
+    where T: FromNativeSample
+{
+    let read = exr::prelude::read()
+        .no_deep_data().largest_resolution_level()
+        .rgba_channels(PixelVec::<(T, T, T, T)>::constructor, PixelVec::set_pixel)
+        .first_valid_layer().all_attributes();
+
+    let read = if parallel { read } else { read.non_parallel() };
+    read.from_buffered(Cursor::new(file)).unwrap()
+}

benchmark_group!(pixel_format_conversion,
-    read_image_rgba_f32_to_f32,
-    read_image_rgba_f32_to_u32,
-    read_image_rgba_f32_to_f16,
+    read_f32_as_f32_uncompressed_1thread,
+    read_f32_as_u32_uncompressed_1thread,
+    read_f32_as_f16_uncompressed_1thread,
+    read_f32_as_f16_zips_1thread,
+    read_f32_as_f16_zips_nthreads,
+    read_f32_as_f32_zips_nthreads,
+    read_f32_as_f32_zips_1thread,
+
+    read_f16_as_f16_uncompressed_1thread,
+    read_f16_as_u32_uncompressed_1thread,
+    read_f16_as_f32_uncompressed_1thread,
+    read_f16_as_f32_zip_1thread,
+    read_f16_as_f32_zip_nthreads,
+    read_f16_as_f16_zip_nthreads,
+    read_f16_as_f16_zip_1thread,
);

benchmark_main!(pixel_format_conversion);
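
For orientation: the reader pipeline wrapped by read_file_from_memory_as above can also be used on its own. The following is a hedged sketch, not part of this diff; it only reuses the exr::prelude calls and the PixelVec constructor already shown in this file, plus one of the test-image paths from the constants above. It decodes an f16 file while requesting f32 pixels, which is the conversion these benchmarks measure.

use std::fs;
use std::io::Cursor;
use exr::prelude::*;
use exr::image::pixel_vec::PixelVec;

fn main() {
    // decode an f16 test image while converting every sample to f32 during reading
    let file = fs::read("tests/images/valid/custom/crowskull/crow_zip_half.exr").unwrap();

    let image = read()
        .no_deep_data().largest_resolution_level()
        .rgba_channels(PixelVec::<(f32, f32, f32, f32)>::constructor, PixelVec::set_pixel)
        .first_valid_layer().all_attributes()
        .from_buffered(Cursor::new(file.as_slice())).unwrap();

    // keep the decoded image alive so the decode is not optimized away
    std::hint::black_box(&image);
    println!("decoded crow_zip_half.exr");
}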
80 changes: 68 additions & 12 deletions src/block/samples.rs
@@ -1,6 +1,7 @@
//! Extract pixel samples from a block of pixel bytes.
use crate::prelude::*;
+use half::prelude::HalfFloatSliceExt;


/// A single red, green, blue, or alpha value.
@@ -112,6 +113,7 @@ impl From<Sample> for u32 { #[inline] fn from(s: Sample) -> Self { s.to_u32() }

/// Create an arbitrary sample type from one of the defined sample types.
/// Should be compiled to a no-op where the file contains the predicted sample type.
+/// The slice functions should be optimized into a `memcpy` where there is no conversion needed.
pub trait FromNativeSample: Sized + Copy + Default + 'static {

/// Create this sample from a f16, trying to represent the same numerical value
@@ -122,31 +124,85 @@ pub trait FromNativeSample: Sized + Copy + Default + 'static {

/// Create this sample from a u32, trying to represent the same numerical value
fn from_u32(value: u32) -> Self;

+    /// Convert all values from the slice into this type.
+    /// This function exists to allow the compiler to perform a vectorization optimization.
+    /// Note that this default implementation will **not** be vectorized by the compiler automatically.
+    /// For maximum performance you will need to override this function and implement it via
+    /// an explicit batched conversion such as [`convert_to_f32_slice`](https://docs.rs/half/2.3.1/half/slice/trait.HalfFloatSliceExt.html#tymethod.convert_to_f32_slice)
+    #[inline]
+    fn from_f16s(from: &[f16], to: &mut [Self]) {
+        assert_eq!(from.len(), to.len(), "slices must have the same length");
+        for (from, to) in from.iter().zip(to.iter_mut()) {
+            *to = Self::from_f16(*from);
+        }
+    }
+
+    /// Convert all values from the slice into this type.
+    /// This function exists to allow the compiler to perform a vectorization optimization.
+    /// Note that this default implementation will be vectorized by the compiler automatically.
+    #[inline]
+    fn from_f32s(from: &[f32], to: &mut [Self]) {
+        assert_eq!(from.len(), to.len(), "slices must have the same length");
+        for (from, to) in from.iter().zip(to.iter_mut()) {
+            *to = Self::from_f32(*from);
+        }
+    }
+
+    /// Convert all values from the slice into this type.
+    /// This function exists to allow the compiler to perform a vectorization optimization.
+    /// Note that this default implementation will be vectorized by the compiler automatically,
+    /// provided that the CPU supports the necessary conversion instructions.
+    /// For example, x86_64 lacks the instructions to convert `u32` to floats,
+    /// so this will inevitably be slow on x86_64.
+    #[inline]
+    fn from_u32s(from: &[u32], to: &mut [Self]) {
+        assert_eq!(from.len(), to.len(), "slices must have the same length");
+        for (from, to) in from.iter().zip(to.iter_mut()) {
+            *to = Self::from_u32(*from);
+        }
+    }
}

// TODO haven't i implemented this exact behaviour already somewhere else in this library...??
impl FromNativeSample for f32 {
-    fn from_f16(value: f16) -> Self { value.to_f32() }
-    fn from_f32(value: f32) -> Self { value } // this branch means that we never have to match every single sample if the file format matches the expected output
-    fn from_u32(value: u32) -> Self { value as f32 }
+    #[inline] fn from_f16(value: f16) -> Self { value.to_f32() }
+    #[inline] fn from_f32(value: f32) -> Self { value }
+    #[inline] fn from_u32(value: u32) -> Self { value as f32 }
+
+    // f16 is a custom type
+    // so the compiler can not automatically vectorize the conversion
+    // that's why we need to specialize this function
+    #[inline]
+    fn from_f16s(from: &[f16], to: &mut [Self]) {
+        from.convert_to_f32_slice(to);
+    }
}

impl FromNativeSample for u32 {
-    fn from_f16(value: f16) -> Self { value.to_f32() as u32 }
-    fn from_f32(value: f32) -> Self { value as u32 }
-    fn from_u32(value: u32) -> Self { value }
+    #[inline] fn from_f16(value: f16) -> Self { value.to_f32() as u32 }
+    #[inline] fn from_f32(value: f32) -> Self { value as u32 }
+    #[inline] fn from_u32(value: u32) -> Self { value }
}

impl FromNativeSample for f16 {
-    fn from_f16(value: f16) -> Self { value }
-    fn from_f32(value: f32) -> Self { f16::from_f32(value) }
-    fn from_u32(value: u32) -> Self { f16::from_f32(value as f32) }
+    #[inline] fn from_f16(value: f16) -> Self { value }
+    #[inline] fn from_f32(value: f32) -> Self { f16::from_f32(value) }
+    #[inline] fn from_u32(value: u32) -> Self { f16::from_f32(value as f32) }
+
+    // f16 is a custom type
+    // so the compiler can not automatically vectorize the conversion
+    // that's why we need to specialize this function
+    #[inline]
+    fn from_f32s(from: &[f32], to: &mut [Self]) {
+        to.convert_from_f32_slice(from)
+    }
}

impl FromNativeSample for Sample {
-    fn from_f16(value: f16) -> Self { Self::from(value) }
-    fn from_f32(value: f32) -> Self { Self::from(value) }
-    fn from_u32(value: u32) -> Self { Self::from(value) }
+    #[inline] fn from_f16(value: f16) -> Self { Self::from(value) }
+    #[inline] fn from_f32(value: f32) -> Self { Self::from(value) }
+    #[inline] fn from_u32(value: u32) -> Self { Self::from(value) }
}
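
The trait documentation above recommends overriding the slice methods with an explicit batched conversion. As a minimal, self-contained illustration of that batched call (independent of this crate; it only assumes the half dependency pinned in Cargo.toml above, and the buffer contents are invented for the example):

use half::f16;
use half::prelude::HalfFloatSliceExt;

fn main() {
    // a small buffer of half-precision samples
    let halfs: Vec<f16> = (0..8).map(|i| f16::from_f32(i as f32 * 0.5)).collect();
    let mut floats = vec![0.0_f32; halfs.len()];

    // convert the whole slice in one call instead of element by element;
    // the half crate may use hardware f16 conversion instructions where available
    halfs.convert_to_f32_slice(&mut floats);

    assert_eq!(floats[4], 2.0);
}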

