From f7ff7d48f17838a32af6848efaa6cfeee47ad0a6 Mon Sep 17 00:00:00 2001 From: Christian Legnitto Date: Wed, 20 Nov 2024 19:38:44 -0400 Subject: [PATCH] More changes to matmul blog * Add header for dispatch * Move to 16,16 workgroups everywhere 2D * Make it clear dispatch count it running on CPU not dispatches are for CPU --- .../code/crates/cpu/matmul/src/variants.rs | 2 +- .../code/crates/gpu/workgroup_2d/src/lib.rs | 2 +- blog/2024-11-21-optimizing-matrix-mul/index.md | 7 +++++-- blog/2024-11-21-optimizing-matrix-mul/snippets/naive.tsx | 4 ++-- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/variants.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/variants.rs index 357d73b..79058ab 100644 --- a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/variants.rs +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/variants.rs @@ -81,7 +81,7 @@ impl Gpu for Workgroup2d { impl GridComputation for Workgroup2d { fn workgroup(&self) -> UVec3 { - UVec3::new(8, 8, 1) + UVec3::new(16, 16, 1) } fn dispatch_count(&self, m: u32, n: u32) -> UVec3 { diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_2d/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_2d/src/lib.rs index 54e5021..9faafdd 100644 --- a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_2d/src/lib.rs +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_2d/src/lib.rs @@ -4,7 +4,7 @@ use settings::Dimensions; use spirv_std::glam::UVec3; use spirv_std::spirv; -#[spirv(compute(threads(8, 8)))] +#[spirv(compute(threads(16, 16)))] pub fn matmul( #[spirv(global_invocation_id)] global_id: UVec3, #[spirv(uniform, descriptor_set = 0, binding = 0)] dimensions: &Dimensions, diff --git a/blog/2024-11-21-optimizing-matrix-mul/index.md b/blog/2024-11-21-optimizing-matrix-mul/index.md index 06d81b9..e1c7112 100644 --- a/blog/2024-11-21-optimizing-matrix-mul/index.md +++ b/blog/2024-11-21-optimizing-matrix-mul/index.md @@ -175,7 +175,10 @@ examples. ::: -Each workgroup, since it’s only one thread, processes one `result[i, j]`. +#### Dispatching workgroups + +Each workgroup, since it’s only one thread (`#[spirv(compute(threads(1)))]`), processes +one `result[i, j]`. To calculate the full matrix, we need to launch as many entries as there are in the matrix. Here we specify that (`Uvec3::new(m * n, 1, 1`) on the CPU: @@ -241,7 +244,7 @@ Although we don't change much about our code, if we distribute our work in 2 dim we're able to bypass these limits and launch more workgroups that are larger. This allows us to calculate a 4096x4096 matmul. -We update our `compute(threads(256)))` to `compute(threads((8, 8)))`, and make the small +We update our `compute(threads(256)))` to `compute(threads((16, 16)))`, and make the small change to `row` and `col` from Zach's post to increase speed: import { RustWorkgroup2d } from './snippets/workgroup_2d.tsx'; diff --git a/blog/2024-11-21-optimizing-matrix-mul/snippets/naive.tsx b/blog/2024-11-21-optimizing-matrix-mul/snippets/naive.tsx index 4dad47d..d637a49 100644 --- a/blog/2024-11-21-optimizing-matrix-mul/snippets/naive.tsx +++ b/blog/2024-11-21-optimizing-matrix-mul/snippets/naive.tsx @@ -59,7 +59,7 @@ export const RustNaiveWorkgroupCount: React.FC = () => ( language="rust" className="text-xs" lines="26-34" - title="Calculating how many workgroup dispatches are needed on the CPU" + title="Calculating on the CPU how many workgroup dispatches are needed" > {RustWorkgroupCount} @@ -71,7 +71,7 @@ export const RustNaiveDispatch: React.FC = () => ( className="text-xs" lines="145,147" strip_leading_spaces - title="Using wgpu on the CPU to dispatch to the GPU" + title="Using wgpu on the CPU to dispatch workgroups to the GPU" > {RustWgpuBackend}