Add new pack-peel pipeline with 4 tiling levels

nod-ai · Jan 10, 2025 · abf2de4 · abf2de4
1 parent acca626
commit abf2de4
Show file tree

Hide file tree

Showing 13 changed files with 537 additions and 66 deletions.
diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
@@ -363,6 +363,7 @@ def __init__(
         name_suffix="",
         use_ukernel=False,
         run_on_target=["npu1_4col"],
+        tile_pipeline="pack-peel",
         additional_labels=None,
         aie_compilation_flags=None,
         n_repeats=1,
@@ -380,7 +381,7 @@ def __init__(
             K=K,
             input_type=input_type,
             acc_type=acc_type,
-            tile_pipeline="pack-peel",
+            tile_pipeline=tile_pipeline,
             use_ukernel=use_ukernel,
             n_repeats=n_repeats,
             n_kernel_runs=n_kernel_runs,
@@ -417,6 +418,7 @@ def __init__(
         name_suffix="",
         use_ukernel=False,
         run_on_target=["npu1_4col"],
+        tile_pipeline="pack-peel",
         additional_labels=None,
         aie_compilation_flags=None,
         n_repeats=1,
@@ -429,6 +431,7 @@ def __init__(
             K=K,
             input_type=input_type,
             acc_type=acc_type,
+            tile_pipeline=tile_pipeline,
             use_ukernel=use_ukernel,
             function_name="matmul_transpose_b",
             n_repeats=n_repeats,
@@ -471,6 +474,7 @@ def __init__(
         name_suffix="",
         use_ukernel=False,
         run_on_target=["npu1_4col"],
+        tile_pipeline="pack-peel",
         additional_labels=None,
         aie_compilation_flags=None,
         n_repeats=1,
@@ -488,7 +492,7 @@ def __init__(
             K=K,
             input_type=input_type,
             acc_type=acc_type,
-            tile_pipeline="pack-peel",
+            tile_pipeline=tile_pipeline,
             use_ukernel=use_ukernel,
             n_repeats=n_repeats,
             n_kernel_runs=n_kernel_runs,
@@ -526,6 +530,7 @@ def __init__(
         name_suffix="",
         use_ukernel=False,
         run_on_target=["npu1_4col"],
+        tile_pipeline="pack-peel",
         additional_labels=None,
         aie_compilation_flags=None,
         n_repeats=1,
@@ -538,6 +543,7 @@ def __init__(
             K=K,
             input_type=input_type,
             acc_type=acc_type,
+            tile_pipeline=tile_pipeline,
             use_ukernel=use_ukernel,
             function_name="matmul_transpose_a",
             n_repeats=n_repeats,
@@ -580,6 +586,7 @@ def __init__(
         name_suffix="",
         use_ukernel=False,
         run_on_target=["npu1_4col"],
+        tile_pipeline="pack-peel",
         additional_labels=None,
         aie_compilation_flags=None,
         n_repeats=1,
@@ -597,7 +604,7 @@ def __init__(
             K=K,
             input_type=input_type,
             acc_type=acc_type,
-            tile_pipeline="pack-peel",
+            tile_pipeline=tile_pipeline,
             use_ukernel=use_ukernel,
             n_repeats=n_repeats,
             n_kernel_runs=n_kernel_runs,
@@ -1734,6 +1741,7 @@ def __init__(self):
                 "outline": False,
                 "transpose_a": False,
                 "transpose_b": False,
+                "tile_pipeline": "pack-peel",
             },
             {
                 "M": 512,
@@ -1744,6 +1752,7 @@ def __init__(self):
                 "outline": True,
                 "transpose_a": False,
                 "transpose_b": False,
+                "tile_pipeline": "pack-peel",
             },
             {
                 "M": 512,
@@ -1754,6 +1763,7 @@ def __init__(self):
                 "outline": False,
                 "transpose_a": False,
                 "transpose_b": False,
+                "tile_pipeline": "pack-peel",
             },
             {
                 "M": 512,
@@ -1764,6 +1774,7 @@ def __init__(self):
                 "outline": True,
                 "transpose_a": False,
                 "transpose_b": False,
+                "tile_pipeline": "pack-peel",
             },
             {
                 "M": 512,
@@ -1774,6 +1785,7 @@ def __init__(self):
                 "outline": True,
                 "transpose_a": False,
                 "transpose_b": False,
+                "tile_pipeline": "pack-peel",
             },
             {
                 "M": 512,
@@ -1784,6 +1796,7 @@ def __init__(self):
                 "outline": True,
                 "transpose_a": False,
                 "transpose_b": False,
+                "tile_pipeline": "pack-peel",
             },
             {
                 "M": 512,
@@ -1794,6 +1807,7 @@ def __init__(self):
                 "outline": True,
                 "transpose_a": False,
                 "transpose_b": False,
+                "tile_pipeline": "pack-peel",
             },
             {
                 "M": 512,
@@ -1804,6 +1818,7 @@ def __init__(self):
                 "outline": True,
                 "transpose_a": False,
                 "transpose_b": True,
+                "tile_pipeline": "pack-peel",
             },
             {
                 "M": 4096,
@@ -1814,6 +1829,7 @@ def __init__(self):
                 "outline": True,
                 "transpose_a": False,
                 "transpose_b": False,
+                "tile_pipeline": "pack-peel",
             },
             {
                 "M": 4096,
@@ -1824,6 +1840,7 @@ def __init__(self):
                 "outline": True,
                 "transpose_a": False,
                 "transpose_b": False,
+                "tile_pipeline": "pack-peel",
             },
             {
                 "M": 4096,
@@ -1834,6 +1851,7 @@ def __init__(self):
                 "outline": True,
                 "transpose_a": True,
                 "transpose_b": False,
+                "tile_pipeline": "pack-peel",
             },
             # Test where the compute is omitted, this should help triangulate
             # how much performance gain can be obtained with better matmul
@@ -1849,6 +1867,45 @@ def __init__(self):
                 "transpose_a": False,
                 "transpose_b": False,
                 "skip_numerics": True,
+                "tile_pipeline": "pack-peel",
+            },
+            {
+                "M": 512,
+                "N": 4096,
+                "K": 512,
+                "use_ukernel": False,
+                "peano_opt_level": 3,
+                "outline": True,
+                "transpose_a": False,
+                "transpose_b": False,
+                "tile_pipeline": "pack-peel-4-level-tiling",
+            },
+            {
+                "M": 512,
+                "N": 4096,
+                "K": 512,
+                "use_ukernel": True,
+                "peano_opt_level": 3,
+                "outline": True,
+                "transpose_a": False,
+                "transpose_b": False,
+                "tile_pipeline": "pack-peel-4-level-tiling",
+            },
+            # Test where the compute is omitted, this should help triangulate
+            # how much performance gain can be obtained with better matmul
+            # on core vs data movement.
+            {
+                "M": 512,
+                "N": 4096,
+                "K": 512,
+                "use_ukernel": False,
+                "peano_opt_level": 3,
+                "outline": True,
+                "outline_to_empty_function": True,
+                "transpose_a": False,
+                "transpose_b": False,
+                "skip_numerics": True,
+                "tile_pipeline": "pack-peel-4-level-tiling",
             },
         ]
 
@@ -1862,6 +1919,7 @@ def __init__(self):
             outline = test["outline"]
             transpose_a = test["transpose_a"]
             transpose_b = test["transpose_b"]
+            tile_pipeline = test["tile_pipeline"]
 
             outlining_string = "--iree-amdaie-enable-function-outlining=" + str(
                 int(outline)
@@ -1902,6 +1960,9 @@ def __init__(self):
             else:
                 raise ValueError("Transposing both LHS and RHS is not supported.")
 
+            if tile_pipeline == "pack-peel-4-level-tiling":
+                name_suffix += "_4_level_tiling"
+
             # This should only be the case for benchmark tests which we expect
             # to not pass numerically.
             if "skip_numerics" in test and test["skip_numerics"]:
@@ -1914,6 +1975,7 @@ def __init__(self):
                         K,
                         "bf16",
                         "f32",
+                        tile_pipeline=tile_pipeline,
                         use_ukernel=use_ukernel,
                         n_repeats=2,
                         aie_compilation_flags=aie_compilation_flags,
@@ -1929,6 +1991,7 @@ def __init__(self):
                     K,
                     "bf16",
                     "f32",
+                    tile_pipeline=tile_pipeline,
                     additional_labels=["Performance"],
                     use_ukernel=use_ukernel,
                     n_repeats=5,

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h
@@ -153,6 +153,11 @@ struct AMDAIEOptions {
             clEnumValN(TilePassPipeline::PackPeelPipeline, "pack-peel",
                        "Use the pack-peel based lowering strategy for "
                        "matmul-like ops"),
+            clEnumValN(TilePassPipeline::PackPeel4LevelTilingPipeline,
+                       "pack-peel-4-level-tiling",
+                       "Use the pack-peel based lowering strategy with 4 "
+                       "levels of tiling for "
+                       "matmul-like ops"),
             clEnumValN(
                 TilePassPipeline::PadPackPipeline, "pad-pack",
                 "Use the pad-pack based lowering strategy for matmul-like ops"),

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp
@@ -53,8 +53,8 @@ class AMDAIELowerExecutableTargetPass
   }
 
   AMDAIELowerExecutableTargetPass() = default;
-  AMDAIELowerExecutableTargetPass(const AMDAIELowerExecutableTargetPass &pass) {
-  };
+  AMDAIELowerExecutableTargetPass(
+      const AMDAIELowerExecutableTargetPass &pass){};
   AMDAIELowerExecutableTargetPass(
       const AMDAIELowerExecutableTargetOptions &options)
       : AMDAIELowerExecutableTargetBase(options) {}
@@ -111,6 +111,12 @@ void AMDAIELowerExecutableTargetPass::runOnOperation() {
         addPackPeelBasedPassPipeline(executableLoweringPipeline, tilingConfig,
                                      pathToUkernels, enableVectorizationPasses,
                                      TilePassPipeline::PackPeelPipeline);
+      } else if (useTilePipeline ==
+                 TilePassPipeline::PackPeel4LevelTilingPipeline) {
+        addPackPeel4LevelTilingBasedPassPipeline(
+            executableLoweringPipeline, tilingConfig, pathToUkernels,
+            enableVectorizationPasses,
+            TilePassPipeline::PackPeel4LevelTilingPipeline);
       } else if (useTilePipeline == TilePassPipeline::PadPackPipeline) {
         addPadPackBasedPassPipeline(executableLoweringPipeline, tilingConfig,
                                     pathToUkernels, enableVectorizationPasses,

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp
@@ -400,10 +400,14 @@ void AMDAIETileAndFusePass::runOnOperation() {
           "expected to be an scf.forall operation.");
       signalPassFailure();
     }
-    auto groupType =
-        tilingLevel == 0 ? GPUGroupType::Block : GPUGroupType::Thread;
-    if (failed(setGpuAttributeOnForall(groupType, loopForAll, consumerOp))) {
-      return signalPassFailure();
+    if (hardwareMapping == HardwareMapping::Core ||
+        hardwareMapping == HardwareMapping::Block) {
+      auto groupType = hardwareMapping == HardwareMapping::Core
+                           ? GPUGroupType::Thread
+                           : GPUGroupType::Block;
+      if (failed(setGpuAttributeOnForall(groupType, loopForAll, consumerOp))) {
+        return signalPassFailure();
+      }
     }
   }
 }