Skip to content

Commit

Permalink
Add new pack-peel pipeline with 4 tiling levels
Browse files Browse the repository at this point in the history
  • Loading branch information
jtuyls committed Jan 10, 2025
1 parent acca626 commit abf2de4
Show file tree
Hide file tree
Showing 13 changed files with 537 additions and 66 deletions.
69 changes: 66 additions & 3 deletions build_tools/ci/cpu_comparison/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,7 @@ def __init__(
name_suffix="",
use_ukernel=False,
run_on_target=["npu1_4col"],
tile_pipeline="pack-peel",
additional_labels=None,
aie_compilation_flags=None,
n_repeats=1,
Expand All @@ -380,7 +381,7 @@ def __init__(
K=K,
input_type=input_type,
acc_type=acc_type,
tile_pipeline="pack-peel",
tile_pipeline=tile_pipeline,
use_ukernel=use_ukernel,
n_repeats=n_repeats,
n_kernel_runs=n_kernel_runs,
Expand Down Expand Up @@ -417,6 +418,7 @@ def __init__(
name_suffix="",
use_ukernel=False,
run_on_target=["npu1_4col"],
tile_pipeline="pack-peel",
additional_labels=None,
aie_compilation_flags=None,
n_repeats=1,
Expand All @@ -429,6 +431,7 @@ def __init__(
K=K,
input_type=input_type,
acc_type=acc_type,
tile_pipeline=tile_pipeline,
use_ukernel=use_ukernel,
function_name="matmul_transpose_b",
n_repeats=n_repeats,
Expand Down Expand Up @@ -471,6 +474,7 @@ def __init__(
name_suffix="",
use_ukernel=False,
run_on_target=["npu1_4col"],
tile_pipeline="pack-peel",
additional_labels=None,
aie_compilation_flags=None,
n_repeats=1,
Expand All @@ -488,7 +492,7 @@ def __init__(
K=K,
input_type=input_type,
acc_type=acc_type,
tile_pipeline="pack-peel",
tile_pipeline=tile_pipeline,
use_ukernel=use_ukernel,
n_repeats=n_repeats,
n_kernel_runs=n_kernel_runs,
Expand Down Expand Up @@ -526,6 +530,7 @@ def __init__(
name_suffix="",
use_ukernel=False,
run_on_target=["npu1_4col"],
tile_pipeline="pack-peel",
additional_labels=None,
aie_compilation_flags=None,
n_repeats=1,
Expand All @@ -538,6 +543,7 @@ def __init__(
K=K,
input_type=input_type,
acc_type=acc_type,
tile_pipeline=tile_pipeline,
use_ukernel=use_ukernel,
function_name="matmul_transpose_a",
n_repeats=n_repeats,
Expand Down Expand Up @@ -580,6 +586,7 @@ def __init__(
name_suffix="",
use_ukernel=False,
run_on_target=["npu1_4col"],
tile_pipeline="pack-peel",
additional_labels=None,
aie_compilation_flags=None,
n_repeats=1,
Expand All @@ -597,7 +604,7 @@ def __init__(
K=K,
input_type=input_type,
acc_type=acc_type,
tile_pipeline="pack-peel",
tile_pipeline=tile_pipeline,
use_ukernel=use_ukernel,
n_repeats=n_repeats,
n_kernel_runs=n_kernel_runs,
Expand Down Expand Up @@ -1734,6 +1741,7 @@ def __init__(self):
"outline": False,
"transpose_a": False,
"transpose_b": False,
"tile_pipeline": "pack-peel",
},
{
"M": 512,
Expand All @@ -1744,6 +1752,7 @@ def __init__(self):
"outline": True,
"transpose_a": False,
"transpose_b": False,
"tile_pipeline": "pack-peel",
},
{
"M": 512,
Expand All @@ -1754,6 +1763,7 @@ def __init__(self):
"outline": False,
"transpose_a": False,
"transpose_b": False,
"tile_pipeline": "pack-peel",
},
{
"M": 512,
Expand All @@ -1764,6 +1774,7 @@ def __init__(self):
"outline": True,
"transpose_a": False,
"transpose_b": False,
"tile_pipeline": "pack-peel",
},
{
"M": 512,
Expand All @@ -1774,6 +1785,7 @@ def __init__(self):
"outline": True,
"transpose_a": False,
"transpose_b": False,
"tile_pipeline": "pack-peel",
},
{
"M": 512,
Expand All @@ -1784,6 +1796,7 @@ def __init__(self):
"outline": True,
"transpose_a": False,
"transpose_b": False,
"tile_pipeline": "pack-peel",
},
{
"M": 512,
Expand All @@ -1794,6 +1807,7 @@ def __init__(self):
"outline": True,
"transpose_a": False,
"transpose_b": False,
"tile_pipeline": "pack-peel",
},
{
"M": 512,
Expand All @@ -1804,6 +1818,7 @@ def __init__(self):
"outline": True,
"transpose_a": False,
"transpose_b": True,
"tile_pipeline": "pack-peel",
},
{
"M": 4096,
Expand All @@ -1814,6 +1829,7 @@ def __init__(self):
"outline": True,
"transpose_a": False,
"transpose_b": False,
"tile_pipeline": "pack-peel",
},
{
"M": 4096,
Expand All @@ -1824,6 +1840,7 @@ def __init__(self):
"outline": True,
"transpose_a": False,
"transpose_b": False,
"tile_pipeline": "pack-peel",
},
{
"M": 4096,
Expand All @@ -1834,6 +1851,7 @@ def __init__(self):
"outline": True,
"transpose_a": True,
"transpose_b": False,
"tile_pipeline": "pack-peel",
},
# Test where the compute is omitted; this should help triangulate
# how much performance gain can be obtained with better matmul
Expand All @@ -1849,6 +1867,45 @@ def __init__(self):
"transpose_a": False,
"transpose_b": False,
"skip_numerics": True,
"tile_pipeline": "pack-peel",
},
{
"M": 512,
"N": 4096,
"K": 512,
"use_ukernel": False,
"peano_opt_level": 3,
"outline": True,
"transpose_a": False,
"transpose_b": False,
"tile_pipeline": "pack-peel-4-level-tiling",
},
{
"M": 512,
"N": 4096,
"K": 512,
"use_ukernel": True,
"peano_opt_level": 3,
"outline": True,
"transpose_a": False,
"transpose_b": False,
"tile_pipeline": "pack-peel-4-level-tiling",
},
# Test where the compute is omitted; this should help triangulate
# how much performance gain can be obtained with a better matmul
# on core vs. better data movement.
{
"M": 512,
"N": 4096,
"K": 512,
"use_ukernel": False,
"peano_opt_level": 3,
"outline": True,
"outline_to_empty_function": True,
"transpose_a": False,
"transpose_b": False,
"skip_numerics": True,
"tile_pipeline": "pack-peel-4-level-tiling",
},
]

Expand All @@ -1862,6 +1919,7 @@ def __init__(self):
outline = test["outline"]
transpose_a = test["transpose_a"]
transpose_b = test["transpose_b"]
tile_pipeline = test["tile_pipeline"]

outlining_string = "--iree-amdaie-enable-function-outlining=" + str(
int(outline)
Expand Down Expand Up @@ -1902,6 +1960,9 @@ def __init__(self):
else:
raise ValueError("Transposing both LHS and RHS is not supported.")

if tile_pipeline == "pack-peel-4-level-tiling":
name_suffix += "_4_level_tiling"

# This should only be the case for benchmark tests which we expect
# to not pass numerically.
if "skip_numerics" in test and test["skip_numerics"]:
Expand All @@ -1914,6 +1975,7 @@ def __init__(self):
K,
"bf16",
"f32",
tile_pipeline=tile_pipeline,
use_ukernel=use_ukernel,
n_repeats=2,
aie_compilation_flags=aie_compilation_flags,
Expand All @@ -1929,6 +1991,7 @@ def __init__(self):
K,
"bf16",
"f32",
tile_pipeline=tile_pipeline,
additional_labels=["Performance"],
use_ukernel=use_ukernel,
n_repeats=5,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,11 @@ struct AMDAIEOptions {
clEnumValN(TilePassPipeline::PackPeelPipeline, "pack-peel",
"Use the pack-peel based lowering strategy for "
"matmul-like ops"),
clEnumValN(TilePassPipeline::PackPeel4LevelTilingPipeline,
"pack-peel-4-level-tiling",
"Use the pack-peel based lowering strategy with 4 "
"levels of tiling for "
"matmul-like ops"),
clEnumValN(
TilePassPipeline::PadPackPipeline, "pad-pack",
"Use the pad-pack based lowering strategy for matmul-like ops"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ class AMDAIELowerExecutableTargetPass
}

AMDAIELowerExecutableTargetPass() = default;
AMDAIELowerExecutableTargetPass(const AMDAIELowerExecutableTargetPass &pass) {
};
AMDAIELowerExecutableTargetPass(
const AMDAIELowerExecutableTargetPass &pass){};
AMDAIELowerExecutableTargetPass(
const AMDAIELowerExecutableTargetOptions &options)
: AMDAIELowerExecutableTargetBase(options) {}
Expand Down Expand Up @@ -111,6 +111,12 @@ void AMDAIELowerExecutableTargetPass::runOnOperation() {
addPackPeelBasedPassPipeline(executableLoweringPipeline, tilingConfig,
pathToUkernels, enableVectorizationPasses,
TilePassPipeline::PackPeelPipeline);
} else if (useTilePipeline ==
TilePassPipeline::PackPeel4LevelTilingPipeline) {
addPackPeel4LevelTilingBasedPassPipeline(
executableLoweringPipeline, tilingConfig, pathToUkernels,
enableVectorizationPasses,
TilePassPipeline::PackPeel4LevelTilingPipeline);
} else if (useTilePipeline == TilePassPipeline::PadPackPipeline) {
addPadPackBasedPassPipeline(executableLoweringPipeline, tilingConfig,
pathToUkernels, enableVectorizationPasses,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -400,10 +400,14 @@ void AMDAIETileAndFusePass::runOnOperation() {
"expected to be an scf.forall operation.");
signalPassFailure();
}
auto groupType =
tilingLevel == 0 ? GPUGroupType::Block : GPUGroupType::Thread;
if (failed(setGpuAttributeOnForall(groupType, loopForAll, consumerOp))) {
return signalPassFailure();
if (hardwareMapping == HardwareMapping::Core ||
hardwareMapping == HardwareMapping::Block) {
auto groupType = hardwareMapping == HardwareMapping::Core
? GPUGroupType::Thread
: GPUGroupType::Block;
if (failed(setGpuAttributeOnForall(groupType, loopForAll, consumerOp))) {
return signalPassFailure();
}
}
}
}
Expand Down
Loading

0 comments on commit abf2de4

Please sign in to comment.