From 4f0caea3e468c57389ad841762d92e4d0c211253 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Wed, 11 Sep 2024 08:09:13 +0000 Subject: [PATCH] Assign correct tiles to reusable L1 buffer --- .../AMDAIELogicalObjFifoSplittingUtils.cpp | 99 +++++++++++++------ ..._logicalobjfifos_for_connection_reuse.mlir | 8 +- 2 files changed, 71 insertions(+), 36 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp index 299c0fccc..ee09c56de 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp @@ -834,18 +834,18 @@ LogicalResult combineLogicalObjectFifos( // will make an attempt to combine the logical objectFifos as per the // following algorithm :- // a. Combine i-th and i+1-th L3->L2 DmaCpyNd ops. - // b. Since step a would create a new L2 buffer (with combined shape), we - // will - // need to update the corresponding two L2->L1 Dma ops by indeed creating - // new ones. NOTE: Both of these new L2->L1 Dma ops will be reusing the - // same L1 buffers as well. - // c. Now pick the unique core ops corresponding to i-th and i+1-th L2->L1 - // Dma - // ops and do the following :- + // b. Form reusable L1 buffer by assigning the cumulative tiles of the + // intended core ops. + // c. Since step a would create a new L2 buffer (with combined shape), we + // will need to update the corresponding two L2->L1 Dma ops by indeed + // creating new ones. NOTE: Both of these new L2->L1 Dma ops will be + // reusing the same L1 buffers as well. + // d. Now pick the unique core ops corresponding to i-th and i+1-th L2->L1 + // Dma ops and do the following :- // 1. For i-th CoreOp insert an AccessOp from the same L1 buffer towards - // the end. + // the end. // 2. For i+1-th CoreOp insert an AccessOp from the same L1 buffer right - // before the corresponding AccessOp within the same CoreOp. + // before the corresponding AccessOp within the same CoreOp. for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i += 2) { // Step 1. Combine the picked L3->L2 DmaCpyNd pair. FailureOr maybeNewL2ObjectFifo = @@ -855,14 +855,56 @@ LogicalResult combineLogicalObjectFifos( LogicalObjectFifoFromMemrefOp newL2ObjectFifo = maybeNewL2ObjectFifo.value(); - // Step 2. We now have need to create two L2->L1 ops since the size has + // Step 2. Form the reusable L1 buffer by assigning the cumulative tiles of + // the intended core ops. + LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp = + l2ToL1DmaOps[i].getTargetObjectFifo(); + SmallVector tiles; + auto addNewTileFrom = [&](CoreOp coreOp) -> LogicalResult { + OpBuilder::InsertionGuard guard(rewriter); + TileOp tileOp = coreOp.getTileOp(); + std::optional column = getConstantIntValue(tileOp.getCol()); + std::optional row = getConstantIntValue(tileOp.getRow()); + if (!column || !row) { + return coreOp.emitOpError() << "has non-constant tile location"; + } + rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp); + auto colIndex = rewriter.create( + rewriter.getUnknownLoc(), *column); + auto rowIndex = rewriter.create( + rewriter.getUnknownLoc(), *row); + tileOp = + rewriter.create(rewriter.getUnknownLoc(), colIndex, rowIndex); + tiles.push_back(tileOp.getResult()); + return success(); + }; + std::optional maybeFirstCoreOp = fetchUniqueCoreOp(l2ToL1DmaOps[i]); + if (!maybeFirstCoreOp) return failure(); + CoreOp firstCoreOp = maybeFirstCoreOp.value(); + std::optional maybeSecondCoreOp = + fetchUniqueCoreOp(l2ToL1DmaOps[i + 1]); + if (!maybeSecondCoreOp) return failure(); + CoreOp secondCoreOp = maybeSecondCoreOp.value(); + if (failed(addNewTileFrom(firstCoreOp)) || + failed(addNewTileFrom(secondCoreOp))) { + return failure(); + } + llvm::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileValueColumnAndRowComparator); + rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp); + reuseL1LogicalObjectFifoOp = + rewriter.replaceOpWithNewOp( + reuseL1LogicalObjectFifoOp, + cast( + reuseL1LogicalObjectFifoOp.getOutput().getType()), + reuseL1LogicalObjectFifoOp.getMemref(), tiles); + + // Step 3. We now have need to create two L2->L1 ops since the size has // changed. But for this we first need to find the new offset for L2 as // source. // TODO: For now I'm hardcoding the offsets but later it'd just depend // on combining/non-combining dimensions. // Offset = 0,0 - LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp = - l2ToL1DmaOps[i].getTargetObjectFifo(); SmallVector newL2AsSourceOffsets = l2ToL1DmaOps[i].getSourceMixedOffsets(); DmaCpyNdOp newFirstL2ToL1DmaOp = createL2ToL1ForReuse( @@ -872,31 +914,24 @@ LogicalResult combineLogicalObjectFifos( // the first L2->L1 Dma. newL2AsSourceOffsets = l2ToL1DmaOps[i + 1].getSourceMixedOffsets(); newL2AsSourceOffsets[1] = rewriter.getIndexAttr(1); - DmaCpyNdOp newSecondL2ToL1DmaOp = createL2ToL1ForReuse( - rewriter, l2ToL1DmaOps[i + 1], reuseL1LogicalObjectFifoOp, - newL2ObjectFifo, newL2AsSourceOffsets); + createL2ToL1ForReuse(rewriter, l2ToL1DmaOps[i + 1], + reuseL1LogicalObjectFifoOp, newL2ObjectFifo, + newL2AsSourceOffsets); - // Step 3. PICK the CoreOps associated with the 1:1 L2->L1. + // Step 4. Pick the CoreOps associated with the 1:1 L2->L1. // For the first Core op we'll insert Read at the end. It doesn't matter // for now so we're gonna insert it right before amdaie.end op. - std::optional maybeFirstCoreOp = - fetchUniqueCoreOp(newFirstL2ToL1DmaOp); - if (!maybeFirstCoreOp) return failure(); - CoreOp firstCoreOp = maybeFirstCoreOp.value(); - firstCoreOp.walk([&](AMDAIE::EndOp endOp) { - OpBuilder::InsertionGuard guard(rewriter); - // Hardcoding to `AMDAIE::MemoryAccess::Read`. - rewriter.setInsertionPoint(endOp); - rewriter.create( - rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(), - AMDAIE::MemoryAccess::Read); + firstCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) { + if (accessOp.getInput() == newFirstL2ToL1DmaOp.getTargetObjectFifo()) { + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointAfter(accessOp); + rewriter.create( + rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(), + accessOp.getAccessType()); + } }); // For the second Core op we'll insert `Read` right before the first read // from the corresponding L1 logicalobjectFifo. - std::optional maybeSecondCoreOp = - fetchUniqueCoreOp(newSecondL2ToL1DmaOp); - if (!maybeSecondCoreOp) return failure(); - CoreOp secondCoreOp = maybeSecondCoreOp.value(); secondCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) { if (accessOp.getInput() == l2ToL1DmaOps[i + 1].getTargetObjectFifo()) { OpBuilder::InsertionGuard guard(rewriter); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir index aee2023e3..a493efdce 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir @@ -32,16 +32,16 @@ // CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( // CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1] // CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1] -// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]} +// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]], %[[TILE_0]]} // CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd( // CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] // CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] // CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out : // CHECK: linalg.generic // CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read) +// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read) // CHECK: linalg.generic // CHECK-SAME: %[[FIRST_READ]] -// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read) // CHECK: amdaie.end // CHECK: } // CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd( @@ -55,16 +55,16 @@ // CHECK-SAME: %[[SECOND_READ]] // CHECK: amdaie.end // CHECK: } -// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]} +// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]], %[[TILE_2]]} // CHECK: %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd( // CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] // CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] // CHECK: amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out : // CHECK: linalg.generic // CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read) +// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read) // CHECK: linalg.generic // CHECK-SAME: %[[FIRST_READ]] -// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read) // CHECK: amdaie.end // CHECK: } // CHECK: %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd(