diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index 567c239053a..1283f04ae52 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -1063,8 +1063,7 @@ std::vector TensorIndexer::getPredicates( ForLoop* unswitched_loop) const { const auto& zero_val = tv->fusion()->zeroVal(); - const std::vector& predicate_domains = - getPredicateDomains(tv, expr); + std::vector predicate_domains = getPredicateDomains(tv, expr); const IndexingInfo& index_info = computeIndex(expr, predicate_domains, for_loops); @@ -1093,6 +1092,50 @@ std::vector TensorIndexer::getPredicates( /*is_start_predicate=*/false, /*unswitched_loop=*/unswitched_loop); + // When resize is involved, predicate its input ID as well to avoid + // redundancy. This is only necessary if a predicated resize is + // preceded by a split, however, for now it's always predicated + // with an exception of static resize. See + // PredicateIndexingTest.SplitThenPad for a concrete example. + for (const auto& [eg, direction] : index_info.traversal_path) { + auto resize = dynamic_cast(eg->front()); + if (resize == nullptr) { + continue; + } + + // TODO: It seems this shouldn't be predicated when the direction is + // Forward, i.e., when resize ops are propagated from + // producers to consumers. For example, ResizeTest.SliceThenPadLeftHalf + // would fail with this. Revisit for the Forward case if necessary. + if (direction == Direction::Forward) { + continue; + } + + // If the input ID is guaranteed to cover the output ID, then + // the input index should never exceed its boundary. + if (resize->leftExpand()->isConstInt() && + resize->rightExpand()->isConstInt()) { + auto left_int = resize->leftExpand()->evaluate().as(); + auto right_int = resize->rightExpand()->evaluate().as(); + // If the traversal direction is forward, the predicate is not + // necessary if both of the left and right factors are + // non-negative as the output ID is guaranteed to cover the + // input ID. 
Similarly, if it's backward, it is not necessary + // if they are non-positive. + if ((direction == Direction::Forward && left_int >= 0 && + right_int >= 0) || + (direction == Direction::Backward && left_int <= 0 && + right_int <= 0)) { + continue; + } + } + + IterDomain* id_to_predicate = + direction == Direction::Forward ? resize->out() : resize->in(); + + predicate_domains.push_back(id_to_predicate); + } + const std::unordered_map contig_domains = isContigIndexingEnabled() ? getContigDomains( diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index 641957407d5..fed2c5b6837 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -5497,6 +5498,293 @@ TEST_F(PredicateIndexingTest, VectorizedResizeRotation) { testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } +// Check if resize input IDs are predicated. Repro of issue +// https://github.com/NVIDIA/Fuser/issues/3710. +TEST_F(PredicateIndexingTest, SplitThenPad) { + Fusion fusion; + FusionGuard fg(&fusion); + + const int64_t i0 = 4; + const int64_t i1 = 32; + + auto zero = fusion.zeroVal(); + + auto tv0 = makeContigConcreteTensor({i0 * i1}); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + auto tv2 = + reshape(tv1, {IrBuilder::create(i0), IrBuilder::create(i1)}); + auto tv3 = pad(tv2, {zero, IrBuilder::create(i1)}); + auto tv4 = set(tv3); + fusion.addOutput(tv4); + + scheduler_tools::propagateResizeToInputs(tv3->definition()); + + inlineMost(); + + // tv1 should be scheduled as: + // + // T1_l_float[iS11{4}, iS13{64}] + // logical domain : (iS1{128}) + // contiguity: t + // Outer split: iS1{128} by factor 4 -> iS11{4}, iS12{32} + // Resize: iS12{32} by 0 and 32 -> iS13{64} + // loop domain : (iS11{4}, iS13{64}) + // + // In addition to its logical ID, the resize input ID should be + // predicated. 
+ + struct GetReference : AbstractGetReference { + GetReference(const TensorIndexer& indexer, const IdModel& id_model) + : AbstractGetReference(indexer, id_model) {} + + Val* getInlinePredicate(TensorView* tv) const override { + if (tv->name() != 1) { + return nullptr; + } + + // Without index hoist and expr simplification, the predicate + // should look like: + // + // (((((((i0 * 32LL) + i1) >= 0LL) && + // (((i0 * 32LL) + i1) < 128LL)) && + // (i1 >= 0LL)) && + // (i1 < 32LL))) + + std::vector loop_indices = getLoopIndices(tv, indexer_, for_loops_); + + Val* zero = tv->fusion()->zeroVal(); + + auto resize = dynamic_cast(tv->axis(1)->definition()); + NVF_ERROR(resize != nullptr); + + auto logical_idx = addExpr( + mulExpr(loop_indices.at(0), createInt(i1)), loop_indices.at(1)); + + auto resize_idx = loop_indices.at(1); + + return andExpr( + andExpr( + andExpr( + geExpr(logical_idx, zero), + ltExpr(logical_idx, createInt(i0 * i1))), + geExpr(resize_idx, zero)), + ltExpr(resize_idx, createInt(i1))); + } + }; + + PredicateIndexValidator::validate(&fusion, false); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn({i0 * i1}, options); + std::vector inputs{t0}; + + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); + + testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); +} + +TEST_F(PredicateIndexingTest, SplitThenPadTwice) { + Fusion fusion; + FusionGuard fg(&fusion); + + const int64_t i0 = 4; + const int64_t i1 = 32; + + auto zero = fusion.zeroVal(); + + auto tv0 = makeContigConcreteTensor({i0 * i1}); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + auto tv2 = + reshape(tv1, {IrBuilder::create(i0), IrBuilder::create(i1)}); + auto tv3 = pad(tv2, {zero, IrBuilder::create(1L)}); + auto tv4 = pad(tv3, {IrBuilder::create(1L), zero}); + auto tv5 = set(tv4); + fusion.addOutput(tv5); + + scheduler_tools::propagateResizeToInputs(tv3->definition()); + 
scheduler_tools::propagateResizeToInputs(tv4->definition()); + + inlineMost(); + + // tv1 should be scheduled as: + // + // T1_l_float[iS14{4}, iS18{34}] ca_pos( 2 ) + // logical domain : (iS1{128}) + // contiguity: t + // Outer split: iS1{128} by factor 4 -> iS14{4}, iS15{32} + // Resize: iS15{32} by 0 and 1 -> iS16{33} + // Resize: iS16{33} by 1 and 0 -> iS18{34} + // loop domain : (iS14{4}, iS18{34}) + // + // In addition to its logical ID, the two resize input IDs should be + // predicated. + + struct GetReference : AbstractGetReference { + GetReference(const TensorIndexer& indexer, const IdModel& id_model) + : AbstractGetReference(indexer, id_model) {} + + Val* getInlinePredicate(TensorView* tv) const override { + if (tv->name() != 1) { + return nullptr; + } + + // Without index hoist and expr simplification, the predicate + // should look like: + // + // (((((((((i0 * 32LL) + (i1 - 1LL)) >= 0LL) && + // (((i0 * 32LL) + (i1 - 1LL)) < 128LL)) && + // ((i1 - 1LL) >= 0LL)) && + // ((i1 - 1LL) < 33LL)) && + // ((i1 - 1LL) >= 0LL)) && + // ((i1 - 1LL) < 32LL))) + + std::vector loop_indices = getLoopIndices(tv, indexer_, for_loops_); + + Val* zero = tv->fusion()->zeroVal(); + Val* one = tv->fusion()->oneVal(); + + auto resize = dynamic_cast(tv->axis(1)->definition()); + NVF_ERROR(resize != nullptr); + + auto logical_idx = addExpr( + mulExpr(loop_indices.at(0), createInt(i1)), + subExpr(loop_indices.at(1), one)); + + auto resize_idx = subExpr(loop_indices.at(1), one); + + return andExpr( + andExpr( + andExpr( + andExpr( + andExpr( + geExpr(logical_idx, zero), + ltExpr(logical_idx, createInt(i0 * i1))), + geExpr(resize_idx, zero)), + ltExpr(resize_idx, createInt(i1 + 1))), + geExpr(resize_idx, zero)), + ltExpr(resize_idx, createInt(i1))); + } + }; + + PredicateIndexValidator::validate(&fusion, false); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn({i0 * i1}, options); + std::vector inputs{t0}; + + KernelExecutor 
ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); + + testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); +} + +// Testing a split reshape followed by slice and pad, which is a +// common pattern in RoPE. +TEST_F(PredicateIndexingTest, SplitThenSliceAndPad) { + Fusion fusion; + FusionGuard fg(&fusion); + + const int64_t i0 = 4; + const int64_t i1 = 32; + + auto zero = fusion.zeroVal(); + + auto tv0 = makeContigConcreteTensor({i0 * i1}); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + auto tv2 = + reshape(tv1, {IrBuilder::create(i0), IrBuilder::create(i1)}); + auto tv3 = slice( + tv2, + {{zero, IrBuilder::create(i0)}, + {IrBuilder::create(i1 / 2), IrBuilder::create(i1)}}); + auto tv4 = pad(tv3, {zero, IrBuilder::create(i1 / 2)}); + auto tv5 = set(tv4); + fusion.addOutput(tv5); + + scheduler_tools::propagateResizeToInputs(tv3->definition()); + scheduler_tools::propagateResizeToInputs(tv4->definition()); + + inlineMost(); + + // tv1 should be scheduled as: + // + // T1_l_float[iS14{4}, iS18{32}] ca_pos( 2 ) + // logical domain : (iS1{128}) + // contiguity: t + // Outer split: iS1{128} by factor 4 -> iS14{4}, iS15{32} + // Resize: iS15{32} by -16 and 0 -> iS16{16} + // Resize: iS16{16} by 0 and 16 -> iS18{32} + // loop domain : (iS14{4}, iS18{32}) + // + // In addition to its logical ID, the input of the second resize + // should be predicated. The first resize should not be predicated + // as its input can be known to cover the output since the expansion + // factors are static, so as long as the index of the + // output is within the boundary, its index should never need to be + // predicated. 
+ + struct GetReference : AbstractGetReference { + GetReference(const TensorIndexer& indexer, const IdModel& id_model) + : AbstractGetReference(indexer, id_model) {} + + Val* getInlinePredicate(TensorView* tv) const override { + if (tv->name() != 1) { + return nullptr; + } + + // Without index hoist and expr simplification, the predicate + // should look like: + // + // (((((((i0 * 32LL) + (i1 + 16LL)) >= 0LL) && + // (((i0 * 32LL) + (i1 + 16LL)) < 128LL)) && + // (i1 >= 0LL)) && + // (i1 < 16LL))) + + std::vector loop_indices = getLoopIndices(tv, indexer_, for_loops_); + + Val* zero = tv->fusion()->zeroVal(); + + auto resize = dynamic_cast(tv->axis(1)->definition()); + NVF_ERROR(resize != nullptr); + + auto logical_idx = addExpr( + mulExpr(loop_indices.at(0), createInt(i1)), + addExpr(loop_indices.at(1), createInt(i1 / 2))); + + auto resize_idx = loop_indices.at(1); + + return andExpr( + andExpr( + andExpr( + geExpr(logical_idx, zero), + ltExpr(logical_idx, createInt(i0 * i1))), + geExpr(resize_idx, zero)), + ltExpr(resize_idx, createInt(i1 / 2))); + } + }; + + PredicateIndexValidator::validate(&fusion, false); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn({i0 * i1}, options); + std::vector inputs{t0}; + + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); + + testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); +} + // Repro of issue #3505. The indexing WAR for resize triggered an // assertion due to loop promotion. TEST_F(IndexingTest, Issue3505Repro1) {