Migrate Tutorial.TMABankConflictFreeTranspose to direct bindings (#5249)

rdspring1 · web-flow · commit 9b3875af8b81 · 2025-10-08T07:23:55.000-07:00
diff --git a/csrc/scheduler/utils.h b/csrc/scheduler/utils.h
@@ -565,7 +565,7 @@ void transformPropagateToAllFrom(TensorView* from_tv, int64_t pos);
 //!
 //! There are currently three modes of propagation: forward, backward and
 //! both-way, see comment on the interface functions for details.
-struct BoundedDirectionalTransformPropagator {
+struct NVF_API BoundedDirectionalTransformPropagator {
   //! Custom option container for configuring
   //!  the transform propagation actions.
   //! All option values default to false unless
diff --git a/python/python_direct/ir.cpp b/python/python_direct/ir.cpp
@@ -403,6 +403,29 @@ Returns
 -------
 TensorView
     A TensorView with the reordered axes in its loop domain.
+)")
+      .def(
+          "swizzle",
+          [](TensorView* self, int64_t x, int64_t y) {
+            return self->swizzle(SwizzleType::XOR, x, y);
+          },
+          py::return_value_policy::reference,
+          py::arg("x"),
+          py::arg("y"),
+          R"(
+Apply an XOR swizzle to two axes of this tensor.
+
+Parameters
+----------
+x : int
+    The position of the first axis to swizzle.
+y : int
+    The position of the second axis to swizzle.
+
+Returns
+-------
+TensorView
+    A TensorView with the swizzled axes in its loop domain.
 )")
       .def(
           "rfactor",
diff --git a/python/python_direct/schedule.cpp b/python/python_direct/schedule.cpp
@@ -15,6 +15,43 @@ namespace nvfuser::python {
 namespace {
 
 void bindTensorviewScheduleOps(py::module_& schedule) {
+  schedule.def(
+      "bounded_transform_backward",
+      [](TensorView* from,
+         int64_t pos,
+         std::vector<TensorView*> to,
+         bool propagate_parallel_type) {
+        using TransformPropagator =
+            scheduler_utils::BoundedDirectionalTransformPropagator;
+        TransformPropagator::Options options;
+        if (propagate_parallel_type) {
+          options.propagateParallelType();
+        }
+        TransformPropagator::backward(from, pos, to, options);
+      },
+      R"(
+      Propagate scheduler transformations backward from a reference TensorView, bounded by the TensorViews in `to`.
+
+      Parameters
+      ----------
+      from : TensorView
+          The reference TensorView whose transformations will be propagated.
+      pos : int
+          The position up to which dimensions should be selected. -1 means all dimensions.
+      to : List[TensorView]
+          Boundary TensorViews at which the backward propagation stops.
+      propagate_parallel_type : bool
+          Whether to propagate parallel type.
+
+      Returns
+      -------
+      None
+      )",
+      py::arg("from"),
+      py::arg("pos"),
+      py::arg("to"),
+      py::arg("propagate_parallel_type") = false);
+
   schedule.def(
       "transform_like",
       [](TensorView* reference_tv,
diff --git a/tests/python/direct/test_tutorial.py b/tests/python/direct/test_tutorial.py
@@ -1330,3 +1330,107 @@ def test_tutorial_pointwise_broadcast_tma(nvfuser_direct_test):
     ke.compile(fd.fusion, [t0, t1], compile_params=index32bit)
     outputs = ke.run([t0, t1])
     assert outputs[0].equal(t2)
+
+
+@pytest.mark.skipif(
+    is_pre_hopper(), reason="Only supported on Hopper and newer devices."
+)
+def test_tutorial_tma_bank_conflict_free_transpose(nvfuser_direct_test):
+    with FusionDefinition() as fd:
+        input = fd.define_tensor(shape=[-1, -1], contiguity=[True, True])
+        output = fd.ops.permute(input, [1, 0])
+        fd.add_output(output)
+
+        # Change the fusion to input->smem->register->smem->output where the
+        # smem->register part does the transpose
+        input_smem_cache = input.cache_after(LoadStoreOpType.tma)
+        input_smem_cache.set_memory_type(MemoryType.shared)
+
+        output_smem_cache = output.cache_before(LoadStoreOpType.tma)
+        output_smem_cache.set_memory_type(MemoryType.shared)
+
+        output_reg_cache = output_smem_cache.cache_before()
+
+        # Create a 32x32 tile. Each CTA has one tile, and the entire tile will be
+        # loaded to shared memory by TMA, and stored back to global memory by TMA.
+
+        # [I1, I0]
+        output.split(1, 32)
+        output.split(0, 32)
+        # [I1/32', 32', I0/32, 32]
+        output.reorder({0: 1, 1: 2, 2: 0})
+        output.merge(0, 1)
+        # [I0/32 * I1/32', 32', 32]
+        output.axis(0).parallelize(ParallelType.grid_x)
+        # [BIDx, 32', 32]
+
+        fd.sched.bounded_transform_backward(
+            output, -1, [input], propagate_parallel_type=True
+        )
+
+        # For fusion output, we just use TMA to store the entire tile back to global
+        # memory. There is no need to further schedule the output tensor.
+        output.axis(1).parallelize(ParallelType.tma)
+        output.axis(2).parallelize(ParallelType.tma)
+        # [BIDx, Bulk, Bulk]
+
+        # output_smem_cache and output_reg_cache are scheduled in the same way.
+        # We use each warp to load one column of input_smem_cache. We vectorize
+        # the load to 16 bytes, and use 8 warps to load all these 8 columns. Then,
+        # when we write to output_smem_cache, we unroll the write. Each warp writes
+        # one row in output_smem_cache in each iteration, so there is no bank
+        # conflict.
+
+        # [BIDx, 32', 32]
+        output_smem_cache.set_allocation_domain(
+            output_smem_cache.get_loop_domain(), new_contiguity=True
+        )
+        output_smem_cache.split(1, 4)
+        # [BIDx, 8', 4', 32]
+
+        fd.sched.bounded_transform_backward(output_smem_cache, -1, [input])
+
+        output_smem_cache.merge(1, 3)
+        # [BIDx, 256, 4']
+        output_smem_cache.axis(1).parallelize(ParallelType.block_x)
+
+        fd.sched.bounded_transform_backward(
+            output_smem_cache, -1, [input_smem_cache], propagate_parallel_type=True
+        )
+
+        output_smem_cache.axis(2).parallelize(ParallelType.unroll)
+        output_reg_cache.axis(2).parallelize(ParallelType.vectorize)
+        output_reg_cache.set_allocation_domain(
+            output_reg_cache.get_loop_domain(), new_contiguity=True
+        )
+
+        # Schedule the memory format for the 128-byte swizzle
+        # [BIDx, 8', 4', 32]
+        input_smem_cache.reorder({3: 1, 1: 2, 2: 3})
+        # [BIDx, 32, 8', 4']
+        input_smem_cache.split(1, 8)
+        # [BIDx, 4, 8, 8', 4']
+        input_smem_cache.swizzle(2, 3)
+        # [BIDx, 4, 8, 8', 4']
+        input_smem_cache.set_allocation_domain(
+            input_smem_cache.get_loop_domain(), new_contiguity=True
+        )
+
+        input_smem_cache.axis(1).parallelize(ParallelType.tma)
+        input_smem_cache.axis(2).parallelize(ParallelType.tma)
+        input_smem_cache.axis(3).parallelize(ParallelType.tma)
+        input_smem_cache.axis(4).parallelize(ParallelType.tma)
+        # [BIDx, Bulk, Bulk, Bulk, Bulk]
+
+        if verbose_:
+            print(fd.fusion.print_math())
+            print(fd.fusion.print_kernel())
+
+    index32bit = CompileParams(
+        index_type=DataType.Int32, maxrregcount=255, enable_magic_zero=False
+    )
+    t0 = torch.randn(10000, 10000, dtype=torch.float, device="cuda:0")
+    ke = KernelExecutor()
+    ke.compile(fd.fusion, [t0], compile_params=index32bit)
+    outputs = ke.run([t0])
+    assert outputs[0].equal(t0.t())