Migrate Tutorial.VectorizeStorePointwiseTMA and Tutorial.PointwiseBroadcastTMA to direct bindings (#5248)

rdspring1 · web-flow · commit 618673aa1cf9 · 2025-10-07T19:44:37.000-07:00
* Add `parallelize_like` and `inline_most` to the schedule API.

PR stack:
* #5247
* #5248 **<< This PR**
* #5249
diff --git a/python/python_direct/schedule.cpp b/python/python_direct/schedule.cpp
@@ -6,6 +6,8 @@
  */
 // clang-format on
 #include <bindings.h>
+#include <scheduler/tools/inlining.h>
+#include <scheduler/utils.h>
 #include <transform_replay.h>
 
 namespace nvfuser::python {
@@ -49,6 +51,71 @@ void bindTensorviewScheduleOps(py::module_& schedule) {
     )",
       py::arg("reference_tv"),
       py::arg("selected_tensors") = std::vector<TensorView*>());
+
+  schedule.def(
+      "parallelize_like", // Python-visible function name
+      [](TensorView* reference_tv,
+         int64_t pos,
+         const std::vector<TensorView*>& selected_tensors,
+         const std::unordered_set<ParallelType>& selected_parallel_types,
+         bool propagate_padding) {
+        scheduler_utils::parallelizeAllLike( // forwards all arguments as-is
+            reference_tv,
+            pos,
+            selected_tensors,
+            selected_parallel_types,
+            propagate_padding);
+      },
+      R"(
+          Propagate the parallelization from the selected dimensions of the
+          reference tensor to their corresponding dimensions in all selected
+          tensors in the DAG.
+
+          Parameters
+          ----------
+          reference_tv : TensorView
+              The reference TensorView whose parallelization will be propagated.
+          pos : int, optional
+              The position up to which dimensions should be selected. -1 means all dimensions.
+          selected_tensors : List[TensorView], optional
+              List of TensorViews to propagate parallelization to. If empty, propagates to all TensorViews.
+          selected_parallel_types : Set[ParallelType], optional
+              Set of parallel types to propagate. If empty, propagates all parallel types.
+          propagate_padding : bool, optional
+              Whether to propagate padding (default: True).
+
+          Returns
+          -------
+          None
+        )",
+      py::arg("reference_tv"), // the only argument without a default
+      py::arg("pos") = -1, // -1: select all dimensions
+      py::arg("selected_tensors") = std::vector<TensorView*>(), // empty: all
+      py::arg("selected_parallel_types") = std::unordered_set<ParallelType>(),
+      py::arg("propagate_padding") = true);
+
+  schedule.def(
+      "inline_most", // Python-visible function name
+      [](const std::vector<TensorView*>& selected_tensors) {
+        if (selected_tensors.empty()) { // no selection was passed
+          inlineMost(); // documented below as "inlines all operations"
+        } else {
+          inlineMost(selected_tensors); // only the listed TensorViews
+        }
+      },
+      R"(
+          Inline operations to the right most allowed position for the selected tensors.
+
+          Parameters
+          ----------
+          selected_tensors : List[TensorView], optional
+              List of TensorViews to inline. If empty, inlines all operations.
+
+          Returns
+          -------
+          None
+        )",
+      py::arg("selected_tensors") = std::vector<TensorView*>());
 }
 
 } // namespace
diff --git a/tests/python/direct/test_tutorial.py b/tests/python/direct/test_tutorial.py
@@ -1141,3 +1141,192 @@ def test_tutorial_basic_tma_example6(nvfuser_direct_test):
     ke.compile(fd.fusion, [t0], compile_params=index32bit)
     outputs = ke.run([t0])
     assert outputs[0].equal(t0)
+
+
+@pytest.mark.skipif(
+    is_pre_hopper(), reason="Only supported on Hopper and newer devices."
+)
+def test_tutorial_vectorize_store_pointwise_tma(nvfuser_direct_test):
+    with FusionDefinition() as fd:
+        tv0 = fd.define_tensor(shape=[-1, -1], contiguity=[True, True])
+        tv1 = fd.define_tensor(shape=[-1, -1], contiguity=[True, True])
+        tv2 = fd.ops.add(tv0, tv1)
+        fd.add_output(tv2)
+
+        # Cache tensors: a TMA load after each input, plus a cache before the output
+        tv0a = tv0.cache_after(LoadStoreOpType.tma)
+        tv1a = tv1.cache_after(LoadStoreOpType.tma)
+        tv2b = tv2.cache_before()
+
+        tv0a.set_memory_type(MemoryType.shared)
+        tv1a.set_memory_type(MemoryType.shared)
+
+        reference_tv = tv2
+
+        # Step 1: Create TMA domain
+        # Use the root domain as the TMA domain
+        #   root domain: [I0, I1]
+
+        num_threads = 128
+        vectorization = 2
+        tma_tile = num_threads * vectorization
+        num_stages = 4
+        num_ctas_for_hopper = 132
+
+        # Step 2: Create Box
+        # After TMA domain creation
+        #         split: [I0, I3, 256]            (256 = tma_tile)
+        reference_tv.split(-1, tma_tile)
+        #         split: [I2, 4, I3, 256]         (4 = num_stages)
+        reference_tv.split(0, num_stages)
+
+        # Step 3: Create Tile
+        # Do nothing here because box == tile
+
+        # Step 4: Schedule Shared Memory Tensor
+        #         split: [I2, 4, I3, 128, 2]      (2 = vectorization)
+        reference_tv.split(-1, vectorization)
+        #         split: [I4, 132, 4, I3, 128, 2] (132 = num_ctas_for_hopper)
+        reference_tv.split(0, num_ctas_for_hopper)
+        #         reorder: [I4, 132, I3, 4, 128, 2] (swap positions 2 and 3)
+        reference_tv.reorder({3: 2, 2: 3})
+
+        # Replay the reference transformations on the other tensors in the fusion
+        fd.sched.transform_like(reference_tv)
+
+        # Bind axis 1 (extent 132) to grid_x, then propagate the reference's parallelization
+        reference_tv.axis(1).parallelize(ParallelType.grid_x)
+        fd.sched.parallelize_like(reference_tv)
+
+        tv2b.axis(-2).parallelize(ParallelType.block_x)
+
+        # Vectorization for writing results to gmem (output tensor tv2)
+        reference_tv.axis(-3).parallelize(ParallelType.unroll)
+        reference_tv.axis(-2).parallelize(ParallelType.block_x)
+        reference_tv.axis(-1).parallelize(ParallelType.vectorize)
+
+        # Mark the three innermost axes of each TMA-loaded cache as ParallelType.tma
+        tv0a.axis(-1).parallelize(ParallelType.tma)
+        tv0a.axis(-2).parallelize(ParallelType.tma)
+        tv0a.axis(-3).parallelize(ParallelType.tma)
+
+        tv1a.axis(-1).parallelize(ParallelType.tma)
+        tv1a.axis(-2).parallelize(ParallelType.tma)
+        tv1a.axis(-3).parallelize(ParallelType.tma)
+
+        # Inline with no explicit tensor selection (inline_most called without arguments)
+        fd.sched.inline_most()
+
+        if verbose_:
+            print(fd.fusion.print_math())
+            print(fd.fusion.print_kernel())
+
+    dim0 = 16384
+    dim1 = 16384
+
+    # Compile with KernelExecutor directly to avoid scheduling
+    index32bit = CompileParams(
+        index_type=DataType.Int32, maxrregcount=255, enable_magic_zero=False
+    )
+    t0 = torch.randn(dim0, dim1, dtype=torch.float, device="cuda:0")
+    t1 = torch.randn(dim0, dim1, dtype=torch.float, device="cuda:0")
+    t2 = t0 + t1  # eager PyTorch reference result
+    ke = KernelExecutor()
+    ke.compile(fd.fusion, [t0, t1], compile_params=index32bit)
+    outputs = ke.run([t0, t1])
+    assert outputs[0].equal(t2)
+
+
+@pytest.mark.skipif(
+    is_pre_hopper(), reason="Only supported on Hopper and newer devices."
+)
+def test_tutorial_pointwise_broadcast_tma(nvfuser_direct_test):
+    with FusionDefinition() as fd:
+        tv0 = fd.define_tensor(shape=[-1, -1, -1], contiguity=[True, True, True])
+        tv1 = fd.define_tensor(
+            shape=[-1, -1, -1, -1], contiguity=[True, False, True, True]
+        )
+        tv2 = fd.ops.broadcast(tv0, [True, False, False, False])
+        tv3 = fd.ops.add(tv2, tv1)
+        fd.add_output(tv3)
+
+        # Cache tensors: a TMA load after each input and a TMA cache before the output
+        tv0a = tv0.cache_after(LoadStoreOpType.tma)
+        tv1a = tv1.cache_after(LoadStoreOpType.tma)
+        tv3b = tv3.cache_before(LoadStoreOpType.tma)
+
+        tv0a.set_memory_type(MemoryType.shared)
+        tv1a.set_memory_type(MemoryType.shared)
+        tv3b.set_memory_type(MemoryType.shared)
+
+        reference_tv = tv3
+
+        # Step 1: Create TMA domain
+        #   root domain: [I0, I1, I2, I3]
+        #    TMA domain: [I0, I1, I4]  (I4 = I2 merged with I3)
+        reference_tv.merge(-2, -1)
+
+        # Step 2: Define TMA Box
+        #         split: [I0, I1, I5, 256]
+        reference_tv.split(-1, 256)
+
+        # Step 3: Define Tile
+        # Do nothing here because tile == box.
+
+        # Step 4: Schedule Shared Memory Tensor
+        #         merge: [I10, I5, 256]
+        reference_tv.merge(0, 1)
+        #         split: [I10, I7, 4, 256]
+        reference_tv.split(-2, 4)
+        #         merge: [I11, 4, 256]
+        reference_tv.merge(0, 1)
+
+        # Replay the reference transformations on the other tensors in the fusion
+        fd.sched.transform_like(reference_tv)
+
+        # Define Parallelization Schema
+        # Intermediate Tensors: grid_x / unroll / block_x on axes 0 / 1 / 2
+        tv3b.axis(0).parallelize(ParallelType.grid_x)
+        tv3b.axis(1).parallelize(ParallelType.unroll)
+        tv3b.axis(2).parallelize(ParallelType.block_x)
+
+        tv2.axis(0).parallelize(ParallelType.grid_x)
+        tv2.axis(1).parallelize(ParallelType.unroll)
+        tv2.axis(2).parallelize(ParallelType.block_x)
+
+        # TMA Tensors: grid_x / block_x / tma on axes 0 / 1 / 2
+        tv1a.axis(0).parallelize(ParallelType.grid_x)
+        tv1a.axis(1).parallelize(ParallelType.block_x)
+        tv1a.axis(2).parallelize(ParallelType.tma)
+
+        tv0a.axis(0).parallelize(ParallelType.grid_x)
+        tv0a.axis(1).parallelize(ParallelType.block_x)
+        tv0a.axis(2).parallelize(ParallelType.tma)
+
+        tv3.axis(0).parallelize(ParallelType.grid_x)
+        tv3.axis(1).parallelize(ParallelType.block_x)
+        tv3.axis(2).parallelize(ParallelType.tma)
+
+        # Inline with no explicit tensor selection (inline_most called without arguments)
+        fd.sched.inline_most()
+
+        if verbose_:
+            print(fd.fusion.print_math())
+            print(fd.fusion.print_kernel())
+
+    dim0 = 32
+    dim1 = 2
+    dim2 = 4
+    dim3 = 256
+
+    # Compile with KernelExecutor directly to avoid scheduling
+    index32bit = CompileParams(
+        index_type=DataType.Int32, maxrregcount=255, enable_magic_zero=False
+    )
+    t0 = torch.randn(dim1, dim2, dim3, dtype=torch.float, device="cuda:0")
+    t1 = torch.randn(dim0, dim1, dim2, dim3, dtype=torch.float, device="cuda:0")
+    t2 = t0 + t1  # eager PyTorch reference; t0 is broadcast over dim0
+    ke = KernelExecutor()
+    ke.compile(fd.fusion, [t0, t1], compile_params=index32bit)
+    outputs = ke.run([t0, t1])
+    assert outputs[0].equal(t2)