NVIDIA
diff --git a/‎CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎csrc/multidevice/executor.h‎
Lines changed: 1 addition & 0 deletions b/‎csrc/multidevice/executor.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/nvfuser_direct/__init__.py‎
Lines changed: 36 additions & 0 deletions b/‎python/nvfuser_direct/__init__.py‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎python/python_common/distributed_tensor.cpp‎
Lines changed: 70 additions & 0 deletions b/‎python/python_common/distributed_tensor.cpp‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎…hon/python_frontend/distributed_tensor.h‎ ‎python/python_common/distributed_tensor.h‎python/python_frontend/distributed_tensor.h renamed to python/python_common/distributed_tensor.h
Lines changed: 9 additions & 2 deletions b/‎…hon/python_frontend/distributed_tensor.h‎ ‎python/python_common/distributed_tensor.h‎python/python_frontend/distributed_tensor.h renamed to python/python_common/distributed_tensor.h
Lines changed: 9 additions & 2 deletions
diff --git a/‎python/python_direct/bindings.cpp‎
Lines changed: 5 additions & 0 deletions b/‎python/python_direct/bindings.cpp‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎python/python_direct/bindings.h‎
Lines changed: 3 additions & 0 deletions b/‎python/python_direct/bindings.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎python/python_direct/enum.cpp‎
Lines changed: 20 additions & 0 deletions b/‎python/python_direct/enum.cpp‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎python/python_direct/ir.cpp‎
Lines changed: 83 additions & 0 deletions b/‎python/python_direct/ir.cpp‎
Lines changed: 83 additions & 0 deletions
@@ -335,14 +335,14 @@ endif()
 
 if(BUILD_PYTHON)
   list(APPEND NVFUSER_SRCS
-    ${NVFUSER_PYTHON_BINDINGS}/distributed_tensor.cpp
     ${NVFUSER_PYTHON_BINDINGS}/fusion_cache.cpp
     ${NVFUSER_PYTHON_BINDINGS}/fusion_definition.cpp
     ${NVFUSER_PYTHON_BINDINGS}/fusion_state.cpp
     ${NVFUSER_PYTHON_BINDINGS}/segmentation.cpp
     ${NVFUSER_PYTHON_BINDINGS}/translation.cpp
     ${NVFUSER_PYTHON_BINDINGS}/translation_utils.cpp
     ${NVFUSER_SRCS_DIR}/serde/fusion_record.cpp
+    ${NVFUSER_PYTHON_COMMON}/distributed_tensor.cpp
     ${NVFUSER_PYTHON_COMMON}/python_utils.cpp
     ${NVFUSER_PYTHON_COMMON}/translation_names.cpp
   )
@@ -608,6 +608,7 @@ if(BUILD_PYTHON)
     ${NVFUSER_PYTHON_DIRECT_BINDINGS}/bindings.cpp
     ${NVFUSER_PYTHON_DIRECT_BINDINGS}/enum.cpp
     ${NVFUSER_PYTHON_DIRECT_BINDINGS}/ir.cpp
+    ${NVFUSER_PYTHON_DIRECT_BINDINGS}/multidevice.cpp
     ${NVFUSER_PYTHON_DIRECT_BINDINGS}/ops.cpp
     ${NVFUSER_PYTHON_DIRECT_BINDINGS}/runtime.cpp
     ${NVFUSER_PYTHON_DIRECT_BINDINGS}/direct_utils.cpp
 
@@ -17,6 +17,7 @@
 #include <multidevice/communication.h>
 #include <multidevice/communicator.h>
 #include <multidevice/multidevice.h>
+#include <runtime/fusion_kernel_runtime.h>
 
 namespace nvfuser {
 
 
@@ -21,6 +21,42 @@
 from ._C_DIRECT import *  # noqa: F401,F403
 
 
+def execute_with_dtensors(fd, in_dtensors):
+    """
+    Execute a fusion on a list of DTensor inputs.
+
+    Parameters
+    ----------
+    fd : FusionDefinition
+        The fusion definition to execute
+    in_dtensors : list of DTensor
+        The list of DTensor inputs to the fusion
+
+    Returns
+    -------
+    list of DTensor
+        The list of DTensor outputs from the fusion
+    """
+    import torch.distributed as dist
+    from torch.distributed.tensor import DTensor
+    from torch.distributed.tensor.placement_types import Placement, Shard, Replicate
+
+    inputs = [in_dtensor.to_local() for in_dtensor in in_dtensors]
+    out_tensors = self.execute(inputs, auto_schedule=True)
+    out_shardings = self.fec.get_output_shardings()
+    assert len(out_tensors) == len(out_shardings)
+
+    out_dtensors: list[DTensor] = []
+    for out_tensor, out_sharding in zip(out_tensors, out_shardings):
+        mesh = dist.device_mesh.init_device_mesh("cuda", (out_sharding.mesh.size,))
+        placements: list[Placement] = []
+        for parallel_type in [_C_DIRECT.ParallelType.mesh_x]:
+            axis: int = out_sharding.axis_sharded_on(parallel_type)
+            placements.append(Replicate() if axis == -1 else Shard(axis))
+        out_dtensors.append(DTensor.from_local(out_tensor, mesh, placements))
+    return out_dtensors
+
+
 class FusionDefinition:
     """
     A class for defining and executing fused operations in nvFuser.
 
@@ -0,0 +1,70 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+
+#include <distributed_tensor.h>
+#include <exceptions.h>
+#include <ir/interface_nodes.h>
+#include <type.h>
+#include <utils.h>
+
+namespace nvfuser {
+
+void Sharding::setAxisIsShardedOn(
+    const int64_t axis,
+    const ParallelType parallel_type) { // Records that `axis` is sharded on `parallel_type`.
+  NVF_CHECK(isParallelTypeDeviceDim(parallel_type)); // presumably restricts to device-dimension parallel types — confirm
+  NVF_CHECK(mesh_.size() > 0, "Cannot shard a non-distributed tensor."); // an empty mesh is treated as non-distributed
+  const auto i = axis_sharded_on_.find(parallel_type); // look for an earlier assignment of this parallel type
+  NVF_CHECK(
+      i == axis_sharded_on_.end(),
+      "Parallel type ",
+      parallel_type,
+      " was already used to shard axis ",
+      i->second);
+  axis_sharded_on_[parallel_type] = axis; // a fresh entry, assuming the check above rejects duplicates
+}
+
+int64_t Sharding::axisShardedOn(const ParallelType parallel_type) const { // Looks up the axis recorded for `parallel_type`.
+  return getOrDefault(axis_sharded_on_, parallel_type, -1L); // -1 is the fallback passed in; presumably returned when nothing was recorded — confirm getOrDefault
+}
+
+std::vector<Sharding> getOutputShardings(Fusion* fusion) {
+  // Single-GPU shortcut: no TensorView in the fusion carries a device mesh.
+  std::vector<TensorView*> fusion_tvs = fusion->allTvs();
+  const auto has_mesh = [](TensorView* tv) { return tv->hasDeviceMesh(); };
+  if (!std::any_of(fusion_tvs.begin(), fusion_tvs.end(), has_mesh)) {
+    return {};
+  }
+
+  std::vector<Sharding> shardings;
+  shardings.reserve(fusion->outputs().size());
+  for (Val* output : fusion->outputs()) {
+    auto* tv = dynamic_cast<TensorView*>(output);
+    if (tv == nullptr) {
+      shardings.emplace_back(DeviceMesh()); // non-tensor output
+      continue;
+    }
+    if (fusion->getOutputAlias(tv).hide_output) {
+      continue; // hidden outputs contribute no entry
+    }
+    const DeviceMesh& mesh = tv->getDeviceMesh();
+    Sharding& sharding = shardings.emplace_back(mesh);
+    if (mesh.size() <= 0) {
+      continue; // no devices, so no axis can be sharded
+    }
+    for (const ParallelType parallel_type : kParallelTypeDIDs) {
+      const auto axis = getShardedLogicalAxis(tv, parallel_type);
+      if (axis != -1) {
+        sharding.setAxisIsShardedOn(axis, parallel_type);
+      }
+    }
+  }
+  return shardings;
+}
+
+} // namespace nvfuser
@@ -10,10 +10,12 @@
 
 #include <ATen/core/TensorBody.h>
 
+#include <fusion.h>
 #include <multidevice/device_mesh.h>
+#include <multidevice/utils.h>
 #include <type.h>
 
-namespace nvfuser::python_frontend {
+namespace nvfuser {
 
 class Sharding {
  public:
@@ -36,4 +38,9 @@ class Sharding {
   std::unordered_map<ParallelType, int64_t> axis_sharded_on_;
 };
 
-} // namespace nvfuser::python_frontend
+// Returns the output shardings of the given fusion. As a shortcut, if no
+// TensorView in the fusion has a device mesh, returns an empty vector
+// indicating single-GPU execution.
+std::vector<Sharding> getOutputShardings(Fusion* fusion);
+
+} // namespace nvfuser
@@ -7,6 +7,7 @@
 // clang-format on
 
 #include <bindings.h>
+#include <multidevice/communicator.h>
 
 namespace nvfuser::python {
 
@@ -16,7 +17,11 @@ void initNvFuserPythonBindings(PyObject* module) {
   bindFusionIr(nvfuser);
   bindRuntime(nvfuser);
   bindOperations(nvfuser);
+  bindMultiDevice(nvfuser);
   nvfuser.def("translate_fusion", &translateFusion);
+
+  auto cleanup = []() -> void { Communicator::getInstance().cleanup(); };
+  nvfuser.add_object("_cleanup", py::capsule(cleanup));
 }
 
 } // namespace nvfuser::python
@@ -27,6 +27,9 @@ void bindRuntime(py::module& nvfuser);
 // Add bindings for CPP Fusion Operations
 void bindOperations(py::module& nvfuser);
 
+// Add bindings for MultiDevice features
+void bindMultiDevice(py::module& nvfuser);
+
 // Translate a CPP Fusion to a bindings python function
 std::string translateFusion(Fusion* f);
 
 
@@ -33,6 +33,26 @@ void bindEnums(py::module& nvfuser) {
       .value("ComplexFloat", DataType::ComplexFloat)
       .value("ComplexDouble", DataType::ComplexDouble)
       .value("Null", DataType::Null);
+
+  py::enum_<ParallelType>(nvfuser, "ParallelType")
+      .value("mesh_x", ParallelType::DIDx)
+      .value("grid_x", ParallelType::BIDx)
+      .value("grid_y", ParallelType::BIDy)
+      .value("grid_z", ParallelType::BIDz)
+      .value("block_x", ParallelType::TIDx)
+      .value("block_y", ParallelType::TIDy)
+      .value("block_z", ParallelType::TIDz)
+      .value("mma", ParallelType::Mma)
+      .value("serial", ParallelType::Serial)
+      .value("tma", ParallelType::Bulk)
+      .value("unroll", ParallelType::Unroll)
+      .value("unswitch", ParallelType::Unswitch)
+      .value("vectorize", ParallelType::Vectorize)
+      .value("stream", ParallelType::Stream);
+
+  py::enum_<CommunicatorBackend>(nvfuser, "CommunicatorBackend")
+      .value("nccl", CommunicatorBackend::kNccl)
+      .value("ucc", CommunicatorBackend::kUcc);
 }
 
 } // namespace nvfuser::python
@@ -67,6 +67,23 @@ Returns
 -------
 Val
     The extent of this domain.
+)")
+      .def(
+          "parallelize",
+          &IterDomain::parallelize,
+          py::arg("parallel_type"),
+          R"(
+Set the parallel type of this domain.
+
+Parameters
+----------
+parallel_type : ParallelType
+    The type of parallelization to apply (e.g., BIDx, TIDx, etc.).
+
+Notes
+-----
+This is a key function used in scheduling to specify how the domain should be parallelized
+across CUDA threads and blocks.
 )");
 
   // TensorDomain
@@ -111,6 +128,72 @@ TensorDomain
     - Logical domain (The original dimensions. It may contain rFactor iterDomains.)
     - Allocation domain (How the memory is allocated for the tensor?)
     - Loop domain (The for-loop structure for the tensor.)
+)")
+      .def(
+          "get_loop_domain",
+          &TensorView::getLoopDomain,
+          R"(
+Get the loop domain of this tensor.
+
+Returns
+-------
+list of IterDomain
+    The loop iteration domains.
+)")
+      .def(
+          "split",
+          static_cast<TensorView* (TensorView::*)(int64_t, int64_t, bool)>(
+              &TensorView::split),
+          py::arg("axis"),
+          py::arg("factor"),
+          py::arg("inner_split") = true,
+          py::return_value_policy::reference,
+          R"(
+Split an axis into two axes.
+
+Parameters
+----------
+axis : int
+    The axis to split.
+factor : int
+    The factor to split by.
+inner_split : bool, optional
+    If True, the factor determines the size of the inner domain.
+    If False, the factor determines the size of the outer domain.
+    Default is True.
+
+Returns
+-------
+TensorView
+    A TensorView with the split axes in its loop domain.
+)")
+      .def(
+          "set_allocation_domain",
+          static_cast<void (TensorView::*)(std::vector<IterDomain*>, bool)>(
+              &TensorView::setAllocationDomain),
+          py::arg("new_allocation_domain"),
+          py::arg("new_contiguity"),
+          R"(
+Set the allocation domain of this tensor.
+
+Parameters
+----------
+new_allocation_domain : list of IterDomain
+    The new allocation iteration domains.
+new_contiguity : bool
+    The new contiguity flag.
+)")
+      .def(
+          "set_device_mesh",
+          &TensorView::setDeviceMesh,
+          py::arg("mesh"),
+          R"(
+Set the device mesh of this tensor.
+
+Parameters
+----------
+mesh : DeviceMesh
+    The device mesh to set.
 )")
       .def(
           "axis",