Skip to content

Commit e3bd5f9

Browse files
jenniewCuiYifeng
and authored
Add addmm, mm, bmm, baddbmm on SparseCsrXPU (#2758)
Add addmm, mm, bmm, baddbmm support on SparseCsrXPU. Enable related tests. Related issues: #2211 #2213 --------- Co-authored-by: Cui, Yifeng <yifeng.cui@intel.com>
1 parent 205fa95 commit e3bd5f9

File tree

3 files changed

+348
-1
lines changed

3 files changed

+348
-1
lines changed

src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp

Lines changed: 321 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88
* http://www.apache.org/licenses/LICENSE-2.0
99
*/
1010

11+
#include <ATen/ExpandUtils.h>
1112
#include <ATen/SparseCsrTensorUtils.h>
13+
#include <ATen/TensorOperators.h>
14+
#include <ATen/native/Resize.h>
15+
#include <ATen/native/sparse/SparseCsrTensorMath.h>
1216
#include <ATen/native/sparse/SparseStubs.h>
1317
#include <ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.h>
1418
#include <ATen/ops/_convert_indices_from_coo_to_csr_native.h>
@@ -19,6 +23,11 @@
1923
#include <ATen/NativeFunctions.h>
2024
#else
2125
#include <ATen/ops/add.h>
26+
#include <ATen/ops/addmm.h>
27+
#include <ATen/ops/baddbmm.h>
28+
#include <ATen/ops/copy_native.h>
29+
#include <ATen/ops/mul.h>
30+
#include <ATen/ops/sparse_compressed_tensor.h>
2231
#endif
2332

2433
namespace at::native {
@@ -62,6 +71,318 @@ Tensor _sparse_csr_prod_xpu(
6271
input, dims_to_reduce, keepdim, dtype);
6372
}
6473

74+
// Computes alpha * (mat1 @ mat2) + beta * input by densifying any sparse
// operand and running the dense mm kernel.
// When beta == 0 the input tensor is never read, so NaN/Inf values in it
// cannot propagate into the result (matches torch.addmm semantics).
// Returns a strided (dense) tensor; callers convert back to sparse if needed.
Tensor addmm_calculation(
    const Tensor& input,
    const Tensor& mat1,
    const Tensor& mat2,
    const Scalar& beta,
    const Scalar& alpha) {
  // Convert a sparse operand to strided; pass strided tensors through as-is.
  const auto densify = [](const Tensor& t) -> Tensor {
    return t.layout() == kStrided ? t : t.to_dense();
  };

  Tensor out = densify(mat1).mm(densify(mat2)) * alpha;
  // Skip the add entirely for beta == 0 so `input` is ignored, per docs.
  if (beta.toComplexDouble() != 0.) {
    out.add_(densify(input), beta);
  }
  return out;
}
90+
91+
void addmm_out_sparse_csr(
92+
const Tensor& input,
93+
const Tensor& mat1,
94+
const Tensor& mat2,
95+
const Scalar& beta,
96+
const Scalar& alpha,
97+
Tensor& result) {
98+
TORCH_INTERNAL_ASSERT(
99+
!((mat1.layout() == kStrided) && (mat2.layout() == kStrided) &&
100+
(result.layout() == kStrided)),
101+
"Expected at least one sparse input");
102+
103+
// Layout checks are nested mat1, mat2, result
104+
// Conditions are ordered strided, csr, csc, bsr, bsc.
105+
// Valid combinations terminate in a return
106+
// Invalid combinations are omitted and will fall though to the TORCH check
107+
// generating an informative error message
108+
109+
if ((mat1.layout() == kSparseBsr) && (mat2.layout() == kStrided) &&
110+
(result.layout() == kStrided)) {
111+
Tensor result_dense = addmm_calculation(input, mat1, mat2, beta, alpha);
112+
result.copy_(result_dense);
113+
return;
114+
}
115+
116+
if ((mat1.layout() == kStrided) && (mat2.layout() == kSparseBsc) &&
117+
(result.layout() == kStrided)) {
118+
Tensor result_dense = addmm_calculation(input, mat1, mat2, beta, alpha);
119+
result.copy_(result_dense);
120+
return;
121+
}
122+
123+
if (mat1.layout() == kStrided) {
124+
if ((mat2.layout() == kSparseCsr) && (result.layout() == kStrided)) {
125+
Tensor result_dense = addmm_calculation(input, mat1, mat2, beta, alpha);
126+
result.copy_(result_dense);
127+
return;
128+
}
129+
if ((mat2.layout() == kSparseCsc) && (result.layout() == kStrided)) {
130+
Tensor result_dense = addmm_calculation(input, mat1, mat2, beta, alpha);
131+
result.copy_(result_dense);
132+
return;
133+
}
134+
}
135+
if (mat1.layout() == kSparseCsr) {
136+
if ((mat2.layout() == kStrided) && (result.layout() == kStrided)) {
137+
Tensor result_dense = addmm_calculation(input, mat1, mat2, beta, alpha);
138+
result.copy_(result_dense);
139+
return;
140+
}
141+
if ((mat2.layout() == kSparseCsr) && (result.layout() == kSparseCsr)) {
142+
Tensor result_dense = addmm_calculation(input, mat1, mat2, beta, alpha);
143+
result = result_dense.to_sparse_csr();
144+
return;
145+
}
146+
if ((mat2.layout() == kSparseCsc) && (result.layout() == kSparseCsr)) {
147+
Tensor result_dense = addmm_calculation(input, mat1, mat2, beta, alpha);
148+
result = result_dense.to_sparse_csr();
149+
return;
150+
}
151+
}
152+
if (mat1.layout() == kSparseCsc) {
153+
if ((mat2.layout() == kStrided) && (result.layout() == kStrided)) {
154+
Tensor result_dense = addmm_calculation(input, mat1, mat2, beta, alpha);
155+
result.copy_(result_dense);
156+
return;
157+
}
158+
if ((mat2.layout() == kSparseCsr) && (result.layout() == kSparseCsr)) {
159+
Tensor result_dense = addmm_calculation(input, mat1, mat2, beta, alpha);
160+
result = result_dense.to_sparse_csr();
161+
return;
162+
}
163+
if (mat2.layout() == kSparseCsc) {
164+
if (result.layout() == kSparseCsr) {
165+
Tensor result_dense = addmm_calculation(input, mat1, mat2, beta, alpha);
166+
result = result_dense.to_sparse_csr();
167+
return;
168+
}
169+
if (result.layout() == kSparseCsc) {
170+
Tensor result_dense = addmm_calculation(input, mat1, mat2, beta, alpha);
171+
result = result_dense.to_sparse_csc();
172+
return;
173+
}
174+
}
175+
}
176+
TORCH_CHECK(
177+
false,
178+
"addmm: computation on XPU is not implemented for ",
179+
result.layout(),
180+
" + ",
181+
mat1.layout(),
182+
" @ ",
183+
mat2.layout());
184+
}
185+
186+
// result = beta * self + alpha * (mat1 @ mat2)
187+
Tensor& addmm_out_sparse_compressed_xpu(
188+
const Tensor& self,
189+
const Tensor& mat1,
190+
const Tensor& mat2,
191+
const Scalar& beta,
192+
const Scalar& alpha,
193+
Tensor& result) {
194+
TORCH_CHECK(
195+
self.is_xpu(),
196+
"Expected all tensors to be on the same device. addmm expected self to be XPU tensor, but got ",
197+
self.device(),
198+
" tensor");
199+
TORCH_CHECK(
200+
mat1.is_xpu(),
201+
"Expected all tensors to be on the same device. addmm expected mat1 to be XPU tensor, but got ",
202+
mat1.device(),
203+
" tensor");
204+
TORCH_CHECK(
205+
mat2.is_xpu(),
206+
"Expected all tensors to be on the same device. addmm expected mat2 to be XPU tensor, but got ",
207+
mat2.device(),
208+
" tensor");
209+
TORCH_CHECK(
210+
result.is_xpu(),
211+
"Expected all tensors to be on the same device. addmm expected result to be XPU tensor, but got ",
212+
result.device(),
213+
" tensor");
214+
215+
// Same checks as in TORCH_META_FUNC(addmm) at
216+
// aten/src/ATen/native/LinearAlgebra.cpp
217+
sparse::impl::_check_dim(mat1, 2, "mat1");
218+
sparse::impl::_check_dim(mat2, 2, "mat2");
219+
220+
TORCH_CHECK(
221+
mat1.size(1) == mat2.size(0),
222+
"mat1 and mat2 shapes cannot be multiplied (",
223+
mat1.size(0),
224+
"x",
225+
mat1.size(1),
226+
" and ",
227+
mat2.sizes()[0],
228+
"x",
229+
mat2.sizes()[1],
230+
")");
231+
232+
c10::MaybeOwned<at::Tensor> self_;
233+
// Don't expand self if this is an in-place operation
234+
if (&result == &self) {
235+
self_ = c10::MaybeOwned<Tensor>::borrowed(self);
236+
} else {
237+
self_ = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm");
238+
}
239+
240+
sparse::impl::_check_dim(*self_, 2, "self");
241+
TORCH_CHECK(
242+
((self_->dim() == 2) && (self_->size(0) == mat1.size(0)) &&
243+
(self_->size(1) == mat2.size(1))),
244+
"The input tensor must be a matrix with size ",
245+
mat1.size(0),
246+
"x",
247+
mat2.size(1),
248+
", but got a ",
249+
self_->dim(),
250+
"-D tensor with size ",
251+
self_->size(0),
252+
"x",
253+
self_->size(1));
254+
255+
if (!result.is_same(self)) {
256+
if (result.layout() == kStrided) {
257+
at::native::resize_output(result, self_->sizes());
258+
} else {
259+
result.resize_as_sparse_(*self_);
260+
}
261+
}
262+
263+
if (result.numel() == 0) {
264+
return result;
265+
}
266+
267+
if (sparse::impl::_is_sparse_and_zero(mat1) ||
268+
sparse::impl::_is_sparse_and_zero(mat2)) {
269+
// According to docs, when beta==0 values in self should be ignored.
270+
// nans and infs should not propagate
271+
const auto beta_val = beta.toComplexDouble();
272+
if (beta_val == 0.) {
273+
result.zero_();
274+
} else {
275+
if (!result.is_same(self)) {
276+
result.copy_(*self_);
277+
}
278+
if (beta_val != 1.) {
279+
result.mul_(beta);
280+
}
281+
}
282+
return result;
283+
}
284+
285+
addmm_out_sparse_csr(*self_, mat1, mat2, beta, alpha, result);
286+
return result;
287+
}
288+
289+
// If a sparse-compressed tensor stores fewer batch dimensions in its indices
// and values than its logical sizes() imply, expand the indices/values so
// every logical batch dimension is materialized, and rebuild the tensor.
// Tensors whose indices already carry all batch dimensions are rebuilt as-is.
Tensor expand_batch_if_necessary(const Tensor& mat) {
  auto [compressed_indices, plain_indices] =
      sparse_csr::getCompressedPlainIndices(mat);
  Tensor values = mat.values();

  const auto stored_batch_ndim = sparse_csr::numBatchDimensions(mat);
  const auto logical_batch_ndim = mat.sizes().size() - 2;

  if (stored_batch_ndim < logical_batch_ndim) {
    // Leading batch dims present in sizes() but absent from the indices.
    const auto sizes = mat.sizes();
    const std::vector<int64_t> missing_batch(
        sizes.begin(),
        sizes.begin() + (logical_batch_ndim - stored_batch_ndim));

    // Prepend the missing batch dims to a tensor's shape and expand into them.
    const auto prepend_and_expand = [&missing_batch](const Tensor& t) {
      auto expanded_shape = t.sizes().vec();
      expanded_shape.insert(
          expanded_shape.begin(), missing_batch.begin(), missing_batch.end());
      return t.expand(expanded_shape);
    };

    compressed_indices = prepend_and_expand(compressed_indices);
    plain_indices = prepend_and_expand(plain_indices);
    values = prepend_and_expand(values);
  }

  return at::sparse_compressed_tensor(
      compressed_indices, plain_indices, values, mat.sizes(), mat.options());
}
324+
325+
// result = beta * self + alpha * (mat1 @ mat2), batched, with mat1 in CSR
// layout and self/mat2/result strided. Densifies mat1 (after batch
// expansion) and defers to the dense baddbmm kernel.
Tensor& baddbmm_out_sparse_csr_xpu(
    const Tensor& self,
    const Tensor& mat1,
    const Tensor& mat2,
    const Scalar& beta,
    const Scalar& alpha,
    Tensor& result) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat1.is_sparse_csr());

  TORCH_CHECK(
      self.layout() == kStrided,
      "torch.baddbmm: Expected self to be strided, but got layout ",
      self.layout());
  TORCH_CHECK(
      mat2.layout() == kStrided,
      "torch.baddbmm: Expect mat2 to be strided, but got ",
      mat2.layout());
  TORCH_CHECK(
      result.layout() == kStrided,
      "torch.baddbmm: Expect result to be strided, but got ",
      result.layout());

  const bool in_place = result.is_same(self);
  if (!in_place) {
    at::native::resize_output(result, self.sizes());
  }

  if (mat1._nnz() == 0) {
    // mat1 @ mat2 is all zeros, leaving only the beta * self term.
    // According to docs, when beta==0 values in self should be ignored:
    // nans and infs should not propagate.
    const auto beta_val = beta.toComplexDouble();
    if (beta_val == 0.) {
      result.zero_();
    } else {
      if (!in_place) {
        result.copy_(self);
      }
      if (beta_val != 1.) {
        result.mul_(beta);
      }
    }
    return result;
  }

  // Broadcast batch of sparse indices and values if not compatible with
  // sizes before to_dense(). to_dense issue:
  // https://github.com/intel/torch-xpu-ops/issues/2801
  const Tensor mat1_expanded = expand_batch_if_necessary(mat1);

  at::baddbmm_out(result, self, mat1_expanded.to_dense(), mat2, beta, alpha);
  return result;
}
375+
376+
// Batched matrix multiply for a sparse-CSR mat1: implemented as baddbmm with
// beta == 0 and alpha == 1. `result` also serves as the (ignored, since
// beta == 0) self operand so no extra tensor is materialized.
Tensor& bmm_out_sparse_csr_xpu(
    const Tensor& mat1,
    const Tensor& mat2,
    Tensor& result) {
  return at::native::baddbmm_out_sparse_csr_xpu(
      result, mat1, mat2, Scalar(0.0), Scalar(1.0), result);
}
385+
65386
Tensor& add_out_sparse_compressed_xpu(
66387
const Tensor& self,
67388
const SparseCsrTensor& other,

test/xpu/test_sparse_csr_xpu.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@
4646
skipCUDAIfNoSparseGeneric,
4747
skipCUDAIfRocm,
4848
skipMeta,
49+
tol,
50+
toleranceOverride,
4951
)
5052
from torch.testing._internal.common_dtype import (
5153
all_types_and_complex,
@@ -2142,6 +2144,7 @@ def test_csr_matvec(self, device, dtype):
21422144

21432145
@onlyOn(["cuda", "xpu"])
21442146
@dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128)
2147+
@precisionOverride({torch.float64: 2e-6})
21452148
def test_baddbmm(self, device, dtype):
21462149
# TODO: disable the invariant checks within torch.baddbmm that
21472150
# constructs unconventional csr tensors leading to
@@ -2803,7 +2806,9 @@ def test_shape(d1, d2, d3, nnz, transposed, index_dtype):
28032806
)
28042807
)
28052808
@dtypesIfXPU(*floating_and_complex_types_and(torch.half, torch.bfloat16))
2806-
@precisionOverride({torch.bfloat16: 3.5e-2, torch.float16: 1e-2})
2809+
@precisionOverride(
2810+
{torch.bfloat16: 3.5e-2, torch.float16: 1e-2, torch.float64: 2e-6}
2811+
)
28072812
def test_sparse_addmm(self, device, dtype):
28082813
def test_shape(m, n, p, nnz, broadcast, index_dtype, alpha_beta=None):
28092814
if alpha_beta is None:
@@ -2845,6 +2850,7 @@ def test_shape(m, n, p, nnz, broadcast, index_dtype, alpha_beta=None):
28452850
torch.cdouble: 1e-8,
28462851
}
28472852
)
2853+
@toleranceOverride({torch.double: tol(atol=2e-6, rtol=1e-6)})
28482854
@dtypesIfCUDA(
28492855
*floating_types_and(
28502856
torch.complex64,
@@ -2987,6 +2993,7 @@ def maybe_transpose(cond, m):
29872993
torch.cdouble: 1e-8,
29882994
}
29892995
)
2996+
@toleranceOverride({torch.double: tol(atol=2e-6, rtol=1e-6)})
29902997
def test_addmm_sizes_all_sparse_csr(self, device, dtype, m, n, k):
29912998
M = torch.randn(n, m, device=device).to(dtype)
29922999
m1 = torch.randn(n, k, device=device).to(dtype)

0 commit comments

Comments
 (0)