[Feat] add cudaMemcpy2DAsync wrapper (#2674)

rhdong · web-flow · commit a3f2d986d762 · 2025-05-20T14:30:27.000Z
Authors: - rhdong (https://github.com/rhdong) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: #2674
diff --git a/cpp/include/raft/util/cudart_utils.hpp b/cpp/include/raft/util/cudart_utils.hpp
@@ -148,6 +148,42 @@ void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream)
   RAFT_CUDA_TRY(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
 }
 
+/**
+ * @brief Asynchronous pitched (2D) copy of a matrix or sub-matrix
+ *
+ * Thin wrapper over cudaMemcpy2DAsync that takes every extent in units of elements
+ * instead of bytes. Rows of the source and of the destination may be followed by
+ * padding, so the routine suits pitched allocations as well as sub-matrix copies.
+ *
+ * @tparam Type       Element type; sizeof(Type) scales the pitches and the row width
+ * @param dst         Address of the first destination row
+ * @param dst_pitch   Distance, in elements, between the starts of consecutive destination rows
+ * @param src         Address of the first source row
+ * @param src_pitch   Distance, in elements, between the starts of consecutive source rows
+ * @param width       Elements copied from each row
+ * @param height      Count of rows copied
+ * @param stream      CUDA stream on which the copy is enqueued; no synchronization is done here
+ */
+template <typename Type>
+void copy_matrix(Type* dst,
+                 size_t dst_pitch,
+                 const Type* src,
+                 size_t src_pitch,
+                 size_t width,
+                 size_t height,
+                 rmm::cuda_stream_view stream)
+{
+  // cudaMemcpy2DAsync expects pitches and row width in bytes, so scale the element counts.
+  const size_t dst_pitch_bytes = dst_pitch * sizeof(Type);
+  const size_t src_pitch_bytes = src_pitch * sizeof(Type);
+  const size_t row_bytes       = width * sizeof(Type);
+
+  // The transfer kind passed is cudaMemcpyDefault, and the call's return status is
+  // handed to RAFT_CUDA_TRY.
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(
+    dst, dst_pitch_bytes, src, src_pitch_bytes, row_bytes, height, cudaMemcpyDefault, stream));
+}
+
 /**
  * @defgroup Copy Copy methods
  * These are here along with the generic 'copy' method in order to improve
diff --git a/cpp/tests/util/cudart_utils.cpp b/cpp/tests/util/cudart_utils.cpp
@@ -18,6 +18,7 @@
 #include <raft/core/resources.hpp>
 #include <raft/util/cudart_utils.hpp>
 
+#include <rmm/cuda_stream_pool.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <gtest/gtest.h>
@@ -99,4 +100,51 @@ TEST(Raft, GetDeviceForAddress)
   ASSERT_EQ(0, raft::get_device_for_address(d.data()));
 }
 
+TEST(Raft, Copy2DAsync)
+{
+  using DType = float;
+
+  constexpr size_t rows    = 4;
+  constexpr size_t cols    = 5;
+  constexpr size_t pitch   = 8;  // row stride in elements; exceeds cols so rows carry padding
+  constexpr size_t width   = cols;
+  constexpr size_t height  = rows;
+  constexpr size_t n_elems = rows * pitch;
+
+  rmm::cuda_stream_pool pool{1};
+  auto stream = pool.get_stream();
+
+  // device_uvector is sized in elements, not bytes.
+  rmm::device_uvector<DType> d_src(n_elems, stream);
+  rmm::device_uvector<DType> d_dst(n_elems, stream);
+
+  std::vector<DType> h_src(n_elems, -1.0f);
+  std::vector<DType> h_dst(n_elems, 0.0f);
+  std::vector<DType> h_dst_baseline(n_elems, 0.0f);
+
+  // Every source element gets a unique value; only the first `width` columns of each row are
+  // expected to reach the destination, the padding columns must keep their initial zeros.
+  for (size_t r = 0; r < rows; ++r) {
+    for (size_t c = 0; c < pitch; ++c) {
+      h_src[r * pitch + c] = static_cast<DType>(r * pitch + c);
+      if (c < width) { h_dst_baseline[r * pitch + c] = h_src[r * pitch + c]; }
+    }
+  }
+
+  // Keep every transfer on `stream` so the 2D copy is ordered against the uploads and the
+  // download without relying on implicit default-stream synchronization.
+  raft::copy(d_src.data(), h_src.data(), n_elems, stream);
+  raft::copy(d_dst.data(), h_dst.data(), n_elems, stream);
+  raft::copy_matrix(d_dst.data(), pitch, d_src.data(), pitch, width, height, stream);
+  raft::copy(h_dst.data(), d_dst.data(), n_elems, stream);
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+
+  for (size_t r = 0; r < rows; ++r) {
+    for (size_t c = 0; c < pitch; ++c) {
+      ASSERT_EQ(h_dst[r * pitch + c], h_dst_baseline[r * pitch + c])
+        << "Mismatch at row " << r << " col " << c;
+    }
+  }
+}
+
 }  // namespace raft