mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
[ck_tile] refactor reduce kernel (#3257)
* refactor reduce kernel
  - Rename the Reduce kernel as per convention.
  - Move kept_dim and reduce_dims from runtime to compile-time parameters.
  - Update the Reduce2dProblem template to include KeptDim, ReduceDims, and Rank.
  - Remove the IsSupportedArgument validation function, as it is unnecessary: GuaranteedLastDimensionVectorStride is no longer used when making the tensor view or descriptor, which removes the bounds enforced earlier. The vector size is still calculated and used.
  - Update the reduce example to demonstrate NCHW->NHW reduction with non-contiguous support.
  - Update tests.
  The kernel now handles both contiguous and non-contiguous memory layouts.
* fix compile errors
This commit is contained in:
committed by
GitHub
parent
92653168c2
commit
ea10a78203
@@ -286,7 +286,6 @@ template <typename CDataType,
|
||||
typename ELayout = ck_tile::tensor_layout::gemm::RowMajor>
|
||||
float reduce_stage2(const GemmSplitKHostArgs& args, const ck_tile::stream_config& s)
|
||||
{
|
||||
const ck_tile::index_t reduce_dim_size = args.k_batch; // Number of partial results to reduce
|
||||
// Calculate output size based on the final output tensor dimensions
|
||||
const ck_tile::index_t output_size = args.M * args.N;
|
||||
|
||||
@@ -303,27 +302,28 @@ float reduce_stage2(const GemmSplitKHostArgs& args, const ck_tile::stream_config
|
||||
constexpr auto reduce_dims = ck_tile::sequence<0>{}; // Reduce k_batch dimension
|
||||
|
||||
using ReduceOp = ck_tile::ReduceOp::Add;
|
||||
using BlockWarps = ck_tile::sequence<4, 1>;
|
||||
using BlockTile = ck_tile::sequence<128, 128>;
|
||||
using WarpTile = ck_tile::sequence<32, 128>;
|
||||
using ThreadTile = ck_tile::sequence<8, 8>;
|
||||
using BlockWarps = ck_tile::sequence<1, 1>;
|
||||
using BlockTile = ck_tile::sequence<256, 1>;
|
||||
using WarpTile = ck_tile::sequence<256, 1>;
|
||||
using ThreadTile = ck_tile::sequence<1, 1>;
|
||||
|
||||
constexpr ck_tile::index_t kBlockPerCu = 1;
|
||||
|
||||
ck_tile::index_t kGridSize = (output_size + BlockTile::at(ck_tile::number<0>{}) - 1) /
|
||||
BlockTile::at(ck_tile::number<0>{});
|
||||
|
||||
using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
|
||||
using Problem =
|
||||
ck_tile::Reduce2dProblem<CDataType, ComputeDataType, CDataType, Shape, ReduceOp>;
|
||||
using Kernel = ck_tile::Reduce<Problem>;
|
||||
using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
|
||||
using Problem = ck_tile::Reduce2dProblem<CDataType,
|
||||
ComputeDataType,
|
||||
CDataType,
|
||||
Shape,
|
||||
ReduceOp,
|
||||
decltype(kept_dim),
|
||||
decltype(reduce_dims),
|
||||
3>;
|
||||
using Kernel = ck_tile::ReduceKernel<Problem>;
|
||||
const ck_tile::index_t kBlockSize = Kernel::BlockSize();
|
||||
|
||||
if(!Kernel::IsSupportedArgument(reduce_dim_size, workspace_strides))
|
||||
{
|
||||
throw std::runtime_error("Wrong! Reduction arguments not supported!\n");
|
||||
}
|
||||
|
||||
if(s.log_level_ > 0)
|
||||
{
|
||||
std::cout << "Stage 2 - Launching Reduction kernel" << '\n'
|
||||
@@ -343,9 +343,7 @@ float reduce_stage2(const GemmSplitKHostArgs& args, const ck_tile::stream_config
|
||||
static_cast<const CDataType*>(args.e_ptr), // workspace input
|
||||
static_cast<CDataType*>(args.final_output_ptr), // final output
|
||||
workspace_shape,
|
||||
workspace_strides,
|
||||
kept_dim,
|
||||
reduce_dims));
|
||||
workspace_strides));
|
||||
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
@@ -9,14 +9,14 @@
|
||||
auto create_args(int argc, char* argv[])
|
||||
{
|
||||
ck_tile::ArgParser arg_parser;
|
||||
arg_parser.insert("n", "32", "n dimension")
|
||||
.insert("h", "7", "h dimension")
|
||||
.insert("w", "7", "w dimension")
|
||||
.insert("c", "512", "c dimension")
|
||||
arg_parser.insert("n", "16", "n dimension")
|
||||
.insert("h", "64", "h dimension")
|
||||
.insert("w", "32", "w dimension")
|
||||
.insert("c", "960", "c dimension")
|
||||
.insert("v", "1", "cpu validation or not")
|
||||
.insert("prec", "fp16", "precision")
|
||||
.insert("warmup", "5", "cold iter")
|
||||
.insert("repeat", "20", "hot iter")
|
||||
.insert("warmup", "20", "cold iter")
|
||||
.insert("repeat", "100", "hot iter")
|
||||
.insert("json", "0", "0: No Json, 1: Dump Results in Json format")
|
||||
.insert("jsonfile", "reduce.json", "json file name to dump results");
|
||||
|
||||
@@ -47,12 +47,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
|
||||
strides[3] = 1;
|
||||
|
||||
// Define reduction specification:
|
||||
constexpr auto kept_dim = ck_tile::sequence<0, 3>{}; // Which dimension to keep
|
||||
constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce
|
||||
constexpr auto kept_dim = ck_tile::sequence<1, 2, 3>{}; // Which dimension to keep
|
||||
constexpr auto reduce_dims = ck_tile::sequence<0>{}; // Which dimensions to reduce
|
||||
|
||||
ck_tile::HostTensor<XDataType> x_host(problem_shape, strides);
|
||||
ck_tile::HostTensor<YDataType> y_host_ref({N, C}, {C, 1});
|
||||
ck_tile::HostTensor<YDataType> y_host_dev({N, C}, {C, 1});
|
||||
ck_tile::HostTensor<YDataType> y_host_ref({H, W, C}, {W * C, C, 1});
|
||||
ck_tile::HostTensor<YDataType> y_host_dev({H, W, C}, {W * C, C, 1});
|
||||
|
||||
ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);
|
||||
|
||||
@@ -62,40 +62,40 @@ bool run(const ck_tile::ArgParser& arg_parser)
|
||||
x_buf.ToDevice(x_host.data());
|
||||
|
||||
using ReduceOp = ck_tile::ReduceOp::Add;
|
||||
using BlockWarps = ck_tile::sequence<4, 1>;
|
||||
using BlockTile = ck_tile::sequence<128, 128>;
|
||||
using WarpTile = ck_tile::sequence<32, 128>;
|
||||
using Vector = ck_tile::sequence<8, 8>;
|
||||
using BlockWarps = ck_tile::sequence<1, 1>;
|
||||
using BlockTile = ck_tile::sequence<256, 1>;
|
||||
using WarpTile = ck_tile::sequence<256, 1>;
|
||||
using ThreadTile = ck_tile::sequence<1, 1>;
|
||||
|
||||
// cross warp-reduce
|
||||
// using BlockWarps = ck_tile::sequence<2, 2>;
|
||||
// using BlockTile = ck_tile::sequence<2, 1024>;
|
||||
// using WarpTile = ck_tile::sequence<1, 512>;
|
||||
// using Vector = ck_tile::sequence<1, 8>;
|
||||
// using ThreadTile = ck_tile::sequence<1, 8>;
|
||||
|
||||
constexpr ck_tile::index_t kBlockPerCu = 1;
|
||||
ck_tile::index_t kept_dim_len_prod = N * C;
|
||||
ck_tile::index_t kept_dim_len_prod = H * W * C;
|
||||
ck_tile::index_t kGridSize = (kept_dim_len_prod + BlockTile::at(ck_tile::number<0>{}) - 1) /
|
||||
BlockTile::at(ck_tile::number<0>{});
|
||||
std::cout << "grid size " << kGridSize << std::endl;
|
||||
|
||||
using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, Vector>;
|
||||
using Porblem =
|
||||
ck_tile::Reduce2dProblem<XDataType, ComputeDataType, YDataType, Shape, ReduceOp>;
|
||||
using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
|
||||
using Porblem = ck_tile::Reduce2dProblem<XDataType,
|
||||
ComputeDataType,
|
||||
YDataType,
|
||||
Shape,
|
||||
ReduceOp,
|
||||
decltype(kept_dim),
|
||||
decltype(reduce_dims),
|
||||
4>;
|
||||
|
||||
using Kernel = ck_tile::Reduce<Porblem>;
|
||||
using Kernel = ck_tile::ReduceKernel<Porblem>;
|
||||
const ck_tile::index_t kBlockSize = Kernel::BlockSize();
|
||||
// Create input tensor shape and strides
|
||||
auto input_shape =
|
||||
ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
|
||||
auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2], strides[3]);
|
||||
|
||||
if(!Kernel::IsSupportedArgument(
|
||||
C, input_strides)) // output tensor's continuous dimension and input strides
|
||||
{
|
||||
throw std::runtime_error("Wrong! Arguments not supported!\n");
|
||||
}
|
||||
|
||||
float ave_time = launch_kernel(
|
||||
ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
|
||||
ck_tile::make_kernel<kBlockPerCu>(Kernel{},
|
||||
@@ -105,11 +105,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
|
||||
static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
|
||||
static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
|
||||
input_shape,
|
||||
input_strides,
|
||||
kept_dim,
|
||||
reduce_dims));
|
||||
input_strides));
|
||||
|
||||
std::size_t num_btype = sizeof(XDataType) * N * C * H * W + sizeof(YDataType) * N * C;
|
||||
std::size_t num_btype = sizeof(XDataType) * N * H * W * C + sizeof(YDataType) * H * W * C;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
@@ -149,8 +147,8 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
|
||||
}
|
||||
// else if(data_type == "bf16")
|
||||
// {
|
||||
// return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
|
||||
// }
|
||||
else if(data_type == "bf16")
|
||||
{
|
||||
return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename Problem_, typename Policy_ = Reduce2dDefaultPolicy>
|
||||
struct Reduce
|
||||
struct ReduceKernel
|
||||
{
|
||||
using Problem = ck_tile::remove_cvref_t<Problem_>;
|
||||
using Policy = ck_tile::remove_cvref_t<Policy_>;
|
||||
@@ -33,7 +33,7 @@ struct Reduce
|
||||
|
||||
private:
|
||||
// Helper function to calculate optimal vector size for input tensor
|
||||
template <typename InputShape, typename ReduceDims>
|
||||
template <typename ReduceDims, index_t Rank, index_t NumReduceDim>
|
||||
static constexpr index_t CalculateInputVectorSize()
|
||||
{
|
||||
using S = typename Problem::BlockShape;
|
||||
@@ -41,8 +41,8 @@ struct Reduce
|
||||
constexpr index_t thread_tile_vector_size = S::ThreadTile_N;
|
||||
|
||||
// Check if innermost reduce dimension is the last dimension (stride 1).
|
||||
constexpr auto innermost_reduce_dim = ReduceDims{}.at(number<ReduceDims{}.size() - 1>{});
|
||||
constexpr bool is_innermost_contiguous = (innermost_reduce_dim == InputShape{}.size() - 1);
|
||||
constexpr index_t innermost_reduce_dim = ReduceDims::at(number<NumReduceDim - 1>{});
|
||||
constexpr bool is_innermost_contiguous = (innermost_reduce_dim == Rank - 1);
|
||||
|
||||
// If innermost reduce dimension is not the last dim (not contiguous), limit vectorization
|
||||
constexpr index_t stride_based_vector_size =
|
||||
@@ -63,29 +63,28 @@ struct Reduce
|
||||
}
|
||||
|
||||
public:
|
||||
template <typename InputShape, typename InputStrides, typename KeptDim, typename ReduceDims>
|
||||
template <typename InputShape, typename InputStrides>
|
||||
CK_TILE_DEVICE void operator()(const XDataType* p_x,
|
||||
YDataType* p_y,
|
||||
InputShape input_shape,
|
||||
InputStrides input_strides,
|
||||
KeptDim kept_dim,
|
||||
ReduceDims reduce_dims) const
|
||||
InputStrides input_strides) const
|
||||
{
|
||||
using S = typename Problem::BlockShape;
|
||||
const auto iM = get_block_id() * S::Block_M;
|
||||
|
||||
static_assert(kept_dim.size() + reduce_dims.size() == InputShape::size(),
|
||||
static_assert(Problem::KeptDim::size() + Problem::ReduceDims::size() == Problem::Rank,
|
||||
"Size of kept dimensions + reduced dimensions must equal input tensor rank");
|
||||
|
||||
// Extract lengths based on kept and reduced dimensions
|
||||
const auto kept_lens = [&]() {
|
||||
return generate_tuple([&](auto I) { return input_shape.at(number<kept_dim.at(I)>{}); },
|
||||
number<kept_dim.size()>{});
|
||||
return generate_tuple(
|
||||
[&](auto I) { return input_shape.at(number<Problem::KeptDim::at(I)>{}); },
|
||||
number<Problem::KeptDim::size()>{});
|
||||
}();
|
||||
const auto reduce_lens = [&]() {
|
||||
return generate_tuple(
|
||||
[&](auto I) { return input_shape.at(number<reduce_dims.at(I)>{}); },
|
||||
number<reduce_dims.size()>{});
|
||||
[&](auto I) { return input_shape.at(number<Problem::ReduceDims::at(I)>{}); },
|
||||
number<Problem::ReduceDims::size()>{});
|
||||
}();
|
||||
|
||||
const auto kept_merge_transform = make_merge_transform(kept_lens);
|
||||
@@ -96,11 +95,13 @@ struct Reduce
|
||||
type_convert<XDataType>(reduce_func.template GetIdentityValue<ComputeDataType>());
|
||||
|
||||
// Calculate optimal vector size for input tensor
|
||||
constexpr auto x_tensor_vector_size = CalculateInputVectorSize<InputShape, ReduceDims>();
|
||||
constexpr auto x_tensor_vector_size = CalculateInputVectorSize<typename Problem::ReduceDims,
|
||||
Problem::Rank,
|
||||
Problem::NumReduceDim>();
|
||||
|
||||
// Create input tensor view with custom padding value
|
||||
auto desc = make_naive_tensor_descriptor(
|
||||
input_shape, input_strides, number<x_tensor_vector_size>{}, number<1>{});
|
||||
input_shape, input_strides, number<x_tensor_vector_size>{});
|
||||
|
||||
// Create buffer view with custom padding value
|
||||
auto buffer_view = make_buffer_view<address_space_enum::global>(
|
||||
@@ -109,10 +110,11 @@ struct Reduce
|
||||
// Create tensor view with custom padding
|
||||
const auto x_tensor = tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
|
||||
const auto transformed_x_tensor = pad_tensor_view(
|
||||
transform_tensor_view(x_tensor,
|
||||
make_tuple(kept_merge_transform, reduce_merge_transform),
|
||||
make_tuple(kept_dim, reduce_dims),
|
||||
make_tuple(sequence<0>{}, sequence<1>{})),
|
||||
transform_tensor_view(
|
||||
x_tensor,
|
||||
make_tuple(kept_merge_transform, reduce_merge_transform),
|
||||
make_tuple(typename Problem::KeptDim{}, typename Problem::ReduceDims{}),
|
||||
make_tuple(sequence<0>{}, sequence<1>{})),
|
||||
make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
|
||||
sequence<0, 1>{});
|
||||
|
||||
@@ -122,25 +124,25 @@ struct Reduce
|
||||
[&](auto I) {
|
||||
// Calculate stride for dimension I as product of all following dimensions
|
||||
index_t stride = 1;
|
||||
static_for<I + 1, kept_dim.size(), 1>{}(
|
||||
static_for<I + 1, Problem::KeptDim::size(), 1>{}(
|
||||
[&](auto J) { stride *= kept_lens.at(number<J>{}); });
|
||||
return stride;
|
||||
},
|
||||
number<kept_dim.size()>{});
|
||||
number<Problem::KeptDim::size()>{});
|
||||
}();
|
||||
|
||||
// Calculate optimal vector size for output tensor
|
||||
constexpr auto y_tensor_vector_size = CalculateOutputVectorSize();
|
||||
|
||||
const auto y_m = make_naive_tensor_view<address_space_enum::global>(
|
||||
p_y, kept_lens, kept_strides, number<y_tensor_vector_size>{}, number<1>{});
|
||||
p_y, kept_lens, kept_strides, number<y_tensor_vector_size>{});
|
||||
|
||||
// Transform output tensor to 1D merged view
|
||||
// This creates a view compatible with the 2D reduction pattern
|
||||
const auto y_merged = transform_tensor_view(
|
||||
y_m,
|
||||
make_tuple(kept_merge_transform),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, kept_dim.size(), 1>::type{}),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, Problem::KeptDim::size(), 1>::type{}),
|
||||
make_tuple(sequence<0>{}));
|
||||
|
||||
auto x_window = make_tile_window(transformed_x_tensor,
|
||||
@@ -179,49 +181,6 @@ struct Reduce
|
||||
|
||||
store_tile(y_window, cast_tile<YDataType>(y_compute));
|
||||
}
|
||||
|
||||
/// @brief Validates if the given arguments are supported by the 2D reduction kernel.
|
||||
///
|
||||
/// @param y_continous_dim Size of the continuous dimension of the output tensor.
|
||||
/// Must be a multiple of ThreadTile_N for proper thread mapping.
|
||||
///
|
||||
/// @param input_strides The stride configuration of the input tensor.
|
||||
/// The last stride must be 1 to ensure contiguous memory access
|
||||
/// and enable efficient vectorized loads.
|
||||
///
|
||||
/// @return true if the arguments are supported, false otherwise.
|
||||
/// Error messages are logged when CK_TILE_LOGGING is enabled.
|
||||
///
|
||||
/// @note Requirements:
|
||||
/// - y_continous_dim % ThreadTile_N == 0 (for proper thread distribution)
|
||||
/// - input_strides[-1] == 1 (for contiguous memory access)
|
||||
template <typename InputStrides>
|
||||
CK_TILE_HOST static bool IsSupportedArgument(index_t y_continous_dim,
|
||||
InputStrides input_strides)
|
||||
{
|
||||
using S = typename Problem::BlockShape;
|
||||
|
||||
if(y_continous_dim % S::ThreadTile_N != 0)
|
||||
{
|
||||
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
|
||||
{
|
||||
CK_TILE_ERROR("Total reduction size should be a multiple of ThreadTile_N!");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
if(input_strides.at(number<input_strides.size() - 1>{}) != 1)
|
||||
{
|
||||
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
|
||||
{
|
||||
CK_TILE_ERROR(
|
||||
"Input tensor's last stride must be 1 to support correct vector access!");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -12,6 +12,9 @@ template <typename XDataType_,
|
||||
typename YDataType_,
|
||||
typename BlockShape_,
|
||||
typename ReduceOp_,
|
||||
typename KeptDim_,
|
||||
typename ReduceDims_,
|
||||
index_t Rank_,
|
||||
bool OutputIndex_ = false>
|
||||
struct Reduce2dProblem
|
||||
{
|
||||
@@ -20,7 +23,11 @@ struct Reduce2dProblem
|
||||
using YDataType = remove_cvref_t<YDataType_>;
|
||||
using BlockShape = remove_cvref_t<BlockShape_>;
|
||||
using ReduceOp = ReduceOp_;
|
||||
using KeptDim = remove_cvref_t<KeptDim_>;
|
||||
using ReduceDims = remove_cvref_t<ReduceDims_>;
|
||||
|
||||
static constexpr index_t Rank = Rank_;
|
||||
static constexpr index_t NumReduceDim = ReduceDims::size();
|
||||
static constexpr bool kOutputIndex = OutputIndex_;
|
||||
static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
|
||||
static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
|
||||
|
||||
@@ -53,10 +53,16 @@ class TestCkTileReduce : public ::testing::Test
|
||||
d_y_mem.ToDevice(h_y.data()); // Initialize device output buffer
|
||||
|
||||
// Problem and kernel setup
|
||||
using Problem = ck_tile::
|
||||
Reduce2dProblem<XDataType, ComputeDataType, YDataType, TestReduce2dShape, ReduceOpType>;
|
||||
using Problem = ck_tile::Reduce2dProblem<XDataType,
|
||||
ComputeDataType,
|
||||
YDataType,
|
||||
TestReduce2dShape,
|
||||
ReduceOpType,
|
||||
KeptDimSeq,
|
||||
ReduceDimSeq,
|
||||
InputDim>;
|
||||
|
||||
using Kernel = ck_tile::Reduce<Problem>;
|
||||
using Kernel = ck_tile::ReduceKernel<Problem>;
|
||||
|
||||
// Launch configuration
|
||||
const ck_tile::index_t kBlockSize = Kernel::BlockSize();
|
||||
@@ -75,13 +81,6 @@ class TestCkTileReduce : public ::testing::Test
|
||||
auto input_shape_tuple = make_shape_tuple.template operator()<InputDim>(input_shape);
|
||||
auto input_strides_tuple = make_shape_tuple.template operator()<InputDim>(input_strides);
|
||||
|
||||
if(!Kernel::IsSupportedArgument(
|
||||
output_shape[output_shape.size() - 1],
|
||||
input_strides_tuple)) // output tensor's continuous dimension
|
||||
{
|
||||
throw std::runtime_error("Wrong! Arguments not supported!\n");
|
||||
}
|
||||
|
||||
ck_tile::launch_kernel(
|
||||
ck_tile::stream_config{nullptr, false, 0},
|
||||
ck_tile::make_kernel<kBlockPerCu>(Kernel{},
|
||||
@@ -91,9 +90,7 @@ class TestCkTileReduce : public ::testing::Test
|
||||
static_cast<XDataType*>(d_x_mem.GetDeviceBuffer()),
|
||||
static_cast<YDataType*>(d_y_mem.GetDeviceBuffer()),
|
||||
input_shape_tuple,
|
||||
input_strides_tuple,
|
||||
kept_dims,
|
||||
reduce_dims));
|
||||
input_strides_tuple));
|
||||
|
||||
// Get results back
|
||||
d_y_mem.FromDevice(h_y.data());
|
||||
|
||||
Reference in New Issue
Block a user