mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-03-17 13:47:40 +00:00
[CK] Replace tuple value construction with tuple_element_t type extraction [1A] (#5030)

## Summary

### Rationale

CK's device operation instance registration uses `add_device_operation_instances` at ~1,850 call sites to register GPU kernel configurations. The existing implementation constructs `std::tuple` values just to extract their types via `decltype`, then copy-constructs each instance into `make_unique`. This is wasteful — only the types matter, not the values — and forces the compiler to instantiate the full `std::tuple` constructor and `std::get` machinery at every call site.

### What changed

- Replace `remove_cvref_t<decltype(std::get<i>(tuple_obj))>` with `std::tuple_element_t<i.value, TupleType>`, which extracts the type directly without constructing any values
- Replace copy-from-default `make_unique<T>(value)` with direct default construction `make_unique<T>()` — all CK device operation instances are stateless structs with configuration encoded in template parameters
- Add `static_assert(std::is_default_constructible_v<NewOpInstance>)` to enforce this contract at compile time with a clear error message
- Add Doxygen documentation for this high-traffic public API

### Value

- Eliminates unnecessary template instantiation of `std::tuple` constructors and `std::get` across ~1,850 call sites
- Establishes a cleaner, more intention-revealing pattern for type-only tuple usage
- The `static_assert` prevents silent breakage if a non-default-constructible type is ever added
- No runtime behavior change — zero risk

### Files changed (9)

- `add_device_operation_instance.hpp`: Core pattern change
- 3 example files, 3 reduce instance headers, 1 convolution header, 1 profiler header

## Test plan

- [ ] Existing CI tests cover all ~1,850 call sites (GEMM, reduce, softmax, convolution)
- [ ] `static_assert` provides compile-time validation stronger than runtime tests
- [ ] No runtime behavior change — stateless struct default construction is identical to copy-from-default
- [ ] Compatible with both `std::tuple` and `ck::type_list` containers

🤖 Generated with [Claude Code](https://claude.com/claude-code)

## Submission Checklist

- [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
325 lines
12 KiB
C++
325 lines
12 KiB
C++
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#include <iostream>
|
|
#include <initializer_list>
|
|
#include <cstdlib>
|
|
#include <getopt.h>
|
|
|
|
#include "ck/utility/reduction_enums.hpp"
|
|
#include "reduce_blockwise_impl.hpp"
|
|
#include "reduce_example_common.hpp"
|
|
|
|
using namespace ck;
|
|
using namespace ck::tensor_operation::device;
|
|
|
|
// Long command-line options accepted by SimpleAppArgs::processArgs. Each entry maps a long
// option name to the short-option character that getopt_long returns for it, so the long and
// short spellings share one `case` in the parser. The all-zero entry terminates the table.
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                       // Advertised by the usage text as "--reduceDims or -R".
                                       {"reduceDims", required_argument, nullptr, 'R'},
                                       {"verify", required_argument, nullptr, 'v'},
                                       {"help", no_argument, nullptr, '?'},
                                       {nullptr, 0, nullptr, 0}};
|
|
|
|
class SimpleAppArgs
|
|
{
|
|
private:
|
|
int option_index = 0;
|
|
|
|
public:
|
|
std::vector<size_t> inLengths = {16, 64, 32, 960};
|
|
std::vector<int> reduceDims = {0, 1, 2};
|
|
std::vector<float> scales = {1.0f, 0.0f};
|
|
|
|
bool do_verification = true;
|
|
int data_type = 1;
|
|
int init_method = 2;
|
|
bool time_kernel = false;
|
|
|
|
public:
|
|
void show_usage(const char* cmd)
|
|
{
|
|
std::cout << "Usage of " << cmd << std::endl;
|
|
std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
|
|
<< std::endl;
|
|
std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions"
|
|
<< std::endl;
|
|
std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
|
|
"comparing with the host-based reduction"
|
|
<< std::endl;
|
|
std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64, 7: int4)"
|
|
<< std::endl;
|
|
std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer "
|
|
"value, 3=decimal value)"
|
|
<< std::endl;
|
|
std::cout << "Arg3 -- time kernel (0=no, 1=yes)" << std::endl;
|
|
};
|
|
|
|
int processArgs(int argc, char* argv[])
|
|
{
|
|
using ck::host_common::getTypeValuesFromString;
|
|
|
|
int ch;
|
|
|
|
while(1)
|
|
{
|
|
ch = getopt_long(argc, argv, "D:R:v:l:", long_options, &option_index);
|
|
if(ch == -1)
|
|
break;
|
|
switch(ch)
|
|
{
|
|
case 'D':
|
|
if(!optarg)
|
|
throw std::runtime_error("Invalid option format!");
|
|
|
|
inLengths = getTypeValuesFromString<size_t>(optarg);
|
|
break;
|
|
case 'R':
|
|
if(!optarg)
|
|
throw std::runtime_error("Invalid option format!");
|
|
|
|
reduceDims = getTypeValuesFromString<int>(optarg);
|
|
break;
|
|
case 'v':
|
|
if(!optarg)
|
|
throw std::runtime_error("Invalid option format!");
|
|
|
|
do_verification = static_cast<bool>(std::atoi(optarg));
|
|
break;
|
|
case '?':
|
|
if(std::string(long_options[option_index].name) == "help")
|
|
{
|
|
show_usage(argv[0]);
|
|
return (-1);
|
|
};
|
|
break;
|
|
default: show_usage(argv[0]); return (-1);
|
|
};
|
|
};
|
|
|
|
if(optind + 3 > argc)
|
|
{
|
|
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
|
|
};
|
|
|
|
data_type = std::atoi(argv[optind++]);
|
|
init_method = std::atoi(argv[optind++]);
|
|
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
|
|
|
|
if(scales.empty())
|
|
{
|
|
scales.push_back(1.0f);
|
|
scales.push_back(0.0f);
|
|
};
|
|
|
|
return (0);
|
|
};
|
|
};
|
|
|
|
template <typename InOutDataType,
|
|
typename AccDataType,
|
|
ReduceTensorOp ReduceOpId,
|
|
index_t PropagateNan,
|
|
index_t OutputIndex>
|
|
bool reduce_blockwise_test(bool do_verification,
|
|
int init_method,
|
|
bool time_kernel,
|
|
const std::vector<size_t>& inLengths,
|
|
const std::vector<int>& reduceDims,
|
|
float alpha,
|
|
float beta)
|
|
{
|
|
bool matched = false;
|
|
int result = 0;
|
|
|
|
static_for<0, std::tuple_size<reduce_shape_instances>::value, 1>{}([&](auto i) {
|
|
if(matched)
|
|
return;
|
|
|
|
using ShapeType = std::tuple_element_t<i.value, reduce_shape_instances>;
|
|
|
|
if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size())
|
|
return;
|
|
|
|
std::array<int, ShapeType::NumReduceDim_> arrReduceDims;
|
|
|
|
ck::ranges::copy(reduceDims, arrReduceDims.begin());
|
|
|
|
result = reduce_blockwise_impl<InOutDataType,
|
|
AccDataType,
|
|
ReduceOpId,
|
|
ShapeType::Rank_,
|
|
ShapeType::NumReduceDim_,
|
|
PropagateNan,
|
|
OutputIndex>(
|
|
do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta);
|
|
|
|
matched = true;
|
|
});
|
|
|
|
return (result == 0) ? true : false;
|
|
};
|
|
|
|
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG;
|
|
constexpr bool PropagateNan = true;
|
|
constexpr bool OutputIndex = false;
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
bool pass = true;
|
|
|
|
if(argc > 1)
|
|
{
|
|
SimpleAppArgs arg;
|
|
|
|
if(arg.processArgs(argc, argv) < 0)
|
|
return (-1);
|
|
|
|
if(arg.data_type == 0)
|
|
{
|
|
pass = reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
|
|
arg.do_verification,
|
|
arg.init_method,
|
|
arg.time_kernel,
|
|
arg.inLengths,
|
|
arg.reduceDims,
|
|
arg.scales[0],
|
|
arg.scales[1]);
|
|
}
|
|
else if(arg.data_type == 1)
|
|
{
|
|
pass = reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
|
arg.do_verification,
|
|
arg.init_method,
|
|
arg.time_kernel,
|
|
arg.inLengths,
|
|
arg.reduceDims,
|
|
arg.scales[0],
|
|
arg.scales[1]);
|
|
}
|
|
else if(arg.data_type == 3)
|
|
{
|
|
pass = reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
|
|
arg.do_verification,
|
|
arg.init_method,
|
|
arg.time_kernel,
|
|
arg.inLengths,
|
|
arg.reduceDims,
|
|
arg.scales[0],
|
|
arg.scales[1]);
|
|
}
|
|
else if(arg.data_type == 5)
|
|
{
|
|
pass = reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
|
|
arg.do_verification,
|
|
arg.init_method,
|
|
arg.time_kernel,
|
|
arg.inLengths,
|
|
arg.reduceDims,
|
|
arg.scales[0],
|
|
arg.scales[1]);
|
|
}
|
|
else if(arg.data_type == 6)
|
|
{
|
|
pass = reduce_blockwise_test<double, double, ReduceOpId, PropagateNan, OutputIndex>(
|
|
arg.do_verification,
|
|
arg.init_method,
|
|
arg.time_kernel,
|
|
arg.inLengths,
|
|
arg.reduceDims,
|
|
arg.scales[0],
|
|
arg.scales[1]);
|
|
}
|
|
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
|
else if(arg.data_type == 7)
|
|
{
|
|
pass = reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
|
|
arg.do_verification,
|
|
arg.init_method,
|
|
arg.time_kernel,
|
|
arg.inLengths,
|
|
arg.reduceDims,
|
|
arg.scales[0],
|
|
arg.scales[1]);
|
|
|
|
pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
|
|
arg.do_verification,
|
|
arg.init_method,
|
|
arg.time_kernel,
|
|
arg.inLengths,
|
|
arg.reduceDims,
|
|
arg.scales[0],
|
|
arg.scales[1]);
|
|
}
|
|
#endif
|
|
}
|
|
else
|
|
{
|
|
// for testing half_t
|
|
pass =
|
|
pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
|
|
true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
|
|
pass =
|
|
pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
|
|
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
|
|
|
|
// for testing float
|
|
pass =
|
|
pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
|
true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
|
|
|
|
pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
|
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
|
|
|
|
// for testing double
|
|
pass =
|
|
pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
|
true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
|
|
|
|
pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
|
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
|
|
|
|
// for testing bhalf_t
|
|
pass = pass &&
|
|
reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
|
|
true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
|
|
|
|
pass = pass &&
|
|
reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
|
|
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
|
|
|
|
// for testing int8_t
|
|
pass =
|
|
pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
|
|
true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
|
|
|
|
pass =
|
|
pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
|
|
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
|
|
|
|
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
|
// for testing int4_t using AVG operation
|
|
pass =
|
|
pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
|
|
true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
|
|
|
|
pass = pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
|
|
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
|
|
|
|
// for testing int4_t using MAX operation
|
|
pass =
|
|
pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
|
|
true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
|
|
|
|
pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
|
|
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
|
|
#endif
|
|
// for testing 3D input
|
|
pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
|
true, 2, true, {16, 64, 960}, {0, 1}, 1.0f, 0.0f);
|
|
|
|
// for testing 5D input
|
|
pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
|
true, 2, true, {16, 64, 32, 2, 960}, {0, 1, 2, 3}, 1.0f, 0.0f);
|
|
};
|
|
|
|
return (pass ? 0 : 1);
|
|
};
|