From fd6c05e9d72551985291658f99f55114a31c4a29 Mon Sep 17 00:00:00 2001 From: Christopher Millette <63608002+cgmillette@users.noreply.github.com> Date: Fri, 6 Mar 2026 09:27:27 -0700 Subject: [PATCH] [CK] Replace tuple value construction with tuple_element_t type extraction [1A] (#5030) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary ### Rationale CK's device operation instance registration uses `add_device_operation_instances` at ~1,850 call sites to register GPU kernel configurations. The existing implementation constructs `std::tuple` values just to extract their types via `decltype`, then copy-constructs each instance into `make_unique`. This is wasteful — only the types matter, not the values — and forces the compiler to instantiate the full `std::tuple` constructor and `std::get` machinery at every call site. ### What changed - Replace `remove_cvref_t(tuple_obj))>` with `std::tuple_element_t`, which extracts the type directly without constructing any values - Replace copy-from-default `make_unique(value)` with direct default construction `make_unique()` — all CK device operation instances are stateless structs with configuration encoded in template parameters - Add `static_assert(std::is_default_constructible_v)` to enforce this contract at compile time with a clear error message - Add Doxygen documentation for this high-traffic public API ### Value - Eliminates unnecessary template instantiation of `std::tuple` constructors and `std::get` across ~1,850 call sites - Establishes a cleaner, more intention-revealing pattern for type-only tuple usage - The `static_assert` prevents silent breakage if a non-default-constructible type is ever added - No runtime behavior change — zero risk ### Files changed (9) - `add_device_operation_instance.hpp`: Core pattern change - 3 example files, 3 reduce instance headers, 1 convolution header, 1 profiler header ## Test plan - [ ] Existing CI tests cover all ~1,850 call sites (GEMM, reduce, softmax, convolution) - [ ] `static_assert` provides compile-time validation stronger than runtime tests - [ ] No runtime behavior change — stateless struct default construction is identical to copy-from-default - [ ] Compatible with both `std::tuple` and `ck::type_list` containers 🤖 Generated with [Claude Code](https://claude.com/claude-code) ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. --------- Co-authored-by: Claude Opus 4.6 Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- example/12_reduce/reduce_blockwise.cpp | 4 +--- .../reduce_multiblock_atomic_add.cpp | 4 +--- .../12_reduce/reduce_threadwise_multi_d.cpp | 4 +--- .../add_device_operation_instance.hpp | 21 +++++++++++++------ ..._bwd_wei_exp_device_operation_instance.hpp | 6 ++++-- .../device_reduce_instance_blockwise.hpp | 10 ++++----- ..._reduce_instance_multiblock_atomic_add.hpp | 11 +++++----- .../device_reduce_instance_threadwise.hpp | 5 ++--- .../include/profiler/profile_reduce_impl.hpp | 4 +--- 9 files changed, 35 insertions(+), 34 deletions(-) diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index f8299028da..031b6ae979 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -129,13 +129,11 @@ bool reduce_blockwise_test(bool do_verification, bool matched = false; int result = 0; - const auto tuple_object = reduce_shape_instances{}; - static_for<0, std::tuple_size::value, 1>{}([&](auto i) { if(matched) return; - using ShapeType = remove_cvref_t(tuple_object))>; + using ShapeType = std::tuple_element_t; if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size()) return; diff --git a/example/12_reduce/reduce_multiblock_atomic_add.cpp b/example/12_reduce/reduce_multiblock_atomic_add.cpp index 66fc2bb582..0f527ad68f 100644 --- a/example/12_reduce/reduce_multiblock_atomic_add.cpp +++ b/example/12_reduce/reduce_multiblock_atomic_add.cpp @@ -127,13 +127,11 @@ bool reduce_multiblock_atomic_add_test(bool do_verification, bool matched = false; int result = 0; - const auto tuple_object = reduce_shape_instances{}; - static_for<0, std::tuple_size::value, 1>{}([&](auto i) { if(matched) return; - using ShapeType = remove_cvref_t(tuple_object))>; + using ShapeType = std::tuple_element_t; if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size()) return; diff --git a/example/12_reduce/reduce_threadwise_multi_d.cpp b/example/12_reduce/reduce_threadwise_multi_d.cpp index ee06395771..df50242103 100644 --- a/example/12_reduce/reduce_threadwise_multi_d.cpp +++ b/example/12_reduce/reduce_threadwise_multi_d.cpp @@ -129,13 +129,11 @@ bool reduce_threadwise_multi_d_test(bool do_verification, bool matched = false; int result = 0; - const auto tuple_object = reduce_shape_instances{}; - static_for<0, std::tuple_size::value, 1>{}([&](auto i) { if(matched) return; - using ShapeType = remove_cvref_t(tuple_object))>; + using ShapeType = std::tuple_element_t; if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size()) return; diff --git a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp index 3a88358341..d0520a9342 100644 --- a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp @@ -14,14 +14,18 @@ namespace tensor_operation { namespace device { namespace instance { +/** + * @brief Register device operation instances from a type container. + * @tparam BaseOp The base class that all operation instances must derive from. + * @tparam NewOpInstances A std::tuple (or ck::type_list) of device operation types. + * Only the type is used; the parameter value is unused (retained for type deduction). + */ template void add_device_operation_instances(std::vector>& op_instances, - const NewOpInstances& new_op_instances) + const NewOpInstances& /*new_op_instances*/) { ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) { - const auto new_op_instance = std::get(new_op_instances); - - using NewOpInstance = remove_cvref_t; + using NewOpInstance = std::tuple_element_t; if constexpr(std::is_same_v) { return; // We can use nullptr_t to enable trailing comma @@ -29,8 +33,13 @@ void add_device_operation_instances(std::vector>& op_ins else { static_assert(std::is_base_of_v, - "wrong! NewOpInstance should be derived from BaseOp"); - op_instances.push_back(std::make_unique(new_op_instance)); + "add_device_operation_instances: NewOpInstance must derive from BaseOp"); + static_assert( + std::is_default_constructible_v, + "add_device_operation_instances: NewOpInstance must be default-constructible; " + "registration default-constructs instances and ignores tuple values, so store " + "configuration in template parameters instead of constructor arguments."); + op_instances.push_back(std::make_unique()); } }); } diff --git a/library/include/ck/library/tensor_operation_instance/add_grouped_conv_bwd_wei_exp_device_operation_instance.hpp b/library/include/ck/library/tensor_operation_instance/add_grouped_conv_bwd_wei_exp_device_operation_instance.hpp index 594c9ca5a7..3bdf25778d 100644 --- a/library/include/ck/library/tensor_operation_instance/add_grouped_conv_bwd_wei_exp_device_operation_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/add_grouped_conv_bwd_wei_exp_device_operation_instance.hpp @@ -45,9 +45,11 @@ void add_explicit_gemm_device_operation_instances( DeviceGemmOp>; static_assert(std::is_base_of_v, - "wrong! NewOpInstance should be derived from BaseOp"); + "NewOpInstance must derive from BaseOp"); + static_assert(std::is_default_constructible_v, + "NewOpInstance must be default-constructible"); - op_instances.push_back(std::make_unique(NewOpInstance{})); + op_instances.push_back(std::make_unique()); }); } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp index 1502d905b9..9917f81f1c 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp @@ -89,13 +89,12 @@ void add_device_reduce_instance_blockwise( { static_for<0, std::tuple_size::value, 1>{}( [&](auto i) { - using cfg1 = remove_cvref_t( - reduce_configuration_1_instances_blockwise{}))>; + using cfg1 = std::tuple_element_t; static_for<0, std::tuple_size::value, 1>{}( [&](auto j) { - using cfg2 = remove_cvref_t( - reduce_configuration_2_instances_blockwise{}))>; + using cfg2 = + std::tuple_element_t; using ReduceOpInstance = DeviceReduceMultiBlock; - device_op_instances.push_back( - std::make_unique(ReduceOpInstance{})); + device_op_instances.push_back(std::make_unique()); }); }); }; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp index 76f58782c5..5fa81b3c73 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp @@ -90,14 +90,15 @@ void add_device_reduce_instance_multiblock_atomic_add( static_for<0, std::tuple_size::value, 1>{}([&](auto i) { - using cfg1 = remove_cvref_t( - reduce_configuration_1_instances_multiblock_atomic_add{}))>; + using cfg1 = + std::tuple_element_t; static_for<0, std::tuple_size::value, 1>{}([&](auto j) { - using cfg2 = remove_cvref_t( - reduce_configuration_2_instances_multiblock_atomic_add{}))>; + using cfg2 = + std::tuple_element_t; using ReduceOpInstance = DeviceReduceMultiBlock; - device_op_instances.push_back(std::make_unique(ReduceOpInstance{})); + device_op_instances.push_back(std::make_unique()); }); }); }; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp index 250f266507..753b86e98a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp @@ -77,8 +77,7 @@ void add_device_reduce_instance_threadwise( static_for<0, std::tuple_size::value, 1>{}( [&](auto j) { - using cfg2 = remove_cvref_t( - reduce_configuration_2_instances_threadwise{}))>; + using cfg2 = std::tuple_element_t; using ReduceOpInstance = DeviceReduceThreadWise; - device_op_instances.push_back(std::make_unique(ReduceOpInstance{})); + device_op_instances.push_back(std::make_unique()); }); }; diff --git a/profiler/include/profiler/profile_reduce_impl.hpp b/profiler/include/profiler/profile_reduce_impl.hpp index 191c57780c..4a32505be7 100644 --- a/profiler/include/profiler/profile_reduce_impl.hpp +++ b/profiler/include/profiler/profile_reduce_impl.hpp @@ -488,13 +488,11 @@ bool profile_reduce_impl(bool do_verification, using tuple_of_description_instances = tensor_operation::device::instance::reduce_description_instances; - const auto tuple_object = tuple_of_description_instances{}; - static_for<0, std::tuple_size::value, 1>{}([&](auto i) { if(matched) return; - using descType = remove_cvref_t(tuple_object))>; + using descType = std::tuple_element_t; if(!description_match( descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex))