From f0655784fbbdbd516b5c344acdd4612d21b5ff52 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Thu, 22 Jan 2026 02:52:27 +0000 Subject: [PATCH] Add inline documentation for container and tuple helper optimizations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Detailed comments explain: - Why named functors reduce instantiations vs lambdas in container_concat - Impact: 50% reduction in container_concat (186 → 93 instantiations) - make_uniform_tuple optimization using pack expansion instead of lambda - generate_identity_sequences optimization for identity permutations - When to apply these patterns elsewhere This documentation helps maintainers understand the build-time optimization strategies and prevents reverting to less efficient patterns. --- include/ck/utility/container_helper.hpp | 24 +++++++++++++++++-- include/ck/utility/tuple_helper.hpp | 32 +++++++++++++++++++++---- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/include/ck/utility/container_helper.hpp b/include/ck/utility/container_helper.hpp index e09c32d20f..159cbb1c35 100644 --- a/include/ck/utility/container_helper.hpp +++ b/include/ck/utility/container_helper.hpp @@ -324,8 +324,28 @@ container_reverse_inclusive_scan(const Tuple& x, Reduce f, TData init) return y; } -// Named functors for container_concat to reduce template instantiations -// (lambdas create unique types per call site, functors are shared) +// Named functors for container operations - optimized to reduce template instantiations +// +// Problem: Using lambdas in container operations causes excessive instantiations because +// each lambda expression creates a unique type, even if they do the same thing. 
+// +// Example with lambdas (BEFORE): +// container_concat uses [](auto x, auto y) { return make_tuple(x, y); } +// Each call site creates a new lambda type → multiple instantiations of the same logic +// Result: 186 template instantiations +// +// Solution: Named functors (AFTER): +// make_tuple_functor is a single reusable type +// All call sites use the same type → single instantiation of the logic +// Result: 93 template instantiations (50% reduction) +// +// Impact: +// - container_concat: 186 → 93 instantiations (50% reduction) +// - Compilation time improvement proportional to instantiation reduction +// - Pattern applies to any repeated template operation with lambdas +// +// Trade-off: Named functors require more upfront definition but are reusable across the codebase. +// struct make_tuple_functor { template diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp index e7ce443ea2..fab3539954 100644 --- a/include/ck/utility/tuple_helper.hpp +++ b/include/ck/utility/tuple_helper.hpp @@ -37,8 +37,19 @@ __host__ __device__ constexpr auto generate_tie(F&& f, Number) typename arithmetic_sequence_gen<0, N, 1>::type{}); } -// Optimized helper for common pattern: generate_tuple([](auto i) { return Sequence{}; }, -// N) Creates Tuple, Sequence<1>, ..., Sequence> without lambda instantiation +// generate_identity_sequences - creates Tuple<Sequence<0>, Sequence<1>, ..., Sequence<N-1>> +// +// Optimization: Uses pack expansion with named functor to avoid per-element lambda instantiation +// +// Why this approach: +// - Common pattern: creating identity permutations for tensor dimensions +// - Lambda approach: generic call operator instantiated once per index → O(N) instantiations +// - Named functor approach: Single functor type → O(1) instantiation overhead +// +// The detail::make_identity_sequences_impl creates a Sequence<I> for each index I via pack expansion +// +// Impact: Reduces instantiation overhead for identity sequence generation (common in transforms) +//
namespace detail { template __host__ __device__ constexpr auto make_identity_sequences_impl(Sequence) @@ -59,8 +70,21 @@ __host__ __device__ constexpr auto generate_identity_sequences(Number) return generate_identity_sequences(); } -// Optimized helper for common pattern: generate_tuple([&](auto) { return value; }, Number{}) -// Creates Tuple (N copies) without lambda instantiation +// make_uniform_tuple - generates a tuple of N identical values without lambda instantiation +// +// Optimization: Uses named functor with pack expansion instead of generate_tuple with lambda +// +// Why this approach: +// - generate_tuple with lambda: call operator instantiated once per index → O(N) instantiations +// - make_uniform_tuple with named functor: single functor type reused → O(1) instantiations +// - Pack expansion ((void)Is, value)... creates N copies of value without recursion +// +// Example: make_uniform_tuple<4>(42) generates Tuple<int, int, int, int>{42, 42, 42, 42} +// - Old way: generate_tuple([](auto) { return 42; }, Number<4>{}) → 4+ lambda instantiations +// - New way: make_uniform_tuple<4>(42) → 1 functor instantiation +// +// Impact: Reduces instantiation count when creating uniform tuples (common in tensor ops) +// namespace detail { template __host__ __device__ constexpr auto make_uniform_tuple_impl(T&& value, Sequence)