From f0655784fbbdbd516b5c344acdd4612d21b5ff52 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 22 Jan 2026 02:52:27 +0000
Subject: [PATCH] Add inline documentation for container and tuple helper
 optimizations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Detailed comments explain:
- Why named functors reduce instantiations vs lambdas in container_concat
- Impact: 50% reduction in container_concat (186 → 93 instantiations)
- make_uniform_tuple optimization using pack expansion instead of lambda
- generate_identity_sequences optimization for identity permutations
- When to apply these patterns elsewhere

This documentation helps maintainers understand the build-time optimization
strategies and prevents reverting to less efficient patterns.
---
 include/ck/utility/container_helper.hpp | 24 +++++++++++++++++--
 include/ck/utility/tuple_helper.hpp     | 32 +++++++++++++++++++++----
 2 files changed, 50 insertions(+), 6 deletions(-)
diff --git a/include/ck/utility/container_helper.hpp b/include/ck/utility/container_helper.hpp
index e09c32d20f..159cbb1c35 100644
--- a/include/ck/utility/container_helper.hpp
+++ b/include/ck/utility/container_helper.hpp
@@ -324,8 +324,28 @@ container_reverse_inclusive_scan(const Tuple<Xs...>& x, Reduce f, TData init)
     return y;
 }
 
-// Named functors for container_concat to reduce template instantiations
-// (lambdas create unique types per call site, functors are shared)
+// Named functors for container operations - optimized to reduce template instantiations
+//
+// Problem: Using lambdas in container operations causes excessive instantiations because
+// each lambda expression has its own unique closure type, even when two lambdas are identical.
+//
+// Example with lambdas (BEFORE):
+//   container_concat uses [](auto x, auto y) { return make_tuple(x, y); }
+//   Each call site creates a new lambda type → multiple instantiations of the same logic
+//   Result: 186 template instantiations
+//
+// Solution: Named functors (AFTER):
+//   make_tuple_functor is a single reusable type
+//   All call sites use the same type → single instantiation of the logic
+//   Result: 93 template instantiations (50% reduction)
+//
+// Impact:
+// - container_concat: 186 → 93 instantiations (50% reduction)
+// - Compile time is expected to improve roughly in proportion to the instantiation reduction
+// - Pattern applies to any repeated template operation with lambdas
+//
+// Trade-off: Named functors require more upfront definition but are reusable across the codebase.
+//
 struct make_tuple_functor
 {
     template <typename... Ts>
diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp
index e7ce443ea2..fab3539954 100644
--- a/include/ck/utility/tuple_helper.hpp
+++ b/include/ck/utility/tuple_helper.hpp
@@ -37,8 +37,19 @@ __host__ __device__ constexpr auto generate_tie(F&& f, Number<N>)
                   typename arithmetic_sequence_gen<0, N, 1>::type{});
 }
 
-// Optimized helper for common pattern: generate_tuple([](auto i) { return Sequence<i.value>{}; },
-// N) Creates Tuple<Sequence<0>, Sequence<1>, ..., Sequence<N-1>> without lambda instantiation
+// generate_identity_sequences - creates Tuple<Sequence<0>, Sequence<1>, ..., Sequence<N-1>>
+//
+// Optimization: Uses pack expansion in a named helper to avoid per-element lambda instantiation
+//
+// Why this approach:
+// - Common pattern: creating identity permutations for tensor dimensions
+// - Lambda approach: generic operator() instantiated once per index → O(N) instantiations
+// - Named helper approach: one function template instantiation → O(1) instantiation overhead
+//
+// The detail::make_identity_sequences_impl creates a Sequence<I> for each index I via pack expansion
+//
+// Impact: Reduces instantiation overhead for identity sequence generation (common in transforms)
+//
 namespace detail {
 template <index_t... Is>
 __host__ __device__ constexpr auto make_identity_sequences_impl(Sequence<Is...>)
@@ -59,8 +70,21 @@ __host__ __device__ constexpr auto generate_identity_sequences(Number<N>)
     return generate_identity_sequences<N>();
 }
 
-// Optimized helper for common pattern: generate_tuple([&](auto) { return value; }, Number<N>{})
-// Creates Tuple<T, T, ..., T> (N copies) without lambda instantiation
+// make_uniform_tuple - generates a tuple of N identical values without lambda instantiation
+//
+// Optimization: Uses a named helper with pack expansion instead of generate_tuple with lambda
+//
+// Why this approach:
+// - generate_tuple with lambda: operator() is instantiated once per index → O(N) instantiations
+// - make_uniform_tuple with named helper: single function template reused → O(1) instantiations
+// - Pack expansion ((void)Is, Value)... creates N copies of Value without recursion
+//
+// Example: make_uniform_tuple<4>(42) generates a 4-element Tuple holding (42, 42, 42, 42)
+// - Old way: generate_tuple([](auto) { return 42; }, Number<4>{}) → 4+ lambda instantiations
+// - New way: make_uniform_tuple<4>(42) → 1 helper instantiation
+//
+// Impact: Reduces instantiation count when creating uniform tuples (common in tensor ops)
+//
 namespace detail {
 template <typename T, index_t... Is>
 __host__ __device__ constexpr auto make_uniform_tuple_impl(T&& value, Sequence<Is...>)