diff --git a/include/ck/library/utility/fill.hpp b/include/ck/library/utility/fill.hpp
index 35625d142e..4f421b4282 100644
--- a/include/ck/library/utility/fill.hpp
+++ b/include/ck/library/utility/fill.hpp
@@ -85,6 +85,20 @@ struct FillUniformDistributionIntegerValue
     }
 };
 
+/**
+ * @brief A functor for filling a container with a monotonically increasing or decreasing sequence.
+ *
+ * FillMonotonicSeq generates a sequence of values starting from an initial value
+ * and incrementing by a fixed step for each subsequent element.
+ *
+ * @tparam T The numeric type of the sequence elements.
+ *
+ * Example usage:
+ * ```
+ * std::vector<int> v(5);
+ * FillMonotonicSeq<int>{10, 2}(v); // Fills v with {10, 12, 14, 16, 18}
+ * ```
+ */
 template <typename T>
 struct FillMonotonicSeq
 {
diff --git a/include/ck_tile/host/device_memory.hpp b/include/ck_tile/host/device_memory.hpp
index 13684c0e24..587f38987e 100644
--- a/include/ck_tile/host/device_memory.hpp
+++ b/include/ck_tile/host/device_memory.hpp
@@ -20,10 +20,35 @@ __global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
 }
 
 /**
- * @brief Container for storing data in GPU device memory
+ * @brief Manages device memory allocation and host-device data transfers
  *
+ * DeviceMem encapsulates GPU memory management operations using the HIP runtime API.
+ * It provides functionality for allocating device memory, transferring data between
+ * host and device, and performing basic memory operations.
+ *
+ * Key features:
+ * - Automatic memory allocation and deallocation
+ * - Host-to-device and device-to-host data transfers
+ * - Memory initialization operations
+ * - Integration with HostTensor for simplified data handling
+ *
+ * Usage example:
+ * ```
+ * // Create a host tensor and allocate device memory
+ * // of the same size in bytes
+ * HostTensor<float> host_data({256});
+ * DeviceMem d_mem(host_data.get_element_space_size_in_bytes());
+ *
+ * // Transfer data from host to device
+ * d_mem.ToDevice(host_data.data());
+ *
+ * // Retrieve data from device into another host tensor
+ * HostTensor<float> result({256});
+ * d_mem.FromDevice(result.data());
+ * ```
 */
 struct DeviceMem
+
 {
     DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {}
     DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
@@ -163,8 +188,8 @@ struct DeviceMem
         }
     }
 
-    void* mpDeviceBuf;
-    std::size_t mMemSize;
+    void* mpDeviceBuf;    ///< pointer to device buffer
+    std::size_t mMemSize; ///< size of device buffer in bytes
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/host/fill.hpp b/include/ck_tile/host/fill.hpp
index 3f64eb28cd..4a359e031f 100644
--- a/include/ck_tile/host/fill.hpp
+++ b/include/ck_tile/host/fill.hpp
@@ -17,13 +17,31 @@
 
 namespace ck_tile {
 
+/**
+ * @brief Functor for filling a range with randomly generated values from a uniform distribution.
+ *
+ * This struct provides functionality to fill iterators or ranges with random values
+ * generated from a uniform distribution. It supports both single-threaded and
+ * multi-threaded operation.
+ *
+ * @tparam T The target type for the generated values.
+ *
+ * @note The multi-threaded implementation is not guaranteed to provide perfectly
+ * distributed values across threads.
+ *
+ * Example (direct usage without creating a separate variable):
+ * @code
+ * ck_tile::FillUniformDistribution<float>{-1.f, 1.f}(a_host_tensor);
+ * @endcode
+ */
 template <typename T>
 struct FillUniformDistribution
 {
     float a_{-5.f};
     float b_{5.f};
     std::optional<uint32_t> seed_{11939};
-    // ATTENTION: threaded does not guarantee the distribution between thread
+    // ATTENTION: Whether to use multi-threading (note: not guaranteed to be perfectly distributed
+    // across threads).
     bool threaded = false;
 
     template <typename ForwardIter>
diff --git a/include/ck_tile/host/host_tensor.hpp b/include/ck_tile/host/host_tensor.hpp
index a43877c6da..deaa158d50 100644
--- a/include/ck_tile/host/host_tensor.hpp
+++ b/include/ck_tile/host/host_tensor.hpp
@@ -85,6 +85,19 @@ CK_TILE_HOST auto construct_f_unpack_args(F, T args)
     return construct_f_unpack_args_impl(args, std::make_index_sequence<std::tuple_size_v<T>>{});
 }
 
+/**
+ * @brief Descriptor for tensors in host memory.
+ *
+ * HostTensorDescriptor manages the shape (dimensions) and memory layout (strides)
+ * of a tensor in host memory. It provides functionality to:
+ * - Store tensor dimensions and strides
+ * - Calculate default strides for contiguous memory layout
+ * - Convert multi-dimensional indices to linear memory offsets
+ * - Query tensor metadata (dimensions, element counts, etc.)
+ *
+ * The class supports both automatic stride calculation for contiguous memory layout
+ * and custom strides for more complex memory patterns.
+ */
 struct HostTensorDescriptor
 {
     HostTensorDescriptor() = default;
@@ -138,12 +151,35 @@ struct HostTensorDescriptor
     }
 
     std::size_t get_num_of_dimension() const { return mLens.size(); }
+    /**
+     * @brief Calculates the total number of elements in the tensor.
+     *
+     * Computes the product of all dimension lengths to determine the
+     * total element count in the tensor.
+     *
+     * @pre The lengths array (mLens) and strides array (mStrides) must have
+     * the same size.
+     *
+     * @return The total number of elements in the tensor.
+     */
     std::size_t get_element_size() const
     {
         assert(mLens.size() == mStrides.size());
         return std::accumulate(
             mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
     }
+    /**
+     * @brief Calculates the total element space required for the tensor in memory.
+     *
+     * This method computes the minimum size of contiguous memory needed to store
+     * all elements of the tensor, taking into account the tensor's dimensions and
+     * strides. The calculation is based on the formula: 1 + sum((length_i - 1) * stride_i)
+     * taken across all dimensions.
+     *
+     * Dimensions with length 0 are skipped in this calculation.
+     *
+     * @return The size of the tensor's element space (number of elements).
+     */
     std::size_t get_element_space_size() const
     {
         std::size_t space = 1;
@@ -165,6 +201,18 @@ struct HostTensorDescriptor
 
     const std::vector<std::size_t>& get_strides() const { return mStrides; }
 
+    /**
+     * @brief Calculates the linear offset from multi-dimensional indices.
+     *
+     * Converts a set of N-dimensional indices into a single linear offset by computing
+     * the inner product of the indices with the tensor's strides.
+     *
+     * @tparam Is Parameter pack of index types (should be convertible to std::size_t)
+     * @param is Variable number of indices, one for each dimension of the tensor
+     * @return std::size_t Linear offset corresponding to the given multi-dimensional indices
+     *
+     * @pre The number of indices must match the number of dimensions in the tensor
+     */
     template <typename... Is>
     std::size_t GetOffsetFromMultiIndex(Is... is) const
     {
@@ -173,6 +221,15 @@ struct HostTensorDescriptor
         return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
     }
 
+    /**
+     * @brief Calculates the linear memory offset from a multi-dimensional index
+     *
+     * Computes the linear offset by performing an inner product between the provided
+     * multi-dimensional indices and the tensor's strides.
+     *
+     * @param iss Vector containing the multi-dimensional indices
+     * @return The calculated linear offset as a size_t
+     */
     std::size_t GetOffsetFromMultiIndex(std::vector<std::size_t> iss) const
     {
         return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
     }
@@ -194,8 +251,8 @@ struct HostTensorDescriptor
     }
 
 private:
-    std::vector<std::size_t> mLens;
-    std::vector<std::size_t> mStrides;
+    std::vector<std::size_t> mLens;    ///< Lengths of each dimension
+    std::vector<std::size_t> mStrides; ///< Strides for each dimension
 };
 
 template
@@ -681,6 +738,24 @@ struct HostTensor
     Data mData;
 };
 
+/**
+ * @brief Creates a host tensor descriptor with specified dimensions and layout
+ *
+ * Constructs a HostTensorDescriptor with appropriate strides based on whether the tensor
+ * layout is row-major or column-major. This is determined via the compile-time template
+ * parameter `is_row_major`.
+ *
+ * @tparam is_row_major Compile-time flag indicating if the layout is row-major (true) or
+ * column-major (false)
+ *
+ * @param row Number of rows in the tensor
+ * @param col Number of columns in the tensor
+ * @param stride Stride between adjacent rows (for row-major) or columns (for column-major)
+ *
+ * @return HostTensorDescriptor with shape {row, col} and strides:
+ *         - For row-major: {stride, 1}
+ *         - For column-major: {1, stride}
+ */
 template <bool is_row_major>
 auto host_tensor_descriptor(std::size_t row,
                             std::size_t col,
@@ -698,6 +773,7 @@ auto host_tensor_descriptor(std::size_t row,
         return HostTensorDescriptor({row, col}, {1_uz, stride});
     }
 }
+
 template
 auto get_default_stride(std::size_t row,
                         std::size_t col,
@@ -718,5 +794,4 @@ auto get_default_stride(std::size_t row,
     else
         return stride;
 }
-
 } // namespace ck_tile