mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-01 12:11:19 +00:00
* Add trait to use a persistent kernel and split the entrypoints in grouped gemm * Some helper functions for persistent kernel case * Get max occupancy grid using device properties * Implement tile loop in main entry point to grouped gemm * Enable GridSize() on device * Handle offset tile index using real current block index * Add persistent kernel choice to grouped gemm example * Use a for-loop for iterating over the group * Reduce VGPR spills by early-exit * Enable persistent kernel choice in grouped_gemm example * Add persistent kernel option to grouped_gemm test * Fix formatting with remod.py * Remove GridUpdateBlocks as blocks are now iteratively computed * Add comment about VGPR spilling * Fix formatting * Use CK_TILE_HOST instead of __host__ * Enable all Row/Col combinations in grouped gemm unit test * Add some KBatch=2 cases to grouped gemm tests * Fix SplitK for grouped gemm * Enable pipeline hotloop/tailnumber selection in-kernel for grouped gemm * Add type traits * Split examples to regular and tileloop * Formatting * Use hipExtStreamGetCUMask to get current active CUs for the given stream * Align test and example kernel config, and disable validation for splitk repeats * Remove debug options from CMakeLists.txt * Separate the code paths for persistent/non-persistent in test * Fix formatting * Address review comments --------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
46 lines
1.0 KiB
C++
46 lines
1.0 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include <hip/hip_runtime_api.h>
|
|
|
|
#include "ck_tile/core/numeric/integer.hpp"
|
|
#include "ck_tile/host/stream_config.hpp"
|
|
#include "ck_tile/host/hip_check_error.hpp"
|
|
|
|
namespace ck_tile {
|
|
|
|
// Returns the number of bits set in the compute-unit (CU) mask that HIP reports
// for the stream held by `s`.
//
// The mask is fetched with hipExtStreamGetCUMask for `s.stream_id_`, and every set
// bit is counted as one compute unit. The status returned by the HIP call is passed
// to HIP_CHECK_ERROR.
//
// s: stream configuration; only its `stream_id_` member is read here.
// Returns the total count of set bits across the whole mask buffer.
static inline index_t get_available_compute_units(const stream_config& s)
{
    // 64 dwords * 32 bits per dword leaves room for at most 2048 CUs (assumed upper bound).
    constexpr static uint32_t kMaskDwordCount = 64;

    // Zero-initialised so that dwords the runtime does not write add nothing to the total.
    uint32_t mask_words[kMaskDwordCount]{};

    HIP_CHECK_ERROR(hipExtStreamGetCUMask(s.stream_id_, kMaskDwordCount, &mask_words[0]));

    // Number of set bits in a single dword: each iteration clears the lowest set bit,
    // so the loop body runs exactly once per set bit.
    auto popcount32 = [](uint32_t bits) {
        index_t ones = 0;
        for(; bits != 0; bits &= bits - 1)
        {
            ++ones;
        }
        return ones;
    };

    index_t total = 0;
    for(const uint32_t word : mask_words)
    {
        total += popcount32(word);
    }

    return total;
}
|
|
|
|
} // namespace ck_tile
|