Files
composable_kernel/include/ck_tile/host/device_prop.hpp
assistant-librarian[bot] 6181eb2adf [rocm-libraries] ROCm/rocm-libraries#4279 (commit 5b3f4b7)
[CK_TILE] Stream-K XCD remapping (#4279)

## Proposed changes

This PR adds support for XCD remapping as detailed in this
[document](https://amdcloud.sharepoint.com/:w:/r/sites/ComposableKernels/Shared%20Documents/Stream-K/Design%20Docs/XCD%20Mapping.docx?d=w2df1b0737dc54614970d99a2e26022d1&csf=1&web=1&e=mLVN4A).
On gfx942, workgroups are typically scheduled round-robin across XCDs,
which can lead to poor locality. We use a remapping that assigns
workgroups to contiguous tiles in the XCDs, improving locality and the
cache hit rate. The contiguous mapping is computed by a function from
this [PR](https://github.com/ROCm/composable_kernel/pull/3161), which we
have added to the StreamKTilePartitioner. This requires minimal changes
to the Stream-K algorithm: only a remap at the time the workgroups are
partitioned. The improved data locality raises the cache hit rate and
thereby closes performance gaps seen with the default scheduling. Unit
tests have been added to verify the function in isolation. This
optimization is not specific to Stream-K GEMM and can be applied across
GEMM kernels.

Note: This only applies to gfx942, as that is the architecture that introduces XCDs.

Please put an `x` into the boxes that apply. You can also fill these out
after creating the PR. If you're not sure, please don't hesitate to ask.

- [x] I have added tests relevant to the introduced functionality, and
the unit tests are passing locally
- [ ] I have added the test to REGRESSION_TESTS list defined at the top
of CMakeLists.txt in tests/CMakeLists.txt, **IF** the test takes more
than 30 seconds to run.
- [x] I have added inline documentation which enables the maintainers
to understand the motivation
- [ ] I have removed the stale documentation which is no longer relevant
after this pull request
- [ ] (If this change is user-facing) I have added release notes which
provide the end users with a brief summary of the improvement from this
pull request
- [x] I have run `clang-format` on all changed files
- [x] Any dependent changes have been merged

---
🔁 Imported from
[ROCm/composable_kernel#3652](https://github.com/ROCm/composable_kernel/pull/3652)
🧑‍💻 Originally authored by @arai713

---------

Co-authored-by: Astha <astha.rai713@gmail.com>
Co-authored-by: systems-assistant[bot] <systems-assistant[bot]@users.noreply.github.com>
Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
Co-authored-by: Christopher Millette <63608002+cgmillette@users.noreply.github.com>
Co-authored-by: arai713 <67439843+arai713@users.noreply.github.com>
2026-05-26 09:43:03 -07:00

128 lines
3.4 KiB
C++

// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#ifndef __HIPCC_RTC__
#include <string>
#include <string_view>
#include <hip/hip_runtime.h>
namespace ck_tile {
/// @brief Computes an FNV-1a hash of a string using the 32-bit FNV parameters.
///
/// The function is `constexpr`, so its result can be used where a constant
/// expression is required (for example as a `case` label).
///
/// @param str Characters to hash; each one is folded in as an `unsigned char`.
/// @param h   Initial hash state. Defaults to 2166136261 (the FNV offset basis);
///            passing a previous result continues hashing from that state.
/// @return The hash state after every character of @p str has been processed,
///         or @p h unchanged when @p str is empty.
constexpr unsigned int fnv1a_hash(std::string_view str, unsigned int h = 2166136261u)
{
    // Iterate rather than recurse once per character: recursion depth grew with
    // the string length, which can exceed the compiler's constexpr depth limit
    // and can exhaust the stack at runtime for long inputs.
    for(const char c : str)
    {
        // FNV-1a step: XOR the byte in first, then multiply by the FNV prime.
        h = (h ^ static_cast<unsigned char>(c)) * 16777619u;
    }
    return h;
}
inline std::string get_device_name()
{
hipDeviceProp_t props{};
int device;
auto status = hipGetDevice(&device);
if(status != hipSuccess)
{
return std::string();
}
status = hipGetDeviceProperties(&props, device);
if(status != hipSuccess)
{
return std::string();
}
const std::string raw_name(props.gcnArchName);
const auto name = raw_name.substr(0, raw_name.find(':')); // str.substr(0, npos) returns str.
switch(fnv1a_hash(name))
{
// https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
case fnv1a_hash("Ellesmere"):
case fnv1a_hash("Baffin"):
case fnv1a_hash("RacerX"):
case fnv1a_hash("Polaris10"):
case fnv1a_hash("Polaris11"):
case fnv1a_hash("Tonga"):
case fnv1a_hash("Fiji"):
case fnv1a_hash("gfx800"):
case fnv1a_hash("gfx802"):
case fnv1a_hash("gfx804"): return "gfx803";
case fnv1a_hash("Vega10"):
case fnv1a_hash("gfx901"): return "gfx900";
case fnv1a_hash("10.3.0 Sienna_Cichlid 18"): return "gfx1030";
default: return name;
}
}
inline int get_device_revision()
{
hipDeviceProp_t props{};
int device;
auto status = hipGetDevice(&device);
if(status != hipSuccess)
{
return -1; // Error: cannot get device
}
status = hipGetDeviceProperties(&props, device);
if(status != hipSuccess)
{
return -1; // Error: cannot get device properties
}
return props.asicRevision;
}
/// @brief Reports whether the current device name is one of the gfx11 targets
///        listed below (gfx1100-gfx1103, gfx1150-gfx1153).
///
/// @return true when get_device_name() equals one of the listed names;
///         false otherwise, including when the name is empty.
inline bool is_gfx11_supported()
{
    // Query the device once: every get_device_name() call re-reads the device
    // properties and builds new strings, so repeating it per comparison is
    // wasted work.
    const std::string name = get_device_name();
    return name == "gfx1100" || name == "gfx1101" || name == "gfx1102" || name == "gfx1103" ||
           name == "gfx1150" || name == "gfx1151" || name == "gfx1152" || name == "gfx1153";
}
/// @brief Reports whether the current device name is "gfx1200" or "gfx1201".
///
/// @return true when get_device_name() equals one of those two names;
///         false otherwise, including when the name is empty.
inline bool is_gfx120_supported()
{
    // Query the device once instead of once per comparison; each
    // get_device_name() call re-reads the device properties.
    const std::string name = get_device_name();
    return name == "gfx1200" || name == "gfx1201";
}
inline bool is_gfx95_supported() { return get_device_name() == "gfx950"; }
inline bool is_gfx125_supported() { return get_device_name() == "gfx1250"; }
/// @brief Reports whether either is_gfx120_supported() or is_gfx125_supported()
///        holds for the current device.
inline bool is_gfx12_supported()
{
    // The gfx120 check runs first; the gfx125 check is only evaluated when it fails.
    if(is_gfx120_supported())
    {
        return true;
    }
    return is_gfx125_supported();
}
inline size_t get_num_cus()
{
hipDeviceProp_t props{};
int device;
auto status = hipGetDevice(&device);
if(status != hipSuccess)
{
return 0;
}
status = hipGetDeviceProperties(&props, device);
if(status != hipSuccess)
{
return 0;
}
return static_cast<size_t>(props.multiProcessorCount);
}
/// @brief Returns the `hipDeviceAttributeNumberOfXccs` attribute of the current
///        HIP device.
///
/// @return 0 when `hipGetDevice` fails; the attribute value when the attribute
///         query succeeds; 1 when the attribute query fails.
inline size_t get_num_xccs()
{
    int device_id = 0;
    if(hipGetDevice(&device_id) != hipSuccess)
    {
        return 0;
    }

    int xcc_count = 1;
    const auto attr_status =
        hipDeviceGetAttribute(&xcc_count, hipDeviceAttributeNumberOfXccs, device_id);

    // A failed attribute query falls back to 1 rather than 0.
    return attr_status == hipSuccess ? static_cast<size_t>(xcc_count) : size_t{1};
}
} // namespace ck_tile
#endif