mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-22 07:54:33 +00:00
[rocm-libraries] ROCm/rocm-libraries#8260 (commit 1139236)
[ck] Enforce LF-only line endings in C/C++ sources
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Summary
Several CK source files carry Windows **CRLF** line endings (a trailing
carriage return on each line), introduced by editors configured for
Windows endings or copy/paste from Windows tooling. These are purely
cosmetic but they pollute diffs (whole-file churn the first time someone
makes an LF edit), confuse `clang-format`, and are inconsistent with the
LF-only convention used across the rest of the tree.
This PR (a) normalizes every existing CRLF file (6 files) to LF and (b)
adds a pre-checkin gate so new CRLF leaks are rejected before merge.
## File extensions covered
Both the cleanup scan and the new Jenkins enforcement stage use the same
predicate as the adjacent `ASCII Only Check` stage:
```
*.h *.hpp *.cpp *.h.in *.hpp.in *.cpp.in *.inc *.cl
```
(excluding `*/build/*` and `*/include/rapidjson/*`). The local
pre-commit hook's `c++/inc` type filter covers the same set.
## Why no enforcement today
CK is opted out of the rocm-libraries root `.pre-commit-config.yaml`, so
the existing `pre-commit` workflow doesn't touch CK. The local CK
`.pre-commit-config.yaml` only runs for developers who installed hooks.
The **authoritative gate is therefore the new Jenkins stage** in this
PR; the local hook is convenience.
## Commit layout (bisect-friendly)
1. `[ck] Normalize CRLF line endings to LF in C/C++ sources`
Mechanical line-ending cleanup across 6 files. No content change: every
edit is purely CRLF -> LF, verified with `git diff --ignore-cr-at-eol`
reporting an empty diff.
2. `[ck] Enforce LF-only line endings in C/C++ sources`
- New `projects/composablekernel/script/check_no_crlf.sh` (modeled on
`check_ascii_only.sh`).
- New `crlf-checker` entry in
`projects/composablekernel/.pre-commit-config.yaml` under the
local-hooks block (`types_or: [c++, inc]`).
- New `CRLF Check` parallel stage in
`projects/composablekernel/Jenkinsfile`'s `Static checks` block,
mirroring the adjacent `ASCII Only Check` stage. Always-on, no
`RUN_CPPCHECK` gate.
The tree is buildable at every commit boundary. Commit 1 leaves 0 CRLF
violations; commit 2 wires the gate.
## Demo
Script output on a synthesized violation:
```
$ printf 'int main() {}\r\n' > /tmp/bad.cpp
$ projects/composablekernel/script/check_no_crlf.sh /tmp/bad.cpp
ERROR: /tmp/bad.cpp contains CRLF (Windows) line endings:
1:int main() {}<CR>
Fix: convert to LF, e.g. 'sed -i 's/\r$//' /tmp/bad.cpp' or 'dos2unix /tmp/bad.cpp'
$ echo $?
1
```
Full repo scan after the cleanup commit:
```
$ cd projects/composablekernel && find . -type f \( -name '*.h' -o -name '*.hpp' -o -name '*.cpp' \
-o -name '*.h.in' -o -name '*.hpp.in' -o -name '*.cpp.in' -o -name '*.inc' -o -name '*.cl' \) \
-not -path '*/build/*' -not -path '*/include/rapidjson/*' -print0 \
| xargs -0 -P 8 -n 64 script/check_no_crlf.sh
$ echo $?
0
```
## Test plan
- [ ] Jenkins PR build: confirm new `Static checks -> CRLF Check` stage
runs green over the full predicate and the existing `ASCII Only Check` /
`Clang Format` stages are unaffected.
- [ ] Local: `pre-commit run crlf-checker --all-files` runs cleanly
after installing CK pre-commit hooks.
- [ ] Manually inject a CRLF line ending in any `.cpp/.hpp/.inc` file,
push: confirm Jenkins fails the new stage with a clear error.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
committed by
assistant-librarian[bot]
parent
96a7e44832
commit
329e589840
@@ -31,6 +31,11 @@ repos:
|
||||
entry: projects/composablekernel/script/check_ascii_only.sh
|
||||
language: script
|
||||
types_or: [c++, inc]
|
||||
- id: crlf-checker
|
||||
name: Check for CRLF line endings in C/C++ sources
|
||||
entry: projects/composablekernel/script/check_no_crlf.sh
|
||||
language: script
|
||||
types_or: [c++, inc]
|
||||
- id: remove-exec-bit
|
||||
name: Remove executable bit from non-executable files
|
||||
entry: projects/composablekernel/script/remove_exec_bit.sh
|
||||
|
||||
18
Jenkinsfile
vendored
18
Jenkinsfile
vendored
@@ -348,6 +348,24 @@ pipeline {
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
stage('CRLF Check') {
|
||||
agent{ label rocmnode("nogpu") }
|
||||
environment{
|
||||
setup_args = "NO_CK_BUILD"
|
||||
execute_cmd = """cd .. && \
|
||||
find . -type f \\( -name '*.h' -o -name '*.hpp' -o -name '*.cpp' -o -name '*.h.in' -o -name '*.hpp.in' -o -name '*.cpp.in' -o -name '*.inc' -o -name '*.cl' \\) \
|
||||
-not -path '*/build/*' -not -path '*/include/rapidjson/*' \
|
||||
-print0 | xargs -0 -P 8 -n 64 script/check_no_crlf.sh"""
|
||||
}
|
||||
steps{
|
||||
deleteDir()
|
||||
script {
|
||||
loadCk();
|
||||
ck.buildAndTest(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd)
|
||||
}
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
stage("Run Downstream Tests")
|
||||
|
||||
@@ -1,33 +1,33 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/utility/amd_buffer_coherence.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <AmdBufferCoherenceEnum Coherence_ = AmdBufferCoherenceEnum::DefaultCoherence>
|
||||
struct GlobalPrefetchDataOp
|
||||
{
|
||||
// addr needs to point to global memory!
|
||||
__device__ __forceinline__ void operator()([[maybe_unused]] const void* addr) const
|
||||
{
|
||||
#if defined(__gfx125__)
|
||||
__builtin_amdgcn_global_prefetch(addr, static_cast<index_t>(Coherence_));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <AmdBufferCoherenceEnum Coherence_ = AmdBufferCoherenceEnum::DefaultCoherence>
|
||||
struct FlatPrefetchDataOp
|
||||
{
|
||||
__device__ __forceinline__ void operator()([[maybe_unused]] const void* addr) const
|
||||
{
|
||||
#if defined(__gfx125__)
|
||||
__builtin_amdgcn_flat_prefetch(addr, static_cast<index_t>(Coherence_));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/utility/amd_buffer_coherence.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <AmdBufferCoherenceEnum Coherence_ = AmdBufferCoherenceEnum::DefaultCoherence>
|
||||
struct GlobalPrefetchDataOp
|
||||
{
|
||||
// addr needs to point to global memory!
|
||||
__device__ __forceinline__ void operator()([[maybe_unused]] const void* addr) const
|
||||
{
|
||||
#if defined(__gfx125__)
|
||||
__builtin_amdgcn_global_prefetch(addr, static_cast<index_t>(Coherence_));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <AmdBufferCoherenceEnum Coherence_ = AmdBufferCoherenceEnum::DefaultCoherence>
|
||||
struct FlatPrefetchDataOp
|
||||
{
|
||||
__device__ __forceinline__ void operator()([[maybe_unused]] const void* addr) const
|
||||
{
|
||||
#if defined(__gfx125__)
|
||||
__builtin_amdgcn_flat_prefetch(addr, static_cast<index_t>(Coherence_));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
|
||||
@@ -1,80 +1,80 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using BF16 = ck::bhalf_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
|
||||
static constexpr auto BlkGemmPipeSched = ck::BlockGemmPipelineScheduler::Intrawave;
|
||||
static constexpr auto BlkGemmPipeVer = ck::BlockGemmPipelineVersion::v3;
|
||||
|
||||
// A[m, k] * B[n, k] = C[m, n] with data cache prefetch support
|
||||
template <bool UseDataCachePrefetch>
|
||||
using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances = std::tuple<
|
||||
// clang-format off
|
||||
//#########################|ALayout|BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| Block| MPer| NPer| KPer | AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| | | Compute | Compute | Permute | Minimum | Use |
|
||||
//#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| PipeScheduler| PipelineVer| TypeA | TypeB | A/B | Occupancy| DataCachePrefetch |
|
||||
//#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | | | | | |
|
||||
//#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
// 128x128x64
|
||||
DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 8, 16, 16, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>,
|
||||
// 256x128x64
|
||||
DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 64, 8, 8, 16, 16, 8, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>,
|
||||
// 128x256x64
|
||||
DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 64, 8, 8, 16, 16, 4, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>,
|
||||
// 256x256x64
|
||||
DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 256, 64, 8, 8, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_prefetch_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
|
||||
instances)
|
||||
{
|
||||
if(ck::is_gfx125_supported())
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances<true>{});
|
||||
}
|
||||
}
|
||||
|
||||
void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_no_prefetch_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
|
||||
instances)
|
||||
{
|
||||
if(ck::is_gfx125_supported())
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances<false>{});
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using BF16 = ck::bhalf_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
|
||||
static constexpr auto BlkGemmPipeSched = ck::BlockGemmPipelineScheduler::Intrawave;
|
||||
static constexpr auto BlkGemmPipeVer = ck::BlockGemmPipelineVersion::v3;
|
||||
|
||||
// A[m, k] * B[n, k] = C[m, n] with data cache prefetch support
|
||||
template <bool UseDataCachePrefetch>
|
||||
using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances = std::tuple<
|
||||
// clang-format off
|
||||
//#########################|ALayout|BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| Block| MPer| NPer| KPer | AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| | | Compute | Compute | Permute | Minimum | Use |
|
||||
//#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| PipeScheduler| PipelineVer| TypeA | TypeB | A/B | Occupancy| DataCachePrefetch |
|
||||
//#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | | | | | |
|
||||
//#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
// 128x128x64
|
||||
DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 8, 16, 16, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>,
|
||||
// 256x128x64
|
||||
DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 64, 8, 8, 16, 16, 8, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>,
|
||||
// 128x256x64
|
||||
DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 64, 8, 8, 16, 16, 4, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>,
|
||||
// 256x256x64
|
||||
DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 256, 64, 8, 8, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_prefetch_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
|
||||
instances)
|
||||
{
|
||||
if(ck::is_gfx125_supported())
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances<true>{});
|
||||
}
|
||||
}
|
||||
|
||||
void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_no_prefetch_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
|
||||
instances)
|
||||
{
|
||||
if(ck::is_gfx125_supported())
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances<false>{});
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
|
||||
23
script/check_no_crlf.sh
Executable file
23
script/check_no_crlf.sh
Executable file
@@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env bash
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# Rejects Windows CRLF line endings (a trailing carriage return) in the
|
||||
# files passed as arguments. Used both by the local pre-commit hook and
|
||||
# by the Jenkinsfile "CRLF Check" static-check stage.
|
||||
#
|
||||
# Usage: ./check_no_crlf.sh <file1> <file2> ...
|
||||
|
||||
exit_code=0
|
||||
|
||||
for file in "$@"; do
|
||||
[[ -f "$file" ]] || continue
|
||||
if LC_ALL=C grep -qP '\r$' "$file" 2>/dev/null; then
|
||||
echo "ERROR: $file contains CRLF (Windows) line endings:"
|
||||
LC_ALL=C grep -nP '\r$' "$file" | head -20 | sed 's/\r$/<CR>/'
|
||||
echo " Fix: convert to LF, e.g. 'sed -i 's/\\r\$//' $file' or 'dos2unix $file'"
|
||||
exit_code=1
|
||||
fi
|
||||
done
|
||||
|
||||
exit $exit_code
|
||||
@@ -1,31 +1,31 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "test_gemm_quant_common.hpp"
|
||||
|
||||
using GroupSize2D128N = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
|
||||
|
||||
// Type combinations for ABQuant tests
|
||||
// Tuple format: <ALayout, BLayout, CLayout, AQLayout, ADataType, BDataType, QDataType, CDataType,
|
||||
// QuantType, GemmConfig, AQuantGroupSize, BQuantGroupSize, BQLayout>
|
||||
// clang-format off
|
||||
using ABQuantPreshuffleQuantTypes = ::testing::Types<
|
||||
std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, FP8, FP8, float, Half, ABQuantGrouped, GemmConfigPreshuffleBPreshuffleQuantPrefill<false>, GroupSize1D_128, GroupSize1D_128, ColumnMajor>,
|
||||
std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, FP8, FP8, float, Half, ABQuantGrouped, GemmConfigPreshuffleBPreshuffleQuantPrefill<true>, GroupSize1D_128, GroupSize2D128N, ColumnMajor>
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
// Test suite for ABQuant
|
||||
TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPreshuffleQuantTypes);
|
||||
|
||||
// AQuant tests
|
||||
TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest)
|
||||
{
|
||||
using BQuantGroupSize = std::tuple_element_t<11, TypeParam>;
|
||||
if(ck_tile::is_gfx120_supported() && std::is_same_v<BQuantGroupSize, GroupSize2D128N>)
|
||||
{
|
||||
GTEST_SKIP() << "temp disable due to random fail on gfx120.";
|
||||
}
|
||||
|
||||
this->run_test_with_validation(1024, 1024, 1024);
|
||||
}
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "test_gemm_quant_common.hpp"
|
||||
|
||||
using GroupSize2D128N = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
|
||||
|
||||
// Type combinations for ABQuant tests
|
||||
// Tuple format: <ALayout, BLayout, CLayout, AQLayout, ADataType, BDataType, QDataType, CDataType,
|
||||
// QuantType, GemmConfig, AQuantGroupSize, BQuantGroupSize, BQLayout>
|
||||
// clang-format off
|
||||
using ABQuantPreshuffleQuantTypes = ::testing::Types<
|
||||
std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, FP8, FP8, float, Half, ABQuantGrouped, GemmConfigPreshuffleBPreshuffleQuantPrefill<false>, GroupSize1D_128, GroupSize1D_128, ColumnMajor>,
|
||||
std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, FP8, FP8, float, Half, ABQuantGrouped, GemmConfigPreshuffleBPreshuffleQuantPrefill<true>, GroupSize1D_128, GroupSize2D128N, ColumnMajor>
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
// Test suite for ABQuant
|
||||
TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPreshuffleQuantTypes);
|
||||
|
||||
// AQuant tests
|
||||
TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest)
|
||||
{
|
||||
using BQuantGroupSize = std::tuple_element_t<11, TypeParam>;
|
||||
if(ck_tile::is_gfx120_supported() && std::is_same_v<BQuantGroupSize, GroupSize2D128N>)
|
||||
{
|
||||
GTEST_SKIP() << "temp disable due to random fail on gfx120.";
|
||||
}
|
||||
|
||||
this->run_test_with_validation(1024, 1024, 1024);
|
||||
}
|
||||
|
||||
@@ -1,74 +1,74 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
|
||||
#include "prefetch_op_util.hpp"
|
||||
|
||||
template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS, bool IS_L1_PREFETCH>
|
||||
bool run_test(bool time_kernels)
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
#if defined(__gfx125__)
|
||||
const auto coherence =
|
||||
IS_L1_PREFETCH ? ck::AmdBufferCoherenceEnum::CU_RT : ck::AmdBufferCoherenceEnum::SE_RT;
|
||||
using global_prefetch_op = ck::GlobalPrefetchDataOp<coherence>;
|
||||
using flat_prefetch_op = ck::FlatPrefetchDataOp<coherence>;
|
||||
#else
|
||||
using global_prefetch_op = ck::GlobalPrefetchDataOp<>;
|
||||
using flat_prefetch_op = ck::FlatPrefetchDataOp<>;
|
||||
#endif
|
||||
|
||||
const auto global_prefetch_kernel =
|
||||
ck::prefetch_op_util::kernel_with_prefetch<T, NUM_THREADS, NUM_SCALARS, global_prefetch_op>;
|
||||
const auto flat_prefetch_kernel = ck::prefetch_op_util::
|
||||
kernel_with_prefetch_and_shared_mem<T, NUM_THREADS, NUM_SCALARS, flat_prefetch_op>;
|
||||
|
||||
const auto prefetch_kernel_container =
|
||||
std::make_tuple(global_prefetch_kernel, flat_prefetch_kernel);
|
||||
|
||||
ck::static_for<0, 2, 1>{}([&](auto i) {
|
||||
std::string kernel_name = (i == 1 ? "flat_prefetch" : "global_prefetch");
|
||||
|
||||
auto kernel = std::get<ck::Number<i>{}>(prefetch_kernel_container);
|
||||
|
||||
pass &=
|
||||
ck::prefetch_op_util::test_prefetch_impl<decltype(kernel), T, NUM_THREADS, NUM_SCALARS>(
|
||||
time_kernels, kernel, kernel_name);
|
||||
});
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
if(!ck::is_gfx125_supported())
|
||||
{
|
||||
std::cout << "This feature is not supported by current HW, skipping tests." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool time_kernels = false;
|
||||
|
||||
if(argc == 2)
|
||||
{
|
||||
time_kernels = std::stoi(argv[1]);
|
||||
}
|
||||
|
||||
bool pass = true;
|
||||
|
||||
std::cout << "=== Testing L2 Global Cache Prefetch ===" << std::endl;
|
||||
|
||||
pass &= run_test<float, 4096, 1024, false>(time_kernels);
|
||||
pass &= run_test<double, 4096, 512, false>(time_kernels);
|
||||
|
||||
std::cout << "=== Testing L1 Global Cache Prefetch ===" << std::endl;
|
||||
|
||||
pass &= run_test<float, 4096, 1024, true>(time_kernels);
|
||||
pass &= run_test<double, 4096, 512, true>(time_kernels);
|
||||
|
||||
std::cout << "TestGlobalPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
|
||||
return pass ? 0 : 1;
|
||||
}
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
|
||||
#include "prefetch_op_util.hpp"
|
||||
|
||||
template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS, bool IS_L1_PREFETCH>
|
||||
bool run_test(bool time_kernels)
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
#if defined(__gfx125__)
|
||||
const auto coherence =
|
||||
IS_L1_PREFETCH ? ck::AmdBufferCoherenceEnum::CU_RT : ck::AmdBufferCoherenceEnum::SE_RT;
|
||||
using global_prefetch_op = ck::GlobalPrefetchDataOp<coherence>;
|
||||
using flat_prefetch_op = ck::FlatPrefetchDataOp<coherence>;
|
||||
#else
|
||||
using global_prefetch_op = ck::GlobalPrefetchDataOp<>;
|
||||
using flat_prefetch_op = ck::FlatPrefetchDataOp<>;
|
||||
#endif
|
||||
|
||||
const auto global_prefetch_kernel =
|
||||
ck::prefetch_op_util::kernel_with_prefetch<T, NUM_THREADS, NUM_SCALARS, global_prefetch_op>;
|
||||
const auto flat_prefetch_kernel = ck::prefetch_op_util::
|
||||
kernel_with_prefetch_and_shared_mem<T, NUM_THREADS, NUM_SCALARS, flat_prefetch_op>;
|
||||
|
||||
const auto prefetch_kernel_container =
|
||||
std::make_tuple(global_prefetch_kernel, flat_prefetch_kernel);
|
||||
|
||||
ck::static_for<0, 2, 1>{}([&](auto i) {
|
||||
std::string kernel_name = (i == 1 ? "flat_prefetch" : "global_prefetch");
|
||||
|
||||
auto kernel = std::get<ck::Number<i>{}>(prefetch_kernel_container);
|
||||
|
||||
pass &=
|
||||
ck::prefetch_op_util::test_prefetch_impl<decltype(kernel), T, NUM_THREADS, NUM_SCALARS>(
|
||||
time_kernels, kernel, kernel_name);
|
||||
});
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
if(!ck::is_gfx125_supported())
|
||||
{
|
||||
std::cout << "This feature is not supported by current HW, skipping tests." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool time_kernels = false;
|
||||
|
||||
if(argc == 2)
|
||||
{
|
||||
time_kernels = std::stoi(argv[1]);
|
||||
}
|
||||
|
||||
bool pass = true;
|
||||
|
||||
std::cout << "=== Testing L2 Global Cache Prefetch ===" << std::endl;
|
||||
|
||||
pass &= run_test<float, 4096, 1024, false>(time_kernels);
|
||||
pass &= run_test<double, 4096, 512, false>(time_kernels);
|
||||
|
||||
std::cout << "=== Testing L1 Global Cache Prefetch ===" << std::endl;
|
||||
|
||||
pass &= run_test<float, 4096, 1024, true>(time_kernels);
|
||||
pass &= run_test<double, 4096, 512, true>(time_kernels);
|
||||
|
||||
std::cout << "TestGlobalPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
|
||||
return pass ? 0 : 1;
|
||||
}
|
||||
|
||||
@@ -1,276 +1,276 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/host_utility/hip_check_error.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/host_utility/flush_cache.hpp"
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include "ck/utility/data_cache_prefetch.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace prefetch_op_util {
|
||||
|
||||
template <typename T>
|
||||
struct KernelArgs
|
||||
{
|
||||
const T* p_a_grid;
|
||||
T* dst;
|
||||
const T* p_b_grid;
|
||||
bool enable_prefetch;
|
||||
};
|
||||
|
||||
template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS, typename PrefetchOp>
|
||||
__global__ void kernel_with_prefetch(KernelArgs<T> args)
|
||||
{
|
||||
const T* src = args.p_a_grid;
|
||||
T* dst = args.dst;
|
||||
const T* scalar_data = args.p_b_grid;
|
||||
bool enable_prefetch = args.enable_prefetch;
|
||||
|
||||
uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
// Calculate number of 32B cachelines needed to cover num_scalars elements
|
||||
constexpr index_t cachelineSize = 32;
|
||||
constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T);
|
||||
constexpr unsigned int cachelinesNeeded =
|
||||
(NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize;
|
||||
|
||||
const char* byte_addr = reinterpret_cast<const char*>(scalar_data);
|
||||
|
||||
// Prefetch all scalar data at once
|
||||
if(tid < cachelinesNeeded)
|
||||
{
|
||||
if(enable_prefetch)
|
||||
{
|
||||
// Prefetch the cacheline
|
||||
PrefetchOp{}(byte_addr + tid * cachelineSize);
|
||||
}
|
||||
}
|
||||
|
||||
T sum = 0;
|
||||
if(tid < NUM_THREADS)
|
||||
{
|
||||
sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to
|
||||
// finish
|
||||
}
|
||||
__syncthreads(); // waits on loads from global mem
|
||||
if(tid < NUM_THREADS)
|
||||
{
|
||||
// Access prefetched scalar data
|
||||
for(uint32_t i = 0; i < NUM_SCALARS; i++)
|
||||
{
|
||||
sum += scalar_data[i]; // should be fast due to scalars being preloaded
|
||||
}
|
||||
|
||||
dst[tid] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS, typename PrefetchOp>
|
||||
__global__ void kernel_with_prefetch_and_shared_mem(KernelArgs<T> args)
|
||||
{
|
||||
const T* src = args.p_a_grid;
|
||||
T* dst = args.dst;
|
||||
const T* scalar_data = args.p_b_grid;
|
||||
bool enable_prefetch = args.enable_prefetch;
|
||||
|
||||
__shared__ T sharedMem[32];
|
||||
|
||||
uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
// Calculate number of 32B cachelines needed to cover num_scalars elements
|
||||
constexpr index_t cachelineSize = 32;
|
||||
constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T);
|
||||
constexpr unsigned int cachelinesNeeded =
|
||||
(NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize;
|
||||
|
||||
bool use_shared_mem = tid % 2 == 1;
|
||||
|
||||
const void* byte_addr;
|
||||
if(use_shared_mem)
|
||||
{
|
||||
byte_addr = reinterpret_cast<const void*>(sharedMem);
|
||||
}
|
||||
else
|
||||
{
|
||||
uintptr_t base = reinterpret_cast<uintptr_t>(scalar_data);
|
||||
uintptr_t offset = base + (tid / 2) * cachelineSize;
|
||||
byte_addr = reinterpret_cast<const void*>(offset);
|
||||
}
|
||||
|
||||
// Prefetch all scalar data at once
|
||||
if(tid < cachelinesNeeded * 2)
|
||||
{
|
||||
if(enable_prefetch)
|
||||
{
|
||||
// Prefetch the cacheline
|
||||
PrefetchOp{}(byte_addr);
|
||||
}
|
||||
else
|
||||
{
|
||||
(void)byte_addr;
|
||||
}
|
||||
}
|
||||
|
||||
T sum = 0;
|
||||
if(tid < NUM_THREADS)
|
||||
{
|
||||
sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to
|
||||
// finish
|
||||
}
|
||||
__syncthreads(); // waits on loads from global mem
|
||||
if(tid < NUM_THREADS)
|
||||
{
|
||||
// Access prefetched scalar data
|
||||
for(uint32_t i = 0; i < NUM_SCALARS; i++)
|
||||
{
|
||||
sum += scalar_data[i]; // should be fast due to scalars being preloaded
|
||||
}
|
||||
|
||||
dst[tid] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename PrefetchKernel, typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS>
|
||||
bool test_prefetch_impl(bool time_kernels,
|
||||
const PrefetchKernel& prefetch_kernel,
|
||||
const std::string& kernel_name)
|
||||
{
|
||||
constexpr index_t block_size = 256;
|
||||
constexpr index_t num_elements = NUM_THREADS;
|
||||
constexpr index_t num_scalars = NUM_SCALARS;
|
||||
|
||||
// TODO: maybe add more prefetch instructions inside kernel to support more values
|
||||
assert(NUM_SCALARS / sizeof(T) < (32 * block_size) &&
|
||||
"Too many scalars to prefetch with current implementation!");
|
||||
|
||||
constexpr index_t grid_size = (num_elements + block_size - 1) / block_size;
|
||||
|
||||
std::cout << "Testing " << kernel_name << " for type: " << typeid(T).name() << std::endl;
|
||||
std::cout << "Elements: " << num_elements << ", Scalars: " << num_scalars << std::endl;
|
||||
|
||||
// Host data
|
||||
std::vector<T> h_src(num_elements);
|
||||
std::vector<T> h_scalar(num_scalars);
|
||||
std::vector<T> h_dst_with_prefetch_chunks(num_elements);
|
||||
std::vector<T> h_expected(num_elements);
|
||||
|
||||
// Initialize data
|
||||
for(index_t i = 0; i < num_elements; i++)
|
||||
{
|
||||
h_src[i] = static_cast<T>(i % 100);
|
||||
}
|
||||
|
||||
T scalar_sum = 0;
|
||||
for(index_t i = 0; i < num_scalars; i++)
|
||||
{
|
||||
h_scalar[i] = static_cast<T>(i + 1);
|
||||
scalar_sum += h_scalar[i];
|
||||
}
|
||||
|
||||
// Expected results
|
||||
for(index_t i = 0; i < num_elements; i++)
|
||||
{
|
||||
h_expected[i] = h_src[i] + scalar_sum;
|
||||
}
|
||||
|
||||
// Device memory
|
||||
DeviceMem d_src(sizeof(T) * num_elements);
|
||||
DeviceMem d_scalar(sizeof(T) * num_scalars);
|
||||
DeviceMem d_dst_with_prefetch_chunks(sizeof(T) * num_elements);
|
||||
|
||||
d_src.ToDevice(h_src.data());
|
||||
d_scalar.ToDevice(h_scalar.data());
|
||||
|
||||
KernelArgs<T> args{static_cast<const T*>(d_src.GetDeviceBuffer()),
|
||||
static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
|
||||
static_cast<const T*>(d_scalar.GetDeviceBuffer()),
|
||||
true};
|
||||
if(time_kernels)
|
||||
{
|
||||
std::array<float, 2> avg_times_us;
|
||||
ck::static_for<0, 2, 1>{}([&](auto static_i) {
|
||||
constexpr bool prefetch_enabled = static_i == 0;
|
||||
std::cout << "PREFETCH " << (prefetch_enabled ? "ENABLED!" : "DISABLED!") << std::endl;
|
||||
|
||||
args.enable_prefetch = prefetch_enabled;
|
||||
|
||||
constexpr int num_warmup = 1;
|
||||
constexpr int num_iterations = 10;
|
||||
constexpr int rotating_count = num_iterations;
|
||||
auto size_a_buffer = d_src.GetBufferSize();
|
||||
auto size_b_buffer = d_scalar.GetBufferSize();
|
||||
|
||||
ck::utility::RotatingMemWrapper<KernelArgs<T>> rotating_mem(
|
||||
args, rotating_count, size_a_buffer, size_b_buffer);
|
||||
rotating_mem.Print();
|
||||
|
||||
auto run_flush_cache = [&]() {
|
||||
// flush icache
|
||||
ck::utility::flush_icache();
|
||||
// rotating mem
|
||||
rotating_mem.Next();
|
||||
};
|
||||
float avg_time_ms = ck::utility::launch_and_time_kernel_with_preprocess<false>(
|
||||
StreamConfig{nullptr, true, 0, num_warmup, num_iterations, true, rotating_count},
|
||||
run_flush_cache,
|
||||
prefetch_kernel,
|
||||
dim3(grid_size),
|
||||
dim3(block_size),
|
||||
0,
|
||||
args);
|
||||
|
||||
float avg_time_us = avg_time_ms * 1000.0f;
|
||||
float total_bytes = (size_a_buffer + size_b_buffer); // read
|
||||
float bandwidth_gb_s = (total_bytes / (avg_time_us * 1e-6)) / 1e9;
|
||||
float ops_per_iteration = num_elements * num_scalars; // adds
|
||||
float gflops = (ops_per_iteration / (avg_time_us * 1e-6)) / 1e9;
|
||||
|
||||
std::cout << " Performance: " << std::endl;
|
||||
std::cout << " Average kernel time: " << avg_time_us << " us" << std::endl;
|
||||
std::cout << " Effective bandwidth: " << bandwidth_gb_s << " GB/s" << std::endl;
|
||||
std::cout << " Compute throughput: " << gflops << " GFLOPS" << std::endl;
|
||||
|
||||
avg_times_us[static_i] = avg_time_us;
|
||||
});
|
||||
|
||||
float speedup = avg_times_us[1] / avg_times_us[0];
|
||||
|
||||
std::cout << "On average kernel with prefetch is " << speedup
|
||||
<< " times faster than without prefetch." << std::endl;
|
||||
|
||||
if(speedup < 1.0f)
|
||||
std::cout << "WARNING: prefetch kernel is slower!" << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
launch_and_time_kernel(StreamConfig{nullptr, false},
|
||||
prefetch_kernel,
|
||||
dim3(grid_size),
|
||||
dim3(block_size),
|
||||
0, // lds_byte
|
||||
args);
|
||||
}
|
||||
|
||||
// Copy results back
|
||||
d_dst_with_prefetch_chunks.FromDevice(h_dst_with_prefetch_chunks.data());
|
||||
|
||||
// Verify results
|
||||
bool pass = ck::utils::check_err(h_dst_with_prefetch_chunks, h_expected);
|
||||
|
||||
std::cout << " Correctness: " << (pass ? "PASS" : "FAIL") << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
} // namespace prefetch_op_util
|
||||
} // namespace ck
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/host_utility/hip_check_error.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/host_utility/flush_cache.hpp"
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include "ck/utility/data_cache_prefetch.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace prefetch_op_util {
|
||||
|
||||
template <typename T>
|
||||
struct KernelArgs
|
||||
{
|
||||
const T* p_a_grid;
|
||||
T* dst;
|
||||
const T* p_b_grid;
|
||||
bool enable_prefetch;
|
||||
};
|
||||
|
||||
template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS, typename PrefetchOp>
|
||||
__global__ void kernel_with_prefetch(KernelArgs<T> args)
|
||||
{
|
||||
const T* src = args.p_a_grid;
|
||||
T* dst = args.dst;
|
||||
const T* scalar_data = args.p_b_grid;
|
||||
bool enable_prefetch = args.enable_prefetch;
|
||||
|
||||
uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
// Calculate number of 32B cachelines needed to cover num_scalars elements
|
||||
constexpr index_t cachelineSize = 32;
|
||||
constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T);
|
||||
constexpr unsigned int cachelinesNeeded =
|
||||
(NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize;
|
||||
|
||||
const char* byte_addr = reinterpret_cast<const char*>(scalar_data);
|
||||
|
||||
// Prefetch all scalar data at once
|
||||
if(tid < cachelinesNeeded)
|
||||
{
|
||||
if(enable_prefetch)
|
||||
{
|
||||
// Prefetch the cacheline
|
||||
PrefetchOp{}(byte_addr + tid * cachelineSize);
|
||||
}
|
||||
}
|
||||
|
||||
T sum = 0;
|
||||
if(tid < NUM_THREADS)
|
||||
{
|
||||
sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to
|
||||
// finish
|
||||
}
|
||||
__syncthreads(); // waits on loads from global mem
|
||||
if(tid < NUM_THREADS)
|
||||
{
|
||||
// Access prefetched scalar data
|
||||
for(uint32_t i = 0; i < NUM_SCALARS; i++)
|
||||
{
|
||||
sum += scalar_data[i]; // should be fast due to scalars being preloaded
|
||||
}
|
||||
|
||||
dst[tid] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS, typename PrefetchOp>
|
||||
__global__ void kernel_with_prefetch_and_shared_mem(KernelArgs<T> args)
|
||||
{
|
||||
const T* src = args.p_a_grid;
|
||||
T* dst = args.dst;
|
||||
const T* scalar_data = args.p_b_grid;
|
||||
bool enable_prefetch = args.enable_prefetch;
|
||||
|
||||
__shared__ T sharedMem[32];
|
||||
|
||||
uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
// Calculate number of 32B cachelines needed to cover num_scalars elements
|
||||
constexpr index_t cachelineSize = 32;
|
||||
constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T);
|
||||
constexpr unsigned int cachelinesNeeded =
|
||||
(NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize;
|
||||
|
||||
bool use_shared_mem = tid % 2 == 1;
|
||||
|
||||
const void* byte_addr;
|
||||
if(use_shared_mem)
|
||||
{
|
||||
byte_addr = reinterpret_cast<const void*>(sharedMem);
|
||||
}
|
||||
else
|
||||
{
|
||||
uintptr_t base = reinterpret_cast<uintptr_t>(scalar_data);
|
||||
uintptr_t offset = base + (tid / 2) * cachelineSize;
|
||||
byte_addr = reinterpret_cast<const void*>(offset);
|
||||
}
|
||||
|
||||
// Prefetch all scalar data at once
|
||||
if(tid < cachelinesNeeded * 2)
|
||||
{
|
||||
if(enable_prefetch)
|
||||
{
|
||||
// Prefetch the cacheline
|
||||
PrefetchOp{}(byte_addr);
|
||||
}
|
||||
else
|
||||
{
|
||||
(void)byte_addr;
|
||||
}
|
||||
}
|
||||
|
||||
T sum = 0;
|
||||
if(tid < NUM_THREADS)
|
||||
{
|
||||
sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to
|
||||
// finish
|
||||
}
|
||||
__syncthreads(); // waits on loads from global mem
|
||||
if(tid < NUM_THREADS)
|
||||
{
|
||||
// Access prefetched scalar data
|
||||
for(uint32_t i = 0; i < NUM_SCALARS; i++)
|
||||
{
|
||||
sum += scalar_data[i]; // should be fast due to scalars being preloaded
|
||||
}
|
||||
|
||||
dst[tid] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename PrefetchKernel, typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS>
|
||||
bool test_prefetch_impl(bool time_kernels,
|
||||
const PrefetchKernel& prefetch_kernel,
|
||||
const std::string& kernel_name)
|
||||
{
|
||||
constexpr index_t block_size = 256;
|
||||
constexpr index_t num_elements = NUM_THREADS;
|
||||
constexpr index_t num_scalars = NUM_SCALARS;
|
||||
|
||||
// TODO: maybe add more prefetch instructions inside kernel to support more values
|
||||
assert(NUM_SCALARS / sizeof(T) < (32 * block_size) &&
|
||||
"Too many scalars to prefetch with current implementation!");
|
||||
|
||||
constexpr index_t grid_size = (num_elements + block_size - 1) / block_size;
|
||||
|
||||
std::cout << "Testing " << kernel_name << " for type: " << typeid(T).name() << std::endl;
|
||||
std::cout << "Elements: " << num_elements << ", Scalars: " << num_scalars << std::endl;
|
||||
|
||||
// Host data
|
||||
std::vector<T> h_src(num_elements);
|
||||
std::vector<T> h_scalar(num_scalars);
|
||||
std::vector<T> h_dst_with_prefetch_chunks(num_elements);
|
||||
std::vector<T> h_expected(num_elements);
|
||||
|
||||
// Initialize data
|
||||
for(index_t i = 0; i < num_elements; i++)
|
||||
{
|
||||
h_src[i] = static_cast<T>(i % 100);
|
||||
}
|
||||
|
||||
T scalar_sum = 0;
|
||||
for(index_t i = 0; i < num_scalars; i++)
|
||||
{
|
||||
h_scalar[i] = static_cast<T>(i + 1);
|
||||
scalar_sum += h_scalar[i];
|
||||
}
|
||||
|
||||
// Expected results
|
||||
for(index_t i = 0; i < num_elements; i++)
|
||||
{
|
||||
h_expected[i] = h_src[i] + scalar_sum;
|
||||
}
|
||||
|
||||
// Device memory
|
||||
DeviceMem d_src(sizeof(T) * num_elements);
|
||||
DeviceMem d_scalar(sizeof(T) * num_scalars);
|
||||
DeviceMem d_dst_with_prefetch_chunks(sizeof(T) * num_elements);
|
||||
|
||||
d_src.ToDevice(h_src.data());
|
||||
d_scalar.ToDevice(h_scalar.data());
|
||||
|
||||
KernelArgs<T> args{static_cast<const T*>(d_src.GetDeviceBuffer()),
|
||||
static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
|
||||
static_cast<const T*>(d_scalar.GetDeviceBuffer()),
|
||||
true};
|
||||
if(time_kernels)
|
||||
{
|
||||
std::array<float, 2> avg_times_us;
|
||||
ck::static_for<0, 2, 1>{}([&](auto static_i) {
|
||||
constexpr bool prefetch_enabled = static_i == 0;
|
||||
std::cout << "PREFETCH " << (prefetch_enabled ? "ENABLED!" : "DISABLED!") << std::endl;
|
||||
|
||||
args.enable_prefetch = prefetch_enabled;
|
||||
|
||||
constexpr int num_warmup = 1;
|
||||
constexpr int num_iterations = 10;
|
||||
constexpr int rotating_count = num_iterations;
|
||||
auto size_a_buffer = d_src.GetBufferSize();
|
||||
auto size_b_buffer = d_scalar.GetBufferSize();
|
||||
|
||||
ck::utility::RotatingMemWrapper<KernelArgs<T>> rotating_mem(
|
||||
args, rotating_count, size_a_buffer, size_b_buffer);
|
||||
rotating_mem.Print();
|
||||
|
||||
auto run_flush_cache = [&]() {
|
||||
// flush icache
|
||||
ck::utility::flush_icache();
|
||||
// rotating mem
|
||||
rotating_mem.Next();
|
||||
};
|
||||
float avg_time_ms = ck::utility::launch_and_time_kernel_with_preprocess<false>(
|
||||
StreamConfig{nullptr, true, 0, num_warmup, num_iterations, true, rotating_count},
|
||||
run_flush_cache,
|
||||
prefetch_kernel,
|
||||
dim3(grid_size),
|
||||
dim3(block_size),
|
||||
0,
|
||||
args);
|
||||
|
||||
float avg_time_us = avg_time_ms * 1000.0f;
|
||||
float total_bytes = (size_a_buffer + size_b_buffer); // read
|
||||
float bandwidth_gb_s = (total_bytes / (avg_time_us * 1e-6)) / 1e9;
|
||||
float ops_per_iteration = num_elements * num_scalars; // adds
|
||||
float gflops = (ops_per_iteration / (avg_time_us * 1e-6)) / 1e9;
|
||||
|
||||
std::cout << " Performance: " << std::endl;
|
||||
std::cout << " Average kernel time: " << avg_time_us << " us" << std::endl;
|
||||
std::cout << " Effective bandwidth: " << bandwidth_gb_s << " GB/s" << std::endl;
|
||||
std::cout << " Compute throughput: " << gflops << " GFLOPS" << std::endl;
|
||||
|
||||
avg_times_us[static_i] = avg_time_us;
|
||||
});
|
||||
|
||||
float speedup = avg_times_us[1] / avg_times_us[0];
|
||||
|
||||
std::cout << "On average kernel with prefetch is " << speedup
|
||||
<< " times faster than without prefetch." << std::endl;
|
||||
|
||||
if(speedup < 1.0f)
|
||||
std::cout << "WARNING: prefetch kernel is slower!" << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
launch_and_time_kernel(StreamConfig{nullptr, false},
|
||||
prefetch_kernel,
|
||||
dim3(grid_size),
|
||||
dim3(block_size),
|
||||
0, // lds_byte
|
||||
args);
|
||||
}
|
||||
|
||||
// Copy results back
|
||||
d_dst_with_prefetch_chunks.FromDevice(h_dst_with_prefetch_chunks.data());
|
||||
|
||||
// Verify results
|
||||
bool pass = ck::utils::check_err(h_dst_with_prefetch_chunks, h_expected);
|
||||
|
||||
std::cout << " Correctness: " << (pass ? "PASS" : "FAIL") << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
} // namespace prefetch_op_util
|
||||
} // namespace ck
|
||||
|
||||
@@ -1,39 +1,39 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
|
||||
#include "s_prefetch_inst_op_util.hpp"
|
||||
|
||||
template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS>
|
||||
bool run_test(bool time_kernels)
|
||||
{
|
||||
return ck::s_prefetch_inst_op_util::test_inst_prefetch_impl<T, NUM_THREADS, NUM_SCALARS>(
|
||||
time_kernels, "s_prefetch_inst_pc_rel");
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
if(!ck::is_gfx12_supported())
|
||||
{
|
||||
std::cout << "instruction cache prefetch is not supported by current HW, skipping tests."
|
||||
<< std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool time_kernels = false;
|
||||
if(argc == 2)
|
||||
{
|
||||
time_kernels = std::stoi(argv[1]);
|
||||
}
|
||||
|
||||
bool pass = true;
|
||||
|
||||
std::cout << "=== Testing Instruction Prefetch ===" << std::endl;
|
||||
|
||||
pass &= run_test<float, 4096, 16384>(time_kernels);
|
||||
|
||||
std::cout << "TestInstPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
|
||||
return pass ? 0 : 1;
|
||||
}
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
|
||||
#include "s_prefetch_inst_op_util.hpp"
|
||||
|
||||
template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS>
|
||||
bool run_test(bool time_kernels)
|
||||
{
|
||||
return ck::s_prefetch_inst_op_util::test_inst_prefetch_impl<T, NUM_THREADS, NUM_SCALARS>(
|
||||
time_kernels, "s_prefetch_inst_pc_rel");
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
if(!ck::is_gfx12_supported())
|
||||
{
|
||||
std::cout << "instruction cache prefetch is not supported by current HW, skipping tests."
|
||||
<< std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool time_kernels = false;
|
||||
if(argc == 2)
|
||||
{
|
||||
time_kernels = std::stoi(argv[1]);
|
||||
}
|
||||
|
||||
bool pass = true;
|
||||
|
||||
std::cout << "=== Testing Instruction Prefetch ===" << std::endl;
|
||||
|
||||
pass &= run_test<float, 4096, 16384>(time_kernels);
|
||||
|
||||
std::cout << "TestInstPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
|
||||
return pass ? 0 : 1;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user