mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-03 21:21:22 +00:00
[rocm-libraries] ROCm/rocm-libraries#5383 (commit b660b8c)
[CK_TILE] Add CShuffleLds microbenchmark suite
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Summary
Microbenchmarks isolating LDS store/load operations in CShuffleEpilogue
for bank conflict analysis.
## Motivation
CShuffleEpilogue performs LDS store (MFMA registers → LDS) and load (LDS
→ registers for coalesced global writes). This suite isolates each
operation to:
- Identify which operation causes bank conflicts
- Measure pure LDS bandwidth per access pattern
- Validate access patterns across MFMA tile sizes and wave layouts
## Components
- **Microkernels** (`tile_load_store_microkernels.hpp`):
`StoreTile<Setup>`, `LoadTile<Setup>`
- **Setup Adapters** (`benchmark_cshuffle_lds.hpp`): Wire
CShuffleEpilogue to microkernels
- **Template** (`benchmark_template.cpp.in`): Generated benchmarks with
timing
## Build
```bash
cmake -G Ninja -B build -S . \
-DGPU_TARGETS=gfx950 \
-DBUILD_CK_EXAMPLES=ON \
-DBUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS=ON
ninja -C build bench_lds_fp8_16x16x128_2x2_fp8
```
## New CMake Options
| Option | Default | Description |
|--------|---------|-------------|
| `BUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS` | OFF | LDS microbenchmarks |
| `BUILD_CK_TILE_FMHA_TESTS` | ON | FMHA tests |
| `BUILD_CK_TILE_ENGINE` | ON | Tile engine |
| `BUILD_CK_TILE_ENGINE_TESTS` | ON | Tile engine tests |
| `BUILD_CK_EXAMPLES` | ON | Examples |
| `BUILD_CK_TUTORIALS` | ON | Tutorials |
| `BUILD_CK_DEVICE_INSTANCES` | ON | Device instances |
| `BUILD_CK_PROFILER` | ON | Profiler |
Setting guards to OFF reduces cmake configure from ~150s to ~5s.
This commit is contained in:
committed by
assistant-librarian[bot]
parent
5348b577ed
commit
7dcc606adc
128
example/ck_tile/52_cshuffle_lds/CMakeLists.txt
Normal file
128
example/ck_tile/52_cshuffle_lds/CMakeLists.txt
Normal file
@@ -0,0 +1,128 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# CShuffleLds LDS store/load microbenchmark suite
|
||||
# Measures LDS bandwidth and bank conflicts for different MFMA configurations
|
||||
|
||||
# Directory inside the build tree that receives the generated benchmark sources.
set(GENERATED_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/generated")
file(MAKE_DIRECTORY "${GENERATED_SOURCE_DIR}")

# Core function: generate and build a single benchmark executable.
#
# Instantiates benchmark_template.cpp.in as <GENERATED_SOURCE_DIR>/<NAME>.cpp
# and compiles it as a HIP executable target called <NAME>.
#
# Arguments (all positional; each one is also substituted into the template
# through the @VAR@ placeholder of the same name):
#   NAME                          target name and generated file stem
#   A_TYPE B_TYPE ACC_TYPE O_TYPE C++ type names for the epilogue instantiation
#   M N                           block tile dimensions
#   M_WAVE N_WAVE                 wave layout
#   M_XDL N_XDL K_XDL             MFMA tile dimensions
#   CONFIG_NAME                   label printed by the benchmark
#
# Reads from the enclosing scope: GENERATED_SOURCE_DIR, SUPPORTED_GPU_TARGETS,
# CK_USE_OCP_FP8.
function(add_cshuffle_lds_benchmark NAME A_TYPE B_TYPE ACC_TYPE O_TYPE M N M_WAVE N_WAVE M_XDL N_XDL K_XDL CONFIG_NAME)
    # Paths are quoted so build/source directories containing spaces or
    # semicolons are still passed as one argument.
    set(GENERATED_SOURCE "${GENERATED_SOURCE_DIR}/${NAME}.cpp")
    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/benchmark_template.cpp.in" "${GENERATED_SOURCE}" @ONLY)

    # The generated file has a .cpp extension, so the HIP language is forced explicitly.
    set_source_files_properties("${GENERATED_SOURCE}" PROPERTIES LANGUAGE HIP)

    add_executable(${NAME} "${GENERATED_SOURCE}")
    set_property(TARGET ${NAME} PROPERTY HIP_ARCHITECTURES ${SUPPORTED_GPU_TARGETS})
    target_include_directories(${NAME} PRIVATE
        "${PROJECT_SOURCE_DIR}/include"
        "${PROJECT_SOURCE_DIR}/test"
        "${CMAKE_CURRENT_SOURCE_DIR}")
    target_link_libraries(${NAME} PRIVATE hip::device)

    if(CK_USE_OCP_FP8)
        # Preprocessor macros belong in COMPILE_DEFINITIONS, not in raw compile options.
        target_compile_definitions(${NAME} PRIVATE CK_TILE_USE_OCP_FP8)
    endif()
endfunction()
|
||||
|
||||
# Type-specific wrappers (derive name and config from parameters)
|
||||
# FP16 inputs, float accumulator, FP16 output.
# Target: bench_lds_fp16_<M_XDL>x<N_XDL>x<K_XDL>_<M_WAVE>x<N_WAVE>
function(add_fp16_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
    # Suffix shared by the target name and the printed configuration label.
    set(tile_tag "${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
    add_cshuffle_lds_benchmark(
        "bench_lds_fp16_${tile_tag}"
        "ck_tile::half_t" "ck_tile::half_t" "float" "ck_tile::half_t"
        ${M} ${N}
        ${M_WAVE} ${N_WAVE}
        ${M_XDL} ${N_XDL} ${K_XDL}
        "FP16_${tile_tag}")
endfunction()
|
||||
|
||||
# FP8 inputs, float accumulator, FP16 output.
# Target: bench_lds_fp8_<M_XDL>x<N_XDL>x<K_XDL>_<M_WAVE>x<N_WAVE>_fp16
function(add_fp8_fp16_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
    # Suffix shared by the target name and the printed configuration label.
    set(tile_tag "${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}_fp16")
    add_cshuffle_lds_benchmark(
        "bench_lds_fp8_${tile_tag}"
        "ck_tile::fp8_t" "ck_tile::fp8_t" "float" "ck_tile::half_t"
        ${M} ${N}
        ${M_WAVE} ${N_WAVE}
        ${M_XDL} ${N_XDL} ${K_XDL}
        "FP8_${tile_tag}")
endfunction()
|
||||
|
||||
# FP8 inputs, float accumulator, FP8 output.
# Target: bench_lds_fp8_<M_XDL>x<N_XDL>x<K_XDL>_<M_WAVE>x<N_WAVE>_fp8
function(add_fp8_fp8_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
    # Suffix shared by the target name and the printed configuration label.
    set(tile_tag "${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}_fp8")
    add_cshuffle_lds_benchmark(
        "bench_lds_fp8_${tile_tag}"
        "ck_tile::fp8_t" "ck_tile::fp8_t" "float" "ck_tile::fp8_t"
        ${M} ${N}
        ${M_WAVE} ${N_WAVE}
        ${M_XDL} ${N_XDL} ${K_XDL}
        "FP8_${tile_tag}")
endfunction()
|
||||
|
||||
# float inputs, accumulator and output.
# Target: bench_lds_fp32_<M_XDL>x<N_XDL>x<K_XDL>_<M_WAVE>x<N_WAVE>
function(add_fp32_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
    # Suffix shared by the target name and the printed configuration label.
    set(tile_tag "${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
    add_cshuffle_lds_benchmark(
        "bench_lds_fp32_${tile_tag}"
        "float" "float" "float" "float"
        ${M} ${N}
        ${M_WAVE} ${N_WAVE}
        ${M_XDL} ${N_XDL} ${K_XDL}
        "FP32_${tile_tag}")
endfunction()
|
||||
|
||||
# BF16 inputs, float accumulator, BF16 output.
# Target: bench_lds_bf16_<M_XDL>x<N_XDL>x<K_XDL>_<M_WAVE>x<N_WAVE>
function(add_bf16_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
    # Suffix shared by the target name and the printed configuration label.
    set(tile_tag "${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
    add_cshuffle_lds_benchmark(
        "bench_lds_bf16_${tile_tag}"
        "ck_tile::bf16_t" "ck_tile::bf16_t" "float" "ck_tile::bf16_t"
        ${M} ${N}
        ${M_WAVE} ${N_WAVE}
        ${M_XDL} ${N_XDL} ${K_XDL}
        "BF16_${tile_tag}")
endfunction()
|
||||
|
||||
# Helper: register one benchmark per wave layout (4x1, 2x2, 1x4) for a given MFMA tile.
#
# Arguments:
#   FUNC               name of a type-specific wrapper; it is invoked as
#                      FUNC(M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
#   M_XDL N_XDL K_XDL  MFMA tile dimensions
#
# The block tile is derived as M = M_XDL * M_WAVE and N = N_XDL * N_WAVE, i.e.
# exactly one MFMA tile per wave in each dimension.
#
# Implemented as a function (not a macro) so that the loop temporaries
# M, N, M_WAVE and N_WAVE do not leak into the calling scope.
function(add_benchmarks_for_mfma FUNC M_XDL N_XDL K_XDL)
    if(NOT COMMAND "${FUNC}")
        message(FATAL_ERROR "add_benchmarks_for_mfma: '${FUNC}' is not a defined command")
    endif()

    # Each quoted item is a two-element list: <M_WAVE>;<N_WAVE>.
    foreach(WAVE_LAYOUT "4;1" "2;2" "1;4")
        list(GET WAVE_LAYOUT 0 M_WAVE)
        list(GET WAVE_LAYOUT 1 N_WAVE)
        math(EXPR M "${M_XDL} * ${M_WAVE}")
        math(EXPR N "${N_XDL} * ${N_WAVE}")
        cmake_language(CALL ${FUNC} ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL})
    endforeach()
endfunction()
|
||||
|
||||
# Benchmarks registered for every GPU target.
# Each quoted item below is one MFMA tile written as <M_XDL>;<N_XDL>;<K_XDL>;
# expanding it unquoted passes the three dimensions as separate arguments.

# FP32 benchmarks -- MFMA tiles: 32x32x4, 32x32x8, 16x16x4, 16x16x8, 16x16x16
foreach(mfma_tile "32;32;4" "32;32;8" "16;16;4" "16;16;8" "16;16;16")
    add_benchmarks_for_mfma(add_fp32_benchmark ${mfma_tile})
endforeach()

# FP16 benchmarks -- MFMA tiles: 32x32x8, 32x32x16, 16x16x16, 4x64x16, 64x4x16
foreach(mfma_tile "32;32;8" "32;32;16" "16;16;16" "4;64;16" "64;4;16")
    add_benchmarks_for_mfma(add_fp16_benchmark ${mfma_tile})
endforeach()

# FP8 -> FP16 benchmarks -- MFMA tiles: 32x32x16, 16x16x32
foreach(mfma_tile "32;32;16" "16;16;32")
    add_benchmarks_for_mfma(add_fp8_fp16_benchmark ${mfma_tile})
endforeach()

# FP8 -> FP8 benchmarks -- MFMA tiles: 32x32x16, 16x16x32
foreach(mfma_tile "32;32;16" "16;16;32")
    add_benchmarks_for_mfma(add_fp8_fp8_benchmark ${mfma_tile})
endforeach()
|
||||
|
||||
# gfx950-only configurations: registered only when the target list mentions gfx950.
if(SUPPORTED_GPU_TARGETS MATCHES "gfx950")
    # FP16: 16x16x32
    add_benchmarks_for_mfma(add_fp16_benchmark 16 16 32)

    # BF16: 16x16x64 (gfx950-only, uses 16x16x32 base instruction).
    # Other BF16 tiles have the same LDS behavior as FP16 since both are 2-byte types.
    add_benchmarks_for_mfma(add_bf16_benchmark 16 16 64)

    # FP8 -> FP16, then FP8 -> FP8: 32x32x32, 32x32x64, 16x16x64, 16x16x128
    foreach(fp8_wrapper add_fp8_fp16_benchmark add_fp8_fp8_benchmark)
        foreach(mfma_tile "32;32;32" "32;32;64" "16;16;64" "16;16;128")
            add_benchmarks_for_mfma(${fp8_wrapper} ${mfma_tile})
        endforeach()
    endforeach()
endif()
|
||||
61
example/ck_tile/52_cshuffle_lds/README.md
Normal file
61
example/ck_tile/52_cshuffle_lds/README.md
Normal file
@@ -0,0 +1,61 @@
|
||||
# CShuffleLds LDS Microbenchmarks
|
||||
|
||||
Microbenchmark suite for measuring LDS (Local Data Share) bandwidth and bank conflicts in the CShuffleEpilogue cross-lane shuffle patterns.
|
||||
|
||||
## What This Measures
|
||||
|
||||
The CShuffleEpilogue uses LDS to redistribute GEMM output tiles from MFMA register layout to thread-raked layout for efficient global memory writes. This benchmark isolates the LDS store/load operations to measure:
|
||||
|
||||
1. **Store bandwidth** - Writing accumulator tiles to LDS (MFMA → LDS)
|
||||
2. **Load bandwidth** - Reading shuffled tiles from LDS (LDS → thread-raked)
|
||||
3. **Bank conflicts** - LDS bank conflicts during store/load (via rocprofv3)
|
||||
|
||||
## Configurations
|
||||
|
||||
Benchmarks are generated for every MFMA tile listed below, each combined with every wave layout:
|
||||
|
||||
- **FP32 MFMA tiles**: 32x32x4, 32x32x8, 16x16x4, 16x16x8, 16x16x16
|
||||
- **FP16 MFMA tiles**: 32x32x8, 32x32x16, 16x16x16, 4x64x16, 64x4x16
|
||||
- **FP8 MFMA tiles**: 32x32x16, 16x16x32 (output FP16 or FP8)
|
||||
- **Wave layouts**: 4x1, 2x2, 1x4 (block tile = MFMA tile × wave layout)
|
||||
|
||||
**gfx950-only configurations:**
|
||||
- **FP16**: 16x16x32
|
||||
- **BF16**: 16x16x64 (uses gfx950-only 16x16x32 base instruction)
|
||||
- **FP8**: 32x32x32, 32x32x64, 16x16x64, 16x16x128 (output FP16 or FP8)
|
||||
|
||||
Each configuration produces two measurements: Store and Load.
|
||||
|
||||
## Building
|
||||
|
||||
```bash
|
||||
cmake -G Ninja -B build -S . \
|
||||
-DGPU_TARGETS=gfx950 \
|
||||
-DBUILD_CK_EXAMPLES=ON \
|
||||
-DBUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS=ON
|
||||
|
||||
ninja -C build bench_lds_fp8_16x16x128_2x2_fp8 # Single benchmark
|
||||
```
|
||||
|
||||
## Running
|
||||
|
||||
```bash
|
||||
# Run a single benchmark
|
||||
./build/bin/bench_lds_fp8_16x16x128_2x2_fp8 --warmup 3 --iters 10
|
||||
|
||||
# Profile with rocprofv3 for bank conflicts
|
||||
cat > counters.txt <<EOF
|
||||
pmc: SQ_LDS_BANK_CONFLICT SQ_INSTS_LDS
|
||||
EOF
|
||||
|
||||
rocprofv3 -i counters.txt -d output/ -- \
|
||||
./build/bin/bench_lds_fp8_16x16x128_2x2_fp8
|
||||
```
|
||||
|
||||
## Implementation
|
||||
|
||||
- **Generic kernels**: `include/ck_tile/utility/tile_load_store_microkernels.hpp`
|
||||
- **Setup adapters**: `benchmark_cshuffle_lds.hpp`
|
||||
- **Template generation**: `benchmark_template.cpp.in`
|
||||
|
||||
The benchmark uses CK's `launch_kernel` infrastructure for timing and `make_kernel` for functor-based kernel dispatch.
|
||||
122
example/ck_tile/52_cshuffle_lds/benchmark_cshuffle_lds.hpp
Normal file
122
example/ck_tile/52_cshuffle_lds/benchmark_cshuffle_lds.hpp
Normal file
@@ -0,0 +1,122 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
/**
|
||||
* @file benchmark_cshuffle_lds.hpp
|
||||
* @brief LDS benchmark setup for CShuffleEpilogue.
|
||||
*
|
||||
* Provides Setup adapters that extract LDS descriptor and distribution
|
||||
* from CShuffleEpilogue for use with generic tile benchmark kernels.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/utility/tile_load_store_microkernels.hpp"
|
||||
#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp"
|
||||
#include "ck_tile/ops/common/tensor_layout.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/**
|
||||
* @brief Create CShuffleEpilogue type from benchmark parameters.
|
||||
*/
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename ODataType,
|
||||
index_t kM,
|
||||
index_t kN,
|
||||
index_t MWave,
|
||||
index_t NWave,
|
||||
index_t MPerXdl,
|
||||
index_t NPerXdl,
|
||||
index_t KPerXdl>
|
||||
using BenchmarkEpilogue = CShuffleEpilogue<CShuffleEpilogueProblem<ADataType,
|
||||
BDataType,
|
||||
tuple<>,
|
||||
AccDataType,
|
||||
ODataType,
|
||||
tuple<>,
|
||||
tensor_layout::gemm::RowMajor,
|
||||
element_wise::PassThrough,
|
||||
kM,
|
||||
kN,
|
||||
MWave,
|
||||
NWave,
|
||||
MPerXdl,
|
||||
NPerXdl,
|
||||
KPerXdl,
|
||||
false>>;
|
||||
|
||||
/**
|
||||
* @brief Setup for LDS store benchmark - adapts CShuffleEpilogue for tile benchmark.
|
||||
*/
|
||||
template <typename Epilogue>
|
||||
struct LdsStoreSetup
|
||||
{
|
||||
using ODataType = typename Epilogue::ODataType;
|
||||
static constexpr index_t kBlockSize = Epilogue::kBlockSize;
|
||||
static constexpr index_t kBytes =
|
||||
Epilogue::MPerIterationShuffle * Epilogue::NPerIterationShuffle * sizeof(ODataType);
|
||||
static constexpr auto lds_desc =
|
||||
Epilogue::template MakeLdsBlockDescriptor<typename Epilogue::Problem>();
|
||||
static constexpr auto distr =
|
||||
make_static_tile_distribution(Epilogue::MakeLdsDistributionEncode());
|
||||
|
||||
CK_TILE_DEVICE static auto create()
|
||||
{
|
||||
alignas(16) __shared__ char smem[Epilogue::GetSmemSize()];
|
||||
|
||||
auto lds_view =
|
||||
make_tensor_view<address_space_enum::lds>(reinterpret_cast<ODataType*>(smem), lds_desc);
|
||||
|
||||
auto window = make_tile_window(lds_view,
|
||||
make_tuple(number<Epilogue::MPerIterationShuffle>{},
|
||||
number<Epilogue::NPerIterationShuffle>{}),
|
||||
{0, 0},
|
||||
distr);
|
||||
|
||||
auto tile = make_static_distributed_tensor<ODataType>(distr);
|
||||
|
||||
return make_tuple(window, tile);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Setup for LDS load benchmark - adapts CShuffleEpilogue for tile benchmark.
|
||||
*/
|
||||
template <typename Epilogue>
|
||||
struct LdsLoadSetup
|
||||
{
|
||||
using ODataType = typename Epilogue::ODataType;
|
||||
static constexpr index_t kBlockSize = Epilogue::kBlockSize;
|
||||
static constexpr index_t kBytes =
|
||||
Epilogue::MPerIterationShuffle * Epilogue::NPerIterationShuffle * sizeof(ODataType);
|
||||
static constexpr auto lds_desc =
|
||||
Epilogue::template MakeLdsBlockDescriptor<typename Epilogue::Problem>();
|
||||
|
||||
using ReadPattern =
|
||||
tile_distribution_encoding_pattern_2d<Epilogue::kBlockSize,
|
||||
Epilogue::MPerIterationShuffle,
|
||||
Epilogue::NPerIterationShuffle,
|
||||
Epilogue::GetVectorSizeC(),
|
||||
tile_distribution_pattern::thread_raked>;
|
||||
static constexpr auto read_distr = ReadPattern::make_2d_static_tile_distribution();
|
||||
|
||||
CK_TILE_DEVICE static auto create()
|
||||
{
|
||||
alignas(16) __shared__ char smem[Epilogue::GetSmemSize()];
|
||||
|
||||
auto lds_view =
|
||||
make_tensor_view<address_space_enum::lds>(reinterpret_cast<ODataType*>(smem), lds_desc);
|
||||
|
||||
return make_tile_window(lds_view,
|
||||
make_tuple(number<Epilogue::MPerIterationShuffle>{},
|
||||
number<Epilogue::NPerIterationShuffle>{}),
|
||||
{0, 0},
|
||||
read_distr);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
100
example/ck_tile/52_cshuffle_lds/benchmark_template.cpp.in
Normal file
100
example/ck_tile/52_cshuffle_lds/benchmark_template.cpp.in
Normal file
@@ -0,0 +1,100 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// clang-format off
|
||||
|
||||
#include "benchmark_cshuffle_lds.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
// Epilogue under test. Every @...@ token in this file is a placeholder that
// CMake's configure_file() replaces when the benchmark source is generated.
using Epilogue = ck_tile::BenchmarkEpilogue<@A_TYPE@,
                                            @B_TYPE@,
                                            @ACC_TYPE@,
                                            @O_TYPE@,
                                            @M@,
                                            @N@,
                                            @M_WAVE@,
                                            @N_WAVE@,
                                            @M_XDL@,
                                            @N_XDL@,
                                            @K_XDL@>;

// Setup adapters consumed by the store and load microkernels respectively.
using StoreSetup = ck_tile::LdsStoreSetup<Epilogue>;
using LoadSetup  = ck_tile::LdsLoadSetup<Epilogue>;
|
||||
|
||||
// Writes the usage text, the supported options and the compiled-in benchmark
// configuration to standard output, then flushes the stream.
//   prog: program name shown on the "Usage:" line (normally argv[0]).
void print_help(const char* prog)
{
    std::ostream& out = std::cout;

    // Synopsis and one-line description.
    out << "Usage: " << prog << " [options]\n";
    out << "\n"
           "LDS microbenchmark for CShuffleEpilogue (@CONFIG_NAME@)\n"
           "\n";

    // Command-line options.
    out << "Options:\n"
           " -w, --warmup <N> Warmup iterations (default: 3)\n"
           " -i, --iters <N> Benchmark iterations (default: 10)\n"
           " -h, --help Show this help message\n"
           "\n";

    // Configuration baked in when this source was generated.
    out << "Configuration:\n"
           " MFMA tile: @M_XDL@x@N_XDL@x@K_XDL@\n"
           " Wave layout: @M_WAVE@x@N_WAVE@\n"
           " Block tile: @M@x@N@\n";

    out << std::endl;
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
int warmup = 3;
|
||||
int iters = 10;
|
||||
|
||||
for (int i = 1; i < argc; ++i)
|
||||
{
|
||||
if (std::strcmp(argv[i], "-h") == 0 || std::strcmp(argv[i], "--help") == 0)
|
||||
{
|
||||
print_help(argv[0]);
|
||||
return 0;
|
||||
}
|
||||
else if ((std::strcmp(argv[i], "-w") == 0 || std::strcmp(argv[i], "--warmup") == 0) && i + 1 < argc)
|
||||
{
|
||||
int val = std::atoi(argv[++i]);
|
||||
if (val <= 0)
|
||||
{
|
||||
std::cerr << "Error: --warmup requires a positive integer\n";
|
||||
return 1;
|
||||
}
|
||||
warmup = val;
|
||||
}
|
||||
else if ((std::strcmp(argv[i], "-i") == 0 || std::strcmp(argv[i], "--iters") == 0) && i + 1 < argc)
|
||||
{
|
||||
int val = std::atoi(argv[++i]);
|
||||
if (val <= 0)
|
||||
{
|
||||
std::cerr << "Error: --iters requires a positive integer\n";
|
||||
return 1;
|
||||
}
|
||||
iters = val;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << "Unknown option: " << argv[i] << "\n";
|
||||
print_help(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "=== @CONFIG_NAME@ ===" << std::endl;
|
||||
|
||||
ck_tile::stream_config stream{nullptr, true, 0, warmup, iters, true};
|
||||
|
||||
// Store benchmark
|
||||
{
|
||||
float ms = ck_tile::launch_kernel(stream,
|
||||
ck_tile::make_kernel(ck_tile::StoreTile<StoreSetup>{},
|
||||
dim3(1), dim3(StoreSetup::kBlockSize), 0));
|
||||
double gb_s = (double(StoreSetup::kBytes) / 1e9) / (ms / 1e3);
|
||||
std::cout << "Store: " << ms << " ms, " << gb_s << " GB/s" << std::endl;
|
||||
}
|
||||
|
||||
// Load benchmark
|
||||
{
|
||||
float ms = ck_tile::launch_kernel(stream,
|
||||
ck_tile::make_kernel(ck_tile::LoadTile<LoadSetup>{},
|
||||
dim3(1), dim3(LoadSetup::kBlockSize), 0));
|
||||
double gb_s = (double(LoadSetup::kBytes) / 1e9) / (ms / 1e3);
|
||||
std::cout << "Load: " << ms << " ms, " << gb_s << " GB/s" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user