mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-29 19:28:33 +00:00
print hipOccupancyDefined Grid size for best in ckProfiler
This commit is contained in:
13
include/ck/tensor_operation/gpu/device/device_base.hpp
Executable file → Normal file
13
include/ck/tensor_operation/gpu/device/device_base.hpp
Executable file → Normal file
@@ -56,17 +56,10 @@ struct BaseArgument
|
||||
virtual ~BaseArgument() {}
|
||||
|
||||
void* p_workspace_ = nullptr;
|
||||
|
||||
virtual dim3 GetLaunchGridDims() const
|
||||
{
|
||||
return dim3{0, 0, 0};
|
||||
}
|
||||
|
||||
virtual bool HasLaunchGridDims() const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual dim3 GetLaunchGridDims() const { return dim3{0, 0, 0}; }
|
||||
|
||||
virtual bool HasLaunchGridDims() const { return false; }
|
||||
};
|
||||
|
||||
struct BaseInvoker
|
||||
|
||||
17
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
Executable file → Normal file
17
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
Executable file → Normal file
@@ -168,17 +168,18 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
|
||||
hip_check_error(hipGetDevice(&dev));
|
||||
hip_check_error(hipGetDeviceProperties(&dev_prop, dev));
|
||||
num_cu = dev_prop.multiProcessorCount;
|
||||
// arg.Grid_size = num_cu * occupancy;
|
||||
arg.Grid_size = num_cu * occupancy;
|
||||
// grid_dim = arg.Grid_size;
|
||||
grid_dim.x = num_cu * occupancy; // Set the x-dimension
|
||||
|
||||
// TODO: Set grid_dim.y and grid_dim.z appropriately if they are not 1.
|
||||
// This often comes from the block_2_ctile_map.CalculateGridSize(...)
|
||||
// For now, assuming they might be 1 or derived from block_2_ctile_map elsewhere if needed.
|
||||
// If block_2_ctile_map.CalculateGridSize gives (N0, M0, k_split), then
|
||||
// grid_dim might be (N0, M0, k_split) or (total_blocks, 1, 1)
|
||||
// The current code sets grid_dim = arg.Grid_size (if positive) or occupancy-based (if negative)
|
||||
// which implies a 1D grid of blocks. We'll stick to that interpretation for grid_dim.x
|
||||
// For now, assuming they might be 1 or derived from block_2_ctile_map elsewhere
|
||||
// if needed. If block_2_ctile_map.CalculateGridSize gives (N0, M0, k_split),
|
||||
// then grid_dim might be (N0, M0, k_split) or (total_blocks, 1, 1) The current
|
||||
// code sets grid_dim = arg.Grid_size (if positive) or occupancy-based (if
|
||||
// negative) which implies a 1D grid of blocks. We'll stick to that
|
||||
// interpretation for grid_dim.x
|
||||
grid_dim.y = 1;
|
||||
grid_dim.z = 1;
|
||||
}
|
||||
@@ -190,7 +191,7 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
|
||||
grid_dim.y = 1;
|
||||
grid_dim.z = 1;
|
||||
}
|
||||
|
||||
|
||||
arg.SetLaunchGridDims(grid_dim); // Store the determined launch grid dimensions
|
||||
|
||||
if(stream_config.flush_cache)
|
||||
@@ -819,7 +820,7 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
|
||||
<< BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
|
||||
<< "BlkGemmPipelinePrefetchStages: "
|
||||
<< GridwiseGemm::BlockwiseGemmPipe::PrefetchStages ;
|
||||
|
||||
|
||||
// clang-format on
|
||||
|
||||
return str.str();
|
||||
|
||||
15
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
Executable file → Normal file
15
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
Executable file → Normal file
@@ -631,19 +631,10 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
|
||||
|
||||
mutable dim3 launch_grid_dims_;
|
||||
|
||||
void SetLaunchGridDims(dim3 dims) const
|
||||
{
|
||||
launch_grid_dims_ = dims;
|
||||
}
|
||||
void SetLaunchGridDims(dim3 dims) const { launch_grid_dims_ = dims; }
|
||||
|
||||
dim3 GetLaunchGridDims() const override
|
||||
{
|
||||
return launch_grid_dims_;
|
||||
}
|
||||
bool HasLaunchGridDims() const override
|
||||
{
|
||||
return launch_grid_dims_.x > 0;
|
||||
}
|
||||
dim3 GetLaunchGridDims() const override { return launch_grid_dims_; }
|
||||
bool HasLaunchGridDims() const override { return launch_grid_dims_.x > 0; }
|
||||
};
|
||||
|
||||
struct SplitKBatchOffset
|
||||
|
||||
53
profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
Executable file → Normal file
53
profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
Executable file → Normal file
@@ -251,14 +251,16 @@ bool profile_gemm_universal_streamk_impl(int do_verification,
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
// // const auto actual_launch_grid_dims = argument_ptr->GetLaunchGridDims();
|
||||
// const auto* typed_argument_ptr = dynamic_cast<const GridwiseGemm_xdl_cshuffle_streamk_v3<...>::Argument*>(argument_ptr)
|
||||
// const auto* typed_argument_ptr = dynamic_cast<const
|
||||
// GridwiseGemm_xdl_cshuffle_streamk_v3<...>::Argument*>(argument_ptr)
|
||||
|
||||
// Get actual launch grid dims from argument
|
||||
dim3 actual_launch_grid_dims = argument_ptr->GetLaunchGridDims();
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
|
||||
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", Grid_size ";
|
||||
|
||||
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name
|
||||
<< ", Grid_size ";
|
||||
|
||||
if(argument_ptr->HasLaunchGridDims() && actual_launch_grid_dims.x > 0)
|
||||
{
|
||||
std::cout << actual_launch_grid_dims.x;
|
||||
@@ -267,31 +269,9 @@ bool profile_gemm_universal_streamk_impl(int do_verification,
|
||||
{
|
||||
std::cout << grid_size_curr;
|
||||
}
|
||||
|
||||
|
||||
std::cout << ", streamk selection strategy " << streamk_sel_curr << std::endl;
|
||||
|
||||
// if (typed_argument_ptr)
|
||||
// {
|
||||
// const auto actual_launch_grid_dims = typed_argument_ptr->GetLaunchGridDims();
|
||||
// std::cout << "Actual Grid Dimensions: " << actual_launch_grid_dims.x << "x"
|
||||
// << actual_launch_grid_dims.y << "x" << actual_launch_grid_dims.z << std::endl;
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// std::cerr << "Error: Failed to cast argument_ptr to the correct type." << std::endl;
|
||||
// }
|
||||
|
||||
// std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
|
||||
// << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", Grid_size "
|
||||
// << actual_launch_grid_dims.x // Use the x-dimension of the actual launch grid
|
||||
// << ", streamk selection strategy "
|
||||
// << streamk_sel_curr << std::endl;
|
||||
|
||||
// std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
|
||||
// << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", Grid_size "
|
||||
// << grid_size_curr << ", streamk selection strategy"
|
||||
// << streamk_sel_curr << std::endl;
|
||||
|
||||
#if defined CK_ENABLE_FP8
|
||||
// set softer tolerances for fp8
|
||||
if constexpr(is_same_v<ADataType, f8_t> || is_same_v<BDataType, f8_t> ||
|
||||
@@ -313,11 +293,22 @@ bool profile_gemm_universal_streamk_impl(int do_verification,
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_grid_size = grid_size_curr;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
|
||||
best_grid_size = grid_size_curr;
|
||||
|
||||
if(argument_ptr->HasLaunchGridDims() && actual_launch_grid_dims.x > 0)
|
||||
{
|
||||
best_grid_size = actual_launch_grid_dims.x;
|
||||
}
|
||||
else
|
||||
{
|
||||
best_grid_size = grid_size_curr;
|
||||
}
|
||||
|
||||
best_streamk_sel = streamk_sel_curr;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user