mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
[CK_TILE, CK_BUILDER] Add bwd data to CK Tile profiler (#5516)
## Motivation We want to close the performance gap between old CK and CK Tile for bwd data convolutions. To achieve this, we need two things: - Configurations for the old CK kernel instances such that we can map them into CK Tile instances. - Support in CK profiler to run the CK Tile instance with the same API as for old CK instances. ## Technical Details Extracted kernel configurations from old CK. The codegen python script for CK Tile convs is extended to also support bwd data. The generated instances are added to the CMake build (target `device_grouped_conv_bwd_data_tile_instances`). A new profiler op (`grouped_conv_bwd_data_tile`) has been added to the CK Profiler. The API is the same as for old CK's profiler op `grouped_conv_bwd_data`. --------- Co-authored-by: Ville Pietilä <>
This commit is contained in:
@@ -0,0 +1,204 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <tuple>
|
||||
|
||||
#include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp"
|
||||
#include "grouped_convolution_signatures.hpp"
|
||||
#include "ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp"
|
||||
|
||||
#include "ck_tile/builder/testing/filter_extent.hpp"
|
||||
#include "ck_tile/builder/testing/conv/ck_tile.hpp"
|
||||
#include "ck_tile/builder/testing/conv/reference.hpp"
|
||||
#include "ck_tile/builder/conv_builder.hpp"
|
||||
#include "tile_profiler_utils.hpp"
|
||||
|
||||
namespace ck_tile::builder::profiling {
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
|
||||
#include "../../../experimental/grouped_convolution_tile_instances/instances/backward_data/grouped_convolution_backward_data_tile_ndhwgc_fp32.inc"
|
||||
#include "../../../experimental/grouped_convolution_tile_instances/instances/backward_data/grouped_convolution_backward_data_tile_nhwgc_fp32.inc"
|
||||
#include "../../../experimental/grouped_convolution_tile_instances/instances/backward_data/grouped_convolution_backward_data_tile_nhwgc_bf16.inc"
|
||||
#include "../../../experimental/grouped_convolution_tile_instances/instances/backward_data/grouped_convolution_backward_data_tile_nhwgc_fp16.inc"
|
||||
#include "../../../experimental/grouped_convolution_tile_instances/instances/backward_data/grouped_convolution_backward_data_tile_ndhwgc_bf16.inc"
|
||||
#include "../../../experimental/grouped_convolution_tile_instances/instances/backward_data/grouped_convolution_backward_data_tile_ndhwgc_fp16.inc"
|
||||
|
||||
template <auto SIGNATURE>
|
||||
void run_cpu_validation(const ckt::Args<SIGNATURE>& args,
|
||||
const ckt::Outputs<SIGNATURE>& outputs,
|
||||
const ckt::Outputs<SIGNATURE>& reference)
|
||||
{
|
||||
using DataType =
|
||||
std::conditional_t<SIGNATURE.data_type == ckb::DataType::FP32,
|
||||
float,
|
||||
std::conditional_t<SIGNATURE.data_type == ckb::DataType::FP16,
|
||||
ck_tile::half_t,
|
||||
ck_tile::bfloat16_t>>;
|
||||
const auto conv_param = args.to_ck_tile_conv_param();
|
||||
|
||||
const std::size_t input_bytes_num = conv_param.template GetInputByte<DataType>();
|
||||
std::vector<DataType> in(input_bytes_num / sizeof(DataType));
|
||||
std::vector<DataType> ref(input_bytes_num / sizeof(DataType));
|
||||
HIP_CHECK_ERROR(
|
||||
hipMemcpy(&ref.data()[0], reference.input, input_bytes_num, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK_ERROR(
|
||||
hipMemcpy(&in.data()[0], outputs.input, input_bytes_num, hipMemcpyDeviceToHost));
|
||||
ck_tile::check_err(in, ref, "\tError: Incorrect results!");
|
||||
}
|
||||
|
||||
/// @brief `run_grouped_conv_backward_data_tile_algs()` runs all grouped conv bwd data instances.
///
/// @tparam SIGNATURE Backward data convolution signature.
///
/// @see run_grouped_conv_backward_data_tile_algs()
|
||||
template <auto SIGNATURE>
std::tuple<bool, float, std::string, int, int>
run_grouped_conv_backward_data_tile_algs(const ckt::Args<SIGNATURE>& args,
                                         const std::string& split_k,
                                         const index_t instance_index,
                                         const ckt::Inputs<SIGNATURE>& inputs,
                                         const ckt::Outputs<SIGNATURE>& outputs,
                                         const ck_tile::stream_config& s_conf)
{
    // Best-so-far tracking across all instances and split-K values.
    float best_avg_time = std::numeric_limits<float>::max();
    std::string best_op_name, op_name;
    int best_split_k = 0;
    ck::index_t best_instance_index = -1;
    bool is_supported = false;
    float avg_time;
    bool all_instances_valid = true;

    // Resolve the concrete element type from the signature's data-type tag.
    using DataType =
        std::conditional_t<SIGNATURE.data_type == ckb::DataType::FP32,
                           float,
                           std::conditional_t<SIGNATURE.data_type == ckb::DataType::FP16,
                                              ck_tile::half_t,
                                              ck_tile::bfloat16_t>>;

    // Produce the reference result once; every instance is compared against it.
    auto reference = ckt::alloc_outputs(args);
    using ReferenceInstance =
        typename ckb::ConvBuilder<SIGNATURE, ckt::ConvAlgorithm_Reference{}>::Instance;
    auto ref_conv = ReferenceInstance{};
    // NOTE(review): ref_result is never inspected afterwards — confirm the
    // reference run cannot fail silently.
    auto ref_result = ckt::run(ref_conv, args, inputs, reference.get());

    const auto conv_param = args.to_ck_tile_conv_param();

    // Get max possible value in the output; it is used below to scale the
    // absolute validation tolerance.
    const std::size_t input_bytes_num = conv_param.template GetInputByte<DataType>();
    std::vector<DataType> ref(input_bytes_num / sizeof(DataType));
    HIP_CHECK_ERROR(
        hipMemcpy(&ref.data()[0], reference.get().input, input_bytes_num, hipMemcpyDeviceToHost));
    const float max_accumulated_value = *std::max_element(ref.begin(), ref.end());

    // Number of accumulations in the GEMM reduction dimension.
    const index_t num_accums = conv_param.K_;

    // BWD data doesn't support split-K autodeduce value -1
    auto split_k_values = get_split_k_values(split_k);
    split_k_values.erase(std::remove(split_k_values.begin(), split_k_values.end(), -1),
                         split_k_values.end());

    index_t num_kernel = 0;
    // Invoked once per generated instance (by the included *_calls.inc files);
    // benchmarks and validates the instance for every requested split-K value.
    auto run_alg = [&](auto&& run_alg_func) {
        num_kernel++;
        // Skip if a specific instance was requested and this isn't it
        const bool running_specific_instance = (instance_index != -1);
        const bool current_is_target = (num_kernel - 1 == instance_index);
        if(running_specific_instance && !current_is_target)
        {
            return;
        }

        for(auto& k_batch : split_k_values)
        {
            // Run the instance with this k_batch on a copy of the arguments.
            ckt::Args<SIGNATURE> args_k_batch = args;
            args_k_batch.k_batch = k_batch;
            std::tie(is_supported, avg_time, op_name) =
                run_alg_func(args_k_batch, inputs, outputs, s_conf);
            if(is_supported)
            {
                // Compare every output tensor against the reference with
                // tolerances scaled by accumulation depth and split-K.
                ckt::ValidationReport report;
                auto&& [rtol, atol] =
                    get_rtol_atol<SIGNATURE>(num_accums, k_batch, max_accumulated_value);
                ckt::Outputs<SIGNATURE>::reflect(
                    args_k_batch,
                    [&](std::string_view name,
                        const auto& desc,
                        void* ckt::Outputs<SIGNATURE>::*ptr) {
                        report.check(name, desc, outputs.*ptr, reference.get().*ptr, rtol, atol);
                    });

                const bool valid = report.get_errors().empty();
                if(valid)
                {
                    if(avg_time < best_avg_time)
                    {
                        best_instance_index = num_kernel - 1;
                    }
                    // NOTE(review): on an exact tie (avg_time == best_avg_time)
                    // best_op_name/best_split_k are overwritten while
                    // best_instance_index is not — confirm this is intended.
                    best_avg_time = std::min(best_avg_time, avg_time);
                    best_op_name = best_avg_time < avg_time ? best_op_name : op_name;
                    best_split_k = best_avg_time < avg_time ? best_split_k : k_batch;
                    std::cout << "[Valid] Perf: " << std::setw(10) << avg_time << " ms," << " "
                              << op_name << " (instance " << num_kernel - 1 << "), SplitK "
                              << k_batch << std::endl;
                }
                else
                {
                    std::cout << "[Error] " << op_name << ", SplitK " << k_batch << std::endl;
                    for(const auto& error : report.get_errors())
                    {
                        std::cout << "\tNumber of incorrect values: " << error.wrong_elements
                                  << " Is all zero:" << error.is_all_zero()
                                  << " max err: " << error.max_error << std::endl;
                        // Re-check on the CPU to print the offending values.
                        run_cpu_validation<SIGNATURE>(args_k_batch, outputs, reference.get());
                    }
                    all_instances_valid = false;
                }
            }
            else
            {
                std::cout << "[Not supported] " << op_name << ", SplitK " << k_batch << std::endl;
            }
        }
    };

    // Each *_calls.inc file expands to a sequence of run_alg(...) invocations,
    // one per generated kernel instance for the matching signature.
    if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP16_BWD_DATA)
    {
#include "../../experimental/grouped_convolution_tile_instances/instances/backward_data/grouped_convolution_backward_data_tile_nhwgc_fp16_calls.inc"
    }
    else if constexpr(SIGNATURE == SIGNATURE_NHWGC_BF16_BWD_DATA)
    {
#include "../../experimental/grouped_convolution_tile_instances/instances/backward_data/grouped_convolution_backward_data_tile_nhwgc_bf16_calls.inc"
    }
    else if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP32_BWD_DATA)
    {
#include "../../experimental/grouped_convolution_tile_instances/instances/backward_data/grouped_convolution_backward_data_tile_nhwgc_fp32_calls.inc"
    }
    else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP16_BWD_DATA)
    {
#include "../../experimental/grouped_convolution_tile_instances/instances/backward_data/grouped_convolution_backward_data_tile_ndhwgc_fp16_calls.inc"
    }
    else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_BF16_BWD_DATA)
    {
#include "../../experimental/grouped_convolution_tile_instances/instances/backward_data/grouped_convolution_backward_data_tile_ndhwgc_bf16_calls.inc"
    }
    else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP32_BWD_DATA)
    {
#include "../../experimental/grouped_convolution_tile_instances/instances/backward_data/grouped_convolution_backward_data_tile_ndhwgc_fp32_calls.inc"
    }
    else
    {
        std::cout << "Signature not supported" << std::endl;
        return std::make_tuple(
            false, best_avg_time, best_op_name, best_split_k, best_instance_index);
    }
    return std::make_tuple(
        all_instances_valid, best_avg_time, best_op_name, best_split_k, best_instance_index);
}
|
||||
|
||||
} // namespace ck_tile::builder::profiling
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "ck_tile/builder/testing/conv/ck_tile.hpp"
|
||||
#include "ck_tile/builder/testing/conv/reference.hpp"
|
||||
#include "ck_tile/builder/conv_builder.hpp"
|
||||
#include "tile_profiler_utils.hpp"
|
||||
|
||||
namespace ck_tile::builder::profiling {
|
||||
|
||||
@@ -28,26 +29,6 @@ namespace ckt = ck_tile::builder::test;
|
||||
#include "../../../experimental/grouped_convolution_tile_instances/instances/backward_weight/grouped_convolution_backward_weight_tile_ndhwgc_bf16.inc"
|
||||
#include "../../../experimental/grouped_convolution_tile_instances/instances/backward_weight/grouped_convolution_backward_weight_tile_ndhwgc_fp16.inc"
|
||||
|
||||
// Translate the profiler's split-K CLI argument into the list of k-batch
// values to benchmark. "all" expands to the full sweep (where -1 means
// "auto deduce"); any other token must parse as a single integer, otherwise
// the process exits with a diagnostic.
std::vector<int> get_split_k_values(const std::string& split_k)
{
    if(split_k == "all")
    {
        return {/*auto deduce value*/ -1, 1, 2, 4, 8, 16, 32, 64, 128};
    }

    try
    {
        return std::vector<int>{std::stoi(split_k)};
    }
    catch(const std::exception& e)
    {
        std::cerr << e.what() << '\n';
        exit(EXIT_FAILURE);
    }
}
|
||||
|
||||
template <auto SIGNATURE>
|
||||
void run_cpu_validation(const ckt::Args<SIGNATURE>& args,
|
||||
const ckt::Outputs<SIGNATURE>& outputs,
|
||||
@@ -71,36 +52,6 @@ void run_cpu_validation(const ckt::Args<SIGNATURE>& args,
|
||||
ck_tile::check_err(wei, ref, "\tError: Incorrect results!");
|
||||
}
|
||||
|
||||
template <auto SIGNATURE>
|
||||
std::tuple<double, double>
|
||||
get_rtol_atol(const int num_accums, const int k_batch, const float max_accumulated_value)
|
||||
{
|
||||
using WeiDataType =
|
||||
std::conditional_t<SIGNATURE.data_type == ckb::DataType::FP32,
|
||||
float,
|
||||
std::conditional_t<SIGNATURE.data_type == ckb::DataType::FP16,
|
||||
ck_tile::half_t,
|
||||
ck_tile::bfloat16_t>>;
|
||||
using ComputeType = WeiDataType;
|
||||
using AccDataType = float;
|
||||
|
||||
// Assign middle value of the range for auto deduce
|
||||
const int num_accums_split_k = k_batch > 0 ? k_batch : 64;
|
||||
auto rtol = ck_tile::get_relative_threshold<ComputeType, WeiDataType, AccDataType>(
|
||||
num_accums / num_accums_split_k);
|
||||
auto atol = ck_tile::get_absolute_threshold<ComputeType, WeiDataType, AccDataType>(
|
||||
max_accumulated_value / num_accums_split_k, num_accums / num_accums_split_k);
|
||||
// Calculate error due to split_k accumulation
|
||||
auto rtol_split_k =
|
||||
ck_tile::get_relative_threshold<WeiDataType, WeiDataType, WeiDataType>(num_accums_split_k);
|
||||
auto atol_split_k = ck_tile::get_absolute_threshold<WeiDataType, WeiDataType, WeiDataType>(
|
||||
max_accumulated_value, num_accums_split_k);
|
||||
// Use higher threshold
|
||||
rtol = std::max(rtol, rtol_split_k);
|
||||
atol = std::max(atol, atol_split_k);
|
||||
return std::make_tuple(rtol, atol);
|
||||
}
|
||||
|
||||
/// @brief `run_grouped_conv_backward_weight_tile_algs()` runs all grouped conv bwd weight instances.
///
/// @tparam SIGNATURE Backward weight convolution signature.
|
||||
|
||||
@@ -5,124 +5,5 @@
|
||||
|
||||
#include <tuple>
|
||||
|
||||
#include "../../experimental/builder/test/impl/conv_signature_types.hpp"
|
||||
#include "../../experimental/grouped_convolution_tile_instances/include/signatures.hpp"
|
||||
#include "ck_tile/builder/testing/conv/ck_tile.hpp"
|
||||
|
||||
namespace ck_tile::builder::profiling {

namespace ckb = ck_tile::builder;
namespace ckt = ck_tile::builder::test;

// Compile-time convolution problem descriptors used by the CK Tile profiler
// to select the matching set of generated kernel instances. Each signature
// fixes the spatial rank, direction, data types, and tensor layouts.

/////////////////////////////////////////
// FWD signatures (2D NHWGC and 3D NDHWGC)
//////////////////////////////////////////

constexpr auto SIGNATURE_NHWGC_FP32_FWD =
    ckt::ConvSignature{.spatial_dim = 2,
                       .direction = ckb::ConvDirection::FORWARD,
                       .data_type = ckb::DataType::FP32,
                       .accumulation_data_type = ckb::DataType::FP32,
                       .input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
                       .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
                       .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};

constexpr auto SIGNATURE_NHWGC_BF16_FWD =
    ckt::ConvSignature{.spatial_dim = 2,
                       .direction = ckb::ConvDirection::FORWARD,
                       .data_type = ckb::DataType::BF16,
                       .accumulation_data_type = ckb::DataType::FP32,
                       .input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
                       .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
                       .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};

constexpr auto SIGNATURE_NHWGC_FP16_FWD =
    ckt::ConvSignature{.spatial_dim = 2,
                       .direction = ckb::ConvDirection::FORWARD,
                       .data_type = ckb::DataType::FP16,
                       .accumulation_data_type = ckb::DataType::FP32,
                       .input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
                       .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
                       .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};

constexpr auto SIGNATURE_NDHWGC_FP32_FWD =
    ckt::ConvSignature{.spatial_dim = 3,
                       .direction = ckb::ConvDirection::FORWARD,
                       .data_type = ckb::DataType::FP32,
                       .accumulation_data_type = ckb::DataType::FP32,
                       .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
                       .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
                       .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};

constexpr auto SIGNATURE_NDHWGC_BF16_FWD =
    ckt::ConvSignature{.spatial_dim = 3,
                       .direction = ckb::ConvDirection::FORWARD,
                       .data_type = ckb::DataType::BF16,
                       .accumulation_data_type = ckb::DataType::FP32,
                       .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
                       .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
                       .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};

constexpr auto SIGNATURE_NDHWGC_FP16_FWD =
    ckt::ConvSignature{.spatial_dim = 3,
                       .direction = ckb::ConvDirection::FORWARD,
                       .data_type = ckb::DataType::FP16,
                       .accumulation_data_type = ckb::DataType::FP32,
                       .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
                       .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
                       .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};

/////////////////////////////////////////
// BWD WEIGHT signatures
//////////////////////////////////////////

constexpr auto SIGNATURE_NHWGC_BF16_BWD_WEIGHT =
    ckt::ConvSignature{.spatial_dim = 2,
                       .direction = ckb::ConvDirection::BACKWARD_WEIGHT,
                       .data_type = ckb::DataType::BF16,
                       .accumulation_data_type = ckb::DataType::FP32,
                       .input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
                       .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
                       .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};

constexpr auto SIGNATURE_NHWGC_FP16_BWD_WEIGHT =
    ckt::ConvSignature{.spatial_dim = 2,
                       .direction = ckb::ConvDirection::BACKWARD_WEIGHT,
                       .data_type = ckb::DataType::FP16,
                       .accumulation_data_type = ckb::DataType::FP32,
                       .input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
                       .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
                       .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};

constexpr auto SIGNATURE_NHWGC_FP32_BWD_WEIGHT =
    ckt::ConvSignature{.spatial_dim = 2,
                       .direction = ckb::ConvDirection::BACKWARD_WEIGHT,
                       .data_type = ckb::DataType::FP32,
                       .accumulation_data_type = ckb::DataType::FP32,
                       .input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
                       .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
                       .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};

constexpr auto SIGNATURE_NDHWGC_BF16_BWD_WEIGHT =
    ckt::ConvSignature{.spatial_dim = 3,
                       .direction = ckb::ConvDirection::BACKWARD_WEIGHT,
                       .data_type = ckb::DataType::BF16,
                       .accumulation_data_type = ckb::DataType::FP32,
                       .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
                       .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
                       .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};

constexpr auto SIGNATURE_NDHWGC_FP16_BWD_WEIGHT =
    ckt::ConvSignature{.spatial_dim = 3,
                       .direction = ckb::ConvDirection::BACKWARD_WEIGHT,
                       .data_type = ckb::DataType::FP16,
                       .accumulation_data_type = ckb::DataType::FP32,
                       .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
                       .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
                       .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};

constexpr auto SIGNATURE_NDHWGC_FP32_BWD_WEIGHT =
    ckt::ConvSignature{.spatial_dim = 3,
                       .direction = ckb::ConvDirection::BACKWARD_WEIGHT,
                       .data_type = ckb::DataType::FP32,
                       .accumulation_data_type = ckb::DataType::FP32,
                       .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
                       .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
                       .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};

} // namespace ck_tile::builder::profiling
|
||||
|
||||
@@ -4,14 +4,70 @@
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp"
|
||||
|
||||
namespace ck_tile::builder::profiling {
|
||||
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
|
||||
// Translate the profiler's split-K CLI argument into the list of k-batch
// values to benchmark. "all" expands to the full sweep (where -1 means
// "auto deduce"); any other token must parse as a single integer, otherwise
// the process exits with a diagnostic.
inline std::vector<int> get_split_k_values(const std::string& split_k)
{
    if(split_k == "all")
    {
        return {/*auto deduce value*/ -1, 1, 2, 4, 8, 16, 32, 64, 128};
    }

    try
    {
        return std::vector<int>{std::stoi(split_k)};
    }
    catch(const std::exception& e)
    {
        std::cerr << e.what() << '\n';
        exit(EXIT_FAILURE);
    }
}
|
||||
|
||||
template <auto SIGNATURE>
|
||||
auto parse_conv_args(int arg_idx, char* const argv[])
|
||||
inline std::tuple<double, double>
|
||||
get_rtol_atol(const int num_accums, const int k_batch, const float max_accumulated_value)
|
||||
{
|
||||
using DataType =
|
||||
std::conditional_t<SIGNATURE.data_type == ckb::DataType::FP32,
|
||||
float,
|
||||
std::conditional_t<SIGNATURE.data_type == ckb::DataType::FP16,
|
||||
ck_tile::half_t,
|
||||
ck_tile::bfloat16_t>>;
|
||||
using ComputeType = DataType;
|
||||
using AccDataType = float;
|
||||
|
||||
// Assign middle value of the range for auto deduce
|
||||
const int num_accums_split_k = k_batch > 0 ? k_batch : 64;
|
||||
auto rtol = ck_tile::get_relative_threshold<ComputeType, DataType, AccDataType>(
|
||||
num_accums / num_accums_split_k);
|
||||
auto atol = ck_tile::get_absolute_threshold<ComputeType, DataType, AccDataType>(
|
||||
max_accumulated_value / num_accums_split_k, num_accums / num_accums_split_k);
|
||||
// Calculate error due to split_k accumulation
|
||||
auto rtol_split_k =
|
||||
ck_tile::get_relative_threshold<DataType, DataType, DataType>(num_accums_split_k);
|
||||
auto atol_split_k = ck_tile::get_absolute_threshold<DataType, DataType, DataType>(
|
||||
max_accumulated_value, num_accums_split_k);
|
||||
// Use higher threshold
|
||||
rtol = std::max(rtol, rtol_split_k);
|
||||
atol = std::max(atol, atol_split_k);
|
||||
return std::make_tuple(rtol, atol);
|
||||
}
|
||||
|
||||
template <auto SIGNATURE>
|
||||
inline ckt::Args<SIGNATURE> parse_conv_args(int arg_idx, char* const argv[])
|
||||
{
|
||||
const std::size_t G = static_cast<size_t>(std::stol(argv[arg_idx++]));
|
||||
const std::size_t N = static_cast<size_t>(std::stol(argv[arg_idx++]));
|
||||
|
||||
@@ -46,6 +46,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
|
||||
if(CK_EXPERIMENTAL_BUILDER)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_fwd_tile.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight_tile.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_bwd_data_tile.cpp)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -275,6 +276,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
|
||||
if(CK_EXPERIMENTAL_BUILDER)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv_fwd_tile_instances)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv_bwd_weight_tile_instances)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv_bwd_data_tile_instances)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
218
profiler/src/profile_grouped_conv_bwd_data_tile.cpp
Normal file
218
profiler/src/profile_grouped_conv_bwd_data_tile.cpp
Normal file
@@ -0,0 +1,218 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
|
||||
#include "ck_tile/builder/testing/conv/ck_tile.hpp"
|
||||
#include "ck_tile/host/device_prop.hpp"
|
||||
#include "profiler/grouped_convolution_backward_data_tile_algs.hpp"
|
||||
#include "profiler/tile_profiler_utils.hpp"
|
||||
#include "profiler/profiler_arg_utils.hpp"
|
||||
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
// Tensor-layout combinations selectable via the profiler's arg3.
// The trailing comment on each enumerator is its command-line index.
enum struct ConvLayout
{
    GNHWC_GKYXC_GNHWK, // 0
    NHWGC_GKYXC_NHWGK, // 1
    NGCHW_GKYXC_NGKHW, // 2
    NGCHW_GKCYX_NGKHW, // 3
};
|
||||
|
||||
// Data-type combinations (output/weight/input[, compute]) selectable via the
// profiler's arg2. The trailing comment on each enumerator is its
// command-line index.
enum struct ConvDataType
{
    F32_F32_F32,      // 0
    F16_F16_F16,      // 1
    BF16_BF16_BF16,   // 2
    F32_F32_F32_TF32, // 3
};
|
||||
|
||||
// Profiler operation name/description; used both for registration with the
// profiler operation registry and in the help message.
#define OP_NAME "grouped_conv_bwd_data_tile"
#define OP_DESC "Grouped Convolution Backward Data (CK Tile)"
|
||||
|
||||
// Print CLI usage for this profiler op. The argument set mirrors old CK's
// `grouped_conv_bwd_data` op for backward compatibility.
static void print_helper_msg()
{
    std::cout
        // clang-format off
        << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
        << "arg2: data type (0: Output fp32, Weight fp32, Input fp32\n"
        << "                 1: Output fp16, Weight fp16, Input fp16\n"
        << "                 2: Output bf16, Weight bf16, Input bf16\n"
        << "                 3: Output fp32, Weight fp32, Input fp32, Compute tf32)\n"
        << "arg3: tensor layout (0: Output[G, N, Ho, Wo, C], Weight[G, K, Y, X, C], Input[G, N, Hi, Wi, K]\n"
        << "                     1: Output[N, Ho, Wo, G, C], Weight[G, K, Y, X, C], Input[N, Hi, Wi, G, K])\n"
        << "                     2: Output[N, G, C, Ho, Wo], Weight[G, K, Y, X, C], Input[N, G, K, Hi, Wi])\n"
        << "                     3: Output[N, G, C, Ho, Wo], Weight[G, K, C, Y, X], Input[N, G, K, Hi, Wi])\n"
        << "arg4: verification (0: no, 1: yes)\n"
        << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
        << "arg6: print tensor value (0: no; 1: yes)\n"
        << "arg7: time kernel (0: no, 1: yes)\n"
        << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl
        << "Last argument: split-K (0: internally computed split-K value; 1, 2, 4, 8, 16, 32, 64, 128: set k batches explicitly)\n"
        << "\nOptional arguments:\n"
        << "  --instance <id>      Run only the specified instance (0-indexed among valid instances)\n";
    // clang-format on
}
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
namespace ckp = ck_tile::builder::profiling;
|
||||
|
||||
// Allocate and initialize device tensors for `args`, run every generated
// bwd-data tile instance through run_grouped_conv_backward_data_tile_algs(),
// and report the best configuration found.
//
// Returns 0 when all executed instances produced valid results, non-zero
// otherwise (suitable as a process exit code).
template <auto SIGNATURE>
int call_profiler(const ckt::Args<SIGNATURE>& args,
                  const std::string& split_k,
                  bool time_kernel,
                  ck_tile::index_t instance_index)
{
    auto inputs = ckt::alloc_inputs(args);
    auto outputs = ckt::alloc_outputs(args);
    ckt::init_inputs(args, inputs.get());

    // Echo the problem description before running.
    std::cout << args.make_input_descriptor() << std::endl;
    std::cout << args.make_weight_descriptor() << std::endl;
    std::cout << args.make_output_descriptor() << std::endl;
    auto&& [valid, avg_time, op_name, best_split_k, best_instance_index] =
        ckp::run_grouped_conv_backward_data_tile_algs(
            args,
            split_k,
            instance_index,
            inputs.get(),
            outputs.get(),
            ck_tile::stream_config{nullptr,
                                   time_kernel,
                                   0 /*log_level*/,
                                   5 /*cold_iters*/,
                                   50 /*nrepeat_*/,
                                   true /*is_gpu_timer_*/});
    if(time_kernel)
    {
        std::cout << "\nBest configuration parameters:" << "\n\tname: " << op_name << " (instance "
                  << best_instance_index << ")" << "\n\tavg_time: " << avg_time << ", SplitK "
                  << best_split_k << std::endl;
    }
    // Exit convention: 0 on success, 1 when any instance failed validation.
    return !valid;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Entry point for the `grouped_conv_bwd_data_tile` profiler op. Parses the
// command line (same positional layout as old CK's `grouped_conv_bwd_data`)
// and dispatches to the signature matching the requested layout, data type,
// and spatial rank. Returns 0 on success, 1 on bad arguments, unsupported
// combinations, or validation failure.
int profile_grouped_conv_bwd_data_tile(int argc, char* argv[])
{
    // Parse optional named arguments first
    ck_tile::index_t instance_index = -1;
    bool dummy;
    ck::profiler::parse_named_args(argc, argv, instance_index, dummy);
    const int named_arg_count = ck::profiler::count_named_args(argc, argv);

    // Adjust argc for positional argument checking
    // NOTE(review): the positional indices below (argv[2], argv[7], ...)
    // assume named arguments appear after all positionals — confirm.
    const int positional_argc = argc - named_arg_count;

    // 8 for control, 1 for num_dim_spatial
    if(positional_argc < 9)
    {
        print_helper_msg();
        return 1;
    }

    // args 4-6 (verification/initialization/print) are accepted for API
    // compatibility but not read here.
    const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
    const auto layout = static_cast<ConvLayout>(std::stoi(argv[3]));
    const bool time_kernel = std::stoi(argv[7]);
    const int num_dim_spatial = std::stoi(argv[8]);

    // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial, 1 for split-K
    if(positional_argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1)
    {
        print_helper_msg();
        return 1;
    }

    // Conv sizes (G/N/K/C and the per-dimension filter/image/stride/dilation/
    // padding values) start right after the num_dim_spatial argument.
    constexpr ck_tile::index_t conv_params_start_idx = 9;
    const auto params =
        ck::utils::conv::parse_conv_param(num_dim_spatial, conv_params_start_idx, argv);
    std::cout << params << std::endl;

    // Split-K is the last positional argument.
    auto split_k = std::string(argv[8 + 1 + 4 + 6 * num_dim_spatial]);

    // The bwd data profiler in old CK uses -1 to loop over all split-K values.
    // We want to have the same API for backward compatibility, but we need to convert it to "all"
    // for the new API.
    if(split_k == "-1")
    {
        split_k = "all";
    }

    // Dispatch to the compile-time signature; only NHWGC/NDHWGC layouts with
    // fp32/fp16/bf16 have generated bwd-data tile instances.
    if(layout == ConvLayout::NHWGC_GKYXC_NHWGK)
    {
        if(num_dim_spatial == 2)
        {
            if(data_type == ConvDataType::F16_F16_F16)
            {
                constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_FP16_BWD_DATA;
                return call_profiler<SIGNATURE>(
                    ckp::parse_conv_args<SIGNATURE>(conv_params_start_idx, argv),
                    split_k,
                    time_kernel,
                    instance_index);
            }
            else if(data_type == ConvDataType::BF16_BF16_BF16)
            {
                constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_BF16_BWD_DATA;
                return call_profiler<SIGNATURE>(
                    ckp::parse_conv_args<SIGNATURE>(conv_params_start_idx, argv),
                    split_k,
                    time_kernel,
                    instance_index);
            }
            else if(data_type == ConvDataType::F32_F32_F32)
            {
                constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_FP32_BWD_DATA;
                return call_profiler<SIGNATURE>(
                    ckp::parse_conv_args<SIGNATURE>(conv_params_start_idx, argv),
                    split_k,
                    time_kernel,
                    instance_index);
            }
        }
        else if(num_dim_spatial == 3)
        {
            if(data_type == ConvDataType::F16_F16_F16)
            {
                constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_FP16_BWD_DATA;
                return call_profiler<SIGNATURE>(
                    ckp::parse_conv_args<SIGNATURE>(conv_params_start_idx, argv),
                    split_k,
                    time_kernel,
                    instance_index);
            }
            else if(data_type == ConvDataType::BF16_BF16_BF16)
            {
                constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_BF16_BWD_DATA;
                return call_profiler<SIGNATURE>(
                    ckp::parse_conv_args<SIGNATURE>(conv_params_start_idx, argv),
                    split_k,
                    time_kernel,
                    instance_index);
            }
            else if(data_type == ConvDataType::F32_F32_F32)
            {
                constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_FP32_BWD_DATA;
                return call_profiler<SIGNATURE>(
                    ckp::parse_conv_args<SIGNATURE>(conv_params_start_idx, argv),
                    split_k,
                    time_kernel,
                    instance_index);
            }
        }
    }

    std::cout << "this data_type & layout is not implemented" << std::endl;

    return 1;
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_conv_bwd_data_tile);
|
||||
Reference in New Issue
Block a user