mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-04 05:31:24 +00:00
* [CK_BUILDER] Integrate GPU reference as ConvAlgorithm Add GPU reference as a ConvAlgorithm specialization, enabling: - Unified Builder API for reference and optimized kernels - Future ckProfiler integration for validation - First step toward numerical validation in Builder tests Changes: - Add ConvAlgorithmSpecialization::REFERENCE enum - Add ConvAlgorithm_Reference struct - Add IsReferenceAlgorithm concept - Create 3 reference factories (Forward, BwdData, BwdWeight) - Wire into conv_dispatcher - Add proof-of-concept test (passing) Test result: Can instantiate reference through Builder API * Add GPU reference execution tests - Reference kernel executes through Builder (459ms) - Both reference and optimized can instantiate - Tests passing Next: Implement utilities for comparison * Optimized Builder kernel execution works - MakeArgument pattern implemented - Builder-generated kernel executes successfully - Tests passing (451ms execution) Next: Add comparison * VALIDATION COMPLETE: Builder == Reference Builder-generated kernel output matches GPU reference! Test: Validate_Optimized_vs_Reference_Forward_2D_FP16 Result: PASS ✓ This proves CK Builder generates correct code! 
* Update to new Builder API All tests passing * Rename test file for clarity test_builder_kernel_execution -> test_builder_kernel_validation * Add all 3 directions support - Forward, Backward Data, Backward Weight - All reference factories working - Dispatcher wired for all directions - 9 tests passing Tests: - test_reference_execution: 3 tests (all directions) - test_optimized_execution: 3 tests (all directions) - test_builder_kernel_validation: 3 tests (fwd validated, bwd placeholders) * Add backward direction support - Backward data and weight dispatcher wiring - Fix factories for new API - All 3 directions tested - 9 tests passing * Refactor: Change IsReferenceAlgorithm from concept to consteval function Address review feedback: Use consteval function in dispatcher instead of concept, matching the pattern for other algorithms (Tile, XDL, WMMA, DL). - Remove IsReferenceAlgorithm concept from conv_algorithm_concepts.hpp - Add IsReferenceAlgorithm() consteval function to conv_dispatcher.hpp - Update dispatcher to use function call: IsReferenceAlgorithm<T>() - Remove redundant algorithm checks from reference factory requires clauses All tests passing (9/9). * Move Tile algorithm check outside direction block to support all directions * Implement MakeInvokerPointer interface and add random input validation - Implement full Argument/Invoker structs for old CK interface (not just nullptr) - Refactor with reference_common.hpp to reduce code duplication - Add random input validation tests: Builder vs direct GPU reference (all directions) - Fix layout: GNHWC -> NHWGC to match reference kernel expectations - All 12 tests pass with IDENTICAL results on random input * Move ConvAlgorithm_Reference to test/impl/conv_algorithm_types.hpp Keep types.hpp for data types only (enums), move algorithm descriptors to conv_algorithm_types.hpp as suggested by review. 
* Add static_assert to ensure reference factories only accept PassThrough operations Reference implementation doesn't support fused elementwise operations. Add compile-time validation to fail early with clear error message if non-PassThrough operations are specified on input, weight, or output. * Add InstanceTraits support for reference kernels - Store SIGNATURE/ALGORITHM/VERSION in Instance for reflection - Create shared ReferenceCommonTraits base for common properties - Add 3 direction-specific InstanceTraits specializations in one file - Include data type and layouts in instance_string output * Remove optimized kernel validation tests from reference-only branch * Use existing layout helper and organize reference tests Use LayoutToCK from conv_tensor_layout.hpp and move reference InstanceTraits test to validation folder. * Merge develop branch Fix DataType switch for new mixed precision types. * Fix comment spacing for CI * Convert IsReferenceAlgorithm from function to concept * Add reference tests to CI smoke tests * Consolidate 3 reference factories into single unified factory --------- Co-authored-by: Ville Pietilä <188998872+vpietila-amd@users.noreply.github.com>
1032 lines
41 KiB
C++
1032 lines
41 KiB
C++
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#include "ck_tile/builder/conv_builder.hpp"
#include "ck_tile/builder/types.hpp"
#include "impl/conv_algorithm_types.hpp"
#include "ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp"
#include "ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp"
#include "ck_tile/ref/naive_grouped_conv_bwd_weight_gpu.hpp"
#include "utils/ckb_conv_test_configs.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include <gtest/gtest.h>
#include <cstdlib>
#include <iostream>
#include <vector>
|
|
|
|
namespace {
|
|
|
|
using namespace ck_tile::builder;
|
|
using namespace ck_tile::builder::test; // For ConvAlgorithm_Reference
|
|
using namespace ck_tile::builder::test_utils;
|
|
|
|
TEST(ReferenceExecution, Forward_2D_FP16)
{
    // Smoke test: instantiate the forward reference kernel through ConvBuilder and launch it
    // once via Run() on zero-filled device buffers. No output values are inspected.
    //
    // The tensors leave .operation unset; per the original note this defaults to PassThrough,
    // which is the only elementwise operation the reference implementation supports.
    // NOTE(review): other tests in this file build the signature with NHWGC/NHWGK layouts;
    // confirm GNHWC/GNHWK is intentional here.
    using Half = ck::half_t;
    using Dims = std::vector<ck_tile::long_index_t>;

    constexpr ConvSignature signature{.spatial_dim            = 2,
                                      .direction              = ConvDirection::FORWARD,
                                      .data_type              = DataType::FP16,
                                      .accumulation_data_type = DataType::FP32,
                                      .input  = {.config = {.layout = TensorLayout::GNHWC}},
                                      .weight = {.config = {.layout = TensorLayout::GKYXC}},
                                      .output = {.config = {.layout = TensorLayout::GNHWK}}};
    constexpr auto algorithm = ConvAlgorithm_Reference{};
    using Kernel             = ConvBuilder<signature, algorithm>::Instance;

    // Deliberately tiny problem: the goal is only to exercise the API.
    const int G = 1;
    const int N = 2;
    const int C = 4;
    const int K = 4;
    const int H = 3;
    const int W = 3;
    const int Y = 3; // filter height
    const int X = 3; // filter width

    const size_t in_count  = G * N * C * H * W;
    const size_t wei_count = G * K * C * Y * X;
    const size_t out_count = G * N * K * H * W;

    ck::DeviceMem in_dev(in_count * sizeof(Half));
    ck::DeviceMem wei_dev(wei_count * sizeof(Half));
    ck::DeviceMem out_dev(out_count * sizeof(Half));
    in_dev.SetZero();
    wei_dev.SetZero();
    out_dev.SetZero();

    // 3x3 filter, unit stride/dilation and padding of 1 keep the output at H x W.
    Dims input_spatial{H, W};
    Dims filter_spatial{Y, X};
    Dims output_spatial{H, W};
    Dims strides{1, 1};
    Dims dilations{1, 1};
    Dims left_pads{1, 1};

    const auto* in_ptr  = static_cast<const Half*>(in_dev.GetDeviceBuffer());
    const auto* wei_ptr = static_cast<const Half*>(wei_dev.GetDeviceBuffer());
    auto* out_ptr       = static_cast<Half*>(out_dev.GetDeviceBuffer());

    Kernel kernel;
    kernel.Run(in_ptr,
               wei_ptr,
               out_ptr,
               G,
               N,
               K,
               C,
               input_spatial,
               filter_spatial,
               output_spatial,
               strides,
               dilations,
               left_pads);

    // Reaching this point means Run() returned.
    std::cout << "✓ Reference Forward kernel executed!" << std::endl;
    SUCCEED();
}
|
|
|
|
TEST(ReferenceExecution, BackwardData_2D_FP16)
{
    // Smoke test: instantiate the backward-data reference kernel through ConvBuilder and
    // launch it once via Run() on zero-filled device buffers. No values are inspected.
    //
    // The tensors leave .operation unset; per the original note this defaults to PassThrough,
    // which is the only elementwise operation the reference implementation supports.
    // NOTE(review): other tests in this file build the signature with NHWGC/NHWGK layouts;
    // confirm GNHWC/GNHWK is intentional here.
    using Half = ck::half_t;
    using Dims = std::vector<ck_tile::long_index_t>;

    constexpr ConvSignature signature{.spatial_dim            = 2,
                                      .direction              = ConvDirection::BACKWARD_DATA,
                                      .data_type              = DataType::FP16,
                                      .accumulation_data_type = DataType::FP32,
                                      .input  = {.config = {.layout = TensorLayout::GNHWC}},
                                      .weight = {.config = {.layout = TensorLayout::GKYXC}},
                                      .output = {.config = {.layout = TensorLayout::GNHWK}}};
    constexpr auto algorithm = ConvAlgorithm_Reference{};
    using Kernel             = ConvBuilder<signature, algorithm>::Instance;

    const int G = 1;
    const int N = 2;
    const int C = 4;
    const int K = 4;
    const int H = 3;
    const int W = 3;
    const int Y = 3; // filter height
    const int X = 3; // filter width

    const size_t in_grad_count  = G * N * C * H * W;
    const size_t wei_count      = G * K * C * Y * X;
    const size_t out_grad_count = G * N * K * H * W;

    ck::DeviceMem in_grad_dev(in_grad_count * sizeof(Half));
    ck::DeviceMem wei_dev(wei_count * sizeof(Half));
    ck::DeviceMem out_grad_dev(out_grad_count * sizeof(Half));
    in_grad_dev.SetZero();
    wei_dev.SetZero();
    out_grad_dev.SetZero();

    Dims input_spatial{H, W};
    Dims filter_spatial{Y, X};
    Dims output_spatial{H, W};
    Dims strides{1, 1};
    Dims dilations{1, 1};
    Dims left_pads{1, 1};

    // The input gradient is the only buffer passed as mutable for this direction.
    auto* in_grad_ptr        = static_cast<Half*>(in_grad_dev.GetDeviceBuffer());
    const auto* wei_ptr      = static_cast<const Half*>(wei_dev.GetDeviceBuffer());
    const auto* out_grad_ptr = static_cast<const Half*>(out_grad_dev.GetDeviceBuffer());

    Kernel kernel;
    kernel.Run(in_grad_ptr,
               wei_ptr,
               out_grad_ptr,
               G,
               N,
               K,
               C,
               input_spatial,
               filter_spatial,
               output_spatial,
               strides,
               dilations,
               left_pads);

    std::cout << "✓ Reference Backward Data kernel executed!" << std::endl;
    SUCCEED();
}
|
|
|
|
TEST(ReferenceExecution, BackwardWeight_2D_FP16)
{
    // Smoke test: instantiate the backward-weight reference kernel through ConvBuilder and
    // launch it once via Run() on zero-filled device buffers. No values are inspected.
    //
    // The tensors leave .operation unset; per the original note this defaults to PassThrough,
    // which is the only elementwise operation the reference implementation supports.
    // NOTE(review): other tests in this file build the signature with NHWGC/NHWGK layouts;
    // confirm GNHWC/GNHWK is intentional here.
    using Half = ck::half_t;
    using Dims = std::vector<ck_tile::long_index_t>;

    constexpr ConvSignature signature{.spatial_dim            = 2,
                                      .direction              = ConvDirection::BACKWARD_WEIGHT,
                                      .data_type              = DataType::FP16,
                                      .accumulation_data_type = DataType::FP32,
                                      .input  = {.config = {.layout = TensorLayout::GNHWC}},
                                      .weight = {.config = {.layout = TensorLayout::GKYXC}},
                                      .output = {.config = {.layout = TensorLayout::GNHWK}}};
    constexpr auto algorithm = ConvAlgorithm_Reference{};
    using Kernel             = ConvBuilder<signature, algorithm>::Instance;

    const int G = 1;
    const int N = 2;
    const int C = 4;
    const int K = 4;
    const int H = 3;
    const int W = 3;
    const int Y = 3; // filter height
    const int X = 3; // filter width

    const size_t in_count       = G * N * C * H * W;
    const size_t wei_grad_count = G * K * C * Y * X;
    const size_t out_grad_count = G * N * K * H * W;

    ck::DeviceMem in_dev(in_count * sizeof(Half));
    ck::DeviceMem wei_grad_dev(wei_grad_count * sizeof(Half));
    ck::DeviceMem out_grad_dev(out_grad_count * sizeof(Half));
    in_dev.SetZero();
    wei_grad_dev.SetZero();
    out_grad_dev.SetZero();

    Dims input_spatial{H, W};
    Dims filter_spatial{Y, X};
    Dims output_spatial{H, W};
    Dims strides{1, 1};
    Dims dilations{1, 1};
    Dims left_pads{1, 1};

    // The weight gradient is the only buffer passed as mutable for this direction.
    const auto* in_ptr       = static_cast<const Half*>(in_dev.GetDeviceBuffer());
    auto* wei_grad_ptr       = static_cast<Half*>(wei_grad_dev.GetDeviceBuffer());
    const auto* out_grad_ptr = static_cast<const Half*>(out_grad_dev.GetDeviceBuffer());

    Kernel kernel;
    kernel.Run(in_ptr,
               wei_grad_ptr,
               out_grad_ptr,
               G,
               N,
               K,
               C,
               input_spatial,
               filter_spatial,
               output_spatial,
               strides,
               dilations,
               left_pads);

    std::cout << "✓ Reference Backward Weight kernel executed!" << std::endl;
    SUCCEED();
}
|
|
|
|
// Test the old CK interface: MakeArgumentPointer + MakeInvokerPointer
|
|
TEST(ReferenceExecution, BackwardData_2D_FP16_InvokerInterface)
|
|
{
|
|
constexpr ConvSignature sig{.spatial_dim = 2,
|
|
.direction = ConvDirection::BACKWARD_DATA,
|
|
.data_type = DataType::FP16,
|
|
.accumulation_data_type = DataType::FP32,
|
|
.input = {.config = {.layout = TensorLayout::NHWGC}},
|
|
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
|
.output = {.config = {.layout = TensorLayout::NHWGK}}};
|
|
|
|
constexpr auto ref_alg = ConvAlgorithm_Reference{};
|
|
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
|
|
|
|
const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3;
|
|
|
|
const size_t in_grad_size = G * N * C * H * W * sizeof(ck::half_t);
|
|
const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
|
|
const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t);
|
|
|
|
ck::DeviceMem in_grad_dev(in_grad_size);
|
|
ck::DeviceMem wei_dev(wei_size);
|
|
ck::DeviceMem out_grad_dev(out_grad_size);
|
|
|
|
in_grad_dev.SetZero();
|
|
wei_dev.SetZero();
|
|
out_grad_dev.SetZero();
|
|
|
|
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
|
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> strides{1, 1};
|
|
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
|
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
|
|
|
RefKernel ref_kernel;
|
|
|
|
// TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer)
|
|
auto argument_ptr = ref_kernel.MakeArgumentPointer(
|
|
reinterpret_cast<ck::half_t*>(in_grad_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
|
G,
|
|
N,
|
|
K,
|
|
C,
|
|
input_spatial,
|
|
filter_spatial,
|
|
output_spatial,
|
|
strides,
|
|
dilations,
|
|
left_pads);
|
|
|
|
auto invoker_ptr = ref_kernel.MakeInvokerPointer();
|
|
|
|
// Run using invoker
|
|
float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
|
|
|
std::cout << "✓ Reference Backward Data kernel executed via Invoker interface!" << std::endl;
|
|
std::cout << " (time = " << time << " ms)" << std::endl;
|
|
EXPECT_TRUE(true);
|
|
}
|
|
|
|
// Test the old CK interface for Forward convolution
|
|
TEST(ReferenceExecution, Forward_2D_FP16_InvokerInterface)
|
|
{
|
|
constexpr ConvSignature sig{.spatial_dim = 2,
|
|
.direction = ConvDirection::FORWARD,
|
|
.data_type = DataType::FP16,
|
|
.accumulation_data_type = DataType::FP32,
|
|
.input = {.config = {.layout = TensorLayout::GNHWC}},
|
|
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
|
.output = {.config = {.layout = TensorLayout::GNHWK}}};
|
|
|
|
constexpr auto ref_alg = ConvAlgorithm_Reference{};
|
|
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
|
|
|
|
const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3;
|
|
|
|
const size_t in_size = G * N * C * H * W * sizeof(ck::half_t);
|
|
const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
|
|
const size_t out_size = G * N * K * H * W * sizeof(ck::half_t);
|
|
|
|
ck::DeviceMem in_dev(in_size);
|
|
ck::DeviceMem wei_dev(wei_size);
|
|
ck::DeviceMem out_dev(out_size);
|
|
|
|
in_dev.SetZero();
|
|
wei_dev.SetZero();
|
|
out_dev.SetZero();
|
|
|
|
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
|
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> strides{1, 1};
|
|
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
|
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
|
|
|
RefKernel ref_kernel;
|
|
|
|
// TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer)
|
|
auto argument_ptr = ref_kernel.MakeArgumentPointer(
|
|
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<ck::half_t*>(out_dev.GetDeviceBuffer()),
|
|
G,
|
|
N,
|
|
K,
|
|
C,
|
|
input_spatial,
|
|
filter_spatial,
|
|
output_spatial,
|
|
strides,
|
|
dilations,
|
|
left_pads);
|
|
|
|
auto invoker_ptr = ref_kernel.MakeInvokerPointer();
|
|
|
|
// Run using invoker
|
|
float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
|
|
|
std::cout << "✓ Reference Forward kernel executed via Invoker interface!" << std::endl;
|
|
std::cout << " (time = " << time << " ms)" << std::endl;
|
|
EXPECT_TRUE(true);
|
|
}
|
|
|
|
// Test the old CK interface for Backward Weight convolution
|
|
TEST(ReferenceExecution, BackwardWeight_2D_FP16_InvokerInterface)
|
|
{
|
|
constexpr ConvSignature sig{.spatial_dim = 2,
|
|
.direction = ConvDirection::BACKWARD_WEIGHT,
|
|
.data_type = DataType::FP16,
|
|
.accumulation_data_type = DataType::FP32,
|
|
.input = {.config = {.layout = TensorLayout::GNHWC}},
|
|
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
|
.output = {.config = {.layout = TensorLayout::GNHWK}}};
|
|
|
|
constexpr auto ref_alg = ConvAlgorithm_Reference{};
|
|
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
|
|
|
|
const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3;
|
|
|
|
const size_t in_size = G * N * C * H * W * sizeof(ck::half_t);
|
|
const size_t wei_grad_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
|
|
const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t);
|
|
|
|
ck::DeviceMem in_dev(in_size);
|
|
ck::DeviceMem wei_grad_dev(wei_grad_size);
|
|
ck::DeviceMem out_grad_dev(out_grad_size);
|
|
|
|
in_dev.SetZero();
|
|
wei_grad_dev.SetZero();
|
|
out_grad_dev.SetZero();
|
|
|
|
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
|
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> strides{1, 1};
|
|
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
|
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
|
|
|
RefKernel ref_kernel;
|
|
|
|
// TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer)
|
|
auto argument_ptr = ref_kernel.MakeArgumentPointer(
|
|
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<ck::half_t*>(wei_grad_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
|
G,
|
|
N,
|
|
K,
|
|
C,
|
|
input_spatial,
|
|
filter_spatial,
|
|
output_spatial,
|
|
strides,
|
|
dilations,
|
|
left_pads);
|
|
|
|
auto invoker_ptr = ref_kernel.MakeInvokerPointer();
|
|
|
|
// Run using invoker
|
|
float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
|
|
|
std::cout << "✓ Reference Backward Weight kernel executed via Invoker interface!" << std::endl;
|
|
std::cout << " (time = " << time << " ms)" << std::endl;
|
|
EXPECT_TRUE(true);
|
|
}
|
|
|
|
// Test Builder Reference vs Direct GPU Reference with RANDOM INPUT
|
|
TEST(ReferenceExecution, Forward_2D_FP16_Builder_vs_DirectGPUReference_Random)
|
|
{
|
|
constexpr ConvSignature sig{.spatial_dim = 2,
|
|
.direction = ConvDirection::FORWARD,
|
|
.data_type = DataType::FP16,
|
|
.accumulation_data_type = DataType::FP32,
|
|
.input = {.config = {.layout = TensorLayout::NHWGC}},
|
|
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
|
.output = {.config = {.layout = TensorLayout::NHWGK}}};
|
|
|
|
constexpr auto ref_alg = ConvAlgorithm_Reference{};
|
|
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
|
|
|
|
const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14; // Small for fast testing
|
|
|
|
const size_t in_size = G * N * C * H * W * sizeof(ck::half_t);
|
|
const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
|
|
const size_t out_size = G * N * K * H * W * sizeof(ck::half_t);
|
|
|
|
// Create host buffers with random data
|
|
const size_t in_elements = G * N * C * H * W;
|
|
const size_t wei_elements = G * K * C * 3 * 3;
|
|
const size_t out_elements = G * N * K * H * W;
|
|
|
|
std::vector<ck::half_t> in_host(in_elements);
|
|
std::vector<ck::half_t> wei_host(wei_elements);
|
|
|
|
// Fill with random values
|
|
std::srand(12345); // Fixed seed for reproducibility
|
|
for(size_t i = 0; i < in_elements; i++)
|
|
{
|
|
in_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
|
}
|
|
for(size_t i = 0; i < wei_elements; i++)
|
|
{
|
|
wei_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
|
}
|
|
|
|
// Allocate GPU memory
|
|
ck::DeviceMem in_dev(in_size);
|
|
ck::DeviceMem wei_dev(wei_size);
|
|
ck::DeviceMem out_builder_dev(out_size);
|
|
ck::DeviceMem out_naive_dev(out_size);
|
|
|
|
// Transfer random data to GPU
|
|
in_dev.ToDevice(in_host.data());
|
|
wei_dev.ToDevice(wei_host.data());
|
|
out_builder_dev.SetZero();
|
|
out_naive_dev.SetZero();
|
|
|
|
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
|
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> strides{1, 1};
|
|
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
|
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
|
|
|
RefKernel builder_kernel;
|
|
|
|
// Run 1: Builder Reference Factory
|
|
builder_kernel.Run(reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<ck::half_t*>(out_builder_dev.GetDeviceBuffer()),
|
|
G,
|
|
N,
|
|
K,
|
|
C,
|
|
input_spatial,
|
|
filter_spatial,
|
|
output_spatial,
|
|
strides,
|
|
dilations,
|
|
left_pads);
|
|
|
|
// Run 2: Direct GPU Reference (same kernel the Builder calls internally!)
|
|
ck_tile::naive_grouped_conv_fwd<2, ck::half_t, ck::half_t, ck::half_t>(
|
|
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<ck::half_t*>(out_naive_dev.GetDeviceBuffer()),
|
|
G,
|
|
N,
|
|
K,
|
|
C,
|
|
input_spatial,
|
|
filter_spatial,
|
|
output_spatial,
|
|
strides,
|
|
dilations,
|
|
left_pads);
|
|
|
|
// Copy results back
|
|
std::vector<ck::half_t> out_builder_result(out_elements);
|
|
std::vector<ck::half_t> out_naive_result(out_elements);
|
|
out_builder_dev.FromDevice(out_builder_result.data());
|
|
out_naive_dev.FromDevice(out_naive_result.data());
|
|
|
|
// Compare - should be IDENTICAL (both call same kernel)
|
|
bool pass = ck::utils::check_err(out_builder_result,
|
|
out_naive_result,
|
|
"Error: Builder Reference != Direct GPU Reference",
|
|
1e-6,
|
|
1e-6); // Very tight tolerance!
|
|
|
|
std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT)!" << std::endl;
|
|
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
|
|
std::cout << " This validates Builder Reference Factory is correct!" << std::endl;
|
|
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
// Test Builder Reference vs Direct GPU Reference with RANDOM INPUT - Backward Data
|
|
TEST(ReferenceExecution, BackwardData_2D_FP16_Builder_vs_DirectGPUReference_Random)
|
|
{
|
|
constexpr ConvSignature sig{.spatial_dim = 2,
|
|
.direction = ConvDirection::BACKWARD_DATA,
|
|
.data_type = DataType::FP16,
|
|
.accumulation_data_type = DataType::FP32,
|
|
.input = {.config = {.layout = TensorLayout::NHWGC}},
|
|
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
|
.output = {.config = {.layout = TensorLayout::NHWGK}}};
|
|
|
|
constexpr auto ref_alg = ConvAlgorithm_Reference{};
|
|
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
|
|
|
|
const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14;
|
|
|
|
const size_t in_grad_size = G * N * C * H * W * sizeof(ck::half_t);
|
|
const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
|
|
const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t);
|
|
|
|
const size_t in_grad_elements = G * N * C * H * W;
|
|
const size_t wei_elements = G * K * C * 3 * 3;
|
|
const size_t out_grad_elements = G * N * K * H * W;
|
|
|
|
std::vector<ck::half_t> wei_host(wei_elements);
|
|
std::vector<ck::half_t> out_grad_host(out_grad_elements);
|
|
|
|
// Fill with random values
|
|
std::srand(12346);
|
|
for(size_t i = 0; i < wei_elements; i++)
|
|
{
|
|
wei_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
|
}
|
|
for(size_t i = 0; i < out_grad_elements; i++)
|
|
{
|
|
out_grad_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
|
}
|
|
|
|
ck::DeviceMem in_grad_builder_dev(in_grad_size);
|
|
ck::DeviceMem in_grad_naive_dev(in_grad_size);
|
|
ck::DeviceMem wei_dev(wei_size);
|
|
ck::DeviceMem out_grad_dev(out_grad_size);
|
|
|
|
wei_dev.ToDevice(wei_host.data());
|
|
out_grad_dev.ToDevice(out_grad_host.data());
|
|
in_grad_builder_dev.SetZero();
|
|
in_grad_naive_dev.SetZero();
|
|
|
|
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
|
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> strides{1, 1};
|
|
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
|
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
|
|
|
RefKernel builder_kernel;
|
|
|
|
// Run 1: Builder Reference Factory
|
|
builder_kernel.Run(reinterpret_cast<ck::half_t*>(in_grad_builder_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
|
G,
|
|
N,
|
|
K,
|
|
C,
|
|
input_spatial,
|
|
filter_spatial,
|
|
output_spatial,
|
|
strides,
|
|
dilations,
|
|
left_pads);
|
|
|
|
// Run 2: Direct GPU Reference
|
|
ck_tile::naive_grouped_conv_bwd_data<2, ck::half_t, ck::half_t, ck::half_t>(
|
|
reinterpret_cast<ck::half_t*>(in_grad_naive_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
|
G,
|
|
N,
|
|
K,
|
|
C,
|
|
input_spatial,
|
|
filter_spatial,
|
|
output_spatial,
|
|
strides,
|
|
dilations,
|
|
left_pads);
|
|
|
|
// Compare
|
|
std::vector<ck::half_t> in_grad_builder_result(in_grad_elements);
|
|
std::vector<ck::half_t> in_grad_naive_result(in_grad_elements);
|
|
in_grad_builder_dev.FromDevice(in_grad_builder_result.data());
|
|
in_grad_naive_dev.FromDevice(in_grad_naive_result.data());
|
|
|
|
bool pass = ck::utils::check_err(in_grad_builder_result,
|
|
in_grad_naive_result,
|
|
"Error: Builder Backward Data != Direct GPU Reference",
|
|
1e-6,
|
|
1e-6);
|
|
|
|
std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT - Backward Data)!"
|
|
<< std::endl;
|
|
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
// Test Builder Reference vs Direct GPU Reference with RANDOM INPUT - Backward Weight
|
|
TEST(ReferenceExecution, BackwardWeight_2D_FP16_Builder_vs_DirectGPUReference_Random)
|
|
{
|
|
constexpr ConvSignature sig{.spatial_dim = 2,
|
|
.direction = ConvDirection::BACKWARD_WEIGHT,
|
|
.data_type = DataType::FP16,
|
|
.accumulation_data_type = DataType::FP32,
|
|
.input = {.config = {.layout = TensorLayout::NHWGC}},
|
|
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
|
.output = {.config = {.layout = TensorLayout::NHWGK}}};
|
|
|
|
constexpr auto ref_alg = ConvAlgorithm_Reference{};
|
|
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
|
|
|
|
const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14;
|
|
|
|
const size_t in_size = G * N * C * H * W * sizeof(ck::half_t);
|
|
const size_t wei_grad_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
|
|
const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t);
|
|
|
|
const size_t in_elements = G * N * C * H * W;
|
|
const size_t wei_grad_elements = G * K * C * 3 * 3;
|
|
const size_t out_grad_elements = G * N * K * H * W;
|
|
|
|
std::vector<ck::half_t> in_host(in_elements);
|
|
std::vector<ck::half_t> out_grad_host(out_grad_elements);
|
|
|
|
// Fill with random values
|
|
std::srand(12347);
|
|
for(size_t i = 0; i < in_elements; i++)
|
|
{
|
|
in_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
|
}
|
|
for(size_t i = 0; i < out_grad_elements; i++)
|
|
{
|
|
out_grad_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
|
}
|
|
|
|
ck::DeviceMem in_dev(in_size);
|
|
ck::DeviceMem wei_grad_builder_dev(wei_grad_size);
|
|
ck::DeviceMem wei_grad_naive_dev(wei_grad_size);
|
|
ck::DeviceMem out_grad_dev(out_grad_size);
|
|
|
|
in_dev.ToDevice(in_host.data());
|
|
out_grad_dev.ToDevice(out_grad_host.data());
|
|
wei_grad_builder_dev.SetZero();
|
|
wei_grad_naive_dev.SetZero();
|
|
|
|
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
|
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> strides{1, 1};
|
|
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
|
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
|
|
|
RefKernel builder_kernel;
|
|
|
|
// Run 1: Builder Reference Factory
|
|
builder_kernel.Run(reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<ck::half_t*>(wei_grad_builder_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
|
G,
|
|
N,
|
|
K,
|
|
C,
|
|
input_spatial,
|
|
filter_spatial,
|
|
output_spatial,
|
|
strides,
|
|
dilations,
|
|
left_pads);
|
|
|
|
// Run 2: Direct GPU Reference
|
|
ck_tile::naive_grouped_conv_bwd_weight<2, ck::half_t, ck::half_t, ck::half_t>(
|
|
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<ck::half_t*>(wei_grad_naive_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
|
G,
|
|
N,
|
|
K,
|
|
C,
|
|
input_spatial,
|
|
filter_spatial,
|
|
output_spatial,
|
|
strides,
|
|
dilations,
|
|
left_pads);
|
|
|
|
// Compare
|
|
std::vector<ck::half_t> wei_grad_builder_result(wei_grad_elements);
|
|
std::vector<ck::half_t> wei_grad_naive_result(wei_grad_elements);
|
|
wei_grad_builder_dev.FromDevice(wei_grad_builder_result.data());
|
|
wei_grad_naive_dev.FromDevice(wei_grad_naive_result.data());
|
|
|
|
bool pass = ck::utils::check_err(wei_grad_builder_result,
|
|
wei_grad_naive_result,
|
|
"Error: Builder Backward Weight != Direct GPU Reference",
|
|
1e-6,
|
|
1e-6);
|
|
|
|
std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT - Backward Weight)!"
|
|
<< std::endl;
|
|
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Forward
|
|
TEST(ReferenceExecution, Forward_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random)
|
|
{
|
|
constexpr ConvSignature sig{.spatial_dim = 2,
|
|
.direction = ConvDirection::FORWARD,
|
|
.data_type = DataType::FP16,
|
|
.accumulation_data_type = DataType::FP32,
|
|
.input = {.config = {.layout = TensorLayout::NHWGC}},
|
|
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
|
.output = {.config = {.layout = TensorLayout::NHWGK}}};
|
|
|
|
constexpr auto ref_alg = ConvAlgorithm_Reference{};
|
|
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
|
|
|
|
const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14;
|
|
|
|
const size_t in_size = G * N * C * H * W * sizeof(ck::half_t);
|
|
const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
|
|
const size_t out_size = G * N * K * H * W * sizeof(ck::half_t);
|
|
|
|
const size_t in_elements = G * N * C * H * W;
|
|
const size_t wei_elements = G * K * C * 3 * 3;
|
|
const size_t out_elements = G * N * K * H * W;
|
|
|
|
std::vector<ck::half_t> in_host(in_elements);
|
|
std::vector<ck::half_t> wei_host(wei_elements);
|
|
|
|
std::srand(12348);
|
|
for(size_t i = 0; i < in_elements; i++)
|
|
{
|
|
in_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
|
}
|
|
for(size_t i = 0; i < wei_elements; i++)
|
|
{
|
|
wei_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
|
}
|
|
|
|
ck::DeviceMem in_dev(in_size);
|
|
ck::DeviceMem wei_dev(wei_size);
|
|
ck::DeviceMem out_invoker_dev(out_size);
|
|
ck::DeviceMem out_naive_dev(out_size);
|
|
|
|
in_dev.ToDevice(in_host.data());
|
|
wei_dev.ToDevice(wei_host.data());
|
|
out_invoker_dev.SetZero();
|
|
out_naive_dev.SetZero();
|
|
|
|
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
|
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
|
std::vector<ck_tile::long_index_t> strides{1, 1};
|
|
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
|
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
|
|
|
RefKernel builder_kernel;
|
|
|
|
// Run 1: Builder Invoker Interface
|
|
auto argument_ptr = builder_kernel.MakeArgumentPointer(
|
|
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<ck::half_t*>(out_invoker_dev.GetDeviceBuffer()),
|
|
G,
|
|
N,
|
|
K,
|
|
C,
|
|
input_spatial,
|
|
filter_spatial,
|
|
output_spatial,
|
|
strides,
|
|
dilations,
|
|
left_pads);
|
|
|
|
auto invoker_ptr = builder_kernel.MakeInvokerPointer();
|
|
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
|
|
|
// Run 2: Direct GPU Reference
|
|
ck_tile::naive_grouped_conv_fwd<2, ck::half_t, ck::half_t, ck::half_t>(
|
|
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<ck::half_t*>(out_naive_dev.GetDeviceBuffer()),
|
|
G,
|
|
N,
|
|
K,
|
|
C,
|
|
input_spatial,
|
|
filter_spatial,
|
|
output_spatial,
|
|
strides,
|
|
dilations,
|
|
left_pads);
|
|
|
|
// Compare
|
|
std::vector<ck::half_t> out_invoker_result(out_elements);
|
|
std::vector<ck::half_t> out_naive_result(out_elements);
|
|
out_invoker_dev.FromDevice(out_invoker_result.data());
|
|
out_naive_dev.FromDevice(out_naive_result.data());
|
|
|
|
bool pass = ck::utils::check_err(out_invoker_result,
|
|
out_naive_result,
|
|
"Error: Invoker Interface != Direct GPU Reference",
|
|
1e-6,
|
|
1e-6);
|
|
|
|
std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Forward)!" << std::endl;
|
|
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Backward Data
|
|
TEST(ReferenceExecution, BackwardData_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random)
{
    // Cross-check for the 2D FP16 backward-data reference path.
    //
    // The same pseudo-random weights and output gradients are fed to
    //   (1) the reference instance produced by ConvBuilder, driven through its
    //       MakeArgumentPointer / MakeInvokerPointer interface, and
    //   (2) a direct call to ck_tile::naive_grouped_conv_bwd_data.
    // The two resulting input-gradient buffers are then compared with check_err.
    constexpr ConvSignature signature{.spatial_dim            = 2,
                                      .direction              = ConvDirection::BACKWARD_DATA,
                                      .data_type              = DataType::FP16,
                                      .accumulation_data_type = DataType::FP32,
                                      .input  = {.config = {.layout = TensorLayout::NHWGC}},
                                      .weight = {.config = {.layout = TensorLayout::GKYXC}},
                                      .output = {.config = {.layout = TensorLayout::NHWGK}}};

    constexpr auto reference_algorithm = ConvAlgorithm_Reference{};
    using ReferenceInstance            = ConvBuilder<signature, reference_algorithm>::Instance;
    using Dims                         = std::vector<ck_tile::long_index_t>;

    // Problem shape: groups, batch, input/output channels, image extent, filter extent.
    constexpr int G = 1;
    constexpr int N = 2;
    constexpr int C = 16;
    constexpr int K = 16;
    constexpr int H = 14;
    constexpr int W = 14;
    constexpr int Y = 3;
    constexpr int X = 3;

    // Element counts first, byte sizes derived from them.
    const size_t in_grad_count  = G * N * C * H * W;
    const size_t wei_count      = G * K * C * Y * X;
    const size_t out_grad_count = G * N * K * H * W;

    const size_t in_grad_bytes  = in_grad_count * sizeof(ck::half_t);
    const size_t wei_bytes      = wei_count * sizeof(ck::half_t);
    const size_t out_grad_bytes = out_grad_count * sizeof(ck::half_t);

    // Host-side inputs, filled from a fixed seed so every run sees the same data.
    // Weights are drawn before output gradients; the draw order is part of the data.
    std::vector<ck::half_t> weights(wei_count);
    std::vector<ck::half_t> output_grads(out_grad_count);

    std::srand(12349);
    const auto next_random_half = []() {
        return ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
    };
    for(auto& value : weights)
    {
        value = next_random_half();
    }
    for(auto& value : output_grads)
    {
        value = next_random_half();
    }

    // One input-gradient buffer per execution path; the inputs are shared.
    ck::DeviceMem in_grad_builder_buf(in_grad_bytes);
    ck::DeviceMem in_grad_direct_buf(in_grad_bytes);
    ck::DeviceMem wei_buf(wei_bytes);
    ck::DeviceMem out_grad_buf(out_grad_bytes);

    wei_buf.ToDevice(weights.data());
    out_grad_buf.ToDevice(output_grads.data());
    in_grad_builder_buf.SetZero();
    in_grad_direct_buf.SetZero();

    // Convolution geometry handed to both paths.
    Dims input_spatial{H, W};
    Dims filter_spatial{Y, X};
    Dims output_spatial{H, W};
    Dims strides{1, 1};
    Dims dilations{1, 1};
    Dims left_pads{1, 1};

    // Typed views of the device buffers.
    ck::half_t* const in_grad_builder_ptr =
        reinterpret_cast<ck::half_t*>(in_grad_builder_buf.GetDeviceBuffer());
    ck::half_t* const in_grad_direct_ptr =
        reinterpret_cast<ck::half_t*>(in_grad_direct_buf.GetDeviceBuffer());
    const ck::half_t* const wei_ptr =
        reinterpret_cast<const ck::half_t*>(wei_buf.GetDeviceBuffer());
    const ck::half_t* const out_grad_ptr =
        reinterpret_cast<const ck::half_t*>(out_grad_buf.GetDeviceBuffer());

    // Path 1: reference instance obtained from the builder, run via argument + invoker.
    ReferenceInstance reference_op;

    auto builder_argument = reference_op.MakeArgumentPointer(in_grad_builder_ptr,
                                                             wei_ptr,
                                                             out_grad_ptr,
                                                             G,
                                                             N,
                                                             K,
                                                             C,
                                                             input_spatial,
                                                             filter_spatial,
                                                             output_spatial,
                                                             strides,
                                                             dilations,
                                                             left_pads);

    auto builder_invoker = reference_op.MakeInvokerPointer();
    builder_invoker->Run(builder_argument.get(), StreamConfig{nullptr, false});

    // Path 2: the naive GPU reference called directly with the same inputs.
    ck_tile::naive_grouped_conv_bwd_data<2, ck::half_t, ck::half_t, ck::half_t>(
        in_grad_direct_ptr,
        wei_ptr,
        out_grad_ptr,
        G,
        N,
        K,
        C,
        input_spatial,
        filter_spatial,
        output_spatial,
        strides,
        dilations,
        left_pads);

    // Bring both input-gradient results back to the host and compare them.
    std::vector<ck::half_t> in_grad_from_builder(in_grad_count);
    std::vector<ck::half_t> in_grad_from_direct(in_grad_count);
    in_grad_builder_buf.FromDevice(in_grad_from_builder.data());
    in_grad_direct_buf.FromDevice(in_grad_from_direct.data());

    const bool pass =
        ck::utils::check_err(in_grad_from_builder,
                             in_grad_from_direct,
                             "Error: Invoker Interface != Direct GPU Reference (Backward Data)",
                             1e-6,
                             1e-6);

    const char* const verdict = pass ? "IDENTICAL ✓" : "MISMATCH ✗";

    std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Backward Data)!"
              << std::endl;
    std::cout << " Result: " << verdict << std::endl;
    EXPECT_TRUE(pass);
}
|
|
|
|
// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Backward Weight
|
|
TEST(ReferenceExecution, BackwardWeight_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random)
{
    // Cross-check for the 2D FP16 backward-weight reference path.
    //
    // The same pseudo-random input activations and output gradients are fed to
    //   (1) the reference instance produced by ConvBuilder, driven through its
    //       MakeArgumentPointer / MakeInvokerPointer interface, and
    //   (2) a direct call to ck_tile::naive_grouped_conv_bwd_weight.
    // The two resulting weight-gradient buffers are then compared with check_err.
    constexpr ConvSignature signature{.spatial_dim            = 2,
                                      .direction              = ConvDirection::BACKWARD_WEIGHT,
                                      .data_type              = DataType::FP16,
                                      .accumulation_data_type = DataType::FP32,
                                      .input  = {.config = {.layout = TensorLayout::NHWGC}},
                                      .weight = {.config = {.layout = TensorLayout::GKYXC}},
                                      .output = {.config = {.layout = TensorLayout::NHWGK}}};

    constexpr auto reference_algorithm = ConvAlgorithm_Reference{};
    using ReferenceInstance            = ConvBuilder<signature, reference_algorithm>::Instance;
    using Dims                         = std::vector<ck_tile::long_index_t>;

    // Problem shape: groups, batch, input/output channels, image extent, filter extent.
    constexpr int G = 1;
    constexpr int N = 2;
    constexpr int C = 16;
    constexpr int K = 16;
    constexpr int H = 14;
    constexpr int W = 14;
    constexpr int Y = 3;
    constexpr int X = 3;

    // Element counts first, byte sizes derived from them.
    const size_t in_count       = G * N * C * H * W;
    const size_t wei_grad_count = G * K * C * Y * X;
    const size_t out_grad_count = G * N * K * H * W;

    const size_t in_bytes       = in_count * sizeof(ck::half_t);
    const size_t wei_grad_bytes = wei_grad_count * sizeof(ck::half_t);
    const size_t out_grad_bytes = out_grad_count * sizeof(ck::half_t);

    // Host-side inputs, filled from a fixed seed so every run sees the same data.
    // Activations are drawn before output gradients; the draw order is part of the data.
    std::vector<ck::half_t> activations(in_count);
    std::vector<ck::half_t> output_grads(out_grad_count);

    std::srand(12350);
    const auto next_random_half = []() {
        return ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
    };
    for(auto& value : activations)
    {
        value = next_random_half();
    }
    for(auto& value : output_grads)
    {
        value = next_random_half();
    }

    // One weight-gradient buffer per execution path; the inputs are shared.
    ck::DeviceMem in_buf(in_bytes);
    ck::DeviceMem wei_grad_builder_buf(wei_grad_bytes);
    ck::DeviceMem wei_grad_direct_buf(wei_grad_bytes);
    ck::DeviceMem out_grad_buf(out_grad_bytes);

    in_buf.ToDevice(activations.data());
    out_grad_buf.ToDevice(output_grads.data());
    wei_grad_builder_buf.SetZero();
    wei_grad_direct_buf.SetZero();

    // Convolution geometry handed to both paths.
    Dims input_spatial{H, W};
    Dims filter_spatial{Y, X};
    Dims output_spatial{H, W};
    Dims strides{1, 1};
    Dims dilations{1, 1};
    Dims left_pads{1, 1};

    // Typed views of the device buffers.
    const ck::half_t* const in_ptr =
        reinterpret_cast<const ck::half_t*>(in_buf.GetDeviceBuffer());
    ck::half_t* const wei_grad_builder_ptr =
        reinterpret_cast<ck::half_t*>(wei_grad_builder_buf.GetDeviceBuffer());
    ck::half_t* const wei_grad_direct_ptr =
        reinterpret_cast<ck::half_t*>(wei_grad_direct_buf.GetDeviceBuffer());
    const ck::half_t* const out_grad_ptr =
        reinterpret_cast<const ck::half_t*>(out_grad_buf.GetDeviceBuffer());

    // Path 1: reference instance obtained from the builder, run via argument + invoker.
    ReferenceInstance reference_op;

    auto builder_argument = reference_op.MakeArgumentPointer(in_ptr,
                                                             wei_grad_builder_ptr,
                                                             out_grad_ptr,
                                                             G,
                                                             N,
                                                             K,
                                                             C,
                                                             input_spatial,
                                                             filter_spatial,
                                                             output_spatial,
                                                             strides,
                                                             dilations,
                                                             left_pads);

    auto builder_invoker = reference_op.MakeInvokerPointer();
    builder_invoker->Run(builder_argument.get(), StreamConfig{nullptr, false});

    // Path 2: the naive GPU reference called directly with the same inputs.
    ck_tile::naive_grouped_conv_bwd_weight<2, ck::half_t, ck::half_t, ck::half_t>(
        in_ptr,
        wei_grad_direct_ptr,
        out_grad_ptr,
        G,
        N,
        K,
        C,
        input_spatial,
        filter_spatial,
        output_spatial,
        strides,
        dilations,
        left_pads);

    // Bring both weight-gradient results back to the host and compare them.
    std::vector<ck::half_t> wei_grad_from_builder(wei_grad_count);
    std::vector<ck::half_t> wei_grad_from_direct(wei_grad_count);
    wei_grad_builder_buf.FromDevice(wei_grad_from_builder.data());
    wei_grad_direct_buf.FromDevice(wei_grad_from_direct.data());

    const bool pass =
        ck::utils::check_err(wei_grad_from_builder,
                             wei_grad_from_direct,
                             "Error: Invoker Interface != Direct GPU Reference (Backward Weight)",
                             1e-6,
                             1e-6);

    const char* const verdict = pass ? "IDENTICAL ✓" : "MISMATCH ✗";

    std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Backward Weight)!"
              << std::endl;
    std::cout << " Result: " << verdict << std::endl;
    EXPECT_TRUE(pass);
}
|
|
|
|
} // namespace
|