diff --git a/experimental/builder/include/ck_tile/builder/factory/reference_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/reference_factory.hpp index 0246c805c2..0748725c96 100644 --- a/experimental/builder/include/ck_tile/builder/factory/reference_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/reference_factory.hpp @@ -125,9 +125,9 @@ struct ReferenceFactory // Direct Run method (simpler interface, direction-agnostic) template - static void Run(InPtrType input, - WeiPtrType weight, - OutPtrType output, + static void Run(InPtrType* input, + WeiPtrType* weight, + OutPtrType* output, int G, int N, int K, @@ -142,9 +142,9 @@ struct ReferenceFactory if constexpr(ConvDirectionIsForward) { ck_tile::naive_grouped_conv_fwd( - input, - weight, - output, + static_cast(input), + static_cast(weight), + static_cast(output), G, N, K, @@ -160,9 +160,9 @@ struct ReferenceFactory { ck_tile:: naive_grouped_conv_bwd_data( - input, - weight, - output, + static_cast(input), + static_cast(weight), + static_cast(output), G, N, K, @@ -179,19 +179,20 @@ struct ReferenceFactory ck_tile::naive_grouped_conv_bwd_weight(input, - weight, - output, - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - dilations, - left_pads); + OutDataType>( + static_cast(input), + static_cast(weight), + static_cast(output), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); } } diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp index cc5c613d95..499e0ef3de 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp @@ -3,10 +3,10 @@ #pragma once -#include -#include - #include "ck_tile/builder/testing/conv_fwd.hpp" +#include 
"ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp" +#include +#include /// This file contains the implementation details for invoking/testing /// grouped convolution operations in old CK. The main item is the @@ -15,6 +15,63 @@ namespace ck_tile::builder::test { +namespace detail { + +/// @brief Concept for checking whether this is the reference convolution +/// implementation. +/// +/// This is the same as `::ck_tile::builder::test::CkConvInstance`, except +/// with some utility aliases. For that reason, its moved to this detail +/// namespace. +template > +concept CkConvInstance = requires(Conv& conv, + // TODO: This should be changed depending on IsMultiA etc. + // Currently that is not yet supported elsewhere anyway. + const void* p_a, + const void* p_b, + void* p_e, + std::array lengths, + std::array strides, + std::array filter, + Ops::AElementwiseOp elementwise_a, + Ops::BElementwiseOp elementwise_b, + Ops::CDEElementwiseOp elementwise_cde) { + { + conv.MakeArgument(p_a, + p_b, + // TODO: Support multiple D outputs. + {}, + p_e, + // A lengths/strides + lengths, + strides, + // B lengths/strides + lengths, + strides, + // TODO: Ds lengths/strides + {}, + {}, + // E lengths/strides + lengths, + strides, + // strides/dilations/pads + filter, + filter, + filter, + filter, + // element-wise operations. + elementwise_a, + elementwise_b, + elementwise_cde) + }; +}; + +} // namespace detail + /// @brief Concept for checking whether a convolution is invoked like old CK. /// /// This concept is used to tell whether a convolution implementation is @@ -24,13 +81,8 @@ namespace ck_tile::builder::test { /// /// - SIGNATURE is the operation signature. /// - Conv is a convolution instance created by the CK Builder API. -template -concept IsCkConvInstance = - // TODO: This should be implemented by converting the signature into the - // type parameters for DeviceGroupedConvFwdMultipleABD. For now, just leave - // it empty. Improve when needed, you get the point. 
Also we should probably - // move this to the ck conv factory helper. - true; +template +concept CkConvInstance = detail::CkConvInstance; /// @brief `run()` specialization for forward convolution and old CK. /// @@ -39,10 +91,9 @@ concept IsCkConvInstance = /// operation. This should be caught and reported by the testing framework. /// /// @see run() -template - requires ValidConvSignature && ConvDirectionIsForward && - IsCkConvInstance -void run(Conv& conv, +template + requires ValidConvSignature && ConvDirectionIsForward +void run(CkConvInstance auto& conv, const Args& args, const Inputs& inputs, const Outputs& outputs) diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp new file mode 100644 index 0000000000..85493e32eb --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp @@ -0,0 +1,114 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck_tile/builder/testing/conv_fwd.hpp" +#include +#include + +/// This file contains the implementation details for invoking/testing +/// grouped convolution operations using the reference implementation. +/// The main item is the `run()` function, which is the primary way to +/// invoke the reference execution mechanism. +/// The implementation of this file mostly looks like `conv_fwd_ck.hpp`, +/// but it's made specific to the reference implementation, which is +/// invoked in a slightly different way. + +namespace ck_tile::builder::test { + +/// @brief Concept for checking whether this is the reference convolution +/// implementation. +/// +/// This concept is used to tell whether a convolution implementation is +/// likely to be the reference implementation - that is, whether we should +/// invoke it like the reference kernel. 
This is mainly used with `run()` to +/// differentiate which implementation should be invoked. +/// +/// - SIGNATURE is the operation signature. +/// - Conv is a convolution instance created by the CK Builder API. +template +concept RefConvInstance = requires(Conv& conv, + const void* input, + const void* weight, + void* output, + int G, + int N, + int K, + int C, + std::vector dims) { + { + conv.Run(input, + weight, + output, + G, + N, + K, + C, + dims, // input_spatial + dims, // filter_spatial + dims, // output_spatial + dims, // strides + dims, // dilations + dims // left_pads + ) + }; +}; + +/// @brief `run()` specialization for forward convolution and the reference +/// implementation. +/// +/// @tparam SIGNATURE Forward convolution signature. +/// @throws std::runtime_error if the arguments weren't actually valid for the +/// operation. This should be caught and reported by the testing framework. +/// +/// @see run() +template + requires ValidConvSignature && + // TODO: Maybe we can unify this implementation for bwd/weight too? + // for now, just concern ourselves with reference and see when the + // rest of the bwd/weight plumbing is there. + ConvDirectionIsForward +void run(RefConvInstance auto& conv, + const Args& args, + const Inputs& inputs, + const Outputs& outputs) +{ + // We don't want to compute the output dims manually, just get + // them via the existing infrastructure + const auto param = args.to_ck_conv_param(); + + // TODO: The reference convolution is currently missing a few features. + // Just throw for now, but regard these as TODO items that should be resolved + // eventually. + + // Right pads are not supported right now for some reason. 
+ for(auto right_pad : param.input_right_pads_) + { + if(right_pad != 0) + throw std::runtime_error("TODO: Support right pad in reference conv"); + } + + if(!args.make_input_descriptor().is_packed()) + throw std::runtime_error("TODO: Support non-packed input tensor in reference conv"); + if(!args.make_weight_descriptor().is_packed()) + throw std::runtime_error("TODO: Support non-packed weight tensor in reference conv"); + if(!args.make_output_descriptor().is_packed()) + throw std::runtime_error("TODO: Support non-packed output tensor in reference conv"); + + conv.Run(inputs.input, + inputs.weight, + outputs.output, + param.G_, + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, + param.output_spatial_lengths_, + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_); +} + +} // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp b/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp index 0ba01a77ca..15fe4d89db 100644 --- a/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "ck_tile/builder/conv_signature_concepts.hpp" #include "ck_tile/builder/testing/type_traits.hpp" @@ -369,6 +370,35 @@ struct TensorDescriptor return get_element_space_size() * data_type_sizeof(DT); } + /// @brief Check if a tensor is packed in memory. + /// + /// This function checks whether the tensor memory is "packed", that is, whether + /// all elements are continuous in memory with no gaps. + bool is_packed() const + { + // First sort by stride, then check if they match the scan of the + // sizes. 
+ const auto& lengths = inner_descriptor_.get_lengths(); + const auto& strides = inner_descriptor_.get_strides(); + + std::array indices; + std::iota(indices.begin(), indices.end(), 0); + std::sort(indices.begin(), indices.end(), [&](auto i, auto j) { + return strides[i] < strides[j]; + }); + + size_t x = 1; + for(size_t i = 0; i < RANK; ++i) + { + if(strides[indices[i]] != x) + return false; + + x *= lengths[indices[i]]; + } + + return true; + } + /// @brief Get a tensor descriptor for the space backing a tensor. /// /// This function returns a tensor descriptor which represents the buffer space diff --git a/experimental/builder/include/ck_tile/builder/testing/testing.hpp b/experimental/builder/include/ck_tile/builder/testing/testing.hpp index 9c8b858018..609c93cacf 100644 --- a/experimental/builder/include/ck_tile/builder/testing/testing.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/testing.hpp @@ -220,10 +220,13 @@ UniqueInputs alloc_inputs(const Args& args); /// @param args The run-time arguments of the operation. /// @param inputs The operation inputs to initialize with random data. /// +/// @note This function is explicitly deleted to generate compile errors +/// for missing implementations. +/// /// @see Inputs /// @see tensor_initialization template -void init_inputs(const Args& args, Inputs inputs); +void init_inputs(const Args& args, Inputs inputs) = delete; /// @brief Allocate outputs corresponding to a signature. /// @@ -236,13 +239,16 @@ void init_inputs(const Args& args, Inputs inputs); /// /// @param args The run-time arguments of the operation. /// +/// @note This function is explicitly deleted to generate compile errors +/// for missing implementations. 
+/// /// @see Outputs /// @see UniqueOutputs /// @see alloc_buffer() /// @see alloc_tensor_buffer() template requires ValidUniqueOutputs -UniqueInputs alloc_outputs(const Args& args); +UniqueInputs alloc_outputs(const Args& args) = delete; /// @brief Compare device operation outputs. /// @@ -262,10 +268,14 @@ UniqueInputs alloc_outputs(const Args& args); /// @param actual The actual results, the results of the operation to-be-tested. /// @param expected The expected results, the results of the reference implementation. /// +/// @note This function is explicitly deleted to generate compile errors +/// for missing implementations. +/// /// @see ValidationReport template -ValidationReport -validate(const Args& args, Outputs actual, Outputs expected); +ValidationReport validate(const Args& args, + Outputs actual, + Outputs expected) = delete; /// @brief Invoke a device operation created by CK Builder. /// @@ -296,10 +306,13 @@ validate(const Args& args, Outputs actual, Outputs void run(Operation& operation, const Args& args, const Inputs& inputs, - const Outputs& outputs); + const Outputs& outputs) = delete; } // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/validation.hpp b/experimental/builder/include/ck_tile/builder/testing/validation.hpp index 275fa490eb..267bf8d2ac 100644 --- a/experimental/builder/include/ck_tile/builder/testing/validation.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/validation.hpp @@ -13,6 +13,7 @@ #include #include #include +#include /// This file implements functionality related to "validation", ie, functionality /// to compare tensors. The functionality in this file should be testing-framework @@ -48,12 +49,22 @@ struct ValidationReport /// The total number of elements in each tensor. uint64_t total_elements; + /// The number of elements which were bitwise 0. + uint64_t zero_elements; + + /// @brief Check whether the output and reference tensors were both all zeros. 
+ /// + /// If both tensors are all zero, it indicates either an incorrect testing setup + /// or an issue with the testing framework. For that reason we also consider that + /// a failure. + bool is_all_zero() const { return zero_elements == total_elements; } + /// @brief Return whether the check associated to this case was successful. /// /// This function returns whether the check associated to this case was successful, /// which is directly derived from checking whether the number of incorrect elements - /// was 0. - bool is_ok() const { return wrong_elements == 0; } + /// was 0 AND whether the tensor was not all zero. + bool is_ok() const { return wrong_elements == 0 && !is_all_zero(); } }; /// @brief Get comparison cases which were incorrect. @@ -123,10 +134,13 @@ bool ValidationReport::check(std::string_view tensor_name, // Initial pass: count errors // Allocate and reset counter - auto d_error_count = alloc_buffer(sizeof(uint64_t)); - check_hip(hipMemset(d_error_count.get(), 0, sizeof(uint64_t))); + auto d_counters = alloc_buffer(sizeof(uint64_t) * 2); + check_hip(hipMemset(d_counters.get(), 0, sizeof(uint64_t) * 2)); - tensor_foreach(descriptor.get_lengths(), [=, error_count = d_error_count.get()](auto index) { + auto d_error_count = &reinterpret_cast(d_counters.get())[0]; + auto d_zero_count = &reinterpret_cast(d_counters.get())[1]; + + tensor_foreach(descriptor.get_lengths(), [=](auto index) { using CKType = typename factory::internal::DataTypeToCK
::type; const auto* actual = static_cast(actual_data); @@ -137,21 +151,44 @@ bool ValidationReport::check(std::string_view tensor_name, const auto offset = calculate_offset(index, strides); - const auto o = static_cast(type_convert(actual[offset])); - const auto r = static_cast(type_convert(expected[offset])); + const auto a = actual[offset]; + const auto b = expected[offset]; + + const auto o = static_cast(type_convert(a)); + const auto r = static_cast(type_convert(b)); const auto err = std::abs(o - r); if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) { // We expect the number of errors to be very low, so just use an atomic // for now. - atomicAdd(reinterpret_cast(error_count), 1); + atomicAdd(d_error_count, 1); + } + + // Now compare the numbers as bitwise too. + // Update the counter if they're both zero. + using Bytes = std::array; + bool all_zero = true; + for(auto x : std::bit_cast(a)) + { + if(x != std::byte{0}) + all_zero = false; + } + for(auto x : std::bit_cast(b)) + { + if(x != std::byte{0}) + all_zero = false; + } + if(all_zero) + { + atomicAdd(d_zero_count, 1); } }); uint64_t error_count = 0; - check_hip( - hipMemcpy(&error_count, d_error_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost)); + check_hip(hipMemcpy(&error_count, d_error_count, sizeof(uint64_t), hipMemcpyDeviceToHost)); + uint64_t zero_count = 0; + check_hip(hipMemcpy(&zero_count, d_zero_count, sizeof(uint64_t), hipMemcpyDeviceToHost)); // TODO: Gather detailed coordinates. 
@@ -159,9 +196,10 @@ bool ValidationReport::check(std::string_view tensor_name, .tensor_name = std::string(tensor_name), .wrong_elements = error_count, .total_elements = descriptor.get_element_size(), + .zero_elements = zero_count, }); - return error_count == 0; + return reports_.back().is_ok(); } } // namespace ck_tile::builder::test diff --git a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_fp16.cpp b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_fp16.cpp index 5a52b6a9b5..1ba811bbe0 100644 --- a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_fp16.cpp +++ b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_fp16.cpp @@ -5,6 +5,7 @@ #include "utils/ckb_conv_test_utils.hpp" #include "utils/conv_algorithm_type_utils.hpp" #include "ck_tile/builder/testing/conv_fwd_ck.hpp" +#include "ck_tile/builder/testing/conv_fwd_reference.hpp" #include "ck_tile/host/device_prop.hpp" #include "testing_utils.hpp" @@ -34,6 +35,8 @@ constexpr auto ALGORITHM = cku::ConvAlgorithm_DeviceGroupedConvFwdMultipleABD_Xd using Builder = ckb::ConvBuilder; using Instance = Builder::Instance; +using Reference = ckb::ConvBuilder::Instance; + TEST(Fwd2DFp16_CShufV3_GNHWC, Create) { const auto expected_transfer_parameters = to_string(ALGORITHM); @@ -81,18 +84,17 @@ TEST(Fwd2DFp16_CShufV3_GNHWC, EndToEnd) .cde_elementwise_op = {}, }; - auto inputs = ckt::alloc_inputs(args); - auto outputs = ckt::alloc_outputs(args); + auto inputs = ckt::alloc_inputs(args); + auto outputs = ckt::alloc_outputs(args); + auto reference = ckt::alloc_outputs(args); ckt::init_inputs(args, inputs.get()); auto conv = Instance{}; ckt::run(conv, args, inputs.get(), outputs.get()); - // TODO: This should be allocated via ckt::alloc_outputs() and - // initialized via ckt::run() with the reference implementation - // instead. 
- auto reference = outputs.get(); + auto ref_conv = Reference{}; + ckt::run(ref_conv, args, inputs.get(), reference.get()); - EXPECT_THAT(outputs.get(), MatchesReference(args, reference)); + EXPECT_THAT(outputs.get(), MatchesReference(args, reference.get())); } diff --git a/experimental/builder/test/unit_tensor_descriptor.cpp b/experimental/builder/test/unit_tensor_descriptor.cpp index d9e92bf07e..672ebbd88a 100644 --- a/experimental/builder/test/unit_tensor_descriptor.cpp +++ b/experimental/builder/test/unit_tensor_descriptor.cpp @@ -170,3 +170,22 @@ TEST(TensorDescriptor, ExtentFromVector) EXPECT_THAT([] { return ckt::Extent<5>::from_vector(std::vector{1, 2}); }, Throws()); } + +TEST(TensorDescriptor, IsPacked) +{ + constexpr auto dt = ckb::DataType::INT32; // Irrelevant for this test + EXPECT_TRUE( + ckt::make_descriptor
(ckt::Extent{101, 43, 25, 662, 654}, ckt::PackedLeftLayout{}) + .is_packed()); + EXPECT_TRUE( + ckt::make_descriptor
(ckt::Extent{5334, 235, 1563, 256, 23}, ckt::PackedRightLayout{}) + .is_packed()); + EXPECT_TRUE(ckt::make_descriptor
(ckt::Extent{}, ckt::Extent{}).is_packed()); + EXPECT_TRUE( + ckt::make_descriptor
(ckt::Extent{461, 345, 5, 93}, ckt::Extent{160425, 5, 1, 1725}) + .is_packed()); + EXPECT_FALSE( + ckt::make_descriptor
(ckt::Extent{10, 11, 12}, ckt::Extent{1, 100, 1100}).is_packed()); + EXPECT_FALSE( + ckt::make_descriptor
(ckt::Extent{30, 20, 10}, ckt::Extent{1, 1, 1}).is_packed()); +} diff --git a/experimental/builder/test/unit_validation.cpp b/experimental/builder/test/unit_validation.cpp index 06736ca624..5f6b620d6b 100644 --- a/experimental/builder/test/unit_validation.cpp +++ b/experimental/builder/test/unit_validation.cpp @@ -67,7 +67,7 @@ TYPED_TEST(ValidationReportTests, SingleCorrect) // Generate a sort-of-random looking sequence auto generator = [strides = desc.get_strides()](const auto& index) { const auto flat_index = ckt::calculate_offset(index, strides); - return static_cast(flat_index * 10'000'019 % 768'351); + return static_cast((flat_index + 1) * 10'000'019 % 768'351); }; ckt::fill_tensor(desc, a.get(), generator); @@ -110,6 +110,27 @@ TYPED_TEST(ValidationReportTests, SingleIncorrect) EXPECT_THAT(errors[0].total_elements, Eq(desc.get_element_size())); } +TYPED_TEST(ValidationReportTests, ZeroIsIncorrect) +{ + const auto desc = TypeParam::get_descriptor(); + + auto a = ckt::alloc_tensor_buffer(desc); + auto b = ckt::alloc_tensor_buffer(desc); + + ckt::clear_tensor_buffer(desc, a.get()); + ckt::clear_tensor_buffer(desc, b.get()); + + ckt::ValidationReport report; + report.check("zero_is_incorrect", desc, b.get(), a.get()); + + const auto errors = report.get_errors(); + ASSERT_THAT(errors.size(), Eq(1)); + EXPECT_THAT(errors[0].tensor_name, StrEq("zero_is_incorrect")); + EXPECT_THAT(errors[0].wrong_elements, Eq(0)); + EXPECT_THAT(errors[0].total_elements, Eq(desc.get_element_size())); + EXPECT_THAT(errors[0].zero_elements, Eq(desc.get_element_size())); +} + TEST(ValidationReportTests, MultipleSomeIncorrect) { ckt::ValidationReport report;