diff --git a/include/ck/tensor_operation/gpu/device/device_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_gemm.hpp
index bfb6c7608f..4576aaa7e0 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm.hpp
@@ -1,5 +1,7 @@
 #pragma once
 #include <iostream>
+#include <memory>
+
 #include "device_base.hpp"
 
 namespace ck {
diff --git a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
index eeaa36b736..2409071b48 100644
--- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
@@ -1,5 +1,4 @@
-#ifndef TENSOR_LAYOUT_HPP
-#define TENSOR_LAYOUT_HPP
+#pragma once
 
 namespace ck {
 namespace tensor_layout {
@@ -128,4 +127,3 @@ std::ostream& operator<<(std::ostream& os, const Layout&)
 
 } // namespace tensor_layout
 } // namespace ck
-#endif
diff --git a/library/include/ck/library/host_tensor/host_tensor_generator.hpp b/library/include/ck/library/host_tensor/host_tensor_generator.hpp
index a2cdc7afc8..17e20351f0 100644
--- a/library/include/ck/library/host_tensor/host_tensor_generator.hpp
+++ b/library/include/ck/library/host_tensor/host_tensor_generator.hpp
@@ -1,7 +1,8 @@
-#ifndef HOST_TENSOR_GENERATOR_HPP
-#define HOST_TENSOR_GENERATOR_HPP
+#pragma once
 
 #include <cmath>
+#include <cstdlib>
+
 #include "config.hpp"
 
 template <typename T>
@@ -147,5 +148,3 @@ struct GeneratorTensor_Sequential
         return dims[Dim];
     }
 };
-
-#endif
diff --git a/profiler/include/profile_batched_gemm_impl.hpp b/profiler/include/profile_batched_gemm_impl.hpp
index d57bfd7c09..07e687ebf6 100644
--- a/profiler/include/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profile_batched_gemm_impl.hpp
@@ -1,6 +1,13 @@
 #pragma once
 
 #include <iostream>
+
+#include "config.hpp"
+#include "element_wise_operation.hpp"
+#include "tensor_layout.hpp"
+#include "device.hpp"
+#include "host_tensor_generator.hpp"
+#include "device_gemm.hpp"
 #include "reference_batched_gemm.hpp"
 
 namespace ck {
@@ -52,7 +59,7 @@ template <typename ADataType,
           typename ALayout,
           typename BLayout,
           typename CLayout>
-void profile_batched_gemm_impl(int do_verification,
+bool profile_batched_gemm_impl(int do_verification,
                                int init_method,
                                bool do_log,
                                int nrepeat,
@@ -64,6 +71,8 @@ void profile_batched_gemm_impl(int do_verification,
                                int StrideC,
                                int BatchCount = 1)
 {
+    bool pass = true;
+
     auto f_host_tensor_descriptor = [](std::size_t batch_count,
                                        std::size_t row,
                                        std::size_t col,
@@ -379,12 +388,14 @@ void profile_batched_gemm_impl(int do_verification,
         {
             bf16_to_f32_(c_g_m_n_device_result, *c_f32_g_m_n_device_result);
 
-            check_error(*c_f32_g_m_n_host_result, *c_f32_g_m_n_device_result);
+            float err = check_error(*c_f32_g_m_n_host_result, *c_f32_g_m_n_device_result);
+            pass      = pass && (err < 1E-6);
 
         }
         else
         {
-            check_error(c_g_m_n_host_result, c_g_m_n_device_result);
+            float err = check_error(c_g_m_n_host_result, c_g_m_n_device_result);
+            pass      = pass && (err < 1E-6);
         }
 
         if(do_log)
@@ -408,6 +419,8 @@ void profile_batched_gemm_impl(int do_verification,
 
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
+
+    return pass;
 }
 
 } // namespace profiler
diff --git a/test/batched_gemm/batched_gemm_fp16.cpp b/test/batched_gemm/batched_gemm_fp16.cpp
index 2f04bf35e4..24ba347206 100644
--- a/test/batched_gemm/batched_gemm_fp16.cpp
+++ b/test/batched_gemm/batched_gemm_fp16.cpp
@@ -1,139 +1,41 @@
-#include <iostream>
-#include <tuple>
-#include <vector>
+#include "profile_batched_gemm_impl.hpp"
 
-#include "batched_gemm_util.hpp"
"reference_batched_gemm.hpp" -#include "config.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "device_tensor.hpp" -#include "device_batched_gemm_xdl.hpp" -#include "element_wise_operation.hpp" -#include "test_util.hpp" - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using DeviceBatchedGemmPtr = - ck::tensor_operation::device::DeviceGemmPtr; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_batched_gemm_instance { -void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances( - std::vector& instances); -} -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include namespace { -using ADataType = ck::half_t; -using BDataType = ck::half_t; -using CDataType = ck::half_t; -using AccDataType = float; +using ADataType = ck::half_t; +using BDataType = ck::half_t; +using CDataType = ck::half_t; -using ALayout = ck::tensor_layout::gemm::RowMajor; -using BLayout = ck::tensor_layout::gemm::ColumnMajor; -using CLayout = ck::tensor_layout::gemm::RowMajor; - -auto PrepareGemmTensor(const std::size_t batch_count, - const ck::batched_gemm_util::GemmParams& params) -{ - auto f_host_tensor_descriptor = - [batch_count](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - if(std::is_same::value) - { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({row * stride, stride, 1})); - } - else - { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({col * stride, 1, stride})); - } - }; - - Tensor a_g_m_k( - f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{})); - Tensor b_g_k_n( - f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{})); - Tensor c_g_m_n_host_result( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - Tensor c_g_m_n_device_result( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - - a_g_m_k.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - b_g_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - - return std::make_tuple(a_g_m_k, b_g_k_n, c_g_m_n_host_result, c_g_m_n_device_result); -} - -bool TestBatchedGemm(const std::size_t batch_count, DeviceBatchedGemmPtr& gemmPtr) -{ - // Arrange - ck::batched_gemm_util::GemmParams params; - params.M = 1024; - params.N = 1024; - params.K = 1024; - params.StrideA = 1024; - params.StrideB = 1024; - params.StrideC = 1024; - - auto host_tensors = PrepareGemmTensor(batch_count, params); - const Tensor& a = std::get<0>(host_tensors); - const Tensor& b = std::get<1>(host_tensors); - Tensor& c_host = std::get<2>(host_tensors); - Tensor& c_device = std::get<3>(host_tensors); - - auto a_element_op = PassThrough{}; - auto b_element_op = PassThrough{}; - auto c_element_op = PassThrough{}; - - using ReferenceBatchedGemmInstance = - ck::tensor_operation::host::ReferenceBatchedGemm; - ck::batched_gemm_util::RunHostBatchedGemm( - a, b, c_host, a_element_op, b_element_op, c_element_op); - - // Act - ck::batched_gemm_util::RunDeviceBatchedGemm( - gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op); - - // Assert - // bool pass = test::check_err( - // c_device.mData, c_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); - bool pass = check_error(c_device, c_host) < 0.007815f; - - std::cout << (pass ? 
"SUCCESS" : "FAILURE") << std::endl; - - return pass; -} +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; } // namespace int main() { - std::vector batched_gemm_ptrs; - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(batched_gemm_ptrs); + int M = 512; + int N = 256; + int K = 128; + int BatchCount = 3; bool pass = true; - const std::size_t batch_count = 4; - for(auto& gemmPtr : batched_gemm_ptrs) - { - pass &= TestBatchedGemm(batch_count, gemmPtr); - } + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, K, N, N, BatchCount); - std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, K, K, N, BatchCount); + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, M, N, N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, M, K, N, BatchCount); + + std::cout << "test BatchedGEMM fp16: " << (pass ? "Pass" : "Fail") << std::endl; return pass ? 0 : 1; }