diff --git a/include/ck/tensor_operation/gpu/device/device_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_gemm.hpp
index bfb6c7608f..4576aaa7e0 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm.hpp
@@ -1,5 +1,7 @@
 #pragma once
 #include <iostream>
+#include <memory>
+
 #include "device_base.hpp"
 
 namespace ck {
diff --git a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
index eeaa36b736..2409071b48 100644
--- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
@@ -1,5 +1,4 @@
-#ifndef TENSOR_LAYOUT_HPP
-#define TENSOR_LAYOUT_HPP
+#pragma once
 
 namespace ck {
 namespace tensor_layout {
@@ -128,4 +127,3 @@ std::ostream& operator<<(std::ostream& os, const Layout&)
 
 } // namespace tensor_layout
 } // namespace ck
-#endif
diff --git a/library/include/ck/library/host_tensor/host_tensor_generator.hpp b/library/include/ck/library/host_tensor/host_tensor_generator.hpp
index a2cdc7afc8..17e20351f0 100644
--- a/library/include/ck/library/host_tensor/host_tensor_generator.hpp
+++ b/library/include/ck/library/host_tensor/host_tensor_generator.hpp
@@ -1,7 +1,8 @@
-#ifndef HOST_TENSOR_GENERATOR_HPP
-#define HOST_TENSOR_GENERATOR_HPP
+#pragma once
 
 #include <cmath>
+#include <cstdlib>
+
 #include "config.hpp"
 
 template <typename T>
@@ -147,5 +148,3 @@ struct GeneratorTensor_Sequential
         return dims[Dim];
     }
 };
-
-#endif
diff --git a/profiler/include/profile_batched_gemm_impl.hpp b/profiler/include/profile_batched_gemm_impl.hpp
index d57bfd7c09..07e687ebf6 100644
--- a/profiler/include/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profile_batched_gemm_impl.hpp
@@ -1,6 +1,13 @@
 #pragma once
 
 #include <iostream>
+
+#include "config.hpp"
+#include "element_wise_operation.hpp"
+#include "tensor_layout.hpp"
+#include "device.hpp"
+#include "host_tensor_generator.hpp"
+#include "device_gemm.hpp"
 #include "reference_batched_gemm.hpp"
 
 namespace ck {
@@ -52,7 +59,7 @@ template <typename ADataType,
           typename ALayout,
           typename BLayout,
           typename CLayout>
-void profile_batched_gemm_impl(int do_verification,
+bool profile_batched_gemm_impl(int do_verification,
                                int init_method,
                                bool do_log,
                                int nrepeat,
@@ -64,6 +71,8 @@ void profile_batched_gemm_impl(int do_verification,
                                int StrideC,
                                int BatchCount = 1)
 {
+    bool pass = true;
+
     auto f_host_tensor_descriptor = [](std::size_t batch_count,
                                        std::size_t row,
                                        std::size_t col,
@@ -379,12 +388,14 @@ void profile_batched_gemm_impl(int do_verification,
         {
             bf16_to_f32_(c_g_m_n_device_result, *c_f32_g_m_n_device_result);
 
-            check_error(*c_f32_g_m_n_host_result, *c_f32_g_m_n_device_result);
+            float err = check_error(*c_f32_g_m_n_host_result, *c_f32_g_m_n_device_result);
+            pass      = pass && (err < 1E-6);
 
         }
         else
         {
-            check_error(c_g_m_n_host_result, c_g_m_n_device_result);
+            float err = check_error(c_g_m_n_host_result, c_g_m_n_device_result);
+            pass      = pass && (err < 1E-6);
         }
 
         if(do_log)
@@ -408,6 +419,8 @@ void profile_batched_gemm_impl(int do_verification,
 
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
+
+    return pass;
 }
 
 } // namespace profiler
diff --git a/test/batched_gemm/batched_gemm_fp16.cpp b/test/batched_gemm/batched_gemm_fp16.cpp
index 2f04bf35e4..24ba347206 100644
--- a/test/batched_gemm/batched_gemm_fp16.cpp
+++ b/test/batched_gemm/batched_gemm_fp16.cpp
@@ -1,139 +1,41 @@
-#include <iostream>
-#include <tuple>
-#include <vector>
+#include "profile_batched_gemm_impl.hpp"
 
-#include "batched_gemm_util.hpp"
"reference_batched_gemm.hpp" -#include "config.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "device_tensor.hpp" -#include "device_batched_gemm_xdl.hpp" -#include "element_wise_operation.hpp" -#include "test_util.hpp" - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using DeviceBatchedGemmPtr = - ck::tensor_operation::device::DeviceGemmPtr; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_batched_gemm_instance { -void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances( - std::vector& instances); -} -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include namespace { -using ADataType = ck::half_t; -using BDataType = ck::half_t; -using CDataType = ck::half_t; -using AccDataType = float; +using ADataType = ck::half_t; +using BDataType = ck::half_t; +using CDataType = ck::half_t; -using ALayout = ck::tensor_layout::gemm::RowMajor; -using BLayout = ck::tensor_layout::gemm::ColumnMajor; -using CLayout = ck::tensor_layout::gemm::RowMajor; - -auto PrepareGemmTensor(const std::size_t batch_count, - const ck::batched_gemm_util::GemmParams& params) -{ - auto f_host_tensor_descriptor = - [batch_count](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - if(std::is_same::value) - { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({row * stride, stride, 1})); - } - else - { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({col * stride, 1, stride})); - } - }; - - Tensor a_g_m_k( - f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{})); - Tensor b_g_k_n( - f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{})); - Tensor c_g_m_n_host_result( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - Tensor c_g_m_n_device_result( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - - a_g_m_k.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - b_g_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - - return std::make_tuple(a_g_m_k, b_g_k_n, c_g_m_n_host_result, c_g_m_n_device_result); -} - -bool TestBatchedGemm(const std::size_t batch_count, DeviceBatchedGemmPtr& gemmPtr) -{ - // Arrange - ck::batched_gemm_util::GemmParams params; - params.M = 1024; - params.N = 1024; - params.K = 1024; - params.StrideA = 1024; - params.StrideB = 1024; - params.StrideC = 1024; - - auto host_tensors = PrepareGemmTensor(batch_count, params); - const Tensor& a = std::get<0>(host_tensors); - const Tensor& b = std::get<1>(host_tensors); - Tensor& c_host = std::get<2>(host_tensors); - Tensor& c_device = std::get<3>(host_tensors); - - auto a_element_op = PassThrough{}; - auto b_element_op = PassThrough{}; - auto c_element_op = PassThrough{}; - - using ReferenceBatchedGemmInstance = - ck::tensor_operation::host::ReferenceBatchedGemm; - ck::batched_gemm_util::RunHostBatchedGemm( - a, b, c_host, a_element_op, b_element_op, c_element_op); - - // Act - ck::batched_gemm_util::RunDeviceBatchedGemm( - gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op); - - // Assert - // bool pass = test::check_err( - // c_device.mData, c_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); - bool pass = check_error(c_device, c_host) < 0.007815f; - - std::cout << (pass ? 
"SUCCESS" : "FAILURE") << std::endl; - - return pass; -} +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; } // namespace int main() { - std::vector batched_gemm_ptrs; - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(batched_gemm_ptrs); + int M = 512; + int N = 256; + int K = 128; + int BatchCount = 3; bool pass = true; - const std::size_t batch_count = 4; - for(auto& gemmPtr : batched_gemm_ptrs) - { - pass &= TestBatchedGemm(batch_count, gemmPtr); - } + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, K, N, N, BatchCount); - std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, K, K, N, BatchCount); + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, M, N, N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, M, K, N, BatchCount); + + std::cout << "test BatchedGEMM fp16: " << (pass ? "Pass" : "Fail") << std::endl; return pass ? 0 : 1; }