diff --git a/CMakeLists.txt b/CMakeLists.txt index 12bf7ae562..c3c0eab529 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,10 @@ link_libraries(${OpenMP_pthread_LIBRARY}) find_package(HIP REQUIRED) message(STATUS "Build with HIP ${hip_VERSION}") +## half +#find_path(HALF_INCLUDE_DIR half.hpp) +#message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}") + ## tidy include(EnableCompilerWarnings) set(MIOPEN_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name) diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp index 3a8883b460..09b39ab2ca 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp @@ -203,9 +203,6 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 __host__ __device__ static constexpr auto MakeCM0M1M2NGridDescriptor(const CMNGridDesc& c_m_n_grid_desc) { - const auto M = c_m_n_grid_desc.GetLength(I0); - const auto N = c_m_n_grid_desc.GetLength(I1); - constexpr auto xdlops_gemm = XdlopsGemm{}; constexpr auto CLayout = xdlops_gemm.GetCLayout(); @@ -217,7 +214,6 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 constexpr index_t MWaves = MPerBlock / (MPerWave * MRepeat); constexpr index_t NWaves = NPerBlock / (NPerWave * NRepeat); - constexpr auto N0 = Number{}; constexpr auto N1 = Number{}; const auto c_m0_m1_m2_n_grid_desc = transform_dynamic_tensor_descriptor( @@ -277,8 +273,6 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 p_c_grid, c_m0_m1_m2_n_grid_desc.GetElementSpaceSize()); const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0); - const auto M = a_k0_m_k1_grid_desc.GetLength(I1); - const auto N = b_k0_n_k1_grid_desc.GetLength(I1); // divide block work by [M, N] const auto block_work_idx = diff --git 
a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp index 49e0223b33..187a05554b 100644 --- a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp @@ -35,11 +35,6 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyx constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - constexpr auto I8 = Number<8>{}; DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); @@ -319,16 +314,13 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyx const auto K = out_n_ho_wo_k_lengths[I3]; const auto C = wei_k_y_x_c_lengths[I3]; - const auto Hi = in_n_hi_wi_c_lengths[I1]; - const auto Wi = in_n_hi_wi_c_lengths[I2]; - const auto Ho = out_n_ho_wo_k_lengths[I1]; const auto Wo = out_n_ho_wo_k_lengths[I2]; const auto Y = wei_k_y_x_c_lengths[I1]; const auto X = wei_k_y_x_c_lengths[I2]; - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + float perf = static_cast<float>((std::size_t(2) * N * K * Ho * Wo * C * Y * X)) / (std::size_t(1000) * 1000 * 1000) / ave_time; std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" diff --git a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp 
b/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp index ce4dd155f6..e1c6db8045 100644 --- a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp @@ -35,11 +35,6 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_k constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - constexpr auto I8 = Number<8>{}; DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); @@ -304,7 +299,7 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_k const auto Y = wei_k_y_x_c_lengths[I1]; const auto X = wei_k_y_x_c_lengths[I2]; - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + float perf = static_cast<float>((std::size_t(2) * N * K * Ho * Wo * C * Y * X)) / (std::size_t(1000) * 1000 * 1000) / ave_time; std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" diff --git a/host/driver_offline/src/conv_bwd_driver_offline.cpp b/host/driver_offline/src/conv_bwd_driver_offline.cpp index 3d1faaaf66..828bbae2bd 100644 --- a/host/driver_offline/src/conv_bwd_driver_offline.cpp +++ b/host/driver_offline/src/conv_bwd_driver_offline.cpp @@ -277,8 +277,6 @@ int main(int argc, char* argv[]) in_right_pads_dev); }; - const auto nhwc_desc = f_make_for_device_nhwc(); - #if USE_CONV_BWD_V4R1_XDL_NHWC if(algo == ConvBackwardDataAlgo::V4R1XDLNHWC) { diff --git a/host/driver_offline/src/conv_fwd_driver_offline.cpp b/host/driver_offline/src/conv_fwd_driver_offline.cpp 
index 4aac2b5e4f..42a2d2f681 100644 --- a/host/driver_offline/src/conv_fwd_driver_offline.cpp +++ b/host/driver_offline/src/conv_fwd_driver_offline.cpp @@ -20,12 +20,12 @@ #include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp" #define USE_DYNAMIC_MODE 1 -#define USE_CONV_FWD_V4R4_NCHW 1 -#define USE_CONV_FWD_V4R4R2_NHWC 1 -#define USE_CONV_FWD_V6R1_NCHW 1 +#define USE_CONV_FWD_V4R4_NCHW 0 +#define USE_CONV_FWD_V4R4R2_NHWC 0 +#define USE_CONV_FWD_V6R1_NCHW 0 #define USE_CONV_FWD_V5R1_NCHW 0 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0 -#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0 +#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1 enum ConvForwardAlgo { diff --git a/host/host_tensor/CMakeLists.txt b/host/host_tensor/CMakeLists.txt index 9c30275220..90249fc664 100644 --- a/host/host_tensor/CMakeLists.txt +++ b/host/host_tensor/CMakeLists.txt @@ -10,6 +10,8 @@ set(HOST_TENSOR_SOURCE ## the library target add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE}) +#target_include_directories(host_tensor SYSTEM PUBLIC $) + target_link_libraries(host_tensor PRIVATE hip::device) target_link_libraries(host_tensor INTERFACE hip::host) diff --git a/host/host_tensor/include/host_conv_bwd_data.hpp b/host/host_tensor/include/host_conv_bwd_data.hpp index 07617c3926..ca23422e23 100644 --- a/host/host_tensor/include/host_conv_bwd_data.hpp +++ b/host/host_tensor/include/host_conv_bwd_data.hpp @@ -14,7 +14,7 @@ void host_direct_convolution_backward_data(Tensor& in, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, + const InRightPads& /* in_right_pads */, const ConvTensorLayout layout = ConvTensorLayout::NCHW) { using namespace ck; @@ -25,11 +25,6 @@ void host_direct_convolution_backward_data(Tensor& in, constexpr auto I3 = Number<3>{}; auto f_nchw = [&](auto n, auto c, auto hi, auto wi) { - std::size_t N = in.mDesc.GetLengths()[I0]; - std::size_t C = in.mDesc.GetLengths()[I1]; - 
std::size_t Hi = in.mDesc.GetLengths()[I2]; - std::size_t Wi = in.mDesc.GetLengths()[I3]; - std::size_t K = wei.mDesc.GetLengths()[I0]; std::size_t Y = wei.mDesc.GetLengths()[I2]; std::size_t X = wei.mDesc.GetLengths()[I3]; @@ -74,11 +69,6 @@ void host_direct_convolution_backward_data(Tensor& in, }; auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) { - std::size_t N = in.mDesc.GetLengths()[I0]; - std::size_t Hi = in.mDesc.GetLengths()[I1]; - std::size_t Wi = in.mDesc.GetLengths()[I2]; - std::size_t C = in.mDesc.GetLengths()[I3]; - std::size_t K = wei.mDesc.GetLengths()[I0]; std::size_t Y = wei.mDesc.GetLengths()[I1]; std::size_t X = wei.mDesc.GetLengths()[I2]; @@ -122,22 +112,24 @@ void host_direct_convolution_backward_data(Tensor& in, in(n, hi, wi, c) = v; }; - switch(layout) + if(layout == ConvTensorLayout::NCHW) { - case ConvTensorLayout::NCHW: make_ParallelTensorFunctor(f_nchw, in.mDesc.GetLengths()[0], in.mDesc.GetLengths()[1], in.mDesc.GetLengths()[2], in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); - break; - case ConvTensorLayout::NHWC: + } + else if(layout == ConvTensorLayout::NHWC) + { make_ParallelTensorFunctor(f_nhwc, in.mDesc.GetLengths()[0], in.mDesc.GetLengths()[1], in.mDesc.GetLengths()[2], in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); - break; - default: throw std::runtime_error("wrong! not supported layout"); + } + else + { + throw std::runtime_error("wrong! not supported layout"); } }