diff --git a/cmake/EnableCompilerWarnings.cmake b/cmake/EnableCompilerWarnings.cmake
index 9f193b2090..78133af031 100644
--- a/cmake/EnableCompilerWarnings.cmake
+++ b/cmake/EnableCompilerWarnings.cmake
@@ -66,7 +66,7 @@ else()
             -Wunreachable-code
             -Wunused
 
-            -Wno-sign-compare
+            -Wsign-compare
             -Wno-extra-semi-stmt
         )
         if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "Clang")
diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp
index 293b593902..7ca9823ff5 100644
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -140,7 +140,7 @@ class SimpleAppArgs
 
     int processArgs(int argc, char* argv[])
     {
-        unsigned int ch;
+        int ch;
 
         while(1)
         {
diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp
index 9def6c24fe..a18761095c 100644
--- a/example/13_pool2d_fwd/pool2d_fwd.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd.cpp
@@ -80,8 +80,8 @@ static void pool_host_verify(const Tensor<InDataType>& in,
                 for(int x = 0; x < window_spatial_lengths[1]; ++x)
                 {
                     int wi = wo * window_strides[1] + x - in_left_pads[1];
-                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in.mDesc.GetLengths()[3])
+                    if(hi >= 0 && hi < ck::type_convert<int>(in.mDesc.GetLengths()[2]) && wi >= 0 &&
+                       wi < ck::type_convert<int>(in.mDesc.GetLengths()[3]))
                     {
                         AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
 
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
index 4e9bdbb2f5..29ef01f2ef 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
@@ -131,7 +131,7 @@ int main(int argc, char* argv[])
 
     std::size_t flop = 0, num_btype = 0;
 
-    for(int i = 0; i < gemm_shapes.size(); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
     {
         a_tensors.push_back(Tensor<ADataType>(f_host_tensor_descriptor(
             gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{})));
@@ -168,7 +168,7 @@ int main(int argc, char* argv[])
         }
     }
 
-    for(int i = 0; i < gemm_shapes.size(); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
     {
         a_tensors_device.emplace_back(
             std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpace()));
@@ -213,7 +213,7 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
-        for(int i = 0; i < gemm_shapes.size(); i++)
+        for(std::size_t i = 0; i < gemm_shapes.size(); i++)
         {
             c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());
             auto ref_gemm    = ReferenceGemmInstance{};
diff --git a/include/ck/tensor_description/tensor_descriptor_helper.hpp b/include/ck/tensor_description/tensor_descriptor_helper.hpp
index ad75f9245e..ddc0ede404 100644
--- a/include/ck/tensor_description/tensor_descriptor_helper.hpp
+++ b/include/ck/tensor_description/tensor_descriptor_helper.hpp
@@ -1,6 +1,4 @@
-#ifndef CK_TENSOR_DESCRIPTOR_HELPER_HPP
-#define CK_TENSOR_DESCRIPTOR_HELPER_HPP
-
+#pragma once
 #include "common_header.hpp"
 #include "tensor_descriptor.hpp"
 #include "multi_index_transform_helper.hpp"
@@ -35,6 +33,12 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt
 }
 #endif
 
+// Lengths..., Strides... could be:
+//   1) index_t, which is known at run-time, or
+//   2) Number<>, which is known at compile-time
+// element_space_size could be:
+//   1) long_index_t, or
+//   2) LongNumber<>
 template <typename... Lengths,
           typename... Strides,
           typename enable_if<sizeof...(Lengths) == sizeof...(Strides), bool>::type = false>
@@ -68,10 +72,10 @@ __host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Leng
         }
     };
 
-    const auto element_space_size = f(f, Number<0>{}, Number<1>{});
+    const auto element_space_size = f(f, Number<0>{}, LongNumber<1>{});
 #else
     const auto element_space_size =
-        calculate_element_space_size_impl(lengths, strides, Number<0>{}, Number<1>{});
+        calculate_element_space_size_impl(lengths, strides, Number<0>{}, LongNumber<1>{});
 #endif
 
     return TensorDescriptor<remove_cv_t<decltype(transforms)>,
@@ -82,9 +86,12 @@ __host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Leng
                                                                        element_space_size};
 }
 
-// Lengths... can be:
-//   1) index_t, which is known at run-time
+// Lengths... could be:
+//   1) index_t, which is known at run-time, or
 //   2) Number<>, which is known at compile-time
+// element_space_size could be:
+//   1) long_index_t, or
+//   2) LongNumber<>
 template <typename... Lengths>
 __host__ __device__ constexpr auto
 make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
@@ -100,7 +107,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
 
     constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
 
-    const auto element_space_size = container_reduce(lengths, math::multiplies{}, Number<1>{});
+    const auto element_space_size = container_reduce(lengths, math::multiplies{}, LongNumber<1>{});
 
     return TensorDescriptor<remove_cv_t<decltype(transforms)>,
                             remove_cv_t<decltype(low_dim_hidden_idss)>,
@@ -110,6 +117,12 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
                                                                        element_space_size};
 }
 
+// Lengths... could be:
+//   1) index_t, which is known at run-time, or
+//   2) Number<>, which is known at compile-time
+// align could be:
+//   1) index_t, or
+//   2) Number<>
 template <typename... Lengths, typename Align>
 __host__ __device__ constexpr auto
 make_naive_tensor_descriptor_aligned(const Tuple<Lengths...>& lengths, Align align)
@@ -146,4 +159,3 @@ make_naive_tensor_descriptor_aligned(const Tuple<Lengths...>& lengths, Align ali
 }
 
 } // namespace ck
-#endif
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
index a90bc44fdf..92655b2755 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -635,11 +635,12 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
               d_grid_desc_m_{DeviceOp::MakeDGridDescriptor_M(MRaw)},
               c_grid_desc_mblock_mperblock_nblock_nperblock_{},
               d_grid_desc_mblock_mperblock_{},
-              compute_base_ptr_of_batch_{a_grid_desc_ak0_m_ak1_.GetElementSpaceSize(),
-                                         b_grid_desc_bk0_n_bk1_.GetElementSpaceSize(),
-                                         c_grid_desc_m_n_.GetElementSpaceSize(),
-                                         d_grid_desc_m_.GetElementSpaceSize(),
-                                         d_grid_desc_m_.GetElementSpaceSize()},
+              compute_base_ptr_of_batch_{
+                  type_convert<index_t>(a_grid_desc_ak0_m_ak1_.GetElementSpaceSize()),
+                  type_convert<index_t>(b_grid_desc_bk0_n_bk1_.GetElementSpaceSize()),
+                  type_convert<index_t>(c_grid_desc_m_n_.GetElementSpaceSize()),
+                  type_convert<index_t>(d_grid_desc_m_.GetElementSpaceSize()),
+                  type_convert<index_t>(d_grid_desc_m_.GetElementSpaceSize())},
               block_2_ctile_map_{},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
index 5110e54ad1..88974a5221 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
@@ -384,9 +384,10 @@ struct DeviceBatchedGemmXdl
                   DeviceBatchedGemmXdl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB)},
               c_grid_desc_m_n_{DeviceBatchedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC)},
               c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{},
-              compute_ptr_offset_of_batch_{a_grid_desc_k0_m_k1_.GetElementSpaceSize(),
-                                           b_grid_desc_k0_n_k1_.GetElementSpaceSize(),
-                                           c_grid_desc_m_n_.GetElementSpaceSize()},
+              compute_ptr_offset_of_batch_{
+                  type_convert<index_t>(a_grid_desc_k0_m_k1_.GetElementSpaceSize()),
+                  type_convert<index_t>(b_grid_desc_k0_n_k1_.GetElementSpaceSize()),
+                  type_convert<index_t>(c_grid_desc_m_n_.GetElementSpaceSize())},
               block_2_ctile_map_{},
               M01_{M01},
               N01_{N01},
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
index 5606dad034..fad4ec1ffa 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
@@ -697,7 +697,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         }
 
         // Gridwise GEMM size
-        for(int i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
+        for(std::size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
                                             arg.b_grid_desc_k0_n_k1_container_[i],
diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
index ff267c6cdf..5dca8f9629 100644
--- a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -1412,7 +1412,7 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
         }
 
         // Gridwise GEMM size
-        for(int i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
+        for(std::size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
                                             arg.b_grid_desc_k0_n_k1_container_[i],
diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
index ac62448386..7365f9a3e2 100644
--- a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -861,17 +861,11 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
     static bool IsSupportedArgument(const Argument& arg)
     {
         // Input tensors can't be bigger than 2GB each.
-        constexpr std::size_t GB2 = 2 * 1e9;
+        constexpr ck::long_index_t GB2 = (ck::long_index_t{1} << 31);
 
-        if(arg.a_grid_desc_k0_m_k1_.GetElementSpaceSize() > GB2)
-        {
-            return false;
-        }
-        if(arg.b_grid_desc_k0_n_k1_.GetElementSpaceSize() > GB2)
-        {
-            return false;
-        }
-        if(arg.c_grid_desc_m_n_.GetElementSpaceSize() > GB2)
+        if(arg.a_grid_desc_k0_m_k1_.GetElementSpaceSize() * sizeof(ADataType) > GB2 ||
+           arg.b_grid_desc_k0_n_k1_.GetElementSpaceSize() * sizeof(BDataType) > GB2 ||
+           arg.c_grid_desc_m_n_.GetElementSpaceSize() * sizeof(CDataType) > GB2)
         {
             return false;
         }
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
index b9ad39578d..dfc1ce2715 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
@@ -372,17 +372,18 @@ struct DeviceGroupedGemmXdl
         {
             grid_size_ = 0;
 
-            group_count_ = static_cast<int>(gemm_shapes.size());
+            group_count_ = ck::type_convert<ck::index_t>(gemm_shapes.size());
 
-            if(!(group_count_ == p_a.size() && group_count_ == p_b.size() &&
-                 group_count_ == p_c.size()))
+            if(!(group_count_ == ck::type_convert<ck::index_t>(p_a.size()) &&
+                 group_count_ == ck::type_convert<ck::index_t>(p_b.size()) &&
+                 group_count_ == ck::type_convert<ck::index_t>(p_c.size())))
             {
                 throw std::runtime_error("wrong! group_count_ != P_a/b/c.size");
             }
 
             gemm_desc_kernel_arg_.reserve(group_count_);
 
-            for(index_t i = 0; i < gemm_shapes.size(); i++)
+            for(std::size_t i = 0; i < gemm_shapes.size(); i++)
             {
                 const index_t M = gemm_shapes[i].M;
                 const index_t N = gemm_shapes[i].N;
@@ -563,7 +564,7 @@ struct DeviceGroupedGemmXdl
 
     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(arg.gemm_desc_kernel_arg_.size() != arg.group_count_)
+        if(ck::type_convert<ck::index_t>(arg.gemm_desc_kernel_arg_.size()) != arg.group_count_)
             return false;
         else
             return true;
diff --git a/include/ck/utility/number.hpp b/include/ck/utility/number.hpp
index 6f262a4d9f..97a71f8a41 100644
--- a/include/ck/utility/number.hpp
+++ b/include/ck/utility/number.hpp
@@ -8,5 +8,8 @@ namespace ck {
 template <index_t N>
 using Number = integral_constant<index_t, N>;
 
+template <index_t N>
+using LongNumber = integral_constant<long_index_t, N>;
+
 } // namespace ck
 #endif
diff --git a/include/ck/utility/static_buffer.hpp b/include/ck/utility/static_buffer.hpp
index f36328fa5f..1a59f3c81e 100644
--- a/include/ck/utility/static_buffer.hpp
+++ b/include/ck/utility/static_buffer.hpp
@@ -158,5 +158,11 @@ __host__ __device__ constexpr auto make_static_buffer(Number<N>)
     return StaticBuffer<AddressSpace, T, N, true>{};
 }
 
+template <AddressSpaceEnum AddressSpace, typename T, long_index_t N>
+__host__ __device__ constexpr auto make_static_buffer(LongNumber<N>)
+{
+    return StaticBuffer<AddressSpace, T, N, true>{};
+}
+
 } // namespace ck
 #endif
diff --git a/library/include/ck/library/host_tensor/host_reduction.hpp b/library/include/ck/library/host_tensor/host_reduction.hpp
index 786d34b73a..b67f794505 100644
--- a/library/include/ck/library/host_tensor/host_reduction.hpp
+++ b/library/include/ck/library/host_tensor/host_reduction.hpp
@@ -211,7 +211,8 @@ struct ReductionHost
             AccDataType accuVal     = ReduceOpZeroVal<AccDataType, ReduceOpId>();
             IndexDataType accuIndex = 0;
 
-            for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
+            for(IndexDataType i = 0; i < ck::type_convert<IndexDataType>(reduce_dim_indexes.size());
+                i++)
             {
                 auto offset_reduce =
                     get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
@@ -246,7 +247,9 @@ struct ReductionHost
                 auto offset_invariant =
                     get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
 
-                for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
+                for(IndexDataType i = 0;
+                    i < ck::type_convert<IndexDataType>(reduce_dim_indexes.size());
+                    i++)
                 {
                     auto offset_reduce =
                         get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
diff --git a/library/include/ck/library/host_tensor/host_tensor.hpp b/library/include/ck/library/host_tensor/host_tensor.hpp
index 0d4c9f73d4..ad6aeecb50 100644
--- a/library/include/ck/library/host_tensor/host_tensor.hpp
+++ b/library/include/ck/library/host_tensor/host_tensor.hpp
@@ -154,7 +154,7 @@ struct ParallelTensorFunctor
     {
         std::array<std::size_t, NDIM> indices;
 
-        for(int idim = 0; idim < NDIM; ++idim)
+        for(std::size_t idim = 0; idim < NDIM; ++idim)
         {
             indices[idim] = i / mStrides[idim];
             i -= indices[idim] * mStrides[idim];
@@ -316,7 +316,7 @@ float check_error(const Tensor<T>& ref, const Tensor<T>& result)
 
     constexpr float eps = 1e-10;
 
-    for(int i = 0; i < ref.mData.size(); ++i)
+    for(std::size_t i = 0; i < ref.mData.size(); ++i)
     {
         float ref_v    = ck::type_convert<float>(ref.mData[i]);
         float result_v = ck::type_convert<float>(result.mData[i]);
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
index 70f9e3617e..c5f3cbad69 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
@@ -70,18 +70,25 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
             constexpr auto I1 = Number<1>{};
             auto f_kcyx       = [&](auto k, auto c, auto y, auto x) {
                 float v_acc = 0;
-                for(int n = 0; n < arg.out_n_k_ho_wo_.mDesc.GetLengths()[0]; ++n)
+                for(std::size_t n = 0; n < arg.out_n_k_ho_wo_.mDesc.GetLengths()[0]; ++n)
                 {
-                    for(int ho = 0; ho < arg.out_n_k_ho_wo_.mDesc.GetLengths()[2]; ++ho)
+                    for(std::size_t ho = 0; ho < arg.out_n_k_ho_wo_.mDesc.GetLengths()[2]; ++ho)
                     {
-                        int hi = ho * arg.conv_strides_[I0] + y * arg.conv_dilations_[I0] -
-                                 arg.in_left_pads_[I0];
-                        for(int wo = 0; wo < arg.out_n_k_ho_wo_.mDesc.GetLengths()[3]; ++wo)
+                        auto hi = ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[I0]) +
+                                  ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[I0]) -
+                                  ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I0]);
+                        for(std::size_t wo = 0; wo < arg.out_n_k_ho_wo_.mDesc.GetLengths()[3]; ++wo)
                         {
-                            int wi = wo * arg.conv_strides_[I1] + x * arg.conv_dilations_[I1] -
-                                     arg.in_left_pads_[I1];
-                            if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
-                               wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
+                            auto wi =
+                                ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[I1]) +
+                                ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[I1]) -
+                                ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I1]);
+                            if(hi >= 0 &&
+                               ck::type_convert<std::size_t>(hi) <
+                                   arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] &&
+                               wi >= 0 &&
+                               ck::type_convert<std::size_t>(wi) <
+                                   arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
                             {
                                 float v_out;
                                 float v_in;
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
index 0f210a23e1..9e91f06e7f 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
@@ -78,15 +78,18 @@ struct ReferenceConvBwdData : public device::BaseOperator
 
                     AccDataType v_acc = 0;
 
-                    for(int x = 0; x < X; ++x)
+                    for(std::size_t x = 0; x < X; ++x)
                     {
-                        int w_tmp = wi + arg.in_left_pads_[0] - x * arg.conv_dilations_[0];
+                        auto w_tmp = ck::type_convert<ck::long_index_t>(wi) +
+                                     ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]) -
+                                     ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[0]);
                         if(w_tmp % arg.conv_strides_[0] == 0)
                         {
-                            int wo = w_tmp / arg.conv_strides_[0];
-                            if(wo >= 0 && wo < Wo)
+                            auto wo = ck::type_convert<ck::long_index_t>(w_tmp) /
+                                      ck::type_convert<ck::long_index_t>(arg.conv_strides_[0]);
+                            if(wo >= 0 && ck::type_convert<std::size_t>(wo) < Wo)
                             {
-                                for(int k = 0; k < K; ++k)
+                                for(std::size_t k = 0; k < K; ++k)
                                 {
                                     AccDataType v_out = 0;
                                     AccDataType v_wei = 0;
@@ -128,24 +131,32 @@ struct ReferenceConvBwdData : public device::BaseOperator
 
                     AccDataType v_acc = 0;
 
-                    for(int y = 0; y < Y; ++y)
+                    for(std::size_t y = 0; y < Y; ++y)
                     {
-                        int h_tmp = hi + arg.in_left_pads_[0] - y * arg.conv_dilations_[0];
+                        auto h_tmp = ck::type_convert<ck::long_index_t>(hi) +
+                                     ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]) -
+                                     ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[0]);
                         if(h_tmp % arg.conv_strides_[0] == 0)
                         {
-                            int ho = h_tmp / arg.conv_strides_[0];
-                            if(ho >= 0 && ho < Ho)
+                            auto ho = ck::type_convert<ck::long_index_t>(h_tmp) /
+                                      ck::type_convert<ck::long_index_t>(arg.conv_strides_[0]);
+                            if(ho >= 0 && ck::type_convert<std::size_t>(ho) < Ho)
                             {
-                                for(int x = 0; x < X; ++x)
+                                for(std::size_t x = 0; x < X; ++x)
                                 {
-                                    int w_tmp =
-                                        wi + arg.in_left_pads_[1] - x * arg.conv_dilations_[1];
+                                    auto w_tmp =
+                                        ck::type_convert<ck::long_index_t>(wi) +
+                                        ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]) -
+                                        ck::type_convert<ck::long_index_t>(x *
+                                                                           arg.conv_dilations_[1]);
                                     if(w_tmp % arg.conv_strides_[1] == 0)
                                     {
-                                        int wo = w_tmp / arg.conv_strides_[1];
-                                        if(wo >= 0 && wo < Wo)
+                                        auto wo = ck::type_convert<ck::long_index_t>(w_tmp) /
+                                                  ck::type_convert<ck::long_index_t>(
+                                                      arg.conv_strides_[1]);
+                                        if(wo >= 0 && ck::type_convert<std::size_t>(wo) < Wo)
                                         {
-                                            for(int k = 0; k < K; ++k)
+                                            for(std::size_t k = 0; k < K; ++k)
                                             {
                                                 AccDataType v_out = 0;
                                                 AccDataType v_wei = 0;
@@ -194,33 +205,49 @@ struct ReferenceConvBwdData : public device::BaseOperator
 
                     AccDataType v_acc = 0;
 
-                    for(int z = 0; z < Z; ++z)
+                    for(std::size_t z = 0; z < Z; ++z)
                     {
-                        int d_tmp = di + arg.in_left_pads_[0] - z * arg.conv_dilations_[0];
+                        auto d_tmp = ck::type_convert<ck::long_index_t>(di) +
+                                     ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]) -
+                                     ck::type_convert<ck::long_index_t>(z * arg.conv_dilations_[0]);
                         if(d_tmp % arg.conv_strides_[0] == 0)
                         {
-                            int do_ = d_tmp / arg.conv_strides_[0];
-                            if(do_ >= 0 && do_ < Do)
+                            auto do_ = ck::type_convert<ck::long_index_t>(d_tmp) /
+                                       ck::type_convert<ck::long_index_t>(arg.conv_strides_[0]);
+                            if(do_ >= 0 && ck::type_convert<std::size_t>(do_) < Do)
                             {
-                                for(int y = 0; y < Y; ++y)
+                                for(std::size_t y = 0; y < Y; ++y)
                                 {
-                                    int h_tmp =
-                                        hi + arg.in_left_pads_[1] - y * arg.conv_dilations_[1];
+                                    auto h_tmp =
+                                        ck::type_convert<ck::long_index_t>(hi) +
+                                        ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]) -
+                                        ck::type_convert<ck::long_index_t>(y *
+                                                                           arg.conv_dilations_[1]);
                                     if(h_tmp % arg.conv_strides_[1] == 0)
                                     {
-                                        int ho = h_tmp / arg.conv_strides_[1];
-                                        if(ho >= 0 && ho < Ho)
+                                        auto ho = ck::type_convert<ck::long_index_t>(h_tmp) /
+                                                  ck::type_convert<ck::long_index_t>(
+                                                      arg.conv_strides_[1]);
+                                        if(ho >= 0 && ck::type_convert<std::size_t>(ho) < Ho)
                                         {
-                                            for(int x = 0; x < X; ++x)
+                                            for(std::size_t x = 0; x < X; ++x)
                                             {
-                                                int w_tmp = wi + arg.in_left_pads_[2] -
-                                                            x * arg.conv_dilations_[2];
+                                                auto w_tmp =
+                                                    ck::type_convert<ck::long_index_t>(wi) +
+                                                    ck::type_convert<ck::long_index_t>(
+                                                        arg.in_left_pads_[2]) -
+                                                    ck::type_convert<ck::long_index_t>(
+                                                        x * arg.conv_dilations_[2]);
                                                 if(w_tmp % arg.conv_strides_[2] == 0)
                                                 {
-                                                    int wo = w_tmp / arg.conv_strides_[2];
-                                                    if(wo >= 0 && wo < Wo)
+                                                    auto wo =
+                                                        ck::type_convert<ck::long_index_t>(w_tmp) /
+                                                        ck::type_convert<ck::long_index_t>(
+                                                            arg.conv_strides_[2]);
+                                                    if(wo >= 0 &&
+                                                       ck::type_convert<std::size_t>(wo) < Wo)
                                                     {
-                                                        for(int k = 0; k < K; ++k)
+                                                        for(std::size_t k = 0; k < K; ++k)
                                                         {
                                                             AccDataType v_out = 0;
                                                             AccDataType v_wei = 0;
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
index 0095d51a5b..65e59db2f8 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
@@ -88,13 +88,16 @@ struct ReferenceConvFwd : public device::BaseOperator
                 auto f_ncw = [&](auto n, auto k, auto wo) {
                     float v_acc = 0;
 
-                    for(int c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c)
+                    for(std::size_t c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c)
                     {
-                        for(int x = 0; x < arg.weight_.mDesc.GetLengths()[2]; ++x)
+                        for(std::size_t x = 0; x < arg.weight_.mDesc.GetLengths()[2]; ++x)
                         {
-                            int wi = wo * arg.conv_strides_[0] + x * arg.conv_dilations_[0] -
-                                     arg.in_left_pads_[0];
-                            if(wi >= 0 && wi < arg.input_.mDesc.GetLengths()[2])
+                            auto wi =
+                                ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[0]) +
+                                ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[0]) -
+                                ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
+                            if(wi >= 0 &&
+                               ck::type_convert<std::size_t>(wi) < arg.input_.mDesc.GetLengths()[2])
                             {
                                 float v_in;
                                 float v_wei;
@@ -128,18 +131,26 @@ struct ReferenceConvFwd : public device::BaseOperator
                 auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
                     float v_acc = 0;
 
-                    for(int c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c)
+                    for(std::size_t c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c)
                     {
-                        for(int y = 0; y < arg.weight_.mDesc.GetLengths()[2]; ++y)
+                        for(std::size_t y = 0; y < arg.weight_.mDesc.GetLengths()[2]; ++y)
                         {
-                            int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
-                                     arg.in_left_pads_[0];
-                            for(int x = 0; x < arg.weight_.mDesc.GetLengths()[3]; ++x)
+                            auto hi =
+                                ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[0]) +
+                                ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[0]) -
+                                ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
+                            for(std::size_t x = 0; x < arg.weight_.mDesc.GetLengths()[3]; ++x)
                             {
-                                int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
-                                         arg.in_left_pads_[1];
-                                if(hi >= 0 && hi < arg.input_.mDesc.GetLengths()[2] && wi >= 0 &&
-                                   wi < arg.input_.mDesc.GetLengths()[3])
+                                auto wi =
+                                    ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[1]) +
+                                    ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[1]) -
+                                    ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]);
+                                if(hi >= 0 &&
+                                   ck::type_convert<std::size_t>(hi) <
+                                       arg.input_.mDesc.GetLengths()[2] &&
+                                   wi >= 0 &&
+                                   ck::type_convert<std::size_t>(wi) <
+                                       arg.input_.mDesc.GetLengths()[3])
                                 {
                                     float v_in;
                                     float v_wei;
@@ -174,23 +185,37 @@ struct ReferenceConvFwd : public device::BaseOperator
                 auto f_nchw = [&](auto n, auto k, auto d_o, auto ho, auto wo) {
                     float v_acc = 0;
 
-                    for(int c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c)
+                    for(std::size_t c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c)
                     {
-                        for(int z = 0; z < arg.weight_.mDesc.GetLengths()[2]; ++z)
+                        for(std::size_t z = 0; z < arg.weight_.mDesc.GetLengths()[2]; ++z)
                         {
-                            int di = d_o * arg.conv_strides_[0] + z * arg.conv_dilations_[0] -
-                                     arg.in_left_pads_[0];
-                            for(int y = 0; y < arg.weight_.mDesc.GetLengths()[3]; ++y)
+                            auto di =
+                                ck::type_convert<ck::long_index_t>(d_o * arg.conv_strides_[0]) +
+                                ck::type_convert<ck::long_index_t>(z * arg.conv_dilations_[0]) -
+                                ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
+                            for(std::size_t y = 0; y < arg.weight_.mDesc.GetLengths()[3]; ++y)
                             {
-                                int hi = ho * arg.conv_strides_[1] + y * arg.conv_dilations_[1] -
-                                         arg.in_left_pads_[1];
-                                for(int x = 0; x < arg.weight_.mDesc.GetLengths()[4]; ++x)
+                                auto hi =
+                                    ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[1]) +
+                                    ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[1]) -
+                                    ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]);
+                                for(std::size_t x = 0; x < arg.weight_.mDesc.GetLengths()[4]; ++x)
                                 {
-                                    int wi = wo * arg.conv_strides_[2] +
-                                             x * arg.conv_dilations_[2] - arg.in_left_pads_[2];
-                                    if(di >= 0 && di < arg.input_.mDesc.GetLengths()[2] &&
-                                       hi >= 0 && hi < arg.input_.mDesc.GetLengths()[3] &&
-                                       wi >= 0 && wi < arg.input_.mDesc.GetLengths()[4])
+                                    auto wi =
+                                        ck::type_convert<ck::long_index_t>(wo *
+                                                                           arg.conv_strides_[2]) +
+                                        ck::type_convert<ck::long_index_t>(x *
+                                                                           arg.conv_dilations_[2]) -
+                                        ck::type_convert<ck::long_index_t>(arg.in_left_pads_[2]);
+                                    if(di >= 0 &&
+                                       ck::type_convert<std::size_t>(di) <
+                                           arg.input_.mDesc.GetLengths()[2] &&
+                                       hi >= 0 &&
+                                       ck::type_convert<std::size_t>(hi) <
+                                           arg.input_.mDesc.GetLengths()[3] &&
+                                       wi >= 0 &&
+                                       ck::type_convert<std::size_t>(wi) <
+                                           arg.input_.mDesc.GetLengths()[4])
                                     {
                                         float v_in;
                                         float v_wei;
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
index 8f49b79a1a..ee95cd410a 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
@@ -73,18 +73,25 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
             auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
                 float v_acc = 0;
 
-                for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
+                for(std::size_t c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
                 {
-                    for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
+                    for(std::size_t y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
                     {
-                        int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
-                                 arg.in_left_pads_[0];
-                        for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
+                        auto hi = ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[0]) +
+                                  ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[0]) -
+                                  ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
+                        for(std::size_t x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
                         {
-                            int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
-                                     arg.in_left_pads_[1];
-                            if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
-                               wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
+                            auto wi =
+                                ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[1]) +
+                                ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[1]) -
+                                ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]);
+                            if(hi >= 0 &&
+                               ck::type_convert<std::size_t>(hi) <
+                                   arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] &&
+                               wi >= 0 &&
+                               ck::type_convert<std::size_t>(wi) <
+                                   arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
                             {
                                 float v_in;
                                 float v_wei;
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
index e4e0899416..11232cc98f 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
@@ -76,18 +76,25 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
             auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
                 float v_acc = 0;
 
-                for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
+                for(std::size_t c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
                 {
-                    for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
+                    for(std::size_t y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
                     {
-                        int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
-                                 arg.in_left_pads_[0];
-                        for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
+                        auto hi = ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[0]) +
+                                  ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[0]) -
+                                  ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
+                        for(std::size_t x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
                         {
-                            int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
-                                     arg.in_left_pads_[1];
-                            if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
-                               wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
+                            auto wi =
+                                ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[1]) +
+                                ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[1]) -
+                                ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]);
+                            if(hi >= 0 &&
+                               ck::type_convert<std::size_t>(hi) <
+                                   arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] &&
+                               wi >= 0 &&
+                               ck::type_convert<std::size_t>(wi) <
+                                   arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
                             {
                                 float v_in;
                                 float v_wei;
diff --git a/library/src/host_tensor/host_tensor.cpp b/library/src/host_tensor/host_tensor.cpp
index 38b0796635..138e3fc254 100644
--- a/library/src/host_tensor/host_tensor.cpp
+++ b/library/src/host_tensor/host_tensor.cpp
@@ -25,7 +25,7 @@ std::size_t HostTensorDescriptor::GetElementSize() const
 std::size_t HostTensorDescriptor::GetElementSpace() const
 {
     std::size_t space = 1;
-    for(int i = 0; i < mLens.size(); ++i)
+    for(std::size_t i = 0; i < mLens.size(); ++i)
     {
         space += (mLens[i] - 1) * mStrides[i];
     }
@@ -68,7 +68,7 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream
 // FIXME: remove
 void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst)
 {
-    for(int i = 0; i < src.mData.size(); ++i)
+    for(std::size_t i = 0; i < src.mData.size(); ++i)
         dst.mData[i] = ck::type_convert<float>(src.mData[i]);
 }
 #endif
diff --git a/library/src/utility/conv_fwd_util.cpp b/library/src/utility/conv_fwd_util.cpp
index 1658450388..01bfeda16d 100644
--- a/library/src/utility/conv_fwd_util.cpp
+++ b/library/src/utility/conv_fwd_util.cpp
@@ -71,11 +71,12 @@ ConvParams::ConvParams(ck::index_t n_dim,
       input_left_pads(left_pads),
       input_right_pads(right_pads)
 {
-    if(filter_spatial_lengths.size() != num_dim_spatial ||
-       input_spatial_lengths.size() != num_dim_spatial ||
-       conv_filter_strides.size() != num_dim_spatial ||
-       conv_filter_dilations.size() != num_dim_spatial ||
-       input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial)
+    if(ck::type_convert<ck::index_t>(filter_spatial_lengths.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(input_spatial_lengths.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(conv_filter_strides.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(conv_filter_dilations.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(input_left_pads.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(input_right_pads.size()) != num_dim_spatial)
     {
         throw(
             std::runtime_error("ConvParams::GetOutputSpatialLengths: "
@@ -85,11 +86,12 @@ ConvParams::ConvParams(ck::index_t n_dim,
 
 std::vector<ck::index_t> ConvParams::GetOutputSpatialLengths() const
 {
-    if(filter_spatial_lengths.size() != num_dim_spatial ||
-       input_spatial_lengths.size() != num_dim_spatial ||
-       conv_filter_strides.size() != num_dim_spatial ||
-       conv_filter_dilations.size() != num_dim_spatial ||
-       input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial)
+    if(ck::type_convert<ck::index_t>(filter_spatial_lengths.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(input_spatial_lengths.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(conv_filter_strides.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(conv_filter_dilations.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(input_left_pads.size()) != num_dim_spatial ||
+       ck::type_convert<ck::index_t>(input_right_pads.size()) != num_dim_spatial)
     {
         throw(
             std::runtime_error("ConvParams::GetOutputSpatialLengths: "
diff --git a/profiler/include/profile_convnd_bwd_data_impl.hpp b/profiler/include/profile_convnd_bwd_data_impl.hpp
index 4f9038a72b..c9051f006f 100644
--- a/profiler/include/profile_convnd_bwd_data_impl.hpp
+++ b/profiler/include/profile_convnd_bwd_data_impl.hpp
@@ -222,7 +222,7 @@ static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
 {
     float max_diff = 1e-6;
 
-    for(int i = 0; i < ref.mData.size(); ++i)
+    for(std::size_t i = 0; i < ref.mData.size(); ++i)
     {
         float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
         if(max_diff < diff)
@@ -236,16 +236,16 @@ template <typename DataType>
 void show_data_nhwc_layout(Tensor<DataType>& nhwc)
 {
     std::cout << "[";
-    for(int n = 0; n < nhwc.mDesc.GetLengths()[0]; n++)
+    for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
     {
         std::cout << "[";
-        for(int hi = 0; hi < nhwc.mDesc.GetLengths()[2]; hi++)
+        for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
         {
             std::cout << "[";
-            for(int wi = 0; wi < nhwc.mDesc.GetLengths()[3]; wi++)
+            for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
             {
                 std::cout << "[";
-                for(int c = 0; c < nhwc.mDesc.GetLengths()[1]; c++)
+                for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
                 {
                     std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << "  ";
                 }
diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp
index cced480c36..ae70f551f1 100644
--- a/profiler/include/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profile_grouped_gemm_impl.hpp
@@ -50,12 +50,12 @@ void profile_grouped_gemm_impl(int do_verification,
                                int init_method,
                                bool do_log,
                                int nrepeat,
-                               std::vector<int> Ms,
-                               std::vector<int> Ns,
-                               std::vector<int> Ks,
-                               std::vector<int> StrideAs,
-                               std::vector<int> StrideBs,
-                               std::vector<int> StrideCs)
+                               const std::vector<int>& Ms,
+                               const std::vector<int>& Ns,
+                               const std::vector<int>& Ks,
+                               const std::vector<int>& StrideAs,
+                               const std::vector<int>& StrideBs,
+                               const std::vector<int>& StrideCs)
 {
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
@@ -71,7 +71,7 @@ void profile_grouped_gemm_impl(int do_verification,
             }
         };
 
-    int group_count = Ms.size();
+    std::size_t group_count = Ms.size();
 
     if(!(group_count == Ns.size() && group_count == Ks.size() && group_count == StrideAs.size() &&
          group_count == StrideBs.size() && group_count == StrideCs.size()))
@@ -83,7 +83,7 @@ void profile_grouped_gemm_impl(int do_verification,
     std::vector<Tensor<BDataType>> b_k_n;
     std::vector<Tensor<CDataType>> c_m_n_device_results;
 
-    for(int i = 0; i < Ms.size(); i++)
+    for(std::size_t i = 0; i < group_count; i++)
     {
         a_m_k.push_back(
             Tensor<ADataType>(f_host_tensor_descriptor(Ms[i], Ks[i], StrideAs[i], ALayout{})));
@@ -144,7 +144,7 @@ void profile_grouped_gemm_impl(int do_verification,
 
     gemm_shapes.reserve(group_count);
 
-    for(int i = 0; i < group_count; i++)
+    for(std::size_t i = 0; i < group_count; i++)
     {
         a_device_buf.emplace_back(
             std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpace()));
@@ -234,7 +234,7 @@ void profile_grouped_gemm_impl(int do_verification,
             float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
 
             std::size_t flop = 0, num_btype = 0;
-            for(int i = 0; i < gemm_shapes.size(); i++)
+            for(std::size_t i = 0; i < gemm_shapes.size(); i++)
             {
                 flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
 
@@ -258,7 +258,7 @@ void profile_grouped_gemm_impl(int do_verification,
 
             if(do_verification)
             {
-                for(int i = 0; i < gemm_shapes.size(); i++)
+                for(std::size_t i = 0; i < gemm_shapes.size(); i++)
                 {
 
                     c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp
index c6dea1e385..96fa78964a 100644
--- a/profiler/src/profile_reduce.cpp
+++ b/profiler/src/profile_reduce.cpp
@@ -186,7 +186,7 @@ class AppArgs
 
     int processArgs(int argc, char* argv[])
     {
-        unsigned int ch;
+        int ch;
 
         optind++; // to skip the "reduce" module name
 
diff --git a/test/gemm_split_k/gemm_split_k.cpp b/test/gemm_split_k/gemm_split_k.cpp
index a3d4f9b2ec..c788b66aa3 100644
--- a/test/gemm_split_k/gemm_split_k.cpp
+++ b/test/gemm_split_k/gemm_split_k.cpp
@@ -45,7 +45,7 @@ static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
 {
     float max_diff = 1e-6;
 
-    for(int i = 0; i < ref.mData.size(); ++i)
+    for(std::size_t i = 0; i < ref.mData.size(); ++i)
     {
         float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
         if(max_diff < diff)
diff --git a/test/grouped_gemm/grouped_gemm_fp16.cpp b/test/grouped_gemm/grouped_gemm_fp16.cpp
index 2260b01462..ef131ed867 100644
--- a/test/grouped_gemm/grouped_gemm_fp16.cpp
+++ b/test/grouped_gemm/grouped_gemm_fp16.cpp
@@ -104,7 +104,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
     b_tensors_device.reserve(group_count);
     c_tensors_device.reserve(group_count);
 
-    for(int i = 0; i < gemm_shapes.size(); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
     {
         a_tensors.emplace_back(Tensor<ADataType>(f_host_tensor_descriptor(
             gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{})));
@@ -119,7 +119,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
         b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
     }
 
-    for(int i = 0; i < gemm_shapes.size(); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
     {
         a_tensors_device.emplace_back(
             std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize()));
@@ -147,7 +147,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
 
     invoker_ptr->Run(argument_ptr.get());
 
-    for(int i = 0; i < gemm_shapes.size(); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
     {
         c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());
 
diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp
index 28370cb2cd..317abab53a 100644
--- a/test/reduce/reduce_no_index.cpp
+++ b/test/reduce/reduce_no_index.cpp
@@ -460,7 +460,7 @@ class SimpleAppArgs
 
     int processArgs(int argc, char* argv[])
     {
-        unsigned int ch;
+        int ch;
 
         while(1)
         {
diff --git a/test/reduce/reduce_util.hpp b/test/reduce/reduce_util.hpp
index e9a7b4896e..9eb66513bf 100644
--- a/test/reduce/reduce_util.hpp
+++ b/test/reduce/reduce_util.hpp
@@ -9,7 +9,7 @@ namespace reduce_util {
 template <typename T>
 void to_f32_vector(const Tensor<T>& src, Tensor<float>& dst)
 {
-    for(int i = 0; i < src.mData.size(); ++i)
+    for(std::size_t i = 0; i < src.mData.size(); ++i)
         dst.mData[i] = type_convert<float>(src.mData[i]);
 }
 
diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp
index 667b84a8dc..d7d5e551a2 100644
--- a/test/reduce/reduce_with_index.cpp
+++ b/test/reduce/reduce_with_index.cpp
@@ -463,7 +463,7 @@ class SimpleAppArgs
 
     int processArgs(int argc, char* argv[])
     {
-        unsigned int ch;
+        int ch;
 
         while(1)
         {