diff --git a/cmake/EnableCompilerWarnings.cmake b/cmake/EnableCompilerWarnings.cmake index 9f193b2090..78133af031 100644 --- a/cmake/EnableCompilerWarnings.cmake +++ b/cmake/EnableCompilerWarnings.cmake @@ -66,7 +66,7 @@ else() -Wunreachable-code -Wunused - -Wno-sign-compare + -Wsign-compare -Wno-extra-semi-stmt ) if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "Clang") diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index 293b593902..7ca9823ff5 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -140,7 +140,7 @@ class SimpleAppArgs int processArgs(int argc, char* argv[]) { - unsigned int ch; + int ch; while(1) { diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp index 9def6c24fe..a18761095c 100644 --- a/example/13_pool2d_fwd/pool2d_fwd.cpp +++ b/example/13_pool2d_fwd/pool2d_fwd.cpp @@ -80,8 +80,8 @@ static void pool_host_verify(const Tensor& in, for(int x = 0; x < window_spatial_lengths[1]; ++x) { int wi = wo * window_strides[1] + x - in_left_pads[1]; - if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 && - wi < in.mDesc.GetLengths()[3]) + if(hi >= 0 && hi < ck::type_convert(in.mDesc.GetLengths()[2]) && wi >= 0 && + wi < ck::type_convert(in.mDesc.GetLengths()[3])) { AccDataType currVal = static_cast(in(n, c, hi, wi)); diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp index 4e9bdbb2f5..29ef01f2ef 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp @@ -131,7 +131,7 @@ int main(int argc, char* argv[]) std::size_t flop = 0, num_btype = 0; - for(int i = 0; i < gemm_shapes.size(); i++) + for(std::size_t i = 0; i < gemm_shapes.size(); i++) { a_tensors.push_back(Tensor(f_host_tensor_descriptor( gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{}))); @@ -168,7 +168,7 @@ int main(int argc, char* argv[]) } } - for(int i = 0; i < gemm_shapes.size(); i++) + for(std::size_t i = 0; i < gemm_shapes.size(); i++) { a_tensors_device.emplace_back( std::make_unique(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpace())); @@ -213,7 +213,7 @@ int main(int argc, char* argv[]) if(do_verification) { - for(int i = 0; i < gemm_shapes.size(); i++) + for(std::size_t i = 0; i < gemm_shapes.size(); i++) { c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data()); auto ref_gemm = ReferenceGemmInstance{}; diff --git a/include/ck/tensor_description/tensor_descriptor_helper.hpp b/include/ck/tensor_description/tensor_descriptor_helper.hpp index ad75f9245e..ddc0ede404 100644 --- a/include/ck/tensor_description/tensor_descriptor_helper.hpp +++ b/include/ck/tensor_description/tensor_descriptor_helper.hpp @@ -1,6 +1,4 @@ -#ifndef CK_TENSOR_DESCRIPTOR_HELPER_HPP -#define CK_TENSOR_DESCRIPTOR_HELPER_HPP - +#pragma once #include "common_header.hpp" #include "tensor_descriptor.hpp" #include "multi_index_transform_helper.hpp" @@ -35,6 +33,12 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt } #endif +// Lengths..., Strides... could be: +// 1) index_t, which is known at run-time, or +// 2) Number<>, which is known at compile-time +// element_space_size could be: +// 1) long_index_t, or +// 2) LongNumber<> template ::type = false> @@ -68,10 +72,10 @@ __host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple{}, Number<1>{}); + const auto element_space_size = f(f, Number<0>{}, LongNumber<1>{}); #else const auto element_space_size = - calculate_element_space_size_impl(lengths, strides, Number<0>{}, Number<1>{}); + calculate_element_space_size_impl(lengths, strides, Number<0>{}, LongNumber<1>{}); #endif return TensorDescriptor, @@ -82,9 +86,12 @@ __host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple, which is known at compile-time +// element_space_size could be: +// 1) long_index_t, or +// 2) LongNumber<> template __host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple& lengths) @@ -100,7 +107,7 @@ make_naive_tensor_descriptor_packed(const Tuple& lengths) constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{}; - const auto element_space_size = container_reduce(lengths, math::multiplies{}, Number<1>{}); + const auto element_space_size = container_reduce(lengths, math::multiplies{}, LongNumber<1>{}); return TensorDescriptor, remove_cv_t, @@ -110,6 +117,12 @@ make_naive_tensor_descriptor_packed(const Tuple& lengths) element_space_size}; } +// Lengths... could be: +// 1) index_t, which is known at run-time, or +// 2) Number<>, which is known at compile-time +// align could be: +// 1) index_t, or +// 2) Number<> template __host__ __device__ constexpr auto make_naive_tensor_descriptor_aligned(const Tuple& lengths, Align align) @@ -146,4 +159,3 @@ make_naive_tensor_descriptor_aligned(const Tuple& lengths, Align ali } } // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp index a90bc44fdf..92655b2755 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp @@ -635,11 +635,12 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce(a_grid_desc_ak0_m_ak1_.GetElementSpaceSize()), + type_convert(b_grid_desc_bk0_n_bk1_.GetElementSpaceSize()), + type_convert(c_grid_desc_m_n_.GetElementSpaceSize()), + type_convert(d_grid_desc_m_.GetElementSpaceSize()), + type_convert(d_grid_desc_m_.GetElementSpaceSize())}, block_2_ctile_map_{}, a_element_op_{a_element_op}, b_element_op_{b_element_op}, diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp index 5110e54ad1..88974a5221 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp @@ -384,9 +384,10 @@ struct DeviceBatchedGemmXdl DeviceBatchedGemmXdl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB)}, c_grid_desc_m_n_{DeviceBatchedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC)}, c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{}, - compute_ptr_offset_of_batch_{a_grid_desc_k0_m_k1_.GetElementSpaceSize(), - b_grid_desc_k0_n_k1_.GetElementSpaceSize(), - c_grid_desc_m_n_.GetElementSpaceSize()}, + compute_ptr_offset_of_batch_{ + type_convert(a_grid_desc_k0_m_k1_.GetElementSpaceSize()), + type_convert(b_grid_desc_k0_n_k1_.GetElementSpaceSize()), + type_convert(c_grid_desc_m_n_.GetElementSpaceSize())}, block_2_ctile_map_{}, M01_{M01}, N01_{N01}, diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp index 5606dad034..fad4ec1ffa 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp @@ -697,7 +697,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K } // Gridwise GEMM size - for(int i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) + for(std::size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) { if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i], arg.b_grid_desc_k0_n_k1_container_[i], diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp index ff267c6cdf..5dca8f9629 100644 --- a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp @@ -1412,7 +1412,7 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho } // Gridwise GEMM size - for(int i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) + for(std::size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) { if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i], arg.b_grid_desc_k0_n_k1_container_[i], diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp index ac62448386..7365f9a3e2 100644 --- a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp @@ -861,17 +861,11 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K static bool IsSupportedArgument(const Argument& arg) { // Input tensors can't be bigger than 2GB each. - constexpr std::size_t GB2 = 2 * 1e9; + constexpr ck::long_index_t GB2 = (ck::long_index_t{1} << 31); - if(arg.a_grid_desc_k0_m_k1_.GetElementSpaceSize() > GB2) - { - return false; - } - if(arg.b_grid_desc_k0_n_k1_.GetElementSpaceSize() > GB2) - { - return false; - } - if(arg.c_grid_desc_m_n_.GetElementSpaceSize() > GB2) + if(arg.a_grid_desc_k0_m_k1_.GetElementSpaceSize() * sizeof(ADataType) > GB2 || + arg.b_grid_desc_k0_n_k1_.GetElementSpaceSize() * sizeof(BDataType) > GB2 || + arg.c_grid_desc_m_n_.GetElementSpaceSize() * sizeof(CDataType) > GB2) { return false; } diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp index b9ad39578d..dfc1ce2715 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp @@ -372,17 +372,18 @@ struct DeviceGroupedGemmXdl { grid_size_ = 0; - group_count_ = static_cast(gemm_shapes.size()); + group_count_ = ck::type_convert(gemm_shapes.size()); - if(!(group_count_ == p_a.size() && group_count_ == p_b.size() && - group_count_ == p_c.size())) + if(!(group_count_ == ck::type_convert(p_a.size()) && + group_count_ == ck::type_convert(p_b.size()) && + group_count_ == ck::type_convert(p_c.size()))) { throw std::runtime_error("wrong! group_count_ != P_a/b/c.size"); } gemm_desc_kernel_arg_.reserve(group_count_); - for(index_t i = 0; i < gemm_shapes.size(); i++) + for(std::size_t i = 0; i < gemm_shapes.size(); i++) { const index_t M = gemm_shapes[i].M; const index_t N = gemm_shapes[i].N; @@ -563,7 +564,7 @@ struct DeviceGroupedGemmXdl static bool IsSupportedArgument(const Argument& arg) { - if(arg.gemm_desc_kernel_arg_.size() != arg.group_count_) + if(ck::type_convert(arg.gemm_desc_kernel_arg_.size()) != arg.group_count_) return false; else return true; diff --git a/include/ck/utility/number.hpp b/include/ck/utility/number.hpp index 6f262a4d9f..97a71f8a41 100644 --- a/include/ck/utility/number.hpp +++ b/include/ck/utility/number.hpp @@ -8,5 +8,8 @@ namespace ck { template using Number = integral_constant; +template +using LongNumber = integral_constant; + } // namespace ck #endif diff --git a/include/ck/utility/static_buffer.hpp b/include/ck/utility/static_buffer.hpp index f36328fa5f..1a59f3c81e 100644 --- a/include/ck/utility/static_buffer.hpp +++ b/include/ck/utility/static_buffer.hpp @@ -158,5 +158,11 @@ __host__ __device__ constexpr auto make_static_buffer(Number) return StaticBuffer{}; } +template +__host__ __device__ constexpr auto make_static_buffer(LongNumber) +{ + return StaticBuffer{}; +} + } // namespace ck #endif diff --git a/library/include/ck/library/host_tensor/host_reduction.hpp b/library/include/ck/library/host_tensor/host_reduction.hpp index 786d34b73a..b67f794505 100644 --- a/library/include/ck/library/host_tensor/host_reduction.hpp +++ b/library/include/ck/library/host_tensor/host_reduction.hpp @@ -211,7 +211,8 @@ struct ReductionHost AccDataType accuVal = ReduceOpZeroVal(); IndexDataType accuIndex = 0; - for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++) + for(IndexDataType i = 0; i < ck::type_convert(reduce_dim_indexes.size()); + i++) { auto offset_reduce = get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); @@ -246,7 +247,9 @@ struct ReductionHost auto offset_invariant = get_offset_from_index(invariantStrides, invariant_index); - for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++) + for(IndexDataType i = 0; + i < ck::type_convert(reduce_dim_indexes.size()); + i++) { auto offset_reduce = get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); diff --git a/library/include/ck/library/host_tensor/host_tensor.hpp b/library/include/ck/library/host_tensor/host_tensor.hpp index 0d4c9f73d4..ad6aeecb50 100644 --- a/library/include/ck/library/host_tensor/host_tensor.hpp +++ b/library/include/ck/library/host_tensor/host_tensor.hpp @@ -154,7 +154,7 @@ struct ParallelTensorFunctor { std::array indices; - for(int idim = 0; idim < NDIM; ++idim) + for(std::size_t idim = 0; idim < NDIM; ++idim) { indices[idim] = i / mStrides[idim]; i -= indices[idim] * mStrides[idim]; @@ -316,7 +316,7 @@ float check_error(const Tensor& ref, const Tensor& result) constexpr float eps = 1e-10; - for(int i = 0; i < ref.mData.size(); ++i) + for(std::size_t i = 0; i < ref.mData.size(); ++i) { float ref_v = ck::type_convert(ref.mData[i]); float result_v = ck::type_convert(result.mData[i]); diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp index 70f9e3617e..c5f3cbad69 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp @@ -70,18 +70,25 @@ struct ReferenceConvBwdWeight : public device::BaseOperator constexpr auto I1 = Number<1>{}; auto f_kcyx = [&](auto k, auto c, auto y, auto x) { float v_acc = 0; - for(int n = 0; n < arg.out_n_k_ho_wo_.mDesc.GetLengths()[0]; ++n) + for(std::size_t n = 0; n < arg.out_n_k_ho_wo_.mDesc.GetLengths()[0]; ++n) { - for(int ho = 0; ho < arg.out_n_k_ho_wo_.mDesc.GetLengths()[2]; ++ho) + for(std::size_t ho = 0; ho < arg.out_n_k_ho_wo_.mDesc.GetLengths()[2]; ++ho) { - int hi = ho * arg.conv_strides_[I0] + y * arg.conv_dilations_[I0] - - arg.in_left_pads_[I0]; - for(int wo = 0; wo < arg.out_n_k_ho_wo_.mDesc.GetLengths()[3]; ++wo) + auto hi = ck::type_convert(ho * arg.conv_strides_[I0]) + + ck::type_convert(y * arg.conv_dilations_[I0]) - + ck::type_convert(arg.in_left_pads_[I0]); + for(std::size_t wo = 0; wo < arg.out_n_k_ho_wo_.mDesc.GetLengths()[3]; ++wo) { - int wi = wo * arg.conv_strides_[I1] + x * arg.conv_dilations_[I1] - - arg.in_left_pads_[I1]; - if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 && - wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) + auto wi = + ck::type_convert(wo * arg.conv_strides_[I1]) + + ck::type_convert(x * arg.conv_dilations_[I1]) - + ck::type_convert(arg.in_left_pads_[I1]); + if(hi >= 0 && + ck::type_convert(hi) < + arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && + wi >= 0 && + ck::type_convert(wi) < + arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) { float v_out; float v_in; diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp index 0f210a23e1..9e91f06e7f 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp @@ -78,15 +78,18 @@ struct ReferenceConvBwdData : public device::BaseOperator AccDataType v_acc = 0; - for(int x = 0; x < X; ++x) + for(std::size_t x = 0; x < X; ++x) { - int w_tmp = wi + arg.in_left_pads_[0] - x * arg.conv_dilations_[0]; + auto w_tmp = ck::type_convert(wi) + + ck::type_convert(arg.in_left_pads_[0]) - + ck::type_convert(x * arg.conv_dilations_[0]); if(w_tmp % arg.conv_strides_[0] == 0) { - int wo = w_tmp / arg.conv_strides_[0]; - if(wo >= 0 && wo < Wo) + auto wo = ck::type_convert(w_tmp) / + ck::type_convert(arg.conv_strides_[0]); + if(wo >= 0 && ck::type_convert(wo) < Wo) { - for(int k = 0; k < K; ++k) + for(std::size_t k = 0; k < K; ++k) { AccDataType v_out = 0; AccDataType v_wei = 0; @@ -128,24 +131,32 @@ struct ReferenceConvBwdData : public device::BaseOperator AccDataType v_acc = 0; - for(int y = 0; y < Y; ++y) + for(std::size_t y = 0; y < Y; ++y) { - int h_tmp = hi + arg.in_left_pads_[0] - y * arg.conv_dilations_[0]; + auto h_tmp = ck::type_convert(hi) + + ck::type_convert(arg.in_left_pads_[0]) - + ck::type_convert(y * arg.conv_dilations_[0]); if(h_tmp % arg.conv_strides_[0] == 0) { - int ho = h_tmp / arg.conv_strides_[0]; - if(ho >= 0 && ho < Ho) + auto ho = ck::type_convert(h_tmp) / + ck::type_convert(arg.conv_strides_[0]); + if(ho >= 0 && ck::type_convert(ho) < Ho) { - for(int x = 0; x < X; ++x) + for(std::size_t x = 0; x < X; ++x) { - int w_tmp = - wi + arg.in_left_pads_[1] - x * arg.conv_dilations_[1]; + auto w_tmp = + ck::type_convert(wi) + + ck::type_convert(arg.in_left_pads_[1]) - + ck::type_convert(x * + arg.conv_dilations_[1]); if(w_tmp % arg.conv_strides_[1] == 0) { - int wo = w_tmp / arg.conv_strides_[1]; - if(wo >= 0 && wo < Wo) + auto wo = ck::type_convert(w_tmp) / + ck::type_convert( + arg.conv_strides_[1]); + if(wo >= 0 && ck::type_convert(wo) < Wo) { - for(int k = 0; k < K; ++k) + for(std::size_t k = 0; k < K; ++k) { AccDataType v_out = 0; AccDataType v_wei = 0; @@ -194,33 +205,49 @@ struct ReferenceConvBwdData : public device::BaseOperator AccDataType v_acc = 0; - for(int z = 0; z < Z; ++z) + for(std::size_t z = 0; z < Z; ++z) { - int d_tmp = di + arg.in_left_pads_[0] - z * arg.conv_dilations_[0]; + auto d_tmp = ck::type_convert(di) + + ck::type_convert(arg.in_left_pads_[0]) - + ck::type_convert(z * arg.conv_dilations_[0]); if(d_tmp % arg.conv_strides_[0] == 0) { - int do_ = d_tmp / arg.conv_strides_[0]; - if(do_ >= 0 && do_ < Do) + auto do_ = ck::type_convert(d_tmp) / + ck::type_convert(arg.conv_strides_[0]); + if(do_ >= 0 && ck::type_convert(do_) < Do) { - for(int y = 0; y < Y; ++y) + for(std::size_t y = 0; y < Y; ++y) { - int h_tmp = - hi + arg.in_left_pads_[1] - y * arg.conv_dilations_[1]; + auto h_tmp = + ck::type_convert(hi) + + ck::type_convert(arg.in_left_pads_[1]) - + ck::type_convert(y * + arg.conv_dilations_[1]); if(h_tmp % arg.conv_strides_[1] == 0) { - int ho = h_tmp / arg.conv_strides_[1]; - if(ho >= 0 && ho < Ho) + auto ho = ck::type_convert(h_tmp) / + ck::type_convert( + arg.conv_strides_[1]); + if(ho >= 0 && ck::type_convert(ho) < Ho) { - for(int x = 0; x < X; ++x) + for(std::size_t x = 0; x < X; ++x) { - int w_tmp = wi + arg.in_left_pads_[2] - - x * arg.conv_dilations_[2]; + auto w_tmp = + ck::type_convert(wi) + + ck::type_convert( + arg.in_left_pads_[2]) - + ck::type_convert( + x * arg.conv_dilations_[2]); if(w_tmp % arg.conv_strides_[2] == 0) { - int wo = w_tmp / arg.conv_strides_[2]; - if(wo >= 0 && wo < Wo) + auto wo = + ck::type_convert(w_tmp) / + ck::type_convert( + arg.conv_strides_[2]); + if(wo >= 0 && + ck::type_convert(wo) < Wo) { - for(int k = 0; k < K; ++k) + for(std::size_t k = 0; k < K; ++k) { AccDataType v_out = 0; AccDataType v_wei = 0; diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp index 0095d51a5b..65e59db2f8 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp @@ -88,13 +88,16 @@ struct ReferenceConvFwd : public device::BaseOperator auto f_ncw = [&](auto n, auto k, auto wo) { float v_acc = 0; - for(int c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c) + for(std::size_t c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c) { - for(int x = 0; x < arg.weight_.mDesc.GetLengths()[2]; ++x) + for(std::size_t x = 0; x < arg.weight_.mDesc.GetLengths()[2]; ++x) { - int wi = wo * arg.conv_strides_[0] + x * arg.conv_dilations_[0] - - arg.in_left_pads_[0]; - if(wi >= 0 && wi < arg.input_.mDesc.GetLengths()[2]) + auto wi = + ck::type_convert(wo * arg.conv_strides_[0]) + + ck::type_convert(x * arg.conv_dilations_[0]) - + ck::type_convert(arg.in_left_pads_[0]); + if(wi >= 0 && + ck::type_convert(wi) < arg.input_.mDesc.GetLengths()[2]) { float v_in; float v_wei; @@ -128,18 +131,26 @@ struct ReferenceConvFwd : public device::BaseOperator auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { float v_acc = 0; - for(int c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c) + for(std::size_t c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c) { - for(int y = 0; y < arg.weight_.mDesc.GetLengths()[2]; ++y) + for(std::size_t y = 0; y < arg.weight_.mDesc.GetLengths()[2]; ++y) { - int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] - - arg.in_left_pads_[0]; - for(int x = 0; x < arg.weight_.mDesc.GetLengths()[3]; ++x) + auto hi = + ck::type_convert(ho * arg.conv_strides_[0]) + + ck::type_convert(y * arg.conv_dilations_[0]) - + ck::type_convert(arg.in_left_pads_[0]); + for(std::size_t x = 0; x < arg.weight_.mDesc.GetLengths()[3]; ++x) { - int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] - - arg.in_left_pads_[1]; - if(hi >= 0 && hi < arg.input_.mDesc.GetLengths()[2] && wi >= 0 && - wi < arg.input_.mDesc.GetLengths()[3]) + auto wi = + ck::type_convert(wo * arg.conv_strides_[1]) + + ck::type_convert(x * arg.conv_dilations_[1]) - + ck::type_convert(arg.in_left_pads_[1]); + if(hi >= 0 && + ck::type_convert(hi) < + arg.input_.mDesc.GetLengths()[2] && + wi >= 0 && + ck::type_convert(wi) < + arg.input_.mDesc.GetLengths()[3]) { float v_in; float v_wei; @@ -174,23 +185,37 @@ struct ReferenceConvFwd : public device::BaseOperator auto f_nchw = [&](auto n, auto k, auto d_o, auto ho, auto wo) { float v_acc = 0; - for(int c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c) + for(std::size_t c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c) { - for(int z = 0; z < arg.weight_.mDesc.GetLengths()[2]; ++z) + for(std::size_t z = 0; z < arg.weight_.mDesc.GetLengths()[2]; ++z) { - int di = d_o * arg.conv_strides_[0] + z * arg.conv_dilations_[0] - - arg.in_left_pads_[0]; - for(int y = 0; y < arg.weight_.mDesc.GetLengths()[3]; ++y) + auto di = + ck::type_convert(d_o * arg.conv_strides_[0]) + + ck::type_convert(z * arg.conv_dilations_[0]) - + ck::type_convert(arg.in_left_pads_[0]); + for(std::size_t y = 0; y < arg.weight_.mDesc.GetLengths()[3]; ++y) { - int hi = ho * arg.conv_strides_[1] + y * arg.conv_dilations_[1] - - arg.in_left_pads_[1]; - for(int x = 0; x < arg.weight_.mDesc.GetLengths()[4]; ++x) + auto hi = + ck::type_convert(ho * arg.conv_strides_[1]) + + ck::type_convert(y * arg.conv_dilations_[1]) - + ck::type_convert(arg.in_left_pads_[1]); + for(std::size_t x = 0; x < arg.weight_.mDesc.GetLengths()[4]; ++x) { - int wi = wo * arg.conv_strides_[2] + - x * arg.conv_dilations_[2] - arg.in_left_pads_[2]; - if(di >= 0 && di < arg.input_.mDesc.GetLengths()[2] && - hi >= 0 && hi < arg.input_.mDesc.GetLengths()[3] && - wi >= 0 && wi < arg.input_.mDesc.GetLengths()[4]) + auto wi = + ck::type_convert(wo * + arg.conv_strides_[2]) + + ck::type_convert(x * + arg.conv_dilations_[2]) - + ck::type_convert(arg.in_left_pads_[2]); + if(di >= 0 && + ck::type_convert(di) < + arg.input_.mDesc.GetLengths()[2] && + hi >= 0 && + ck::type_convert(hi) < + arg.input_.mDesc.GetLengths()[3] && + wi >= 0 && + ck::type_convert(wi) < + arg.input_.mDesc.GetLengths()[4]) { float v_in; float v_wei; diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp index 8f49b79a1a..ee95cd410a 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp @@ -73,18 +73,25 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { float v_acc = 0; - for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) + for(std::size_t c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) { - for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) + for(std::size_t y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) { - int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] - - arg.in_left_pads_[0]; - for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x) + auto hi = ck::type_convert(ho * arg.conv_strides_[0]) + + ck::type_convert(y * arg.conv_dilations_[0]) - + ck::type_convert(arg.in_left_pads_[0]); + for(std::size_t x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x) { - int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] - - arg.in_left_pads_[1]; - if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 && - wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) + auto wi = + ck::type_convert(wo * arg.conv_strides_[1]) + + ck::type_convert(x * arg.conv_dilations_[1]) - + ck::type_convert(arg.in_left_pads_[1]); + if(hi >= 0 && + ck::type_convert(hi) < + arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && + wi >= 0 && + ck::type_convert(wi) < + arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) { float v_in; float v_wei; diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp index e4e0899416..11232cc98f 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp @@ -76,18 +76,25 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { float v_acc = 0; - for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) + for(std::size_t c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) { - for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) + for(std::size_t y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) { - int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] - - arg.in_left_pads_[0]; - for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x) + auto hi = ck::type_convert(ho * arg.conv_strides_[0]) + + ck::type_convert(y * arg.conv_dilations_[0]) - + ck::type_convert(arg.in_left_pads_[0]); + for(std::size_t x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x) { - int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] - - arg.in_left_pads_[1]; - if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 && - wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) + auto wi = + ck::type_convert(wo * arg.conv_strides_[1]) + + ck::type_convert(x * arg.conv_dilations_[1]) - + ck::type_convert(arg.in_left_pads_[1]); + if(hi >= 0 && + ck::type_convert(hi) < + arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && + wi >= 0 && + ck::type_convert(wi) < + arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) { float v_in; float v_wei; diff --git a/library/src/host_tensor/host_tensor.cpp b/library/src/host_tensor/host_tensor.cpp index 38b0796635..138e3fc254 100644 --- a/library/src/host_tensor/host_tensor.cpp +++ b/library/src/host_tensor/host_tensor.cpp @@ -25,7 +25,7 @@ std::size_t HostTensorDescriptor::GetElementSize() const std::size_t HostTensorDescriptor::GetElementSpace() const { std::size_t space = 1; - for(int i = 0; i < mLens.size(); ++i) + for(std::size_t i = 0; i < mLens.size(); ++i) { space += (mLens[i] - 1) * mStrides[i]; } @@ -68,7 +68,7 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream // FIXME: remove void bf16_to_f32_(const Tensor& src, Tensor& dst) { - for(int i = 0; i < src.mData.size(); ++i) + for(std::size_t i = 0; i < src.mData.size(); ++i) dst.mData[i] = ck::type_convert(src.mData[i]); } #endif diff --git a/library/src/utility/conv_fwd_util.cpp b/library/src/utility/conv_fwd_util.cpp index 1658450388..01bfeda16d 100644 --- a/library/src/utility/conv_fwd_util.cpp +++ b/library/src/utility/conv_fwd_util.cpp @@ -71,11 +71,12 @@ ConvParams::ConvParams(ck::index_t n_dim, input_left_pads(left_pads), input_right_pads(right_pads) { - if(filter_spatial_lengths.size() != num_dim_spatial || - input_spatial_lengths.size() != num_dim_spatial || - conv_filter_strides.size() != num_dim_spatial || - conv_filter_dilations.size() != num_dim_spatial || - input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial) + if(ck::type_convert(filter_spatial_lengths.size()) != num_dim_spatial || + ck::type_convert(input_spatial_lengths.size()) != num_dim_spatial || + ck::type_convert(conv_filter_strides.size()) != num_dim_spatial || + ck::type_convert(conv_filter_dilations.size()) != num_dim_spatial || + ck::type_convert(input_left_pads.size()) != num_dim_spatial || + ck::type_convert(input_right_pads.size()) != num_dim_spatial) { throw( std::runtime_error("ConvParams::GetOutputSpatialLengths: " @@ -85,11 +86,12 @@ ConvParams::ConvParams(ck::index_t n_dim, std::vector ConvParams::GetOutputSpatialLengths() const { - if(filter_spatial_lengths.size() != num_dim_spatial || - input_spatial_lengths.size() != num_dim_spatial || - conv_filter_strides.size() != num_dim_spatial || - conv_filter_dilations.size() != num_dim_spatial || - input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial) + if(ck::type_convert(filter_spatial_lengths.size()) != num_dim_spatial || + ck::type_convert(input_spatial_lengths.size()) != num_dim_spatial || + ck::type_convert(conv_filter_strides.size()) != num_dim_spatial || + ck::type_convert(conv_filter_dilations.size()) != num_dim_spatial || + ck::type_convert(input_left_pads.size()) != num_dim_spatial || + ck::type_convert(input_right_pads.size()) != num_dim_spatial) { throw( std::runtime_error("ConvParams::GetOutputSpatialLengths: " diff --git a/profiler/include/profile_convnd_bwd_data_impl.hpp b/profiler/include/profile_convnd_bwd_data_impl.hpp index 4f9038a72b..c9051f006f 100644 --- a/profiler/include/profile_convnd_bwd_data_impl.hpp +++ b/profiler/include/profile_convnd_bwd_data_impl.hpp @@ -222,7 +222,7 @@ static bool check_out(const Tensor& ref, const Tensor& result) { float max_diff = 1e-6; - for(int i = 0; i < ref.mData.size(); ++i) + for(std::size_t i = 0; i < ref.mData.size(); ++i) { float diff = std::abs(double(ref.mData[i]) - double(result.mData[i])); if(max_diff < diff) @@ -236,16 +236,16 @@ template void show_data_nhwc_layout(Tensor& nhwc) { std::cout << "["; - for(int n = 0; n < nhwc.mDesc.GetLengths()[0]; n++) + for(int n = 0; n < ck::type_convert(nhwc.mDesc.GetLengths()[0]); n++) { std::cout << "["; - for(int hi = 0; hi < nhwc.mDesc.GetLengths()[2]; hi++) + for(int hi = 0; hi < ck::type_convert(nhwc.mDesc.GetLengths()[2]); hi++) { std::cout << "["; - for(int wi = 0; wi < nhwc.mDesc.GetLengths()[3]; wi++) + for(int wi = 0; wi < ck::type_convert(nhwc.mDesc.GetLengths()[3]); wi++) { std::cout << "["; - for(int c = 0; c < nhwc.mDesc.GetLengths()[1]; c++) + for(int c = 0; c < ck::type_convert(nhwc.mDesc.GetLengths()[1]); c++) { std::cout << static_cast(nhwc(n, c, hi, wi)) << " "; } diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp index cced480c36..ae70f551f1 100644 --- a/profiler/include/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profile_grouped_gemm_impl.hpp @@ -50,12 +50,12 @@ void profile_grouped_gemm_impl(int do_verification, int init_method, bool do_log, int nrepeat, - std::vector Ms, - std::vector Ns, - std::vector Ks, - std::vector StrideAs, - std::vector StrideBs, - std::vector StrideCs) + const std::vector& Ms, + const std::vector& Ns, + const std::vector& Ks, + const std::vector& StrideAs, + const std::vector& StrideBs, + const std::vector& StrideCs) { auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { @@ -71,7 +71,7 @@ void profile_grouped_gemm_impl(int do_verification, } }; - int group_count = Ms.size(); + std::size_t group_count = Ms.size(); if(!(group_count == Ns.size() && group_count == Ks.size() && group_count == StrideAs.size() && group_count == StrideBs.size() && group_count == StrideCs.size())) @@ -83,7 +83,7 @@ void profile_grouped_gemm_impl(int do_verification, std::vector> b_k_n; std::vector> c_m_n_device_results; - for(int i = 0; i < Ms.size(); i++) + for(std::size_t i = 0; i < group_count; i++) { a_m_k.push_back( Tensor(f_host_tensor_descriptor(Ms[i], Ks[i], StrideAs[i], ALayout{}))); @@ -144,7 +144,7 @@ void profile_grouped_gemm_impl(int do_verification, gemm_shapes.reserve(group_count); - for(int i = 0; i < group_count; i++) + for(std::size_t i = 0; i < group_count; i++) { a_device_buf.emplace_back( std::make_unique(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpace())); @@ -234,7 +234,7 @@ void profile_grouped_gemm_impl(int do_verification, float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); std::size_t flop = 0, num_btype = 0; - for(int i = 0; i < gemm_shapes.size(); i++) + for(std::size_t i = 0; i < gemm_shapes.size(); i++) { flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i]; @@ -258,7 +258,7 @@ void profile_grouped_gemm_impl(int do_verification, if(do_verification) { - for(int i = 0; i < gemm_shapes.size(); i++) + for(std::size_t i = 0; i < gemm_shapes.size(); i++) { c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data()); diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp index c6dea1e385..96fa78964a 100644 --- a/profiler/src/profile_reduce.cpp +++ b/profiler/src/profile_reduce.cpp @@ -186,7 +186,7 @@ class AppArgs int processArgs(int argc, char* argv[]) { - unsigned int ch; + int ch; optind++; // to skip the "reduce" module name diff --git a/test/gemm_split_k/gemm_split_k.cpp b/test/gemm_split_k/gemm_split_k.cpp index a3d4f9b2ec..c788b66aa3 100644 --- a/test/gemm_split_k/gemm_split_k.cpp +++ b/test/gemm_split_k/gemm_split_k.cpp @@ -45,7 +45,7 @@ static bool check_out(const Tensor& ref, const Tensor& result) { float max_diff = 1e-6; - for(int i = 0; i < ref.mData.size(); ++i) + for(std::size_t i = 0; i < ref.mData.size(); ++i) { float diff = std::abs(double(ref.mData[i]) - double(result.mData[i])); if(max_diff < diff) diff --git a/test/grouped_gemm/grouped_gemm_fp16.cpp b/test/grouped_gemm/grouped_gemm_fp16.cpp index 2260b01462..ef131ed867 100644 --- a/test/grouped_gemm/grouped_gemm_fp16.cpp +++ b/test/grouped_gemm/grouped_gemm_fp16.cpp @@ -104,7 +104,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr) b_tensors_device.reserve(group_count); c_tensors_device.reserve(group_count); - for(int i = 0; i < gemm_shapes.size(); i++) + for(std::size_t i = 0; i < gemm_shapes.size(); i++) { a_tensors.emplace_back(Tensor(f_host_tensor_descriptor( gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{}))); @@ -119,7 +119,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr) b_tensors[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}); } - for(int i = 0; i < gemm_shapes.size(); i++) + for(std::size_t i = 0; i < gemm_shapes.size(); i++) { a_tensors_device.emplace_back( std::make_unique(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize())); @@ -147,7 +147,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr) invoker_ptr->Run(argument_ptr.get()); - for(int i = 0; i < gemm_shapes.size(); i++) + for(std::size_t i = 0; i < gemm_shapes.size(); i++) { c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data()); diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp index 28370cb2cd..317abab53a 100644 --- a/test/reduce/reduce_no_index.cpp +++ b/test/reduce/reduce_no_index.cpp @@ -460,7 +460,7 @@ class SimpleAppArgs int processArgs(int argc, char* argv[]) { - unsigned int ch; + int ch; while(1) { diff --git a/test/reduce/reduce_util.hpp b/test/reduce/reduce_util.hpp index e9a7b4896e..9eb66513bf 100644 --- a/test/reduce/reduce_util.hpp +++ b/test/reduce/reduce_util.hpp @@ -9,7 +9,7 @@ namespace reduce_util { template void to_f32_vector(const Tensor& src, Tensor& dst) { - for(int i = 0; i < src.mData.size(); ++i) + for(std::size_t i = 0; i < src.mData.size(); ++i) dst.mData[i] = type_convert(src.mData[i]); } diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp index 667b84a8dc..d7d5e551a2 100644 --- a/test/reduce/reduce_with_index.cpp +++ b/test/reduce/reduce_with_index.cpp @@ -463,7 +463,7 @@ class SimpleAppArgs int processArgs(int argc, char* argv[]) { - unsigned int ch; + int ch; while(1) {