diff --git a/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index a7fa193da1..06a755d13b 100644 --- a/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -362,173 +362,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad float ave_time = timer.GetElapsedTime() / nrepeat; - float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, - wei_k_c_y_x_global_desc, - out_n_k_ho_wo_global_desc) / - (std::size_t(1000) * 1000 * 1000) / ave_time; - - std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" - << std::endl; - } -#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER - using ADesc = decltype(wei_gemmk_gemmm_global_desc); - using BDesc = decltype(in_gemmk_gemmn_global_desc); - using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc); - - DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc)); - DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc)); - DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc)); - - wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc); - in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc); - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice( - &out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc); - - index_t nrepeat = 100; - - for(index_t i = 0; i < 5; ++i) - { - std::cout << "Start running " << nrepeat << " times..." << std::endl; - - KernelTimer timer; - timer.Start(); - - for(index_t j = 0; j < nrepeat; ++j) - { - if(has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else if(has_main_k_block_loop && !has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else if(!has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - } - - timer.End(); - - float ave_time = timer.GetElapsedTime() / nrepeat; - float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, wei_k_c_y_x_global_desc, out_n_k_ho_wo_global_desc) / @@ -564,111 +397,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad { if(has_main_k_block_loop && has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else if(has_main_k_block_loop && !has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else if(!has_main_k_block_loop && has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } } @@ -1035,173 +872,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad float ave_time = timer.GetElapsedTime() / nrepeat; - float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, - wei_k_c_y_x_global_desc, - out_n_k_ho_wo_global_desc) / - (std::size_t(1000) * 1000 * 1000) / ave_time; - - std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" - << std::endl; - } -#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER - using ADesc = decltype(wei_gemmk_gemmm_global_desc); - using BDesc = decltype(in_gemmk_gemmn_global_desc); - using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc); - - DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc)); - DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc)); - DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc)); - - wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc); - in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc); - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice( - &out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc); - - index_t nrepeat = 100; - - for(index_t i = 0; i < 5; ++i) - { - std::cout << "Start running " << nrepeat << " times..." << std::endl; - - KernelTimer timer; - timer.Start(); - - for(index_t j = 0; j < nrepeat; ++j) - { - if(has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else if(has_main_k_block_loop && !has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else if(!has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - } - - timer.End(); - - float ave_time = timer.GetElapsedTime() / nrepeat; - float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, wei_k_c_y_x_global_desc, out_n_k_ho_wo_global_desc) / @@ -1237,111 +907,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad { if(has_main_k_block_loop && has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else if(has_main_k_block_loop && !has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else if(!has_main_k_block_loop && has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } } @@ -1694,173 +1368,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1 float ave_time = timer.GetElapsedTime() / nrepeat; - float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, - wei_k_c_y_x_global_desc, - out_n_k_ho_wo_global_desc) / - (std::size_t(1000) * 1000 * 1000) / ave_time; - - std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" - << std::endl; - } -#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER - using ADesc = decltype(wei_gemmk_gemmm_global_desc); - using BDesc = decltype(in_gemmk_gemmn_global_desc); - using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc); - - DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc)); - DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc)); - DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc)); - - wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc); - in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc); - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice( - &out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc); - - index_t nrepeat = 100; - - for(index_t i = 0; i < 5; ++i) - { - std::cout << "Start running " << nrepeat << " times..." << std::endl; - - KernelTimer timer; - timer.Start(); - - for(index_t j = 0; j < nrepeat; ++j) - { - if(has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else if(has_main_k_block_loop && !has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else if(!has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - } - - timer.End(); - - float ave_time = timer.GetElapsedTime() / nrepeat; - float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, wei_k_c_y_x_global_desc, out_n_k_ho_wo_global_desc) / @@ -1896,111 +1403,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1 { if(has_main_k_block_loop && has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else if(has_main_k_block_loop && !has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else if(!has_main_k_block_loop && has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } } diff --git a/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp index 7e9cc0af5f..922a036013 100644 --- a/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp +++ b/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp @@ -363,171 +363,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad float ave_time = timer.GetElapsedTime() / nrepeat; - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / - (std::size_t(1000) * 1000 * 1000) / ave_time; - - std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" - << std::endl; - } -#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER - using ADesc = decltype(wei_gemmk_gemmm_global_desc); - using BDesc = decltype(in_gemmk_gemmn_global_desc); - using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc); - - DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc)); - DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc)); - DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc)); - - wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc); - in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc); - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice( - &out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc); - - index_t nrepeat = 100; - - for(index_t i = 0; i < 5; ++i) - { - std::cout << "Start running " << nrepeat << " times..." << std::endl; - - KernelTimer timer; - timer.Start(); - - for(index_t j = 0; j < nrepeat; ++j) - { - if(has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else if(has_main_k_block_loop && !has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else if(!has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - } - - timer.End(); - - float ave_time = timer.GetElapsedTime() / nrepeat; - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / (std::size_t(1000) * 1000 * 1000) / ave_time; @@ -561,111 +396,115 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad { if(has_main_k_block_loop && has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else if(has_main_k_block_loop && !has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else if(!has_main_k_block_loop && has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } } @@ -1017,171 +856,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1 float ave_time = timer.GetElapsedTime() / nrepeat; - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / - (std::size_t(1000) * 1000 * 1000) / ave_time; - - std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" - << std::endl; - } -#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER - using ADesc = decltype(wei_gemmk_gemmm_global_desc); - using BDesc = decltype(in_gemmk_gemmn_global_desc); - using CDesc = decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc); - - DeviceMem wei_gemmk_gemmm_global_desc_device_buf(sizeof(ADesc)); - DeviceMem in_gemmk_gemmn_global_desc_device_buf(sizeof(BDesc)); - DeviceMem out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf(sizeof(CDesc)); - - wei_gemmk_gemmm_global_desc_device_buf.ToDevice(&wei_gemmk_gemmm_global_desc); - in_gemmk_gemmn_global_desc_device_buf.ToDevice(&in_gemmk_gemmn_global_desc); - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf.ToDevice( - &out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc); - - index_t nrepeat = 100; - - for(index_t i = 0; i < 5; ++i) - { - std::cout << "Start running " << nrepeat << " times..." << std::endl; - - KernelTimer timer; - timer.Start(); - - for(index_t j = 0; j < nrepeat; ++j) - { - if(has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else if(has_main_k_block_loop && !has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else if(!has_main_k_block_loop && has_double_tail_k_block_loop) - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - else - { - const auto kernel = - run_gridwise_operation, - integral_constant>; - - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - reinterpret_cast( - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer()), - p_wei_global, - reinterpret_cast( - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer()), - p_in_global, - reinterpret_cast( - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer()), - p_out_global, - integral_constant{}, - integral_constant{}); - } - } - - timer.End(); - - float ave_time = timer.GetElapsedTime() / nrepeat; - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / (std::size_t(1000) * 1000 * 1000) / ave_time; @@ -1215,114 +889,117 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1 { if(has_main_k_block_loop && has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else if(has_main_k_block_loop && !has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else if(!has_main_k_block_loop && has_double_tail_k_block_loop) { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } else { - const auto kernel = run_gridwise_operation, - integral_constant>; + const auto kernel = run_gridwise_dynamic_gemm_v1; - launch_kernel(kernel, - dim3(GridSize), - dim3(BlockSize), - 0, - 0, - wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), - p_wei_global, - in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), - p_in_global, - out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf - .GetDeviceBuffer(), - p_out_global, - integral_constant{}, - integral_constant{}); + launch_kernel( + kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + (void __CONSTANT__*) + wei_gemmk_gemmm_global_desc_device_buf.GetDeviceBuffer(), + p_wei_global, + (void __CONSTANT__*)in_gemmk_gemmn_global_desc_device_buf.GetDeviceBuffer(), + p_in_global, + (void __CONSTANT__*) + out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc_desc_device_buf + .GetDeviceBuffer(), + p_out_global); } } - timer.End(); float ave_time = timer.GetElapsedTime() / nrepeat; diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp index b0674debfa..15df1d23f4 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp @@ -11,6 +11,47 @@ namespace ck { +#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER +// pass tensor descriptor by __CONSTANT__ void pointer +// __CONSTANT__ is needed to inform compiler void pointers in the kernel signature are pointing to +// non-modifiable parameter address space, so compiler can enable corresponding optimization +template +__global__ void run_gridwise_dynamic_gemm_v1(const void __CONSTANT__* p_a_k_m_global_desc, + const FloatA* __restrict__ p_a_global, + const void __CONSTANT__* p_b_k_n_global_desc, + const FloatB* __restrict__ p_b_global, + const void __CONSTANT__* p_c_m0_m1_n0_n1_global_desc, + FloatC* __restrict__ p_c_global) +{ + // first cast void __CONSTANT__* to void* + // second cast void* to Desc* + // the copy constructor of tensor descriptor doesn't take address_space(4) + const auto a_k_m_global_desc = + *reinterpret_cast((const void*)p_a_k_m_global_desc); + const auto b_k_n_global_desc = + *reinterpret_cast((const void*)p_b_k_n_global_desc); + const auto c_m0_m1_n0_n1_global_desc = + *reinterpret_cast((const void*)p_c_m0_m1_n0_n1_global_desc); + + GridwiseGemm{}.Run(a_k_m_global_desc, + p_a_global, + b_k_n_global_desc, + p_b_global, + c_m0_m1_n0_n1_global_desc, + p_c_global, + integral_constant{}, + integral_constant{}); +} +#endif + template __device__ void Run(const AGlobalDesc& a_k_m_global_desc, const FloatAB* __restrict__ p_a_global, @@ -452,57 +492,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1 integral_constant{}, integral_constant{}); } - - // pass tensor descriptors by pointers - template - __device__ void Run(const AGlobalDesc* p_a_k_m_global_desc, - const FloatAB* __restrict__ p_a_global, - const BGlobalDesc* p_b_k_n_global_desc, - const FloatAB* __restrict__ p_b_global, - const CGlobalDesc* p_c_m0_m1_n0_n1_global_desc, - FloatC* __restrict__ p_c_global, - integral_constant, - integral_constant) const - { - const auto a_k_m_global_desc = *p_a_k_m_global_desc; - const auto b_k_n_global_desc = *p_b_k_n_global_desc; - const auto c_m0_m1_n0_n1_global_desc = *p_c_m0_m1_n0_n1_global_desc; - - Run(a_k_m_global_desc, - p_a_global, - b_k_n_global_desc, - p_b_global, - c_m0_m1_n0_n1_global_desc, - p_c_global, - integral_constant{}, - integral_constant{}); - } - - // pass tensor descriptors by void* - template - __device__ void Run(const void* p_a_k_m_global_desc, - const FloatAB* __restrict__ p_a_global, - const void* p_b_k_n_global_desc, - const FloatAB* __restrict__ p_b_global, - const void* p_c_m0_m1_n0_n1_global_desc, - FloatC* __restrict__ p_c_global, - integral_constant, - integral_constant) const - { - const auto a_k_m_global_desc = *reinterpret_cast(p_a_k_m_global_desc); - const auto b_k_n_global_desc = *reinterpret_cast(p_b_k_n_global_desc); - const auto c_m0_m1_n0_n1_global_desc = - *reinterpret_cast(p_c_m0_m1_n0_n1_global_desc); - - Run(a_k_m_global_desc, - p_a_global, - b_k_n_global_desc, - p_b_global, - c_m0_m1_n0_n1_global_desc, - p_c_global, - integral_constant{}, - integral_constant{}); - } }; } // namespace ck diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in index 09e2aaba80..0f8388d09f 100644 --- a/composable_kernel/include/utility/config.amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in @@ -7,6 +7,9 @@ #endif #include "bfloat16_dev.hpp" +// address space for kernel parameter +#define __CONSTANT__ __attribute__((address_space(4))) + // device backend #define CK_DEVICE_BACKEND_AMD 1 @@ -108,9 +111,8 @@ #define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0 #endif -// pass tensor descriptor by value, pointer or void* +// pass tensor descriptor by value or void* #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1 -#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER 0 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0 // hack: have underlying assumption that need to be satsified, otherwise it's a bug