diff --git a/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_fwd.cpp b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_fwd.cpp index a78d0fd0a9..65b6426003 100644 --- a/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_fwd.cpp +++ b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_fwd.cpp @@ -85,6 +85,8 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[]) return 1; } + std::cout << argv[0] << " " << argv[1] << " " << argv[2] << " " << argv[3] << " " << argv[4] << std::endl; + const auto data_type = static_cast(std::stoi(argv[2])); const auto layout = static_cast(std::stoi(argv[3])); const bool do_verification = std::stoi(argv[4]); @@ -93,8 +95,8 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[]) const bool time_kernel = std::stoi(argv[7]); const int num_dim_spatial = std::stoi(argv[8]); - // 7 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial - const int expected_num_args = 7 + 1 + 4 + 6 * num_dim_spatial + 1; + // program name, 7 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial + const int expected_num_args = 1 + 7 + 1 + 4 + 6 * num_dim_spatial; if(argc != expected_num_args) { std::cout << "Received " << argc << " args"<< std::endl; diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_fp16_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_fp16_instances.hpp index c73966a57d..1c17005acf 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_fp16_instances.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_fp16_instances.hpp @@ -32,9 +32,41 @@ using tile_grouped_conv_fwd_fp16_instances = std::tuple< //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| Conv| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| //##############################| Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| + +// These instances do not compile on Navi4x cards +#if defined(__gfx9__) +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +#endif + GroupedConvolutionForwardInvoker, -GroupedConvolutionForwardInvoker -// clang-format on +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker + // clang-format on >; diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_int8_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_int8_instances.hpp index b06344ec23..b8cf73221a 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_int8_instances.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_int8_instances.hpp @@ -34,38 +34,39 @@ using tile_grouped_conv_fwd_int8_instances = std::tuple< //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| Conv| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| //##############################| Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| -GroupedConvolutionForwardInvoker, -GroupedConvolutionForwardInvoker -// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_2x -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, -// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_part2 - Instance 1 -// GroupedConvolutionForwardInvoker, -// GroupedConvolutionForwardInvoker, +// These instances do not compile on Navi cards for FP16 due to high memory usage +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, + +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker + + +// GroupedConvolutionForwardInvoker, // GroupedConvolutionForwardInvoker, // GroupedConvolutionForwardInvoker, // GroupedConvolutionForwardInvoker, diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index e1cbea6c2c..935deae070 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -97,8 +97,8 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[1 list(APPEND PROFILER_OPS profile_grouped_conv_fwd.cpp) # list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_clamp.cpp) # list(APPEND PROFILER_OPS profile_grouped_conv_fwd_clamp.cpp) - list(APPEND PROFILER_OPS profile_grouped_conv_bwd_data.cpp) - list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp) + # list(APPEND PROFILER_OPS profile_grouped_conv_bwd_data.cpp) + # list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp) # list(APPEND PROFILER_OPS profile_gemm_multi_abd.cpp) # if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) # list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp) @@ -114,7 +114,7 @@ endif() if(DL_KERNELS) #list(APPEND PROFILER_OPS profile_batched_gemm_multi_d.cpp) - list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp) + #list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp) endif() # if(CK_ENABLE_INT8) @@ -208,9 +208,9 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[1 # list(APPEND DEVICE_INSTANCES device_conv1d_bwd_data_instance) # list(APPEND DEVICE_INSTANCES device_conv3d_bwd_data_instance) # list(APPEND DEVICE_INSTANCES device_conv2d_bwd_data_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance) - list(APPEND DEVICE_INSTANCES device_grouped_convnd_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_convnd_bwd_weight_instance) # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convscale_instance) # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convinvscale_instance) # list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_clamp_instance) @@ -233,10 +233,10 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[1 # list(APPEND DEVICE_INSTANCES device_gemm_b_scale_instance) # list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_data_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_data_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_data_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_data_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance) # list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance) # list(APPEND DEVICE_INSTANCES device_gemm_multi_abd_instance) # if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) @@ -253,9 +253,9 @@ endif() if(DL_KERNELS) #list(APPEND DEVICE_INSTANCES device_batched_gemm_multi_d_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance) endif() # if(CK_ENABLE_INT8) diff --git a/profiler/src/profile_grouped_conv_fwd.cpp b/profiler/src/profile_grouped_conv_fwd.cpp index 3e69ace3ea..950518acd2 100644 --- a/profiler/src/profile_grouped_conv_fwd.cpp +++ b/profiler/src/profile_grouped_conv_fwd.cpp @@ -89,9 +89,12 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) const bool time_kernel = std::stoi(argv[7]); const int num_dim_spatial = std::stoi(argv[8]); - // 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial - if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1) + // program name, 7 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial + const int expected_num_args = 1 + 7 + 1 + 4 + 6 * num_dim_spatial; + if(argc != expected_num_args) { + std::cout << "Received " << argc << " arguments." << std::endl; + std::cout << "Expected " << expected_num_args << " arguments." << std::endl; print_helper_msg(); return 1; } diff --git a/script/navi4x_int8_test_shapes.txt b/script/navi4x_int8_test_shapes.txt index f7885f5d46..1492955a37 100644 --- a/script/navi4x_int8_test_shapes.txt +++ b/script/navi4x_int8_test_shapes.txt @@ -4,4 +4,4 @@ grouped_conv_fwd 1 1 0 1 0 1 INT8 op data_type layout verify init_method print time_kernel spat_dim G N K C Y X Hi Wi Sy Sx Dy Dx Left_pad_y Left_pad_x Right_pad_y Right_pad_x -grouped_conv_fwd 3 1 0 1 0 1 2 1 1 32 32 3 3 800 1280 1 1 1 1 1 1 1 1 \ No newline at end of file +grouped_conv_fwd 3 1 0 1 0 1 2 1 1 32 32 3 3 800 1280 1 1 1 1 1 1 1 1