mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
* unify pipeline signature with existing example * iwyu * move stuff around in load-tile-transpose * cleanups in batched transpose pipeline * comments * use same inputs size * cleaner printf * print host args * use 64 block sides in the 37_transpose example * roll back grid dimension size adjustment for 37_transpose example * transpose grid for 37_transpose to unify with 35_batched_transpose * unify grid computation logic * make policy methods device only (since they are used only on device from the pipeline) * more host/device attribute cleanups * copy over problem * move over pipeline and policy * add switch to batched transpose api * make the lds problem more similar to original problem * factor out logic into traits * factor out conditional compilation into trait parameter * propagate pipeline to args * unhardcode pipeline dispatch parameter * refactor vector size * put warp tile out of dispatch * rename template parameter for trait * rewrite vector size in terms of problem * mark policy-internal struct variable as device * factor out input distribution and thread access pattern from policies * reword vector size * use datatype across batched transpose pipelines, problems and kernel * remove transpose traits from lds pipeline * add padding to the lds pipeline *interface* * add comment * remove ck_tile example #37 * update cmakelists * add test for new pipeline * update batched transpose test * roll back load_tile_transpose changes * remove comments * pack dispatch parameters into a config * padM can be enabled * adjust lds vector size to enable padding along N * update test * clean up logic * swap m/n input vector size * adjust perf test script * sweep over C/W in perf test * count both read and written bytes into bandwidth (x2 the number) * clang-format * widen size range for perf test * remove 64k x 64k case; it's too large for index * remove thread tile from dispatch * Solve merge conflict * fix compile * modify the transpose * solve the test error and clang 
format * Add v3 support for Grouped fwd conv+bias+clamp & ckProfiler (#2463) * Add logging to IsSupported. * Less casting in AddClamp * Conv+bias+clamp instances & profiler BF16 * Fix 3D instances & run just 1x for verification. * :Run just once for verification conv fwd. * ckProfiler conv fwd clampwq * Remove exec bit & formatting * Add support for MultiD for grouped conv fwd v3. * Enable 2Lds. * clean * align instances * align instances * profiler fixes * Fixes * fix * fix --------- Co-authored-by: Adam Osewski <root@quanta-ccs-aus-f01-19.cs-aus.dcgpu> Co-authored-by: Bartłomiej Kocot <barkocot@amd.com> * Fixing 0ms and inf GB/s issue in img2col (#2565) issue : ==== ``` sh $ bin/tile_example_img2col Perf: 0 ms, inf GB/s ``` solution : ====== Problem occurred because config.time_kernel is false by default. If false, then no need to calculate perf, just print proper message `image_to_coloumn: pass, No Perf generated due to config.time_kernel=0` * merge with develop * solve clang format --------- Co-authored-by: ThomasNing <thomas.ning@amd.com> Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> Co-authored-by: Adam Osewski <root@quanta-ccs-aus-f01-19.cs-aus.dcgpu> Co-authored-by: Bartłomiej Kocot <barkocot@amd.com> Co-authored-by: rahjain-amd <Rahul.Jain@amd.com>
64 lines
4.1 KiB
C++
64 lines
4.1 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include "ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1_default_policy.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_custom_policy.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_default_policy.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_custom_policy.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_custom_policy.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_default_policy.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2r1.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_custom_policy.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_custom_policy.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_gemm_problem.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_wp_asmem_bsmem_creg_v1.hpp"
|
|
#include "ck_tile/ops/gemm/block/block_wp_asmem_bsmem_creg_v1_custom_policy.hpp"
|
|
#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
|
|
#include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
|
|
#include "ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp"
|
|
#include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
|
|
#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
|
|
#include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5_default_policy.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v1.hpp"
|
|
#include "ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v1_policy.hpp"
|
|
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
|
|
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp"
|
|
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp"
|
|
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp"
|
|
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp"
|
|
#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
|
|
#include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"
|
|
#include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp"
|
|
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
|
|
#include "ck_tile/ops/common/tensor_layout.hpp"
|
|
#include "ck_tile/ops/common/utils.hpp"
|