Compile CK for all targets (#188)

* compile ck for all targets * update the target criteria * change the target condition * fixed some typos * fixed missed file * revert changes in README * revert device_conv3d_fwd_xdl_... * update device_conv3d_fwd_xdl_... * update device_batched_gemm_reduce... * test the unused arguments fix * test the warning suppression * try suppress warnings in device_batched_gemm_reduce_xdl... * fix the last warnings * replace UNUSED with std::ignore * fix a typo * replaced std::ignore with ignore * add igonre header to common_header * refactor atomicAdd Co-authored-by: Chao Liu <chao.liu2@amd.com>
2026-05-02 20:51:23 +00:00 · 2022-04-15 12:17:28 -07:00
parent ac0d806650
commit 4221505d3e
17 changed files with 232 additions and 77 deletions
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -54,6 +54,7 @@ __global__ void
            const ComputeBasePrtOfBatch compute_base_ptr_of_batch_,
            const Block2CTileMap block_2_ctile_map)
 {
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
@@ -88,6 +89,25 @@ __global__ void
                                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
                                                   d_grid_desc_mblock_mperblock,
                                                   block_2_ctile_map);
+#else
+    ignore = p_a_grid;
+    ignore = p_b_grid;
+    ignore = p_c_grid;
+    ignore = p_d0_grid;
+    ignore = p_d1_grid;
+    ignore = batch_count;
+    ignore = a_element_op;
+    ignore = b_element_op;
+    ignore = c_element_op;
+    ignore = d0_reduce_op;
+    ignore = d1_reduce_op;
+    ignore = a_grid_desc_ak0_m_ak1;
+    ignore = b_grid_desc_bk0_n_bk1;
+    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = d_grid_desc_mblock_mperblock;
+    ignore = compute_base_ptr_of_batch_;
+    ignore = block_2_ctile_map;
+#endif // end of if defined (defined(__gfx908__) || defined(__gfx90a__))
 }

 template <typename ALayout,
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
@@ -46,6 +46,7 @@ __global__ void
            const ComputeBasePrtOfBatch compute_base_ptr_of_batch_,
            const Block2CTileMap block_2_ctile_map)
 {
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
@@ -70,6 +71,20 @@ __global__ void
                                                  b_element_op,
                                                  c_element_op,
                                                  block_2_ctile_map);
+#else
+    ignore = p_a_grid;
+    ignore = p_b_grid;
+    ignore = p_c_grid;
+    ignore = batch_count;
+    ignore = a_grid_desc_k0_m_k1;
+    ignore = b_grid_desc_k0_n_k1;
+    ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2;
+    ignore = a_element_op;
+    ignore = b_element_op;
+    ignore = c_element_op;
+    ignore = compute_base_ptr_of_batch_;
+    ignore = block_2_ctile_map;
+#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

 template <typename ADataType,
--- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -49,6 +49,7 @@ __global__ void
            const CElementwiseOperation c_element_op,
            const Block2CTileMap block_2_ctile_map)
 {
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / num_batches);
    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
@@ -73,6 +74,23 @@ __global__ void
                                                  b_element_op,
                                                  c_element_op,
                                                  block_2_ctile_map);
+
+#else
+    ignore = p_a_grid;
+    ignore = p_b_grid;
+    ignore = p_c_grid;
+    ignore = num_batches;
+    ignore = a_batch_stride;
+    ignore = b_batch_stride;
+    ignore = c_batch_stride;
+    ignore = a_grid_desc_k0_m_k1;
+    ignore = b_grid_desc_k0_n_k1;
+    ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2;
+    ignore = a_element_op;
+    ignore = b_element_op;
+    ignore = c_element_op;
+    ignore = block_2_ctile_map;
+#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

 // specialization for #D conv: in[n, di, hi, wi, c] * wei[k, z, y, x, c] = out[n, do, ho, wo, k]