From 6f7d1272678cbadbb0702874e99bfc705dd54f45 Mon Sep 17 00:00:00 2001 From: shengnxu Date: Sun, 5 Jan 2025 10:31:17 +0000 Subject: [PATCH] changed all the scale outside except for uq --- .../flatmm_32x512x256_1x4x1_16x16x64_int8.hpp | 24 +- ...atmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp | 35 +- ..._gfx9_32x256x512_1x4x1_16x16x32_int8_1.inc | 168 ++-- ..._gfx9_32x256x512_1x4x1_16x16x32_int8_2.inc | 902 +++++++++--------- ...uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc | 286 +----- .../fused_moegemm_pipeline_flatmm_uk_int8.hpp | 90 +- 6 files changed, 671 insertions(+), 834 deletions(-) diff --git a/include/ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp b/include/ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp index 58bf237972..3cbb8d9127 100644 --- a/include/ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp +++ b/include/ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp @@ -245,13 +245,10 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16 // TODO: need paired with tile_window_linear! // TODO: need call init_raw() before call this function! - template + template CK_TILE_DEVICE auto - operator()( const AToken_id& row_ids_a_, - const AQRes& res_aq, - const DQRes& res_dq, - const GQRes& res_gq, - const SMQRes& res_smq, + operator()( const Ascale& a_scale_, + const GQscale& gq_scale_, const ARes& res_a, const ACoords& cached_coords_a, const BRes& res_b, @@ -263,7 +260,6 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16 { static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 4 /*2x per dword*/); // 8 static_assert(BCoords::size() == Repeat_N); - static_assert(AToken_id::size() == Repeat_M); static_assert(Ascale::size() == Repeat_M); auto a_sst = make_tile_window( @@ -372,10 +368,6 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16 register int v_z61 asm("v189") = 0; register int v_z62 asm("v190") = 0; register int v_z63 asm("v191") = 0; - - index_t temp0 = static_cast(row_ids_a_[number<0>{}]); - index_t temp1 = static_cast(row_ids_a_[number<1>{}]); - // B nr->kr #pragma clang diagnostic push #pragma clang diagnostic ignored "-Winline-asm" @@ -449,13 +441,11 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16 [c61]"+v"(v_z61), [c62]"+v"(v_z62), [c63]"+v"(v_z63), - [v_token_id0]"+v"(temp0), - [v_token_id1]"+v"(temp1), [s_mem_]"+r"(smem) - : [s_res_aq]"s"(res_aq), - [s_res_dq]"s"(res_dq), - [s_res_gq]"s"(res_gq), - [s_res_smq]"s"(res_smq), + : [a_scale0]"v"(a_scale_[0]), + [a_scale1]"v"(a_scale_[1]), + [gq_scale0]"v"(gq_scale_[0]), + [gq_scale1]"v"(gq_scale_[1]), [s_res_a]"s"(res_a), // [s_res_a1]"s"(res_a[1]), // [s_res_a2]"s"(res_a[2]), diff --git a/include/ck_tile/ops/flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp b/include/ck_tile/ops/flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp index e941fdf041..dfbb59dd38 100644 --- a/include/ck_tile/ops/flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp +++ b/include/ck_tile/ops/flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp @@ -80,21 +80,25 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x // template template - // typename ScaleTensor> + typename OFlags, + typename ScaleTensor, + typename YScaleTensor> CK_TILE_DEVICE auto operator()(const DQRes& res_dq, const BRes& res_b, + const DQCoords& cached_coords_dq, const BCoords& cached_coords_b, const ORes& res_o, const OCoords& cached_coords_o, const OFlags& o_flags, // this should be in sgpr CK_TILE_LDS_ADDR void* smem, index_t n, // loop along n dim - // const ScaleTensor& scale_, + const ScaleTensor& scale_, + const YScaleTensor& smq_scale_, index_t tile_offset_dq, index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust index_t tile_offset_half_b, //splited load alone K in to 2 part @@ -108,9 +112,9 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType); const index_t tile_stride_dq_bytes = tile_offset_dq * sizeof(DScaleDataType); - // static_assert(ScaleTensor::size() == 2); - // float s0 = scale_[number<0>{}]; - // float s1 = scale_[number<1>{}]; + static_assert(ScaleTensor::size() == 2); + float s0 = scale_[number<0>{}]; + float s1 = scale_[number<1>{}]; index_t loop_cnt = n ; @@ -220,8 +224,10 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x // [v_sld_y_os]"v"(sld_y_os), // [v_sfl_sld]"v"(sfl_sld), // [v_sfl_sst]"v"(sfl_sst), + [smq_scale0]"s"(smq_scale_[0]), + [smq_scale1]"s"(smq_scale_[1]), [s_res_dq]"s"(res_dq), - [s_res_o0]"s"(res_o[0]), + [s_res_o0]"s"(res_o[0]), [s_res_o1]"s"(res_o[1]), //[s_res_o2]"s"(res_o[2]), //[s_res_o3]"s"(res_o[3]), @@ -229,6 +235,7 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x // [s_res_b1]"s"(res_b[1]), // [s_res_b2]"s"(res_b[2]), // [s_res_b3]"s"(res_b[3]), + [v_os_dq]"v"(static_cast(cached_coords_dq * sizeof(DScaleDataType))), [v_os_o0]"v"(static_cast(cached_coords_o[number<0>{}] * sizeof(ODataType))), [v_os_o1]"v"(static_cast(cached_coords_o[number<1>{}] * sizeof(ODataType))), [v_os_o2]"v"(static_cast(cached_coords_o[number<2>{}] * sizeof(ODataType))), @@ -293,8 +300,7 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x "s55", "s56", "s57", "s58", "s59", "s60", "s61", "s62", "s63", "s64", "s65", "s66", "s67", "s68", "s69", "s70", "s71", "s72", "s73", "s74", "s75", "s76", "s77", "s78", "s79", "s80", // s86 as tmp - "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v1", "v2", "v3", "v4", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v50", "v51", "v52", "v53", "v54", "v55", "v56", "v57", "v64", "v65", "v66", "v67", "v68", "v69", "v70", "v71", "v72", "v73", @@ -366,8 +372,8 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x // [v_sfl_sld]"v"(sfl_sld), // [v_sfl_sst]"v"(sfl_sst), [s_res_dq]"s"(res_dq), - [s_res_o0]"s"(res_o[0]), - [s_res_o1]"s"(res_o[1]), + [s_res_o0]"s"(res_o[0]), + [s_res_o1]"s"(res_o[1]), //[s_res_o2]"s"(res_o[2]), //[s_res_o3]"s"(res_o[3]), [s_res_d]"s"(res_b), @@ -390,8 +396,8 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x [s_tile_os_b_half]"s"(tile_offset_half_b_bytes), [s_tile_os_b]"s"(tile_stride_b_bytes), [s_tile_os_dq]"s"(tile_stride_dq_bytes), - // [scale_0]"v"(s0), - // [scale_1]"v"(s1), + [scale_0]"v"(s0), + [scale_1]"v"(s1), // [v_nan_lo]"v"(nan_lo), // [v_nan_hi]"v"(nan_hi), [s_execflag_0]"s"(o_flags[number<0>{}]), @@ -438,8 +444,7 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x "s55", "s56", "s57", "s58", "s59", "s60", "s61", "s62", "s63", "s64", "s65", "s66", "s67", "s68", "s69", "s70", "s71", "s72", "s73", "s74", "s75", "s76", "s77", "s78", "s79", "s80", // s86 as tmp - "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v1", "v2", "v3", "v4", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v50", "v51", "v52", "v53", "v54", "v55", "v56", "v57", "v64", "v65", "v66", "v67", "v68", "v69", "v70", "v71", "v72", "v73", diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_1.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_1.inc index 3f8af1841d..cbf7116f70 100644 --- a/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_1.inc +++ b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_1.inc @@ -27,8 +27,12 @@ # define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16" #endif +" v_and_b32 v0, 0x3f, v0 \n" +" v_lshrrev_b32 v3, 6, v0 \n" +" v_readfirstlane_b32 s7, v3 \n" " s_waitcnt vmcnt(24) \n" " buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_d], 0 offen\n" +" buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n" " v_mul_f32 v54, v128, v128 \n" " v_mul_f32 v55, v129, v129 \n" " v_mul_f32 v56, v130, v130 \n" @@ -49,7 +53,6 @@ " v_exp_f32 v55, v55 \n" " v_exp_f32 v56, v56 \n" " v_exp_f32 v57, v57 \n" -" buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n" " v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n" @@ -577,71 +580,71 @@ " v_mul_f32 v189, v189, v55 \n" " v_mul_f32 v190, v190, v56 \n" " v_mul_f32 v191, v191, v57 \n" -" v_mul_f32 v128, v18, v128 row_newbcast:0 \n" -" v_mul_f32 v129, v18, v129 row_newbcast:1 \n" -" v_mul_f32 v130, v18, v130 row_newbcast:2 \n" -" v_mul_f32 v131, v18, v131 row_newbcast:3 \n" -" v_mul_f32 v132, v18, v132 row_newbcast:0 \n" -" v_mul_f32 v133, v18, v133 row_newbcast:1 \n" -" v_mul_f32 v134, v18, v134 row_newbcast:2 \n" -" v_mul_f32 v135, v18, v135 row_newbcast:3 \n" -" v_mul_f32 v136, v18, v136 row_newbcast:4 \n" -" v_mul_f32 v137, v18, v137 row_newbcast:5 \n" -" v_mul_f32 v138, v18, v138 row_newbcast:6 \n" -" v_mul_f32 v139, v18, v139 row_newbcast:7 \n" -" v_mul_f32 v140, v18, v140 row_newbcast:4 \n" -" v_mul_f32 v141, v18, v141 row_newbcast:5 \n" -" v_mul_f32 v142, v18, v142 row_newbcast:6 \n" -" v_mul_f32 v143, v18, v143 row_newbcast:7 \n" -" v_mul_f32 v144, v18, v144 row_newbcast:8 \n" -" v_mul_f32 v145, v18, v145 row_newbcast:9 \n" -" v_mul_f32 v146, v18, v146 row_newbcast:10 \n" -" v_mul_f32 v147, v18, v147 row_newbcast:11 \n" -" v_mul_f32 v148, v18, v148 row_newbcast:8 \n" -" v_mul_f32 v149, v18, v149 row_newbcast:9 \n" -" v_mul_f32 v150, v18, v150 row_newbcast:10 \n" -" v_mul_f32 v151, v18, v151 row_newbcast:11 \n" -" v_mul_f32 v152, v18, v152 row_newbcast:12 \n" -" v_mul_f32 v153, v18, v153 row_newbcast:13 \n" -" v_mul_f32 v154, v18, v154 row_newbcast:14 \n" -" v_mul_f32 v155, v18, v155 row_newbcast:15 \n" -" v_mul_f32 v156, v18, v156 row_newbcast:12 \n" -" v_mul_f32 v157, v18, v157 row_newbcast:13 \n" -" v_mul_f32 v158, v18, v158 row_newbcast:14 \n" -" v_mul_f32 v159, v18, v159 row_newbcast:15 \n" -" v_mul_f32 v160, v19, v160 row_newbcast:0 \n" -" v_mul_f32 v161, v19, v161 row_newbcast:1 \n" -" v_mul_f32 v162, v19, v162 row_newbcast:2 \n" -" v_mul_f32 v163, v19, v163 row_newbcast:3 \n" -" v_mul_f32 v164, v19, v164 row_newbcast:0 \n" -" v_mul_f32 v165, v19, v165 row_newbcast:1 \n" -" v_mul_f32 v166, v19, v166 row_newbcast:2 \n" -" v_mul_f32 v167, v19, v167 row_newbcast:3 \n" -" v_mul_f32 v168, v19, v168 row_newbcast:4 \n" -" v_mul_f32 v169, v19, v169 row_newbcast:5 \n" -" v_mul_f32 v170, v19, v170 row_newbcast:6 \n" -" v_mul_f32 v171, v19, v171 row_newbcast:7 \n" -" v_mul_f32 v172, v19, v172 row_newbcast:4 \n" -" v_mul_f32 v173, v19, v173 row_newbcast:5 \n" -" v_mul_f32 v174, v19, v174 row_newbcast:6 \n" -" v_mul_f32 v175, v19, v175 row_newbcast:7 \n" -" v_mul_f32 v176, v19, v176 row_newbcast:8 \n" -" v_mul_f32 v177, v19, v177 row_newbcast:9 \n" -" v_mul_f32 v178, v19, v178 row_newbcast:10 \n" -" v_mul_f32 v179, v19, v179 row_newbcast:11 \n" -" v_mul_f32 v180, v19, v180 row_newbcast:8 \n" -" v_mul_f32 v181, v19, v181 row_newbcast:9 \n" -" v_mul_f32 v182, v19, v182 row_newbcast:10 \n" -" v_mul_f32 v183, v19, v183 row_newbcast:11 \n" -" v_mul_f32 v184, v19, v184 row_newbcast:12 \n" -" v_mul_f32 v185, v19, v185 row_newbcast:13 \n" -" v_mul_f32 v186, v19, v186 row_newbcast:14 \n" -" v_mul_f32 v187, v19, v187 row_newbcast:15 \n" -" v_mul_f32 v188, v19, v188 row_newbcast:12 \n" -" v_mul_f32 v189, v19, v189 row_newbcast:13 \n" -" v_mul_f32 v190, v19, v190 row_newbcast:14 \n" -" v_mul_f32 v191, v19, v191 row_newbcast:15 \n" -" buffer_load_dword v12, v5, %[s_res_dq], 0 offen \n" +" v_mul_f32 v128, %[smq_scale0], v128 row_newbcast:0 \n" +" v_mul_f32 v129, %[smq_scale0], v129 row_newbcast:1 \n" +" v_mul_f32 v130, %[smq_scale0], v130 row_newbcast:2 \n" +" v_mul_f32 v131, %[smq_scale0], v131 row_newbcast:3 \n" +" v_mul_f32 v132, %[smq_scale0], v132 row_newbcast:0 \n" +" v_mul_f32 v133, %[smq_scale0], v133 row_newbcast:1 \n" +" v_mul_f32 v134, %[smq_scale0], v134 row_newbcast:2 \n" +" v_mul_f32 v135, %[smq_scale0], v135 row_newbcast:3 \n" +" v_mul_f32 v136, %[smq_scale0], v136 row_newbcast:4 \n" +" v_mul_f32 v137, %[smq_scale0], v137 row_newbcast:5 \n" +" v_mul_f32 v138, %[smq_scale0], v138 row_newbcast:6 \n" +" v_mul_f32 v139, %[smq_scale0], v139 row_newbcast:7 \n" +" v_mul_f32 v140, %[smq_scale0], v140 row_newbcast:4 \n" +" v_mul_f32 v141, %[smq_scale0], v141 row_newbcast:5 \n" +" v_mul_f32 v142, %[smq_scale0], v142 row_newbcast:6 \n" +" v_mul_f32 v143, %[smq_scale0], v143 row_newbcast:7 \n" +" v_mul_f32 v144, %[smq_scale0], v144 row_newbcast:8 \n" +" v_mul_f32 v145, %[smq_scale0], v145 row_newbcast:9 \n" +" v_mul_f32 v146, %[smq_scale0], v146 row_newbcast:10 \n" +" v_mul_f32 v147, %[smq_scale0], v147 row_newbcast:11 \n" +" v_mul_f32 v148, %[smq_scale0], v148 row_newbcast:8 \n" +" v_mul_f32 v149, %[smq_scale0], v149 row_newbcast:9 \n" +" v_mul_f32 v150, %[smq_scale0], v150 row_newbcast:10 \n" +" v_mul_f32 v151, %[smq_scale0], v151 row_newbcast:11 \n" +" v_mul_f32 v152, %[smq_scale0], v152 row_newbcast:12 \n" +" v_mul_f32 v153, %[smq_scale0], v153 row_newbcast:13 \n" +" v_mul_f32 v154, %[smq_scale0], v154 row_newbcast:14 \n" +" v_mul_f32 v155, %[smq_scale0], v155 row_newbcast:15 \n" +" v_mul_f32 v156, %[smq_scale0], v156 row_newbcast:12 \n" +" v_mul_f32 v157, %[smq_scale0], v157 row_newbcast:13 \n" +" v_mul_f32 v158, %[smq_scale0], v158 row_newbcast:14 \n" +" v_mul_f32 v159, %[smq_scale0], v159 row_newbcast:15 \n" +" v_mul_f32 v160, %[smq_scale1], v160 row_newbcast:0 \n" +" v_mul_f32 v161, %[smq_scale1], v161 row_newbcast:1 \n" +" v_mul_f32 v162, %[smq_scale1], v162 row_newbcast:2 \n" +" v_mul_f32 v163, %[smq_scale1], v163 row_newbcast:3 \n" +" v_mul_f32 v164, %[smq_scale1], v164 row_newbcast:0 \n" +" v_mul_f32 v165, %[smq_scale1], v165 row_newbcast:1 \n" +" v_mul_f32 v166, %[smq_scale1], v166 row_newbcast:2 \n" +" v_mul_f32 v167, %[smq_scale1], v167 row_newbcast:3 \n" +" v_mul_f32 v168, %[smq_scale1], v168 row_newbcast:4 \n" +" v_mul_f32 v169, %[smq_scale1], v169 row_newbcast:5 \n" +" v_mul_f32 v170, %[smq_scale1], v170 row_newbcast:6 \n" +" v_mul_f32 v171, %[smq_scale1], v171 row_newbcast:7 \n" +" v_mul_f32 v172, %[smq_scale1], v172 row_newbcast:4 \n" +" v_mul_f32 v173, %[smq_scale1], v173 row_newbcast:5 \n" +" v_mul_f32 v174, %[smq_scale1], v174 row_newbcast:6 \n" +" v_mul_f32 v175, %[smq_scale1], v175 row_newbcast:7 \n" +" v_mul_f32 v176, %[smq_scale1], v176 row_newbcast:8 \n" +" v_mul_f32 v177, %[smq_scale1], v177 row_newbcast:9 \n" +" v_mul_f32 v178, %[smq_scale1], v178 row_newbcast:10 \n" +" v_mul_f32 v179, %[smq_scale1], v179 row_newbcast:11 \n" +" v_mul_f32 v180, %[smq_scale1], v180 row_newbcast:8 \n" +" v_mul_f32 v181, %[smq_scale1], v181 row_newbcast:9 \n" +" v_mul_f32 v182, %[smq_scale1], v182 row_newbcast:10 \n" +" v_mul_f32 v183, %[smq_scale1], v183 row_newbcast:11 \n" +" v_mul_f32 v184, %[smq_scale1], v184 row_newbcast:12 \n" +" v_mul_f32 v185, %[smq_scale1], v185 row_newbcast:13 \n" +" v_mul_f32 v186, %[smq_scale1], v186 row_newbcast:14 \n" +" v_mul_f32 v187, %[smq_scale1], v187 row_newbcast:15 \n" +" v_mul_f32 v188, %[smq_scale1], v188 row_newbcast:12 \n" +" v_mul_f32 v189, %[smq_scale1], v189 row_newbcast:13 \n" +" v_mul_f32 v190, %[smq_scale1], v190 row_newbcast:14 \n" +" v_mul_f32 v191, %[smq_scale1], v191 row_newbcast:15 \n" +" buffer_load_dword v12, %[v_os_dq], %[s_res_dq], 0 offen \n" " v_mov_b32 v22, 0x358637bd \n" " v_mov_b32 v23, 0x358637bd \n" " v_max3_f32 v22, abs(v128), abs(v129), v22 \n" @@ -934,9 +937,42 @@ " v_lshlrev_b32 v54, 1, v54 \n" " v_add_u32 v55, v54, v55 \n" " v_lshlrev_b32 v54, 2, v55 \n" +" ds_read_b64 v[128:129], v54 offset:18688 \n" +" ds_read_b64 v[130:131], v54 offset:18816 \n" +" ds_read_b64 v[132:133], v54 offset:19712 \n" +" ds_read_b64 v[134:135], v54 offset:19840 \n" +" ds_read_b64 v[136:137], v54 offset:20736 \n" +" ds_read_b64 v[138:139], v54 offset:20864 \n" +" ds_read_b64 v[140:141], v54 offset:21760 \n" +" ds_read_b64 v[142:143], v54 offset:21888 \n" +" ds_read_b64 v[144:145], v54 offset:22784 \n" +" ds_read_b64 v[146:147], v54 offset:22912 \n" +" ds_read_b64 v[148:149], v54 offset:23808 \n" +" ds_read_b64 v[150:151], v54 offset:23936 \n" +" ds_read_b64 v[152:153], v54 offset:24832 \n" +" ds_read_b64 v[154:155], v54 offset:24960 \n" +" ds_read_b64 v[156:157], v54 offset:25856 \n" +" ds_read_b64 v[158:159], v54 offset:25984 \n" +" ds_read_b64 v[160:161], v54 offset:26880 \n" +" ds_read_b64 v[162:163], v54 offset:27008 \n" +" ds_read_b64 v[164:165], v54 offset:27904 \n" +" ds_read_b64 v[166:167], v54 offset:28032 \n" +" ds_read_b64 v[168:169], v54 offset:28928 \n" +" ds_read_b64 v[170:171], v54 offset:29056 \n" +" ds_read_b64 v[172:173], v54 offset:29952 \n" +" ds_read_b64 v[174:175], v54 offset:30080 \n" +" ds_read_b64 v[176:177], v54 offset:30976 \n" +" ds_read_b64 v[178:179], v54 offset:31104 \n" +" ds_read_b64 v[180:181], v54 offset:32000 \n" +" ds_read_b64 v[182:183], v54 offset:32128 \n" +" ds_read_b64 v[184:185], v54 offset:33024 \n" +" ds_read_b64 v[186:187], v54 offset:33152 \n" +" ds_read_b64 v[188:189], v54 offset:34048 \n" +" ds_read_b64 v[190:191], v54 offset:34176 \n" #undef _UK_MFMA_ #undef _UK_PK_CVT_ #undef _UK_ATOMIC_ADD_ + diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_2.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_2.inc index 390fbfbc23..8cb7d2a7a4 100644 --- a/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_2.inc +++ b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_2.inc @@ -27,222 +27,190 @@ # define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16" #endif -" ds_read_b64 v[128:129], v54 offset:18688 \n" -" ds_read_b64 v[130:131], v54 offset:18816 \n" -" ds_read_b64 v[132:133], v54 offset:19712 \n" -" ds_read_b64 v[134:135], v54 offset:19840 \n" -" ds_read_b64 v[136:137], v54 offset:20736 \n" -" ds_read_b64 v[138:139], v54 offset:20864 \n" -" ds_read_b64 v[140:141], v54 offset:21760 \n" -" ds_read_b64 v[142:143], v54 offset:21888 \n" -" ds_read_b64 v[144:145], v54 offset:22784 \n" -" ds_read_b64 v[146:147], v54 offset:22912 \n" -" ds_read_b64 v[148:149], v54 offset:23808 \n" -" ds_read_b64 v[150:151], v54 offset:23936 \n" -" ds_read_b64 v[152:153], v54 offset:24832 \n" -" ds_read_b64 v[154:155], v54 offset:24960 \n" -" ds_read_b64 v[156:157], v54 offset:25856 \n" -" ds_read_b64 v[158:159], v54 offset:25984 \n" -" ds_read_b64 v[160:161], v54 offset:26880 \n" -" ds_read_b64 v[162:163], v54 offset:27008 \n" -" ds_read_b64 v[164:165], v54 offset:27904 \n" -" ds_read_b64 v[166:167], v54 offset:28032 \n" -" ds_read_b64 v[168:169], v54 offset:28928 \n" -" ds_read_b64 v[170:171], v54 offset:29056 \n" -" ds_read_b64 v[172:173], v54 offset:29952 \n" -" ds_read_b64 v[174:175], v54 offset:30080 \n" -" ds_read_b64 v[176:177], v54 offset:30976 \n" -" ds_read_b64 v[178:179], v54 offset:31104 \n" -" ds_read_b64 v[180:181], v54 offset:32000 \n" -" ds_read_b64 v[182:183], v54 offset:32128 \n" -" ds_read_b64 v[184:185], v54 offset:33024 \n" -" ds_read_b64 v[186:187], v54 offset:33152 \n" -" ds_read_b64 v[188:189], v54 offset:34048 \n" -" ds_read_b64 v[190:191], v54 offset:34176 \n" -" s_add_u32 s12, %[s_tile_os_b], s12 \n" -" s_addc_u32 s13, 0, s13 \n" -" s_add_u32 s16, %[s_tile_os_dq], s16 \n" -" s_addc_u32 s17, 0, s17 \n" +" s_add_u32 %[s_res_d][0], %[s_tile_os_b], %[s_res_d][0] \n" +" s_addc_u32 %[s_res_d][1], 0, %[s_res_d][1] \n" +" s_add_u32 %[s_res_dq][0], %[s_tile_os_dq], %[s_res_dq][0] \n" +" s_addc_u32 %[s_res_dq][1], 0, %[s_res_dq][1] \n" " s_mov_b32 s80, 0 \n" " s_waitcnt 0x0000 \n" "label_startgemm2: \n" " s_waitcnt vmcnt(41) \n" " s_barrier \n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[0:1], v[128:129], 0\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[2:3], v[130:131], v[192:195]\n" -" buffer_load_dwordx4 acc[128:131], %[v_os_b0], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[4:5], v[132:133], v[192:195]\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[6:7], v[134:135], v[192:195]\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[8:9], v[136:137], v[192:195]\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[10:11], v[138:139], v[192:195]\n" -" buffer_load_dwordx4 acc[132:135], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[12:13], v[140:141], v[192:195]\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[14:15], v[142:143], v[192:195]\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[0:1], v[160:161], 0\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[2:3], v[162:163], v[196:199]\n" -" buffer_load_dwordx4 acc[136:139], %[v_os_b0], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[4:5], v[164:165], v[196:199]\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[6:7], v[166:167], v[196:199]\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[8:9], v[168:169], v[196:199]\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[10:11], v[170:171], v[196:199]\n" -" buffer_load_dwordx4 acc[140:143], %[v_os_b0], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[12:13], v[172:173], v[196:199]\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[14:15], v[174:175], v[196:199]\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[16:17], v[128:129], 0\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[18:19], v[130:131], v[200:203]\n" -" buffer_load_dwordx4 acc[144:147], %[v_os_b1], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[20:21], v[132:133], v[200:203]\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[22:23], v[134:135], v[200:203]\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[24:25], v[136:137], v[200:203]\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[26:27], v[138:139], v[200:203]\n" -" buffer_load_dwordx4 acc[148:151], %[v_os_b1], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[28:29], v[140:141], v[200:203]\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[30:31], v[142:143], v[200:203]\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[16:17], v[160:161], 0\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[18:19], v[162:163], v[204:207]\n" -" buffer_load_dwordx4 acc[152:155], %[v_os_b1], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[20:21], v[164:165], v[204:207]\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[22:23], v[166:167], v[204:207]\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[24:25], v[168:169], v[204:207]\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[26:27], v[170:171], v[204:207]\n" -" buffer_load_dwordx4 acc[156:159], %[v_os_b1], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[28:29], v[172:173], v[204:207]\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[30:31], v[174:175], v[204:207]\n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[0:1], v[128:129], 0 \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[2:3], v[130:131], v[192:195] \n" +" buffer_load_dwordx4 acc[128:131], %[v_os_b0], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[4:5], v[132:133], v[192:195] \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[6:7], v[134:135], v[192:195] \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[8:9], v[136:137], v[192:195] \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[10:11], v[138:139], v[192:195] \n" +" buffer_load_dwordx4 acc[132:135], %[v_os_b0], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[12:13], v[140:141], v[192:195] \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[14:15], v[142:143], v[192:195] \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[0:1], v[160:161], 0 \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[2:3], v[162:163], v[196:199] \n" +" buffer_load_dwordx4 acc[136:139], %[v_os_b0], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[4:5], v[164:165], v[196:199] \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[6:7], v[166:167], v[196:199] \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[8:9], v[168:169], v[196:199] \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[10:11], v[170:171], v[196:199] \n" +" buffer_load_dwordx4 acc[140:143], %[v_os_b0], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[12:13], v[172:173], v[196:199] \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[14:15], v[174:175], v[196:199] \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[16:17], v[128:129], 0 \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[18:19], v[130:131], v[200:203] \n" +" buffer_load_dwordx4 acc[144:147], %[v_os_b1], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[20:21], v[132:133], v[200:203] \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[22:23], v[134:135], v[200:203] \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[24:25], v[136:137], v[200:203] \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[26:27], v[138:139], v[200:203] \n" +" buffer_load_dwordx4 acc[148:151], %[v_os_b1], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[28:29], v[140:141], v[200:203] \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[30:31], v[142:143], v[200:203] \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[16:17], v[160:161], 0 \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[18:19], v[162:163], v[204:207] \n" +" buffer_load_dwordx4 acc[152:155], %[v_os_b1], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[20:21], v[164:165], v[204:207] \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[22:23], v[166:167], v[204:207] \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[24:25], v[168:169], v[204:207] \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[26:27], v[170:171], v[204:207] \n" +" buffer_load_dwordx4 acc[156:159], %[v_os_b1], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[28:29], v[172:173], v[204:207] \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[30:31], v[174:175], v[204:207] \n" " s_waitcnt vmcnt(41) \n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[32:33], v[128:129], 0\n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[34:35], v[130:131], v[208:211]\n" -" buffer_load_dwordx4 acc[160:163], %[v_os_b2], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[36:37], v[132:133], v[208:211]\n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[38:39], v[134:135], v[208:211]\n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[40:41], v[136:137], v[208:211]\n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[42:43], v[138:139], v[208:211]\n" -" buffer_load_dwordx4 acc[164:167], %[v_os_b2], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[44:45], v[140:141], v[208:211]\n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[46:47], v[142:143], v[208:211]\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[32:33], v[160:161], 0\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[34:35], v[162:163], v[212:215]\n" -" buffer_load_dwordx4 acc[168:171], %[v_os_b2], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[36:37], v[164:165], v[212:215]\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[38:39], v[166:167], v[212:215]\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[40:41], v[168:169], v[212:215]\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[42:43], v[170:171], v[212:215]\n" -" buffer_load_dwordx4 acc[172:175], %[v_os_b2], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[44:45], v[172:173], v[212:215]\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[46:47], v[174:175], v[212:215]\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[48:49], v[128:129], 0\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[50:51], v[130:131], v[216:219]\n" -" buffer_load_dwordx4 acc[176:179], %[v_os_b3], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[52:53], v[132:133], v[216:219]\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[54:55], v[134:135], v[216:219]\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[56:57], v[136:137], v[216:219]\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[58:59], v[138:139], v[216:219]\n" -" buffer_load_dwordx4 acc[180:183], %[v_os_b3], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[60:61], v[140:141], v[216:219]\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[62:63], v[142:143], v[216:219]\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[48:49], v[160:161], 0\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[50:51], v[162:163], v[220:223]\n" -" buffer_load_dwordx4 acc[184:187], %[v_os_b3], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[52:53], v[164:165], v[220:223]\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[54:55], v[166:167], v[220:223]\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[56:57], v[168:169], v[220:223]\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[58:59], v[170:171], v[220:223]\n" -" buffer_load_dwordx4 acc[188:191], %[v_os_b3], %[s_res_d], 0 offen offset:3072\n" -" s_add_u32 s12, %[s_tile_os_b_half], s12 \n" -" s_addc_u32 s13, 0, s13 \n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[60:61], v[172:173], v[220:223]\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[62:63], v[174:175], v[220:223]\n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[32:33], v[128:129], 0 \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[34:35], v[130:131], v[208:211] \n" +" buffer_load_dwordx4 acc[160:163], %[v_os_b2], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[36:37], v[132:133], v[208:211] \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[38:39], v[134:135], v[208:211] \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[40:41], v[136:137], v[208:211] \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[42:43], v[138:139], v[208:211] \n" +" buffer_load_dwordx4 acc[164:167], %[v_os_b2], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[44:45], v[140:141], v[208:211] \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[46:47], v[142:143], v[208:211] \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[32:33], v[160:161], 0 \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[34:35], v[162:163], v[212:215] \n" +" buffer_load_dwordx4 acc[168:171], %[v_os_b2], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[36:37], v[164:165], v[212:215] \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[38:39], v[166:167], v[212:215] \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[40:41], v[168:169], v[212:215] \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[42:43], v[170:171], v[212:215] \n" +" buffer_load_dwordx4 acc[172:175], %[v_os_b2], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[44:45], v[172:173], v[212:215] \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[46:47], v[174:175], v[212:215] \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[48:49], v[128:129], 0 \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[50:51], v[130:131], v[216:219] \n" +" buffer_load_dwordx4 acc[176:179], %[v_os_b3], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[52:53], v[132:133], v[216:219] \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[54:55], v[134:135], v[216:219] \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[56:57], v[136:137], v[216:219] \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[58:59], v[138:139], v[216:219] \n" +" buffer_load_dwordx4 acc[180:183], %[v_os_b3], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[60:61], v[140:141], v[216:219] \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[62:63], v[142:143], v[216:219] \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[48:49], v[160:161], 0 \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[50:51], v[162:163], v[220:223] \n" +" buffer_load_dwordx4 acc[184:187], %[v_os_b3], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[52:53], v[164:165], v[220:223] \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[54:55], v[166:167], v[220:223] \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[56:57], v[168:169], v[220:223] \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[58:59], v[170:171], v[220:223] \n" +" buffer_load_dwordx4 acc[188:191], %[v_os_b3], %[s_res_d], 0 offen offset:3072 \n" +" s_add_u32 %[s_res_d][0], %[s_tile_os_b_half], %[s_res_d][0] \n" +" s_addc_u32 %[s_res_d][1], 0, %[s_res_d][1] \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[60:61], v[172:173], v[220:223] \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[62:63], v[174:175], v[220:223] \n" " s_waitcnt vmcnt(41) \n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[64:65], v[144:145], v[192:195]\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[66:67], v[146:147], v[192:195]\n" -" buffer_load_dwordx4 acc[192:195], %[v_os_b0], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[68:69], v[148:149], v[192:195]\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[70:71], v[150:151], v[192:195]\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[72:73], v[152:153], v[192:195]\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[74:75], v[154:155], v[192:195]\n" -" buffer_load_dwordx4 acc[196:199], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[76:77], v[156:157], v[192:195]\n" -" v_mfma_i32_16x16x32_i8 v[192:195], acc[78:79], v[158:159], v[192:195]\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[64:65], v[176:177], v[196:199]\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[66:67], v[178:179], v[196:199]\n" -" buffer_load_dwordx4 acc[200:203], %[v_os_b0], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[68:69], v[180:181], v[196:199]\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[70:71], v[182:183], v[196:199]\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[72:73], v[184:185], v[196:199]\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[74:75], v[186:187], v[196:199]\n" -" buffer_load_dwordx4 acc[204:207], %[v_os_b0], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[76:77], v[188:189], v[196:199]\n" -" v_mfma_i32_16x16x32_i8 v[196:199], acc[78:79], v[190:191], v[196:199]\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[80:81], v[144:145], v[200:203]\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[82:83], v[146:147], v[200:203]\n" -" buffer_load_dwordx4 acc[208:211], %[v_os_b1], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[84:85], v[148:149], v[200:203]\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[86:87], v[150:151], v[200:203]\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[88:89], v[152:153], v[200:203]\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[90:91], v[154:155], v[200:203]\n" -" buffer_load_dwordx4 acc[212:215], %[v_os_b1], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[92:93], v[156:157], v[200:203]\n" -" v_mfma_i32_16x16x32_i8 v[200:203], acc[94:95], v[158:159], v[200:203]\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[80:81], v[176:177], v[204:207]\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[82:83], v[178:179], v[204:207]\n" -" buffer_load_dwordx4 acc[216:219], %[v_os_b1], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[84:85], v[180:181], v[204:207]\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[86:87], v[182:183], v[204:207]\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[88:89], v[184:185], v[204:207]\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[90:91], v[186:187], v[204:207]\n" -" buffer_load_dwordx4 acc[220:223], %[v_os_b1], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[92:93], v[188:189], v[204:207]\n" -" v_mfma_i32_16x16x32_i8 v[204:207], acc[94:95], v[190:191], v[204:207]\n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[64:65], v[144:145], v[192:195] \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[66:67], v[146:147], v[192:195] \n" +" buffer_load_dwordx4 acc[192:195], %[v_os_b0], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[68:69], v[148:149], v[192:195] \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[70:71], v[150:151], v[192:195] \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[72:73], v[152:153], v[192:195] \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[74:75], v[154:155], v[192:195] \n" +" buffer_load_dwordx4 acc[196:199], %[v_os_b0], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[76:77], v[156:157], v[192:195] \n" +" v_mfma_i32_16x16x32_i8 v[192:195], acc[78:79], v[158:159], v[192:195] \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[64:65], v[176:177], v[196:199] \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[66:67], v[178:179], v[196:199] \n" +" buffer_load_dwordx4 acc[200:203], %[v_os_b0], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[68:69], v[180:181], v[196:199] \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[70:71], v[182:183], v[196:199] \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[72:73], v[184:185], v[196:199] \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[74:75], v[186:187], v[196:199] \n" +" buffer_load_dwordx4 acc[204:207], %[v_os_b0], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[76:77], v[188:189], v[196:199] \n" +" v_mfma_i32_16x16x32_i8 v[196:199], acc[78:79], v[190:191], v[196:199] \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[80:81], v[144:145], v[200:203] \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[82:83], v[146:147], v[200:203] \n" +" buffer_load_dwordx4 acc[208:211], %[v_os_b1], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[84:85], v[148:149], v[200:203] \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[86:87], v[150:151], v[200:203] \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[88:89], v[152:153], v[200:203] \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[90:91], v[154:155], v[200:203] \n" +" buffer_load_dwordx4 acc[212:215], %[v_os_b1], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[92:93], v[156:157], v[200:203] \n" +" v_mfma_i32_16x16x32_i8 v[200:203], acc[94:95], v[158:159], v[200:203] \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[80:81], v[176:177], v[204:207] \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[82:83], v[178:179], v[204:207] \n" +" buffer_load_dwordx4 acc[216:219], %[v_os_b1], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[84:85], v[180:181], v[204:207] \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[86:87], v[182:183], v[204:207] \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[88:89], v[184:185], v[204:207] \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[90:91], v[186:187], v[204:207] \n" +" buffer_load_dwordx4 acc[220:223], %[v_os_b1], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[92:93], v[188:189], v[204:207] \n" +" v_mfma_i32_16x16x32_i8 v[204:207], acc[94:95], v[190:191], v[204:207] \n" " s_waitcnt vmcnt(40) \n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[96:97], v[144:145], v[208:211]\n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[98:99], v[146:147], v[208:211]\n" -" buffer_load_dwordx4 acc[224:227], %[v_os_b2], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[100:101], v[148:149], v[208:211]\n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[102:103], v[150:151], v[208:211]\n" -" buffer_load_dword v13, v5, %[s_res_dq], 0 offen \n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[104:105], v[152:153], v[208:211]\n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[106:107], v[154:155], v[208:211]\n" -" buffer_load_dwordx4 acc[228:231], %[v_os_b2], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[108:109], v[156:157], v[208:211]\n" -" v_mfma_i32_16x16x32_i8 v[208:211], acc[110:111], v[158:159], v[208:211]\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[96:97], v[176:177], v[212:215]\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[98:99], v[178:179], v[212:215]\n" -" buffer_load_dwordx4 acc[232:235], %[v_os_b2], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[100:101], v[180:181], v[212:215]\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[102:103], v[182:183], v[212:215]\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[104:105], v[184:185], v[212:215]\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[106:107], v[186:187], v[212:215]\n" -" buffer_load_dwordx4 acc[236:239], %[v_os_b2], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[108:109], v[188:189], v[212:215]\n" -" v_mfma_i32_16x16x32_i8 v[212:215], acc[110:111], v[190:191], v[212:215]\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[112:113], v[144:145], v[216:219]\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[114:115], v[146:147], v[216:219]\n" -" buffer_load_dwordx4 acc[240:243], %[v_os_b3], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[116:117], v[148:149], v[216:219]\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[118:119], v[150:151], v[216:219]\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[120:121], v[152:153], v[216:219]\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[122:123], v[154:155], v[216:219]\n" -" buffer_load_dwordx4 acc[244:247], %[v_os_b3], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[124:125], v[156:157], v[216:219]\n" -" v_mfma_i32_16x16x32_i8 v[216:219], acc[126:127], v[158:159], v[216:219]\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[112:113], v[176:177], v[220:223]\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[114:115], v[178:179], v[220:223]\n" -" buffer_load_dwordx4 acc[248:251], %[v_os_b3], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[116:117], v[180:181], v[220:223]\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[118:119], v[182:183], v[220:223]\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[120:121], v[184:185], v[220:223]\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[122:123], v[186:187], v[220:223]\n" -" buffer_load_dwordx4 acc[252:255], %[v_os_b3], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[124:125], v[188:189], v[220:223]\n" -" v_mfma_i32_16x16x32_i8 v[220:223], acc[126:127], v[190:191], v[220:223]\n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[96:97], v[144:145], v[208:211] \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[98:99], v[146:147], v[208:211] \n" +" buffer_load_dwordx4 acc[224:227], %[v_os_b2], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[100:101], v[148:149], v[208:211] \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[102:103], v[150:151], v[208:211] \n" +" buffer_load_dword v13, %[v_os_dq], %[s_res_dq], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[104:105], v[152:153], v[208:211] \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[106:107], v[154:155], v[208:211] \n" +" buffer_load_dwordx4 acc[228:231], %[v_os_b2], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[108:109], v[156:157], v[208:211] \n" +" v_mfma_i32_16x16x32_i8 v[208:211], acc[110:111], v[158:159], v[208:211] \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[96:97], v[176:177], v[212:215] \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[98:99], v[178:179], v[212:215] \n" +" buffer_load_dwordx4 acc[232:235], %[v_os_b2], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[100:101], v[180:181], v[212:215] \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[102:103], v[182:183], v[212:215] \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[104:105], v[184:185], v[212:215] \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[106:107], v[186:187], v[212:215] \n" +" buffer_load_dwordx4 acc[236:239], %[v_os_b2], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[108:109], v[188:189], v[212:215] \n" +" v_mfma_i32_16x16x32_i8 v[212:215], acc[110:111], v[190:191], v[212:215] \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[112:113], v[144:145], v[216:219] \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[114:115], v[146:147], v[216:219] \n" +" buffer_load_dwordx4 acc[240:243], %[v_os_b3], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[116:117], v[148:149], v[216:219] \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[118:119], v[150:151], v[216:219] \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[120:121], v[152:153], v[216:219] \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[122:123], v[154:155], v[216:219] \n" +" buffer_load_dwordx4 acc[244:247], %[v_os_b3], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[124:125], v[156:157], v[216:219] \n" +" v_mfma_i32_16x16x32_i8 v[216:219], acc[126:127], v[158:159], v[216:219] \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[112:113], v[176:177], v[220:223] \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[114:115], v[178:179], v[220:223] \n" +" buffer_load_dwordx4 acc[248:251], %[v_os_b3], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[116:117], v[180:181], v[220:223] \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[118:119], v[182:183], v[220:223] \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[120:121], v[184:185], v[220:223] \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[122:123], v[186:187], v[220:223] \n" +" buffer_load_dwordx4 acc[252:255], %[v_os_b3], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[124:125], v[188:189], v[220:223] \n" +" v_mfma_i32_16x16x32_i8 v[220:223], acc[126:127], v[190:191], v[220:223] \n" " s_add_u32 s60, 0x00000200, s80 \n" " s_cmp_lt_u32 s60, s81 \n" " s_cselect_b32 %[s_tile_os_b], %[s_tile_os_b], 0 \n" " s_cselect_b32 %[s_tile_os_b_half], %[s_tile_os_b_half], 0 \n" " s_cselect_b32 %[s_tile_os_dq], %[s_tile_os_dq], 0 \n" -" s_add_u32 s12, %[s_tile_os_b], s12 \n" -" s_addc_u32 s13, 0, s13 \n" -" s_add_u32 s16, %[s_tile_os_dq], s16 \n" -" s_addc_u32 s17, 0, s17 \n" +" s_add_u32 %[s_res_d][0], %[s_tile_os_b], %[s_res_d][0] \n" +" s_addc_u32 %[s_res_d][1], 0, %[s_res_d][1] \n" +" s_add_u32 %[s_res_dq][0], %[s_tile_os_dq], %[s_res_dq][0] \n" +" s_addc_u32 %[s_res_dq][1], 0, %[s_res_dq][1] \n" " v_cvt_f32_i32 v192, v192 \n" " v_cvt_f32_i32 v193, v193 \n" " v_cvt_f32_i32 v194, v194 \n" @@ -255,10 +223,10 @@ " v_mul_f32 v193, v12, v193 row_newbcast:1 \n" " v_mul_f32 v194, v12, v194 row_newbcast:2 \n" " v_mul_f32 v195, v12, v195 row_newbcast:3 \n" -" v_mul_f32 v192, v20, v192 \n" -" v_mul_f32 v193, v20, v193 \n" -" v_mul_f32 v194, v20, v194 \n" -" v_mul_f32 v195, v20, v195 \n" +" v_mul_f32 v192, %[scale_0], v192 \n" +" v_mul_f32 v193, %[scale_0], v193 \n" +" v_mul_f32 v194, %[scale_0], v194 \n" +" v_mul_f32 v195, %[scale_0], v195 \n" " v_cvt_f32_i32 v196, v196 \n" " v_cvt_f32_i32 v197, v197 \n" " v_cvt_f32_i32 v198, v198 \n" @@ -271,10 +239,10 @@ " v_mul_f32 v197, v12, v197 row_newbcast:1 \n" " v_mul_f32 v198, v12, v198 row_newbcast:2 \n" " v_mul_f32 v199, v12, v199 row_newbcast:3 \n" -" v_mul_f32 v196, v21, v196 \n" -" v_mul_f32 v197, v21, v197 \n" -" v_mul_f32 v198, v21, v198 \n" -" v_mul_f32 v199, v21, v199 \n" +" v_mul_f32 v196, %[scale_1], v196 \n" +" v_mul_f32 v197, %[scale_1], v197 \n" +" v_mul_f32 v198, %[scale_1], v198 \n" +" v_mul_f32 v199, %[scale_1], v199 \n" " v_cvt_f32_i32 v200, v200 \n" " v_cvt_f32_i32 v201, v201 \n" " v_cvt_f32_i32 v202, v202 \n" @@ -287,10 +255,10 @@ " v_mul_f32 v201, v12, v201 row_newbcast:5 \n" " v_mul_f32 v202, v12, v202 row_newbcast:6 \n" " v_mul_f32 v203, v12, v203 row_newbcast:7 \n" -" v_mul_f32 v200, v20, v200 \n" -" v_mul_f32 v201, v20, v201 \n" -" v_mul_f32 v202, v20, v202 \n" -" v_mul_f32 v203, v20, v203 \n" +" v_mul_f32 v200, %[scale_0], v200 \n" +" v_mul_f32 v201, %[scale_0], v201 \n" +" v_mul_f32 v202, %[scale_0], v202 \n" +" v_mul_f32 v203, %[scale_0], v203 \n" " v_cvt_f32_i32 v204, v204 \n" " v_cvt_f32_i32 v205, v205 \n" " v_cvt_f32_i32 v206, v206 \n" @@ -303,10 +271,10 @@ " v_mul_f32 v205, v12, v205 row_newbcast:5 \n" " v_mul_f32 v206, v12, v206 row_newbcast:6 \n" " v_mul_f32 v207, v12, v207 row_newbcast:7 \n" -" v_mul_f32 v204, v21, v204 \n" -" v_mul_f32 v205, v21, v205 \n" -" v_mul_f32 v206, v21, v206 \n" -" v_mul_f32 v207, v21, v207 \n" +" v_mul_f32 v204, %[scale_1], v204 \n" +" v_mul_f32 v205, %[scale_1], v205 \n" +" v_mul_f32 v206, %[scale_1], v206 \n" +" v_mul_f32 v207, %[scale_1], v207 \n" " v_cvt_f32_i32 v208, v208 \n" " v_cvt_f32_i32 v209, v209 \n" " v_cvt_f32_i32 v210, v210 \n" @@ -319,10 +287,10 @@ " v_mul_f32 v209, v12, v209 row_newbcast:9 \n" " v_mul_f32 v210, v12, v210 row_newbcast:10 \n" " v_mul_f32 v211, v12, v211 row_newbcast:11 \n" -" v_mul_f32 v208, v20, v208 \n" -" v_mul_f32 v209, v20, v209 \n" -" v_mul_f32 v210, v20, v210 \n" -" v_mul_f32 v211, v20, v211 \n" +" v_mul_f32 v208, %[scale_0], v208 \n" +" v_mul_f32 v209, %[scale_0], v209 \n" +" v_mul_f32 v210, %[scale_0], v210 \n" +" v_mul_f32 v211, %[scale_0], v211 \n" " v_cvt_f32_i32 v212, v212 \n" " v_cvt_f32_i32 v213, v213 \n" " v_cvt_f32_i32 v214, v214 \n" @@ -335,10 +303,10 @@ " v_mul_f32 v213, v12, v213 row_newbcast:9 \n" " v_mul_f32 v214, v12, v214 row_newbcast:10 \n" " v_mul_f32 v215, v12, v215 row_newbcast:11 \n" -" v_mul_f32 v212, v21, v212 \n" -" v_mul_f32 v213, v21, v213 \n" -" v_mul_f32 v214, v21, v214 \n" -" v_mul_f32 v215, v21, v215 \n" +" v_mul_f32 v212, %[scale_1], v212 \n" +" v_mul_f32 v213, %[scale_1], v213 \n" +" v_mul_f32 v214, %[scale_1], v214 \n" +" v_mul_f32 v215, %[scale_1], v215 \n" " v_cvt_f32_i32 v216, v216 \n" " v_cvt_f32_i32 v217, v217 \n" " v_cvt_f32_i32 v218, v218 \n" @@ -351,10 +319,10 @@ " v_mul_f32 v217, v12, v217 row_newbcast:13 \n" " v_mul_f32 v218, v12, v218 row_newbcast:14 \n" " v_mul_f32 v219, v12, v219 row_newbcast:15 \n" -" v_mul_f32 v216, v20, v216 \n" -" v_mul_f32 v217, v20, v217 \n" -" v_mul_f32 v218, v20, v218 \n" -" v_mul_f32 v219, v20, v219 \n" +" v_mul_f32 v216, %[scale_0], v216 \n" +" v_mul_f32 v217, %[scale_0], v217 \n" +" v_mul_f32 v218, %[scale_0], v218 \n" +" v_mul_f32 v219, %[scale_0], v219 \n" " v_cvt_f32_i32 v220, v220 \n" " v_cvt_f32_i32 v221, v221 \n" " v_cvt_f32_i32 v222, v222 \n" @@ -367,10 +335,10 @@ " v_mul_f32 v221, v12, v221 row_newbcast:13 \n" " v_mul_f32 v222, v12, v222 row_newbcast:14 \n" " v_mul_f32 v223, v12, v223 row_newbcast:15 \n" -" v_mul_f32 v220, v21, v220 \n" -" v_mul_f32 v221, v21, v221 \n" -" v_mul_f32 v222, v21, v222 \n" -" v_mul_f32 v223, v21, v223 \n" +" v_mul_f32 v220, %[scale_1], v220 \n" +" v_mul_f32 v221, %[scale_1], v221 \n" +" v_mul_f32 v222, %[scale_1], v222 \n" +" v_mul_f32 v223, %[scale_1], v223 \n" " v_cmp_u_f32 s[48:49], v192, v192 \n" " v_add3_u32 v50, v192, v53, 1 \n" " v_cndmask_b32 v54, v50, v52, s[48:49] \n" @@ -511,52 +479,52 @@ " ds_read_b32 v79, v4 offset:48224 \n" " s_waitcnt lgkmcnt(0) \n" " s_mov_b64 exec, s[20:21] \n" -" global_atomic_pk_add_bf16 v80, v64, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o0], v64, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[20:21] \n" -" global_atomic_pk_add_bf16 v80, v65, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o0], v65, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[22:23] \n" -" global_atomic_pk_add_bf16 v82, v66, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o1], v66, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[22:23] \n" -" global_atomic_pk_add_bf16 v82, v67, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o1], v67, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[24:25] \n" -" global_atomic_pk_add_bf16 v84, v68, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o3], v68, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[24:25] \n" -" global_atomic_pk_add_bf16 v84, v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o3], v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[26:27] \n" -" global_atomic_pk_add_bf16 v86, v70, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o4], v70, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[26:27] \n" -" global_atomic_pk_add_bf16 v86, v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o4], v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[28:29] \n" -" global_atomic_pk_add_bf16 v88, v72, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o5], v72, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[28:29] \n" -" global_atomic_pk_add_bf16 v88, v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o5], v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[30:31] \n" -" global_atomic_pk_add_bf16 v90, v74, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o6], v74, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[30:31] \n" -" global_atomic_pk_add_bf16 v90, v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o6], v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[32:33] \n" -" global_atomic_pk_add_bf16 v92, v76, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o7], v76, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[32:33] \n" -" global_atomic_pk_add_bf16 v92, v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o7], v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[34:35] \n" -" global_atomic_pk_add_bf16 v94, v78, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o8], v78, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[34:35] \n" -" global_atomic_pk_add_bf16 v94, v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o8], v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_add_u32 %[s_res_o0], s59, %[s_res_o0] \n" " s_addc_u32 %[s_res_o1], 0, %[s_res_o1] \n" @@ -565,181 +533,181 @@ " s_cbranch_scc0 label_end_gemm2 \n" " s_waitcnt vmcnt(41) \n" " s_barrier \n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[128:129], v[128:129], 0\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[130:131], v[130:131], v[224:227]\n" -" buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[132:133], v[132:133], v[224:227]\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[134:135], v[134:135], v[224:227]\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[136:137], v[136:137], v[224:227]\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[138:139], v[138:139], v[224:227]\n" -" buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[140:141], v[140:141], v[224:227]\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[142:143], v[142:143], v[224:227]\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[128:129], v[160:161], 0\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[130:131], v[162:163], v[228:231]\n" -" buffer_load_dwordx4 acc[8:11], %[v_os_b0], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[132:133], v[164:165], v[228:231]\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[134:135], v[166:167], v[228:231]\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[136:137], v[168:169], v[228:231]\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[138:139], v[170:171], v[228:231]\n" -" buffer_load_dwordx4 acc[12:15], %[v_os_b0], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[140:141], v[172:173], v[228:231]\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[142:143], v[174:175], v[228:231]\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[144:145], v[128:129], 0\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[146:147], v[130:131], v[232:235]\n" -" buffer_load_dwordx4 acc[16:19], %[v_os_b1], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[148:149], v[132:133], v[232:235]\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[150:151], v[134:135], v[232:235]\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[152:153], v[136:137], v[232:235]\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[154:155], v[138:139], v[232:235]\n" -" buffer_load_dwordx4 acc[20:23], %[v_os_b1], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[156:157], v[140:141], v[232:235]\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[158:159], v[142:143], v[232:235]\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[144:145], v[160:161], 0\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[146:147], v[162:163], v[236:239]\n" -" buffer_load_dwordx4 acc[24:27], %[v_os_b1], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[148:149], v[164:165], v[236:239]\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[150:151], v[166:167], v[236:239]\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[152:153], v[168:169], v[236:239]\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[154:155], v[170:171], v[236:239]\n" -" buffer_load_dwordx4 acc[28:31], %[v_os_b1], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[156:157], v[172:173], v[236:239]\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[158:159], v[174:175], v[236:239]\n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[128:129], v[128:129], 0 \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[130:131], v[130:131], v[224:227] \n" +" buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[132:133], v[132:133], v[224:227] \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[134:135], v[134:135], v[224:227] \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[136:137], v[136:137], v[224:227] \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[138:139], v[138:139], v[224:227] \n" +" buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[140:141], v[140:141], v[224:227] \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[142:143], v[142:143], v[224:227] \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[128:129], v[160:161], 0 \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[130:131], v[162:163], v[228:231] \n" +" buffer_load_dwordx4 acc[8:11], %[v_os_b0], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[132:133], v[164:165], v[228:231] \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[134:135], v[166:167], v[228:231] \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[136:137], v[168:169], v[228:231] \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[138:139], v[170:171], v[228:231] \n" +" buffer_load_dwordx4 acc[12:15], %[v_os_b0], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[140:141], v[172:173], v[228:231] \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[142:143], v[174:175], v[228:231] \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[144:145], v[128:129], 0 \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[146:147], v[130:131], v[232:235] \n" +" buffer_load_dwordx4 acc[16:19], %[v_os_b1], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[148:149], v[132:133], v[232:235] \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[150:151], v[134:135], v[232:235] \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[152:153], v[136:137], v[232:235] \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[154:155], v[138:139], v[232:235] \n" +" buffer_load_dwordx4 acc[20:23], %[v_os_b1], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[156:157], v[140:141], v[232:235] \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[158:159], v[142:143], v[232:235] \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[144:145], v[160:161], 0 \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[146:147], v[162:163], v[236:239] \n" +" buffer_load_dwordx4 acc[24:27], %[v_os_b1], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[148:149], v[164:165], v[236:239] \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[150:151], v[166:167], v[236:239] \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[152:153], v[168:169], v[236:239] \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[154:155], v[170:171], v[236:239] \n" +" buffer_load_dwordx4 acc[28:31], %[v_os_b1], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[156:157], v[172:173], v[236:239] \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[158:159], v[174:175], v[236:239] \n" " s_waitcnt vmcnt(41) \n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[160:161], v[128:129], 0\n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[162:163], v[130:131], v[240:243]\n" -" buffer_load_dwordx4 acc[32:35], %[v_os_b2], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[164:165], v[132:133], v[240:243]\n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[166:167], v[134:135], v[240:243]\n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[168:169], v[136:137], v[240:243]\n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[170:171], v[138:139], v[240:243]\n" -" buffer_load_dwordx4 acc[36:39], %[v_os_b2], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[172:173], v[140:141], v[240:243]\n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[174:175], v[142:143], v[240:243]\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[160:161], v[160:161], 0\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[162:163], v[162:163], v[244:247]\n" -" buffer_load_dwordx4 acc[40:43], %[v_os_b2], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[164:165], v[164:165], v[244:247]\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[166:167], v[166:167], v[244:247]\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[168:169], v[168:169], v[244:247]\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[170:171], v[170:171], v[244:247]\n" -" buffer_load_dwordx4 acc[44:47], %[v_os_b2], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[172:173], v[172:173], v[244:247]\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[174:175], v[174:175], v[244:247]\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[176:177], v[128:129], 0\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[178:179], v[130:131], v[248:251]\n" -" buffer_load_dwordx4 acc[48:51], %[v_os_b3], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[180:181], v[132:133], v[248:251]\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[182:183], v[134:135], v[248:251]\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[184:185], v[136:137], v[248:251]\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[186:187], v[138:139], v[248:251]\n" -" buffer_load_dwordx4 acc[52:55], %[v_os_b3], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[188:189], v[140:141], v[248:251]\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[190:191], v[142:143], v[248:251]\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[176:177], v[160:161], 0\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[178:179], v[162:163], v[252:255]\n" -" buffer_load_dwordx4 acc[56:59], %[v_os_b3], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[180:181], v[164:165], v[252:255]\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[182:183], v[166:167], v[252:255]\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[184:185], v[168:169], v[252:255]\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[186:187], v[170:171], v[252:255]\n" -" buffer_load_dwordx4 acc[60:63], %[v_os_b3], %[s_res_d], 0 offen offset:3072\n" -" s_add_u32 s12, %[s_tile_os_b_half], s12 \n" -" s_addc_u32 s13, 0, s13 \n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[188:189], v[172:173], v[252:255]\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[190:191], v[174:175], v[252:255]\n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[160:161], v[128:129], 0 \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[162:163], v[130:131], v[240:243] \n" +" buffer_load_dwordx4 acc[32:35], %[v_os_b2], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[164:165], v[132:133], v[240:243] \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[166:167], v[134:135], v[240:243] \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[168:169], v[136:137], v[240:243] \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[170:171], v[138:139], v[240:243] \n" +" buffer_load_dwordx4 acc[36:39], %[v_os_b2], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[172:173], v[140:141], v[240:243] \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[174:175], v[142:143], v[240:243] \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[160:161], v[160:161], 0 \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[162:163], v[162:163], v[244:247] \n" +" buffer_load_dwordx4 acc[40:43], %[v_os_b2], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[164:165], v[164:165], v[244:247] \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[166:167], v[166:167], v[244:247] \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[168:169], v[168:169], v[244:247] \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[170:171], v[170:171], v[244:247] \n" +" buffer_load_dwordx4 acc[44:47], %[v_os_b2], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[172:173], v[172:173], v[244:247] \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[174:175], v[174:175], v[244:247] \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[176:177], v[128:129], 0 \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[178:179], v[130:131], v[248:251] \n" +" buffer_load_dwordx4 acc[48:51], %[v_os_b3], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[180:181], v[132:133], v[248:251] \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[182:183], v[134:135], v[248:251] \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[184:185], v[136:137], v[248:251] \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[186:187], v[138:139], v[248:251] \n" +" buffer_load_dwordx4 acc[52:55], %[v_os_b3], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[188:189], v[140:141], v[248:251] \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[190:191], v[142:143], v[248:251] \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[176:177], v[160:161], 0 \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[178:179], v[162:163], v[252:255] \n" +" buffer_load_dwordx4 acc[56:59], %[v_os_b3], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[180:181], v[164:165], v[252:255] \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[182:183], v[166:167], v[252:255] \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[184:185], v[168:169], v[252:255] \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[186:187], v[170:171], v[252:255] \n" +" buffer_load_dwordx4 acc[60:63], %[v_os_b3], %[s_res_d], 0 offen offset:3072 \n" +" s_add_u32 %[s_res_d][0], %[s_tile_os_b_half], %[s_res_d][0] \n" +" s_addc_u32 %[s_res_d][1], 0, %[s_res_d][1] \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[188:189], v[172:173], v[252:255] \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[190:191], v[174:175], v[252:255] \n" " s_waitcnt vmcnt(41) \n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[192:193], v[144:145], v[224:227]\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[194:195], v[146:147], v[224:227]\n" -" buffer_load_dwordx4 acc[64:67], %[v_os_b0], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[196:197], v[148:149], v[224:227]\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[198:199], v[150:151], v[224:227]\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[200:201], v[152:153], v[224:227]\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[202:203], v[154:155], v[224:227]\n" -" buffer_load_dwordx4 acc[68:71], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[204:205], v[156:157], v[224:227]\n" -" v_mfma_i32_16x16x32_i8 v[224:227], acc[206:207], v[158:159], v[224:227]\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[192:193], v[176:177], v[228:231]\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[194:195], v[178:179], v[228:231]\n" -" buffer_load_dwordx4 acc[72:75], %[v_os_b0], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[196:197], v[180:181], v[228:231]\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[198:199], v[182:183], v[228:231]\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[200:201], v[184:185], v[228:231]\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[202:203], v[186:187], v[228:231]\n" -" buffer_load_dwordx4 acc[76:79], %[v_os_b0], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[204:205], v[188:189], v[228:231]\n" -" v_mfma_i32_16x16x32_i8 v[228:231], acc[206:207], v[190:191], v[228:231]\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[208:209], v[144:145], v[232:235]\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[210:211], v[146:147], v[232:235]\n" -" buffer_load_dwordx4 acc[80:83], %[v_os_b1], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[212:213], v[148:149], v[232:235]\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[214:215], v[150:151], v[232:235]\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[216:217], v[152:153], v[232:235]\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[218:219], v[154:155], v[232:235]\n" -" buffer_load_dwordx4 acc[84:87], %[v_os_b1], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[220:221], v[156:157], v[232:235]\n" -" v_mfma_i32_16x16x32_i8 v[232:235], acc[222:223], v[158:159], v[232:235]\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[208:209], v[176:177], v[236:239]\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[210:211], v[178:179], v[236:239]\n" -" buffer_load_dwordx4 acc[88:91], %[v_os_b1], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[212:213], v[180:181], v[236:239]\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[214:215], v[182:183], v[236:239]\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[216:217], v[184:185], v[236:239]\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[218:219], v[186:187], v[236:239]\n" -" buffer_load_dwordx4 acc[92:95], %[v_os_b1], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[220:221], v[188:189], v[236:239]\n" -" v_mfma_i32_16x16x32_i8 v[236:239], acc[222:223], v[190:191], v[236:239]\n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[192:193], v[144:145], v[224:227] \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[194:195], v[146:147], v[224:227] \n" +" buffer_load_dwordx4 acc[64:67], %[v_os_b0], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[196:197], v[148:149], v[224:227] \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[198:199], v[150:151], v[224:227] \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[200:201], v[152:153], v[224:227] \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[202:203], v[154:155], v[224:227] \n" +" buffer_load_dwordx4 acc[68:71], %[v_os_b0], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[204:205], v[156:157], v[224:227] \n" +" v_mfma_i32_16x16x32_i8 v[224:227], acc[206:207], v[158:159], v[224:227] \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[192:193], v[176:177], v[228:231] \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[194:195], v[178:179], v[228:231] \n" +" buffer_load_dwordx4 acc[72:75], %[v_os_b0], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[196:197], v[180:181], v[228:231] \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[198:199], v[182:183], v[228:231] \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[200:201], v[184:185], v[228:231] \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[202:203], v[186:187], v[228:231] \n" +" buffer_load_dwordx4 acc[76:79], %[v_os_b0], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[204:205], v[188:189], v[228:231] \n" +" v_mfma_i32_16x16x32_i8 v[228:231], acc[206:207], v[190:191], v[228:231] \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[208:209], v[144:145], v[232:235] \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[210:211], v[146:147], v[232:235] \n" +" buffer_load_dwordx4 acc[80:83], %[v_os_b1], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[212:213], v[148:149], v[232:235] \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[214:215], v[150:151], v[232:235] \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[216:217], v[152:153], v[232:235] \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[218:219], v[154:155], v[232:235] \n" +" buffer_load_dwordx4 acc[84:87], %[v_os_b1], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[220:221], v[156:157], v[232:235] \n" +" v_mfma_i32_16x16x32_i8 v[232:235], acc[222:223], v[158:159], v[232:235] \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[208:209], v[176:177], v[236:239] \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[210:211], v[178:179], v[236:239] \n" +" buffer_load_dwordx4 acc[88:91], %[v_os_b1], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[212:213], v[180:181], v[236:239] \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[214:215], v[182:183], v[236:239] \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[216:217], v[184:185], v[236:239] \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[218:219], v[186:187], v[236:239] \n" +" buffer_load_dwordx4 acc[92:95], %[v_os_b1], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[220:221], v[188:189], v[236:239] \n" +" v_mfma_i32_16x16x32_i8 v[236:239], acc[222:223], v[190:191], v[236:239] \n" " s_waitcnt vmcnt(40) \n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[224:225], v[144:145], v[240:243]\n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[226:227], v[146:147], v[240:243]\n" -" buffer_load_dwordx4 acc[96:99], %[v_os_b2], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[228:229], v[148:149], v[240:243]\n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[230:231], v[150:151], v[240:243]\n" -" buffer_load_dword v12, v5, %[s_res_dq], 0 offen \n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[232:233], v[152:153], v[240:243]\n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[234:235], v[154:155], v[240:243]\n" -" buffer_load_dwordx4 acc[100:103], %[v_os_b2], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[236:237], v[156:157], v[240:243]\n" -" v_mfma_i32_16x16x32_i8 v[240:243], acc[238:239], v[158:159], v[240:243]\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[224:225], v[176:177], v[244:247]\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[226:227], v[178:179], v[244:247]\n" -" buffer_load_dwordx4 acc[104:107], %[v_os_b2], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[228:229], v[180:181], v[244:247]\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[230:231], v[182:183], v[244:247]\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[232:233], v[184:185], v[244:247]\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[234:235], v[186:187], v[244:247]\n" -" buffer_load_dwordx4 acc[108:111], %[v_os_b2], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[236:237], v[188:189], v[244:247]\n" -" v_mfma_i32_16x16x32_i8 v[244:247], acc[238:239], v[190:191], v[244:247]\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[240:241], v[144:145], v[248:251]\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[242:243], v[146:147], v[248:251]\n" -" buffer_load_dwordx4 acc[112:115], %[v_os_b3], %[s_res_d], 0 offen\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[244:245], v[148:149], v[248:251]\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[246:247], v[150:151], v[248:251]\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[248:249], v[152:153], v[248:251]\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[250:251], v[154:155], v[248:251]\n" -" buffer_load_dwordx4 acc[116:119], %[v_os_b3], %[s_res_d], 0 offen offset:1024\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[252:253], v[156:157], v[248:251]\n" -" v_mfma_i32_16x16x32_i8 v[248:251], acc[254:255], v[158:159], v[248:251]\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[240:241], v[176:177], v[252:255]\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[242:243], v[178:179], v[252:255]\n" -" buffer_load_dwordx4 acc[120:123], %[v_os_b3], %[s_res_d], 0 offen offset:2048\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[244:245], v[180:181], v[252:255]\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[246:247], v[182:183], v[252:255]\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[248:249], v[184:185], v[252:255]\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[250:251], v[186:187], v[252:255]\n" -" buffer_load_dwordx4 acc[124:127], %[v_os_b3], %[s_res_d], 0 offen offset:3072\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[252:253], v[188:189], v[252:255]\n" -" v_mfma_i32_16x16x32_i8 v[252:255], acc[254:255], v[190:191], v[252:255]\n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[224:225], v[144:145], v[240:243] \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[226:227], v[146:147], v[240:243] \n" +" buffer_load_dwordx4 acc[96:99], %[v_os_b2], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[228:229], v[148:149], v[240:243] \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[230:231], v[150:151], v[240:243] \n" +" buffer_load_dword v12, %[v_os_dq], %[s_res_dq], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[232:233], v[152:153], v[240:243] \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[234:235], v[154:155], v[240:243] \n" +" buffer_load_dwordx4 acc[100:103], %[v_os_b2], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[236:237], v[156:157], v[240:243] \n" +" v_mfma_i32_16x16x32_i8 v[240:243], acc[238:239], v[158:159], v[240:243] \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[224:225], v[176:177], v[244:247] \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[226:227], v[178:179], v[244:247] \n" +" buffer_load_dwordx4 acc[104:107], %[v_os_b2], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[228:229], v[180:181], v[244:247] \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[230:231], v[182:183], v[244:247] \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[232:233], v[184:185], v[244:247] \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[234:235], v[186:187], v[244:247] \n" +" buffer_load_dwordx4 acc[108:111], %[v_os_b2], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[236:237], v[188:189], v[244:247] \n" +" v_mfma_i32_16x16x32_i8 v[244:247], acc[238:239], v[190:191], v[244:247] \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[240:241], v[144:145], v[248:251] \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[242:243], v[146:147], v[248:251] \n" +" buffer_load_dwordx4 acc[112:115], %[v_os_b3], %[s_res_d], 0 offen \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[244:245], v[148:149], v[248:251] \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[246:247], v[150:151], v[248:251] \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[248:249], v[152:153], v[248:251] \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[250:251], v[154:155], v[248:251] \n" +" buffer_load_dwordx4 acc[116:119], %[v_os_b3], %[s_res_d], 0 offen offset:1024 \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[252:253], v[156:157], v[248:251] \n" +" v_mfma_i32_16x16x32_i8 v[248:251], acc[254:255], v[158:159], v[248:251] \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[240:241], v[176:177], v[252:255] \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[242:243], v[178:179], v[252:255] \n" +" buffer_load_dwordx4 acc[120:123], %[v_os_b3], %[s_res_d], 0 offen offset:2048 \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[244:245], v[180:181], v[252:255] \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[246:247], v[182:183], v[252:255] \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[248:249], v[184:185], v[252:255] \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[250:251], v[186:187], v[252:255] \n" +" buffer_load_dwordx4 acc[124:127], %[v_os_b3], %[s_res_d], 0 offen offset:3072 \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[252:253], v[188:189], v[252:255] \n" +" v_mfma_i32_16x16x32_i8 v[252:255], acc[254:255], v[190:191], v[252:255] \n" " s_add_u32 s60, 0x00000200, s80 \n" " s_cmp_lt_u32 s60, s81 \n" " s_cselect_b32 %[s_tile_os_b], %[s_tile_os_b], 0 \n" " s_cselect_b32 %[s_tile_os_b_half], %[s_tile_os_b_half], 0 \n" " s_cselect_b32 %[s_tile_os_dq], %[s_tile_os_dq], 0 \n" -" s_add_u32 s12, %[s_tile_os_b], s12 \n" -" s_addc_u32 s13, 0, s13 \n" -" s_add_u32 s16, %[s_tile_os_dq], s16 \n" -" s_addc_u32 s17, 0, s17 \n" +" s_add_u32 %[s_res_d][0], %[s_tile_os_b], %[s_res_d][0] \n" +" s_addc_u32 %[s_res_d][1], 0, %[s_res_d][1] \n" +" s_add_u32 %[s_res_dq][0], %[s_tile_os_dq], %[s_res_dq][0] \n" +" s_addc_u32 %[s_res_dq][1], 0, %[s_res_dq][1] \n" " v_cvt_f32_i32 v224, v224 \n" " v_cvt_f32_i32 v225, v225 \n" " v_cvt_f32_i32 v226, v226 \n" @@ -752,10 +720,10 @@ " v_mul_f32 v225, v13, v225 row_newbcast:1 \n" " v_mul_f32 v226, v13, v226 row_newbcast:2 \n" " v_mul_f32 v227, v13, v227 row_newbcast:3 \n" -" v_mul_f32 v224, v20, v224 \n" -" v_mul_f32 v225, v20, v225 \n" -" v_mul_f32 v226, v20, v226 \n" -" v_mul_f32 v227, v20, v227 \n" +" v_mul_f32 v224, %[scale_0], v224 \n" +" v_mul_f32 v225, %[scale_0], v225 \n" +" v_mul_f32 v226, %[scale_0], v226 \n" +" v_mul_f32 v227, %[scale_0], v227 \n" " v_cvt_f32_i32 v228, v228 \n" " v_cvt_f32_i32 v229, v229 \n" " v_cvt_f32_i32 v230, v230 \n" @@ -768,10 +736,10 @@ " v_mul_f32 v229, v13, v229 row_newbcast:1 \n" " v_mul_f32 v230, v13, v230 row_newbcast:2 \n" " v_mul_f32 v231, v13, v231 row_newbcast:3 \n" -" v_mul_f32 v228, v21, v228 \n" -" v_mul_f32 v229, v21, v229 \n" -" v_mul_f32 v230, v21, v230 \n" -" v_mul_f32 v231, v21, v231 \n" +" v_mul_f32 v228, %[scale_1], v228 \n" +" v_mul_f32 v229, %[scale_1], v229 \n" +" v_mul_f32 v230, %[scale_1], v230 \n" +" v_mul_f32 v231, %[scale_1], v231 \n" " v_cvt_f32_i32 v232, v232 \n" " v_cvt_f32_i32 v233, v233 \n" " v_cvt_f32_i32 v234, v234 \n" @@ -784,10 +752,10 @@ " v_mul_f32 v233, v13, v233 row_newbcast:5 \n" " v_mul_f32 v234, v13, v234 row_newbcast:6 \n" " v_mul_f32 v235, v13, v235 row_newbcast:7 \n" -" v_mul_f32 v232, v20, v232 \n" -" v_mul_f32 v233, v20, v233 \n" -" v_mul_f32 v234, v20, v234 \n" -" v_mul_f32 v235, v20, v235 \n" +" v_mul_f32 v232, %[scale_0], v232 \n" +" v_mul_f32 v233, %[scale_0], v233 \n" +" v_mul_f32 v234, %[scale_0], v234 \n" +" v_mul_f32 v235, %[scale_0], v235 \n" " v_cvt_f32_i32 v236, v236 \n" " v_cvt_f32_i32 v237, v237 \n" " v_cvt_f32_i32 v238, v238 \n" @@ -800,10 +768,10 @@ " v_mul_f32 v237, v13, v237 row_newbcast:5 \n" " v_mul_f32 v238, v13, v238 row_newbcast:6 \n" " v_mul_f32 v239, v13, v239 row_newbcast:7 \n" -" v_mul_f32 v236, v21, v236 \n" -" v_mul_f32 v237, v21, v237 \n" -" v_mul_f32 v238, v21, v238 \n" -" v_mul_f32 v239, v21, v239 \n" +" v_mul_f32 v236, %[scale_1], v236 \n" +" v_mul_f32 v237, %[scale_1], v237 \n" +" v_mul_f32 v238, %[scale_1], v238 \n" +" v_mul_f32 v239, %[scale_1], v239 \n" " v_cvt_f32_i32 v240, v240 \n" " v_cvt_f32_i32 v241, v241 \n" " v_cvt_f32_i32 v242, v242 \n" @@ -816,10 +784,10 @@ " v_mul_f32 v241, v13, v241 row_newbcast:9 \n" " v_mul_f32 v242, v13, v242 row_newbcast:10 \n" " v_mul_f32 v243, v13, v243 row_newbcast:11 \n" -" v_mul_f32 v240, v20, v240 \n" -" v_mul_f32 v241, v20, v241 \n" -" v_mul_f32 v242, v20, v242 \n" -" v_mul_f32 v243, v20, v243 \n" +" v_mul_f32 v240, %[scale_0], v240 \n" +" v_mul_f32 v241, %[scale_0], v241 \n" +" v_mul_f32 v242, %[scale_0], v242 \n" +" v_mul_f32 v243, %[scale_0], v243 \n" " v_cvt_f32_i32 v244, v244 \n" " v_cvt_f32_i32 v245, v245 \n" " v_cvt_f32_i32 v246, v246 \n" @@ -832,10 +800,10 @@ " v_mul_f32 v245, v13, v245 row_newbcast:9 \n" " v_mul_f32 v246, v13, v246 row_newbcast:10 \n" " v_mul_f32 v247, v13, v247 row_newbcast:11 \n" -" v_mul_f32 v244, v21, v244 \n" -" v_mul_f32 v245, v21, v245 \n" -" v_mul_f32 v246, v21, v246 \n" -" v_mul_f32 v247, v21, v247 \n" +" v_mul_f32 v244, %[scale_1], v244 \n" +" v_mul_f32 v245, %[scale_1], v245 \n" +" v_mul_f32 v246, %[scale_1], v246 \n" +" v_mul_f32 v247, %[scale_1], v247 \n" " v_cvt_f32_i32 v248, v248 \n" " v_cvt_f32_i32 v249, v249 \n" " v_cvt_f32_i32 v250, v250 \n" @@ -848,10 +816,10 @@ " v_mul_f32 v249, v13, v249 row_newbcast:13 \n" " v_mul_f32 v250, v13, v250 row_newbcast:14 \n" " v_mul_f32 v251, v13, v251 row_newbcast:15 \n" -" v_mul_f32 v248, v20, v248 \n" -" v_mul_f32 v249, v20, v249 \n" -" v_mul_f32 v250, v20, v250 \n" -" v_mul_f32 v251, v20, v251 \n" +" v_mul_f32 v248, %[scale_0], v248 \n" +" v_mul_f32 v249, %[scale_0], v249 \n" +" v_mul_f32 v250, %[scale_0], v250 \n" +" v_mul_f32 v251, %[scale_0], v251 \n" " v_cvt_f32_i32 v252, v252 \n" " v_cvt_f32_i32 v253, v253 \n" " v_cvt_f32_i32 v254, v254 \n" @@ -864,10 +832,10 @@ " v_mul_f32 v253, v13, v253 row_newbcast:13 \n" " v_mul_f32 v254, v13, v254 row_newbcast:14 \n" " v_mul_f32 v255, v13, v255 row_newbcast:15 \n" -" v_mul_f32 v252, v21, v252 \n" -" v_mul_f32 v253, v21, v253 \n" -" v_mul_f32 v254, v21, v254 \n" -" v_mul_f32 v255, v21, v255 \n" +" v_mul_f32 v252, %[scale_1], v252 \n" +" v_mul_f32 v253, %[scale_1], v253 \n" +" v_mul_f32 v254, %[scale_1], v254 \n" +" v_mul_f32 v255, %[scale_1], v255 \n" " v_cmp_u_f32 s[48:49], v224, v224 \n" " v_add3_u32 v50, v224, v53, 1 \n" " v_cndmask_b32 v54, v50, v52, s[48:49] \n" @@ -1008,52 +976,52 @@ " ds_read_b32 v79, v4 offset:48224 \n" " s_waitcnt lgkmcnt(0) \n" " s_mov_b64 exec, s[20:21] \n" -" global_atomic_pk_add_bf16 v80, v64, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o0], v64, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[20:21] \n" -" global_atomic_pk_add_bf16 v80, v65, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o0], v65, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[22:23] \n" -" global_atomic_pk_add_bf16 v82, v66, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o1], v66, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[22:23] \n" -" global_atomic_pk_add_bf16 v82, v67, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o1], v67, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[24:25] \n" -" global_atomic_pk_add_bf16 v84, v68, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o3], v68, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[24:25] \n" -" global_atomic_pk_add_bf16 v84, v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o3], v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[26:27] \n" -" global_atomic_pk_add_bf16 v86, v70, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o4], v70, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[26:27] \n" -" global_atomic_pk_add_bf16 v86, v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o4], v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[28:29] \n" -" global_atomic_pk_add_bf16 v88, v72, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o5], v72, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[28:29] \n" -" global_atomic_pk_add_bf16 v88, v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o5], v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[30:31] \n" -" global_atomic_pk_add_bf16 v90, v74, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o6], v74, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[30:31] \n" -" global_atomic_pk_add_bf16 v90, v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o6], v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[32:33] \n" -" global_atomic_pk_add_bf16 v92, v76, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o7], v76, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[32:33] \n" -" global_atomic_pk_add_bf16 v92, v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o7], v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[34:35] \n" -" global_atomic_pk_add_bf16 v94, v78, [%[s_res_o0],%[s_res_o1]] \n" +" global_atomic_pk_add_bf16 %[v_os_o8], v78, [%[s_res_o0],%[s_res_o1]] \n" " s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[34:35] \n" -" global_atomic_pk_add_bf16 v94, v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n" +" global_atomic_pk_add_bf16 %[v_os_o8], v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n" " s_mov_b64 exec, s[36:37] \n" " s_add_u32 %[s_res_o0], s59, %[s_res_o0] \n" " s_addc_u32 %[s_res_o1], 0, %[s_res_o1] \n" @@ -1068,3 +1036,5 @@ #undef _UK_PK_CVT_ #undef _UK_ATOMIC_ADD_ + + diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc index 3c9a6c62a8..92a138201c 100644 --- a/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc +++ b/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc @@ -5,36 +5,20 @@ #if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_INT8 # define _UK_MFMA_ "v_mfma_i32_16x16x32_i8" #endif - # define _DEQUAN_CVT_(a0,a1,a2,a3, b, c) \ - " v_cvt_f32_i32 a0, a0 \n" \ - " v_cvt_f32_i32 a1, a1 \n" \ - " v_cvt_f32_i32 a2, a2 \n" \ - " v_cvt_f32_i32 a3, a3 \n" \ - " v_mul_f32 a0, v15, a0 \n" \ - " v_mul_f32 a1, v15, a1 \n" \ - " v_mul_f32 a2, v15, a2 \n" \ - " v_mul_f32 a3, v15, a3 \n" \ - " v_mul_f32 a0, v17, a0 row_newbcast:12 \n" \ - " v_mul_f32 a1, v17, a1 row_newbcast:13 \n" \ - " v_mul_f32 a2, v17, a2 row_newbcast:14 \n" \ - " v_mul_f32 a3, v17, a3 row_newbcast:15 \n" \ + # define _DEQUAN_CVT_(a0,a1,a2,a3, xq, gq,brd0,brd1,brd2,brd3) \ + " v_cvt_f32_i32 " a0 ", " a0 " \n" \ + " v_cvt_f32_i32 " a1 ", " a1 " \n" \ + " v_cvt_f32_i32 " a2 ", " a2 " \n" \ + " v_cvt_f32_i32 " a3 ", " a3" \n" \ + " v_mul_f32 " a0 ", " xq ", " a0 " \n" \ + " v_mul_f32 " a1 ", " xq ", " a1 " \n" \ + " v_mul_f32 " a2 ", " xq ", " a2 " \n" \ + " v_mul_f32 " a3 ", " xq ", " a3 " \n" \ + " v_mul_f32 " a0 ", " gq ", " a0 " row_newbcast:" brd0 " \n" \ + " v_mul_f32 " a1 ", " gq ", " a1 " row_newbcast:" brd1 " \n" \ + " v_mul_f32 " a2 ", " gq ", " a2 " row_newbcast: " brd2 " \n" \ + " v_mul_f32 " a3 ", " gq ", " a3 " row_newbcast:" brd3 " \n" -";---------------------------------------------- \n" -" v_lshrrev_b32 v54, 4, v0 \n" -" v_lshlrev_b32 v55, 2, v54 \n" -" v_and_b32 v54, 15, v0 \n" -" v_lshrrev_b32 v56, 2, v54 \n" -" v_lshlrev_b32 v56, 6, v56 \n" -" v_add_u32 v55, v56, v55 \n" -" v_and_b32 v54, 3, v0 \n" -" v_add_u32 v55, v54, v55 \n" -" v_lshlrev_b32 v10, 2, v55 \n" -" v_add_u32 v11, 0x00000400, v10 \n" -" s_mul_i32 s60, %[s_wave_id], 16 \n" -" s_mul_i32 s60, s60, 4 \n" -" v_add_u32 v10, s60, v10 \n" -" v_add_u32 v11, s60, v11 \n" -" v_mov_b32 v5, v10 \n" ";---------------------------------------------- \n" " s_mov_b32 s57, 0x00000100 \n" " s_mov_b32 s58, 0x00001000 \n" @@ -53,27 +37,22 @@ " v_mov_b32 v52, 0x7fff0000 \n" " v_mov_b32 v53, 0x00007fff \n" " s_waitcnt 0x0000 \n" -";---------------------------------------------- \n" - " v_lshrrev_b32 v54, 24, %[v_token_id0] \n" - " v_mul_i32_i24 v54, s66, v54 \n" - " v_and_b32 v55, 0x00ffffff, %[v_token_id0] \n" - " v_add_u32 %[v_token_id0], v54, v55 \n" - " v_lshrrev_b32 v54, 24, %[v_token_id1] \n" - " v_mul_i32_i24 v54, s66, v54 \n" - " v_and_b32 v55, 0x00ffffff, %[v_token_id1] \n" -" v_add_u32 %[v_token_id1], v54, v55 \n" -" v_lshlrev_b32 %[v_token_id0], 2, %[v_token_id0] \n" -" v_lshlrev_b32 %[v_token_id1], 2, %[v_token_id1] \n" -" buffer_load_dword v14, %[v_token_id0], %[s_res_aq], 0 offen \n" -" buffer_load_dword v15, %[v_token_id1], %[s_res_aq], 0 offen \n" -" buffer_load_dword v16, v10, %[s_res_gq], 0 offen \n" -" buffer_load_dword v17, v11, %[s_res_gq], 0 offen \n" -" buffer_load_dword v18, v10, %[s_res_smq], 0 offen \n" -" buffer_load_dword v19, v11, %[s_res_smq], 0 offen \n" -" buffer_load_dword v20, v8, s[40:43], 0 offen \n" -" buffer_load_dword v21, v9, s[40:43], 0 offen \n" - " s_mov_b32 s80, 0 \n" +" v_lshrrev_b32 v54, 4, v0 \n" +" v_mul_i32_i24 v3, 34, v54 \n" +" v_and_b32 v54, 15, v0 \n" +" v_mul_i32_i24 v55, 2, v54 \n" +" v_add_u32 v3, v55, v3 \n" +" s_mul_i32 s60, s7, 0x00000088 \n" +" v_add_u32 v3, s60, v3 \n" +" v_lshlrev_b32 v3, 2, v3 \n" +" v_lshrrev_b32 v54, 1, v0 \n" +" v_mul_i32_i24 v4, 34, v54 \n" +" v_and_b32 v55, 1, v0 \n" +" v_add_u32 v4, v55, v4 \n" +" s_mul_i32 s60, s7, 2 \n" +" v_add_u32 v4, s60, v4 \n" +" v_lshlrev_b32 v4, 2, v4 \n" ";---------------------------------------------- \n" "; -- prefetch A0\n" "s_add_u32 m0, 0, %[s_m0_init] \n" @@ -570,198 +549,23 @@ " s_branch label_start \n" " label_end : \n" ";---------------------------------------------- \n" -" v_cvt_f32_i32 v128, v128 \n" -" v_cvt_f32_i32 v129, v129 \n" -" v_cvt_f32_i32 v130, v130 \n" -" v_cvt_f32_i32 v131, v131 \n" -" v_mul_f32 v128, v14, v128 \n" -" v_mul_f32 v129, v14, v129 \n" -" v_mul_f32 v130, v14, v130 \n" -" v_mul_f32 v131, v14, v131 \n" -" v_mul_f32 v128, v16, v128 row_newbcast:0 \n" -" v_mul_f32 v129, v16, v129 row_newbcast:1 \n" -" v_mul_f32 v130, v16, v130 row_newbcast:2 \n" -" v_mul_f32 v131, v16, v131 row_newbcast:3 \n" -" v_cvt_f32_i32 v132, v132 \n" -" v_cvt_f32_i32 v133, v133 \n" -" v_cvt_f32_i32 v134, v134 \n" -" v_cvt_f32_i32 v135, v135 \n" -" v_mul_f32 v132, v15, v132 \n" -" v_mul_f32 v133, v15, v133 \n" -" v_mul_f32 v134, v15, v134 \n" -" v_mul_f32 v135, v15, v135 \n" -" v_mul_f32 v132, v16, v132 row_newbcast:0 \n" -" v_mul_f32 v133, v16, v133 row_newbcast:1 \n" -" v_mul_f32 v134, v16, v134 row_newbcast:2 \n" -" v_mul_f32 v135, v16, v135 row_newbcast:3 \n" -" v_cvt_f32_i32 v136, v136 \n" -" v_cvt_f32_i32 v137, v137 \n" -" v_cvt_f32_i32 v138, v138 \n" -" v_cvt_f32_i32 v139, v139 \n" -" v_mul_f32 v136, v14, v136 \n" -" v_mul_f32 v137, v14, v137 \n" -" v_mul_f32 v138, v14, v138 \n" -" v_mul_f32 v139, v14, v139 \n" -" v_mul_f32 v136, v16, v136 row_newbcast:4 \n" -" v_mul_f32 v137, v16, v137 row_newbcast:5 \n" -" v_mul_f32 v138, v16, v138 row_newbcast:6 \n" -" v_mul_f32 v139, v16, v139 row_newbcast:7 \n" -" v_cvt_f32_i32 v140, v140 \n" -" v_cvt_f32_i32 v141, v141 \n" -" v_cvt_f32_i32 v142, v142 \n" -" v_cvt_f32_i32 v143, v143 \n" -" v_mul_f32 v140, v15, v140 \n" -" v_mul_f32 v141, v15, v141 \n" -" v_mul_f32 v142, v15, v142 \n" -" v_mul_f32 v143, v15, v143 \n" -" v_mul_f32 v140, v16, v140 row_newbcast:4 \n" -" v_mul_f32 v141, v16, v141 row_newbcast:5 \n" -" v_mul_f32 v142, v16, v142 row_newbcast:6 \n" -" v_mul_f32 v143, v16, v143 row_newbcast:7 \n" -" v_cvt_f32_i32 v144, v144 \n" -" v_cvt_f32_i32 v145, v145 \n" -" v_cvt_f32_i32 v146, v146 \n" -" v_cvt_f32_i32 v147, v147 \n" -" v_mul_f32 v144, v14, v144 \n" -" v_mul_f32 v145, v14, v145 \n" -" v_mul_f32 v146, v14, v146 \n" -" v_mul_f32 v147, v14, v147 \n" -" v_mul_f32 v144, v16, v144 row_newbcast:8 \n" -" v_mul_f32 v145, v16, v145 row_newbcast:9 \n" -" v_mul_f32 v146, v16, v146 row_newbcast:10 \n" -" v_mul_f32 v147, v16, v147 row_newbcast:11 \n" -" v_cvt_f32_i32 v148, v148 \n" -" v_cvt_f32_i32 v149, v149 \n" -" v_cvt_f32_i32 v150, v150 \n" -" v_cvt_f32_i32 v151, v151 \n" -" v_mul_f32 v148, v15, v148 \n" -" v_mul_f32 v149, v15, v149 \n" -" v_mul_f32 v150, v15, v150 \n" -" v_mul_f32 v151, v15, v151 \n" -" v_mul_f32 v148, v16, v148 row_newbcast:8 \n" -" v_mul_f32 v149, v16, v149 row_newbcast:9 \n" -" v_mul_f32 v150, v16, v150 row_newbcast:10 \n" -" v_mul_f32 v151, v16, v151 row_newbcast:11 \n" -" v_cvt_f32_i32 v152, v152 \n" -" v_cvt_f32_i32 v153, v153 \n" -" v_cvt_f32_i32 v154, v154 \n" -" v_cvt_f32_i32 v155, v155 \n" -" v_mul_f32 v152, v14, v152 \n" -" v_mul_f32 v153, v14, v153 \n" -" v_mul_f32 v154, v14, v154 \n" -" v_mul_f32 v155, v14, v155 \n" -" v_mul_f32 v152, v16, v152 row_newbcast:12 \n" -" v_mul_f32 v153, v16, v153 row_newbcast:13 \n" -" v_mul_f32 v154, v16, v154 row_newbcast:14 \n" -" v_mul_f32 v155, v16, v155 row_newbcast:15 \n" -" v_cvt_f32_i32 v156, v156 \n" -" v_cvt_f32_i32 v157, v157 \n" -" v_cvt_f32_i32 v158, v158 \n" -" v_cvt_f32_i32 v159, v159 \n" -" v_mul_f32 v156, v15, v156 \n" -" v_mul_f32 v157, v15, v157 \n" -" v_mul_f32 v158, v15, v158 \n" -" v_mul_f32 v159, v15, v159 \n" -" v_mul_f32 v156, v16, v156 row_newbcast:12 \n" -" v_mul_f32 v157, v16, v157 row_newbcast:13 \n" -" v_mul_f32 v158, v16, v158 row_newbcast:14 \n" -" v_mul_f32 v159, v16, v159 row_newbcast:15 \n" -" v_cvt_f32_i32 v160, v160 \n" -" v_cvt_f32_i32 v161, v161 \n" -" v_cvt_f32_i32 v162, v162 \n" -" v_cvt_f32_i32 v163, v163 \n" -" v_mul_f32 v160, v14, v160 \n" -" v_mul_f32 v161, v14, v161 \n" -" v_mul_f32 v162, v14, v162 \n" -" v_mul_f32 v163, v14, v163 \n" -" v_mul_f32 v160, v17, v160 row_newbcast:0 \n" -" v_mul_f32 v161, v17, v161 row_newbcast:1 \n" -" v_mul_f32 v162, v17, v162 row_newbcast:2 \n" -" v_mul_f32 v163, v17, v163 row_newbcast:3 \n" -" v_cvt_f32_i32 v164, v164 \n" -" v_cvt_f32_i32 v165, v165 \n" -" v_cvt_f32_i32 v166, v166 \n" -" v_cvt_f32_i32 v167, v167 \n" -" v_mul_f32 v164, v15, v164 \n" -" v_mul_f32 v165, v15, v165 \n" -" v_mul_f32 v166, v15, v166 \n" -" v_mul_f32 v167, v15, v167 \n" -" v_mul_f32 v164, v17, v164 row_newbcast:0 \n" -" v_mul_f32 v165, v17, v165 row_newbcast:1 \n" -" v_mul_f32 v166, v17, v166 row_newbcast:2 \n" -" v_mul_f32 v167, v17, v167 row_newbcast:3 \n" -" v_cvt_f32_i32 v168, v168 \n" -" v_cvt_f32_i32 v169, v169 \n" -" v_cvt_f32_i32 v170, v170 \n" -" v_cvt_f32_i32 v171, v171 \n" -" v_mul_f32 v168, v14, v168 \n" -" v_mul_f32 v169, v14, v169 \n" -" v_mul_f32 v170, v14, v170 \n" -" v_mul_f32 v171, v14, v171 \n" -" v_mul_f32 v168, v17, v168 row_newbcast:4 \n" -" v_mul_f32 v169, v17, v169 row_newbcast:5 \n" -" v_mul_f32 v170, v17, v170 row_newbcast:6 \n" -" v_mul_f32 v171, v17, v171 row_newbcast:7 \n" -" v_cvt_f32_i32 v172, v172 \n" -" v_cvt_f32_i32 v173, v173 \n" -" v_cvt_f32_i32 v174, v174 \n" -" v_cvt_f32_i32 v175, v175 \n" -" v_mul_f32 v172, v15, v172 \n" -" v_mul_f32 v173, v15, v173 \n" -" v_mul_f32 v174, v15, v174 \n" -" v_mul_f32 v175, v15, v175 \n" -" v_mul_f32 v172, v17, v172 row_newbcast:4 \n" -" v_mul_f32 v173, v17, v173 row_newbcast:5 \n" -" v_mul_f32 v174, v17, v174 row_newbcast:6 \n" -" v_mul_f32 v175, v17, v175 row_newbcast:7 \n" -" v_cvt_f32_i32 v176, v176 \n" -" v_cvt_f32_i32 v177, v177 \n" -" v_cvt_f32_i32 v178, v178 \n" -" v_cvt_f32_i32 v179, v179 \n" -" v_mul_f32 v176, v14, v176 \n" -" v_mul_f32 v177, v14, v177 \n" -" v_mul_f32 v178, v14, v178 \n" -" v_mul_f32 v179, v14, v179 \n" -" v_mul_f32 v176, v17, v176 row_newbcast:8 \n" -" v_mul_f32 v177, v17, v177 row_newbcast:9 \n" -" v_mul_f32 v178, v17, v178 row_newbcast:10 \n" -" v_mul_f32 v179, v17, v179 row_newbcast:11 \n" -" v_cvt_f32_i32 v180, v180 \n" -" v_cvt_f32_i32 v181, v181 \n" -" v_cvt_f32_i32 v182, v182 \n" -" v_cvt_f32_i32 v183, v183 \n" -" v_mul_f32 v180, v15, v180 \n" -" v_mul_f32 v181, v15, v181 \n" -" v_mul_f32 v182, v15, v182 \n" -" v_mul_f32 v183, v15, v183 \n" -" v_mul_f32 v180, v17, v180 row_newbcast:8 \n" -" v_mul_f32 v181, v17, v181 row_newbcast:9 \n" -" v_mul_f32 v182, v17, v182 row_newbcast:10 \n" -" v_mul_f32 v183, v17, v183 row_newbcast:11 \n" -" v_cvt_f32_i32 v184, v184 \n" -" v_cvt_f32_i32 v185, v185 \n" -" v_cvt_f32_i32 v186, v186 \n" -" v_cvt_f32_i32 v187, v187 \n" -" v_mul_f32 v184, v14, v184 \n" -" v_mul_f32 v185, v14, v185 \n" -" v_mul_f32 v186, v14, v186 \n" -" v_mul_f32 v187, v14, v187 \n" -" v_mul_f32 v184, v17, v184 row_newbcast:12 \n" -" v_mul_f32 v185, v17, v185 row_newbcast:13 \n" -" v_mul_f32 v186, v17, v186 row_newbcast:14 \n" -" v_mul_f32 v187, v17, v187 row_newbcast:15 \n" -" v_cvt_f32_i32 v188, v188 \n" -" v_cvt_f32_i32 v189, v189 \n" -" v_cvt_f32_i32 v190, v190 \n" -" v_cvt_f32_i32 v191, v191 \n" -" v_mul_f32 v188, v15, v188 \n" -" v_mul_f32 v189, v15, v189 \n" -" v_mul_f32 v190, v15, v190 \n" -" v_mul_f32 v191, v15, v191 \n" -" v_mul_f32 v188, v17, v188 row_newbcast:12 \n" -" v_mul_f32 v189, v17, v189 row_newbcast:13 \n" -" v_mul_f32 v190, v17, v190 row_newbcast:14 \n" -" v_mul_f32 v191, v17, v191 row_newbcast:15 \n" +_DEQUAN_CVT_("%[c0]","%[c1]","%[c2]","%[c3]","%[a_scale0]"," %[gq_scale0]","0","1","2","3") +_DEQUAN_CVT_("%[c4]","%[c5]","%[c6]","%[c7]","%[a_scale1]"," %[gq_scale0]","0","1","2","3") +_DEQUAN_CVT_("%[c8]","%[c9]","%[c10]","%[c11]","%[a_scale0]"," %[gq_scale0]","4","5","6","7") +_DEQUAN_CVT_("%[c12]","%[c13]","%[c14]","%[c15]","%[a_scale1]"," %[gq_scale0]","4","5","6","7") +_DEQUAN_CVT_("%[c16]","%[c17]","%[c18]","%[c19]","%[a_scale0]"," %[gq_scale0]","8","9","10","11") +_DEQUAN_CVT_("%[c20]","%[c21]","%[c22]","%[c23]","%[a_scale1]"," %[gq_scale0]","8","9","10","11") +_DEQUAN_CVT_("%[c24]","%[c25]","%[c26]","%[c27]","%[a_scale0]"," %[gq_scale0]","12","13","14","15") +_DEQUAN_CVT_("%[c28]","%[c29]","%[c30]","%[c31]","%[a_scale1]"," %[gq_scale0]","12","13","14","15") +_DEQUAN_CVT_("%[c32]","%[c33]","%[c34]","%[c35]","%[a_scale0]"," %[gq_scale1]","0","1","2","3") +_DEQUAN_CVT_("%[c36]","%[c37]","%[c38]","%[c39]","%[a_scale1]"," %[gq_scale1]","0","1","2","3") +_DEQUAN_CVT_("%[c40]","%[c41]","%[c42]","%[c43]","%[a_scale0]"," %[gq_scale1]","4","5","6","7") +_DEQUAN_CVT_("%[c44]","%[c45]","%[c46]","%[c47]","%[a_scale1]"," %[gq_scale1]","4","5","6","7") +_DEQUAN_CVT_("%[c48]","%[c49]","%[c50]","%[c51]","%[a_scale0]"," %[gq_scale1]","8","9","10","11") +_DEQUAN_CVT_("%[c52]","%[c53]","%[c54]","%[c55]","%[a_scale1]"," %[gq_scale1]","8","9","10","11") +_DEQUAN_CVT_("%[c56]","%[c57]","%[c58]","%[c59]","%[a_scale0]"," %[gq_scale1]","12","13","14","15") +_DEQUAN_CVT_("%[c60]","%[c61]","%[c62]","%[c63]","%[a_scale1]"," %[gq_scale1]","12","13","14","15") + #undef _UK_MFMA_ #undef _DEQUAN_CVT_ diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp index 01a53107fb..509fa73776 100644 --- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp @@ -186,6 +186,50 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 return coords; } + // TODO: this row id is before shuffle atomic, need use acc distribution + //this calculation shared by G and SMQ + CK_TILE_DEVICE auto GetColCoords_GQSMQ(index_t base_offset) + { + constexpr index_t MLanes = BlockShape::Warp_M1; + constexpr index_t Repeat_N = 2;//different,this load is partitioned along N + + // auto h_id = threadIdx.x / MLanes ; + // auto r_id = threadIdx.x & 0xffff; + // auto p_id = r_id/4; + // auto q_is = threadIdx.x & 0x3; + + array coords; + static_for<0, Repeat_N, 1>{}([&](auto i) { coords.at(i) = base_coord + (threadIdx.x / MLanes) * 4 + + (threadIdx.x & 0xffff)/4 * 64 + + q_id + + i * 256 ; }); + return coords; + } + //this calculation shared by G and SMQ + CK_TILE_DEVICE auto GetGQScale(const COL_IDS coords, + const GScaleDataType* g_scale_ptr) + { + constexpr index_t n_size = coords.size(); + + array g_scale_value; + static_for<0, n_size, 1>{}([&](auto i) { + g_scale_value.at(i) = g_scale_ptr[coords[i]]; + }); + + return g_scale_value; + } + CK_TILE_DEVICE auto GetSMQScale(const COL_IDS coords, + const YSmoothScaleDataType * y_scale_ptr) + { + constexpr index_t n_size = coords.size(); + array y_scale_value; + static_for<0, n_size, 1>{}([&](auto i) { + y_scale_value.at(i) = y_scale_ptr[coords[i]]; + }); + + return y_scale_value; + } + template CK_TILE_DEVICE auto operator()(const Karg& kargs, @@ -230,12 +274,6 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 return (row_ids_a[i]) &0xffffff; }, number{}); - // auto token_id_mma = generate_tuple( - // [&](auto i) { - // return (row_ids_a_mma[i]) &0xffffff; - // }, - // number{}); - //addr in fact auto a_coords = generate_tuple( [&](auto i) { return ((row_ids_a[i])&0xffffff) * kargs.stride_token + @@ -306,7 +344,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 auto smq_win = [&]() { const YSmoothScaleDataType* smq_ptr = reinterpret_cast(kargs.y_smooth_scale_ptr) + static_cast(expert_id) * smq_scale_expert_stride_0 + - intermediate_tile_id * BlockShape::Block_N0; + intermediate_tile_id * BlockShape::Block_K1; // const GDataType* g_ptr = reinterpret_cast(kargs.g_scale_ptr);//remember to add expert id for inline auto smq_view_ = make_naive_tensor_view( smq_ptr, @@ -346,15 +384,15 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 auto d_res = d_win.get_bottom_tensor_view().get_buffer_view().cached_buf_res_; //////gq auto dq_win = [&]() { - const DScaleDataType* g_ptr = reinterpret_cast(kargs.d_scale_ptr) + - static_cast(expert_id) * d_scale_expert_stride_1; + const DScaleDataType* dq_ptr = reinterpret_cast(kargs.d_scale_ptr) + + static_cast(expert_id) * d_scale_expert_stride_1; // const GDataType* g_ptr = reinterpret_cast(kargs.d_scale_ptr)//remember to add expert_id as expert_idx - auto g_view_ = make_naive_tensor_view_packed( - g_ptr, + auto dq_view_ = make_naive_tensor_view_packed( + dq_ptr, make_tuple(kargs.hidden_size), number<1>{}); - return g_view_; + return dq_view_; }(); auto dq_res = dq_win.get_buffer_view().cached_buf_res_; @@ -400,15 +438,6 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 generate_tuple([&](auto i) { return cmp_lt_to_exec(token_id[i], kargs.num_tokens); }, number{}); - // auto bridge_sst_win = [&]() { - // constexpr auto desc_ = Policy::template MakeBridgeLdsStoreForUKDesc(); - // constexpr auto dist_ = Policy::template GetUK_0().MakeCBlockDist(); - // return make_tile_window_linear(make_tensor_view( - // reinterpret_cast(smem), desc_), - // desc_.get_lengths(), - // {0, 0}, - // dist_); - // }(); auto o_res = make_wave_buffer_resource(reinterpret_cast(kargs.o_ptr), kargs.num_tokens * kargs.stride_token * sizeof(ODataType)); @@ -417,16 +446,17 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 auto w_scale = GetWeightScale( row_coords_o, reinterpret_cast(kargs.sorted_weight_ptr)); auto a_scale = GetAScale( - row_coords_o, reinterpret_cast(kargs.a_scale_ptr)); - + row_ids_a_mma, reinterpret_cast(kargs.a_scale_ptr)); + auto gqsmq_coords = GetColCoords_GQSMQ(intermediated_tile_id * BlockShape::Block_K1); + auto dq_coords = gqsmq_coords[0];//only one for this tiling + auto gq_scale = GetGQScale( + gqsmq_coords, reinterpret_cast(kargs.g_scale_ptr + static_cast(expert_id) * shared_intermediate_size_0)); + auto smq_scale = GetSMQScale( + gqsmq_coords, reinterpret_cast(kargs.y_smooth_scale_ptr + static_cast(expert_id) * shared_intermediate_size_0)); auto uk_0 = Policy::template GetUK_0(); // auto acc_0= uk_0( - uk_0( - row_ids_a_mma,//fake token id, 2D index for X scale - a_scale, - dq_res, - gq_res, - smq_res, + uk_0( a_scale, + gq_scale, a_res, a_coords, g_res, @@ -457,6 +487,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 auto uk_1 = Policy::template GetUK_1(); uk_1(dq_res, d_res, + dq_coords, d_coords, o_res, o_coords, @@ -464,6 +495,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 smem, kargs.hidden_size, // total n number w_scale, + smq_scale, BlockShape::Block_N1, shared_intermediate_size_1 * BlockShape::Block_N1 - kr_1 * BlockShape::Block_W1, // along N kr_1 * BlockShape::Block_W1,