diff --git a/example/ck_tile/16_fused_moe_general/main.cpp b/example/ck_tile/16_fused_moe_general/main.cpp index 6c7cce8c23..9fba68fa43 100644 --- a/example/ck_tile/16_fused_moe_general/main.cpp +++ b/example/ck_tile/16_fused_moe_general/main.cpp @@ -207,17 +207,6 @@ bool run(const ck_tile::ArgParser& arg_parser) {(max_num_tokens_padded + block_m - 1) / block_m}); ck_tile::HostTensor num_sorted_tiles_host({1}); -#if 1 -# if 0 - ck_tile::FillStepRange{-.5f, .5f, 0.01f}(a_host); - ck_tile::FillStepRange{-.5f, .5f, 0.01f}(g_host); - ck_tile::FillStepRange{.5f, -.5f, -0.01f}(d_host); - ck_tile::FillStepRange{0.f, 1.f, 0.01f}(sa_host); - ck_tile::FillStepRange{0.f, 1.f, 0.01f}(sg_host); - ck_tile::FillStepRange{0.f, 1.f, 0.01f}(sd_host); - ck_tile::FillStepRange{0.f, 1.f, 0.01f}(sy_host); - ck_tile::FillStepRange{-.5f, .5f, 0.01f}(topk_weight_host); -# else ck_tile::FillUniformDistribution{-.5f, .5f}(a_host); ck_tile::FillUniformDistribution{-.5f, .5f}(g_host); ck_tile::FillUniformDistribution{-.5f, .5f}(d_host); @@ -226,7 +215,7 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::FillUniformDistribution{-.5f, .5f}(sd_host); ck_tile::FillUniformDistribution{-.5f, .5f}(sy_host); ck_tile::FillUniformDistribution{0.0f, 1.0f}(topk_weight_host); -# endif + // permute weight ck_tile::HostTensor g_perm_host = shuffle_moe_weight(g_host, prec_w, 1); @@ -248,81 +237,7 @@ bool run(const ck_tile::ArgParser& arg_parser) { topid_unique_gen(topk_ids_host.mData, tokens, topk, experts, 11913); } -#else - a_host.loadtxt("../../ater/input_torch.txt"); - - topk_ids_host.loadtxt("../../ater/topk_ids_torch.txt", "int"); - // topk_ids_host.savetxt("topk_ids_2.txt"); - topk_weight_host.loadtxt("../../ater/topk_weights_torch.txt", "float"); - std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; - - g_host.loadtxt("../../ater/w1_torch.txt", "float"); - std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; - d_host.loadtxt("../../ater/w2_torch.txt", "float"); - std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; - - ck_tile::HostTensor g_perm_host = shuffle_moe_weight(g_host, prec_w, 1); - std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; - ck_tile::HostTensor d_perm_host = shuffle_moe_weight(d_host, prec_w, 1); - std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; - -# if 0 - ck_tile::reference_moe_sorting( - topk_ids_host, - topk_weight_host, - sorted_token_ids_host, - sorted_weight_host, - sorted_expert_ids_host, - num_sorted_tiles_host.mData[0], - experts, - block_m); - - std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl; - std::cout << sorted_token_ids_host << std::endl; - std::cout << num_sorted_tiles_host << std::endl; - std::cout << sorted_expert_ids_host << std::endl; - - ck_tile::reference_fused_moe( - a_host, - g_host, - d_host, - sa_host, - sg_host, - sd_host, - sy_host, - o_host, - sorted_token_ids_host, - sorted_weight_host, - sorted_expert_ids_host, - num_sorted_tiles_host, - topk_ids_host, - block_m, - tokens, - experts, - hidden_size, - shared_intermediate_size_0, - topk, - gate_only); - - std::cout << "------- >" << std::endl; - std::cout << o_host << std::endl; - (void)balance; - - { - ck_tile::HostTensor o_host_torch({tokens, hidden_size}, {stride, 1}); - o_host_torch.loadtxt("../../ater/ref2_torch.txt"); - - auto [rtol, atol] = get_elimit(); - bool pass = ck_tile::check_err( - o_host, o_host_torch, std::string("OUT-Torch Error: Incorrect results!"), rtol, atol); - std::cout << ", valid:" << (pass ? "y" : "n") << std::flush; - } - - return 1; -# endif - -#endif - (void)balance; + ck_tile::reference_moe_sorting( topk_ids_host, topk_weight_host, diff --git a/include/ck_tile/host/reference/reference_fused_moe.hpp b/include/ck_tile/host/reference/reference_fused_moe.hpp index bcd810a958..d08dc409cd 100644 --- a/include/ck_tile/host/reference/reference_fused_moe.hpp +++ b/include/ck_tile/host/reference/reference_fused_moe.hpp @@ -135,7 +135,7 @@ void reference_fused_moe( for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++) { Activation{}(y(0, i_n), acc_0(0, i_n)); - printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, y(0, i_n)); + //printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, y(0, i_n)); } } else diff --git a/include/ck_tile/ops/flatmm/pipeline/uk/flatmm_uk_gfx9_32x512x128_1x4x1_16x16x16.hpp b/include/ck_tile/ops/flatmm/pipeline/uk/flatmm_uk_gfx9_32x512x128_1x4x1_16x16x16.hpp index 3d4d4ede0a..155a37efcc 100644 --- a/include/ck_tile/ops/flatmm/pipeline/uk/flatmm_uk_gfx9_32x512x128_1x4x1_16x16x16.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/uk/flatmm_uk_gfx9_32x512x128_1x4x1_16x16x16.hpp @@ -292,8 +292,8 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16 number{}); - printf("----- tid:%d, a_sld:%d\n", static_cast(threadIdx.x), - static_cast(a_sld.cached_coords_[number<0>{}].get_offset())); + // printf("----- tid:%d, a_sld:%d\n", static_cast(threadIdx.x), + // static_cast(a_sld.cached_coords_[number<0>{}].get_offset()));