diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp index 66825edcf9..a2421bd3d7 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp @@ -132,7 +132,7 @@ static constexpr ck::index_t BLOCKSIZE = 256; static constexpr ck::index_t NPerBlock = 128; static constexpr ck::index_t MNPerXDL = 32; static constexpr ck::index_t KPerBlock = 128 / sizeof(A0DataType); -static constexpr ck::index_t Nswizzle = true; +static constexpr ck::index_t Nswizzle = false; static constexpr ck::index_t AK1 = 16 / sizeof(A0DataType); static constexpr ck::index_t BK1 = 16 / sizeof(B0DataType); static constexpr ck::index_t EVec = 16 / sizeof(EDataType); @@ -170,11 +170,11 @@ int main(int argc, char* argv[]) // GEMM shape ck::index_t N = 4096; - ck::index_t K = 4096; - ck::index_t experts = 8; - ck::index_t sorted_tile_num = 8; - ck::index_t valid_tile_num = 8; - ck::index_t tokens = 128; + ck::index_t K = 6144; + ck::index_t experts = 1; + ck::index_t sorted_tile_num = 2000; + ck::index_t valid_tile_num = 2000; + ck::index_t tokens = 256000; ck::index_t topk = 2; // ck::index_t tokens = batch * topk; @@ -237,11 +237,10 @@ int main(int argc, char* argv[]) // int eids[] = {0, 0,1, 2,3, 3, 4,4, 5, 5, 6, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2} // max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13}; // int eids[] = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2} - max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8}; - int eids[] = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2} + max_token_id.mData = {valid_size}; for(int i = 0; i < sorted_tile_num; i++) { - expert_ids.mData[i] = eids[i]; + expert_ids.mData[i] = 0; } int token_per_tile = (tokens * topk + valid_tile_num - 1) / valid_tile_num; int tokenid = 0; diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp index 859649185a..fd150b4fdc 100644 --- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp @@ -58,7 +58,7 @@ struct ThreadGroupTensorSliceTransfer_v4r1_gather const DstDesc& dst_desc, const Index& dst_block_slice_origin, const DstElementwiseOperation& dst_element_op, - const StaticallyIndexedArray& gather_offsets) + const StaticallyIndexedArray& gather_offsets) : threadwise_transfer_(src_desc, make_zero_multi_index(), src_element_op, diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp index cf758e4d5f..c42b0cb65e 100644 --- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp @@ -149,7 +149,7 @@ struct ThreadGroupTensorSliceTransfer_v7r3_scatter template __device__ void RunWrite(const DstDescs& dst_descs, DstBuffers dst_bufs, - StaticallyIndexedArray& scatter_offsets, + StaticallyIndexedArray& scatter_offsets, Number thread_scratch_id = Number{}) { if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or @@ -169,7 +169,7 @@ struct ThreadGroupTensorSliceTransfer_v7r3_scatter const SrcBuffers& src_bufs, const DstDescs& dst_descs, DstBuffers dst_bufs, - StaticallyIndexedArray& scatter_offsets, + StaticallyIndexedArray& scatter_offsets, StaticallyIndexedArray& scatter_weights) { RunRead(src_descs, src_bufs, scatter_weights); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp index 6ec2a6d536..838d87bcbf 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp @@ -491,7 +491,7 @@ struct GridwiseMoeGemm template __host__ __device__ static auto - MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC) + MakeCGridDescriptor_M_N(long_index_t M, long_index_t MPad, long_index_t N, long_index_t NPad, long_index_t StrideC) { const auto c_grid_desc_mraw_nraw = [&]() { if constexpr(is_same::value) @@ -1171,6 +1171,8 @@ struct GridwiseMoeGemm const index_t max_token_id = __builtin_amdgcn_readfirstlane(p_max_token_id[0]); // static_assert(NSwizzle == false, "to do fix: need another pr in sorting merged"); const index_t expert_block_id = NSwizzle ? blockIdx.x / problem.NBlock : blockIdx.y; + if(threadIdx.x==0 && blockIdx.y+blockIdx.x==0) + printf("%ld %ld %ld\n", a_grid_desc_ak0_m_ak1.GetElementSpaceSize(), c_grid_desc_m_n.GetElementSpaceSize(), c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); if(expert_block_id * MPerBlock >= max_token_id) return; const index_t expert_id = @@ -1210,7 +1212,7 @@ struct GridwiseMoeGemm if(token_pos >= max_token_id || token0 >= problem.NumTokens) return; - StaticallyIndexedArray gather_offsets; + StaticallyIndexedArray gather_offsets; static_for<0, AMRepeats, 1>{}([&](auto m0) { const index_t fused_token = p_sorted_token_ids[token_pos + m0]; index_t token_offset = fused_token & 0xffffff; @@ -1563,7 +1565,7 @@ struct GridwiseMoeGemm const float* p_sorted_weights_0 = p_ds_grid[I0]; static_for<0, num_access, 1>{}([&](auto access_id) { // make sure it's safe to write to LDS - StaticallyIndexedArray + StaticallyIndexedArray scatter_offsets; //= p_sorted_token_ids[c_token_pos]; StaticallyIndexedArray scatter_weights; //= for topk // too hack here, 2 specific for topk weights, fixme @@ -1713,7 +1715,7 @@ struct GridwiseMoeGemm if(token_pos >= max_token_id || expert_block_id * MPerBlock >= max_token_id || token0 >= problem.NumTokens) return; - StaticallyIndexedArray + StaticallyIndexedArray gather_offsets; //= p_sorted_token_ids[token_pos]; static_for<0, AMRepeats, 1>{}([&](auto m0) { const index_t fused_token = p_sorted_token_ids[token_pos + m0]; @@ -2073,7 +2075,7 @@ struct GridwiseMoeGemm const float* p_sorted_weights_0 = p_ds_grid[I0]; static_for<0, num_access, 1>{}([&](auto access_id) { // make sure it's safe to write to LDS - StaticallyIndexedArray + StaticallyIndexedArray scatter_offsets; //= p_sorted_token_ids[c_token_pos]; StaticallyIndexedArray scatter_weights; //= for topk // too hack here, 2 specific for topk weights, fixme diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp index bb9a452761..c496622f9b 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp @@ -88,7 +88,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_gather const DstDesc& dst_desc, const Index& dst_slice_origin, const DstElementwiseOperation& dst_element_op, - const StaticallyIndexedArray& gather_offsets) + const StaticallyIndexedArray& gather_offsets) : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)), src_element_op_(src_element_op), @@ -935,7 +935,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_gather DstCoord dst_coord_; const SrcElementwiseOperation src_element_op_; const DstElementwiseOperation dst_element_op_; - StaticallyIndexedArray gather_offsets_; + StaticallyIndexedArray gather_offsets_; }; } // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp index 6ff30ff76a..51cafec39e 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp @@ -412,7 +412,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter enable_if_t = false> __device__ void RunWrite(const DstDescs& dst_descs, DstBuffers dst_bufs, - StaticallyIndexedArray& scatter_offsets, + StaticallyIndexedArray& scatter_offsets, Number thread_scratch_id = Number{}) { OOBCheck(thread_scratch_id); @@ -431,15 +431,24 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter // copy data from buf_vectors into dst_bufs static_for<0, nDst, 1>{}([&](auto i) { using dst_vector_t = typename remove_cvref_t::type; - auto dst_offset = scatter_offset + dst_coords_[i].GetOffset(); + long_index_t dst_offset = scatter_offset + (dst_coords_[i].GetOffset()); const bool is_dst_valid = dst_offset < dst_descs[i].GetElementSpaceSize(); // coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_descs[i], // dst_coords_[i]); - constexpr InMemoryDataOperationEnum DstInMemOp = static_cast(DstInMemOps::At(i.value)); dst_bufs(i).template Update( dst_offset, is_dst_valid, dst_vectors[i].template AsType()[I0]); + // if(threadIdx.x%8 ==0 && blockIdx.x==0) { + // if(dst_offset>80740352 && threadIdx.x==0) { + // static_for<0, 1, 1>{}([&](auto idx) { + // using DstData = remove_cvref_t>; + // using print_vec_t = typename vector_type::type; + // printf("tid %d off %ld valid %d %ld %f\n",threadIdx.x, dst_offset, + // is_dst_valid, dst_descs[i].GetElementSpaceSize(), type_convert(dst_vectors[i].template + // AsType()[idx])); + // }); + // } }); // move coordinate @@ -491,7 +500,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter const SrcBuffers& src_bufs, const DstDescs& dst_descs, DstBuffers dst_bufs, - StaticallyIndexedArray& scatter_offsets, + StaticallyIndexedArray& scatter_offsets, StaticallyIndexedArray& scatter_weights) { RunRead(src_descs, src_bufs, scatter_weights); diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index 6805fba4f9..d2f1409ef8 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -33,7 +33,7 @@ struct DynamicBuffer ElementSpaceSize element_space_size_; T invalid_element_value_ = T{0}; - static constexpr index_t PackedSize = []() { + static constexpr long_index_t PackedSize = []() { if constexpr(is_same_v, pk_i4_t>) return 2; else @@ -59,21 +59,21 @@ struct DynamicBuffer return BufferAddressSpace; } - __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; } + __host__ __device__ constexpr const T& operator[](long_index_t i) const { return p_data_[i]; } - __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; } + __host__ __device__ constexpr T& operator()(long_index_t i) { return p_data_[i]; } template >::type, typename scalar_type>::type>::value || !is_native_type(), bool>::type = false> - __host__ __device__ constexpr auto Get(index_t i, bool is_valid_element) const + __host__ __device__ constexpr auto Get(long_index_t i, bool is_valid_element) const { // X contains multiple T - constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; + constexpr long_index_t scalar_per_t_vector = scalar_type>::vector_size; - constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; + constexpr long_index_t scalar_per_x_vector = scalar_type>::vector_size; static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, "wrong! X should contain multiple T"); @@ -86,7 +86,7 @@ struct DynamicBuffer if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing) { - constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + constexpr long_index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; if constexpr(InvalidElementUseNumericalZeroValue) { @@ -140,7 +140,7 @@ struct DynamicBuffer typename enable_if>::type, typename scalar_type>::type>::value, bool>::type = false> - __host__ __device__ void Update(index_t i, bool is_valid_element, const X& x) + __host__ __device__ void Update(long_index_t i, bool is_valid_element, const X& x) { if constexpr(Op == InMemoryDataOperationEnum::Set) { @@ -189,10 +189,10 @@ struct DynamicBuffer } } - template + template __host__ __device__ void DirectCopyToLds(DstBuffer& dst_buf, - index_t src_offset, - index_t dst_offset, + long_index_t src_offset, + long_index_t dst_offset, bool is_valid_element) const { // Copy data from global to LDS memory using direct loads. @@ -214,12 +214,12 @@ struct DynamicBuffer typename scalar_type>::type>::value || !is_native_type(), bool>::type = false> - __host__ __device__ void Set(index_t i, bool is_valid_element, const X& x) + __host__ __device__ void Set(long_index_t i, bool is_valid_element, const X& x) { // X contains multiple T - constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; + constexpr long_index_t scalar_per_t_vector = scalar_type>::vector_size; - constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; + constexpr long_index_t scalar_per_x_vector = scalar_type>::vector_size; static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, "wrong! X should contain multiple T"); @@ -238,7 +238,7 @@ struct DynamicBuffer if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing) { - constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + constexpr long_index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; amd_buffer_store, t_per_x, coherence>( x, p_data_, i, is_valid_element, element_space_size_ / PackedSize); @@ -342,12 +342,13 @@ struct DynamicBuffer { if(is_valid_element) { -#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS +#if 0 X tmp = x; __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X)); #else - *c_style_pointer_cast(&p_data_[i]) = x; + // if(i >= 2169041600) + *c_style_pointer_cast(p_data_ + i) = x; #endif } } @@ -357,14 +358,14 @@ struct DynamicBuffer typename enable_if>::type, typename scalar_type>::type>::value, bool>::type = false> - __host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x) + __host__ __device__ void AtomicAdd(long_index_t i, bool is_valid_element, const X& x) { using scalar_t = typename scalar_type>::type; // X contains multiple T - constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; + constexpr long_index_t scalar_per_t_vector = scalar_type>::vector_size; - constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; + constexpr long_index_t scalar_per_x_vector = scalar_type>::vector_size; static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, "wrong! X should contain multiple T"); @@ -390,7 +391,7 @@ struct DynamicBuffer if constexpr(use_amd_buffer_addressing) { - constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + constexpr long_index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; amd_buffer_atomic_add, t_per_x>( x, p_data_, i, is_valid_element, element_space_size_ / PackedSize); @@ -408,12 +409,12 @@ struct DynamicBuffer typename enable_if>::type, typename scalar_type>::type>::value, bool>::type = false> - __host__ __device__ void AtomicMax(index_t i, bool is_valid_element, const X& x) + __host__ __device__ void AtomicMax(long_index_t i, bool is_valid_element, const X& x) { // X contains multiple T - constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; + constexpr long_index_t scalar_per_t_vector = scalar_type>::vector_size; - constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; + constexpr long_index_t scalar_per_x_vector = scalar_type>::vector_size; static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, "wrong! X should contain multiple T"); @@ -429,7 +430,7 @@ struct DynamicBuffer if constexpr(use_amd_buffer_addressing) { - constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + constexpr long_index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; amd_buffer_atomic_max, t_per_x>( x, p_data_, i, is_valid_element, element_space_size_ / PackedSize); diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp index b4f1545aa9..7a1e0e5a45 100644 --- a/include/ck/utility/tuple_helper.hpp +++ b/include/ck/utility/tuple_helper.hpp @@ -18,6 +18,13 @@ __host__ __device__ constexpr auto generate_tuple(F&& f, Number) typename arithmetic_sequence_gen<0, N, 1>::type{}); } +template +__host__ __device__ constexpr auto generate_tuple(F&& f, LongNumber) +{ + return unpack([&f](auto&&... xs) { return make_tuple(f(xs)...); }, + typename arithmetic_sequence_gen<0, N, 1>::type{}); +} + template __host__ __device__ constexpr auto generate_tie(F&& f, Number) {