Fix the vector load & fix the gfx950 compv4 error (#2831)

This commit is contained in:
Thomas Ning
2025-09-12 11:48:45 -07:00
committed by GitHub
parent 321627aec5
commit 1894a0dbc3
3 changed files with 35 additions and 20 deletions

View File

@@ -1335,8 +1335,10 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
static_assert(
(std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
(std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
(std::is_same<T, fp16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
(std::is_same<T, bf16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
(std::is_same<T, fp16_t>::value &&
(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
(std::is_same<T, bf16_t>::value &&
(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
(std::is_same<T, int32_t>::value &&
(N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
(std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
@@ -1449,14 +1451,19 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
src_wave_addr_offset,
static_cast<index_t>(coherence)));
}
else if constexpr(N == 8)
else
{
// use fp32 load to mimic fp16 load
fp32x4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
// N >= 8: build from fp32x4 chunks
thread_buffer<float, N / 2> tmp;
static_for<0, (N / 8), 1>{}([&](auto i) {
constexpr index_t chunk = i;
tmp.template get_as<fp32x4_t>()(i) = llvm_amdgcn_raw_buffer_load_fp32x4(
src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + (chunk * 4) * sizeof(float),
static_cast<index_t>(coherence));
});
return bit_cast<rtn_type>(tmp);
}
}
@@ -1486,13 +1493,19 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
src_wave_addr_offset,
static_cast<index_t>(coherence)));
}
else if constexpr(N == 8)
else
{
int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
// N >= 8: build from fp32x4 chunks
thread_buffer<float, N / 2> tmp;
static_for<0, (N / 8), 1>{}([&](auto i) {
constexpr index_t chunk = i;
tmp.template get_as<fp32x4_t>()(i) = llvm_amdgcn_raw_buffer_load_fp32x4(
src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + (chunk * 4) * sizeof(float),
static_cast<index_t>(coherence));
});
return bit_cast<rtn_type>(tmp);
}
}