mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
[CK] s_prefetch unit test fixes.
Signed-off-by: Michal Kulikowski <Michal.Kulikowski@amd.com>
This commit is contained in:
committed by
Michał Kulikowski
parent
f3ef7acca0
commit
cd8af997e6
@@ -3,7 +3,6 @@
|
||||
|
||||
#pragma once
|
||||
#include "data_type.hpp"
|
||||
#include "amd_inline_asm.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1065,48 +1064,4 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
|
||||
}
|
||||
#endif
|
||||
|
||||
template <index_t N>
|
||||
__device__ typename vector_type<int8_t, N>::type
|
||||
amd_s_buffer_load_impl_raw(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
|
||||
index_t src_wave_addr_offset)
|
||||
{
|
||||
static_assert(N == 4 || N == 8, "wrong! not implemented");
|
||||
// TODO: add other variants of s_buffer_load
|
||||
if constexpr(N == 4)
|
||||
{
|
||||
int32_t tmp =
|
||||
amd_assembly_s_buffer_load_b32(src_wave_buffer_resource, src_wave_addr_offset);
|
||||
return bit_cast<int8x4_t>(tmp);
|
||||
}
|
||||
else if constexpr(N == 8)
|
||||
{
|
||||
int32x2_t tmp =
|
||||
amd_assembly_s_buffer_load_b64(src_wave_buffer_resource, src_wave_addr_offset);
|
||||
return bit_cast<int8x8_t>(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, index_t N>
|
||||
__device__ typename vector_type<T, N>::type
|
||||
amd_s_buffer_load_impl(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
|
||||
index_t src_wave_addr_offset)
|
||||
{
|
||||
static_assert((is_same<T, double>::value && (N == 1)) ||
|
||||
(is_same<T, float>::value && (N == 1 || N == 2)) ||
|
||||
(is_same<T, half_t>::value && (N == 2 || N == 4)) ||
|
||||
(is_same<T, bhalf_t>::value && (N == 2 || N == 4)) ||
|
||||
(is_same<T, int32_t>::value && (N == 1 || N == 2)) ||
|
||||
(is_same<T, f8_t>::value && (N == 4 || N == 8)) ||
|
||||
(is_same<T, bf8_t>::value && (N == 4 || N == 8)) ||
|
||||
(is_same<T, int8_t>::value && (N == 4 || N == 8)) ||
|
||||
(is_same<T, uint8_t>::value && (N == 4 || N == 8)) ||
|
||||
(is_same<T, pk_i4_t>::value && (N == 4 || N == 8)),
|
||||
"wrong! not implemented");
|
||||
|
||||
using r_t = typename vector_type<T, N>::type;
|
||||
auto raw_data =
|
||||
amd_s_buffer_load_impl_raw<sizeof(T) * N>(src_wave_buffer_resource, src_wave_addr_offset);
|
||||
return bit_cast<r_t>(raw_data);
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
|
||||
#pragma once
|
||||
#include "data_type.hpp"
|
||||
#include "amd_inline_asm.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -886,48 +885,4 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
|
||||
}
|
||||
#endif
|
||||
|
||||
template <index_t N>
|
||||
__device__ typename vector_type<int8_t, N>::type
|
||||
amd_s_buffer_load_impl_raw(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
|
||||
index_t src_wave_addr_offset)
|
||||
{
|
||||
static_assert(N == 4 || N == 8, "wrong! not implemented");
|
||||
// TODO: add other variants of s_buffer_load
|
||||
if constexpr(N == 4)
|
||||
{
|
||||
int32_t tmp =
|
||||
amd_assembly_s_buffer_load_b32(src_wave_buffer_resource, src_wave_addr_offset);
|
||||
return bit_cast<int8x4_t>(tmp);
|
||||
}
|
||||
else if constexpr(N == 8)
|
||||
{
|
||||
int32x2_t tmp =
|
||||
amd_assembly_s_buffer_load_b64(src_wave_buffer_resource, src_wave_addr_offset);
|
||||
return bit_cast<int8x8_t>(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, index_t N>
|
||||
__device__ typename vector_type<T, N>::type
|
||||
amd_s_buffer_load_impl(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
|
||||
index_t src_wave_addr_offset)
|
||||
{
|
||||
static_assert((is_same<T, double>::value && (N == 1)) ||
|
||||
(is_same<T, float>::value && (N == 1 || N == 2)) ||
|
||||
(is_same<T, half_t>::value && (N == 2 || N == 4)) ||
|
||||
(is_same<T, bhalf_t>::value && (N == 2 || N == 4)) ||
|
||||
(is_same<T, int32_t>::value && (N == 1 || N == 2)) ||
|
||||
(is_same<T, f8_t>::value && (N == 4 || N == 8)) ||
|
||||
(is_same<T, bf8_t>::value && (N == 4 || N == 8)) ||
|
||||
(is_same<T, int8_t>::value && (N == 4 || N == 8)) ||
|
||||
(is_same<T, uint8_t>::value && (N == 4 || N == 8)) ||
|
||||
(is_same<T, pk_i4_t>::value && (N == 4 || N == 8)),
|
||||
"wrong! not implemented");
|
||||
|
||||
using r_t = typename vector_type<T, N>::type;
|
||||
auto raw_data =
|
||||
amd_s_buffer_load_impl_raw<sizeof(T) * N>(src_wave_buffer_resource, src_wave_addr_offset);
|
||||
return bit_cast<r_t>(raw_data);
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
|
||||
@@ -432,28 +432,5 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
|
||||
}
|
||||
#endif
|
||||
|
||||
// s_buffer_loads
|
||||
// Emits a single `s_buffer_load_b32` instruction via inline assembly and
// returns the 32-bit value it writes.
//
// src_wave_buffer_resource - buffer resource operand (%1 of the instruction).
// offset                   - offset operand (%2 of the instruction).
//
// NOTE(review): the asm block contains only the load itself; no wait
// instruction follows it here. Confirm that the result is guaranteed to be
// available before it is consumed.
inline __device__ int32_t
amd_assembly_s_buffer_load_b32(__amdgpu_buffer_rsrc_t src_wave_buffer_resource, unsigned int offset)
{
    int32_t loaded_value;

    // All three operands use the "s" register constraint. `volatile` plus the
    // "memory" clobber keep the compiler from dropping the statement or
    // moving memory accesses across it.
    asm volatile("s_buffer_load_b32 %0, %1, %2"
                 : "=s"(loaded_value)
                 : "s"(src_wave_buffer_resource), "s"(offset)
                 : "memory");

    return loaded_value;
}
|
||||
|
||||
// Emits a single `s_buffer_load_b64` instruction via inline assembly and
// returns the value it writes as an int32x2_t.
//
// src_wave_buffer_resource - buffer resource operand (%1 of the instruction).
// offset                   - offset operand (%2 of the instruction).
//
// NOTE(review): the asm block contains only the load itself; no wait
// instruction follows it here. Confirm that the result is guaranteed to be
// available before it is consumed.
inline __device__ int32x2_t
amd_assembly_s_buffer_load_b64(__amdgpu_buffer_rsrc_t src_wave_buffer_resource, unsigned int offset)
{
    int32x2_t loaded_pair;

    // All three operands use the "s" register constraint. `volatile` plus the
    // "memory" clobber keep the compiler from dropping the statement or
    // moving memory accesses across it.
    asm volatile("s_buffer_load_b64 %0, %1, %2"
                 : "=s"(loaded_pair)
                 : "s"(src_wave_buffer_resource), "s"(offset)
                 : "memory");

    return loaded_pair;
}
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user