mirror of https://github.com/ROCm/composable_kernel.git, synced 2026-05-12 01:10:17 +00:00
debugging

@@ -11,13 +11,6 @@ struct NativeDimension
     __host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }

     __host__ __device__ static constexpr auto GetStride() { return Number<Stride>{}; }
-
-    __host__ __device__ static constexpr index_t CalculateOffset(index_t i) { return i * Stride; }
-
-    __host__ __device__ static constexpr index_t CalculateOffsetDiff(index_t i_diff)
-    {
-        return i_diff * Stride;
-    }
 };

 } // namespace ck

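For context, the two removed helpers implement the affine offset rule for a single strided dimension, and a multi-dimensional descriptor gets its offset by summing these per-dimension terms. A standalone sketch of that rule (names hypothetical, not CK's API):

    #include <cstddef>
    #include <cstdio>

    // Hypothetical illustration of the removed per-dimension offset rule:
    // a dimension with fixed Stride maps index i to element offset i * Stride,
    // and an index step i_diff to an offset step i_diff * Stride.
    template <std::size_t Stride>
    struct StridedDim
    {
        static constexpr std::size_t CalculateOffset(std::size_t i) { return i * Stride; }
        static constexpr std::size_t CalculateOffsetDiff(std::size_t i_diff) { return i_diff * Stride; }
    };

    int main()
    {
        // a row-major 4x8 tensor: row stride 8, column stride 1
        using Row = StridedDim<8>;
        using Col = StridedDim<1>;
        // offset of element (2, 5) is the sum of the per-dimension terms
        std::printf("%zu\n", Row::CalculateOffset(2) + Col::CalculateOffset(5)); // prints 21
    }
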
@@ -193,7 +193,7 @@ struct TensorCoordinate
     private:
     template <typename... Ts>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
     {
         return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());

@@ -201,7 +201,7 @@ struct TensorCoordinate

     template <typename... Ts>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
     {
         return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());

@@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated
     private:
     template <class... Ts>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
     {
         return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
     }

     template <class... Ts>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
    {
         return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
     }

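This hunk and the two TensorCoordinate hunks above all touch the same idiom: MakeDummyTensorCoordinate is overloaded on an empty descriptor value, so overload resolution, rather than a trait, maps each descriptor type to its coordinate type at compile time. A reduced standalone sketch of the idiom (types hypothetical):

    #include <type_traits>

    // The descriptor argument carries no data; it only selects the overload,
    // and the return type tells callers which coordinate type to use.
    struct NativeDesc {};
    struct TransformedDesc {};
    struct NativeCoord {};
    struct TransformedCoord {};

    constexpr NativeCoord MakeDummyCoordinate(NativeDesc) { return {}; }
    constexpr TransformedCoord MakeDummyCoordinate(TransformedDesc) { return {}; }

    // usage: decltype recovers the coordinate type for a given descriptor
    using CoordForNative = decltype(MakeDummyCoordinate(NativeDesc{}));
    static_assert(std::is_same<CoordForNative, NativeCoord>::value,
                  "overload resolution picked the matching coordinate type");

    int main() {}
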
@@ -75,10 +75,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
     {
 #if 0
         mThreadwiseLoad.template Run<BlockSrcData,
-                                     ThreadBufferData,
-                                     BlockSrcAddressSpace,
-                                     ThreadBufferAddressSpace>(p_block_src,
-                                                               p_thread_buffer);
+                                     ThreadBufferData,
+                                     BlockSrcAddressSpace,
+                                     ThreadBufferAddressSpace>(p_block_src, p_thread_buffer);
 #else // tweaking
         mThreadwiseLoad.template Run_optimized_src_address_calculation<BlockSrcData,
                                                                        ThreadBufferData,

@@ -483,8 +483,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
               address_space_t ThreadBufferAddressSpace = address_space_t::generic>
     __device__ void RunLoadThreadBuffer(const TData* p_block_src, TData* p_thread_buffer) const
     {
-        mThreadwiseLoad.Run<TData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(p_block_src,
-                                                                                   p_thread_buffer);
+        mThreadwiseLoad.template Run<TData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(
+            p_block_src, p_thread_buffer);
     }

     template <typename TData,

@@ -492,8 +492,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
               address_space_t BlockDstAddressSpace = address_space_t::generic>
     __device__ void RunStoreThreadBuffer(const TData* p_thread_buffer, TData* p_block_dst) const
     {
-        mThreadwiseStore.Run<TData, ThreadBufferAddressSpace, BlockDstAddressSpace>(p_thread_buffer,
-                                                                                    p_block_dst);
+        mThreadwiseStore.template Run<TData, ThreadBufferAddressSpace, BlockDstAddressSpace>(
+            p_thread_buffer, p_block_dst);
     }

     template <typename TData,

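The RunLoadThreadBuffer and RunStoreThreadBuffer hunks are the same one-token fix: mThreadwiseLoad and mThreadwiseStore have types that depend on the enclosing class's template parameters, so calling their member template Run with an explicit template argument list requires the template disambiguator; without it the `<` is parsed as less-than. A minimal standalone reproduction:

    // Minimal reproduction of why `.template` is required above.
    template <int N>
    struct Worker
    {
        template <typename T>
        void Run(T) const {}
    };

    template <int N>
    struct Owner
    {
        Worker<N> mWorker; // type depends on template parameter N

        void Go() const
        {
            // mWorker.Run<int>(0);       // ill-formed: '<' parsed as less-than
            mWorker.template Run<int>(0); // ok: 'template' marks Run as a member template
        }
    };

    int main() { Owner<1>{}.Go(); }
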
@@ -130,7 +130,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                 *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                     __buffer_load<SrcData, SrcDataPerAccess>(
-                        p_src, 0, src_coord.GetOffset());
+                        p_src, src_coord.GetOffset(), 0);
 #else
                 *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                     *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);

@@ -172,8 +172,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 __buffer_store<DstData, DstDataPerAccess>(
                     *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                     p_dst,
-                    0,
-                    dst_coord.GetOffset());
+                    dst_coord.GetOffset(),
+                    0);
 #else
                 *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
                     *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);

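Both buffer-intrinsic hunks make the same correction: the runtime coordinate offset moves out of the trailing argument slot and into the one before it, with the constant 0 taking the trailing slot. This page doesn't show CK's actual wrapper declaration, so the sketch below is purely illustrative: it only names the two slots that the hunks swap, on the assumption that the wrapper distinguishes a per-thread runtime offset from a uniform constant offset (device qualifiers omitted so it stands alone):

    // Hypothetical signature, NOT CK's actual __buffer_load wrapper.
    template <typename T, int DataPerAccess>
    T buffer_load_sketch(const T* p_block,
                         unsigned thread_offset, // runtime offset slot (src_coord.GetOffset() after the fix)
                         unsigned const_offset)  // uniform constant offset slot (0 after the fix)
    {
        // generic fallback semantics; a real implementation would lower to an
        // AMD buffer_load instruction with distinct offset operands
        return p_block[thread_offset + const_offset];
    }

    int main()
    {
        const float data[4] = {0.f, 1.f, 2.f, 3.f};
        return static_cast<int>(buffer_load_sketch<float, 1>(data, 2, 0)); // returns 2
    }

The two slots are not interchangeable, which is presumably why the old argument order mattered enough to land in a commit titled "debugging".
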
@@ -287,10 +287,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 const auto src_coord =
                     src_nonlinear_coord + (linear_dim_data_steps + scalar_id);

-                // this is src compile-time offset
-                // TODO: is this good implementation?
-                const index_t src_linear_offset =
-                    src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
+                // this is src compile-time offset
+#if 0
+                // TODO: is this good implementation?
+                const index_t src_linear_offset =
+                    src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
+#else
+                const index_t src_linear_offset =
+                    SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
+#endif

                 // Check src vector's padding situation, only check the first data in
                 // this src vector. It's user's responsiblity to make sure all data in

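The two #if branches here (and in the dst hunk below) compute the same quantity whenever the descriptor's offset map is linear in the stepped dimensions: for a linear map, offset(base + step) - offset(base) = offset(step), so the coordinate subtraction can be replaced by evaluating the descriptor directly on the step, which avoids materializing the second coordinate. A standalone check of that identity with a simplified, hypothetical 2D strided descriptor:

    #include <cassert>

    // Simplified 2D strided descriptor, only to illustrate the identity
    // that the #if/#else above relies on.
    struct Desc2D
    {
        static constexpr int stride0 = 32, stride1 = 1;
        static constexpr int CalculateOffset(int i0, int i1) { return i0 * stride0 + i1 * stride1; }
    };

    int main()
    {
        const int base0 = 3, base1 = 5; // plays the role of src_nonlinear_coord
        const int step0 = 1, step1 = 2; // plays the role of linear_dim_data_steps + scalar_id

        // #if-0 branch: difference of two full offsets
        const int by_subtraction =
            Desc2D::CalculateOffset(base0 + step0, base1 + step1) -
            Desc2D::CalculateOffset(base0, base1);

        // #else branch: evaluate the (linear) offset map on the step alone
        const int by_direct = Desc2D::CalculateOffset(step0, step1);

        assert(by_subtraction == by_direct); // holds because the map is linear
        return 0;
    }

Note the asymmetry after this commit: the src side switches to the direct CalculateOffset form, while the dst side below keeps the subtraction form under #if 1.
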
@@ -471,10 +476,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 const auto dst_coord =
                     dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);

-                // this is dst compile-time offset
-                // TODO: is this good implementation?
-                const index_t dst_linear_offset =
-                    dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
+                // this is dst compile-time offset
+#if 1
+                // TODO: is this good implementation?
+                const index_t dst_linear_offset =
+                    dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
+#else
+                const index_t dst_linear_offset =
+                    DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
+#endif

                 // Check dst vector's padding situation, only check the first data in
                 // this dst vector. It's user's responsiblity to make sure all data in

@@ -6,7 +6,7 @@

 #define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_AMD 1
-#define CK_USE_AMD_INTRINSIC 1
+#define CK_USE_AMD_INTRINSIC 0
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1

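With CK_USE_AMD_INTRINSIC flipped to 0, every guard of the form `#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE` (as in the ThreadwiseGenericTensorSliceCopy_v4r2 hunks above) now takes its generic pointer-based #else branch; CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE stays 1 but is short-circuited by the &&. Plausibly this is to rule the intrinsics in or out as the culprit while debugging. A tiny standalone demonstration of the gating:

    #include <cstdio>

    #define CK_USE_AMD_INTRINSIC 0                   // value after this commit
    #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 // unchanged, but now moot

    int main()
    {
    #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
        std::puts("intrinsic buffer load/store path");
    #else
        std::puts("generic pointer path"); // taken: the && short-circuits on the 0
    #endif
    }
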
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
    // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
    constexpr index_t N = 128;

@@ -341,7 +341,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<3, 0>;
    using RightPads = Sequence<3, 0>;
-#elif 0
+#elif 1
    // 1x7 filter, 0x3 pad, 17x17 input
    constexpr index_t N = 128;
    constexpr index_t C = 128;

@@ -448,7 +448,7 @@ int main(int argc, char* argv[])
                                                       ConvStrides{},
                                                       ConvDilations{},
                                                       nrepeat);
-#elif 0
+#elif 1
    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc,
                                                                in_nchw,
                                                                wei_kcyx_desc,