Experimenting with global and buffer load/store

This commit is contained in:
Chao Liu
2019-09-18 01:37:28 -05:00
parent 9f46cdf5fa
commit c7a6545ec4
5 changed files with 223 additions and 80 deletions

View File

@@ -487,6 +487,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
#if 0
mThreadwiseLoad.Run(p_src, p_buffer);
#else
// hardcoded: global to register
mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
#endif
}
@@ -497,7 +498,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
#if 0
mThreadwiseStore.Run(p_buffer, p_dst);
#else
mThreadwiseStore.template Run_amd_experiment<TData, 0, 2>(p_buffer, p_dst);
// hardcoded: register to LDS
mThreadwiseStore.template Run_amd_experiment<TData, 0, 1>(p_buffer, p_dst);
#endif
}
@@ -506,13 +508,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
{
TData p_buffer[GetRegisterBufferSize()];
#if 0
mThreadwiseLoad.Run(p_src, p_buffer);
mThreadwiseStore.Run(p_buffer, p_dst);
#else
mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
mThreadwiseStore.template Run_amd_experiment<TData, 0, 2>(p_buffer, p_dst);
#endif
RunLoadRegisterBuffer(p_src, p_buffer);
RunStoreRegisterBuffer(p_buffer, p_dst);
}
template <typename T, bool PositiveDirection>

View File

@@ -819,38 +819,38 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
src_vector_t vector_data;
// Read vector from src.
// 1. Source code version can take src of all kinds of memory-space
// 2. Inline asm versions using global_load or buffer_load can only take
// src from global-memory
//
// Comment for loading from global-memory:
// When
// 1) using source code, in order for compiler to emit optimal
// load instruction, or
// 2) using inline asm (global_load or buffer_load), in order
// for inline asm to be valid,
// following assumptions need to be satisfied:
// 1. p_src need to be block-invariant (assumption)
// 2. src_normal_offset must be calculated at compile time (guaranteed)
// 3. src_merged_offset can be runtime value (no assumption imposed)
static_if<SrcMemorySpace == 2>{}([&](auto) {
#if 1 // source code
// Load vector from src.
// src can be all kinds of memory-space.
// In order for optimized global_load to be emitted by compiler, need to
// assume:
// 1. p_src need to be block-invariant (assumption)
// 2. src_normal_offset must be calculated at compile time (guaranteed)
// 3. src_merged_offset can be runtime value (no assumption imposed)
vector_data = *reinterpret_cast<const src_vector_t*>(
&p_src[src_normal_offset + src_merged_offset]);
#else // inline asm using buffer_load
// Load vector from src
// src's memory-space can only be global-memory (buffer_load inline-asm is
// used)
// In order for buffer_load to be valid, need to assume:
// 1. p_src need to be block-invariant (assumption)
// 2. src_normal_offset must be calculated at compile time (guaranteed)
// 3. src_merged_offset can be runtime value (no assumption imposed)
vector_data = buffer_load<TData, SrcDataPerAccess>(
#elif 1 // inline asm using global_load
vector_data = __global_load<TData, SrcDataPerAccess>(
p_src,
static_cast<uint32_t>(src_merged_offset),
static_cast<uint32_t>(src_normal_offset));
#elif 1 // inline asm using buffer_load
vector_data = __buffer_load<TData, SrcDataPerAccess>(
p_src,
static_cast<uint32_t>(src_merged_offset),
static_cast<uint32_t>(src_normal_offset));
#endif
}).Else([&](auto) {
// Load vector from src.
// src can be all kinds of memory-space.
// In order for optimized global_load to be emitted by compiler, need to
// assume:
// 1. p_src need to be block-invariant (assumption)
// 2. src_normal_offset must be calculated at compile time (guaranteed)
// 3. src_merged_offset can be runtime value (no assumption imposed)
vector_data = *reinterpret_cast<const src_vector_t*>(
&p_src[src_normal_offset + src_merged_offset]);
});
@@ -924,36 +924,34 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
const index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
// Write vector into dst.
// 1. Source code version can take dst of all kinds of memory-space
// 2. Inline asm versions using global_store or buffer_store can only take
// dst from global-memory
//
// Comment for storing into global-memory:
// When
// 1) using source code, in order for compiler to emit optimal
// store instruction, or
// 2) using inline asm (global_store or buffer_store), in order
// for inline asm to be valid,
// following assumptions need to be satisfied:
// 1. p_dst need to be block-invariant (assumption)
// 2. dst_normal_offset must be calculated at compile time (guaranteed)
// 3. dst_merged_offset can be runtime value (no assumption imposed)
static_if<DstMemorySpace == 2>{}([&](auto) {
#if 1 // source code
// Write vector into dst.
// dst can be all kinds of memory-space
// In order for optimized global_store to be emitted by compiler, need to
// assume:
// 1. p_dst need to be block-invariant (assumption)
// 2. dst_normal_offset must be calculated at compile time (guaranteed)
// 3. dst_merged_offset can be runtime value (no assumption imposed)
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
#else // inline asm using buffer_store
// Write vector into dst.
// dst's memory-space need to be global-memory (buffer_store is used)
// In order for optimized global_store to be emitted by compiler, need to
// assume:
// 1. p_dst need to be block-invariant (assumption)
// 2. dst_normal_offset must be calculated at compile time (guaranteed)
// 3. dst_merged_offset can be runtime value (no assumption imposed)
buffer_store<TData, DstDataPerAccess>(
#elif 1 // inline asm using global_store
__global_store<TData, DstDataPerAccess>(
vector_data, p_dst, dst_merged_offset, dst_normal_offset);
#elif 1 // inline asm using buffer_store
__buffer_store<TData, DstDataPerAccess>(
vector_data, p_dst, dst_merged_offset, dst_normal_offset);
#endif
}).Else([&](auto) {
// Write vector into dst.
// dst can be all kinds of memory-space
// In order for optimized global_store to be emitted by compiler, need to
// assume:
// 1. p_dst need to be block-invariant (assumption)
// 2. dst_normal_offset must be calculated at compile time (guaranteed)
// 3. dst_merged_offset can be runtime value (no assumption imposed)
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
});