Restructure gridwise and blockwise GEMM, add tensor contraction and FWD-v4r5 (#36)

* experimenting magic number division

* overhauling fwd-v4r4 to clearly reflect transformation graph

* added fwd-v4r5

* bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2

* bug fix and added sanity-check in transform_dynamic_tensor_descriptor

* added conv_driver_v2

[ROCm/composable_kernel commit: 30072aec37]
This commit is contained in:
Chao Liu
2021-06-09 23:53:08 -05:00
committed by GitHub
parent 040023fdcd
commit f4acec502e
38 changed files with 4791 additions and 2050 deletions

View File

@@ -118,6 +118,7 @@ struct MagicDivision
return (tmp + dividend) >> shift;
}
#if 1 // debug
// HACK: magic division for int32_t
// HACK: dividend_i32 is reinterpreted as uint32_t, so it must be
// non-negative for the result to be correct
@@ -127,8 +128,25 @@ struct MagicDivision
{
uint32_t dividend_u32 = as_type<uint32_t>(dividend_i32);
uint32_t tmp = ((uint64_t)dividend_u32 * (uint64_t)multiplier) >> 32;
return (tmp + dividend_i32) >> shift;
return (tmp + dividend_u32) >> shift;
}
#else
// the inline ASM is producing wrong result
// Magic division for int32_t, written as inline assembly.
//
// Intended to mirror the C++ path of DoMagicDivision:
//   tmp    = high 32 bits of (dividend * multiplier)
//   result = (tmp + dividend) >> shift
// with dividend_i32 reinterpreted as uint32_t (so it must be non-negative).
//
// dividend_i32: dividend, bit-cast to uint32_t before use.
// multiplier:   magic multiplier, passed in a scalar register ("s").
// shift:        right-shift amount, passed in a scalar register ("s").
// Returns the 32-bit result register bit-cast back to int32_t.
//
// NOTE(review): the output operand is declared early-clobber ("=&v").
// The first instruction writes %0 while %1 (the dividend) is still read by
// the second instruction; without '&' the compiler is free to allocate %0
// and %1 to the same register, which corrupts the dividend before the add.
// This is the likely cause of the wrong results previously observed with
// this path, but it has not been verified on hardware here.
__host__ __device__ static int32_t
DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
{
    uint32_t r;
    asm volatile("\n"
                 "v_mul_hi_u32 %0, %1, %2 \n"      // %0 = hi32(dividend * multiplier)
                 "v_add_u32_e32 %0, %1, %0 \n"     // %0 = dividend + %0
                 "v_lshrrev_b32_e32 %0, %3, %0 \n" // %0 = %0 >> shift
                 : "=&v"(r)
                 : "v"(as_type<uint32_t>(dividend_i32)), "s"(multiplier), "s"(shift));
    return as_type<int32_t>(r);
}
#endif
};
} // namespace ck