mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Added F32 reference Unreorder function
- Implemented unpackb_f32f32f32of32_reference function. - Modified const pointer declaration in aocl_reorder_reference() to avoid compiler warnings. [AMD-Internal: SWLCSG-3618]
This commit is contained in:
@@ -363,7 +363,7 @@ AOCL_GEMM_REORDER(float,f32f32f32of32_reference)
|
||||
|
||||
//When n == 1, the B matrix becomes a vector.
|
||||
//Reordering is avoided so that LPGEMV can process it efficiently.
|
||||
if( ( n == 1 ) ) //&& ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
|
||||
if( ( n == 1 ) )
|
||||
{
|
||||
if(rs_b == 1)
|
||||
{
|
||||
@@ -531,7 +531,7 @@ void unreorderb_nr64_f32f32f32of32_reference
|
||||
( jc_cur_loop * k ) + ( n_sub_updated * pc ) +
|
||||
( jc_cur_loop_rem * kc0 ),
|
||||
( ( ( float* )b->storage.aligned_buffer ) +
|
||||
( rs_b * pc ) + (jc * cs_b)),
|
||||
( rs_b * pc ) + (jc * cs_b) ),
|
||||
nc0, kc0, NR, rs_b, cs_b
|
||||
);
|
||||
}
|
||||
|
||||
@@ -39,15 +39,15 @@
|
||||
#ifdef BLIS_ADDON_LPGEMM
|
||||
|
||||
/*
|
||||
Below are the reference packb functions which are
|
||||
varied based on block size NR (64, 48, 32, 16, lt) and
|
||||
Below are the reference packb functions which are
|
||||
varied based on block size NR (64, 48, 32, 16, lt) and
|
||||
order (row / column (transpose)).
|
||||
*/
|
||||
|
||||
static void packb_f32f32f32of32_row_major_ref
|
||||
(
|
||||
float* pack_b,
|
||||
const float* b,
|
||||
float* b,
|
||||
const dim_t ldb,
|
||||
const dim_t NC,
|
||||
const dim_t KC,
|
||||
@@ -69,11 +69,11 @@ static void packb_f32f32f32of32_row_major_ref
|
||||
}
|
||||
}
|
||||
|
||||
if(n_partial_pieces > 0)
|
||||
if(n_partial_pieces > 0)
|
||||
{
|
||||
float* pack_b_rem = ( pack_b + ( n_full_pieces_loop_limit * KC ) );
|
||||
float* b_rem = ( b + n_full_pieces_loop_limit );
|
||||
for ( dim_t kr = 0; kr < KC; kr ++ )
|
||||
for ( dim_t kr = 0; kr < KC; kr ++ )
|
||||
{
|
||||
float* inp0 = ( b_rem + ( ldb * kr ) );
|
||||
float* outp0 = ( pack_b_rem + ( kr * NR ) );
|
||||
@@ -88,7 +88,7 @@ static void packb_f32f32f32of32_row_major_ref
|
||||
static void packb_nr_f32f32f32of32_col_major_ref
|
||||
(
|
||||
float* pack_b_buffer,
|
||||
const float* b,
|
||||
float* b,
|
||||
const dim_t NR,
|
||||
const dim_t ldb,
|
||||
const dim_t KC,
|
||||
@@ -117,7 +117,7 @@ static void packb_nr_f32f32f32of32_col_major_ref
|
||||
static void packb_f32f32f32of32_col_major_ref
|
||||
(
|
||||
float* pack_b_buffer,
|
||||
const float* b,
|
||||
float* b,
|
||||
const dim_t ldb,
|
||||
const dim_t NC,
|
||||
const dim_t KC,
|
||||
@@ -133,7 +133,7 @@ static void packb_f32f32f32of32_col_major_ref
|
||||
for ( dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR )
|
||||
{
|
||||
packb_nr_f32f32f32of32_col_major_ref
|
||||
(
|
||||
(
|
||||
pack_b_buffer + (jc * KC),
|
||||
b + (jc * ldb), NR, ldb, KC, NR
|
||||
);
|
||||
@@ -155,7 +155,7 @@ static void packb_f32f32f32of32_col_major_ref
|
||||
void packb_f32f32f32of32_reference
|
||||
(
|
||||
float* pack_b,
|
||||
const float* b,
|
||||
float* b,
|
||||
const dim_t rs_b,
|
||||
const dim_t cs_b,
|
||||
const dim_t NC,
|
||||
|
||||
@@ -38,8 +38,95 @@
|
||||
|
||||
#ifdef BLIS_ADDON_LPGEMM
|
||||
|
||||
/*
   Unpacks a packed f32 B buffer back into a row-major matrix.

   Packed layout read by this routine: the panel covering columns
   [jc, jc + NR) starts at ( b + jc * KC ); inside a panel, row kr
   occupies NR consecutive floats starting at offset ( kr * NR ). The
   trailing partial panel (NC % NR columns) keeps the same NR row
   stride, but only its first ( NC % NR ) entries per row are copied.

   b        - packed source buffer.
   unpack_b - destination; element ( kr, j ) is written to
              unpack_b[ ( ldb * kr ) + j ].
   NC       - number of columns to unpack.
   KC       - number of rows to unpack.
   NR       - panel width used by the packed layout.
   ldb      - row stride (in elements) of the destination.

   Nothing is written when NR <= 0, NC <= 0 or KC <= 0.
*/
void unpackb_f32f32f32of32_row_major_ref
     (
       float*      b,
       float*      unpack_b,
       const dim_t NC,
       const dim_t KC,
       const dim_t NR,
       dim_t       ldb
     )
{
    // A non-positive panel width would divide by zero in the panel
    // count computation and could never advance the panel loop.
    if ( NR <= 0 )
    {
        return;
    }

    // Full panels and the trailing partial panel share the same
    // addressing; only the number of columns copied per row differs.
    for ( dim_t jc = 0; jc < NC; jc += NR )
    {
        const dim_t  cols_left = NC - jc;
        const dim_t  nr0       = ( cols_left < NR ) ? cols_left : NR;
        const float* panel     = b + ( jc * KC );
        float*       dst_base  = unpack_b + jc;

        for ( dim_t kr = 0; kr < KC; kr++ )
        {
            const float* src = panel + ( kr * NR );
            float*       dst = dst_base + ( ldb * kr );

            for ( dim_t i = 0; i < nr0; i++ )
            {
                dst[ i ] = src[ i ];
            }
        }
    }
}
|
||||
|
||||
/*
   Unpacks a packed f32 B buffer back into a column-major matrix.

   Packed layout read by this routine: the panel covering columns
   [jc, jc + NR) starts at ( b + jc * KC ); inside a panel, row kr
   occupies NR consecutive floats starting at offset ( kr * NR ). The
   trailing partial panel (NC % NR columns) keeps the same NR row
   stride, but only its first ( NC % NR ) entries per row are copied.

   b        - packed source buffer.
   unpack_b - destination; element ( kr, j ) is written to
              unpack_b[ ( j * ldb ) + kr ].
   NC       - number of columns to unpack.
   KC       - number of rows to unpack.
   NR       - panel width used by the packed layout.
   ldb      - column stride (in elements) of the destination.

   Nothing is written when NR <= 0, NC <= 0 or KC <= 0.
*/
void unpackb_f32f32f32of32_col_major_ref
     (
       float*      b,
       float*      unpack_b,
       const dim_t NC,
       const dim_t KC,
       const dim_t NR,
       dim_t       ldb
     )
{
    // A non-positive panel width would divide by zero in the panel
    // count computation and could never advance the panel loop.
    if ( NR <= 0 )
    {
        return;
    }

    for ( dim_t jc = 0; jc < NC; jc += NR )
    {
        const dim_t  cols_left = NC - jc;
        const dim_t  nr0       = ( cols_left < NR ) ? cols_left : NR;

        // The packed panel is addressed with KC and the destination
        // column block with ldb. Swapping the two only works when
        // ldb == KC; otherwise it reads and writes the wrong elements.
        const float* panel    = b + ( jc * KC );
        float*       dst_base = unpack_b + ( jc * ldb );

        for ( dim_t kr = 0; kr < KC; kr++ )
        {
            const float* src = panel + ( kr * NR );
            float*       dst = dst_base + kr;

            // Consecutive packed entries belong to consecutive columns,
            // which are ldb elements apart in the destination.
            for ( dim_t i = 0; i < nr0; i++ )
            {
                dst[ i * ldb ] = src[ i ];
            }
        }
    }
}
|
||||
|
||||
//TODO: Kept it as place holder for now, yet to test this completely!
|
||||
void unpackb_f32f32f32of32_reference
|
||||
(
|
||||
float* b,
|
||||
@@ -53,33 +140,11 @@ void unpackb_f32f32f32of32_reference
|
||||
{
|
||||
if( cs_b == 1 )
|
||||
{
|
||||
for ( dim_t jc = 0; jc < NC; jc += NR )
|
||||
{
|
||||
dim_t nr0 = ((NC - jc) > NR ? NR : (NC - jc));
|
||||
float* outp = ( unpack_b + jc );
|
||||
float* inp = (b + jc * NR );
|
||||
for ( dim_t kr = 0; kr < KC; kr++ )
|
||||
{
|
||||
outp += nr0; inp += NR ;
|
||||
|
||||
for(dim_t i = 0; i < nr0; i++) *outp++ = *inp++;
|
||||
}
|
||||
}
|
||||
unpackb_f32f32f32of32_row_major_ref(b, unpack_b, NC, KC, NR, rs_b);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for ( dim_t jc = 0; jc < NC; jc += NR )
|
||||
{
|
||||
dim_t nr0 = ((NC - jc) > NR ? NR : (NC - jc));
|
||||
for ( dim_t kr = 0; kr < KC; kr++ )
|
||||
{
|
||||
float* outp0 = ( unpack_b + ( cs_b * kr) + jc );
|
||||
float* inp0 = ( b + ( jc * KC ) + ( ( kr + NR )));
|
||||
|
||||
for(dim_t i = 0; i < nr0; i++) *outp0++ = *inp0++;
|
||||
}
|
||||
}
|
||||
unpackb_f32f32f32of32_col_major_ref(b, unpack_b, NC, KC, NR, cs_b);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user