Added F32 reference Unreorder function

- Implemented unpackb_f32f32f32of32_reference function.
 - Modified const pointer declaration in aocl_reorder_reference() to avoid compiler warnings.

[AMD-Internal: SWLCSG-3618 ]
This commit is contained in:
V, Varsha
2025-07-18 14:52:03 +05:30
committed by GitHub
parent 355018e739
commit 2f54bc1e14
3 changed files with 101 additions and 36 deletions

View File

@@ -363,7 +363,7 @@ AOCL_GEMM_REORDER(float,f32f32f32of32_reference)
//When n == 1, B matrix becomes a vector.
//Reordering is avoided so that LPGEMV can process it efficiently.
if( ( n == 1 ) ) //&& ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
if( ( n == 1 ) )
{
if(rs_b == 1)
{
@@ -531,7 +531,7 @@ void unreorderb_nr64_f32f32f32of32_reference
( jc_cur_loop * k ) + ( n_sub_updated * pc ) +
( jc_cur_loop_rem * kc0 ),
( ( ( float* )b->storage.aligned_buffer ) +
( rs_b * pc ) + (jc * cs_b)),
( rs_b * pc ) + (jc * cs_b) ),
nc0, kc0, NR, rs_b, cs_b
);
}

View File

@@ -39,15 +39,15 @@
#ifdef BLIS_ADDON_LPGEMM
/*
Below are the reference packb functions which are
varied based on block size NR (64, 48, 32, 16, lt) and
Below are the reference packb functions which are
varied based on block size NR (64, 48, 32, 16, lt) and
order (row / column (transpose)).
*/
static void packb_f32f32f32of32_row_major_ref
(
float* pack_b,
const float* b,
float* b,
const dim_t ldb,
const dim_t NC,
const dim_t KC,
@@ -69,11 +69,11 @@ static void packb_f32f32f32of32_row_major_ref
}
}
if(n_partial_pieces > 0)
if(n_partial_pieces > 0)
{
float* pack_b_rem = ( pack_b + ( n_full_pieces_loop_limit * KC ) );
float* b_rem = ( b + n_full_pieces_loop_limit );
for ( dim_t kr = 0; kr < KC; kr ++ )
for ( dim_t kr = 0; kr < KC; kr ++ )
{
float* inp0 = ( b_rem + ( ldb * kr ) );
float* outp0 = ( pack_b_rem + ( kr * NR ) );
@@ -88,7 +88,7 @@ static void packb_f32f32f32of32_row_major_ref
static void packb_nr_f32f32f32of32_col_major_ref
(
float* pack_b_buffer,
const float* b,
float* b,
const dim_t NR,
const dim_t ldb,
const dim_t KC,
@@ -117,7 +117,7 @@ static void packb_nr_f32f32f32of32_col_major_ref
static void packb_f32f32f32of32_col_major_ref
(
float* pack_b_buffer,
const float* b,
float* b,
const dim_t ldb,
const dim_t NC,
const dim_t KC,
@@ -133,7 +133,7 @@ static void packb_f32f32f32of32_col_major_ref
for ( dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR )
{
packb_nr_f32f32f32of32_col_major_ref
(
(
pack_b_buffer + (jc * KC),
b + (jc * ldb), NR, ldb, KC, NR
);
@@ -155,7 +155,7 @@ static void packb_f32f32f32of32_col_major_ref
void packb_f32f32f32of32_reference
(
float* pack_b,
const float* b,
float* b,
const dim_t rs_b,
const dim_t cs_b,
const dim_t NC,

View File

@@ -38,8 +38,95 @@
#ifdef BLIS_ADDON_LPGEMM
/*
   Reference unpack of a packed F32 B buffer into a row-major matrix.

   The packed buffer b is read as a sequence of NR-column panels: the
   panel that starts at column jc begins at b + ( jc * KC ) and holds KC
   rows of NR contiguous floats. Element ( kr, jc + i ) of the panel is
   written to unpack_b[ ( ldb * kr ) + jc + i ].

   When NC is not a multiple of NR, the trailing panel is still read
   with a row stride of NR, but only the first ( NC % NR ) floats of
   each row are copied out.

   b        - packed source buffer (only read here).
   unpack_b - destination buffer.
   NC       - number of columns to unpack.
   KC       - number of rows to unpack.
   NR       - panel width used by the packed layout.
   ldb      - distance, in floats, between consecutive destination rows.
*/
void unpackb_f32f32f32of32_row_major_ref
     (
       float*      b,
       float*      unpack_b,
       const dim_t NC,
       const dim_t KC,
       const dim_t NR,
       dim_t       ldb
     )
{
	const dim_t tail_cols = NC % NR;
	const dim_t full_cols = ( NC / NR ) * NR;

	// Full NR-wide panels.
	dim_t col = 0;
	while ( col < full_cols )
	{
		float* panel = b + ( col * KC );

		for ( dim_t row = 0; row < KC; ++row )
		{
			float* src = panel + ( row * NR );
			float* dst = unpack_b + ( ldb * row ) + col;

			for ( dim_t e = 0; e < NR; ++e )
			{
				dst[e] = src[e];
			}
		}

		col += NR;
	}

	// Nothing left over: NC is a multiple of NR.
	if ( tail_cols <= 0 )
	{
		return;
	}

	// Trailing panel: rows are NR apart in the packed buffer, but only
	// tail_cols elements per row are copied.
	float* tail_src = b + ( full_cols * KC );
	float* tail_dst = unpack_b + full_cols;

	for ( dim_t row = 0; row < KC; ++row )
	{
		float* src = tail_src + ( row * NR );
		float* dst = tail_dst + ( ldb * row );

		for ( dim_t e = 0; e < tail_cols; ++e )
		{
			dst[e] = src[e];
		}
	}
}
/*
   Reference unpack of a packed F32 B buffer into a column-major matrix.

   The packed buffer b is read as a sequence of NR-column panels: the
   panel that starts at column jc begins at b + ( jc * KC ) and holds KC
   rows of NR contiguous floats. Element ( kr, jc + i ) of the panel is
   written to unpack_b[ ( ( jc + i ) * ldb ) + kr ], i.e. consecutive
   columns of the destination are ldb floats apart.

   When NC is not a multiple of NR, the trailing panel is still read
   with a row stride of NR, but only the first ( NC % NR ) floats of
   each row are copied out.

   b        - packed source buffer (only read here).
   unpack_b - destination buffer.
   NC       - number of columns to unpack.
   KC       - number of rows to unpack.
   NR       - panel width used by the packed layout.
   ldb      - distance, in floats, between consecutive destination
              columns.
*/
void unpackb_f32f32f32of32_col_major_ref
     (
       float*      b,
       float*      unpack_b,
       const dim_t NC,
       const dim_t KC,
       const dim_t NR,
       dim_t       ldb
     )
{
	dim_t n_full_pieces = NC / NR;
	dim_t n_full_pieces_loop_limit = n_full_pieces * NR;
	dim_t n_partial_pieces = NC % NR;

	for ( dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR )
	{
		for ( dim_t kr = 0; kr < KC; kr++ )
		{
			// The packed panel for column jc starts at jc * KC, while
			// column jc of the destination starts at jc * ldb. Using
			// the strides the other way round only works when
			// ldb == KC and corrupts every panel after the first
			// otherwise.
			float* inp = ( b + ( jc * KC ) + ( kr * NR ) );
			float* outp = ( unpack_b + ( jc * ldb ) + kr );

			for ( dim_t i = 0; i < NR; i++ )
			{
				*( outp + ( i * ldb ) ) = *inp++;
			}
		}
	}

	if ( n_partial_pieces > 0 )
	{
		// Trailing panel: rows are NR apart in the packed buffer, but
		// only n_partial_pieces elements per row are copied.
		float* b_rem = ( b + ( n_full_pieces_loop_limit * KC ) );
		float* unpack_b_rem =
			( unpack_b + ( n_full_pieces_loop_limit * ldb ) );

		for ( dim_t kr = 0; kr < KC; kr++ )
		{
			float* inp = ( b_rem + ( kr * NR ) );
			float* outp = ( unpack_b_rem + kr );

			for ( dim_t i = 0; i < n_partial_pieces; i++ )
			{
				*( outp + ( i * ldb ) ) = *inp++;
			}
		}
	}
}
//TODO: Kept as a placeholder for now; this is yet to be tested completely.
void unpackb_f32f32f32of32_reference
(
float* b,
@@ -53,33 +140,11 @@ void unpackb_f32f32f32of32_reference
{
if( cs_b == 1 )
{
for ( dim_t jc = 0; jc < NC; jc += NR )
{
dim_t nr0 = ((NC - jc) > NR ? NR : (NC - jc));
float* outp = ( unpack_b + jc );
float* inp = (b + jc * NR );
for ( dim_t kr = 0; kr < KC; kr++ )
{
outp += nr0; inp += NR ;
for(dim_t i = 0; i < nr0; i++) *outp++ = *inp++;
}
}
unpackb_f32f32f32of32_row_major_ref(b, unpack_b, NC, KC, NR, rs_b);
}
else
{
for ( dim_t jc = 0; jc < NC; jc += NR )
{
dim_t nr0 = ((NC - jc) > NR ? NR : (NC - jc));
for ( dim_t kr = 0; kr < KC; kr++ )
{
float* outp0 = ( unpack_b + ( cs_b * kr) + jc );
float* inp0 = ( b + ( jc * KC ) + ( ( kr + NR )));
for(dim_t i = 0; i < nr0; i++) *outp0++ = *inp0++;
}
}
unpackb_f32f32f32of32_col_major_ref(b, unpack_b, NC, KC, NR, cs_b);
}
}