mirror of
https://github.com/amd/blis.git
synced 2026-04-19 23:28:52 +00:00
Implemented f32 reference reorder function
Implemented aocl_reorder_f32f32f32of32_reference( ) function and tested. Implemented framework changes required and place holder for kernels for aocl_unreorder_f32f32f32of32_reference( ) function. It is not tested completely and will be taken care in subsequent commits. [AMD-Internal: SWLCSG-3618 ]
This commit is contained in:
@@ -36,6 +36,7 @@
|
||||
#include "aocl_gemm_interface_apis.h"
|
||||
#include "lpgemm_config.h"
|
||||
#include "lpgemm_utils.h"
|
||||
#include "lpgemm_reorder_f32.h"
|
||||
|
||||
AOCL_GEMM_GET_REORDER_BUF_SIZE(f32f32f32of32)
|
||||
{
|
||||
@@ -273,3 +274,359 @@ AOCL_GEMM_REORDER(float,f32f32f32of32)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Pack B into row stored column panels.
|
||||
AOCL_GEMM_REORDER(float,f32f32f32of32_reference)
|
||||
{
|
||||
trans_t blis_trans;
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans(trans, &blis_trans);
|
||||
if ( ( input_buf_addr == NULL ) || ( reorder_buf_addr == NULL ) ||
|
||||
( k <= 0 ) || ( n <= 0 ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Only supports row major packing now.
|
||||
inc_t rs_b, cs_b;
|
||||
if( ( order == 'r') || ( order == 'R' ) )
|
||||
{
|
||||
if( ( bli_is_notrans( blis_trans ) && ( ldb < n ) ) ||
|
||||
( bli_is_trans( blis_trans ) && ( ldb < k ) ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
else
|
||||
{
|
||||
rs_b = bli_is_notrans( blis_trans ) ? ldb : 1;
|
||||
cs_b = bli_is_notrans( blis_trans ) ? 1 : ldb;
|
||||
}
|
||||
}
|
||||
else if ( ( order == 'c' ) || ( order == 'C' ) )
|
||||
{
|
||||
if( ( bli_is_notrans( blis_trans ) && ( ldb < k ) ) ||
|
||||
( bli_is_trans( blis_trans ) && ( ldb < n ) ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
else
|
||||
{
|
||||
rs_b = bli_is_notrans( blis_trans ) ? 1 : ldb;
|
||||
cs_b = bli_is_notrans( blis_trans ) ? ldb : 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return; // Error
|
||||
}
|
||||
|
||||
// Check if AVX2 ISA is supported, lpgemm fp32 matmul only works with it.
|
||||
if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
|
||||
{
|
||||
bli_print_msg(" AVX2 ISA not supported by processor, "
|
||||
"cannot perform f32f32f32 gemm.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
/* Initialize BLIS. */
|
||||
bli_init_auto();
|
||||
|
||||
// Initialize lpgemm context.
|
||||
aocl_lpgemm_init_global_cntx();
|
||||
|
||||
AOCL_MATRIX_TYPE input_mat_type;
|
||||
bli_param_map_char_to_lpmat_type( mat_type, &input_mat_type );
|
||||
|
||||
if ( input_mat_type == A_MATRIX )
|
||||
{
|
||||
return; // A reorder not supported.
|
||||
}
|
||||
|
||||
// Query the context for various blocksizes.
|
||||
lpgemm_cntx_t* lcntx = lpgemm_get_global_cntx_obj( F32F32F32OF32 );
|
||||
dim_t NC = lcntx->blksz.NC;
|
||||
dim_t KC = lcntx->blksz.KC;
|
||||
dim_t NR = lcntx->blksz.NR;
|
||||
|
||||
dim_t rs_b_reorder = 0;
|
||||
dim_t cs_b_reorder = 0;
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global( &rntm_g );
|
||||
|
||||
dim_t n_threads = bli_rntm_num_threads( &rntm_g );
|
||||
n_threads = ( n_threads > 0 ) ? n_threads : 1;
|
||||
|
||||
|
||||
//When n == 1, B marix becomes a vector.
|
||||
//Reordering is avoided so that LPGEMV can process it efficiently.
|
||||
if( ( n == 1 ) ) //&& ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
|
||||
{
|
||||
if(rs_b == 1)
|
||||
{
|
||||
memcpy(reorder_buf_addr, input_buf_addr, (k * sizeof(float)));
|
||||
}else
|
||||
{
|
||||
for(dim_t k0 = 0; k0 < k; k0++)
|
||||
{
|
||||
reorder_buf_addr[k0] = input_buf_addr[k0*rs_b];
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
_Pragma( "omp parallel num_threads(n_threads)" )
|
||||
{
|
||||
// Initialise a local thrinfo obj for work split across threads.
|
||||
thrinfo_t thread_jc;
|
||||
bli_thrinfo_set_n_way( n_threads, &thread_jc );
|
||||
bli_thrinfo_set_work_id( omp_get_thread_num(), &thread_jc );
|
||||
#else
|
||||
{
|
||||
// Initialise a local thrinfo obj for work split across threads.
|
||||
thrinfo_t thread_jc;
|
||||
bli_thrinfo_set_n_way( 1, &thread_jc );
|
||||
bli_thrinfo_set_work_id( 0, &thread_jc );
|
||||
#endif
|
||||
// Compute the JC loop thread range for the current thread. Per thread
|
||||
// gets multiple of NR columns.
|
||||
dim_t jc_start, jc_end;
|
||||
bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );
|
||||
for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
|
||||
{
|
||||
dim_t nc0 = bli_min( ( jc_end - jc ), NC );
|
||||
|
||||
dim_t jc_cur_loop = jc;
|
||||
dim_t jc_cur_loop_rem = 0;
|
||||
dim_t n_sub_updated;
|
||||
|
||||
get_B_panel_reordered_start_offset_width
|
||||
(
|
||||
jc, n, NC, NR,
|
||||
&jc_cur_loop, &jc_cur_loop_rem,
|
||||
&nc0, &n_sub_updated
|
||||
);
|
||||
|
||||
for ( dim_t pc = 0; pc < k; pc += KC )
|
||||
{
|
||||
dim_t kc0 = bli_min( ( k - pc ), KC );
|
||||
|
||||
// The offsets are calculated in such a way that it resembles
|
||||
// the reorder buffer traversal in single threaded reordering.
|
||||
// The panel boundaries (KCxNC) remain as it is accessed in
|
||||
// single thread, and as a consequence a thread with jc_start
|
||||
// inside the panel cannot consider NC range for reorder. It
|
||||
// has to work with NC' < NC, and the offset is calulated using
|
||||
// prev NC panels spanning k dim + cur NC panel spaning pc loop
|
||||
// cur iteration + (NC - NC') spanning current kc0 (<= KC).
|
||||
//
|
||||
//Eg: Consider the following reordered buffer diagram:
|
||||
// t1 t2
|
||||
// | |
|
||||
// | |..NC..|
|
||||
// | | |
|
||||
// |.NC. |.NC. |NC'|NC"
|
||||
// pc=0-+-----+-----+---+--+
|
||||
// KC| | | | |
|
||||
// | 1 | 3 | 5 |
|
||||
// pc=KC-+-----+-----+---st-+
|
||||
// KC| | | | |
|
||||
// | 2 | 4 | 6 | 7|
|
||||
// pc=k=2KC-+-----+-----+---+--+
|
||||
// |jc=0 |jc=NC|jc=2NC|
|
||||
//
|
||||
// The numbers 1,2..6,7 denotes the order in which reordered
|
||||
// KCxNC blocks are stored in memory, ie: block 1 followed by 2
|
||||
// followed by 3, etc. Given two threads t1 and t2, and t2 needs
|
||||
// to acces point st in the reorder buffer to write the data:
|
||||
// The offset calulation logic will be:
|
||||
// jc_cur_loop = 2NC, jc_cur_loop_rem = NC', pc = KC,
|
||||
// n_sub_updated = NC, k = 2KC, kc0_updated = KC
|
||||
//
|
||||
// st = ( jc_cur_loop * k ) <traverse blocks 1,2,3,4>
|
||||
// + ( n_sub_updated * pc ) <traverse block 5>
|
||||
// + ( NC' * kc0_updated) <traverse block 6>
|
||||
packb_f32f32f32of32_reference
|
||||
(
|
||||
reorder_buf_addr + ( jc_cur_loop * k ) +
|
||||
( n_sub_updated * pc ) + ( jc_cur_loop_rem * kc0 ),
|
||||
input_buf_addr + ( rs_b * pc ) + ( cs_b * jc ),
|
||||
rs_b, cs_b, nc0, kc0, NR, &rs_b_reorder, &cs_b_reorder
|
||||
);
|
||||
}
|
||||
|
||||
adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//TODO: Kept it as place holder for now, yet to test this completely!
|
||||
void unreorderb_nr64_f32f32f32of32_reference
|
||||
(
|
||||
lpgemm_obj_t * b,
|
||||
lpgemm_obj_t * b_unreorder,
|
||||
rntm_t* rntm,
|
||||
lpgemm_cntx_t* lcntx
|
||||
)
|
||||
{
|
||||
dim_t NC = lcntx->blksz.NC;
|
||||
dim_t KC = lcntx->blksz.KC;
|
||||
dim_t NR = lcntx->blksz.NR;
|
||||
|
||||
// Extracting the matrix properties from the lpgemm object
|
||||
dim_t rs_b = b->rs;
|
||||
dim_t cs_b = b->cs;
|
||||
dim_t n = b->width;
|
||||
dim_t k = b->length;
|
||||
|
||||
dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
n_threads = ( n_threads > 0 ) ? n_threads : 1;
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
_Pragma( "omp parallel num_threads(n_threads)" )
|
||||
{
|
||||
// Initialise a local thrinfo obj for work split across threads.
|
||||
thrinfo_t thread_jc;
|
||||
bli_thrinfo_set_n_way( n_threads, &thread_jc );
|
||||
bli_thrinfo_set_work_id( omp_get_thread_num(), &thread_jc );
|
||||
#else
|
||||
{
|
||||
// Initialise a local thrinfo obj for work split across threads.
|
||||
thrinfo_t thread_jc;
|
||||
bli_thrinfo_set_n_way( 1, &thread_jc );
|
||||
bli_thrinfo_set_work_id( 0, &thread_jc );
|
||||
#endif
|
||||
|
||||
// Compute the JC loop thread range for the current thread.
|
||||
dim_t jc_start, jc_end;
|
||||
bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );
|
||||
|
||||
for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
|
||||
{
|
||||
dim_t nc0 = bli_min( ( jc_end - jc ), NC );
|
||||
|
||||
dim_t jc_cur_loop = jc;
|
||||
dim_t jc_cur_loop_rem = 0;
|
||||
dim_t n_sub_updated;
|
||||
|
||||
get_B_panel_reordered_start_offset_width
|
||||
(
|
||||
jc, n, NC, NR,
|
||||
&jc_cur_loop, &jc_cur_loop_rem,
|
||||
&nc0, &n_sub_updated
|
||||
);
|
||||
|
||||
for ( dim_t pc = 0; pc < k; pc += KC )
|
||||
{
|
||||
dim_t kc0 = bli_min( ( k - pc ), KC );
|
||||
|
||||
unpackb_f32f32f32of32_reference
|
||||
(
|
||||
( ( float* )b_unreorder->storage.aligned_buffer ) +
|
||||
( jc_cur_loop * k ) + ( n_sub_updated * pc ) +
|
||||
( jc_cur_loop_rem * kc0 ),
|
||||
( ( ( float* )b->storage.aligned_buffer ) +
|
||||
( rs_b * pc ) + (jc * cs_b)),
|
||||
nc0, kc0, NR, rs_b, cs_b
|
||||
);
|
||||
}
|
||||
|
||||
adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//TODO: Kept it as place holder for now, yet to test this completely!
|
||||
AOCL_GEMM_UNREORDER(float, f32f32f32of32_reference)
|
||||
{
|
||||
if ( ( output_buf_addr == NULL ) || ( reorder_buf_addr == NULL ) ||
|
||||
( k <= 0 ) || ( n <= 0 ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
inc_t rs_b, cs_b;
|
||||
|
||||
// Check for the validity of strides.
|
||||
if( ( order == 'r' ) || ( order == 'R' ) )
|
||||
{
|
||||
if( ldb < n ) return; // Error
|
||||
else
|
||||
{
|
||||
rs_b = ldb;
|
||||
cs_b = 1;
|
||||
}
|
||||
}
|
||||
else if( ( order == 'c' ) || ( order == 'C' ) )
|
||||
{
|
||||
if( ldb < k ) return; // Error.
|
||||
else
|
||||
{
|
||||
rs_b = 1;
|
||||
cs_b = ldb;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
/* Initialize BLIS. */
|
||||
bli_init_auto();
|
||||
|
||||
// Set MC, NC, KC, NR, MR.
|
||||
aocl_lpgemm_init_global_cntx();
|
||||
|
||||
AOCL_MATRIX_TYPE input_mat_type;
|
||||
bli_param_map_char_to_lpmat_type( mat_type, &input_mat_type );
|
||||
|
||||
if ( input_mat_type == A_MATRIX )
|
||||
{
|
||||
return; // A reorder not supported.
|
||||
}
|
||||
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
if( n == 1 )
|
||||
{
|
||||
if( rs_b == 1 )
|
||||
{
|
||||
memcpy( output_buf_addr, reorder_buf_addr, ( k * sizeof( float ) ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
for( dim_t k0 = 0; k0 < k; k0++ )
|
||||
{
|
||||
output_buf_addr[k0*rs_b] = reorder_buf_addr[k0];
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global( &rntm_g );
|
||||
bli_pba_rntm_set_pba( &rntm_g );
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( F32F32F32OF32 );
|
||||
|
||||
// create dummy b_reorder obj.
|
||||
lpgemm_obj_t b_reorder;
|
||||
b_reorder.storage.aligned_buffer = ( void* )reorder_buf_addr;
|
||||
|
||||
// create dummy b obj.
|
||||
lpgemm_obj_t b;
|
||||
b.storage.aligned_buffer = ( void* )output_buf_addr;
|
||||
b.rs = rs_b;
|
||||
b.cs = cs_b;
|
||||
b.width = n;
|
||||
b.length = k;
|
||||
|
||||
unreorderb_nr64_f32f32f32of32_reference( &b, &b_reorder, &rntm_g, lcntx_g );
|
||||
}
|
||||
@@ -87,6 +87,7 @@ BLIS_EXPORT_ADDON void aocl_reorder_ ## LP_SFX \
|
||||
) \
|
||||
|
||||
AOCL_GEMM_REORDER(float,f32f32f32of32);
|
||||
AOCL_GEMM_REORDER(float,f32f32f32of32_reference);
|
||||
AOCL_GEMM_REORDER(int8_t,u8s8s32os32);
|
||||
AOCL_GEMM_REORDER(bfloat16,bf16bf16f32of32);
|
||||
AOCL_GEMM_REORDER(bfloat16,bf16bf16f32of32_reference);
|
||||
@@ -139,6 +140,7 @@ BLIS_EXPORT_ADDON void aocl_unreorder_ ## LP_SFX \
|
||||
|
||||
AOCL_GEMM_UNREORDER(bfloat16, bf16bf16f32of32);
|
||||
AOCL_GEMM_UNREORDER(bfloat16, bf16bf16f32of32_reference);
|
||||
AOCL_GEMM_UNREORDER(float, f32f32f32of32_reference);
|
||||
AOCL_GEMM_UNREORDER(int8_t, s8s8s32os32_reference);
|
||||
|
||||
#define AOCL_GEMM_MATMUL(A_type,B_type,C_type,Sum_type,LP_SFX) \
|
||||
|
||||
64
addon/aocl_gemm/frame/f32f32f32/lpgemm_reorder_f32.h
Normal file
64
addon/aocl_gemm/frame/f32f32f32/lpgemm_reorder_f32.h
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef LPGEMM_REORDER_F32_H
|
||||
#define LPGEMM_REORDER_F32_H
|
||||
|
||||
#include "lpgemm_types.h"
|
||||
|
||||
void packb_f32f32f32of32_reference
|
||||
(
|
||||
float* pack_b,
|
||||
const float* b,
|
||||
const dim_t rs_b,
|
||||
const dim_t cs_b,
|
||||
const dim_t NC,
|
||||
const dim_t KC,
|
||||
const dim_t NR,
|
||||
dim_t* rs_p,
|
||||
dim_t* cs_p
|
||||
);
|
||||
|
||||
void unpackb_f32f32f32of32_reference
|
||||
(
|
||||
float* b,
|
||||
float* unpack_b_buffer,
|
||||
const dim_t NC,
|
||||
const dim_t KC,
|
||||
const dim_t NR,
|
||||
dim_t rs_b,
|
||||
dim_t cs_b
|
||||
);
|
||||
|
||||
#endif
|
||||
175
addon/aocl_gemm/frame/f32f32f32/lpgemm_reorder_f32_ref.c
Normal file
175
addon/aocl_gemm/frame/f32f32f32/lpgemm_reorder_f32_ref.c
Normal file
@@ -0,0 +1,175 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <string.h>
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ADDON_LPGEMM
|
||||
|
||||
/*
|
||||
Below are the reference packb functions which are
|
||||
varied based on block size NR (64, 48, 32, 16, lt) and
|
||||
order (row / column (transpose)).
|
||||
*/
|
||||
|
||||
static void packb_f32f32f32of32_row_major_ref
|
||||
(
|
||||
float* pack_b,
|
||||
const float* b,
|
||||
const dim_t ldb,
|
||||
const dim_t NC,
|
||||
const dim_t KC,
|
||||
const dim_t NR,
|
||||
dim_t* rs_b,
|
||||
dim_t* cs_b
|
||||
)
|
||||
{
|
||||
dim_t n_full_pieces = NC / NR;
|
||||
dim_t n_full_pieces_loop_limit = n_full_pieces * NR;
|
||||
dim_t n_partial_pieces = NC % NR;
|
||||
for ( dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR )
|
||||
{
|
||||
for ( dim_t kr = 0; kr < KC; kr ++ )
|
||||
{
|
||||
float* inp0 = ( b + ( ldb * kr) + jc );
|
||||
float* outp0 = ( pack_b + ( jc * KC ) + ( kr * NR ));
|
||||
for(dim_t i = 0; i < NR; i++) *outp0++ = *inp0++;
|
||||
}
|
||||
}
|
||||
|
||||
if(n_partial_pieces > 0)
|
||||
{
|
||||
float* pack_b_rem = ( pack_b + ( n_full_pieces_loop_limit * KC ) );
|
||||
float* b_rem = ( b + n_full_pieces_loop_limit );
|
||||
for ( dim_t kr = 0; kr < KC; kr ++ )
|
||||
{
|
||||
float* inp0 = ( b_rem + ( ldb * kr ) );
|
||||
float* outp0 = ( pack_b_rem + ( kr * NR ) );
|
||||
for(dim_t i = 0; i < n_partial_pieces; i++) *outp0++ = *inp0++;
|
||||
}
|
||||
}
|
||||
|
||||
*rs_b = NR;
|
||||
*cs_b = 1;
|
||||
}
|
||||
|
||||
static void packb_nr_f32f32f32of32_col_major_ref
|
||||
(
|
||||
float* pack_b_buffer,
|
||||
const float* b,
|
||||
const dim_t NR,
|
||||
const dim_t ldb,
|
||||
const dim_t KC,
|
||||
const dim_t n0_partial_rem
|
||||
)
|
||||
{
|
||||
for( dim_t i = 0; i < n0_partial_rem; i++ )
|
||||
{
|
||||
float* inp = (b + ( ldb * i ));
|
||||
float* outp = pack_b_buffer + i;
|
||||
for( dim_t j = 0; j < KC; j++ )
|
||||
{
|
||||
*(outp + ( j * NR)) = *inp++;
|
||||
}
|
||||
}
|
||||
for( dim_t i = n0_partial_rem; i < NR; i++ )
|
||||
{
|
||||
float* outp = pack_b_buffer + i;
|
||||
for( dim_t j = 0; j < KC; j++ )
|
||||
{
|
||||
*(outp + ( j * NR)) = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void packb_f32f32f32of32_col_major_ref
|
||||
(
|
||||
float* pack_b_buffer,
|
||||
const float* b,
|
||||
const dim_t ldb,
|
||||
const dim_t NC,
|
||||
const dim_t KC,
|
||||
const dim_t NR,
|
||||
dim_t* rs_b,
|
||||
dim_t* cs_b
|
||||
)
|
||||
{
|
||||
dim_t n_full_pieces = NC / NR;
|
||||
dim_t n_full_pieces_loop_limit = n_full_pieces * NR;
|
||||
dim_t n_partial_pieces = NC % NR;
|
||||
|
||||
for ( dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR )
|
||||
{
|
||||
packb_nr_f32f32f32of32_col_major_ref
|
||||
(
|
||||
pack_b_buffer + (jc * KC),
|
||||
b + (jc * ldb), NR, ldb, KC, NR
|
||||
);
|
||||
}
|
||||
|
||||
if(n_partial_pieces > 0)
|
||||
{
|
||||
packb_nr_f32f32f32of32_col_major_ref
|
||||
(
|
||||
( pack_b_buffer + ( n_full_pieces_loop_limit * KC ) ),
|
||||
( b + n_full_pieces_loop_limit * ldb ), NR, ldb, KC, n_partial_pieces
|
||||
);
|
||||
}
|
||||
|
||||
*rs_b = NR;
|
||||
*cs_b = 1;
|
||||
}
|
||||
|
||||
void packb_f32f32f32of32_reference
|
||||
(
|
||||
float* pack_b,
|
||||
const float* b,
|
||||
const dim_t rs_b,
|
||||
const dim_t cs_b,
|
||||
const dim_t NC,
|
||||
const dim_t KC,
|
||||
const dim_t NR,
|
||||
dim_t* rs_p,
|
||||
dim_t* cs_p
|
||||
)
|
||||
{
|
||||
if( cs_b == 1 ) {
|
||||
packb_f32f32f32of32_row_major_ref( pack_b, b, rs_b, NC, KC, NR, rs_p, cs_p );
|
||||
}else{
|
||||
packb_f32f32f32of32_col_major_ref( pack_b, b, cs_b, NC, KC, NR, rs_p, cs_p );
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
86
addon/aocl_gemm/frame/f32f32f32/lpgemm_unreorder_f32_ref.c
Normal file
86
addon/aocl_gemm/frame/f32f32f32/lpgemm_unreorder_f32_ref.c
Normal file
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <string.h>
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ADDON_LPGEMM
|
||||
|
||||
|
||||
//TODO: Kept it as place holder for now, yet to test this completely!
|
||||
void unpackb_f32f32f32of32_reference
|
||||
(
|
||||
float* b,
|
||||
float* unpack_b,
|
||||
const dim_t NC,
|
||||
const dim_t KC,
|
||||
const dim_t NR,
|
||||
dim_t rs_b,
|
||||
dim_t cs_b
|
||||
)
|
||||
{
|
||||
if( cs_b == 1 )
|
||||
{
|
||||
for ( dim_t jc = 0; jc < NC; jc += NR )
|
||||
{
|
||||
dim_t nr0 = ((NC - jc) > NR ? NR : (NC - jc));
|
||||
float* outp = ( unpack_b + jc );
|
||||
float* inp = (b + jc * NR );
|
||||
for ( dim_t kr = 0; kr < KC; kr++ )
|
||||
{
|
||||
outp += nr0; inp += NR ;
|
||||
|
||||
for(dim_t i = 0; i < nr0; i++) *outp++ = *inp++;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for ( dim_t jc = 0; jc < NC; jc += NR )
|
||||
{
|
||||
dim_t nr0 = ((NC - jc) > NR ? NR : (NC - jc));
|
||||
for ( dim_t kr = 0; kr < KC; kr++ )
|
||||
{
|
||||
float* outp0 = ( unpack_b + ( cs_b * kr) + jc );
|
||||
float* inp0 = ( b + ( jc * KC ) + ( ( kr + NR )));
|
||||
|
||||
for(dim_t i = 0; i < nr0; i++) *outp0++ = *inp0++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user