Implemented f32 reference reorder function

Implemented aocl_reorder_f32f32f32of32_reference( ) function and tested. Implemented framework changes required and place holder for kernels for aocl_unreorder_f32f32f32of32_reference( ) function. It is not tested completely and will be taken care in subsequent commits. [AMD-Internal: SWLCSG-3618 ]
2026-04-19 23:28:52 +00:00 · 2025-07-15 12:26:05 +05:30
parent 837d3974d4
commit 76c08fe81d
5 changed files with 684 additions and 0 deletions
--- a/addon/aocl_gemm/aocl_gemm_f32f32f32of32_utils.c
+++ b/addon/aocl_gemm/aocl_gemm_f32f32f32of32_utils.c
@@ -36,6 +36,7 @@
 #include "aocl_gemm_interface_apis.h"
 #include "lpgemm_config.h"
 #include "lpgemm_utils.h"
+#include "lpgemm_reorder_f32.h"

 AOCL_GEMM_GET_REORDER_BUF_SIZE(f32f32f32of32)
 {
@@ -273,3 +274,359 @@ AOCL_GEMM_REORDER(float,f32f32f32of32)
 		}
 	}
 }
+
+
+// Pack B into row stored column panels.
+AOCL_GEMM_REORDER(float,f32f32f32of32_reference)
+{
+	trans_t blis_trans;
+	/* Map BLAS chars to their corresponding BLIS enumerated type value. */
+	bli_param_map_netlib_to_blis_trans(trans, &blis_trans);
+	if ( ( input_buf_addr == NULL ) || ( reorder_buf_addr == NULL ) ||
+	     ( k <= 0 ) || ( n <= 0 ) )
+	{
+		return; // Error.
+	}
+
+	// Only supports row major packing now.
+	inc_t rs_b, cs_b;
+	if( ( order == 'r') || ( order == 'R' ) )
+	{
+		if( ( bli_is_notrans( blis_trans ) && ( ldb < n ) ) ||
+		    ( bli_is_trans( blis_trans ) && ( ldb < k ) ) )
+		{
+			return; // Error.
+		}
+		else
+		{
+			rs_b = bli_is_notrans( blis_trans ) ? ldb : 1;
+			cs_b = bli_is_notrans( blis_trans ) ? 1 : ldb;
+		}
+	}
+	else if ( ( order == 'c' ) || ( order == 'C' ) )
+	{
+		if( ( bli_is_notrans( blis_trans ) && ( ldb < k ) ) ||
+		    ( bli_is_trans( blis_trans ) && ( ldb < n ) ) )
+		{
+			return; // Error.
+		}
+		else
+		{
+			rs_b = bli_is_notrans( blis_trans ) ? 1 : ldb;
+			cs_b = bli_is_notrans( blis_trans ) ? ldb : 1;
+		}
+	}
+	else
+	{
+		return; // Error
+	}
+
+	// Check if AVX2 ISA is supported, lpgemm fp32 matmul only works with it.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
+	{
+		bli_print_msg(" AVX2 ISA not supported by processor, "
+				"cannot perform f32f32f32 gemm.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Initialize lpgemm context.
+	aocl_lpgemm_init_global_cntx();
+
+	AOCL_MATRIX_TYPE input_mat_type;
+	bli_param_map_char_to_lpmat_type( mat_type, &input_mat_type );
+
+	if ( input_mat_type == A_MATRIX )
+	{
+		return; // A reorder not supported.
+	}
+
+	// Query the context for various blocksizes.
+	lpgemm_cntx_t* lcntx = lpgemm_get_global_cntx_obj( F32F32F32OF32 );
+	dim_t NC = lcntx->blksz.NC;
+	dim_t KC = lcntx->blksz.KC;
+	dim_t NR = lcntx->blksz.NR;
+
+	dim_t rs_b_reorder = 0;
+	dim_t cs_b_reorder = 0;
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_g;
+	bli_rntm_init_from_global( &rntm_g );
+
+	dim_t n_threads = bli_rntm_num_threads( &rntm_g );
+	n_threads = ( n_threads > 0 ) ? n_threads : 1;
+
+
+	//When n == 1, B marix becomes a vector.
+	//Reordering is avoided so that LPGEMV can process it efficiently.
+	if( ( n == 1 ) ) //&& ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
+	{
+		if(rs_b == 1)
+		{
+			memcpy(reorder_buf_addr, input_buf_addr, (k * sizeof(float)));
+		}else
+		{
+			for(dim_t k0 = 0; k0 < k; k0++)
+			{
+				reorder_buf_addr[k0] = input_buf_addr[k0*rs_b];
+			}
+		}
+		return;
+	}
+
+#ifdef BLIS_ENABLE_OPENMP
+	_Pragma( "omp parallel num_threads(n_threads)" )
+	{
+		// Initialise a local thrinfo obj for work split across threads.
+		thrinfo_t thread_jc;
+		bli_thrinfo_set_n_way( n_threads, &thread_jc );
+		bli_thrinfo_set_work_id( omp_get_thread_num(), &thread_jc );
+#else
+	{
+		// Initialise a local thrinfo obj for work split across threads.
+		thrinfo_t thread_jc;
+		bli_thrinfo_set_n_way( 1, &thread_jc );
+		bli_thrinfo_set_work_id( 0, &thread_jc );
+#endif
+		// Compute the JC loop thread range for the current thread. Per thread
+		// gets multiple of NR columns.
+		dim_t jc_start, jc_end;
+		bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );
+		for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
+		{
+			dim_t nc0 = bli_min( ( jc_end - jc ), NC );
+
+			dim_t jc_cur_loop = jc;
+			dim_t jc_cur_loop_rem = 0;
+			dim_t n_sub_updated;
+
+			get_B_panel_reordered_start_offset_width
+			(
+			  jc, n, NC, NR,
+			  &jc_cur_loop, &jc_cur_loop_rem,
+			  &nc0, &n_sub_updated
+			);
+
+			for ( dim_t pc = 0; pc < k; pc += KC )
+			{
+				dim_t kc0 = bli_min( ( k - pc ), KC );
+
+				// The offsets are calculated in such a way that it resembles
+				// the reorder buffer traversal in single threaded reordering.
+				// The panel boundaries (KCxNC) remain as it is accessed in
+				// single thread, and as a consequence a thread with jc_start
+				// inside the panel cannot consider NC range for reorder. It
+				// has to work with NC' < NC, and the offset is calulated using
+				// prev NC panels spanning k dim + cur NC panel spaning pc loop
+				// cur iteration + (NC - NC') spanning current kc0 (<= KC).
+				//
+				//Eg: Consider the following reordered buffer diagram:
+				//          t1              t2
+				//          |               |
+				//          |           |..NC..|
+				//          |           |      |
+				//          |.NC. |.NC. |NC'|NC"
+				//     pc=0-+-----+-----+---+--+
+				//        KC|     |     |   |  |
+				//          |  1  |  3  |   5  |
+				//    pc=KC-+-----+-----+---st-+
+				//        KC|     |     |   |  |
+				//          |  2  |  4  | 6 | 7|
+				// pc=k=2KC-+-----+-----+---+--+
+				//          |jc=0 |jc=NC|jc=2NC|
+				//
+				// The numbers 1,2..6,7 denotes the order in which reordered
+				// KCxNC blocks are stored in memory, ie: block 1 followed by 2
+				// followed by 3, etc. Given two threads t1 and t2, and t2 needs
+				// to acces point st in the reorder buffer to write the data:
+				// The offset calulation logic will be:
+				// jc_cur_loop = 2NC, jc_cur_loop_rem = NC', pc = KC,
+				// n_sub_updated = NC, k = 2KC, kc0_updated = KC
+				//
+				// st = ( jc_cur_loop * k )    <traverse blocks 1,2,3,4>
+				//    + ( n_sub_updated * pc ) <traverse block 5>
+				//    + ( NC' * kc0_updated)   <traverse block 6>
+				packb_f32f32f32of32_reference
+				(
+				  reorder_buf_addr + ( jc_cur_loop * k ) +
+				  ( n_sub_updated * pc ) + ( jc_cur_loop_rem * kc0 ),
+				  input_buf_addr + ( rs_b * pc ) + ( cs_b * jc ),
+				  rs_b, cs_b, nc0, kc0, NR, &rs_b_reorder, &cs_b_reorder
+				);
+			}
+
+			adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
+		}
+	}
+}
+
+
+//TODO: Kept it as place holder for now, yet to test this completely!
+void unreorderb_nr64_f32f32f32of32_reference
+(
+	lpgemm_obj_t * b,
+	lpgemm_obj_t * b_unreorder,
+	rntm_t*        rntm,
+	lpgemm_cntx_t* lcntx
+)
+{
+	dim_t NC = lcntx->blksz.NC;
+	dim_t KC = lcntx->blksz.KC;
+	dim_t NR = lcntx->blksz.NR;
+
+	// Extracting the matrix properties from the lpgemm object
+	dim_t rs_b = b->rs;
+	dim_t cs_b = b->cs;
+	dim_t n = b->width;
+	dim_t k = b->length;
+
+	dim_t n_threads = bli_rntm_num_threads( rntm );
+	n_threads = ( n_threads > 0 ) ? n_threads : 1;
+
+#ifdef BLIS_ENABLE_OPENMP
+	_Pragma( "omp parallel num_threads(n_threads)" )
+	{
+		// Initialise a local thrinfo obj for work split across threads.
+		thrinfo_t thread_jc;
+		bli_thrinfo_set_n_way( n_threads, &thread_jc );
+		bli_thrinfo_set_work_id( omp_get_thread_num(), &thread_jc );
+#else
+	{
+		// Initialise a local thrinfo obj for work split across threads.
+		thrinfo_t thread_jc;
+		bli_thrinfo_set_n_way( 1, &thread_jc );
+		bli_thrinfo_set_work_id( 0, &thread_jc );
+#endif
+
+		// Compute the JC loop thread range for the current thread.
+		dim_t jc_start, jc_end;
+		bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );
+
+		for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
+		{
+			dim_t nc0 = bli_min( ( jc_end - jc ), NC );
+
+			dim_t jc_cur_loop = jc;
+			dim_t jc_cur_loop_rem = 0;
+			dim_t n_sub_updated;
+
+			get_B_panel_reordered_start_offset_width
+			(
+			  jc, n, NC, NR,
+			  &jc_cur_loop, &jc_cur_loop_rem,
+			  &nc0, &n_sub_updated
+			);
+
+			for ( dim_t pc = 0; pc < k; pc += KC )
+			{
+				dim_t kc0 = bli_min( ( k - pc ), KC );
+
+				unpackb_f32f32f32of32_reference
+				(
+				  ( ( float* )b_unreorder->storage.aligned_buffer ) +
+				  ( jc_cur_loop * k ) + ( n_sub_updated * pc ) +
+				  ( jc_cur_loop_rem * kc0 ),
+				  ( ( ( float* )b->storage.aligned_buffer ) +
+				  ( rs_b * pc ) + (jc * cs_b)),
+				  nc0, kc0, NR, rs_b, cs_b
+				);
+			}
+
+			adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
+		}
+	}
+}
+
+//TODO: Kept it as place holder for now, yet to test this completely!
+AOCL_GEMM_UNREORDER(float, f32f32f32of32_reference)
+{
+	if ( ( output_buf_addr == NULL ) || ( reorder_buf_addr == NULL ) ||
+	     ( k <= 0 ) || ( n <= 0 ) )
+	{
+		return; // Error.
+	}
+
+	inc_t rs_b, cs_b;
+
+	// Check for the validity of strides.
+	if( ( order == 'r' ) || ( order == 'R' ) )
+	{
+		if( ldb < n ) return; // Error
+		else
+		{
+			rs_b = ldb;
+			cs_b = 1;
+		}
+	}
+	else if( ( order == 'c' ) || ( order == 'C' ) )
+	{
+		if( ldb < k ) return; // Error.
+		else
+		{
+			rs_b = 1;
+			cs_b = ldb;
+		}
+	}
+	else
+	{
+		return; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Set MC, NC, KC, NR, MR.
+	aocl_lpgemm_init_global_cntx();
+
+	AOCL_MATRIX_TYPE input_mat_type;
+	bli_param_map_char_to_lpmat_type( mat_type, &input_mat_type );
+
+	if ( input_mat_type == A_MATRIX )
+	{
+		return; // A reorder not supported.
+	}
+
+#ifdef BLIS_KERNELS_ZEN4
+	if( n == 1 )
+	{
+		if( rs_b == 1 )
+		{
+			memcpy( output_buf_addr, reorder_buf_addr, ( k * sizeof( float ) ) );
+		}
+		else
+		{
+			for( dim_t k0 = 0; k0 < k; k0++ )
+			{
+				output_buf_addr[k0*rs_b] = reorder_buf_addr[k0];
+			}
+		}
+		return;
+	}
+#endif
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_g;
+	bli_rntm_init_from_global( &rntm_g );
+	bli_pba_rntm_set_pba( &rntm_g );
+
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( F32F32F32OF32 );
+
+	// create dummy b_reorder obj.
+	lpgemm_obj_t b_reorder;
+	b_reorder.storage.aligned_buffer = ( void* )reorder_buf_addr;
+
+	// create dummy b obj.
+	lpgemm_obj_t b;
+	b.storage.aligned_buffer = ( void* )output_buf_addr;
+	b.rs = rs_b;
+	b.cs = cs_b;
+	b.width = n;
+	b.length = k;
+
+	unreorderb_nr64_f32f32f32of32_reference( &b, &b_reorder, &rntm_g, lcntx_g );
+}
--- a/addon/aocl_gemm/aocl_gemm_interface_apis.h
+++ b/addon/aocl_gemm/aocl_gemm_interface_apis.h
@@ -87,6 +87,7 @@ BLIS_EXPORT_ADDON void aocl_reorder_ ## LP_SFX \
     ) \

 AOCL_GEMM_REORDER(float,f32f32f32of32);
+AOCL_GEMM_REORDER(float,f32f32f32of32_reference);
 AOCL_GEMM_REORDER(int8_t,u8s8s32os32);
 AOCL_GEMM_REORDER(bfloat16,bf16bf16f32of32);
 AOCL_GEMM_REORDER(bfloat16,bf16bf16f32of32_reference);
@@ -139,6 +140,7 @@ BLIS_EXPORT_ADDON void aocl_unreorder_ ## LP_SFX \

 AOCL_GEMM_UNREORDER(bfloat16, bf16bf16f32of32);
 AOCL_GEMM_UNREORDER(bfloat16, bf16bf16f32of32_reference);
+AOCL_GEMM_UNREORDER(float, f32f32f32of32_reference);
 AOCL_GEMM_UNREORDER(int8_t, s8s8s32os32_reference);

 #define AOCL_GEMM_MATMUL(A_type,B_type,C_type,Sum_type,LP_SFX) \
--- a/addon/aocl_gemm/frame/f32f32f32/lpgemm_reorder_f32.h
+++ b/addon/aocl_gemm/frame/f32f32f32/lpgemm_reorder_f32.h
@@ -0,0 +1,64 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef LPGEMM_REORDER_F32_H
+#define LPGEMM_REORDER_F32_H
+
+#include "lpgemm_types.h"
+
+void packb_f32f32f32of32_reference
+(
+  float*        pack_b,
+  const float*  b,
+  const dim_t   rs_b,
+  const dim_t   cs_b,
+  const dim_t   NC,
+  const dim_t   KC,
+  const dim_t   NR,
+  dim_t*        rs_p,
+  dim_t*        cs_p
+);
+
+void unpackb_f32f32f32of32_reference
+(
+  float*       b,
+  float*       unpack_b_buffer,
+  const dim_t  NC,
+  const dim_t  KC,
+  const dim_t  NR,
+  dim_t        rs_b,
+  dim_t        cs_b
+);
+
+#endif
--- a/addon/aocl_gemm/frame/f32f32f32/lpgemm_reorder_f32_ref.c
+++ b/addon/aocl_gemm/frame/f32f32f32/lpgemm_reorder_f32_ref.c
@@ -0,0 +1,175 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include <string.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+/*
+	Below are the reference packb functions which are 
+    varied based on block size NR (64, 48, 32, 16, lt) and 
+    order (row / column (transpose)).
+*/
+
+static void   packb_f32f32f32of32_row_major_ref
+(
+	float*       	pack_b,
+	const float*       	b,
+	const dim_t     ldb,
+	const dim_t     NC,
+	const dim_t     KC,
+	const dim_t     NR,
+	dim_t*          rs_b,
+	dim_t*          cs_b
+)
+{
+	dim_t n_full_pieces = NC / NR;
+	dim_t n_full_pieces_loop_limit = n_full_pieces * NR;
+	dim_t n_partial_pieces = NC % NR;
+	for ( dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR )
+	{
+		for ( dim_t kr = 0; kr < KC; kr ++ )
+		{
+			float* inp0 = ( b + ( ldb * kr) + jc  );
+			float* outp0 = ( pack_b + ( jc * KC ) + ( kr * NR ));
+			for(dim_t i = 0; i < NR; i++) *outp0++ = *inp0++;
+		}
+	}
+
+	if(n_partial_pieces > 0) 
+	{
+		float* pack_b_rem  = ( pack_b + ( n_full_pieces_loop_limit * KC ) );
+		float* b_rem = ( b + n_full_pieces_loop_limit );
+		for ( dim_t kr = 0; kr < KC; kr ++ ) 
+		{
+			float* inp0 = ( b_rem + ( ldb * kr ) );
+			float* outp0 = ( pack_b_rem + ( kr * NR ) );
+			for(dim_t i = 0; i < n_partial_pieces; i++)	*outp0++ = *inp0++;
+		}
+	}
+
+	*rs_b = NR;
+	*cs_b = 1;
+}
+
+static void  packb_nr_f32f32f32of32_col_major_ref
+(
+    float*       	pack_b_buffer,
+    const float*       	b,
+	const dim_t     NR,
+    const dim_t     ldb,
+    const dim_t     KC,
+    const dim_t     n0_partial_rem
+)
+{
+	for( dim_t i = 0; i < n0_partial_rem; i++ )
+	{
+		float* inp  = (b + ( ldb * i ));
+		float* outp = pack_b_buffer + i;
+		for( dim_t j = 0; j < KC; j++ )
+		{
+			*(outp + ( j * NR)) = *inp++;
+		}
+	}
+	for( dim_t i = n0_partial_rem; i < NR; i++ )
+	{
+		float* outp = pack_b_buffer + i;
+		for( dim_t j = 0; j < KC; j++ )
+		{
+			*(outp + ( j * NR)) = 0;
+		}
+	}
+}
+
+static void  packb_f32f32f32of32_col_major_ref
+(
+    float*      	pack_b_buffer,
+    const float*    b,
+    const dim_t     ldb,
+    const dim_t     NC,
+    const dim_t     KC,
+    const dim_t     NR,
+    dim_t*          rs_b,
+    dim_t*          cs_b
+)
+{
+	dim_t n_full_pieces = NC / NR;
+	dim_t n_full_pieces_loop_limit = n_full_pieces * NR;
+	dim_t n_partial_pieces = NC % NR;
+
+	for ( dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR )
+	{
+		packb_nr_f32f32f32of32_col_major_ref
+		( 
+			pack_b_buffer + (jc * KC),
+			b + (jc * ldb), NR, ldb, KC, NR
+		);
+	}
+
+	if(n_partial_pieces > 0)
+	{
+		packb_nr_f32f32f32of32_col_major_ref
+		(
+			( pack_b_buffer + ( n_full_pieces_loop_limit * KC ) ),
+			( b + n_full_pieces_loop_limit * ldb ), NR, ldb, KC, n_partial_pieces
+		);
+	}
+
+	*rs_b = NR;
+	*cs_b = 1;
+}
+
+void  packb_f32f32f32of32_reference
+(
+    float*       	pack_b,
+    const float*       	b,
+    const dim_t  	rs_b,
+    const dim_t  	cs_b,
+    const dim_t  	NC,
+    const dim_t  	KC,
+	const dim_t  	NR,
+    dim_t*       	rs_p,
+    dim_t*       	cs_p
+)
+{
+	if( cs_b == 1 ) {
+		packb_f32f32f32of32_row_major_ref( pack_b, b, rs_b, NC, KC, NR, rs_p, cs_p );
+	}else{
+		packb_f32f32f32of32_col_major_ref( pack_b, b, cs_b, NC, KC, NR, rs_p, cs_p );
+	}
+}
+
+#endif
--- a/addon/aocl_gemm/frame/f32f32f32/lpgemm_unreorder_f32_ref.c
+++ b/addon/aocl_gemm/frame/f32f32f32/lpgemm_unreorder_f32_ref.c
@@ -0,0 +1,86 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include <string.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+
+//TODO: Kept it as place holder for now, yet to test this completely!
+void unpackb_f32f32f32of32_reference
+	(
+	  float*       b,
+	  float*       unpack_b,
+	  const dim_t  NC,
+	  const dim_t  KC,
+	  const dim_t  NR,
+	  dim_t        rs_b,
+	  dim_t        cs_b
+	)
+{
+	if( cs_b == 1 )
+	{
+		for ( dim_t jc = 0; jc < NC; jc += NR )
+		{
+			dim_t nr0 = ((NC - jc) > NR ? NR : (NC - jc));
+			float* outp = ( unpack_b + jc );
+			float* inp = (b + jc * NR );
+			for ( dim_t kr = 0; kr < KC; kr++ )
+			{
+				outp += nr0; inp  += NR ;
+
+				for(dim_t i = 0; i < nr0; i++)	*outp++ = *inp++;
+			}
+		}
+	}
+	else
+	{
+		
+		for ( dim_t jc = 0; jc < NC; jc += NR )
+		{
+			dim_t nr0 = ((NC - jc) >  NR ? NR : (NC - jc));
+			for ( dim_t kr = 0; kr < KC; kr++ )
+			{
+				float* outp0 = ( unpack_b + ( cs_b * kr) + jc );
+				float* inp0 = ( b + ( jc * KC ) + ( ( kr + NR )));
+
+				for(dim_t i = 0; i < nr0; i++)	*outp0++ = *inp0++;
+			}
+		}
+	}
+}
+
+#endif