Implemented f32 reference reorder function

Implemented aocl_reorder_f32f32f32of32_reference( ) function and tested.

Implemented framework changes required and place holder for kernels for aocl_unreorder_f32f32f32of32_reference( ) function. It is not tested completely and will be taken care in subsequent commits.

[AMD-Internal: SWLCSG-3618 ]
This commit is contained in:
Bhaskar, Nallani
2025-07-15 12:26:05 +05:30
committed by GitHub
parent 837d3974d4
commit 76c08fe81d
5 changed files with 684 additions and 0 deletions

View File

@@ -36,6 +36,7 @@
#include "aocl_gemm_interface_apis.h"
#include "lpgemm_config.h"
#include "lpgemm_utils.h"
#include "lpgemm_reorder_f32.h"
AOCL_GEMM_GET_REORDER_BUF_SIZE(f32f32f32of32)
{
@@ -273,3 +274,359 @@ AOCL_GEMM_REORDER(float,f32f32f32of32)
}
}
}
// Pack B into row stored column panels.
AOCL_GEMM_REORDER(float,f32f32f32of32_reference)
{
trans_t blis_trans;
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans(trans, &blis_trans);
if ( ( input_buf_addr == NULL ) || ( reorder_buf_addr == NULL ) ||
( k <= 0 ) || ( n <= 0 ) )
{
return; // Error.
}
// Only supports row major packing now.
inc_t rs_b, cs_b;
if( ( order == 'r') || ( order == 'R' ) )
{
if( ( bli_is_notrans( blis_trans ) && ( ldb < n ) ) ||
( bli_is_trans( blis_trans ) && ( ldb < k ) ) )
{
return; // Error.
}
else
{
rs_b = bli_is_notrans( blis_trans ) ? ldb : 1;
cs_b = bli_is_notrans( blis_trans ) ? 1 : ldb;
}
}
else if ( ( order == 'c' ) || ( order == 'C' ) )
{
if( ( bli_is_notrans( blis_trans ) && ( ldb < k ) ) ||
( bli_is_trans( blis_trans ) && ( ldb < n ) ) )
{
return; // Error.
}
else
{
rs_b = bli_is_notrans( blis_trans ) ? 1 : ldb;
cs_b = bli_is_notrans( blis_trans ) ? ldb : 1;
}
}
else
{
return; // Error
}
// Check if AVX2 ISA is supported, lpgemm fp32 matmul only works with it.
if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
{
bli_print_msg(" AVX2 ISA not supported by processor, "
"cannot perform f32f32f32 gemm.", __FILE__, __LINE__ );
return; // Error.
}
/* Initialize BLIS. */
bli_init_auto();
// Initialize lpgemm context.
aocl_lpgemm_init_global_cntx();
AOCL_MATRIX_TYPE input_mat_type;
bli_param_map_char_to_lpmat_type( mat_type, &input_mat_type );
if ( input_mat_type == A_MATRIX )
{
return; // A reorder not supported.
}
// Query the context for various blocksizes.
lpgemm_cntx_t* lcntx = lpgemm_get_global_cntx_obj( F32F32F32OF32 );
dim_t NC = lcntx->blksz.NC;
dim_t KC = lcntx->blksz.KC;
dim_t NR = lcntx->blksz.NR;
dim_t rs_b_reorder = 0;
dim_t cs_b_reorder = 0;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global( &rntm_g );
dim_t n_threads = bli_rntm_num_threads( &rntm_g );
n_threads = ( n_threads > 0 ) ? n_threads : 1;
//When n == 1, B marix becomes a vector.
//Reordering is avoided so that LPGEMV can process it efficiently.
if( ( n == 1 ) ) //&& ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
{
if(rs_b == 1)
{
memcpy(reorder_buf_addr, input_buf_addr, (k * sizeof(float)));
}else
{
for(dim_t k0 = 0; k0 < k; k0++)
{
reorder_buf_addr[k0] = input_buf_addr[k0*rs_b];
}
}
return;
}
#ifdef BLIS_ENABLE_OPENMP
_Pragma( "omp parallel num_threads(n_threads)" )
{
// Initialise a local thrinfo obj for work split across threads.
thrinfo_t thread_jc;
bli_thrinfo_set_n_way( n_threads, &thread_jc );
bli_thrinfo_set_work_id( omp_get_thread_num(), &thread_jc );
#else
{
// Initialise a local thrinfo obj for work split across threads.
thrinfo_t thread_jc;
bli_thrinfo_set_n_way( 1, &thread_jc );
bli_thrinfo_set_work_id( 0, &thread_jc );
#endif
// Compute the JC loop thread range for the current thread. Per thread
// gets multiple of NR columns.
dim_t jc_start, jc_end;
bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );
for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
{
dim_t nc0 = bli_min( ( jc_end - jc ), NC );
dim_t jc_cur_loop = jc;
dim_t jc_cur_loop_rem = 0;
dim_t n_sub_updated;
get_B_panel_reordered_start_offset_width
(
jc, n, NC, NR,
&jc_cur_loop, &jc_cur_loop_rem,
&nc0, &n_sub_updated
);
for ( dim_t pc = 0; pc < k; pc += KC )
{
dim_t kc0 = bli_min( ( k - pc ), KC );
// The offsets are calculated in such a way that it resembles
// the reorder buffer traversal in single threaded reordering.
// The panel boundaries (KCxNC) remain as it is accessed in
// single thread, and as a consequence a thread with jc_start
// inside the panel cannot consider NC range for reorder. It
// has to work with NC' < NC, and the offset is calulated using
// prev NC panels spanning k dim + cur NC panel spaning pc loop
// cur iteration + (NC - NC') spanning current kc0 (<= KC).
//
//Eg: Consider the following reordered buffer diagram:
// t1 t2
// | |
// | |..NC..|
// | | |
// |.NC. |.NC. |NC'|NC"
// pc=0-+-----+-----+---+--+
// KC| | | | |
// | 1 | 3 | 5 |
// pc=KC-+-----+-----+---st-+
// KC| | | | |
// | 2 | 4 | 6 | 7|
// pc=k=2KC-+-----+-----+---+--+
// |jc=0 |jc=NC|jc=2NC|
//
// The numbers 1,2..6,7 denotes the order in which reordered
// KCxNC blocks are stored in memory, ie: block 1 followed by 2
// followed by 3, etc. Given two threads t1 and t2, and t2 needs
// to acces point st in the reorder buffer to write the data:
// The offset calulation logic will be:
// jc_cur_loop = 2NC, jc_cur_loop_rem = NC', pc = KC,
// n_sub_updated = NC, k = 2KC, kc0_updated = KC
//
// st = ( jc_cur_loop * k ) <traverse blocks 1,2,3,4>
// + ( n_sub_updated * pc ) <traverse block 5>
// + ( NC' * kc0_updated) <traverse block 6>
packb_f32f32f32of32_reference
(
reorder_buf_addr + ( jc_cur_loop * k ) +
( n_sub_updated * pc ) + ( jc_cur_loop_rem * kc0 ),
input_buf_addr + ( rs_b * pc ) + ( cs_b * jc ),
rs_b, cs_b, nc0, kc0, NR, &rs_b_reorder, &cs_b_reorder
);
}
adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
}
}
}
//TODO: Kept it as place holder for now, yet to test this completely!
void unreorderb_nr64_f32f32f32of32_reference
(
lpgemm_obj_t * b,
lpgemm_obj_t * b_unreorder,
rntm_t* rntm,
lpgemm_cntx_t* lcntx
)
{
dim_t NC = lcntx->blksz.NC;
dim_t KC = lcntx->blksz.KC;
dim_t NR = lcntx->blksz.NR;
// Extracting the matrix properties from the lpgemm object
dim_t rs_b = b->rs;
dim_t cs_b = b->cs;
dim_t n = b->width;
dim_t k = b->length;
dim_t n_threads = bli_rntm_num_threads( rntm );
n_threads = ( n_threads > 0 ) ? n_threads : 1;
#ifdef BLIS_ENABLE_OPENMP
_Pragma( "omp parallel num_threads(n_threads)" )
{
// Initialise a local thrinfo obj for work split across threads.
thrinfo_t thread_jc;
bli_thrinfo_set_n_way( n_threads, &thread_jc );
bli_thrinfo_set_work_id( omp_get_thread_num(), &thread_jc );
#else
{
// Initialise a local thrinfo obj for work split across threads.
thrinfo_t thread_jc;
bli_thrinfo_set_n_way( 1, &thread_jc );
bli_thrinfo_set_work_id( 0, &thread_jc );
#endif
// Compute the JC loop thread range for the current thread.
dim_t jc_start, jc_end;
bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );
for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
{
dim_t nc0 = bli_min( ( jc_end - jc ), NC );
dim_t jc_cur_loop = jc;
dim_t jc_cur_loop_rem = 0;
dim_t n_sub_updated;
get_B_panel_reordered_start_offset_width
(
jc, n, NC, NR,
&jc_cur_loop, &jc_cur_loop_rem,
&nc0, &n_sub_updated
);
for ( dim_t pc = 0; pc < k; pc += KC )
{
dim_t kc0 = bli_min( ( k - pc ), KC );
unpackb_f32f32f32of32_reference
(
( ( float* )b_unreorder->storage.aligned_buffer ) +
( jc_cur_loop * k ) + ( n_sub_updated * pc ) +
( jc_cur_loop_rem * kc0 ),
( ( ( float* )b->storage.aligned_buffer ) +
( rs_b * pc ) + (jc * cs_b)),
nc0, kc0, NR, rs_b, cs_b
);
}
adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
}
}
}
//TODO: Kept it as place holder for now, yet to test this completely!
AOCL_GEMM_UNREORDER(float, f32f32f32of32_reference)
{
if ( ( output_buf_addr == NULL ) || ( reorder_buf_addr == NULL ) ||
( k <= 0 ) || ( n <= 0 ) )
{
return; // Error.
}
inc_t rs_b, cs_b;
// Check for the validity of strides.
if( ( order == 'r' ) || ( order == 'R' ) )
{
if( ldb < n ) return; // Error
else
{
rs_b = ldb;
cs_b = 1;
}
}
else if( ( order == 'c' ) || ( order == 'C' ) )
{
if( ldb < k ) return; // Error.
else
{
rs_b = 1;
cs_b = ldb;
}
}
else
{
return; // Error.
}
/* Initialize BLIS. */
bli_init_auto();
// Set MC, NC, KC, NR, MR.
aocl_lpgemm_init_global_cntx();
AOCL_MATRIX_TYPE input_mat_type;
bli_param_map_char_to_lpmat_type( mat_type, &input_mat_type );
if ( input_mat_type == A_MATRIX )
{
return; // A reorder not supported.
}
#ifdef BLIS_KERNELS_ZEN4
if( n == 1 )
{
if( rs_b == 1 )
{
memcpy( output_buf_addr, reorder_buf_addr, ( k * sizeof( float ) ) );
}
else
{
for( dim_t k0 = 0; k0 < k; k0++ )
{
output_buf_addr[k0*rs_b] = reorder_buf_addr[k0];
}
}
return;
}
#endif
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global( &rntm_g );
bli_pba_rntm_set_pba( &rntm_g );
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( F32F32F32OF32 );
// create dummy b_reorder obj.
lpgemm_obj_t b_reorder;
b_reorder.storage.aligned_buffer = ( void* )reorder_buf_addr;
// create dummy b obj.
lpgemm_obj_t b;
b.storage.aligned_buffer = ( void* )output_buf_addr;
b.rs = rs_b;
b.cs = cs_b;
b.width = n;
b.length = k;
unreorderb_nr64_f32f32f32of32_reference( &b, &b_reorder, &rntm_g, lcntx_g );
}

View File

@@ -87,6 +87,7 @@ BLIS_EXPORT_ADDON void aocl_reorder_ ## LP_SFX \
) \
AOCL_GEMM_REORDER(float,f32f32f32of32);
AOCL_GEMM_REORDER(float,f32f32f32of32_reference);
AOCL_GEMM_REORDER(int8_t,u8s8s32os32);
AOCL_GEMM_REORDER(bfloat16,bf16bf16f32of32);
AOCL_GEMM_REORDER(bfloat16,bf16bf16f32of32_reference);
@@ -139,6 +140,7 @@ BLIS_EXPORT_ADDON void aocl_unreorder_ ## LP_SFX \
AOCL_GEMM_UNREORDER(bfloat16, bf16bf16f32of32);
AOCL_GEMM_UNREORDER(bfloat16, bf16bf16f32of32_reference);
AOCL_GEMM_UNREORDER(float, f32f32f32of32_reference);
AOCL_GEMM_UNREORDER(int8_t, s8s8s32os32_reference);
#define AOCL_GEMM_MATMUL(A_type,B_type,C_type,Sum_type,LP_SFX) \

View File

@@ -0,0 +1,64 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef LPGEMM_REORDER_F32_H
#define LPGEMM_REORDER_F32_H
#include "lpgemm_types.h"
void packb_f32f32f32of32_reference
(
float* pack_b,
const float* b,
const dim_t rs_b,
const dim_t cs_b,
const dim_t NC,
const dim_t KC,
const dim_t NR,
dim_t* rs_p,
dim_t* cs_p
);
void unpackb_f32f32f32of32_reference
(
float* b,
float* unpack_b_buffer,
const dim_t NC,
const dim_t KC,
const dim_t NR,
dim_t rs_b,
dim_t cs_b
);
#endif

View File

@@ -0,0 +1,175 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <immintrin.h>
#include <string.h>
#include "blis.h"
#ifdef BLIS_ADDON_LPGEMM
/*
Below are the reference packb functions which are
varied based on block size NR (64, 48, 32, 16, lt) and
order (row / column (transpose)).
*/
static void packb_f32f32f32of32_row_major_ref
(
float* pack_b,
const float* b,
const dim_t ldb,
const dim_t NC,
const dim_t KC,
const dim_t NR,
dim_t* rs_b,
dim_t* cs_b
)
{
dim_t n_full_pieces = NC / NR;
dim_t n_full_pieces_loop_limit = n_full_pieces * NR;
dim_t n_partial_pieces = NC % NR;
for ( dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR )
{
for ( dim_t kr = 0; kr < KC; kr ++ )
{
float* inp0 = ( b + ( ldb * kr) + jc );
float* outp0 = ( pack_b + ( jc * KC ) + ( kr * NR ));
for(dim_t i = 0; i < NR; i++) *outp0++ = *inp0++;
}
}
if(n_partial_pieces > 0)
{
float* pack_b_rem = ( pack_b + ( n_full_pieces_loop_limit * KC ) );
float* b_rem = ( b + n_full_pieces_loop_limit );
for ( dim_t kr = 0; kr < KC; kr ++ )
{
float* inp0 = ( b_rem + ( ldb * kr ) );
float* outp0 = ( pack_b_rem + ( kr * NR ) );
for(dim_t i = 0; i < n_partial_pieces; i++) *outp0++ = *inp0++;
}
}
*rs_b = NR;
*cs_b = 1;
}
static void packb_nr_f32f32f32of32_col_major_ref
(
float* pack_b_buffer,
const float* b,
const dim_t NR,
const dim_t ldb,
const dim_t KC,
const dim_t n0_partial_rem
)
{
for( dim_t i = 0; i < n0_partial_rem; i++ )
{
float* inp = (b + ( ldb * i ));
float* outp = pack_b_buffer + i;
for( dim_t j = 0; j < KC; j++ )
{
*(outp + ( j * NR)) = *inp++;
}
}
for( dim_t i = n0_partial_rem; i < NR; i++ )
{
float* outp = pack_b_buffer + i;
for( dim_t j = 0; j < KC; j++ )
{
*(outp + ( j * NR)) = 0;
}
}
}
static void packb_f32f32f32of32_col_major_ref
(
float* pack_b_buffer,
const float* b,
const dim_t ldb,
const dim_t NC,
const dim_t KC,
const dim_t NR,
dim_t* rs_b,
dim_t* cs_b
)
{
dim_t n_full_pieces = NC / NR;
dim_t n_full_pieces_loop_limit = n_full_pieces * NR;
dim_t n_partial_pieces = NC % NR;
for ( dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR )
{
packb_nr_f32f32f32of32_col_major_ref
(
pack_b_buffer + (jc * KC),
b + (jc * ldb), NR, ldb, KC, NR
);
}
if(n_partial_pieces > 0)
{
packb_nr_f32f32f32of32_col_major_ref
(
( pack_b_buffer + ( n_full_pieces_loop_limit * KC ) ),
( b + n_full_pieces_loop_limit * ldb ), NR, ldb, KC, n_partial_pieces
);
}
*rs_b = NR;
*cs_b = 1;
}
void packb_f32f32f32of32_reference
(
float* pack_b,
const float* b,
const dim_t rs_b,
const dim_t cs_b,
const dim_t NC,
const dim_t KC,
const dim_t NR,
dim_t* rs_p,
dim_t* cs_p
)
{
if( cs_b == 1 ) {
packb_f32f32f32of32_row_major_ref( pack_b, b, rs_b, NC, KC, NR, rs_p, cs_p );
}else{
packb_f32f32f32of32_col_major_ref( pack_b, b, cs_b, NC, KC, NR, rs_p, cs_p );
}
}
#endif

View File

@@ -0,0 +1,86 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <immintrin.h>
#include <string.h>
#include "blis.h"
#ifdef BLIS_ADDON_LPGEMM
//TODO: Kept it as place holder for now, yet to test this completely!
void unpackb_f32f32f32of32_reference
(
float* b,
float* unpack_b,
const dim_t NC,
const dim_t KC,
const dim_t NR,
dim_t rs_b,
dim_t cs_b
)
{
if( cs_b == 1 )
{
for ( dim_t jc = 0; jc < NC; jc += NR )
{
dim_t nr0 = ((NC - jc) > NR ? NR : (NC - jc));
float* outp = ( unpack_b + jc );
float* inp = (b + jc * NR );
for ( dim_t kr = 0; kr < KC; kr++ )
{
outp += nr0; inp += NR ;
for(dim_t i = 0; i < nr0; i++) *outp++ = *inp++;
}
}
}
else
{
for ( dim_t jc = 0; jc < NC; jc += NR )
{
dim_t nr0 = ((NC - jc) > NR ? NR : (NC - jc));
for ( dim_t kr = 0; kr < KC; kr++ )
{
float* outp0 = ( unpack_b + ( cs_b * kr) + jc );
float* inp0 = ( b + ( jc * KC ) + ( ( kr + NR )));
for(dim_t i = 0; i < nr0; i++) *outp0++ = *inp0++;
}
}
}
}
#endif