/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"
#include "lpgemm_5loop_interface_apis.h"
#include "lpgemm_packb_s16.h"
#include "lpgemm_packa_s16.h"
#include "lpgemm_kernels.h"
#include "lpgemm_utils.h"
#include "lpgemm_config.h"
#include "lpgemm_thrinfo_utils.h"

// Kernel function prototypes.
typedef void (*lpgemm_rowvar_s16)
     (
       const dim_t,
       const dim_t,
       const dim_t,
       const uint8_t*,
       const dim_t,
       const dim_t,
       const dim_t,
       const int8_t*,
       const dim_t,
       const dim_t,
       int16_t*,
       const dim_t,
       const dim_t,
       const int16_t,
       const int16_t,
       lpgemm_post_op*,
       lpgemm_post_op_attr
     );
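
// The parameter order above mirrors the kernel invocation in the 5-loop
// body below:
// ( m, n, k,
//   A, rs_a, cs_a, a_block_stride,
//   B, rs_b, cs_b,
//   C, rs_c, cs_c,
//   alpha, beta,
//   post-op list, post-op attributes ).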

LPGEMV(uint8_t,int8_t,int16_t,u8s8s16os16)
{
    dim_t KC = lcntx->blksz.KC;
    dim_t MC = lcntx->blksz.MC;

    // Strides are updated based on matrix packing/reordering.
    uint8_t* a_use = ( uint8_t* )a;
    inc_t rs_a_use = rs_a;
    inc_t cs_a_use = cs_a;

    int8_t* b_use = ( int8_t* )b;
    inc_t rs_b_use = rs_b;
    inc_t cs_b_use = cs_b;

    int16_t* c_use = NULL;

    lpgemm_post_op_attr post_ops_attr;
    post_ops_attr.c_stor_type = c_downscale;
    if ( c_downscale < S16 ) post_ops_attr.buf_downscale = c;
    else post_ops_attr.buf_downscale = NULL;
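    // c_downscale < S16 means the final output type is narrower than
    // int16_t; in that case the caller's C buffer is stashed in
    // buf_downscale so the post-op path can write the downscaled values
    // into it.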

    siz_t mem_a_size_req = 0;
    siz_t mem_b_size_req = 0;

    mem_t mem_a = BLIS_MEM_INITIALIZER;
    mem_t mem_b = BLIS_MEM_INITIALIZER;

    uint8_t* pack_a_buffer;
    int8_t* pack_b_buffer;

    // Generate thrinfo objects for jc and ic loops from lpgemm_thrinfo_t.
    thrinfo_t thread_jc;
    thrinfo_t thread_ic;

    lpgemm_gen_thrinfo( thread, &thread_jc, &thread_ic );

    // MR is increased from 6 to 8 to make use of all 16 ymm registers.
    dim_t MR = 8;

    // Pack B matrix if rs_b > 1.
    if ( ( mtag_b == PACK ) && ( rs_b != 1 ) )
    {
        mem_b_size_req = sizeof( int8_t ) * k;

        lpgemm_alloc_mem_panel
        (
          mem_b_size_req, BLIS_BUFFER_FOR_GEN_USE,
          &mem_b, rntm
        );

        pack_b_buffer = ( int8_t* )bli_mem_buffer( &mem_b );

        for ( dim_t k0 = 0; k0 < k; k0++ )
        {
            pack_b_buffer[k0] = b[ k0 * rs_b ];
        }
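
        // Illustrative example: with rs_b = 4 the loop above gathers
        // b[0], b[4], b[8], ... into pack_b_buffer[0..k-1], giving the
        // kernel a unit-stride view of the length-k vector B.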

        b_use = pack_b_buffer;
        rs_b_use = 1;
        cs_b_use = 1;
    }

    // Compute the IC loop thread range for the current thread.
    dim_t ic_start, ic_end;
    bli_thread_range_sub( &thread_ic, m, MR, FALSE, &ic_start, &ic_end );
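    // bli_thread_range_sub splits [0, m) among the ic-loop threads into
    // contiguous sub-ranges aligned to MR, so each thread owns the C rows
    // in its own [ic_start, ic_end).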

    for ( dim_t ic = ic_start; ic < ic_end; ic += MC )
    {
        dim_t mc0 = bli_min( ( ic_end - ic ), MC );

        a_use = ( uint8_t* )a + ic * rs_a;

        c_use = c + ic * rs_c;

        post_ops_attr.post_op_c_i = ic;
        post_ops_attr.post_op_c_j = 0;
        post_ops_attr.rs_c_downscale = rs_c;

        if ( mtag_a == PACK )
        {
            mem_a_size_req = sizeof( uint8_t ) * mc0 * k;

            lpgemm_alloc_mem_panel
            (
              mem_a_size_req, BLIS_BUFFER_FOR_GEN_USE,
              &mem_a, rntm
            );

            pack_a_buffer = ( uint8_t* )bli_mem_buffer( &mem_a );

            ( ( packa_s16 )lcntx->packa_fun_ptr )
            (
              pack_a_buffer,
              ( a + ( rs_a * ic ) ), rs_a, cs_a,
              mc0, k,
              &rs_a_use, &cs_a_use
            );
            a_use = pack_a_buffer;
        }

        // Call the lpgemv_n_one kernel.
        lpgemv_n_one_u8s8s16os16
        (
          mc0, k,
          a_use, rs_a_use, cs_a_use, mtag_a,
          b_use, rs_b_use, cs_b_use, mtag_b,
          c_use, rs_c, cs_c,
          alpha, beta,
          MR, KC,
          post_op_list,
          &post_ops_attr
        );
    }

    // Release pack buffers.
    if ( ( mtag_a == PACK ) && bli_mem_is_alloc( &mem_a ) )
    {
        bli_pba_release( rntm, &mem_a );
    }
    if ( ( mtag_b == PACK ) && bli_mem_is_alloc( &mem_b ) )
    {
        bli_pba_release( rntm, &mem_b );
    }
}

// B should always be packed.
LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
{
    dim_t NC = lcntx->blksz.NC;
    dim_t KC = lcntx->blksz.KC;
    dim_t MC = lcntx->blksz.MC;
    const dim_t NR = lcntx->blksz.NR;
    const dim_t MR = lcntx->blksz.MR;

    lpgemm_mod_block_size_s16( m, n, k, &MC, &NC, &KC );
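    // Adjusts the MC/NC/KC blocking read from the context to the problem
    // dimensions m, n, and k (presumably so that small problems are not
    // over-blocked).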

    if ( mtag_b == UNPACKED )
    {
        // Error: can only work with packed B now.
        return;
    }

    if ( n == 1 )
    {
        lpgemv_rowvar_u8s8s16os16( m, n, k,
                                   a, rs_a, cs_a, mtag_a,
                                   b, rs_b, cs_b, mtag_b,
                                   c, rs_c, cs_c,
                                   alpha,
                                   beta,
                                   rntm,
                                   thread,
                                   lcntx,
                                   post_op_list,
                                   c_downscale );
        return;
    }

    const int8_t* b_use;
    const uint8_t* a_use;
    dim_t rs_a_use = rs_a;
    dim_t cs_a_use = cs_a;
    dim_t a_block_stride = 0;

    dim_t rs_b_use = rs_b;
    dim_t cs_b_use = cs_b;

    int16_t* c_use_jc = NULL;
    int16_t* c_use_ic = NULL;
    dim_t rs_c_use = rs_c;
    dim_t rs_c_downscale = rs_c;

    // Pack buffer for A.
    uint8_t* pack_a_buffer_u8s8s16o16;
    mem_t mem_a = BLIS_MEM_INITIALIZER;
    siz_t mem_a_size_req = 0;

    // Pack buffer for B.
    int8_t* pack_b_buffer_u8s8s16o16;
    mem_t mem_b = BLIS_MEM_INITIALIZER;
    dim_t packb_min_NR = 16;
    siz_t mem_b_size_req = 0;

    // Temporary buffer for C accumulation when downscaling is required.
    int16_t* temp_scal_c_buffer_u8s8s16o16;
    mem_t mem_scale_c = BLIS_MEM_INITIALIZER;
    siz_t mem_scale_c_size_req = 0;

    // Round k up to a multiple of 2 to suit the vpmaddubsw instruction.
    dim_t k_updated = make_multiple_of_n( k, 2 );
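    // vpmaddubsw multiplies unsigned bytes from A with signed bytes from B
    // and adds adjacent products into 16-bit lanes, consuming k two
    // elements at a time; e.g. k = 7 is padded up to k_updated = 8.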

    // To decide whether to apply post ops or not.
    bool is_last_k = FALSE;

    // To decide whether to use the original s8 C or a temp buffer for beta
    // scaling.
    bool is_first_k = FALSE;

    lpgemm_post_op_attr post_ops_attr;
    post_ops_attr.c_stor_type = c_downscale;
    if ( c_downscale < S16 )
    {
        post_ops_attr.buf_downscale = c;
    }
    else
    {
        post_ops_attr.buf_downscale = NULL;
    }

    // Generate thrinfo objects for jc and ic loops from lpgemm_thrinfo_t.
    thrinfo_t thread_jc;
    thrinfo_t thread_ic;

    lpgemm_gen_thrinfo( thread, &thread_jc, &thread_ic );

    // Compute the JC and IC loop thread ranges for the current thread.
    dim_t jc_start, jc_end;
    bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );

    dim_t ic_start, ic_end;
    bli_thread_range_sub( &thread_ic, m, MR, FALSE, &ic_start, &ic_end );

    for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
    {
        dim_t nc0 = bli_min( ( jc_end - jc ), NC );

        dim_t jc_cur_loop = jc;
        dim_t jc_cur_loop_rem = 0;
        dim_t n_sub_updated = 0;

        if ( mtag_b == REORDERED )
        {
            get_B_panel_reordered_start_offset_width
            (
              jc, n, NC, packb_min_NR,
              &jc_cur_loop, &jc_cur_loop_rem,
              &nc0, &n_sub_updated
            );
        }
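
        // For reordered B, the call above rewinds jc_cur_loop to the
        // NC-aligned panel boundary containing jc, reports the offset of jc
        // within that panel in jc_cur_loop_rem, and returns the panel width
        // padded to a multiple of packb_min_NR in n_sub_updated; these feed
        // the b_use offset computed in the pc loop below.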

        if ( c_downscale == S16 )
        {
            c_use_jc = c + jc;
        }
        // Temp accumulation buffer for C allocation.
        else if ( c_downscale < S16 )
        {
            // Buffer memory is only required if the output needs to be
            // persisted across iterations of the pc/KC loop. It was
            // observed that the locks taken while checking out a buffer
            // from the memory pool hurt performance, so the checkout is
            // skipped when k <= KC.
            if ( k > KC )
            {
                mem_scale_c_size_req = sizeof( int16_t ) * nc0 * ( ic_end - ic_start );

                lpgemm_alloc_mem_panel
                (
                  mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE,
                  &mem_scale_c, rntm
                );

                temp_scal_c_buffer_u8s8s16o16 = bli_mem_buffer( &mem_scale_c );

                c_use_jc = ( int16_t* )temp_scal_c_buffer_u8s8s16o16;
            }

            // The temp C buffer stride differs from that of the original C
            // matrix.
            rs_c_use = nc0;
        }
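
        // Note: when k <= KC the pc loop below runs exactly once, so the
        // int16_t accumulators never need to survive across iterations and
        // the temp buffer checkout above can be skipped safely.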

        for ( dim_t pc = 0; pc < k; pc += KC )
        {
            int16_t beta0 = ( pc == 0 ) ? beta : 1;
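            // beta is applied to C only on the first KC iteration; later
            // iterations accumulate on top of partial results with
            // beta0 = 1.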
            dim_t kc0 = bli_min( ( k - pc ), KC );

            // No parallelization in the k dim; k always starts at 0.
            is_first_k = ( pc == 0 ) ? TRUE : FALSE;
            post_ops_attr.is_first_k = is_first_k;

            is_last_k = ( ( pc + KC ) >= k ) ? TRUE : FALSE;
            post_ops_attr.is_last_k = is_last_k;

            // kc0 needs to be a multiple of 2 so that it can be used with
            // the vpmaddubsw instruction. Padding is added in cases where
            // this condition is not satisfied, and therefore the kc0
            // offsets used for packed/reordered buffers need to be updated.
            dim_t kc0_updated = make_multiple_of_n( kc0, 2 );

            if ( mtag_b == PACK )
            {
                // Pack B chunks are based on jc work id.
                dim_t jc_work_id = bli_thread_work_id( &thread_jc );

                // Using the child thrinfo (thread_ic) tid to decide the
                // chief thread per B matrix chunk (jc work id group).
                if ( bli_thread_am_ochief( &thread_ic ) )
                {
                    // nc0 needs to be a multiple of 16 since this gives
                    // maximum vectorization. Packing B always results in
                    // buffers with a width that is a multiple of 16.
                    // Subsequently the nc0 offsets used for packed/reordered
                    // buffers need to be updated.
                    dim_t nc0_updated = make_multiple_of_n( nc0, packb_min_NR );
                    mem_b_size_req = sizeof( int8_t ) * nc0_updated * kc0_updated;
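                    // Illustrative sizing (assuming make_multiple_of_n
                    // rounds up): nc0 = 52 and kc0 = 7 give nc0_updated = 64
                    // and kc0_updated = 8, i.e. a 512-byte int8_t panel
                    // including the zero padding.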

                    lpgemm_alloc_mem_panel
                    (
                      mem_b_size_req, BLIS_BUFFER_FOR_B_PANEL,
                      &mem_b, rntm
                    );

                    thread->comm[jc_work_id].sent_object =
                        bli_mem_buffer( &mem_b );
                }

                // All threads in the work group should wait till the chief
                // thread has finished allocating the packing buffers.
                bli_thrcomm_barrier
                (
                  bli_thread_ocomm_id( &thread_ic ),
                  &thread->comm[jc_work_id]
                );

                pack_b_buffer_u8s8s16o16 =
                    ( int8_t* )thread->comm[jc_work_id].sent_object;

                // Compute the B panel per-thread loop range for parallel
                // packing using ic_ways number of threads. Since at most
                // ic_ways threads can be used, the thread_ic attributes are
                // used to split the loop range.
                dim_t jc_packb_start, jc_packb_end;
                bli_thread_range_sub
                (
                  &thread_ic, nc0, NR, FALSE,
                  &jc_packb_start, &jc_packb_end
                );

                // Ensure thread ranges are valid, especially in cases where
                // the number of threads available for parallelization
                // exceeds the number of B panel NR chunks.
                if ( ( jc_packb_end > jc_packb_start ) &&
                     ( jc_packb_start < ( jc + nc0 ) ) )
                {
                    ( ( packb_s16 )lcntx->packb_fun_ptr )
                    (
                      pack_b_buffer_u8s8s16o16 +
                      ( jc_packb_start * kc0_updated ),
                      ( b + ( rs_b * pc ) + ( cs_b * jc ) +
                        ( cs_b * jc_packb_start ) ),
                      rs_b,
                      ( jc_packb_end - jc_packb_start ), kc0,
                      &rs_b_use, &cs_b_use
                    );
                }
                else
                {
                    lpgemm_get_packb_strides( lcntx, &rs_b_use, &cs_b_use );
                }

                // All threads in the work group should wait till B matrix
                // packing is completed by the participating threads.
                bli_thrcomm_barrier
                (
                  bli_thread_ocomm_id( &thread_ic ),
                  &thread->comm[jc_work_id]
                );

                b_use = pack_b_buffer_u8s8s16o16;
            }
            else if ( mtag_b == REORDERED )
            {
                // In multi-threaded scenarios, an extra offset into a given
                // packed B panel is required, since the jc loop split can
                // result in a per-thread start offset inside the panel,
                // instead of at panel boundaries.
                b_use = b + ( jc_cur_loop * k_updated ) +
                        ( n_sub_updated * pc ) +
                        ( jc_cur_loop_rem * kc0_updated );
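
                // Offset breakdown: jc_cur_loop * k_updated skips the
                // preceding full NC-wide panels, n_sub_updated * pc skips
                // the earlier KC slabs inside this panel, and
                // jc_cur_loop_rem * kc0_updated lands on this thread's
                // starting column within the current slab.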

                lpgemm_get_packb_strides( lcntx, &rs_b_use, &cs_b_use );
            }
            else
            {
                // Unpacked B is not supported.
                return;
            }

            for ( dim_t ic = ic_start; ic < ic_end; ic += MC )
            {
                dim_t mc0 = bli_min( ( ic_end - ic ), MC );

                // Only the per-thread C matrix is stored in the temp
                // buffer, so both the per-thread jc and ic starts should be
                // normalized to zero.
                if ( c_downscale < S16 )
                {
                    c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) );
                }
                else
                {
                    c_use_ic = c_use_jc + ( rs_c_use * ic );
                }

                // The matrix A packed and reordered code path is not
                // triggered currently for row-major inputs since we do not
                // support it yet. Packing is enabled for column-major
                // inputs to transform them into row-major form, since the
                // kernel expects row storage.
                if ( mtag_a == PACK )
                {
                    mem_a_size_req = sizeof( uint8_t ) * mc0 * kc0_updated;

                    lpgemm_alloc_mem_panel
                    (
                      mem_a_size_req, BLIS_BUFFER_FOR_A_BLOCK,
                      &mem_a, rntm
                    );
                    pack_a_buffer_u8s8s16o16 = ( uint8_t* )bli_mem_buffer( &mem_a );

                    ( ( packa_s16 )lcntx->packa_fun_ptr )
                    (
                      pack_a_buffer_u8s8s16o16,
                      ( a + ( rs_a * ic ) + ( cs_a * pc ) ), rs_a, cs_a,
                      mc0, kc0,
                      &rs_a_use, &cs_a_use
                    );
                    a_use = pack_a_buffer_u8s8s16o16;

                    if ( cs_a == 1 )
                    {
                        a_block_stride = kc0_updated;
                    }
                    else
                    {
                        a_block_stride = rs_a_use;
                    }
                }
                else if ( mtag_a == REORDERED )
                {
                    lpgemm_get_packa_strides( lcntx, &rs_a_use, &cs_a_use );
                    a_use = a + ( pc * m ) + ( kc0_updated * ic );
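                    // pc * m skips the previous KC slabs of reordered A
                    // (each slab stores all m rows), and kc0_updated * ic
                    // steps to row block ic within the current slab.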
                    a_block_stride = kc0_updated;
                }
                else
                {
                    a_use = a + ( rs_a * ic ) + ( cs_a * pc );
                    cs_a_use = 1;
                    a_block_stride = rs_a;
                }

                for ( dim_t jr = 0; jr < nc0; jr += NR )
                {
                    dim_t nr0 = bli_min( ( nc0 - jr ), NR );

                    // Post-ops meta attributes.
                    post_ops_attr.post_op_c_i = ic;
                    post_ops_attr.post_op_c_j = ( jc + jr );
                    post_ops_attr.rs_c_downscale = rs_c_downscale;
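
                    // In the packed B buffer each column occupies
                    // kc0_updated elements (kc0 padded to even), so the
                    // micro-panel for columns [jr, jr+nr0) starts at
                    // b_use + jr * kc0_updated; the C tile likewise starts
                    // at column jr.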
                    // Kernel call; B is packed or reordered at this point.
                    ( ( lpgemm_rowvar_s16 )lcntx->kern_fun_ptr )
                    (
                      mc0, nr0, kc0,
                      a_use, rs_a_use, cs_a_use, a_block_stride,
                      ( b_use + ( jr * kc0_updated ) ), rs_b_use, cs_b_use,
                      ( c_use_ic + jr ), rs_c_use, 1,
                      alpha, beta0,
                      post_op_list, post_ops_attr
                    );
                }
            }
        }

        if ( mtag_b == REORDERED )
        {
            adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
        }
    }

    // Release pack buffers.
    if ( mtag_b == PACK )
    {
        // All threads in the work group should wait till B matrix usage is
        // completed by the participating threads.
        bli_thrcomm_barrier
        (
          bli_thread_ocomm_id( &thread_jc ),
          &thread->comm[bli_thread_work_id( &thread_jc )]
        );

        if ( bli_thread_am_ochief( &thread_ic ) )
        {
            if ( bli_mem_is_alloc( &mem_b ) )
            {
                bli_pba_release( rntm, &mem_b );
            }
        }
    }
    if ( c_downscale < S16 )
    {
        if ( bli_mem_is_alloc( &mem_scale_c ) )
        {
            bli_pba_release( rntm, &mem_scale_c );
        }
    }
}