mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Added experimental bli_gemm_ker_var5().
Details: - Added support for an experimental gemm macro-kernel that incrementally packs one micro-panel of B at a time. This is useful for certain special cases of gemm where m is small. - Minor changes to default values of clarksville configuration. - Defined BLIS_PACKED_BLOCKS as part of pack_t type, even though we do not yet have any use (or implementation support) for block storage. - Comment update to bli_packm_init.c.
This commit is contained in:
@@ -373,8 +373,12 @@ void bli_packm_init_pack( bool_t densify,
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the pack schema is something else, we assume stride information
|
||||
// of p is set later on, by the implementation.
|
||||
// NOTE: When implementing block storage, we only need to implement
|
||||
// the following two cases:
|
||||
// - row-stored blocks in row-major order
|
||||
// - column-stored blocks in column-major order
|
||||
// The other two combinations coincide with that of packed row-panel
|
||||
// and packed column-panel storage.
|
||||
|
||||
size_p = 0;
|
||||
}
|
||||
|
||||
@@ -34,7 +34,9 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern gemm_t* gemm_cntl;
|
||||
extern gemm_t* gemm_cntl;
|
||||
extern gemm_t* gemm_cntl_packa;
|
||||
extern blksz_t* gemm_mc;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
@@ -110,6 +112,15 @@ void bli_gemm( obj_t* alpha,
|
||||
// Choose the control tree.
|
||||
cntl = gemm_cntl;
|
||||
|
||||
#if 0
|
||||
if ( bli_obj_length_after_trans( c_local ) <=
|
||||
bli_blksz_total_for_obj( &c_local, gemm_mc ) )
|
||||
{
|
||||
cntl = gemm_cntl_packa;
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_gemm_int( &alpha_local,
|
||||
&a_local,
|
||||
|
||||
@@ -43,6 +43,7 @@
|
||||
#include "bli_gemm_blk_var4.h"
|
||||
|
||||
#include "bli_gemm_ker_var2.h"
|
||||
#include "bli_gemm_ker_var5.h"
|
||||
|
||||
#include "bli_gemm_ref_mxn.h"
|
||||
|
||||
|
||||
@@ -37,12 +37,18 @@
|
||||
extern scalm_t* scalm_cntl;
|
||||
|
||||
gemm_t* gemm_cntl;
|
||||
gemm_t* gemm_cntl_packa;
|
||||
|
||||
gemm_t* gemm_cntl_bp_ke;
|
||||
gemm_t* gemm_cntl_op_bp;
|
||||
gemm_t* gemm_cntl_mm_op;
|
||||
gemm_t* gemm_cntl_vl_mm;
|
||||
|
||||
gemm_t* gemm_cntl_bp_ke5;
|
||||
gemm_t* gemm_cntl_pm_bp;
|
||||
gemm_t* gemm_cntl_mm_pm;
|
||||
gemm_t* gemm_cntl_vl_mm5;
|
||||
|
||||
packm_t* gemm_packa_cntl;
|
||||
packm_t* gemm_packb_cntl;
|
||||
packm_t* gemm_packc_cntl;
|
||||
@@ -138,7 +144,7 @@ void bli_gemm_cntl_init()
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COLUMNS,
|
||||
BLIS_BUFFER_FOR_GEN_USE );
|
||||
BLIS_BUFFER_FOR_C_PANEL );
|
||||
|
||||
gemm_unpackc_cntl
|
||||
=
|
||||
@@ -147,6 +153,10 @@ void bli_gemm_cntl_init()
|
||||
NULL ); // no blocksize needed
|
||||
|
||||
|
||||
//
|
||||
// Create a control tree for packing A and B, and streaming C.
|
||||
//
|
||||
|
||||
// Create control tree object for lowest-level block-panel kernel.
|
||||
gemm_cntl_bp_ke
|
||||
=
|
||||
@@ -160,7 +170,6 @@ void bli_gemm_cntl_init()
|
||||
gemm_cntl_op_bp
|
||||
=
|
||||
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
|
||||
//BLIS_VARIANT4, // var1 with incremental pack in iter 0
|
||||
BLIS_VARIANT1,
|
||||
gemm_mc,
|
||||
gemm_ni,
|
||||
@@ -180,7 +189,7 @@ void bli_gemm_cntl_init()
|
||||
gemm_kc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
gemm_cntl_op_bp,
|
||||
@@ -203,6 +212,60 @@ void bli_gemm_cntl_init()
|
||||
|
||||
// Alias the "master" gemm control tree to a shorter name.
|
||||
gemm_cntl = gemm_cntl_vl_mm;
|
||||
|
||||
|
||||
//
|
||||
// Create a control tree for packing A, and streaming B and C.
|
||||
//
|
||||
|
||||
gemm_cntl_bp_ke5
|
||||
=
|
||||
bli_gemm_cntl_obj_create( BLIS_UNB_OPT,
|
||||
BLIS_VARIANT5,
|
||||
NULL, NULL, NULL, NULL,
|
||||
NULL, NULL, NULL, NULL );
|
||||
gemm_cntl_pm_bp
|
||||
=
|
||||
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm_kc,
|
||||
NULL,
|
||||
NULL,
|
||||
gemm_packa_cntl,
|
||||
NULL,
|
||||
//gemm_packc_cntl,
|
||||
NULL,
|
||||
gemm_cntl_bp_ke5,
|
||||
//gemm_unpackc_cntl );
|
||||
NULL );
|
||||
|
||||
gemm_cntl_mm_pm
|
||||
=
|
||||
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
gemm_mc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
gemm_cntl_pm_bp,
|
||||
NULL );
|
||||
|
||||
gemm_cntl_vl_mm5
|
||||
=
|
||||
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
gemm_nc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
gemm_cntl_mm_pm,
|
||||
NULL );
|
||||
|
||||
gemm_cntl_packa = gemm_cntl_vl_mm5;
|
||||
}
|
||||
|
||||
void bli_gemm_cntl_finalize()
|
||||
@@ -224,6 +287,11 @@ void bli_gemm_cntl_finalize()
|
||||
bli_cntl_obj_free( gemm_cntl_op_bp );
|
||||
bli_cntl_obj_free( gemm_cntl_mm_op );
|
||||
bli_cntl_obj_free( gemm_cntl_vl_mm );
|
||||
|
||||
bli_cntl_obj_free( gemm_cntl_bp_ke5 );
|
||||
bli_cntl_obj_free( gemm_cntl_pm_bp );
|
||||
bli_cntl_obj_free( gemm_cntl_mm_pm );
|
||||
bli_cntl_obj_free( gemm_cntl_vl_mm5 );
|
||||
}
|
||||
|
||||
gemm_t* bli_gemm_cntl_obj_create( impl_t impl_type,
|
||||
|
||||
@@ -50,7 +50,7 @@ static FUNCPTR_T vars[6][3] =
|
||||
{ NULL, bli_gemm_ker_var2, bli_gemm_blk_var2 },
|
||||
{ NULL, NULL, bli_gemm_blk_var3 },
|
||||
{ NULL, NULL, bli_gemm_blk_var4 },
|
||||
{ NULL, NULL, NULL },
|
||||
{ NULL, bli_gemm_ker_var5, NULL },
|
||||
{ NULL, NULL, NULL }
|
||||
};
|
||||
|
||||
|
||||
356
frame/3/gemm/bli_gemm_ker_var5.c
Normal file
356
frame/3/gemm/bli_gemm_ker_var5.c
Normal file
@@ -0,0 +1,356 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var5);
|
||||
|
||||
|
||||
void bli_gemm_ker_var5( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
gemm_t* cntl )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
dim_t m = bli_obj_length( *c );
|
||||
dim_t n = bli_obj_width( *c );
|
||||
dim_t k = bli_obj_width( *a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bli_obj_row_stride( *a );
|
||||
inc_t cs_a = bli_obj_col_stride( *a );
|
||||
inc_t ps_a = bli_obj_panel_stride( *a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bli_obj_row_stride( *b );
|
||||
inc_t cs_b = bli_obj_col_stride( *b );
|
||||
inc_t ps_b = bli_obj_panel_stride( *b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
inc_t cs_c = bli_obj_col_stride( *c );
|
||||
|
||||
num_t dt_alpha;
|
||||
void* buf_alpha;
|
||||
|
||||
num_t dt_beta;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
/*
|
||||
// Handle the special case where c and a are complex and b is real.
|
||||
// Note that this is the ONLY case allowed by the inner kernel whereby
|
||||
// the datatypes of a and b differ. In this situation, the execution
|
||||
// datatype is real, so we need to inflate (by a factor of two):
|
||||
// - the m dimension,
|
||||
// - the column stride of c,
|
||||
// - the column stride (ie: the panel length) of a, and
|
||||
// - the panel stride of a.
|
||||
if ( bli_obj_is_complex( *a ) && bli_obj_is_real( *b ) )
|
||||
{
|
||||
m *= 2;
|
||||
cs_c *= 2;
|
||||
cs_a *= 2;
|
||||
ps_a *= 2;
|
||||
}
|
||||
*/
|
||||
|
||||
// If alpha is a scalar constant, use dt_exec to extract the address of the
|
||||
// corresponding constant value; otherwise, use the datatype encoded
|
||||
// within the alpha object and extract the buffer at the alpha offset.
|
||||
bli_set_scalar_dt_buffer( alpha, dt_exec, dt_alpha, buf_alpha );
|
||||
|
||||
// If beta is a scalar constant, use dt_exec to extract the address of the
|
||||
// corresponding constant value; otherwise, use the datatype encoded
|
||||
// within the beta object and extract the buffer at the beta offset.
|
||||
bli_set_scalar_dt_buffer( beta, dt_exec, dt_beta, buf_beta );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a, ps_a,
|
||||
buf_b, rs_b, cs_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
//
// Typed implementation of the experimental gemm macro-kernel (variant 5).
//
// Each instantiation defines PASTEMAC(ch,varname), which updates the m x n
// matrix C one MR x NR micro-tile at a time by calling the micro-kernel
// PASTEMAC(ch,ukrname). Unlike a kernel that receives B already packed,
// this variant packs one micro-panel of B (NR columns by k rows) into a
// stack buffer immediately before sweeping over the row panels of A.
//
// Parameters of the generated function:
//   m, n, k     - dimensions of the update (C is m x n; k is the inner
//                 dimension handed to the pack routine and micro-kernel).
//   alpha, beta - pointers to the scalars, of type ctype.
//   a, ps_a     - A, advanced by ps_a elements per MR-row panel
//                 (rs_a and cs_a are accepted but not read here).
//   b, rs_b,
//   cs_b        - B, read through its strides by the pack routine
//                 (ps_b is accepted but not read here).
//   c, rs_c,
//   cs_c        - C, with arbitrary row and column strides.
//
// Edge tiles (m % MR or n % NR nonzero) are computed into a temporary
// MR x NR tile and then accumulated into C with xpbys_mxn.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrname ) \
\
void PASTEMAC(ch,varname)( \
                           dim_t   m, \
                           dim_t   n, \
                           dim_t   k, \
                           void*   alpha, \
                           void*   a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
                           void*   b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
                           void*   beta, \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         ) \
{ \
	/* Stack buffer that receives the packed micro-panel of B. */ \
	ctype           bd[ PASTEMAC(ch,maxkc) * \
	                    PASTEMAC(ch,packnr) * \
	                    PASTEMAC(ch,nifac) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	ctype* restrict bp; \
\
	/* Temporary C buffer for edge cases. */ \
	ctype           ct[ PASTEMAC(ch,mr) * \
	                    PASTEMAC(ch,nr) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const inc_t     rs_ct      = 1; \
	const inc_t     cs_ct      = PASTEMAC(ch,mr); \
\
	/* Alias some constants to shorter names. */ \
	const dim_t     MR         = PASTEMAC(ch,mr); \
	const dim_t     NR         = PASTEMAC(ch,nr); \
	const dim_t     PACKNR     = PASTEMAC(ch,packnr); \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict a1; \
	ctype* restrict b1; \
	ctype* restrict c1; \
	ctype* restrict c11; \
	ctype* restrict a2; \
	ctype* restrict b2; \
\
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           i, j; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
\
	/* \
	   Assumptions/assertions: \
	     rs_a == 1 \
	     cs_a == GEMM_MR \
	     ps_a == stride to next row panel of A \
	     rs_b == (no assumptions; B is packed here via packm_cxk) \
	     cs_b == (no assumptions; B is packed here via packm_cxk) \
	     ps_b == (unused) \
	     rs_c == (no assumptions) \
	     cs_c == (no assumptions) \
	*/ \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n \
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	/* B is addressed through its own strides, so advancing by one \
	   micro-panel means advancing NR columns. */ \
	cstep_b = cs_b * NR; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	b1 = b_cast; \
	c1 = c_cast; \
\
	/* The current packed micro-panel of B will always be stored in bd. */ \
	bp = bd; \
\
	/* Since we pack micro-panels of B incrementally, one at a time, the \
	   address of the next micro-panel of B remains constant. */ \
	b2 = bd; \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = 0; j < n_iter; ++j ) \
	{ \
		a1  = a_cast; \
		c11 = c1; \
\
		/* Incrementally pack a single micro-panel of B. */ \
		PASTEMAC(ch,packm_cxk)( BLIS_NO_CONJUGATE, \
		                        NR, \
		                        k, \
		                        one, \
		                        b1, cs_b, rs_b, \
		                        bp, PACKNR ); \
\
		/* Interior loop over the m dimension (MR rows at a time). */ \
		for ( i = 0; i < m_iter; ++i ) \
		{ \
			/* Compute the addresses of the next panels of A and B. The \
			   last panel of A wraps around to the first one. */ \
			a2 = a1 + rstep_a; \
			if ( i == m_iter - 1 && m_left == 0 ) \
			{ \
				a2 = a_cast; \
			} \
\
			/* Invoke the gemm micro-kernel. */ \
			PASTEMAC(ch,ukrname)( k, \
			                      alpha_cast, \
			                      a1, \
			                      bp, \
			                      beta_cast, \
			                      c11, rs_c, cs_c, \
			                      a2, b2 ); \
\
			a1  += rstep_a; \
			c11 += rstep_c; \
		} \
\
		/* Bottom edge handling. */ \
		if ( m_left ) \
		{ \
			/* Compute the addresses of the next panels of A and B. */ \
			a2 = a_cast; \
\
			/* Invoke the gemm micro-kernel, writing a full MR x NR \
			   tile into the temporary buffer. */ \
			PASTEMAC(ch,ukrname)( k, \
			                      alpha_cast, \
			                      a1, \
			                      bp, \
			                      zero, \
			                      ct, rs_ct, cs_ct, \
			                      a2, b2 ); \
\
			/* Scale the bottom edge of C and add the result from above. */ \
			PASTEMAC(ch,xpbys_mxn)( m_left, NR, \
			                        ct,  rs_ct, cs_ct, \
			                        beta_cast, \
			                        c11, rs_c,  cs_c ); \
		} \
\
		b1 += cstep_b; \
		c1 += cstep_c; \
	} \
\
	/* Right edge handling (fewer than NR columns of B remain). */ \
	if ( n_left ) \
	{ \
		/* Location and width of the unused columns of the packed \
		   micro-panel. */ \
		ctype* bp_pad = bp + n_left; \
		dim_t  n_pad  = NR - n_left; \
\
		a1  = a_cast; \
		c11 = c1; \
\
		/* Pack the final, partial micro-panel of B. Without this step \
		   the micro-kernel would read the previously packed panel (or \
		   an uninitialized buffer when n < NR) instead of the last \
		   n_left columns of B. */ \
		PASTEMAC(ch,packm_cxk)( BLIS_NO_CONJUGATE, \
		                        n_left, \
		                        k, \
		                        one, \
		                        b1, cs_b, rs_b, \
		                        bp, PACKNR ); \
\
		/* Zero the unused columns of the micro-panel so that the \
		   micro-kernel, which always computes a full NR columns, does \
		   not read stale or uninitialized values. */ \
		PASTEMAC(ch,set0s_mxn)( k, n_pad, \
		                        bp_pad, PACKNR, 1 ); \
\
		/* Right edge loop over the m dimension (MR rows at a time). */ \
		for ( i = 0; i < m_iter; ++i ) \
		{ \
			/* Compute the addresses of the next panels of A and B. */ \
			a2 = a1 + rstep_a; \
			if ( i == m_iter - 1 && m_left == 0 ) \
			{ \
				a2 = a_cast; \
			} \
\
			/* Invoke the gemm micro-kernel. */ \
			PASTEMAC(ch,ukrname)( k, \
			                      alpha_cast, \
			                      a1, \
			                      bp, \
			                      zero, \
			                      ct, rs_ct, cs_ct, \
			                      a2, b2 ); \
\
			/* Scale the right edge of C and add the result from above. */ \
			PASTEMAC(ch,xpbys_mxn)( MR, n_left, \
			                        ct,  rs_ct, cs_ct, \
			                        beta_cast, \
			                        c11, rs_c,  cs_c ); \
\
			a1  += rstep_a; \
			c11 += rstep_c; \
		} \
\
		/* Bottom-right corner handling. */ \
		if ( m_left ) \
		{ \
			/* Compute the address of the next panel of A. */ \
			a2 = a_cast; \
\
			/* Invoke the gemm micro-kernel. */ \
			PASTEMAC(ch,ukrname)( k, \
			                      alpha_cast, \
			                      a1, \
			                      bp, \
			                      zero, \
			                      ct, rs_ct, cs_ct, \
			                      a2, b2 ); \
\
			/* Scale the bottom-right corner of C and add the result \
			   from above. */ \
			PASTEMAC(ch,xpbys_mxn)( m_left, n_left, \
			                        ct,  rs_ct, cs_ct, \
			                        beta_cast, \
			                        c11, rs_c,  cs_c ); \
		} \
	} \
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( gemm_ker_var5, GEMM_UKERNEL )
|
||||
|
||||
65
frame/3/gemm/bli_gemm_ker_var5.h
Normal file
65
frame/3/gemm/bli_gemm_ker_var5.h
Normal file
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
void bli_gemm_ker_var5( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
gemm_t* cntl );
|
||||
|
||||
|
||||
//
// Prototype BLAS-like interfaces.
//
// GENTPROT declares the typed function PASTEMAC(ch,varname), which takes
// raw dimensions, scalar pointers, and buffer/stride arguments for A, B,
// and C (A and B additionally carry a panel stride).
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c \
     );
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_ker_var5 )
|
||||
|
||||
@@ -243,6 +243,12 @@
|
||||
#define bli_cndup BLIS_DEFAULT_NUM_DUPL_C
|
||||
#define bli_zndup BLIS_DEFAULT_NUM_DUPL_Z
|
||||
|
||||
// Incremental packing factors
|
||||
|
||||
#define bli_snifac BLIS_DEFAULT_NI_FAC
|
||||
#define bli_dnifac BLIS_DEFAULT_NI_FAC
|
||||
#define bli_cnifac BLIS_DEFAULT_NI_FAC
|
||||
#define bli_znifac BLIS_DEFAULT_NI_FAC
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -89,6 +89,7 @@
|
||||
#define BLIS_BITVAL_PACKED_COLUMNS 0x40000
|
||||
#define BLIS_BITVAL_PACKED_ROW_PANELS 0x50000
|
||||
#define BLIS_BITVAL_PACKED_COL_PANELS 0x60000
|
||||
#define BLIS_BITVAL_PACKED_BLOCKS 0x70000
|
||||
#define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0
|
||||
#define BLIS_BITVAL_PACK_REV_IF_UPPER 0x80000
|
||||
#define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0
|
||||
@@ -195,7 +196,8 @@ typedef enum
|
||||
BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS,
|
||||
BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS,
|
||||
BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS,
|
||||
BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS
|
||||
BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS,
|
||||
BLIS_PACKED_BLOCKS = BLIS_BITVAL_PACKED_BLOCKS
|
||||
} pack_t;
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user