Added experimental bli_gemm_ker_var5().

Details:
- Added support for an experimental gemm macro-kernel that incrementally
  packs one micro-panel of B at a time. This is useful for certain
  special cases of gemm where m is small.
- Minor changes to default values of clarksville configuration.
- Defined BLIS_PACKED_BLOCKS as part of pack_t type, even though we
  do not yet have any use (or implementation support) for block storage.
- Comment update to bli_packm_init.c.
This commit is contained in:
Field G. Van Zee
2013-07-08 11:24:18 -05:00
parent 9915d667a7
commit 3725013985
12 changed files with 533 additions and 11 deletions

View File

@@ -373,8 +373,12 @@ void bli_packm_init_pack( bool_t densify,
}
else
{
// If the pack schema is something else, we assume stride information
// of p is set later on, by the implementation.
// NOTE: When implementing block storage, we only need to implement
// the following two cases:
// - row-stored blocks in row-major order
// - column-stored blocks in column-major order
// The other two combinations coincide with those of packed row-panel
// and packed column-panel storage.
size_p = 0;
}

View File

@@ -34,7 +34,9 @@
#include "blis.h"
extern gemm_t* gemm_cntl;
extern gemm_t* gemm_cntl;
extern gemm_t* gemm_cntl_packa;
extern blksz_t* gemm_mc;
//
// Define object-based interface.
@@ -110,6 +112,15 @@ void bli_gemm( obj_t* alpha,
// Choose the control tree.
cntl = gemm_cntl;
#if 0
if ( bli_obj_length_after_trans( c_local ) <=
bli_blksz_total_for_obj( &c_local, gemm_mc ) )
{
cntl = gemm_cntl_packa;
}
#endif
// Invoke the internal back-end.
bli_gemm_int( &alpha_local,
&a_local,

View File

@@ -43,6 +43,7 @@
#include "bli_gemm_blk_var4.h"
#include "bli_gemm_ker_var2.h"
#include "bli_gemm_ker_var5.h"
#include "bli_gemm_ref_mxn.h"

View File

@@ -37,12 +37,18 @@
extern scalm_t* scalm_cntl;
gemm_t* gemm_cntl;
gemm_t* gemm_cntl_packa;
gemm_t* gemm_cntl_bp_ke;
gemm_t* gemm_cntl_op_bp;
gemm_t* gemm_cntl_mm_op;
gemm_t* gemm_cntl_vl_mm;
gemm_t* gemm_cntl_bp_ke5;
gemm_t* gemm_cntl_pm_bp;
gemm_t* gemm_cntl_mm_pm;
gemm_t* gemm_cntl_vl_mm5;
packm_t* gemm_packa_cntl;
packm_t* gemm_packb_cntl;
packm_t* gemm_packc_cntl;
@@ -138,7 +144,7 @@ void bli_gemm_cntl_init()
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?
BLIS_PACKED_COLUMNS,
BLIS_BUFFER_FOR_GEN_USE );
BLIS_BUFFER_FOR_C_PANEL );
gemm_unpackc_cntl
=
@@ -147,6 +153,10 @@ void bli_gemm_cntl_init()
NULL ); // no blocksize needed
//
// Create a control tree for packing A and B, and streaming C.
//
// Create control tree object for lowest-level block-panel kernel.
gemm_cntl_bp_ke
=
@@ -160,7 +170,6 @@ void bli_gemm_cntl_init()
gemm_cntl_op_bp
=
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
//BLIS_VARIANT4, // var1 with incremental pack in iter 0
BLIS_VARIANT1,
gemm_mc,
gemm_ni,
@@ -180,7 +189,7 @@ void bli_gemm_cntl_init()
gemm_kc,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
gemm_cntl_op_bp,
@@ -203,6 +212,60 @@ void bli_gemm_cntl_init()
// Alias the "master" gemm control tree to a shorter name.
gemm_cntl = gemm_cntl_vl_mm;
//
// Create a control tree for packing A, and streaming B and C.
//
gemm_cntl_bp_ke5
=
bli_gemm_cntl_obj_create( BLIS_UNB_OPT,
BLIS_VARIANT5,
NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL );
gemm_cntl_pm_bp
=
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
gemm_kc,
NULL,
NULL,
gemm_packa_cntl,
NULL,
//gemm_packc_cntl,
NULL,
gemm_cntl_bp_ke5,
//gemm_unpackc_cntl );
NULL );
gemm_cntl_mm_pm
=
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT1,
gemm_mc,
NULL,
NULL,
NULL,
NULL,
NULL,
gemm_cntl_pm_bp,
NULL );
gemm_cntl_vl_mm5
=
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
gemm_nc,
NULL,
NULL,
NULL,
NULL,
NULL,
gemm_cntl_mm_pm,
NULL );
gemm_cntl_packa = gemm_cntl_vl_mm5;
}
void bli_gemm_cntl_finalize()
@@ -224,6 +287,11 @@ void bli_gemm_cntl_finalize()
bli_cntl_obj_free( gemm_cntl_op_bp );
bli_cntl_obj_free( gemm_cntl_mm_op );
bli_cntl_obj_free( gemm_cntl_vl_mm );
bli_cntl_obj_free( gemm_cntl_bp_ke5 );
bli_cntl_obj_free( gemm_cntl_pm_bp );
bli_cntl_obj_free( gemm_cntl_mm_pm );
bli_cntl_obj_free( gemm_cntl_vl_mm5 );
}
gemm_t* bli_gemm_cntl_obj_create( impl_t impl_type,

View File

@@ -50,7 +50,7 @@ static FUNCPTR_T vars[6][3] =
{ NULL, bli_gemm_ker_var2, bli_gemm_blk_var2 },
{ NULL, NULL, bli_gemm_blk_var3 },
{ NULL, NULL, bli_gemm_blk_var4 },
{ NULL, NULL, NULL },
{ NULL, bli_gemm_ker_var5, NULL },
{ NULL, NULL, NULL }
};

View File

@@ -0,0 +1,356 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type for the datatype-specific (typed) implementations
// of gemm_ker_var5. The parameter list matches the typed functions generated
// by the GENTFUNC template in this file: problem dimensions, the alpha
// scalar, A and B with their row/column/panel strides, the beta scalar, and
// C with its row/column strides.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)(
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c
);
// Table of typed implementations, indexed by execution datatype (see the
// ftypes[dt_exec] lookup in bli_gemm_ker_var5()).
// NOTE(review): GENARRAY presumably expands to the array declarator plus an
// initializer naming the per-datatype gemm_ker_var5 functions -- confirm
// against the macro definition.
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var5);
/*
 * Object-based entry point for gemm macro-kernel variant 5.
 *
 * Extracts dimensions, buffers and strides from the operand objects,
 * resolves the alpha and beta scalars to raw buffers, and forwards
 * everything to the typed implementation selected by the execution
 * datatype of c.
 *
 *   alpha, beta  scalar objects.
 *   a, b, c      matrix operand objects; m and n are read from c, and k
 *                is read from the width of a.
 *   cntl         control tree node; not referenced by this variant.
 */
void bli_gemm_ker_var5( obj_t*  alpha,
                        obj_t*  a,
                        obj_t*  b,
                        obj_t*  beta,
                        obj_t*  c,
                        gemm_t* cntl )
{
	/* The execution datatype of C drives both the scalar lookup and the
	   choice of typed kernel. */
	num_t exec_dt = bli_obj_execution_datatype( *c );

	/* Problem dimensions. */
	dim_t dim_m = bli_obj_length( *c );
	dim_t dim_n = bli_obj_width( *c );
	dim_t dim_k = bli_obj_width( *a );

	/* Operand A: buffer plus row, column and panel strides. */
	void* a_buf = bli_obj_buffer_at_off( *a );
	inc_t a_rs  = bli_obj_row_stride( *a );
	inc_t a_cs  = bli_obj_col_stride( *a );
	inc_t a_ps  = bli_obj_panel_stride( *a );

	/* Operand B: buffer plus row, column and panel strides. */
	void* b_buf = bli_obj_buffer_at_off( *b );
	inc_t b_rs  = bli_obj_row_stride( *b );
	inc_t b_cs  = bli_obj_col_stride( *b );
	inc_t b_ps  = bli_obj_panel_stride( *b );

	/* Operand C: buffer plus row and column strides. */
	void* c_buf = bli_obj_buffer_at_off( *c );
	inc_t c_rs  = bli_obj_row_stride( *c );
	inc_t c_cs  = bli_obj_col_stride( *c );

	/* Datatype/buffer pairs filled in for the two scalars below. */
	num_t alpha_dt;
	void* alpha_buf;
	num_t beta_dt;
	void* beta_buf;

	/* NOTE: Special handling of the mixed-domain case (a complex, b real),
	   which would double m, cs_c, cs_a and ps_a so the kernel can run in
	   the real domain, is intentionally not performed by this variant. */

	/* For a scalar constant, the address of the value matching exec_dt is
	   used; otherwise the scalar object's own datatype and the buffer at
	   its offset are used. */
	bli_set_scalar_dt_buffer( alpha, exec_dt, alpha_dt, alpha_buf );
	bli_set_scalar_dt_buffer( beta, exec_dt, beta_dt, beta_buf );

	/* Dispatch to the typed implementation for the execution datatype. */
	ftypes[exec_dt]( dim_m,
	                 dim_n,
	                 dim_k,
	                 alpha_buf,
	                 a_buf, a_rs, a_cs, a_ps,
	                 b_buf, b_rs, b_cs, b_ps,
	                 beta_buf,
	                 c_buf, c_rs, c_cs );
}
/*
 * Typed implementation template for gemm macro-kernel variant 5.
 *
 * For each NR-wide column micro-panel of B, the panel is first packed into
 * a stack buffer, and the gemm micro-kernel is then invoked once per
 * MR-tall micro-panel of the (already packed) matrix A, updating the
 * corresponding MR x NR block of C.
 *
 *   m, n, k         dimensions of the operation (C is m x n).
 *   alpha, beta     pointers to the scalars.
 *   a, ps_a         packed A and the stride between its row micro-panels
 *                   (rs_a and cs_a are accepted but not used here).
 *   b, rs_b, cs_b   B and its row/column strides (ps_b is accepted but not
 *                   used here).
 *   c, rs_c, cs_c   C and its row/column strides.
 *
 * Edge cases (m % MR != 0 and/or n % NR != 0) are computed into a
 * temporary MR x NR tile and then accumulated into C with beta scaling.
 * For the right edge, the remaining n % NR columns of B are packed into a
 * zero-padded micro-panel before the micro-kernel is invoked.
 */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrname ) \
\
void PASTEMAC(ch,varname)( \
                           dim_t   m, \
                           dim_t   n, \
                           dim_t   k, \
                           void*   alpha, \
                           void*   a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
                           void*   b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
                           void*   beta, \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         ) \
{ \
	/* Stack buffer that receives the packed micro-panel of B. */ \
	/* NOTE(review): presumably k * PACKNR never exceeds this buffer's */ \
	/* size for problems that reach this kernel -- confirm. */ \
	ctype           bd[ PASTEMAC(ch,maxkc) * \
	                    PASTEMAC(ch,packnr) * \
	                    PASTEMAC(ch,nifac) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	ctype* restrict bp; \
\
	/* Temporary C buffer for edge cases. */ \
	ctype           ct[ PASTEMAC(ch,mr) * \
	                    PASTEMAC(ch,nr) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const inc_t     rs_ct = 1; \
	const inc_t     cs_ct = PASTEMAC(ch,mr); \
\
	/* Alias some constants to shorter names. */ \
	const dim_t     MR     = PASTEMAC(ch,mr); \
	const dim_t     NR     = PASTEMAC(ch,nr); \
	const dim_t     PACKNR = PASTEMAC(ch,packnr); \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict a1; \
	ctype* restrict b1; \
	ctype* restrict c1; \
	ctype* restrict c11; \
	ctype* restrict a2; \
	ctype* restrict b2; \
\
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           i, j; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
\
	/* \
	   Assumptions/assertions: \
	     rs_a == 1 \
	     cs_a == GEMM_MR \
	     ps_a == stride to next row panel of A \
	     rs_b == GEMM_NR \
	     cs_b == 1 \
	     ps_b == stride to next column panel of B \
	     rs_c == (no assumptions) \
	     cs_c == (no assumptions) \
	*/ \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n \
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = cs_b * NR; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	b1 = b_cast; \
	c1 = c_cast; \
\
	/* The current packed micro-panel of B will always be stored in bd. */ \
	bp = bd; \
\
	/* Since we pack micro-panels of B incrementally, one at a time, the \
	   address of the next micro-panel of B remains constant. */ \
	b2 = bd; \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = 0; j < n_iter; ++j ) \
	{ \
		a1  = a_cast; \
		c11 = c1; \
\
		/* Incrementally pack a single micro-panel of B. */ \
		PASTEMAC(ch,packm_cxk)( BLIS_NO_CONJUGATE, \
		                        NR, \
		                        k, \
		                        one, \
		                        b1, cs_b, rs_b, \
		                        bp, PACKNR ); \
\
		/* Interior loop over the m dimension (MR rows at a time). */ \
		for ( i = 0; i < m_iter; ++i ) \
		{ \
			/* Compute the address of the next panel of A, wrapping back \
			   to the first panel after the last one. */ \
			a2 = a1 + rstep_a; \
			if ( i == m_iter - 1 && m_left == 0 ) \
			{ \
				a2 = a_cast; \
			} \
\
			/* Invoke the gemm micro-kernel. */ \
			PASTEMAC(ch,ukrname)( k, \
			                      alpha_cast, \
			                      a1, \
			                      bp, \
			                      beta_cast, \
			                      c11, rs_c, cs_c, \
			                      a2, b2 ); \
\
			a1  += rstep_a; \
			c11 += rstep_c; \
		} \
\
		/* Bottom edge handling. */ \
		if ( m_left ) \
		{ \
			/* The next panel of A is the first one again. */ \
			a2 = a_cast; \
\
			/* Invoke the gemm micro-kernel, writing to the temporary \
			   tile (beta == 0) instead of C. */ \
			PASTEMAC(ch,ukrname)( k, \
			                      alpha_cast, \
			                      a1, \
			                      bp, \
			                      zero, \
			                      ct, rs_ct, cs_ct, \
			                      a2, b2 ); \
\
			/* Scale the bottom edge of C and add the result from above. */ \
			PASTEMAC(ch,xpbys_mxn)( m_left, NR, \
			                        ct,  rs_ct, cs_ct, \
			                        beta_cast, \
			                        c11, rs_c,  cs_c ); \
		} \
\
		b1 += cstep_b; \
		c1 += cstep_c; \
	} \
\
	if ( n_left ) \
	{ \
		a1  = a_cast; \
		c11 = c1; \
\
		/* Clear the packing buffer (k rows with leading dimension PACKNR) \
		   so that columns n_left..NR-1 of the partial micro-panel are \
		   zero instead of stale or uninitialized data. */ \
		PASTEMAC(ch,set0s_mxn)( PACKNR, k, \
		                        bp, 1, PACKNR ); \
\
		/* Pack the remaining n_left columns of B. Without this step the \
		   micro-kernel calls below would reuse the previously packed \
		   full micro-panel (or an unpacked buffer if n < NR). */ \
		PASTEMAC(ch,packm_cxk)( BLIS_NO_CONJUGATE, \
		                        n_left, \
		                        k, \
		                        one, \
		                        b1, cs_b, rs_b, \
		                        bp, PACKNR ); \
\
		/* Right edge loop over the m dimension (MR rows at a time). */ \
		for ( i = 0; i < m_iter; ++i ) \
		{ \
			/* Compute the address of the next panel of A, wrapping back \
			   to the first panel after the last one. */ \
			a2 = a1 + rstep_a; \
			if ( i == m_iter - 1 && m_left == 0 ) \
			{ \
				a2 = a_cast; \
			} \
\
			/* Invoke the gemm micro-kernel, writing to the temporary \
			   tile (beta == 0) instead of C. */ \
			PASTEMAC(ch,ukrname)( k, \
			                      alpha_cast, \
			                      a1, \
			                      bp, \
			                      zero, \
			                      ct, rs_ct, cs_ct, \
			                      a2, b2 ); \
\
			/* Scale the right edge of C and add the result from above. */ \
			PASTEMAC(ch,xpbys_mxn)( MR, n_left, \
			                        ct,  rs_ct, cs_ct, \
			                        beta_cast, \
			                        c11, rs_c,  cs_c ); \
\
			a1  += rstep_a; \
			c11 += rstep_c; \
		} \
\
		/* Bottom-right corner handling. */ \
		if ( m_left ) \
		{ \
			/* The next panel of A is the first one again. */ \
			a2 = a_cast; \
\
			/* Invoke the gemm micro-kernel, writing to the temporary \
			   tile (beta == 0) instead of C. */ \
			PASTEMAC(ch,ukrname)( k, \
			                      alpha_cast, \
			                      a1, \
			                      bp, \
			                      zero, \
			                      ct, rs_ct, cs_ct, \
			                      a2, b2 ); \
\
			/* Scale the bottom-right corner of C and add the result from \
			   above. */ \
			PASTEMAC(ch,xpbys_mxn)( m_left, n_left, \
			                        ct,  rs_ct, cs_ct, \
			                        beta_cast, \
			                        c11, rs_c,  cs_c ); \
		} \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var5: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var5: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC( gemm_ker_var5, GEMM_UKERNEL )

View File

@@ -0,0 +1,65 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interface.
//
// bli_gemm_ker_var5() takes the alpha and beta scalar objects, the matrix
// operand objects a, b and c, and a gemm control tree node, and dispatches
// to the typed implementation matching the execution datatype of c.
//
void bli_gemm_ker_var5( obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
gemm_t* cntl );
//
// Prototype BLAS-like interfaces.
//
// GENTPROT is (re)defined here as the prototype template for the typed
// gemm_ker_var5 functions: dimensions m, n, k; the alpha scalar; A and B
// with row, column and panel strides; the beta scalar; and C with row and
// column strides. INSERT_GENTPROT_BASIC then instantiates the template.
// NOTE(review): presumably one prototype is emitted per basic datatype
// (s, d, c, z) -- confirm against the INSERT_GENTPROT_BASIC definition.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( gemm_ker_var5 )

View File

@@ -243,6 +243,12 @@
#define bli_cndup BLIS_DEFAULT_NUM_DUPL_C
#define bli_zndup BLIS_DEFAULT_NUM_DUPL_Z
// Incremental packing factors, one alias per datatype prefix (s, d, c, z).
// All four currently map to the same BLIS_DEFAULT_NI_FAC value.
// NOTE(review): presumably these are the names reached through
// PASTEMAC(ch,nifac), e.g. when sizing the micro-panel packing buffer in
// gemm_ker_var5 -- confirm against the PASTEMAC definition.
#define bli_snifac BLIS_DEFAULT_NI_FAC
#define bli_dnifac BLIS_DEFAULT_NI_FAC
#define bli_cnifac BLIS_DEFAULT_NI_FAC
#define bli_znifac BLIS_DEFAULT_NI_FAC
#endif

View File

@@ -89,6 +89,7 @@
#define BLIS_BITVAL_PACKED_COLUMNS 0x40000
#define BLIS_BITVAL_PACKED_ROW_PANELS 0x50000
#define BLIS_BITVAL_PACKED_COL_PANELS 0x60000
#define BLIS_BITVAL_PACKED_BLOCKS 0x70000
#define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER 0x80000
#define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0
@@ -195,7 +196,8 @@ typedef enum
BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS,
BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS,
BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS,
BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS
BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS,
BLIS_PACKED_BLOCKS = BLIS_BITVAL_PACKED_BLOCKS
} pack_t;