Added 1m-specific APIs for bp, pb gemm algorithms.

Details:
- Defined bli_gemmbp_cntl_create(), bli_gemmpb_cntl_create(), with the
  body of bli_gemm_cntl_create() replaced with a call to the former.
- Defined bli_cntl_free_w_thrinfo(), bli_cntl_free_wo_thrinfo(). Now,
  bli_cntl_free() can check if the thread parameter is NULL, and if so,
  call the latter, and otherwise call the former.
- Defined bli_gemm1mbp_cntx_init(), bli_gemm1mpb_cntx_init(), both in
  terms of bli_gemm1mxx_cntx_init(), which behaves the same as
  bli_gemm1m_cntx_init() did before, except that an extra bool parameter
  (is_pb) is used to support both bp and pb algorithms (including to
  support the anti-preference field described below).
- Added support for "anti-preference" in context. The anti_pref field,
  when true, will toggle the boolean return value of routines such as
  bli_cntx_l3_ukr_eff_prefers_storage_of(), which has the net effect of
  causing BLIS to transpose the operation to achieve disagreement (rather
  than agreement) between the storage of C and the micro-kernel output
  preference. This disagreement is needed for panel-block implementations,
  since they induce a transposition of the suboperation immediately before
  the macro-kernel is called, which changes the apparent storage of C. For
  now, anti-preference is used only with the pb algorithm for 1m (and not
  with any other non-1m implementation).
- Defined new functions,
    bli_cntx_l3_ukr_eff_prefers_storage_of()
    bli_cntx_l3_ukr_eff_dislikes_storage_of()
    bli_cntx_l3_nat_ukr_eff_prefers_storage_of()
    bli_cntx_l3_nat_ukr_eff_dislikes_storage_of()
  which are identical to their non-"eff" (effectively) counterparts except
  that they take the anti-preference field of the context into account.
- Explicitly initialize the anti-pref field to FALSE in
  bli_gks_cntx_set_l3_nat_ukr_prefs().
- Added bli_gemm_ker_var1.c, which implements a panel-block macro-kernel
  in terms of the existing block-panel macro-kernel _ker_var2(). This
  technique requires inducing transposes on all operands and swapping
  A and B.
- Changed bli_obj_induce_trans() macro so that pack-related fields are
  also changed to reflect the induced transposition.
- Added a temporary hack to bli_l3_3m4m1m_oapi.c that allows us to easily
  specify the 1m algorithm (block-panel or panel-block).
- Renamed the following cntx_t-related macros:
    bli_cntx_get_pack_schema_a() -> bli_cntx_get_pack_schema_a_block()
    bli_cntx_get_pack_schema_b() -> bli_cntx_get_pack_schema_b_panel()
    bli_cntx_get_pack_schema_c() -> bli_cntx_get_pack_schema_c_panel()
  and updated all instantiations. Also updated the field names in the
  cntx_t struct.
- Comment updates.
This commit is contained in:
Field G. Van Zee
2017-01-25 16:25:46 -06:00
committed by prangana
parent 1d728ccb23
commit 4f61528d56
25 changed files with 892 additions and 187 deletions

View File

@@ -121,11 +121,11 @@ siz_t bli_packm_init
if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK )
{
schema = bli_cntx_get_pack_schema_a( cntx );
schema = bli_cntx_get_pack_schema_a_block( cntx );
}
else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL )
{
schema = bli_cntx_get_pack_schema_b( cntx );
schema = bli_cntx_get_pack_schema_b_panel( cntx );
}
else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
{

View File

@@ -70,8 +70,8 @@ void bli_l3_cntl_create_if
else
{
// If the user provided a control tree, create a copy and use it
// instead (so that it can be used to cache things like pack mem_t
// entries).
// instead (so that threads can use its local tree as a place to
// cache things like pack mem_t entries).
*cntl_use = bli_cntl_copy( cntl_orig );
}
}

View File

@@ -63,9 +63,8 @@ void bli_gemm_cntx_init( num_t dt, cntx_t* cntx )
cntx );
// Set the pack_t schemas for native execution.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS,
BLIS_PACKED_COL_PANELS,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
}
void bli_gemm_cntx_finalize( cntx_t* cntx )
@@ -106,9 +105,8 @@ void bli_trsm_cntx_init( num_t dt, cntx_t* cntx )
cntx );
// Set the pack_t schemas for native execution.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS,
BLIS_PACKED_COL_PANELS,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
}
void bli_trsm_cntx_finalize( cntx_t* cntx )

View File

@@ -39,8 +39,17 @@ cntl_t* bli_gemm_cntl_create
opid_t family
)
{
void* macro_kernel_p = bli_gemm_ker_var2;
return bli_gemmbp_cntl_create( family );
}
// -----------------------------------------------------------------------------
cntl_t* bli_gemmbp_cntl_create
(
opid_t family
)
{
void* macro_kernel_p = bli_gemm_ker_var2;
// Change the macro-kernel if the operation family is herk or trmm.
if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
@@ -64,7 +73,7 @@ cntl_t* bli_gemm_cntl_create
// Create a node for packing matrix A.
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
(
bli_gemm_packa,
bli_gemm_packa, // pack the left-hand operand
bli_packm_blk_var1,
BLIS_MR,
BLIS_KR,
@@ -87,7 +96,7 @@ cntl_t* bli_gemm_cntl_create
// Create a node for packing matrix B.
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
(
bli_gemm_packb,
bli_gemm_packb, // pack the right-hand operand
bli_packm_blk_var1,
BLIS_KR,
BLIS_NR,
@@ -118,6 +127,95 @@ cntl_t* bli_gemm_cntl_create
return gemm_cntl_vl_mm;
}
// -----------------------------------------------------------------------------
// Create and return a control tree for the panel-block (pb) gemm
// algorithm. The tree mirrors the one built by bli_gemmbp_cntl_create(),
// except that the roles of A and B are swapped: B is packed as the
// "block" operand (into the A-block buffer) and A as the "panel" operand
// (into the B-panel buffer). The macro-kernel, bli_gemm_ker_var1(),
// induces the transposition needed to reuse the block-panel macro-kernel
// internally.
cntl_t* bli_gemmpb_cntl_create
(
opid_t family // operation family id (currently only gemm takes this path)
)
{
void* macro_kernel_p = bli_gemm_ker_var1;
// Change the macro-kernel if the operation family is herk or trmm.
// NOTE: these substitutions are disabled for now; the pb algorithm does
// not yet support herk or trmm macro-kernels.
//if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
//else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create
(
BLIS_MR, // needed for bli_thrinfo_rgrow()
NULL, // variant function pointer not used
NULL // no sub-node; this is the leaf of the tree.
);
cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create
(
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_p,
gemm_cntl_ub_ke
);
// Create a node for packing matrix A (which is really the right-hand
// operand "B"). Note the swapped register blocksizes (KR x MR) and that
// the packed panels land in the buffer normally reserved for the block
// of A.
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
(
bli_gemm_packb, // pack the right-hand operand
bli_packm_blk_var1,
BLIS_KR,
BLIS_MR,
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?
BLIS_PACKED_COL_PANELS,
BLIS_BUFFER_FOR_A_BLOCK,
gemm_cntl_pb_ub
);
// Create a node for partitioning the n dimension by MC. (MC is used
// here, rather than NC, because the m and n dimensions exchange roles
// once the pb macro-kernel induces its transposition.)
cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create
(
BLIS_MC,
bli_gemm_blk_var2,
gemm_cntl_packb
);
// Create a node for packing matrix B (which is really the left-hand
// operand "A"). Again note the swapped register blocksizes (NR x KR)
// and the use of the B-panel buffer for the left-hand operand.
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
(
bli_gemm_packa, // pack the left-hand operand
bli_packm_blk_var1,
BLIS_NR,
BLIS_KR,
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?
BLIS_PACKED_ROW_PANELS,
BLIS_BUFFER_FOR_B_PANEL,
gemm_cntl_op_pb
);
// Create a node for partitioning the k dimension by KC.
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create
(
BLIS_KC,
bli_gemm_blk_var3,
gemm_cntl_packa
);
// Create a node for partitioning the m dimension by NC. (See the note
// above regarding the exchanged roles of the m and n dimensions.)
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create
(
BLIS_NC,
bli_gemm_blk_var1,
gemm_cntl_mm_op
);
// Return the root of the newly constructed control tree.
return gemm_cntl_vl_mm;
}
// -----------------------------------------------------------------------------
void bli_gemm_cntl_free
(
cntl_t* cntl,

View File

@@ -37,6 +37,20 @@ cntl_t* bli_gemm_cntl_create
opid_t family
);
// -----------------------------------------------------------------------------
cntl_t* bli_gemmbp_cntl_create
(
opid_t family
);
cntl_t* bli_gemmpb_cntl_create
(
opid_t family
);
// -----------------------------------------------------------------------------
void bli_gemm_cntl_free
(
cntl_t* cntl,

View File

@@ -46,11 +46,10 @@ void bli_gemm_front
cntl_t* cntl
)
{
#ifdef BLIS_SMALL_MATRIX_ENABLE
#ifndef BLIS_ENABLE_MULTITHREADING
gint_t status = bli_gemm_small_matrix(alpha, a, b, beta, c, cntx, cntl);
if(BLIS_SUCCESS != status)
#endif
#endif
{
obj_t a_local;
@@ -90,9 +89,6 @@ void bli_gemm_front
bli_obj_induce_trans( c_local );
}
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_GEMM, cntx );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx,
bli_obj_length( c_local ),
@@ -103,6 +99,7 @@ void bli_gemm_front
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_GEMM, // operation family id
alpha,
&a_local,
&b_local,

View File

@@ -0,0 +1,56 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Panel-block gemm macro-kernel. Rather than providing a standalone
// implementation, we express _ker_var1() as _ker_var2() applied to the
// transposed suboperation: logically transpose all three operands, then
// exchange the roles of A and B.
void bli_gemm_ker_var1
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Toggle the apparent storage of every operand. (The macro swaps
	// dimensions, strides, and pack-related fields, but deliberately
	// leaves the transposition bits untouched.)
	bli_obj_induce_trans( *c );
	bli_obj_induce_trans( *a );
	bli_obj_induce_trans( *b );

	// With the suboperation transposed, A and B trade places.
	bli_gemm_ker_var2( b, a, c, cntx, cntl, thread );
}

View File

@@ -56,6 +56,7 @@ GENPROT( gemm_blk_var3 )
GENPROT( gemm_packa )
GENPROT( gemm_packb )
GENPROT( gemm_ker_var1 )
GENPROT( gemm_ker_var2 )
// Headers for induced algorithms:

View File

@@ -97,6 +97,16 @@ void bli_cntl_free
cntl_t* cntl,
thrinfo_t* thread
)
{
if ( thread != NULL ) bli_cntl_free_w_thrinfo( cntl, thread );
else bli_cntl_free_wo_thrinfo( cntl );
}
void bli_cntl_free_w_thrinfo
(
cntl_t* cntl,
thrinfo_t* thread
)
{
// Base case: simply return when asked to free NULL nodes.
if ( cntl == NULL ) return;
@@ -112,7 +122,7 @@ void bli_cntl_free
{
// Recursively free all memory associated with the sub-node and its
// children.
bli_cntl_free( cntl_sub_node, thread_sub_node );
bli_cntl_free_w_thrinfo( cntl_sub_node, thread_sub_node );
}
// Free the current node's params field, if it is non-NULL.
@@ -122,8 +132,8 @@ void bli_cntl_free
}
// Release the current node's pack mem_t entry back to the memory
// broker from which it originated, but only if the current thread
// is chief for its group, and only if the mem_t is allocated.
// broker from which it originated, but only if the mem_t entry is
// allocated, and only if the current thread is chief for its group.
if ( bli_thread_am_ochief( thread ) )
if ( bli_mem_is_alloc( cntl_pack_mem ) )
{
@@ -134,6 +144,42 @@ void bli_cntl_free
bli_cntl_obj_free( cntl );
}
// Recursively free a control tree when no accompanying thrinfo_t tree
// exists. Without thread info there is no notion of a "chief" thread,
// so pack mem_t entries are released unconditionally (when allocated).
void bli_cntl_free_wo_thrinfo
     (
       cntl_t* cntl
     )
{
	// An empty (sub)tree requires no work.
	if ( cntl == NULL ) return;

	cntl_t* sub_node = bli_cntl_sub_node( cntl );
	void*   params   = bli_cntl_params( cntl );
	mem_t*  pack_mem = bli_cntl_pack_mem( cntl );

	// Free the child subtree (and, transitively, all of its descendants)
	// before releasing anything attached to the current node.
	bli_cntl_free_wo_thrinfo( sub_node );

	// Free the node's params struct, if one was allocated.
	if ( params != NULL )
	{
		bli_free_intl( params );
	}

	// Return the node's pack mem_t entry to the memory broker from which
	// it originated, but only if the mem_t entry is allocated.
	if ( bli_mem_is_alloc( pack_mem ) )
	{
		bli_membrk_release( pack_mem );
	}

	// Finally, free the node itself.
	bli_cntl_obj_free( cntl );
}
// -----------------------------------------------------------------------------
cntl_t* bli_cntl_copy

View File

@@ -75,12 +75,25 @@ void bli_cntl_obj_clear
cntl_t* cntl
);
// -----------------------------------------------------------------------------
void bli_cntl_free
(
cntl_t* cntl,
thrinfo_t* thread
);
void bli_cntl_free_w_thrinfo
(
cntl_t* cntl,
thrinfo_t* thread
);
void bli_cntl_free_wo_thrinfo
(
cntl_t* cntl
);
cntl_t* bli_cntl_copy
(
cntl_t* cntl

View File

@@ -330,14 +330,24 @@ ind_t bli_cntx_get_ind_method( cntx_t* cntx )
return bli_cntx_method( cntx );
}
pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx )
pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx )
{
return bli_cntx_schema_a( cntx );
return bli_cntx_schema_a_block( cntx );
}
pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx )
pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx )
{
return bli_cntx_schema_b( cntx );
return bli_cntx_schema_b_panel( cntx );
}
pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx )
{
return bli_cntx_schema_c_panel( cntx );
}
bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx )
{
return bli_cntx_anti_pref( cntx );
}
#endif
@@ -705,31 +715,39 @@ void bli_cntx_set_ind_method( ind_t method,
bli_cntx_set_method( method, cntx );
}
void bli_cntx_set_pack_schema_ab( pack_t schema_a,
pack_t schema_b,
cntx_t* cntx )
void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a,
pack_t schema_b,
cntx_t* cntx )
{
bli_cntx_set_schema_a( schema_a, cntx );
bli_cntx_set_schema_b( schema_b, cntx );
bli_cntx_set_schema_a_block( schema_a, cntx );
bli_cntx_set_schema_b_panel( schema_b, cntx );
}
void bli_cntx_set_pack_schema_a( pack_t schema_a,
cntx_t* cntx )
void bli_cntx_set_pack_schema_a_block( pack_t schema_a,
cntx_t* cntx )
{
bli_cntx_set_schema_a( schema_a, cntx );
bli_cntx_set_schema_a_block( schema_a, cntx );
}
void bli_cntx_set_pack_schema_b( pack_t schema_b,
cntx_t* cntx )
void bli_cntx_set_pack_schema_b_panel( pack_t schema_b,
cntx_t* cntx )
{
bli_cntx_set_schema_b( schema_b, cntx );
bli_cntx_set_schema_b_panel( schema_b, cntx );
}
void bli_cntx_set_pack_schema_c( pack_t schema_c,
void bli_cntx_set_pack_schema_c_panel( pack_t schema_c,
cntx_t* cntx )
{
bli_cntx_set_schema_c_panel( schema_c, cntx );
}
#if 0
void bli_cntx_set_ukr_anti_pref( bool_t anti_pref,
cntx_t* cntx )
{
bli_cntx_set_schema_c( schema_c, cntx );
bli_cntx_set_anti_pref( anti_pref, cntx );
}
#endif
void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
dim_t m, dim_t n, dim_t k )
@@ -904,6 +922,32 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
return r_val;
}
// Return the *effective* storage preference of the native micro-kernel
// for obj: identical to bli_cntx_l3_nat_ukr_prefers_storage_of(), except
// that an enabled anti-preference field in the context inverts the
// answer.
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj,
                                                   l3ukr_t ukr_id,
                                                   cntx_t* cntx )
{
	// Query the raw (non-effective) preference first.
	bool_t prefers = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx );

	// Negate the result when the context's anti-preference is set.
	return bli_cntx_anti_pref( cntx ) ? !prefers : prefers;
}
// Return the *effective* storage dislike of the native micro-kernel for
// obj: identical to bli_cntx_l3_nat_ukr_dislikes_storage_of(), except
// that an enabled anti-preference field in the context inverts the
// answer.
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj,
                                                    l3ukr_t ukr_id,
                                                    cntx_t* cntx )
{
	// Query the raw (non-effective) dislike first.
	bool_t dislikes = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx );

	// Negate the result when the context's anti-preference is set.
	return bli_cntx_anti_pref( cntx ) ? !dislikes : dislikes;
}
// -----------------------------------------------------------------------------
bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx )
@@ -953,6 +997,30 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
return r_val;
}
// Return the *effective* storage preference of the level-3 micro-kernel
// for obj: identical to bli_cntx_l3_ukr_prefers_storage_of(), except
// that an enabled anti-preference field in the context inverts the
// answer.
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj,
                                               l3ukr_t ukr_id,
                                               cntx_t* cntx )
{
	// Query the raw (non-effective) preference first.
	bool_t prefers = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx );

	// Negate the result when the context's anti-preference is set.
	return bli_cntx_anti_pref( cntx ) ? !prefers : prefers;
}
// Return the *effective* storage dislike of the level-3 micro-kernel for
// obj: identical to bli_cntx_l3_ukr_dislikes_storage_of(), except that
// an enabled anti-preference field in the context inverts the answer.
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj,
                                                l3ukr_t ukr_id,
                                                cntx_t* cntx )
{
	// Query the raw (non-effective) dislike first.
	bool_t dislikes = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx );

	// Negate the result when the context's anti-preference is set.
	return bli_cntx_anti_pref( cntx ) ? !dislikes : dislikes;
}
// -----------------------------------------------------------------------------
void bli_cntx_print( cntx_t* cntx )

View File

@@ -59,6 +59,8 @@ typedef struct cntx_s
pack_t schema_b;
pack_t schema_c;
bool_t anti_pref;
dim_t* thrloop;
membrk_t* membrk;
@@ -113,26 +115,30 @@ typedef struct cntx_s
\
( (cntx)->method )
#define bli_cntx_schema_a( cntx ) \
#define bli_cntx_schema_a_block( cntx ) \
\
( (cntx)->schema_a )
( (cntx)->schema_a_block )
#define bli_cntx_schema_b( cntx ) \
#define bli_cntx_schema_b_panel( cntx ) \
\
( (cntx)->schema_b )
( (cntx)->schema_b_panel )
#define bli_cntx_schema_c( cntx ) \
#define bli_cntx_schema_c_panel( cntx ) \
\
( (cntx)->schema_c )
( (cntx)->schema_c_panel )
#define bli_cntx_membrk( cntx ) \
#define bli_cntx_anti_pref( cntx ) \
\
( (cntx)->membrk )
( (cntx)->anti_pref )
#define bli_cntx_thrloop( cntx ) \
\
( (cntx)->thrloop )
#define bli_cntx_membrk( cntx ) \
\
( (cntx)->membrk )
#if 1
#define bli_cntx_jc_way( cntx ) \
\
@@ -211,24 +217,24 @@ typedef struct cntx_s
(cntx_p)->method = _method; \
}
#define bli_cntx_set_schema_a( _schema_a, cntx_p ) \
#define bli_cntx_set_schema_a_block( _schema_a_block, cntx_p ) \
{ \
(cntx_p)->schema_a = _schema_a; \
(cntx_p)->schema_a_block = _schema_a_block; \
}
#define bli_cntx_set_schema_b( _schema_b, cntx_p ) \
#define bli_cntx_set_schema_b_panel( _schema_b_panel, cntx_p ) \
{ \
(cntx_p)->schema_b = _schema_b; \
(cntx_p)->schema_b_panel = _schema_b_panel; \
}
#define bli_cntx_set_schema_c( _schema_c, cntx_p ) \
#define bli_cntx_set_schema_c_panel( _schema_c_panel, cntx_p ) \
{ \
(cntx_p)->schema_c = _schema_c; \
(cntx_p)->schema_c_panel = _schema_c_panel; \
}
#define bli_cntx_set_membrk( _membrk, cntx_p ) \
#define bli_cntx_set_anti_pref( _anti_pref, cntx_p ) \
{ \
(cntx_p)->membrk = _membrk; \
(cntx_p)->anti_pref = _anti_pref; \
}
#define bli_cntx_set_thrloop( jc_, pc_, ic_, jr_, ir_, cntx_p ) \
@@ -241,6 +247,11 @@ typedef struct cntx_s
(cntx_p)->thrloop[ BLIS_KR ] = 1; \
}
#define bli_cntx_set_membrk( _membrk, cntx_p ) \
{ \
(cntx_p)->membrk = _membrk; \
}
// cntx_t query (complex)
#define bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ) \
@@ -323,13 +334,17 @@ typedef struct cntx_s
\
bli_cntx_method( cntx )
#define bli_cntx_get_pack_schema_a( cntx ) \
#define bli_cntx_get_pack_schema_a_block( cntx ) \
\
bli_cntx_schema_a( cntx )
bli_cntx_schema_a_block( cntx )
#define bli_cntx_get_pack_schema_b( cntx ) \
#define bli_cntx_get_pack_schema_b_panel( cntx ) \
\
bli_cntx_schema_b( cntx )
bli_cntx_schema_b_panel( cntx )
#define bli_cntx_get_pack_schema_c_panel( cntx ) \
\
bli_cntx_schema_c_panel( cntx )
#define bli_cntx_get_membrk( cntx ) \
\
@@ -395,9 +410,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx );
// l1vkr_t ker_id,
// cntx_t* cntx );
//ind_t bli_cntx_get_ind_method( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_c( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx );
//bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx );
dim_t bli_cntx_get_num_threads( cntx_t* cntx );
dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl );
@@ -425,15 +441,17 @@ void bli_cntx_set_packm_ukr( func_t* func,
cntx_t* cntx );
void bli_cntx_set_ind_method( ind_t method,
cntx_t* cntx );
void bli_cntx_set_pack_schema_ab( pack_t schema_a,
pack_t schema_b,
cntx_t* cntx );
void bli_cntx_set_pack_schema_a( pack_t schema_a,
cntx_t* cntx );
void bli_cntx_set_pack_schema_b( pack_t schema_b,
cntx_t* cntx );
void bli_cntx_set_pack_schema_c( pack_t schema_c,
cntx_t* cntx );
void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a,
pack_t schema_b,
cntx_t* cntx );
void bli_cntx_set_pack_schema_a_block( pack_t schema_a,
cntx_t* cntx );
void bli_cntx_set_pack_schema_b_panel( pack_t schema_b,
cntx_t* cntx );
void bli_cntx_set_pack_schema_c_panel( pack_t schema_c,
cntx_t* cntx );
//void bli_cntx_set_ukr_anti_pref( bool_t anti_pref,
// cntx_t* cntx );
void bli_cntx_set_thrloop_from_env( opid_t l3_op,
side_t side,
cntx_t* cntx,
@@ -455,6 +473,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj,
bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx );
@@ -467,6 +491,12 @@ bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj,
bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
// print function

View File

@@ -606,6 +606,9 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr,
mbool_t* cntx_l3_nat_ukr_pref = &cntx_l3_nat_ukr_prefs[ ukr ];
bli_gks_get_l3_nat_ukr_prefs( ukr, cntx_l3_nat_ukr_pref );
// Explicitly set the anti-preference to FALSE.
bli_cntx_set_anti_pref( FALSE, cntx );
}

View File

@@ -877,6 +877,12 @@ bli_obj_width_stored( obj )
(obj).n_panel = n0; \
}
#define bli_obj_set_panel_dims( m0, n0, obj ) \
{ \
bli_obj_set_panel_length( m0, obj ); \
bli_obj_set_panel_width( n0, obj ); \
}
#define bli_obj_set_panel_dim( panel_dim, obj ) \
{ \
(obj).pd = panel_dim; \
@@ -985,6 +991,7 @@ bli_obj_width_stored( obj )
#define bli_obj_induce_trans( obj ) \
{ \
{ \
/* Induce transposition among basic fields. */ \
dim_t m_ = bli_obj_length( obj ); \
dim_t n_ = bli_obj_width( obj ); \
inc_t rs_ = bli_obj_row_stride( obj ); \
@@ -1000,6 +1007,15 @@ bli_obj_width_stored( obj )
\
if ( bli_obj_is_upper_or_lower( obj ) ) \
bli_obj_toggle_uplo( obj ); \
\
/* Induce transposition among packed fields. */ \
dim_t m_padded_ = bli_obj_padded_length( obj ); \
dim_t n_padded_ = bli_obj_padded_width( obj ); \
dim_t m_panel_ = bli_obj_panel_length( obj ); \
dim_t n_panel_ = bli_obj_panel_width( obj ); \
\
bli_obj_set_padded_dims( n_padded_, m_padded_, obj ); \
bli_obj_set_panel_dims( n_panel_, m_panel_, obj ); \
\
/* Note that this macro DOES NOT touch the transposition bit! If
the calling code is using this macro to handle an object whose

View File

@@ -975,9 +975,11 @@ typedef struct cntx_s
opid_t family;
ind_t method;
pack_t schema_a;
pack_t schema_b;
pack_t schema_c;
pack_t schema_a_block;
pack_t schema_b_panel;
pack_t schema_c_panel;
bool_t anti_pref;
dim_t thrloop[ BLIS_NUM_LOOPS ];

View File

@@ -151,9 +151,8 @@ void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI,
BLIS_PACKED_COL_PANELS_3MI,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx );
}
void bli_gemm3m1_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -200,9 +199,8 @@ void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MS,
BLIS_PACKED_COL_PANELS_3MI,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MS, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx );
}
void bli_gemm3m2_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -249,9 +247,8 @@ void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage()
BLIS_PACKED_COL_PANELS_3MS,
cntx );
bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage()
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MS, cntx );
}
void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -259,15 +256,15 @@ void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx )
// Set the pack_t schemas as a function of the stage of execution.
if ( stage == 0 )
{
bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
}
else if ( stage == 1 )
{
bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_IO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
}
else // if ( stage == 2 )
{
bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RPI, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx );
}
}
@@ -311,9 +308,8 @@ void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage()
0, // not yet needed; varies with _stage()
cntx );
bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage()
bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage()
}
void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -321,18 +317,18 @@ void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx )
// Set the pack_t schemas as a function of the stage of execution.
if ( stage == 0 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO,
BLIS_PACKED_COL_PANELS_RO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
}
else if ( stage == 1 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO,
BLIS_PACKED_COL_PANELS_IO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
}
else // if ( stage == 2 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RPI,
BLIS_PACKED_COL_PANELS_RPI, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RPI, cntx );
}
}
@@ -376,9 +372,8 @@ void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI,
BLIS_PACKED_COL_PANELS_4MI,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx );
}
void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -425,9 +420,8 @@ void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI,
BLIS_PACKED_COL_PANELS_4MI,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx );
}
void bli_gemm4mb_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -474,9 +468,8 @@ void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage()
0, // not yet needed; varies with _stage()
cntx );
bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage()
bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage()
}
void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -484,23 +477,23 @@ void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx )
// Set the pack_t schemas as a function of the stage of execution.
if ( stage == 0 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO,
BLIS_PACKED_COL_PANELS_RO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
}
else if ( stage == 1 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO,
BLIS_PACKED_COL_PANELS_IO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
}
else if ( stage == 2 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO,
BLIS_PACKED_COL_PANELS_IO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
}
else // if ( stage == 3 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO,
BLIS_PACKED_COL_PANELS_RO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
}
}
@@ -511,6 +504,22 @@ void bli_gemm4mh_cntx_finalize( cntx_t* cntx )
// -----------------------------------------------------------------------------
// Initialize the context for the 1m induced method. For backward
// compatibility, this simply delegates to the block-panel (bp) variant.
void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx )
{
// Default to context for block-panel algorithm.
bli_gemm1mbp_cntx_init( dt, cntx );
}
// Initialize a context for the 1m block-panel (bp) algorithm. Thin
// wrapper around bli_gemm1mxx_cntx_init() with is_pb = FALSE.
void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx )
{
bli_gemm1mxx_cntx_init( dt, FALSE, cntx );
}
// Initialize a context for the 1m panel-block (pb) algorithm. Thin
// wrapper around bli_gemm1mxx_cntx_init() with is_pb = TRUE.
void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx )
{
bli_gemm1mxx_cntx_init( dt, TRUE, cntx );
}
void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx )
{
const ind_t method = BLIS_1M;
@@ -529,8 +538,24 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx )
// Initialize the context with packm-related kernels.
bli_packm_cntx_init( dt, cntx );
// Initialize the blocksizes according to the micro-kernel preference as
// well as the algorithm.
if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
{
// This branch is used for algorithms 1m_c_bp, 1m_r_pb.
// Set the pack_t schemas for the c_bp or r_pb algorithms.
if ( !is_pb )
{
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx );
}
else // if ( is_pb )
{
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1R, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1E, cntx );
}
// Initialize the context with the current architecture's register
// and cache blocksizes (and multiples), and the induced method.
bli_gks_cntx_set_blkszs
@@ -544,14 +569,23 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx )
BLIS_KR, BLIS_KR, 1.0, 1.0,
cntx
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E,
BLIS_PACKED_COL_PANELS_1R,
cntx );
}
else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
{
// This branch is used for algorithms 1m_r_bp, 1m_c_pb.
// Set the pack_t schemas for the r_bp or c_pb algorithms.
if ( !is_pb )
{
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx );
}
else // if ( is_pb )
{
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1E, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1R, cntx );
}
// Initialize the context with the current architecture's register
// and cache blocksizes (and multiples), and the induced method.
bli_gks_cntx_set_blkszs
@@ -565,12 +599,15 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx )
BLIS_KR, BLIS_KR, 1.0, 1.0,
cntx
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R,
BLIS_PACKED_COL_PANELS_1E,
cntx );
}
// Set the anti-preference field to TRUE when executing a panel-block
// algorithm, and FALSE otherwise. This will cause higher-level generic
// code to establish (if needed) disagreement between the storage of C and
// the micro-kernel output preference so that the two will come back into
agreement in the panel-block macro-kernel (which is implemented in terms
// of the block-panel macro-kernel with some induced transpositions).
bli_cntx_set_anti_pref( is_pb, cntx );
}
void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx )

View File

@@ -65,6 +65,9 @@ void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx );
void bli_gemm4m1_cntx_finalize( cntx_t* cntx );
void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx );
void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx );
void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx );
void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx );
void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx );
void bli_gemm1m_cntx_finalize( cntx_t* cntx );

View File

@@ -74,9 +74,9 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for native execution.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI,
BLIS_PACKED_COL_PANELS_3MI,
cntx );
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_3MI,
BLIS_PACKED_COL_PANELS_3MI,
cntx );
}
void bli_trsm3m1_cntx_finalize( cntx_t* cntx )
@@ -123,9 +123,9 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for native execution.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI,
BLIS_PACKED_COL_PANELS_4MI,
cntx );
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_4MI,
BLIS_PACKED_COL_PANELS_4MI,
cntx );
}
void bli_trsm4m1_cntx_finalize( cntx_t* cntx )
@@ -174,9 +174,9 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E,
BLIS_PACKED_COL_PANELS_1R,
cntx );
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E,
BLIS_PACKED_COL_PANELS_1R,
cntx );
}
else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
{
@@ -195,9 +195,9 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R,
BLIS_PACKED_COL_PANELS_1E,
cntx );
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R,
BLIS_PACKED_COL_PANELS_1E,
cntx );
}
}

View File

@@ -0,0 +1,85 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// -- gemmbp/gemmpb ------------------------------------------------------------
#undef GENFRONT

// Expands to an object-based front-end for an induced-method gemm that is
// hard-coded to a particular algorithm:
//   opname: the operation name (gemm).
//   imeth:  the induced method (1m).
//   alg:    the algorithm, either bp (block-panel) or pb (panel-block).
// The generated function is named bli_<opname><imeth><alg>(), e.g.
// bli_gemm1mbp() or bli_gemm1mpb(). It builds a local context and control
// tree specific to the chosen algorithm, invokes the operation's front end,
// and then cleans up.
#define GENFRONT( opname, imeth, alg ) \
\
void PASTEMAC2(opname,imeth,alg) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c \
) \
{ \
num_t dt = bli_obj_datatype( *c ); \
cntx_t cntx; \
cntl_t* cntl_p; \
\
/* If the objects are in the real domain, execute the native
implementation. (An induced method is only needed for complex
domain operands.) */ \
if ( bli_obj_is_real( *c ) ) \
{ \
PASTEMAC(opname,nat)( alpha, a, b, beta, c, NULL ); \
return; \
} \
\
/* Initialize a local 1m context for the current algorithm (bp or pb).
This expands to e.g. bli_gemm1mbp_cntx_init( dt, &cntx ). */ \
PASTEMAC3(opname,imeth,alg,_cntx_init)( dt, &cntx ); \
\
/* Create a control tree for the current algorithm (bp or pb). This
expands to e.g. bli_gemmbp_cntl_create() or bli_gemmpb_cntl_create(). */ \
cntl_p = PASTEMAC2(opname,alg,_cntl_create)( BLIS_GEMM ); \
\
/* Invoke the operation's front end using the context and control
tree we just created. */ \
PASTEMAC(opname,_front)( alpha, a, b, beta, c, &cntx, cntl_p ); \
\
/* Free the control tree. Since the implementation will only make
copies of it (and not use it directly) we do not need to supply
a thread object. */ \
bli_cntl_free( cntl_p, NULL ); \
\
/* Finalize the local context. */ \
PASTEMAC2(opname,imeth,_cntx_finalize)( &cntx ); \
}
// gemm
GENFRONT( gemm, 1m, bp )
GENFRONT( gemm, 1m, pb )

View File

@@ -62,6 +62,14 @@ void PASTEMAC(opname,imeth) \
PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx ); \
return; \
} \
\
/* A temporary hack to easily specify the 1m algorithm (block-panel or
panel-block). */ \
if ( PASTEMAC(opname,imeth) == bli_gemm1m ) \
{ \
bli_gemm1mbp( alpha, a, b, beta, c ); \
return; \
} \
\
/* Initialize a local context if the one provided is NULL. */ \
bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \

View File

@@ -80,3 +80,17 @@ GENPROT_NO2OP( 3m2 )
GENPROT_NO2OP( 4mh )
GENPROT_NO2OP( 4mb )
//
// Generate object-based prototypes for 1m methods that specify an algorithm
// (e.g., block-panel or panel-block).
//
#undef GENPROT
#define GENPROT( imeth, alg ) \
\
void PASTEMAC2(gemm,imeth,alg) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c ); \
GENPROT( 1m, bp )
GENPROT( 1m, pb )

View File

@@ -55,7 +55,7 @@ void PASTEMAC(ch,varname) \
PASTECH(chr,gemm_ukr_ft) \
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const bool_t row_pref = !col_pref; \
/*const bool_t row_pref = !col_pref;*/ \
\
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
@@ -77,10 +77,8 @@ void PASTEMAC(ch,varname) \
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
\
const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
\
ctype_r beta_use; \
ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \
ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \
\
ctype_r* c_use; \
inc_t rs_c_use; \
@@ -96,75 +94,71 @@ void PASTEMAC(ch,varname) \
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
\
/* Sanity check: These should never occur because storage/preference
agreement is handled at a higher level. */ \
/*
if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \
*/ \
\
\
/* If beta has a non-zero imaginary component OR if c is stored with
general stride OR if for some reason the storage of c is not the
preferred storage of the micro-kernel, then we compute the
alpha*a*b product into temporary storage and then accumulate that
result into c afterwards. */ \
if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \
else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \
general stride, then we compute the alpha*a*b product into temporary
storage and then accumulate that result into c afterwards. Note that
the other two cases concerning disagreement between the storage of C
and the output preference of the micro-kernel, should never occur
(though we could handle them if they did occur). */ \
if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \
/*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \
else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \
else using_ct = FALSE; \
\
\
if ( using_ct ) \
{ \
/* In the atypical cases, we compute the result into temporary
workspace ct and then accumulated it back to c at the end. */ \
\
/* Set the strides of ct based on the preference of the underlying
native real domain gemm micro-kernel. Note that we set the ct
strides in units of complex elements. */ \
if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \
else { rs_ct = nr; cs_ct = 1; } \
\
beta_use = *zero_r; \
c_use = ( ctype_r* )ct; \
rs_c_use = rs_ct; \
cs_c_use = cs_ct; \
} \
else \
{ \
/* In a typical case, we use the real part of beta and accumulate
directly into the output matrix c. */ \
beta_use = beta_r; \
c_use = ( ctype_r* )c; \
rs_c_use = rs_c; \
cs_c_use = cs_c; \
} \
\
/* Convert the strides from being in units of complex elements to
be in units of real elements. Note that we don't need to check for
general storage here because that case corresponds to the scenario
where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
else rs_c_use *= 2; \
\
/* Convert the strides from being in units of complex elements to
be in units of real elements. Note that we don't need to check for
general storage here because that case corresponds to the scenario
where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
else rs_c_use *= 2; \
/* The following gemm micro-kernel call implements the 1m method,
which induces a complex matrix multiplication by calling the
real matrix micro-kernel on micro-panels that have been packed
according to the 1e and 1r formats. */ \
\
/* c = beta * c + alpha_r * a * b; */ \
rgemm_ukr \
( \
k2, \
alpha_r, \
a_r, \
b_r, \
zero_r, \
c_use, rs_c_use, cs_c_use, \
data, \
cntx \
); \
\
/* The following gemm micro-kernel call implements the 1m method,
which induces a complex matrix multiplication by calling the
real matrix micro-kernel on micro-panels that have been packed
according to the 1e and 1r formats. */ \
\
/* c = beta * c + alpha_r * a * b; */ \
rgemm_ukr \
( \
k2, \
alpha_r, \
a_r, \
b_r, \
&beta_use, \
c_use, rs_c_use, cs_c_use, \
data, \
cntx \
); \
\
\
/* If necessary, accumulate the final result in ct back to c. */ \
if ( using_ct ) \
{ \
dim_t i, j; \
\
/* Accumulate the final result in ct back to c. */ \
for ( j = 0; j < nr; ++j ) \
for ( i = 0; i < mr; ++i ) \
{ \
@@ -173,6 +167,40 @@ void PASTEMAC(ch,varname) \
*(c + i*rs_c + j*cs_c ) ); \
} \
} \
else \
{ \
/* In the typical cases, we use the real part of beta and
accumulate directly into the output matrix c. */ \
\
c_use = ( ctype_r* )c; \
rs_c_use = rs_c; \
cs_c_use = cs_c; \
\
/* Convert the strides from being in units of complex elements to
be in units of real elements. Note that we don't need to check for
general storage here because that case corresponds to the scenario
where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
else rs_c_use *= 2; \
\
/* The following gemm micro-kernel call implements the 1m method,
which induces a complex matrix multiplication by calling the
real matrix micro-kernel on micro-panels that have been packed
according to the 1e and 1r formats. */ \
\
/* c = beta * c + alpha_r * a * b; */ \
rgemm_ukr \
( \
k2, \
alpha_r, \
a_r, \
b_r, \
beta_r, \
c_use, rs_c_use, cs_c_use, \
data, \
cntx \
); \
} \
}
INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR )

View File

@@ -0,0 +1,188 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"

// Reference micro-kernel for the 1m induced method. It computes a complex
// gemm micro-tile update, c = beta * c + alpha * a * b, by invoking the
// native REAL-domain gemm micro-kernel on micro-panels of a and b that were
// packed according to the 1e and 1r formats. Only a real-valued alpha is
// supported (see the safety check below); a beta with non-zero imaginary
// part is handled by computing into temporary storage and accumulating
// afterwards.
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid ) \
\
void PASTEMAC(ch,varname) \
( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
const num_t dt_r = PASTEMAC(chr,type); \
\
/* Query the real-domain native gemm micro-kernel and its storage
preference from the context. */ \
PASTECH(chr,gemm_ukr_ft) \
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
/*const bool_t row_pref = !col_pref;*/ \
\
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
\
/* The real micro-kernel iterates over twice as many "k" updates, since
each complex element occupies two real elements in the 1e/1r packed
micro-panels. */ \
const dim_t k2 = 2 * k; \
\
/* Temporary micro-tile used when we cannot accumulate directly into c.
NOTE(review): ct holds ctype (complex) elements but its length is
computed with sizeof( ctype_r ); this reserves twice the bytes of
BLIS_STACK_BUF_MAX_SIZE -- confirm whether sizeof( ctype ) was
intended here. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype_r ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
inc_t rs_ct; \
inc_t cs_ct; \
\
/* View the packed complex micro-panels as real-domain buffers. */ \
ctype_r* restrict a_r = ( ctype_r* )a; \
\
ctype_r* restrict b_r = ( ctype_r* )b; \
\
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
\
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
\
const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
\
/* The (real-valued) beta actually passed to the real micro-kernel:
zero when computing into ct, beta_r when computing into c. */ \
ctype_r beta_use; \
\
ctype_r* c_use; \
inc_t rs_c_use; \
inc_t cs_c_use; \
\
bool_t using_ct; \
\
\
/* SAFETY CHECK: The higher level implementation should never
allow an alpha with non-zero imaginary component to be passed
in, because it can't be applied properly using the 1m method.
If alpha is not real, then something is very wrong. */ \
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
\
/* Sanity check: These should never occur because storage/preference
agreement is handled at a higher level. */ \
/*
if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \
*/ \
\
\
/* If beta has a non-zero imaginary component OR if c is stored with
general stride, then we compute the alpha*a*b product into temporary
storage and then accumulate that result into c afterwards. Note that
the other two cases concerning disagreement between the storage of C
and the output preference of the micro-kernel, should never occur
(though we could handle them if they did occur). */ \
if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \
/*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \
else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \
else using_ct = FALSE; \
\
\
if ( using_ct ) \
{ \
/* Set the strides of ct based on the preference of the underlying
native real domain gemm micro-kernel. Note that we set the ct
strides in units of complex elements. */ \
if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \
else { rs_ct = nr; cs_ct = 1; } \
\
/* Compute alpha*a*b into ct (beta of zero); the full complex beta
is applied during the accumulation loop at the end. */ \
beta_use = *zero_r; \
c_use = ( ctype_r* )ct; \
rs_c_use = rs_ct; \
cs_c_use = cs_ct; \
} \
else \
{ \
/* In a typical case, we use the real part of beta and accumulate
directly into the output matrix c. */ \
beta_use = beta_r; \
c_use = ( ctype_r* )c; \
rs_c_use = rs_c; \
cs_c_use = cs_c; \
} \
\
\
/* Convert the strides from being in units of complex elements to
be in units of real elements. Note that we don't need to check for
general storage here because that case corresponds to the scenario
where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
else rs_c_use *= 2; \
\
\
/* The following gemm micro-kernel call implements the 1m method,
which induces a complex matrix multiplication by calling the
real matrix micro-kernel on micro-panels that have been packed
according to the 1e and 1r formats. */ \
\
/* c = beta * c + alpha_r * a * b; */ \
rgemm_ukr \
( \
k2, \
alpha_r, \
a_r, \
b_r, \
&beta_use, \
c_use, rs_c_use, cs_c_use, \
data, \
cntx \
); \
\
\
/* If necessary, accumulate the final result in ct back to c,
applying the full complex beta via xpbys: c = ct + beta * c. */ \
if ( using_ct ) \
{ \
dim_t i, j; \
\
for ( j = 0; j < nr; ++j ) \
for ( i = 0; i < mr; ++i ) \
{ \
PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \
*beta, \
*(c + i*rs_c + j*cs_c ) ); \
} \
} \
}
INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR )

View File

@@ -78,7 +78,7 @@ void PASTEMAC(ch,varname) \
\
const dim_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
\
const pack_t schema_b = bli_cntx_schema_b( cntx ); \
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
\
const dim_t k2 = 2 * k; \
\

View File

@@ -67,7 +67,7 @@ void PASTEMAC(ch,varname) \
const inc_t ld_a = cs_a; \
const inc_t ld_b = rs_b; \
\
const pack_t schema_b = bli_cntx_schema_b( cntx ); \
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
\
dim_t iter, i, j, l; \
dim_t n_behind; \
@@ -273,7 +273,7 @@ void PASTEMAC(ch,varname) \
const inc_t ld_a = cs_a; \
const inc_t ld_b = rs_b; \
\
const pack_t schema_b = bli_cntx_schema_b( cntx ); \
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
\
dim_t iter, i, j, l; \
dim_t n_behind; \