mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Added 1m-specific APIs for bp, pb gemm algorithms.
Details:
- Defined bli_gemmbp_cntl_create(), bli_gemmpb_cntl_create(), with the
body of bli_gemm_cntl_create() replaced with a call to the former.
- Defined bli_cntl_free_w_thrinfo(), bli_cntl_free_wo_thrinfo(). Now,
bli_cntl_free() checks whether the thread parameter is NULL; if so, it
calls the latter, and otherwise it calls the former.
- Defined bli_gemm1mbp_cntx_init(), bli_gemm1mpb_cntx_init(), both in
terms of bli_gemm1mxx_cntx_init(), which behaves the same as
bli_gemm1m_cntx_init() did before, except that an extra bool parameter
(is_pb) is used to support both bp and pb algorithms (including to
support the anti-preference field described below).
- Added support for "anti-preference" in context. The anti_pref field,
when true, will toggle the boolean return value of routines such as
bli_cntx_l3_ukr_eff_prefers_storage_of(), which has the net effect of
causing BLIS to transpose the operation to achieve disagreement (rather
than agreement) between the storage of C and the micro-kernel output
preference. This disagreement is needed for panel-block implementations,
since they induce a transposition of the suboperation immediately before
the macro-kernel is called, which changes the apparent storage of C. For
now, anti-preference is used only with the pb algorithm for 1m (and not
with any non-1m implementation).
- Defined new functions,
bli_cntx_l3_ukr_eff_prefers_storage_of()
bli_cntx_l3_ukr_eff_dislikes_storage_of()
bli_cntx_l3_nat_ukr_eff_prefers_storage_of()
bli_cntx_l3_nat_ukr_eff_dislikes_storage_of()
which are identical to their non-"eff" (effectively) counterparts except
that they take the anti-preference field of the context into account.
- Explicitly initialize the anti-pref field to FALSE in
bli_gks_cntx_set_l3_nat_ukr_prefs().
- Added bli_gemm_ker_var1.c, which implements a panel-block macro-kernel
in terms of the existing block-panel macro-kernel _ker_var2(). This
technique requires inducing transposes on all operands and swapping
A and B.
- Changed bli_obj_induce_trans() macro so that pack-related fields are
also changed to reflect the induced transposition.
- Added a temporary hack to bli_l3_3m4m1m_oapi.c that allows us to easily
specify the 1m algorithm (block-panel or panel-block).
- Renamed the following cntx_t-related macros:
bli_cntx_get_pack_schema_a() -> bli_cntx_get_pack_schema_a_block()
bli_cntx_get_pack_schema_b() -> bli_cntx_get_pack_schema_b_panel()
bli_cntx_get_pack_schema_c() -> bli_cntx_get_pack_schema_c_panel()
and updated all instantiations. Also updated the field names in the
cntx_t struct.
- Comment updates.
This commit is contained in:
committed by
prangana
parent
1d728ccb23
commit
4f61528d56
@@ -121,11 +121,11 @@ siz_t bli_packm_init
|
||||
|
||||
if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK )
|
||||
{
|
||||
schema = bli_cntx_get_pack_schema_a( cntx );
|
||||
schema = bli_cntx_get_pack_schema_a_block( cntx );
|
||||
}
|
||||
else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL )
|
||||
{
|
||||
schema = bli_cntx_get_pack_schema_b( cntx );
|
||||
schema = bli_cntx_get_pack_schema_b_panel( cntx );
|
||||
}
|
||||
else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
|
||||
{
|
||||
|
||||
@@ -70,8 +70,8 @@ void bli_l3_cntl_create_if
|
||||
else
|
||||
{
|
||||
// If the user provided a control tree, create a copy and use it
|
||||
// instead (so that it can be used to cache things like pack mem_t
|
||||
// entries).
|
||||
// instead (so that threads can use its local tree as a place to
|
||||
// cache things like pack mem_t entries).
|
||||
*cntl_use = bli_cntl_copy( cntl_orig );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,9 +63,8 @@ void bli_gemm_cntx_init( num_t dt, cntx_t* cntx )
|
||||
cntx );
|
||||
|
||||
// Set the pack_t schemas for native execution.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
|
||||
}
|
||||
|
||||
void bli_gemm_cntx_finalize( cntx_t* cntx )
|
||||
@@ -106,9 +105,8 @@ void bli_trsm_cntx_init( num_t dt, cntx_t* cntx )
|
||||
cntx );
|
||||
|
||||
// Set the pack_t schemas for native execution.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
|
||||
}
|
||||
|
||||
void bli_trsm_cntx_finalize( cntx_t* cntx )
|
||||
|
||||
@@ -39,8 +39,17 @@ cntl_t* bli_gemm_cntl_create
|
||||
opid_t family
|
||||
)
|
||||
{
|
||||
void* macro_kernel_p = bli_gemm_ker_var2;
|
||||
return bli_gemmbp_cntl_create( family );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_gemmbp_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
)
|
||||
{
|
||||
void* macro_kernel_p = bli_gemm_ker_var2;
|
||||
|
||||
// Change the macro-kernel if the operation family is herk or trmm.
|
||||
if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
|
||||
@@ -64,7 +73,7 @@ cntl_t* bli_gemm_cntl_create
|
||||
// Create a node for packing matrix A.
|
||||
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
|
||||
(
|
||||
bli_gemm_packa,
|
||||
bli_gemm_packa, // pack the left-hand operand
|
||||
bli_packm_blk_var1,
|
||||
BLIS_MR,
|
||||
BLIS_KR,
|
||||
@@ -87,7 +96,7 @@ cntl_t* bli_gemm_cntl_create
|
||||
// Create a node for packing matrix B.
|
||||
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
|
||||
(
|
||||
bli_gemm_packb,
|
||||
bli_gemm_packb, // pack the right-hand operand
|
||||
bli_packm_blk_var1,
|
||||
BLIS_KR,
|
||||
BLIS_NR,
|
||||
@@ -118,6 +127,95 @@ cntl_t* bli_gemm_cntl_create
|
||||
return gemm_cntl_vl_mm;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// Build the control tree for the panel-block (pb) gemm algorithm and return
// its root node. From root to leaf, the nodes created below are:
//   bli_gemm_blk_var1 (BLIS_NC) -> bli_gemm_blk_var3 (BLIS_KC) ->
//   bli_gemm_packa -> bli_gemm_blk_var2 (BLIS_MC) -> bli_gemm_packb ->
//   macro-kernel bli_gemm_ker_var1 (BLIS_NR) -> leaf node (BLIS_MR).
// The nodes are created leaf-first so that each one can be passed as the
// sub-node of the next.
cntl_t* bli_gemmpb_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
)
|
||||
{
|
||||
void* macro_kernel_p = bli_gemm_ker_var1;
|
||||
|
||||
// Change the macro-kernel if the operation family is herk or trmm.
// NOTE: the overrides below are commented out, so bli_gemm_ker_var1 is
// always used and the family argument is currently not read.
|
||||
//if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
|
||||
//else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
|
||||
|
||||
// Create two nodes for the macro-kernel.
|
||||
cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create
|
||||
(
|
||||
BLIS_MR, // needed for bli_thrinfo_rgrow()
|
||||
NULL, // variant function pointer not used
|
||||
NULL // no sub-node; this is the leaf of the tree.
|
||||
);
|
||||
|
||||
cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create
|
||||
(
|
||||
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
|
||||
macro_kernel_p,
|
||||
gemm_cntl_ub_ke
|
||||
);
|
||||
|
||||
// Create a node for packing matrix A (which is really the right-hand
|
||||
// operand "B").
|
||||
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
|
||||
(
|
||||
bli_gemm_packb, // pack the right-hand operand
|
||||
bli_packm_blk_var1,
|
||||
BLIS_KR,
|
||||
BLIS_MR,
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
BLIS_BUFFER_FOR_A_BLOCK,
|
||||
gemm_cntl_pb_ub
|
||||
);
|
||||
|
||||
// Create a node for partitioning the n dimension by MC.
|
||||
cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create
|
||||
(
|
||||
BLIS_MC,
|
||||
bli_gemm_blk_var2,
|
||||
gemm_cntl_packb
|
||||
);
|
||||
|
||||
// Create a node for packing matrix B (which is really the left-hand
|
||||
// operand "A").
|
||||
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
|
||||
(
|
||||
bli_gemm_packa, // pack the left-hand operand
|
||||
bli_packm_blk_var1,
|
||||
BLIS_NR,
|
||||
BLIS_KR,
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
gemm_cntl_op_pb
|
||||
);
|
||||
|
||||
// Create a node for partitioning the k dimension by KC.
|
||||
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create
|
||||
(
|
||||
BLIS_KC,
|
||||
bli_gemm_blk_var3,
|
||||
gemm_cntl_packa
|
||||
);
|
||||
|
||||
// Create a node for partitioning the m dimension by NC.
|
||||
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create
|
||||
(
|
||||
BLIS_NC,
|
||||
bli_gemm_blk_var1,
|
||||
gemm_cntl_mm_op
|
||||
);
|
||||
|
||||
// Return the root of the tree (the outermost partitioning node).
return gemm_cntl_vl_mm;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_gemm_cntl_free
|
||||
(
|
||||
cntl_t* cntl,
|
||||
|
||||
@@ -37,6 +37,20 @@ cntl_t* bli_gemm_cntl_create
|
||||
opid_t family
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_gemmbp_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
);
|
||||
|
||||
cntl_t* bli_gemmpb_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_gemm_cntl_free
|
||||
(
|
||||
cntl_t* cntl,
|
||||
|
||||
@@ -46,11 +46,10 @@ void bli_gemm_front
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
|
||||
#ifdef BLIS_SMALL_MATRIX_ENABLE
|
||||
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||
gint_t status = bli_gemm_small_matrix(alpha, a, b, beta, c, cntx, cntl);
|
||||
if(BLIS_SUCCESS != status)
|
||||
#endif
|
||||
#endif
|
||||
{
|
||||
obj_t a_local;
|
||||
@@ -90,9 +89,6 @@ void bli_gemm_front
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
// Set the operation family id in the context.
|
||||
bli_cntx_set_family( BLIS_GEMM, cntx );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx,
|
||||
bli_obj_length( c_local ),
|
||||
@@ -103,6 +99,7 @@ void bli_gemm_front
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_GEMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
|
||||
56
frame/3/gemm/bli_gemm_ker_var1.c
Normal file
56
frame/3/gemm/bli_gemm_ker_var1.c
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// Panel-block gemm macro-kernel, expressed as a call to the block-panel
// macro-kernel bli_gemm_ker_var2() on the transposed suboperation.
// All six arguments are forwarded to bli_gemm_ker_var2(), with a and b
// exchanged.
void bli_gemm_ker_var1
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
// Implement _ker_var1() in terms of _ker_var2() by transposing the
|
||||
// entire suboperation (which also requires swapping A and B).
// NOTE: bli_obj_induce_trans() is applied to the objects that a, b, and c
// point to, so the caller's objects are modified in place and are left in
// their transposed state when this function returns.
|
||||
|
||||
bli_obj_induce_trans( *a );
|
||||
bli_obj_induce_trans( *b );
|
||||
bli_obj_induce_trans( *c );
|
||||
|
||||
// Note the argument order: b is passed first and a second.
bli_gemm_ker_var2( b, a, c, cntx, cntl, thread );
|
||||
}
|
||||
|
||||
@@ -56,6 +56,7 @@ GENPROT( gemm_blk_var3 )
|
||||
GENPROT( gemm_packa )
|
||||
GENPROT( gemm_packb )
|
||||
|
||||
GENPROT( gemm_ker_var1 )
|
||||
GENPROT( gemm_ker_var2 )
|
||||
|
||||
// Headers for induced algorithms:
|
||||
|
||||
@@ -97,6 +97,16 @@ void bli_cntl_free
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
if ( thread != NULL ) bli_cntl_free_w_thrinfo( cntl, thread );
|
||||
else bli_cntl_free_wo_thrinfo( cntl );
|
||||
}
|
||||
|
||||
void bli_cntl_free_w_thrinfo
|
||||
(
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
// Base case: simply return when asked to free NULL nodes.
|
||||
if ( cntl == NULL ) return;
|
||||
@@ -112,7 +122,7 @@ void bli_cntl_free
|
||||
{
|
||||
// Recursively free all memory associated with the sub-node and its
|
||||
// children.
|
||||
bli_cntl_free( cntl_sub_node, thread_sub_node );
|
||||
bli_cntl_free_w_thrinfo( cntl_sub_node, thread_sub_node );
|
||||
}
|
||||
|
||||
// Free the current node's params field, if it is non-NULL.
|
||||
@@ -122,8 +132,8 @@ void bli_cntl_free
|
||||
}
|
||||
|
||||
// Release the current node's pack mem_t entry back to the memory
|
||||
// broker from which it originated, but only if the current thread
|
||||
// is chief for its group, and only if the mem_t is allocated.
|
||||
// broker from which it originated, but only if the mem_t entry is
|
||||
// allocated, and only if the current thread is chief for its group.
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
if ( bli_mem_is_alloc( cntl_pack_mem ) )
|
||||
{
|
||||
@@ -134,6 +144,42 @@ void bli_cntl_free
|
||||
bli_cntl_obj_free( cntl );
|
||||
}
|
||||
|
||||
// Recursively free a control tree when no thrinfo_t is available.
// For the given node this frees its sub-node (and, through recursion, that
// node's descendants), its params field if non-NULL, its pack mem_t entry
// if allocated, and finally the node itself. A NULL cntl is a no-op.
// Unlike the thrinfo-based path, the mem_t release here is not conditioned
// on bli_thread_am_ochief(), since there is no thread object to query.
void bli_cntl_free_wo_thrinfo
|
||||
(
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
// Base case: simply return when asked to free NULL nodes.
|
||||
if ( cntl == NULL ) return;
|
||||
|
||||
// Read everything needed from the node up front; the node itself is
// freed at the end of this function.
cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl );
|
||||
void* cntl_params = bli_cntl_params( cntl );
|
||||
mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl );
|
||||
|
||||
{
|
||||
// Recursively free all memory associated with the sub-node and its
|
||||
// children.
|
||||
bli_cntl_free_wo_thrinfo( cntl_sub_node );
|
||||
}
|
||||
|
||||
// Free the current node's params field, if it is non-NULL.
|
||||
if ( cntl_params != NULL )
|
||||
{
|
||||
bli_free_intl( cntl_params );
|
||||
}
|
||||
|
||||
// Release the current node's pack mem_t entry back to the memory
|
||||
// broker from which it originated, but only if the mem_t entry is
|
||||
// allocated.
|
||||
if ( bli_mem_is_alloc( cntl_pack_mem ) )
|
||||
{
|
||||
bli_membrk_release( cntl_pack_mem );
|
||||
}
|
||||
|
||||
// Free the current node.
|
||||
bli_cntl_obj_free( cntl );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_cntl_copy
|
||||
|
||||
@@ -75,12 +75,25 @@ void bli_cntl_obj_clear
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_cntl_free
|
||||
(
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
void bli_cntl_free_w_thrinfo
|
||||
(
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
void bli_cntl_free_wo_thrinfo
|
||||
(
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
cntl_t* bli_cntl_copy
|
||||
(
|
||||
cntl_t* cntl
|
||||
|
||||
@@ -330,14 +330,24 @@ ind_t bli_cntx_get_ind_method( cntx_t* cntx )
|
||||
return bli_cntx_method( cntx );
|
||||
}
|
||||
|
||||
pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx )
|
||||
pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx )
|
||||
{
|
||||
return bli_cntx_schema_a( cntx );
|
||||
return bli_cntx_schema_a_block( cntx );
|
||||
}
|
||||
|
||||
pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx )
|
||||
pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx )
|
||||
{
|
||||
return bli_cntx_schema_b( cntx );
|
||||
return bli_cntx_schema_b_panel( cntx );
|
||||
}
|
||||
|
||||
// Return the pack schema stored in the context for panels of C, as read by
// bli_cntx_schema_c_panel().
pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx )
|
||||
{
|
||||
return bli_cntx_schema_c_panel( cntx );
|
||||
}
|
||||
|
||||
// Return the context's anti-preference flag, as read by
// bli_cntx_anti_pref().
bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx )
|
||||
{
|
||||
return bli_cntx_anti_pref( cntx );
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -705,31 +715,39 @@ void bli_cntx_set_ind_method( ind_t method,
|
||||
bli_cntx_set_method( method, cntx );
|
||||
}
|
||||
|
||||
void bli_cntx_set_pack_schema_ab( pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_set_schema_a( schema_a, cntx );
|
||||
bli_cntx_set_schema_b( schema_b, cntx );
|
||||
bli_cntx_set_schema_a_block( schema_a, cntx );
|
||||
bli_cntx_set_schema_b_panel( schema_b, cntx );
|
||||
}
|
||||
|
||||
void bli_cntx_set_pack_schema_a( pack_t schema_a,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_pack_schema_a_block( pack_t schema_a,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_set_schema_a( schema_a, cntx );
|
||||
bli_cntx_set_schema_a_block( schema_a, cntx );
|
||||
}
|
||||
|
||||
void bli_cntx_set_pack_schema_b( pack_t schema_b,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_pack_schema_b_panel( pack_t schema_b,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_set_schema_b( schema_b, cntx );
|
||||
bli_cntx_set_schema_b_panel( schema_b, cntx );
|
||||
}
|
||||
|
||||
void bli_cntx_set_pack_schema_c( pack_t schema_c,
|
||||
void bli_cntx_set_pack_schema_c_panel( pack_t schema_c,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_set_schema_c_panel( schema_c, cntx );
|
||||
}
|
||||
|
||||
#if 0
|
||||
void bli_cntx_set_ukr_anti_pref( bool_t anti_pref,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_set_schema_c( schema_c, cntx );
|
||||
bli_cntx_set_anti_pref( anti_pref, cntx );
|
||||
}
|
||||
#endif
|
||||
|
||||
void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
|
||||
dim_t m, dim_t n, dim_t k )
|
||||
@@ -904,6 +922,32 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
|
||||
return r_val;
|
||||
}
|
||||
|
||||
// "Effective" variant of bli_cntx_l3_nat_ukr_prefers_storage_of(): returns
// that routine's result for (obj, ukr_id, cntx), logically negated when the
// context's anti-preference field is set.
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
// If the anti-preference is set, negate the result.
|
||||
if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val;
|
||||
|
||||
return r_val;
|
||||
}
|
||||
|
||||
// "Effective" variant of bli_cntx_l3_nat_ukr_dislikes_storage_of(): returns
// that routine's result for (obj, ukr_id, cntx), logically negated when the
// context's anti-preference field is set.
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
// If the anti-preference is set, negate the result.
|
||||
if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val;
|
||||
|
||||
return r_val;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
@@ -953,6 +997,30 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
|
||||
return r_val;
|
||||
}
|
||||
|
||||
// "Effective" variant of bli_cntx_l3_ukr_prefers_storage_of(): returns that
// routine's result for (obj, ukr_id, cntx), logically negated when the
// context's anti-preference field is set.
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
// If the anti-preference is set, negate the result.
|
||||
if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val;
|
||||
|
||||
return r_val;
|
||||
}
|
||||
|
||||
// "Effective" variant of bli_cntx_l3_ukr_dislikes_storage_of(): returns that
// routine's result for (obj, ukr_id, cntx), logically negated when the
// context's anti-preference field is set.
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
// If the anti-preference is set, negate the result.
|
||||
if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val;
|
||||
|
||||
return r_val;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_cntx_print( cntx_t* cntx )
|
||||
|
||||
@@ -59,6 +59,8 @@ typedef struct cntx_s
|
||||
pack_t schema_b;
|
||||
pack_t schema_c;
|
||||
|
||||
bool_t anti_pref;
|
||||
|
||||
dim_t* thrloop;
|
||||
|
||||
membrk_t* membrk;
|
||||
@@ -113,26 +115,30 @@ typedef struct cntx_s
|
||||
\
|
||||
( (cntx)->method )
|
||||
|
||||
#define bli_cntx_schema_a( cntx ) \
|
||||
#define bli_cntx_schema_a_block( cntx ) \
|
||||
\
|
||||
( (cntx)->schema_a )
|
||||
( (cntx)->schema_a_block )
|
||||
|
||||
#define bli_cntx_schema_b( cntx ) \
|
||||
#define bli_cntx_schema_b_panel( cntx ) \
|
||||
\
|
||||
( (cntx)->schema_b )
|
||||
( (cntx)->schema_b_panel )
|
||||
|
||||
#define bli_cntx_schema_c( cntx ) \
|
||||
#define bli_cntx_schema_c_panel( cntx ) \
|
||||
\
|
||||
( (cntx)->schema_c )
|
||||
( (cntx)->schema_c_panel )
|
||||
|
||||
#define bli_cntx_membrk( cntx ) \
|
||||
#define bli_cntx_anti_pref( cntx ) \
|
||||
\
|
||||
( (cntx)->membrk )
|
||||
( (cntx)->anti_pref )
|
||||
|
||||
#define bli_cntx_thrloop( cntx ) \
|
||||
\
|
||||
( (cntx)->thrloop )
|
||||
|
||||
#define bli_cntx_membrk( cntx ) \
|
||||
\
|
||||
( (cntx)->membrk )
|
||||
|
||||
#if 1
|
||||
#define bli_cntx_jc_way( cntx ) \
|
||||
\
|
||||
@@ -211,24 +217,24 @@ typedef struct cntx_s
|
||||
(cntx_p)->method = _method; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_schema_a( _schema_a, cntx_p ) \
|
||||
#define bli_cntx_set_schema_a_block( _schema_a_block, cntx_p ) \
|
||||
{ \
|
||||
(cntx_p)->schema_a = _schema_a; \
|
||||
(cntx_p)->schema_a_block = _schema_a_block; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_schema_b( _schema_b, cntx_p ) \
|
||||
#define bli_cntx_set_schema_b_panel( _schema_b_panel, cntx_p ) \
|
||||
{ \
|
||||
(cntx_p)->schema_b = _schema_b; \
|
||||
(cntx_p)->schema_b_panel = _schema_b_panel; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_schema_c( _schema_c, cntx_p ) \
|
||||
#define bli_cntx_set_schema_c_panel( _schema_c_panel, cntx_p ) \
|
||||
{ \
|
||||
(cntx_p)->schema_c = _schema_c; \
|
||||
(cntx_p)->schema_c_panel = _schema_c_panel; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_membrk( _membrk, cntx_p ) \
|
||||
#define bli_cntx_set_anti_pref( _anti_pref, cntx_p ) \
|
||||
{ \
|
||||
(cntx_p)->membrk = _membrk; \
|
||||
(cntx_p)->anti_pref = _anti_pref; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_thrloop( jc_, pc_, ic_, jr_, ir_, cntx_p ) \
|
||||
@@ -241,6 +247,11 @@ typedef struct cntx_s
|
||||
(cntx_p)->thrloop[ BLIS_KR ] = 1; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_membrk( _membrk, cntx_p ) \
|
||||
{ \
|
||||
(cntx_p)->membrk = _membrk; \
|
||||
}
|
||||
|
||||
// cntx_t query (complex)
|
||||
|
||||
#define bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ) \
|
||||
@@ -323,13 +334,17 @@ typedef struct cntx_s
|
||||
\
|
||||
bli_cntx_method( cntx )
|
||||
|
||||
#define bli_cntx_get_pack_schema_a( cntx ) \
|
||||
#define bli_cntx_get_pack_schema_a_block( cntx ) \
|
||||
\
|
||||
bli_cntx_schema_a( cntx )
|
||||
bli_cntx_schema_a_block( cntx )
|
||||
|
||||
#define bli_cntx_get_pack_schema_b( cntx ) \
|
||||
#define bli_cntx_get_pack_schema_b_panel( cntx ) \
|
||||
\
|
||||
bli_cntx_schema_b( cntx )
|
||||
bli_cntx_schema_b_panel( cntx )
|
||||
|
||||
#define bli_cntx_get_pack_schema_c_panel( cntx ) \
|
||||
\
|
||||
bli_cntx_schema_c_panel( cntx )
|
||||
|
||||
#define bli_cntx_get_membrk( cntx ) \
|
||||
\
|
||||
@@ -395,9 +410,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx );
|
||||
// l1vkr_t ker_id,
|
||||
// cntx_t* cntx );
|
||||
//ind_t bli_cntx_get_ind_method( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_c( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx );
|
||||
//bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx );
|
||||
dim_t bli_cntx_get_num_threads( cntx_t* cntx );
|
||||
dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl );
|
||||
|
||||
@@ -425,15 +441,17 @@ void bli_cntx_set_packm_ukr( func_t* func,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_ind_method( ind_t method,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_ab( pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_a( pack_t schema_a,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_b( pack_t schema_b,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_c( pack_t schema_c,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_a_block( pack_t schema_a,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_b_panel( pack_t schema_b,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_c_panel( pack_t schema_c,
|
||||
cntx_t* cntx );
|
||||
//void bli_cntx_set_ukr_anti_pref( bool_t anti_pref,
|
||||
// cntx_t* cntx );
|
||||
void bli_cntx_set_thrloop_from_env( opid_t l3_op,
|
||||
side_t side,
|
||||
cntx_t* cntx,
|
||||
@@ -455,6 +473,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj,
|
||||
bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
@@ -467,6 +491,12 @@ bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj,
|
||||
bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
|
||||
// print function
|
||||
|
||||
|
||||
@@ -606,6 +606,9 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr,
|
||||
mbool_t* cntx_l3_nat_ukr_pref = &cntx_l3_nat_ukr_prefs[ ukr ];
|
||||
|
||||
bli_gks_get_l3_nat_ukr_prefs( ukr, cntx_l3_nat_ukr_pref );
|
||||
|
||||
// Explicitly set the anti-preference to FALSE.
|
||||
bli_cntx_set_anti_pref( FALSE, cntx );
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -877,6 +877,12 @@ bli_obj_width_stored( obj )
|
||||
(obj).n_panel = n0; \
|
||||
}
|
||||
|
||||
#define bli_obj_set_panel_dims( m0, n0, obj ) \
|
||||
{ \
|
||||
bli_obj_set_panel_length( m0, obj ); \
|
||||
bli_obj_set_panel_width( n0, obj ); \
|
||||
}
|
||||
|
||||
#define bli_obj_set_panel_dim( panel_dim, obj ) \
|
||||
{ \
|
||||
(obj).pd = panel_dim; \
|
||||
@@ -985,6 +991,7 @@ bli_obj_width_stored( obj )
|
||||
#define bli_obj_induce_trans( obj ) \
|
||||
{ \
|
||||
{ \
|
||||
/* Induce transposition among basic fields. */ \
|
||||
dim_t m_ = bli_obj_length( obj ); \
|
||||
dim_t n_ = bli_obj_width( obj ); \
|
||||
inc_t rs_ = bli_obj_row_stride( obj ); \
|
||||
@@ -1000,6 +1007,15 @@ bli_obj_width_stored( obj )
|
||||
\
|
||||
if ( bli_obj_is_upper_or_lower( obj ) ) \
|
||||
bli_obj_toggle_uplo( obj ); \
|
||||
\
|
||||
/* Induce transposition among packed fields. */ \
|
||||
dim_t m_padded_ = bli_obj_padded_length( obj ); \
|
||||
dim_t n_padded_ = bli_obj_padded_width( obj ); \
|
||||
dim_t m_panel_ = bli_obj_panel_length( obj ); \
|
||||
dim_t n_panel_ = bli_obj_panel_width( obj ); \
|
||||
\
|
||||
bli_obj_set_padded_dims( n_padded_, m_padded_, obj ); \
|
||||
bli_obj_set_panel_dims( n_panel_, m_panel_, obj ); \
|
||||
\
|
||||
/* Note that this macro DOES NOT touch the transposition bit! If
|
||||
the calling code is using this macro to handle an object whose
|
||||
|
||||
@@ -975,9 +975,11 @@ typedef struct cntx_s
|
||||
|
||||
opid_t family;
|
||||
ind_t method;
|
||||
pack_t schema_a;
|
||||
pack_t schema_b;
|
||||
pack_t schema_c;
|
||||
pack_t schema_a_block;
|
||||
pack_t schema_b_panel;
|
||||
pack_t schema_c_panel;
|
||||
|
||||
bool_t anti_pref;
|
||||
|
||||
dim_t thrloop[ BLIS_NUM_LOOPS ];
|
||||
|
||||
|
||||
@@ -151,9 +151,8 @@ void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI,
|
||||
BLIS_PACKED_COL_PANELS_3MI,
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx );
|
||||
}
|
||||
|
||||
void bli_gemm3m1_cntx_stage( dim_t stage, cntx_t* cntx )
|
||||
@@ -200,9 +199,8 @@ void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MS,
|
||||
BLIS_PACKED_COL_PANELS_3MI,
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MS, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx );
|
||||
}
|
||||
|
||||
void bli_gemm3m2_cntx_stage( dim_t stage, cntx_t* cntx )
|
||||
@@ -249,9 +247,8 @@ void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage()
|
||||
BLIS_PACKED_COL_PANELS_3MS,
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage()
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MS, cntx );
|
||||
}
|
||||
|
||||
void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx )
|
||||
@@ -259,15 +256,15 @@ void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx )
|
||||
// Set the pack_t schemas as a function of the stage of execution.
|
||||
if ( stage == 0 )
|
||||
{
|
||||
bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RO, cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
|
||||
}
|
||||
else if ( stage == 1 )
|
||||
{
|
||||
bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_IO, cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
|
||||
}
|
||||
else // if ( stage == 2 )
|
||||
{
|
||||
bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RPI, cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -311,9 +308,8 @@ void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage()
|
||||
0, // not yet needed; varies with _stage()
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage()
|
||||
bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage()
|
||||
}
|
||||
|
||||
void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx )
|
||||
@@ -321,18 +317,18 @@ void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx )
|
||||
// Set the pack_t schemas as a function of the stage of execution.
|
||||
if ( stage == 0 )
|
||||
{
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO,
|
||||
BLIS_PACKED_COL_PANELS_RO, cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
|
||||
}
|
||||
else if ( stage == 1 )
|
||||
{
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO,
|
||||
BLIS_PACKED_COL_PANELS_IO, cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
|
||||
}
|
||||
else // if ( stage == 2 )
|
||||
{
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RPI,
|
||||
BLIS_PACKED_COL_PANELS_RPI, cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RPI, cntx );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -376,9 +372,8 @@ void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI,
|
||||
BLIS_PACKED_COL_PANELS_4MI,
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx );
|
||||
}
|
||||
|
||||
void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx )
|
||||
@@ -425,9 +420,8 @@ void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI,
|
||||
BLIS_PACKED_COL_PANELS_4MI,
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx );
|
||||
}
|
||||
|
||||
void bli_gemm4mb_cntx_stage( dim_t stage, cntx_t* cntx )
|
||||
@@ -474,9 +468,8 @@ void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage()
|
||||
0, // not yet needed; varies with _stage()
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage()
|
||||
bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage()
|
||||
}
|
||||
|
||||
void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx )
|
||||
@@ -484,23 +477,23 @@ void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx )
|
||||
// Set the pack_t schemas as a function of the stage of execution.
|
||||
if ( stage == 0 )
|
||||
{
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO,
|
||||
BLIS_PACKED_COL_PANELS_RO, cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
|
||||
}
|
||||
else if ( stage == 1 )
|
||||
{
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO,
|
||||
BLIS_PACKED_COL_PANELS_IO, cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
|
||||
}
|
||||
else if ( stage == 2 )
|
||||
{
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO,
|
||||
BLIS_PACKED_COL_PANELS_IO, cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
|
||||
}
|
||||
else // if ( stage == 3 )
|
||||
{
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO,
|
||||
BLIS_PACKED_COL_PANELS_RO, cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -511,6 +504,22 @@ void bli_gemm4mh_cntx_finalize( cntx_t* cntx )
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx )
|
||||
{
|
||||
// Default to context for block-panel algorithm.
|
||||
bli_gemm1mbp_cntx_init( dt, cntx );
|
||||
}
|
||||
|
||||
void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx )
|
||||
{
|
||||
bli_gemm1mxx_cntx_init( dt, FALSE, cntx );
|
||||
}
|
||||
|
||||
void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx )
|
||||
{
|
||||
bli_gemm1mxx_cntx_init( dt, TRUE, cntx );
|
||||
}
|
||||
|
||||
void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx )
|
||||
{
|
||||
const ind_t method = BLIS_1M;
|
||||
|
||||
@@ -529,8 +538,24 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx )
|
||||
// Initialize the context with packm-related kernels.
|
||||
bli_packm_cntx_init( dt, cntx );
|
||||
|
||||
// Initialize the blocksizes according to the micro-kernel preference as
|
||||
// well as the algorithm.
|
||||
if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
// This branch is used for algorithms 1m_c_bp, 1m_r_pb.
|
||||
|
||||
// Set the pack_t schemas for the c_bp or r_pb algorithms.
|
||||
if ( !is_pb )
|
||||
{
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx );
|
||||
}
|
||||
else // if ( is_pb )
|
||||
{
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1R, cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1E, cntx );
|
||||
}
|
||||
|
||||
// Initialize the context with the current architecture's register
|
||||
// and cache blocksizes (and multiples), and the induced method.
|
||||
bli_gks_cntx_set_blkszs
|
||||
@@ -544,14 +569,23 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx )
|
||||
BLIS_KR, BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E,
|
||||
BLIS_PACKED_COL_PANELS_1R,
|
||||
cntx );
|
||||
}
|
||||
else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
// This branch is used for algorithms 1m_r_bp, 1m_c_pb.
|
||||
|
||||
// Set the pack_t schemas for the r_bp or c_pb algorithms.
|
||||
if ( !is_pb )
|
||||
{
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx );
|
||||
}
|
||||
else // if ( is_pb )
|
||||
{
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1E, cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1R, cntx );
|
||||
}
|
||||
|
||||
// Initialize the context with the current architecture's register
|
||||
// and cache blocksizes (and multiples), and the induced method.
|
||||
bli_gks_cntx_set_blkszs
|
||||
@@ -565,12 +599,15 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx )
|
||||
BLIS_KR, BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R,
|
||||
BLIS_PACKED_COL_PANELS_1E,
|
||||
cntx );
|
||||
}
|
||||
|
||||
// Set the anti-preference field to TRUE when executing a panel-block
|
||||
// algorithm, and FALSE otherwise. This will cause higher-level generic
|
||||
// code to establish (if needed) disagreement between the storage of C and
|
||||
// the micro-kernel output preference so that the two will come back into
|
||||
// agreement in the panel-block macro-kernel (which is implemented in terms
|
||||
// of the block-panel macro-kernel with some induced transpositions).
|
||||
bli_cntx_set_anti_pref( is_pb, cntx );
|
||||
}
|
||||
|
||||
void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx )
|
||||
|
||||
@@ -65,6 +65,9 @@ void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx );
|
||||
void bli_gemm4m1_cntx_finalize( cntx_t* cntx );
|
||||
|
||||
void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx );
|
||||
void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx );
|
||||
void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx );
|
||||
void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx );
|
||||
void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx );
|
||||
void bli_gemm1m_cntx_finalize( cntx_t* cntx );
|
||||
|
||||
|
||||
@@ -74,9 +74,9 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for native execution.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI,
|
||||
BLIS_PACKED_COL_PANELS_3MI,
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_3MI,
|
||||
BLIS_PACKED_COL_PANELS_3MI,
|
||||
cntx );
|
||||
}
|
||||
|
||||
void bli_trsm3m1_cntx_finalize( cntx_t* cntx )
|
||||
@@ -123,9 +123,9 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for native execution.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI,
|
||||
BLIS_PACKED_COL_PANELS_4MI,
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_4MI,
|
||||
BLIS_PACKED_COL_PANELS_4MI,
|
||||
cntx );
|
||||
}
|
||||
|
||||
void bli_trsm4m1_cntx_finalize( cntx_t* cntx )
|
||||
@@ -174,9 +174,9 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E,
|
||||
BLIS_PACKED_COL_PANELS_1R,
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E,
|
||||
BLIS_PACKED_COL_PANELS_1R,
|
||||
cntx );
|
||||
}
|
||||
else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
@@ -195,9 +195,9 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R,
|
||||
BLIS_PACKED_COL_PANELS_1E,
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R,
|
||||
BLIS_PACKED_COL_PANELS_1E,
|
||||
cntx );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
85
frame/ind/oapi/bli_l3_1mbppb_oapi.c
Normal file
85
frame/ind/oapi/bli_l3_1mbppb_oapi.c
Normal file
@@ -0,0 +1,85 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// -- gemmbp/gemmpb ------------------------------------------------------------
|
||||
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, imeth, alg ) \
|
||||
\
|
||||
void PASTEMAC2(opname,imeth,alg) \
|
||||
( \
|
||||
obj_t* alpha, \
|
||||
obj_t* a, \
|
||||
obj_t* b, \
|
||||
obj_t* beta, \
|
||||
obj_t* c \
|
||||
) \
|
||||
{ \
|
||||
num_t dt = bli_obj_datatype( *c ); \
|
||||
cntx_t cntx; \
|
||||
cntl_t* cntl_p; \
|
||||
\
|
||||
/* If the objects are in the real domain, execute the native
|
||||
implementation. */ \
|
||||
if ( bli_obj_is_real( *c ) ) \
|
||||
{ \
|
||||
PASTEMAC(opname,nat)( alpha, a, b, beta, c, NULL ); \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
/* Initialize a local 1m context for the current algorithm (bp or pb). */ \
|
||||
PASTEMAC3(opname,imeth,alg,_cntx_init)( dt, &cntx ); \
|
||||
\
|
||||
/* Create a control tree for the current algorithm (bp or pb). */ \
|
||||
cntl_p = PASTEMAC2(opname,alg,_cntl_create)( BLIS_GEMM ); \
|
||||
\
|
||||
/* Invoke the operation's front end using the context and control
|
||||
tree we just created. */ \
|
||||
PASTEMAC(opname,_front)( alpha, a, b, beta, c, &cntx, cntl_p ); \
|
||||
\
|
||||
/* Free the control tree. Since the implementation will only make
|
||||
copies of it (and not use it directly) we do not need to supply
|
||||
a thread object. */ \
|
||||
bli_cntl_free( cntl_p, NULL ); \
|
||||
\
|
||||
/* Finalize the local context. */ \
|
||||
PASTEMAC2(opname,imeth,_cntx_finalize)( &cntx ); \
|
||||
}
|
||||
|
||||
// gemm
|
||||
GENFRONT( gemm, 1m, bp )
|
||||
GENFRONT( gemm, 1m, pb )
|
||||
|
||||
@@ -62,6 +62,14 @@ void PASTEMAC(opname,imeth) \
|
||||
PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx ); \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
/* A temporary hack to easily specify the 1m algorithm (block-panel or
|
||||
panel-block). */ \
|
||||
if ( PASTEMAC(opname,imeth) == bli_gemm1m ) \
|
||||
{ \
|
||||
bli_gemm1mbp( alpha, a, b, beta, c ); \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
/* Initialize a local context if the one provided is NULL. */ \
|
||||
bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \
|
||||
|
||||
@@ -80,3 +80,17 @@ GENPROT_NO2OP( 3m2 )
|
||||
GENPROT_NO2OP( 4mh )
|
||||
GENPROT_NO2OP( 4mb )
|
||||
|
||||
|
||||
//
|
||||
// Generate object-based prototypes for 1m methods that specify an algorithm
|
||||
// (e.g., block-panel or panel-block).
|
||||
//
|
||||
|
||||
#undef GENPROT
|
||||
#define GENPROT( imeth, alg ) \
|
||||
\
|
||||
void PASTEMAC2(gemm,imeth,alg) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c ); \
|
||||
|
||||
GENPROT( 1m, bp )
|
||||
GENPROT( 1m, pb )
|
||||
|
||||
|
||||
@@ -55,7 +55,7 @@ void PASTEMAC(ch,varname) \
|
||||
PASTECH(chr,gemm_ukr_ft) \
|
||||
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t row_pref = !col_pref; \
|
||||
/*const bool_t row_pref = !col_pref;*/ \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
@@ -77,10 +77,8 @@ void PASTEMAC(ch,varname) \
|
||||
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
|
||||
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
|
||||
\
|
||||
const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
|
||||
const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
|
||||
\
|
||||
ctype_r beta_use; \
|
||||
ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \
|
||||
ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \
|
||||
\
|
||||
ctype_r* c_use; \
|
||||
inc_t rs_c_use; \
|
||||
@@ -96,75 +94,71 @@ void PASTEMAC(ch,varname) \
|
||||
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
\
|
||||
/* Sanity check: These should never occur because storage/preference
|
||||
agreement is handled at a higher level. */ \
|
||||
/*
|
||||
if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \
|
||||
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
/* If beta has a non-zero imaginary component OR if c is stored with
|
||||
general stride OR if for some reason the storage of c is not the
|
||||
preferred storage of the micro-kernel, then we compute the
|
||||
alpha*a*b product into temporary storage and then accumulate that
|
||||
result into c afterwards. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \
|
||||
else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
|
||||
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \
|
||||
general stride, then we compute the alpha*a*b product into temporary
|
||||
storage and then accumulate that result into c afterwards. Note that
|
||||
the other two cases concerning disagreement between the storage of C
|
||||
and the output preference of the micro-kernel, should never occur
|
||||
(though we could handle them if they did occur). */ \
|
||||
if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \
|
||||
/*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
|
||||
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \
|
||||
else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \
|
||||
else using_ct = FALSE; \
|
||||
\
|
||||
\
|
||||
if ( using_ct ) \
|
||||
{ \
|
||||
/* In the atypical cases, we compute the result into temporary
|
||||
workspace ct and then accumulate it back to c at the end. */ \
|
||||
\
|
||||
/* Set the strides of ct based on the preference of the underlying
|
||||
native real domain gemm micro-kernel. Note that we set the ct
|
||||
strides in units of complex elements. */ \
|
||||
if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \
|
||||
else { rs_ct = nr; cs_ct = 1; } \
|
||||
\
|
||||
beta_use = *zero_r; \
|
||||
c_use = ( ctype_r* )ct; \
|
||||
rs_c_use = rs_ct; \
|
||||
cs_c_use = cs_ct; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* In a typical case, we use the real part of beta and accumulate
|
||||
directly into the output matrix c. */ \
|
||||
beta_use = beta_r; \
|
||||
c_use = ( ctype_r* )c; \
|
||||
rs_c_use = rs_c; \
|
||||
cs_c_use = cs_c; \
|
||||
} \
|
||||
\
|
||||
/* Convert the strides from being in units of complex elements to
|
||||
be in units of real elements. Note that we don't need to check for
|
||||
general storage here because that case corresponds to the scenario
|
||||
where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
|
||||
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
|
||||
else rs_c_use *= 2; \
|
||||
\
|
||||
/* Convert the strides from being in units of complex elements to
|
||||
be in units of real elements. Note that we don't need to check for
|
||||
general storage here because that case corresponds to the scenario
|
||||
where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
|
||||
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
|
||||
else rs_c_use *= 2; \
|
||||
/* The following gemm micro-kernel call implements the 1m method,
|
||||
which induces a complex matrix multiplication by calling the
|
||||
real matrix micro-kernel on micro-panels that have been packed
|
||||
according to the 1e and 1r formats. */ \
|
||||
\
|
||||
/* c = beta * c + alpha_r * a * b; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k2, \
|
||||
alpha_r, \
|
||||
a_r, \
|
||||
b_r, \
|
||||
zero_r, \
|
||||
c_use, rs_c_use, cs_c_use, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* The following gemm micro-kernel call implements the 1m method,
|
||||
which induces a complex matrix multiplication by calling the
|
||||
real matrix micro-kernel on micro-panels that have been packed
|
||||
according to the 1e and 1r formats. */ \
|
||||
\
|
||||
/* c = beta * c + alpha_r * a * b; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k2, \
|
||||
alpha_r, \
|
||||
a_r, \
|
||||
b_r, \
|
||||
&beta_use, \
|
||||
c_use, rs_c_use, cs_c_use, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
\
|
||||
/* If necessary, accumulate the final result in ct back to c. */ \
|
||||
if ( using_ct ) \
|
||||
{ \
|
||||
dim_t i, j; \
|
||||
\
|
||||
/* Accumulate the final result in ct back to c. */ \
|
||||
for ( j = 0; j < nr; ++j ) \
|
||||
for ( i = 0; i < mr; ++i ) \
|
||||
{ \
|
||||
@@ -173,6 +167,40 @@ void PASTEMAC(ch,varname) \
|
||||
*(c + i*rs_c + j*cs_c ) ); \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* In the typical cases, we use the real part of beta and
|
||||
accumulate directly into the output matrix c. */ \
|
||||
\
|
||||
c_use = ( ctype_r* )c; \
|
||||
rs_c_use = rs_c; \
|
||||
cs_c_use = cs_c; \
|
||||
\
|
||||
/* Convert the strides from being in units of complex elements to
|
||||
be in units of real elements. Note that we don't need to check for
|
||||
general storage here because that case corresponds to the scenario
|
||||
where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
|
||||
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
|
||||
else rs_c_use *= 2; \
|
||||
\
|
||||
/* The following gemm micro-kernel call implements the 1m method,
|
||||
which induces a complex matrix multiplication by calling the
|
||||
real matrix micro-kernel on micro-panels that have been packed
|
||||
according to the 1e and 1r formats. */ \
|
||||
\
|
||||
/* c = beta * c + alpha_r * a * b; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k2, \
|
||||
alpha_r, \
|
||||
a_r, \
|
||||
b_r, \
|
||||
beta_r, \
|
||||
c_use, rs_c_use, cs_c_use, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR )
|
||||
|
||||
188
frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev
Normal file
188
frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev
Normal file
@@ -0,0 +1,188 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
const num_t dt_r = PASTEMAC(chr,type); \
|
||||
\
|
||||
PASTECH(chr,gemm_ukr_ft) \
|
||||
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
/*const bool_t row_pref = !col_pref;*/ \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t k2 = 2 * k; \
|
||||
\
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype_r ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
inc_t rs_ct; \
|
||||
inc_t cs_ct; \
|
||||
\
|
||||
ctype_r* restrict a_r = ( ctype_r* )a; \
|
||||
\
|
||||
ctype_r* restrict b_r = ( ctype_r* )b; \
|
||||
\
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
\
|
||||
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
|
||||
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
|
||||
\
|
||||
const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
|
||||
const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
|
||||
\
|
||||
ctype_r beta_use; \
|
||||
\
|
||||
ctype_r* c_use; \
|
||||
inc_t rs_c_use; \
|
||||
inc_t cs_c_use; \
|
||||
\
|
||||
bool_t using_ct; \
|
||||
\
|
||||
\
|
||||
/* SAFETY CHECK: The higher level implementation should never
|
||||
allow an alpha with non-zero imaginary component to be passed
|
||||
in, because it can't be applied properly using the 1m method.
|
||||
If alpha is not real, then something is very wrong. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
\
|
||||
/* Sanity check: These should never occur because storage/preference
|
||||
agreement is handled at a higher level. */ \
|
||||
/*
|
||||
if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \
|
||||
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
/* If beta has a non-zero imaginary component OR if c is stored with
|
||||
general stride, then we compute the alpha*a*b product into temporary
|
||||
storage and then accumulate that result into c afterwards. Note that
|
||||
the other two cases concerning disagreement between the storage of C
|
||||
and the output preference of the micro-kernel, should never occur
|
||||
(though we could handle them if they did occur). */ \
|
||||
if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \
|
||||
/*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
|
||||
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \
|
||||
else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \
|
||||
else using_ct = FALSE; \
|
||||
\
|
||||
\
|
||||
if ( using_ct ) \
|
||||
{ \
|
||||
/* Set the strides of ct based on the preference of the underlying
|
||||
native real domain gemm micro-kernel. Note that we set the ct
|
||||
strides in units of complex elements. */ \
|
||||
if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \
|
||||
else { rs_ct = nr; cs_ct = 1; } \
|
||||
\
|
||||
beta_use = *zero_r; \
|
||||
c_use = ( ctype_r* )ct; \
|
||||
rs_c_use = rs_ct; \
|
||||
cs_c_use = cs_ct; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* In a typical case, we use the real part of beta and accumulate
|
||||
directly into the output matrix c. */ \
|
||||
beta_use = beta_r; \
|
||||
c_use = ( ctype_r* )c; \
|
||||
rs_c_use = rs_c; \
|
||||
cs_c_use = cs_c; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* Convert the strides from being in units of complex elements to
|
||||
be in units of real elements. Note that we don't need to check for
|
||||
general storage here because that case corresponds to the scenario
|
||||
where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
|
||||
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
|
||||
else rs_c_use *= 2; \
|
||||
\
|
||||
\
|
||||
/* The following gemm micro-kernel call implements the 1m method,
|
||||
which induces a complex matrix multiplication by calling the
|
||||
real matrix micro-kernel on micro-panels that have been packed
|
||||
according to the 1e and 1r formats. */ \
|
||||
\
|
||||
/* c = beta * c + alpha_r * a * b; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k2, \
|
||||
alpha_r, \
|
||||
a_r, \
|
||||
b_r, \
|
||||
&beta_use, \
|
||||
c_use, rs_c_use, cs_c_use, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
\
|
||||
/* If necessary, accumulate the final result in ct back to c. */ \
|
||||
if ( using_ct ) \
|
||||
{ \
|
||||
dim_t i, j; \
|
||||
\
|
||||
for ( j = 0; j < nr; ++j ) \
|
||||
for ( i = 0; i < mr; ++i ) \
|
||||
{ \
|
||||
PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \
|
||||
*beta, \
|
||||
*(c + i*rs_c + j*cs_c ) ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR )
|
||||
|
||||
@@ -78,7 +78,7 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
const dim_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const pack_t schema_b = bli_cntx_schema_b( cntx ); \
|
||||
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
|
||||
\
|
||||
const dim_t k2 = 2 * k; \
|
||||
\
|
||||
|
||||
@@ -67,7 +67,7 @@ void PASTEMAC(ch,varname) \
|
||||
const inc_t ld_a = cs_a; \
|
||||
const inc_t ld_b = rs_b; \
|
||||
\
|
||||
const pack_t schema_b = bli_cntx_schema_b( cntx ); \
|
||||
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
|
||||
\
|
||||
dim_t iter, i, j, l; \
|
||||
dim_t n_behind; \
|
||||
@@ -273,7 +273,7 @@ void PASTEMAC(ch,varname) \
|
||||
const inc_t ld_a = cs_a; \
|
||||
const inc_t ld_b = rs_b; \
|
||||
\
|
||||
const pack_t schema_b = bli_cntx_schema_b( cntx ); \
|
||||
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
|
||||
\
|
||||
dim_t iter, i, j, l; \
|
||||
dim_t n_behind; \
|
||||
|
||||
Reference in New Issue
Block a user