mirror of
https://github.com/amd/blis.git
synced 2026-03-15 23:07:22 +00:00
Allow use of 1m with mixing of row/col-pref ukrs.
Details: - Fixed a bug that broke the use of 1m for dcomplex when the single-precision real and double-precision real ukernels had opposing I/O preferences (row-preferential sgemm ukernel + column-preferential dgemm ukernel, or vice versa). The fix involved adjusting the API to bli_cntx_set_ind_blkszs() so that the induced method context init function (e.g., bli_cntx_init_<subconfig>_ind()) could call that function for only one datatype at a time. This allowed the blocksize scaling (which varies depending on whether we're doing 1m_r or 1m_c) to happen on a per-datatype basis. This fixes issue #557. Thanks to Devin Matthews and RuQing Xu for helping discover and report this bug. - The aforementioned 1m fix required moving the 1m_r/1m_c logic from bli_cntx_ref.c into a new function, bli_l3_set_schemas(), which is called from each level-3 _front() function. The pack_t schemas in the cntx_t were also removed entirely, along with the associated accessor functions. This in turn required updating the trsm1m-related virtual ukernels to read the pack schema for B from the auxinfo_t struct rather than the context. This also required slight tweaks to bli_gemm_md.c. - Repositioned the logic for transposing the operation to accommodate the microkernel IO preference. This mostly only affects gemm. Thanks to Devin Matthews for his help with this. - Updated dpackm pack ukernels in the 'armsve' kernel set to avoid querying pack_t schemas from the context. - Removed the num_t dt argument from the ind_cntx_init_ft type defined in bli_gks.c. The context initialization functions for induced methods were previously passed a dt argument, but I can no longer figure out *why* they were passed this value. To reduce confusion, I've removed the dt argument (including also from the function definition + prototype). - Commented out setting of cntx_t schemas in bli_cntx_ind_stage.c. 
This breaks high-level implementations of 3m and 4m, but this is okay since those implementations will be removed very soon. - Removed some older blocks of preprocessor-disabled code. - Comment update to test_libblis.c.
This commit is contained in:
@@ -112,52 +112,6 @@ siz_t bli_packm_init
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if 0
|
||||
pack_t schema;
|
||||
|
||||
if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
// We now ignore the pack_schema field in the control tree and
|
||||
// extract the schema from the context, depending on whether we are
|
||||
// preparing to pack a block of A or panel of B. For A and B, we must
|
||||
// obtain the schema from the context since the induced methods reuse
|
||||
// the same control trees used by native execution, and those induced
|
||||
// methods specify the schema used by the current execution phase
|
||||
// within the context (whereas the control tree does not change).
|
||||
|
||||
if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK )
|
||||
{
|
||||
schema = bli_cntx_schema_a_block( cntx );
|
||||
}
|
||||
else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL )
|
||||
{
|
||||
schema = bli_cntx_schema_b_panel( cntx );
|
||||
}
|
||||
else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
|
||||
{
|
||||
schema = bli_cntl_packm_params_pack_schema( cntl );
|
||||
}
|
||||
}
|
||||
else // ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
// For native execution, we obtain the schema from the control tree
|
||||
// node. (Notice that it doesn't matter if the pack_buf_type is for
|
||||
// A or B.)
|
||||
schema = bli_cntl_packm_params_pack_schema( cntl );
|
||||
}
|
||||
// This is no longer needed now that we branch between native and
|
||||
// non-native cases above.
|
||||
#if 0
|
||||
if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
|
||||
{
|
||||
// If we get a request to pack C for some reason, it is likely
|
||||
// not part of an induced method, and so it would be safe (and
|
||||
// necessary) to read the pack schema from the control tree.
|
||||
schema = bli_cntl_packm_params_pack_schema( cntl );
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Prepare a few other variables based on properties of the control
|
||||
// tree.
|
||||
|
||||
|
||||
@@ -46,6 +46,7 @@
|
||||
#include "bli_l3_direct.h"
|
||||
#include "bli_l3_prune.h"
|
||||
#include "bli_l3_packm.h"
|
||||
#include "bli_l3_schema.h"
|
||||
|
||||
// Prototype object APIs (expert and non-expert).
|
||||
#include "bli_oapi_ex.h"
|
||||
|
||||
80
frame/3/bli_l3_schema.c
Normal file
80
frame/3/bli_l3_schema.c
Normal file
@@ -0,0 +1,80 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// bli_l3_set_schemas(): choose the pack schemas for the A and B operands of
// a level-3 operation and record them in the objects themselves.
//
//   a    - object whose pack schema is set to the "A" (row-panel) schema.
//   b    - object whose pack schema is set to the "B" (column-panel) schema.
//   c    - only read: its domain and computation precision form the datatype
//          used for the microkernel preference query in the 1m case.
//   cntx - only queried: for the execution method and, under 1m, for the
//          gemm microkernel's column preference.
//
// For any method other than BLIS_1M the schemas are BLIS_PACKED_ROW_PANELS
// for a and BLIS_PACKED_COL_PANELS for b. For BLIS_1M, the 1E/1R variants are
// assigned: (1E, 1R) when the queried microkernel prefers columns, and
// (1R, 1E) otherwise.
void bli_l3_set_schemas
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
// Begin with pack schemas for native execution.
|
||||
pack_t schema_a = BLIS_PACKED_ROW_PANELS;
|
||||
pack_t schema_b = BLIS_PACKED_COL_PANELS;
|
||||
|
||||
// When executing the 1m method, choose the appropriate pack schemas based
|
||||
// on the microkernel preference encoded within the current cntx_t (which
|
||||
// was presumably returned by the gks).
|
||||
if ( bli_cntx_method( cntx ) == BLIS_1M )
|
||||
{
|
||||
// Form the query datatype by combining C's domain with C's computation
// precision.
num_t dt = bli_obj_domain( c ) | bli_obj_comp_prec( c );
|
||||
|
||||
// Note that bli_cntx_l3_vir_ukr_prefers_cols_dt() will use the real
|
||||
// projection of dt to query the preference of the corresponding native
|
||||
// real-domain microkernel. This is what ultimately determines which
|
||||
// variant of 1m is applicable.
|
||||
if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
schema_a = BLIS_PACKED_ROW_PANELS_1E;
|
||||
schema_b = BLIS_PACKED_COL_PANELS_1R;
|
||||
}
|
||||
else
|
||||
{
|
||||
schema_a = BLIS_PACKED_ROW_PANELS_1R;
|
||||
schema_b = BLIS_PACKED_COL_PANELS_1E;
|
||||
}
|
||||
}
|
||||
|
||||
// Embed the schemas into the objects for A and B. This is a sort of hack
|
||||
// for communicating the desired pack schemas to bli_gemm_cntl_create()
|
||||
// (via bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows
|
||||
// us to subsequently access the schemas from the control tree, which
|
||||
// hopefully reduces some confusion, particularly in bli_packm_init().
|
||||
bli_obj_set_pack_schema( schema_a, a );
|
||||
bli_obj_set_pack_schema( schema_b, b );
|
||||
}
|
||||
|
||||
41
frame/3/bli_l3_schema.h
Normal file
41
frame/3/bli_l3_schema.h
Normal file
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Prototype for bli_l3_set_schemas(), which sets the pack schemas of the
// objects a and b. The schemas depend on the execution method stored in cntx;
// under the 1m method they also depend on the gemm microkernel preference
// queried for the datatype derived from c. Neither c nor cntx is modified by
// the definition in bli_l3_schema.c.
void bli_l3_set_schemas
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx
|
||||
);
|
||||
@@ -91,6 +91,22 @@ void bli_gemm_front
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_induce_trans( &b_local );
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
cntx_t cntx_local;
|
||||
|
||||
@@ -110,24 +126,8 @@ void bli_gemm_front
|
||||
// is adjusted to point to cntx_local.)
|
||||
bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx );
|
||||
}
|
||||
//else // homogeneous datatypes
|
||||
#endif
|
||||
|
||||
// Load the pack schemas from the context and embed them into the objects
|
||||
// for A and B. (Native contexts are initialized with the correct pack
|
||||
// schemas, as are contexts for 1m, and if necessary bli_gemm_md() would
|
||||
// have made a copy and modified the schemas, so reading them from the
|
||||
// context should be a safe bet at this point.) This is a sort of hack for
|
||||
// communicating the desired pack schemas to bli_gemm_cntl_create() (via
|
||||
// bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows us
|
||||
// to subsequently access the schemas from the control tree, which
|
||||
// hopefully reduces some confusion, particularly in bli_packm_init().
|
||||
const pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
const pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
|
||||
// Next, we handle the possibility of needing to typecast alpha to the
|
||||
// computation datatype and/or beta to the storage datatype of C.
|
||||
|
||||
@@ -153,22 +153,6 @@ void bli_gemm_front
|
||||
if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
|
||||
!bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) )
|
||||
#endif
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_induce_trans( &b_local );
|
||||
bli_obj_induce_trans( &c_local );
|
||||
|
||||
// We must also swap the pack schemas, which were set by bli_gemm_md()
|
||||
// or the inlined code above.
|
||||
bli_obj_swap_pack_schemas( &a_local, &b_local );
|
||||
}
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
|
||||
@@ -187,6 +187,10 @@ mddm_t bli_gemm_md_ccr
|
||||
bli_obj_induce_trans( b );
|
||||
bli_obj_induce_trans( c );
|
||||
|
||||
// We must swap the pack schemas because the schemas were set before
|
||||
// the objects were swapped.
|
||||
bli_obj_swap_pack_schemas( a, b );
|
||||
|
||||
return bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx );
|
||||
}
|
||||
|
||||
@@ -230,7 +234,7 @@ mddm_t bli_gemm_md_ccr
|
||||
bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mc );
|
||||
bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mc );
|
||||
|
||||
// Use the default pack schemas in the context.
|
||||
// Use the default pack schemas in the objects.
|
||||
|
||||
// static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
|
||||
func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );
|
||||
@@ -288,6 +292,10 @@ mddm_t bli_gemm_md_crc
|
||||
bli_obj_induce_trans( b );
|
||||
bli_obj_induce_trans( c );
|
||||
|
||||
// We must swap the pack schemas because the schemas were set before
|
||||
// the objects were swapped.
|
||||
bli_obj_swap_pack_schemas( a, b );
|
||||
|
||||
return bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx );
|
||||
}
|
||||
|
||||
@@ -331,7 +339,7 @@ mddm_t bli_gemm_md_crc
|
||||
bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nc );
|
||||
bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nc );
|
||||
|
||||
// Use the default pack schemas in the context.
|
||||
// Use the default pack schemas in the objects.
|
||||
|
||||
// static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
|
||||
func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );
|
||||
@@ -405,8 +413,8 @@ mddm_t bli_gemm_md_rcc
|
||||
|
||||
// Use the 1r pack schema for both A and B with the conjugation
|
||||
// of A or B toggled (to produce ar * br - ai * bi).
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, *cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, *cntx );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS_1R, a );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS_1R, b );
|
||||
|
||||
bli_obj_toggle_conj( b );
|
||||
|
||||
@@ -485,7 +493,7 @@ mddm_t bli_gemm_md_crr
|
||||
}
|
||||
#endif
|
||||
|
||||
// Use the default pack schemas in the context.
|
||||
// Use the default pack schemas in the objects.
|
||||
|
||||
// Return the computation and execution domains.
|
||||
return doms;
|
||||
@@ -523,7 +531,7 @@ mddm_t bli_gemm_md_rcr
|
||||
// Overwrite the complex obj_t with its real-only alias.
|
||||
*a = a_real;
|
||||
|
||||
// Use the default pack schemas in the context.
|
||||
// Use the default pack schemas in the objects.
|
||||
|
||||
// Return the computation and execution domains.
|
||||
return doms;
|
||||
@@ -561,7 +569,7 @@ mddm_t bli_gemm_md_rrc
|
||||
// Overwrite the complex obj_t with its real-only alias.
|
||||
*b = b_real;
|
||||
|
||||
// Use the default pack schemas in the context.
|
||||
// Use the default pack schemas in the objects.
|
||||
|
||||
// Return the computation and execution domains.
|
||||
return doms;
|
||||
@@ -591,7 +599,7 @@ mddm_t bli_gemm_md_rrr
|
||||
doms.comp = BLIS_REAL;
|
||||
doms.exec = BLIS_REAL;
|
||||
|
||||
// Use the default pack schemas in the context.
|
||||
// Use the default pack schemas in the objects.
|
||||
|
||||
// Return the computation and execution domains.
|
||||
return doms;
|
||||
@@ -621,248 +629,10 @@ mddm_t bli_gemm_md_ccc
|
||||
doms.comp = BLIS_COMPLEX;
|
||||
doms.exec = BLIS_COMPLEX;
|
||||
|
||||
// Use the default pack schemas in the context.
|
||||
// Use the default pack schemas in the objects.
|
||||
|
||||
// Return the computation and execution domains.
|
||||
return doms;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
#if 0
|
||||
void bli_gemm_md_front
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
bli_init_once();
|
||||
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// If alpha is zero, scale by beta and return.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( a, &a_local );
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_induce_trans( &b_local );
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
cntx_t cntx_local;
|
||||
|
||||
// Handle mixed domain cases in bli_gemm_md(), which may modify
|
||||
// the objects or the context. (If the context is modified, cntx
|
||||
// is adjusted to point to cntx_local.)
|
||||
bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_rntm_set_ways_for_op
|
||||
(
|
||||
BLIS_GEMM,
|
||||
BLIS_LEFT, // ignored for gemm/hemm/symm
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
rntm
|
||||
);
|
||||
|
||||
// Invoke the internal back-end via the thread handler.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_GEMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl
|
||||
);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_gemm_md_zgemm
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
bli_init_once();
|
||||
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
|
||||
#if 1
|
||||
obj_t am, bm, cm;
|
||||
obj_t* c_orig;
|
||||
|
||||
//if ( is_md == TRUE )
|
||||
{
|
||||
//num_t dt_c2 = bli_obj_dt( c );
|
||||
//num_t dt_c1 = bli_dt_proj_to_complex( dt_c2 );
|
||||
//num_t dt_c = bli_dt_proj_to_double_prec( dt_c1 );
|
||||
//num_t dt_c = bli_obj_dt_proj_to_complex( c );
|
||||
num_t dt_c = BLIS_DCOMPLEX;
|
||||
|
||||
if ( bli_obj_is_single_prec( c ) ) dt_c = BLIS_SCOMPLEX;
|
||||
else dt_c = BLIS_DCOMPLEX;
|
||||
|
||||
if ( bli_obj_is_real( a ) &&
|
||||
bli_obj_is_real( b ) &&
|
||||
bli_obj_is_real( c ) ) dt_c = bli_dt_proj_to_real( dt_c );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width_after_trans( a );
|
||||
|
||||
bli_obj_create( dt_c, m, k, 0, 0, &am );
|
||||
bli_obj_create( dt_c, k, n, 0, 0, &bm );
|
||||
bli_obj_create( dt_c, m, n, 0, 0, &cm );
|
||||
|
||||
//bli_projm( a, &am );
|
||||
//bli_projm( b, &bm );
|
||||
//bli_projm( c, &cm );
|
||||
bli_castm( a, &am );
|
||||
bli_castm( b, &bm );
|
||||
bli_castm( c, &cm );
|
||||
|
||||
c_orig = c;
|
||||
|
||||
a = &am;
|
||||
b = &bm;
|
||||
c = &cm;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// If alpha is zero, scale by beta and return.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( a, &a_local );
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_induce_trans( &b_local );
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
{
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
if ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
|
||||
}
|
||||
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
}
|
||||
}
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
bli_rntm_set_ways_for_op
|
||||
(
|
||||
BLIS_GEMM,
|
||||
BLIS_LEFT, // ignored for gemm/hemm/symm
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
rntm
|
||||
);
|
||||
|
||||
// Invoke the internal back-end via the thread handler.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_GEMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl
|
||||
);
|
||||
|
||||
#if 1
|
||||
//if ( is_md == TRUE )
|
||||
{
|
||||
//bli_projm( &cm, c_orig );
|
||||
bli_castm( &cm, c_orig );
|
||||
|
||||
bli_obj_free( &am );
|
||||
bli_obj_free( &bm );
|
||||
bli_obj_free( &cm );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -92,6 +92,9 @@ void bli_gemmt_front
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
// Set the pack schemas within the objects, as appropriate.
|
||||
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
@@ -105,17 +108,6 @@ void bli_gemmt_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
|
||||
// Invoke the internal back-end via the thread handler.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
|
||||
@@ -130,6 +130,9 @@ void bli_hemm_front
|
||||
}
|
||||
#endif
|
||||
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
|
||||
|
||||
// Set each alias as the root object.
|
||||
// NOTE: We MUST wait until we are done potentially swapping the objects
|
||||
// before setting the root fields!
|
||||
@@ -150,17 +153,6 @@ void bli_hemm_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
|
||||
@@ -83,12 +83,6 @@ void bli_her2k_front
|
||||
bli_obj_induce_trans( &ah_local );
|
||||
bli_obj_toggle_conj( &ah_local );
|
||||
|
||||
// Initialize a conjugated copy of alpha.
|
||||
bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ),
|
||||
BLIS_CONJUGATE,
|
||||
alpha,
|
||||
&alpha_conj );
|
||||
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
@@ -106,6 +100,16 @@ void bli_her2k_front
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &bh_local, &c_local, cntx );
|
||||
bli_l3_set_schemas( &b_local, &ah_local, &c_local, cntx );
|
||||
|
||||
// Initialize a conjugated copy of alpha.
|
||||
bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ),
|
||||
BLIS_CONJUGATE,
|
||||
alpha,
|
||||
&alpha_conj );
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
@@ -119,19 +123,6 @@ void bli_her2k_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &bh_local );
|
||||
bli_obj_set_pack_schema( schema_a, &b_local );
|
||||
bli_obj_set_pack_schema( schema_b, &ah_local );
|
||||
|
||||
// Invoke herk twice, using beta only the first time.
|
||||
|
||||
// Invoke the internal back-end.
|
||||
|
||||
@@ -86,6 +86,9 @@ void bli_herk_front
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &ah_local, &c_local, cntx );
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
@@ -99,17 +102,6 @@ void bli_herk_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &ah_local );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
|
||||
@@ -129,6 +129,9 @@ void bli_symm_front
|
||||
}
|
||||
#endif
|
||||
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
|
||||
|
||||
// Set each alias as the root object.
|
||||
// NOTE: We MUST wait until we are done potentially swapping the objects
|
||||
// before setting the root fields!
|
||||
@@ -149,17 +152,6 @@ void bli_symm_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
|
||||
@@ -87,6 +87,10 @@ void bli_syr2k_front
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &bt_local, &c_local, cntx );
|
||||
bli_l3_set_schemas( &b_local, &at_local, &c_local, cntx );
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
@@ -100,19 +104,6 @@ void bli_syr2k_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &bt_local );
|
||||
bli_obj_set_pack_schema( schema_a, &b_local );
|
||||
bli_obj_set_pack_schema( schema_b, &at_local );
|
||||
|
||||
// Invoke herk twice, using beta only the first time.
|
||||
|
||||
// Invoke the internal back-end.
|
||||
|
||||
@@ -89,6 +89,9 @@ void bli_syrk_front
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &at_local, &c_local, cntx );
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
@@ -102,17 +105,6 @@ void bli_syrk_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &at_local );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
|
||||
@@ -148,6 +148,9 @@ void bli_trmm_front
|
||||
|
||||
#endif
|
||||
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
|
||||
|
||||
// Set each alias as the root object.
|
||||
// NOTE: We MUST wait until we are done potentially swapping the objects
|
||||
// before setting the root fields!
|
||||
@@ -168,17 +171,6 @@ void bli_trmm_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
|
||||
@@ -140,6 +140,9 @@ void bli_trmm3_front
|
||||
|
||||
#endif
|
||||
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
|
||||
|
||||
// Set each alias as the root object.
|
||||
// NOTE: We MUST wait until we are done potentially swapping the objects
|
||||
// before setting the root fields!
|
||||
@@ -160,17 +163,6 @@ void bli_trmm3_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
|
||||
@@ -122,6 +122,9 @@ void bli_trsm_front
|
||||
|
||||
#endif
|
||||
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
|
||||
|
||||
// Set each alias as the root object.
|
||||
// NOTE: We MUST wait until we are done potentially swapping the objects
|
||||
// before setting the root fields!
|
||||
@@ -142,17 +145,6 @@ void bli_trsm_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_trsm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
|
||||
@@ -323,13 +323,14 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... )
|
||||
{
|
||||
/* Example prototypes:
|
||||
|
||||
void bli_cntx_set_ind_blkszs
|
||||
(
|
||||
ind_t method != BLIS_NAT,
|
||||
num_t dt,
|
||||
dim_t n_bs,
|
||||
bszid_t bs0_id, dim_t def_scalr0, dim_t max_scalr0,
|
||||
bszid_t bs1_id, dim_t def_scalr1, dim_t max_scalr1,
|
||||
@@ -346,6 +347,9 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
dim_t i;
|
||||
err_t r_val;
|
||||
|
||||
// Project the given datatype to the real domain. This will be used later on.
|
||||
num_t dt_real = bli_dt_proj_to_real( dt );
|
||||
|
||||
// Return early if called with BLIS_NAT.
|
||||
if ( method == BLIS_NAT ) return;
|
||||
|
||||
@@ -427,19 +431,17 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx );
|
||||
blksz_t* cntx_bmult = bli_cntx_get_bmult( bs_id, cntx );
|
||||
|
||||
// Copy the real domain values of the blksz_t object into the
|
||||
// complex domain slots of the same object.
|
||||
bli_blksz_copy_dt( BLIS_FLOAT, cntx_blksz, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_copy_dt( BLIS_DOUBLE, cntx_blksz, BLIS_DCOMPLEX, cntx_blksz );
|
||||
// Copy the real domain value of the blksz_t object into the
|
||||
// corresponding complex domain slot of the same object.
|
||||
bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_blksz );
|
||||
|
||||
// If the default blocksize scalar is non-unit, we need to scale
|
||||
// the complex domain default blocksizes.
|
||||
if ( dsclr != 1.0 )
|
||||
{
|
||||
// Scale the complex domain default blocksize values in the
|
||||
// blocksize object.
|
||||
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz );
|
||||
// Scale the default blocksize value corresponding to the given
|
||||
// datatype.
|
||||
bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_blksz );
|
||||
|
||||
// Perform rounding to ensure the newly scaled values are still
|
||||
// multiples of their register blocksize multiples. But only
|
||||
@@ -450,9 +452,8 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
// such rounding.
|
||||
if ( bs_id != bm_id && method != BLIS_1M )
|
||||
{
|
||||
// Round the newly-scaled blocksizes down to their multiple.
|
||||
bli_blksz_reduce_def_to( BLIS_FLOAT, cntx_bmult, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_reduce_def_to( BLIS_DOUBLE, cntx_bmult, BLIS_DCOMPLEX, cntx_blksz );
|
||||
// Round the newly-scaled blocksize down to its multiple.
|
||||
bli_blksz_reduce_def_to( dt_real, cntx_bmult, dt, cntx_blksz );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -460,10 +461,9 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
// to scale the complex domain maximum blocksizes.
|
||||
if ( msclr != 1.0 )
|
||||
{
|
||||
// Scale the complex domain maximum blocksize values in the
|
||||
// blocksize object.
|
||||
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz );
|
||||
// Scale the maximum blocksize value corresponding to the given
|
||||
// datatype.
|
||||
bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_blksz );
|
||||
|
||||
// Perform rounding to ensure the newly scaled values are still
|
||||
// multiples of their register blocksize multiples. But only
|
||||
@@ -474,9 +474,8 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
// such rounding.
|
||||
if ( bs_id != bm_id && method != BLIS_1M )
|
||||
{
|
||||
// Round the newly-scaled blocksizes down to their multiple.
|
||||
bli_blksz_reduce_max_to( BLIS_FLOAT, cntx_bmult, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_reduce_max_to( BLIS_DOUBLE, cntx_bmult, BLIS_DCOMPLEX, cntx_blksz );
|
||||
// Round the newly-scaled blocksize down to its multiple.
|
||||
bli_blksz_reduce_max_to( dt_real, cntx_bmult, dt, cntx_blksz );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,9 +63,6 @@ typedef struct cntx_s
|
||||
func_t* unpackm_kers;
|
||||
|
||||
ind_t method;
|
||||
pack_t schema_a;
|
||||
pack_t schema_b;
|
||||
pack_t schema_c;
|
||||
|
||||
} cntx_t;
|
||||
*/
|
||||
@@ -136,18 +133,6 @@ BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx )
|
||||
{
|
||||
return cntx->method;
|
||||
}
|
||||
BLIS_INLINE pack_t bli_cntx_schema_a_block( cntx_t* cntx )
|
||||
{
|
||||
return cntx->schema_a_block;
|
||||
}
|
||||
BLIS_INLINE pack_t bli_cntx_schema_b_panel( cntx_t* cntx )
|
||||
{
|
||||
return cntx->schema_b_panel;
|
||||
}
|
||||
BLIS_INLINE pack_t bli_cntx_schema_c_panel( cntx_t* cntx )
|
||||
{
|
||||
return cntx->schema_c_panel;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@@ -159,23 +144,6 @@ BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx )
|
||||
{
|
||||
cntx->method = method;
|
||||
}
|
||||
BLIS_INLINE void bli_cntx_set_schema_a_block( pack_t schema, cntx_t* cntx )
|
||||
{
|
||||
cntx->schema_a_block = schema;
|
||||
}
|
||||
BLIS_INLINE void bli_cntx_set_schema_b_panel( pack_t schema, cntx_t* cntx )
|
||||
{
|
||||
cntx->schema_b_panel = schema;
|
||||
}
|
||||
BLIS_INLINE void bli_cntx_set_schema_c_panel( pack_t schema, cntx_t* cntx )
|
||||
{
|
||||
cntx->schema_c_panel = schema;
|
||||
}
|
||||
BLIS_INLINE void bli_cntx_set_schema_ab_blockpanel( pack_t sa, pack_t sb, cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( sa, cntx );
|
||||
bli_cntx_set_schema_b_panel( sb, cntx );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@@ -735,7 +703,7 @@ BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx );
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... );
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... );
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... );
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... );
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... );
|
||||
|
||||
@@ -50,7 +50,7 @@ static void_fp cntx_ref_init[ BLIS_NUM_ARCHS ];
|
||||
// Define a function pointer type for context initialization functions.
|
||||
typedef void (*nat_cntx_init_ft)( cntx_t* cntx );
|
||||
typedef void (*ref_cntx_init_ft)( cntx_t* cntx );
|
||||
typedef void (*ind_cntx_init_ft)( ind_t method, num_t dt, cntx_t* cntx );
|
||||
typedef void (*ind_cntx_init_ft)( ind_t method, cntx_t* cntx );
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@@ -582,7 +582,7 @@ cntx_t* bli_gks_query_ind_cntx
|
||||
// function for the current induced method. (That function assumes
|
||||
// that the context is pre-initialized with values for native
|
||||
// execution.)
|
||||
f( ind, dt, gks_id_ind );
|
||||
f( ind, gks_id_ind );
|
||||
}
|
||||
}
|
||||
// END CRITICAL SECTION
|
||||
|
||||
@@ -69,7 +69,6 @@ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \
|
||||
void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \
|
||||
( \
|
||||
ind_t method, \
|
||||
num_t dt, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
|
||||
@@ -1523,9 +1523,6 @@ typedef struct cntx_s
|
||||
func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ];
|
||||
|
||||
ind_t method;
|
||||
pack_t schema_a_block;
|
||||
pack_t schema_b_panel;
|
||||
pack_t schema_c_panel;
|
||||
|
||||
} cntx_t;
|
||||
|
||||
|
||||
@@ -74,18 +74,18 @@ void bli_cntx_3mh_stage( dim_t stage, cntx_t* cntx )
|
||||
// Set the pack_t schemas as a function of the stage of execution.
|
||||
if ( stage == 0 )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
|
||||
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
|
||||
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
|
||||
}
|
||||
else if ( stage == 1 )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
|
||||
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
|
||||
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
|
||||
}
|
||||
else // if ( stage == 2 )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RPI, cntx );
|
||||
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx );
|
||||
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RPI, cntx );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -102,23 +102,23 @@ void bli_cntx_4mh_stage( dim_t stage, cntx_t* cntx )
|
||||
// Set the pack_t schemas as a function of the stage of execution.
|
||||
if ( stage == 0 )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
|
||||
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
|
||||
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
|
||||
}
|
||||
else if ( stage == 1 )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
|
||||
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
|
||||
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
|
||||
}
|
||||
else if ( stage == 2 )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
|
||||
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
|
||||
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
|
||||
}
|
||||
else // if ( stage == 3 )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
|
||||
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
|
||||
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -64,12 +64,11 @@ void bli_dpackm_armsve512_asm_10xk
|
||||
const bool unitk = bli_deq1( *kappa );
|
||||
|
||||
#ifdef _A64FX
|
||||
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
|
||||
{
|
||||
// A twisted way to infer whether A or B is being packed.
|
||||
if ( schema == bli_cntx_schema_a_block(cntx) )
|
||||
// Infer whether A or B is being packed.
|
||||
if ( schema == BLIS_PACKED_ROWS )
|
||||
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
|
||||
if ( schema == bli_cntx_schema_b_panel(cntx) )
|
||||
if ( schema == BLIS_PACKED_COLUMNS )
|
||||
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -63,12 +63,11 @@ void bli_dpackm_armsve512_asm_16xk
|
||||
const bool unitk = bli_deq1( *kappa );
|
||||
|
||||
#ifdef _A64FX
|
||||
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
|
||||
{
|
||||
// A twisted way to infer whether A or B is being packed.
|
||||
if ( schema == bli_cntx_schema_a_block(cntx) )
|
||||
// Infer whether A or B is being packed.
|
||||
if ( schema == BLIS_PACKED_ROWS )
|
||||
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
|
||||
if ( schema == bli_cntx_schema_b_panel(cntx) )
|
||||
if ( schema == BLIS_PACKED_COLUMNS )
|
||||
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -334,7 +334,14 @@
|
||||
PASTEMAC(c,opname), PASTEMAC(z,opname) ); \
|
||||
}
|
||||
|
||||
// -- Helper function for 1m ---------------------------------------------------
|
||||
|
||||
void GENBAINAME(cntx_init_blkszs)
|
||||
(
|
||||
ind_t method,
|
||||
num_t dt,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@@ -589,10 +596,6 @@ void GENBARNAME(cntx_init)
|
||||
// -- Set miscellaneous fields ---------------------------------------------
|
||||
|
||||
bli_cntx_set_method( BLIS_NAT, cntx );
|
||||
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
|
||||
bli_cntx_set_schema_c_panel( BLIS_NOT_PACKED, cntx );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
@@ -600,7 +603,6 @@ void GENBARNAME(cntx_init)
|
||||
void GENBAINAME(cntx_init)
|
||||
(
|
||||
ind_t method,
|
||||
num_t dt,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
@@ -826,78 +828,12 @@ void GENBAINAME(cntx_init)
|
||||
}
|
||||
else if ( method == BLIS_1M )
|
||||
{
|
||||
const bool is_pb = FALSE;
|
||||
//const bool is_pb = FALSE;
|
||||
|
||||
// We MUST set the induced method in the context prior to calling
|
||||
// bli_cntx_l3_ukr_prefers_cols_dt() because that function queries
|
||||
// the induced method. It needs the induced method value in order
|
||||
// to determine whether to evaluate the "prefers column storage"
|
||||
// predicate using the storage preference of the kernel for dt, or
|
||||
// the storage preference of the kernel for the real projection of
|
||||
// dt. Failing to set the induced method here can lead to strange
|
||||
// undefined behavior at runtime if the native complex kernel's
|
||||
// storage preference happens to not equal that of the native real
|
||||
// kernel.
|
||||
bli_cntx_set_method( method, cntx );
|
||||
|
||||
// Initialize the blocksizes according to the micro-kernel preference as
|
||||
// well as the algorithm.
|
||||
if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
// This branch is used for algorithms 1m_c_bp, 1m_r_pb.
|
||||
|
||||
// Set the pack_t schemas for the c_bp or r_pb algorithms.
|
||||
if ( !is_pb )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx );
|
||||
}
|
||||
else // if ( is_pb )
|
||||
{
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_ROW_PANELS_1R, cntx );
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_COL_PANELS_1E, cntx );
|
||||
}
|
||||
|
||||
bli_cntx_set_ind_blkszs
|
||||
(
|
||||
method, 6,
|
||||
BLIS_NC, 1.0, 1.0,
|
||||
BLIS_KC, 2.0, 2.0, // halve kc...
|
||||
BLIS_MC, 2.0, 2.0, // halve mc...
|
||||
BLIS_NR, 1.0, 1.0,
|
||||
BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr)
|
||||
BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
else // if ( bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
// This branch is used for algorithms 1m_r_bp, 1m_c_pb.
|
||||
|
||||
// Set the pack_t schemas for the r_bp or c_pb algorithms.
|
||||
if ( !is_pb )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx );
|
||||
}
|
||||
else // if ( is_pb )
|
||||
{
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_ROW_PANELS_1E, cntx );
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_COL_PANELS_1R, cntx );
|
||||
}
|
||||
|
||||
bli_cntx_set_ind_blkszs
|
||||
(
|
||||
method, 6,
|
||||
BLIS_NC, 2.0, 2.0, // halve nc...
|
||||
BLIS_KC, 2.0, 2.0, // halve kc...
|
||||
BLIS_MC, 1.0, 1.0,
|
||||
BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr)
|
||||
BLIS_MR, 1.0, 1.0,
|
||||
BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
// Call a helper function to initialize blocksizes for each complex
|
||||
// datatype.
|
||||
GENBAINAME(cntx_init_blkszs)( method, BLIS_SCOMPLEX, cntx );
|
||||
GENBAINAME(cntx_init_blkszs)( method, BLIS_DCOMPLEX, cntx );
|
||||
}
|
||||
else // if ( method == BLIS_NAT )
|
||||
{
|
||||
@@ -913,8 +849,8 @@ void GENBAINAME(cntx_init)
|
||||
}
|
||||
else if ( method == BLIS_3M1 )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx );
|
||||
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx );
|
||||
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx );
|
||||
}
|
||||
else if ( method == BLIS_4MH )
|
||||
{
|
||||
@@ -922,8 +858,8 @@ void GENBAINAME(cntx_init)
|
||||
}
|
||||
else if ( method == BLIS_4M1A || method == BLIS_4M1B )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx );
|
||||
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx );
|
||||
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx );
|
||||
}
|
||||
else if ( method == BLIS_1M )
|
||||
{
|
||||
@@ -942,3 +878,60 @@ void GENBAINAME(cntx_init)
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void GENBAINAME(cntx_init_blkszs)
|
||||
(
|
||||
ind_t method,
|
||||
num_t dt,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
// We MUST set the induced method in the context prior to calling
|
||||
// bli_cntx_l3_vir_ukr_prefers_cols_dt() because that function queries
|
||||
// the induced method. That function needs the induced method value in
|
||||
// order to determine whether to evaluate the "prefers column storage"
|
||||
// predicate using the storage preference of the kernel for dt, or
|
||||
// the storage preference of the kernel for the real projection of
|
||||
// dt. Failing to set the induced method here can lead to strange
|
||||
// undefined behavior at runtime if the native complex kernel's
|
||||
// storage preference happens to not equal that of the native real
|
||||
// kernel.
|
||||
bli_cntx_set_method( method, cntx );
|
||||
|
||||
// Initialize the blocksizes according to the micro-kernel preference as
|
||||
// well as the algorithm.
|
||||
if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
// This branch is used for algorithm 1m_c_bp.
|
||||
|
||||
bli_cntx_set_ind_blkszs
|
||||
(
|
||||
method, dt, 6,
|
||||
BLIS_NC, 1.0, 1.0,
|
||||
BLIS_KC, 2.0, 2.0, // halve kc...
|
||||
BLIS_MC, 2.0, 2.0, // halve mc...
|
||||
BLIS_NR, 1.0, 1.0,
|
||||
BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr)
|
||||
BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
else // if ( bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
// This branch is used for algorithm 1m_r_bp.
|
||||
|
||||
bli_cntx_set_ind_blkszs
|
||||
(
|
||||
method, dt, 6,
|
||||
BLIS_NC, 2.0, 2.0, // halve nc...
|
||||
BLIS_KC, 2.0, 2.0, // halve kc...
|
||||
BLIS_MC, 1.0, 1.0,
|
||||
BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr)
|
||||
BLIS_MR, 1.0, 1.0,
|
||||
BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -78,7 +78,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
\
|
||||
const dim_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
|
||||
const pack_t schema_b = bli_auxinfo_schema_b( data ); \
|
||||
\
|
||||
const dim_t k2 = 2 * k; \
|
||||
\
|
||||
|
||||
@@ -67,7 +67,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
const inc_t ld_a = cs_a; \
|
||||
const inc_t ld_b = rs_b; \
|
||||
\
|
||||
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
|
||||
const pack_t schema_b = bli_auxinfo_schema_b( data ); \
|
||||
\
|
||||
dim_t iter, i, j, l; \
|
||||
dim_t n_behind; \
|
||||
@@ -277,7 +277,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
const inc_t ld_a = cs_a; \
|
||||
const inc_t ld_b = rs_b; \
|
||||
\
|
||||
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
|
||||
const pack_t schema_b = bli_auxinfo_schema_b( data ); \
|
||||
\
|
||||
dim_t iter, i, j, l; \
|
||||
dim_t n_behind; \
|
||||
|
||||
@@ -447,9 +447,12 @@ void libblis_test_gemm_impl
|
||||
#if 0
|
||||
//bli_printm( "alpha", alpha, "%5.2f", "" );
|
||||
//bli_printm( "beta", beta, "%5.2f", "" );
|
||||
if ( bli_obj_dt( c ) == BLIS_DCOMPLEX )
|
||||
{
|
||||
bli_printm( "a", a, "%5.2f", "" );
|
||||
bli_printm( "b", b, "%5.2f", "" );
|
||||
bli_printm( "c", c, "%5.2f", "" );
|
||||
}
|
||||
#endif
|
||||
//if ( bli_obj_length( b ) == 16 &&
|
||||
// bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR )
|
||||
@@ -457,8 +460,7 @@ bli_printm( "c", c, "%5.2f", "" );
|
||||
bli_gemm( alpha, a, b, beta, c );
|
||||
//bls_gemm( alpha, a, b, beta, c );
|
||||
#if 0
|
||||
if ( bli_obj_length( c ) == 12 &&
|
||||
bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR )
|
||||
if ( bli_obj_dt( c ) == BLIS_DCOMPLEX )
|
||||
bli_printm( "c after", c, "%6.3f", "" );
|
||||
#endif
|
||||
//bli_printm( "c after", c, "%5.2f", "" );
|
||||
|
||||
@@ -1790,8 +1790,8 @@ void libblis_test_op_driver
|
||||
}
|
||||
}
|
||||
|
||||
// Enumerate all combinations of datatype domains requested, but only
|
||||
// for the gemm operation.
|
||||
// Enumerate all combinations of datatypes requested, but only for the
|
||||
// gemm operation.
|
||||
|
||||
if ( !mixed_domain && mixed_precision && op->opid == BLIS_GEMM )
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user