Allow use of 1m with mixing of row/col-pref ukrs.

Details:
- Fixed a bug that broke the use of 1m for dcomplex when the single-
  precision real and double-precision real ukernels had opposing I/O
  preferences (row-preferential sgemm ukernel + column-preferential
  dgemm ukernel, or vice versa). The fix involved adjusting the API
  to bli_cntx_set_ind_blkszs() so that the induced method context init
  function (e.g., bli_cntx_init_<subconfig>_ind()) could call that
  function for only one datatype at a time. This allowed the blocksize
  scaling (which varies depending on whether we're doing 1m_r or 1m_c)
  to happen on a per-datatype basis. This fixes issue #557. Thanks to
  Devin Matthews and RuQing Xu for helping discover and report this bug.
- The aforementioned 1m fix required moving the 1m_r/1m_c logic from
  bli_cntx_ref.c into a new function, bli_l3_set_schemas(), which is
  called from each level-3 _front() function. The pack_t schemas in the
  cntx_t were also removed entirely, along with the associated accessor
  functions. This in turn required updating the trsm1m-related virtual
  ukernels to read the pack schema for B from the auxinfo_t struct
  rather than the context. This also required slight tweaks to
  bli_gemm_md.c.
- Repositioned the logic for transposing the operation to accommodate
  the microkernel IO preference. This mostly only affects gemm. Thanks
  to Devin Matthews for his help with this.
- Updated dpackm pack ukernels in the 'armsve' kernel set to avoid
  querying pack_t schemas from the context.
- Removed the num_t dt argument from the ind_cntx_init_ft type defined
  in bli_gks.c. The context initialization functions for induced methods
  were previously passed a dt argument, but I can no longer figure out
  *why* they were passed this value. To reduce confusion, I've removed
  the dt argument (including also from the function definition +
  prototype).
- Commented out setting of cntx_t schemas in bli_cntx_ind_stage.c. This
  breaks high-level implementations of 3m and 4m, but this is okay since
  those implementations will be removed very soon.
- Removed some older blocks of preprocessor-disabled code.
- Comment update to test_libblis.c.
This commit is contained in:
Field G. Van Zee
2021-10-13 14:15:38 -05:00
parent 81e1034632
commit e9da6425e2
29 changed files with 316 additions and 612 deletions

View File

@@ -112,52 +112,6 @@ siz_t bli_packm_init
return 0;
}
#if 0
pack_t schema;
if ( bli_cntx_method( cntx ) != BLIS_NAT )
{
// We now ignore the pack_schema field in the control tree and
// extract the schema from the context, depending on whether we are
// preparing to pack a block of A or panel of B. For A and B, we must
// obtain the schema from the context since the induced methods reuse
// the same control trees used by native execution, and those induced
// methods specify the schema used by the current execution phase
// within the context (whereas the control tree does not change).
if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK )
{
schema = bli_cntx_schema_a_block( cntx );
}
else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL )
{
schema = bli_cntx_schema_b_panel( cntx );
}
else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
{
schema = bli_cntl_packm_params_pack_schema( cntl );
}
}
else // ( bli_cntx_method( cntx ) == BLIS_NAT )
{
// For native execution, we obtain the schema from the control tree
// node. (Notice that it doesn't matter if the pack_buf_type is for
// A or B.)
schema = bli_cntl_packm_params_pack_schema( cntl );
}
// This is no longer needed now that we branch between native and
// non-native cases above.
#if 0
if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
{
// If we get a request to pack C for some reason, it is likely
// not part of an induced method, and so it would be safe (and
// necessary) to read the pack schema from the control tree.
schema = bli_cntl_packm_params_pack_schema( cntl );
}
#endif
#endif
// Prepare a few other variables based on properties of the control
// tree.

View File

@@ -46,6 +46,7 @@
#include "bli_l3_direct.h"
#include "bli_l3_prune.h"
#include "bli_l3_packm.h"
#include "bli_l3_schema.h"
// Prototype object APIs (expert and non-expert).
#include "bli_oapi_ex.h"

80
frame/3/bli_l3_schema.c Normal file
View File

@@ -0,0 +1,80 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_l3_set_schemas
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx
     )
{
	// Selects the pack schemas for the A and B operands of a level-3
	// operation and records them inside the a and b objects. The c object
	// is only read (for its domain and computation precision), and cntx is
	// only queried (for its method and gemm microkernel preference).

	pack_t schema_a;
	pack_t schema_b;

	if ( bli_cntx_method( cntx ) != BLIS_1M )
	{
		// Any method other than 1m uses the schemas for native execution.
		schema_a = BLIS_PACKED_ROW_PANELS;
		schema_b = BLIS_PACKED_COL_PANELS;
	}
	else
	{
		// For 1m, the schemas depend on the microkernel preference encoded
		// within the cntx_t (which was presumably returned by the gks).
		// Form the datatype used for that query from the domain and the
		// computation precision of C.
		const num_t dt_comp = bli_obj_domain( c ) | bli_obj_comp_prec( c );

		// bli_cntx_l3_vir_ukr_prefers_cols_dt() consults the real projection
		// of the datatype it is given, so it is the preference of the
		// corresponding native real-domain microkernel that decides which
		// variant of 1m applies.
		if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt_comp, BLIS_GEMM_UKR, cntx ) )
		{
			// Column preference: 1e for A, 1r for B.
			schema_a = BLIS_PACKED_ROW_PANELS_1E;
			schema_b = BLIS_PACKED_COL_PANELS_1R;
		}
		else
		{
			// Row preference: 1r for A, 1e for B.
			schema_a = BLIS_PACKED_ROW_PANELS_1R;
			schema_b = BLIS_PACKED_COL_PANELS_1E;
		}
	}

	// Stash the chosen schemas in the objects themselves. This is a bit of
	// a hack: it is how the schemas reach bli_gemm_cntl_create() (by way of
	// bli_l3_thread_decorator() and bli_l3_cntl_create_if()), so that later
	// code, notably bli_packm_init(), can read them from the control tree.
	bli_obj_set_pack_schema( schema_a, a );
	bli_obj_set_pack_schema( schema_b, b );
}

41
frame/3/bli_l3_schema.h Normal file
View File

@@ -0,0 +1,41 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Sets the pack schemas of the objects a and b for a level-3 operation.
//
// The native schemas (BLIS_PACKED_ROW_PANELS for a, BLIS_PACKED_COL_PANELS
// for b) are used unless the method stored in cntx is BLIS_1M. For 1m, the
// 1e/1r schemas are assigned to a and b according to whether the gemm
// microkernel queried from cntx prefers columns, where the query datatype
// is formed from the domain and computation precision of c.
//
// a, b: objects whose pack schema fields are overwritten.
// c:    only read, to determine the datatype used for the preference query.
// cntx: only queried, for its method and microkernel preference.
void bli_l3_set_schemas
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx
);

View File

@@ -91,6 +91,22 @@ void bli_gemm_front
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
{
bli_obj_swap( &a_local, &b_local );
bli_obj_induce_trans( &a_local );
bli_obj_induce_trans( &b_local );
bli_obj_induce_trans( &c_local );
}
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
#ifdef BLIS_ENABLE_GEMM_MD
cntx_t cntx_local;
@@ -110,24 +126,8 @@ void bli_gemm_front
// is adjusted to point to cntx_local.)
bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx );
}
//else // homogeneous datatypes
#endif
// Load the pack schemas from the context and embed them into the objects
// for A and B. (Native contexts are initialized with the correct pack
// schemas, as are contexts for 1m, and if necessary bli_gemm_md() would
// have made a copy and modified the schemas, so reading them from the
// context should be a safe bet at this point.) This is a sort of hack for
// communicating the desired pack schemas to bli_gemm_cntl_create() (via
// bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows us
// to subsequently access the schemas from the control tree, which
// hopefully reduces some confusion, particularly in bli_packm_init().
const pack_t schema_a = bli_cntx_schema_a_block( cntx );
const pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Next, we handle the possibility of needing to typecast alpha to the
// computation datatype and/or beta to the storage datatype of C.
@@ -153,22 +153,6 @@ void bli_gemm_front
if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
!bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) )
#endif
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
{
bli_obj_swap( &a_local, &b_local );
bli_obj_induce_trans( &a_local );
bli_obj_induce_trans( &b_local );
bli_obj_induce_trans( &c_local );
// We must also swap the pack schemas, which were set by bli_gemm_md()
// or the inlined code above.
bli_obj_swap_pack_schemas( &a_local, &b_local );
}
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any

View File

@@ -187,6 +187,10 @@ mddm_t bli_gemm_md_ccr
bli_obj_induce_trans( b );
bli_obj_induce_trans( c );
// We must swap the pack schemas because the schemas were set before
// the objects were swapped.
bli_obj_swap_pack_schemas( a, b );
return bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx );
}
@@ -230,7 +234,7 @@ mddm_t bli_gemm_md_ccr
bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mc );
bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mc );
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );
@@ -288,6 +292,10 @@ mddm_t bli_gemm_md_crc
bli_obj_induce_trans( b );
bli_obj_induce_trans( c );
// We must swap the pack schemas because the schemas were set before
// the objects were swapped.
bli_obj_swap_pack_schemas( a, b );
return bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx );
}
@@ -331,7 +339,7 @@ mddm_t bli_gemm_md_crc
bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nc );
bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nc );
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );
@@ -405,8 +413,8 @@ mddm_t bli_gemm_md_rcc
// Use the 1r pack schema for both A and B with the conjugation
// of A or B toggled (to produce ar * br - ai * bi).
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, *cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, *cntx );
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS_1R, a );
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS_1R, b );
bli_obj_toggle_conj( b );
@@ -485,7 +493,7 @@ mddm_t bli_gemm_md_crr
}
#endif
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// Return the computation and execution domains.
return doms;
@@ -523,7 +531,7 @@ mddm_t bli_gemm_md_rcr
// Overwrite the complex obj_t with its real-only alias.
*a = a_real;
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// Return the computation and execution domains.
return doms;
@@ -561,7 +569,7 @@ mddm_t bli_gemm_md_rrc
// Overwrite the complex obj_t with its real-only alias.
*b = b_real;
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// Return the computation and execution domains.
return doms;
@@ -591,7 +599,7 @@ mddm_t bli_gemm_md_rrr
doms.comp = BLIS_REAL;
doms.exec = BLIS_REAL;
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// Return the computation and execution domains.
return doms;
@@ -621,248 +629,10 @@ mddm_t bli_gemm_md_ccc
doms.comp = BLIS_COMPLEX;
doms.exec = BLIS_COMPLEX;
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// Return the computation and execution domains.
return doms;
}
// -----------------------------------------------------------------------------
#if 0
void bli_gemm_md_front
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
)
{
bli_init_once();
obj_t a_local;
obj_t b_local;
obj_t c_local;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_check( alpha, a, b, beta, c, cntx );
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
{
bli_scalm( beta, c );
return;
}
// Alias A, B, and C in case we need to apply transformations.
bli_obj_alias_to( a, &a_local );
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
{
bli_obj_swap( &a_local, &b_local );
bli_obj_induce_trans( &a_local );
bli_obj_induce_trans( &b_local );
bli_obj_induce_trans( &c_local );
}
cntx_t cntx_local;
// Handle mixed domain cases in bli_gemm_md(), which may modify
// the objects or the context. (If the context is modified, cntx
// is adjusted to point to cntx_local.)
bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx );
// Record the threading for each level within the context.
bli_rntm_set_ways_for_op
(
BLIS_GEMM,
BLIS_LEFT, // ignored for gemm/hemm/symm
bli_obj_length( &c_local ),
bli_obj_width( &c_local ),
bli_obj_width( &a_local ),
rntm
);
// Invoke the internal back-end via the thread handler.
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_GEMM, // operation family id
alpha,
&a_local,
&b_local,
beta,
&c_local,
cntx,
rntm,
cntl
);
}
// -----------------------------------------------------------------------------
void bli_gemm_md_zgemm
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
)
{
bli_init_once();
obj_t a_local;
obj_t b_local;
obj_t c_local;
#if 1
obj_t am, bm, cm;
obj_t* c_orig;
//if ( is_md == TRUE )
{
//num_t dt_c2 = bli_obj_dt( c );
//num_t dt_c1 = bli_dt_proj_to_complex( dt_c2 );
//num_t dt_c = bli_dt_proj_to_double_prec( dt_c1 );
//num_t dt_c = bli_obj_dt_proj_to_complex( c );
num_t dt_c = BLIS_DCOMPLEX;
if ( bli_obj_is_single_prec( c ) ) dt_c = BLIS_SCOMPLEX;
else dt_c = BLIS_DCOMPLEX;
if ( bli_obj_is_real( a ) &&
bli_obj_is_real( b ) &&
bli_obj_is_real( c ) ) dt_c = bli_dt_proj_to_real( dt_c );
dim_t m = bli_obj_length( c );
dim_t n = bli_obj_width( c );
dim_t k = bli_obj_width_after_trans( a );
bli_obj_create( dt_c, m, k, 0, 0, &am );
bli_obj_create( dt_c, k, n, 0, 0, &bm );
bli_obj_create( dt_c, m, n, 0, 0, &cm );
//bli_projm( a, &am );
//bli_projm( b, &bm );
//bli_projm( c, &cm );
bli_castm( a, &am );
bli_castm( b, &bm );
bli_castm( c, &cm );
c_orig = c;
a = &am;
b = &bm;
c = &cm;
}
#endif
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_check( alpha, a, b, beta, c, cntx );
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
{
bli_scalm( beta, c );
return;
}
// Alias A, B, and C in case we need to apply transformations.
bli_obj_alias_to( a, &a_local );
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
{
bli_obj_swap( &a_local, &b_local );
bli_obj_induce_trans( &a_local );
bli_obj_induce_trans( &b_local );
bli_obj_induce_trans( &c_local );
}
{
// A sort of hack for communicating the desired pach schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
if ( bli_cntx_method( cntx ) == BLIS_NAT )
{
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
}
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
{
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
}
}
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
bli_rntm_set_ways_for_op
(
BLIS_GEMM,
BLIS_LEFT, // ignored for gemm/hemm/symm
bli_obj_length( &c_local ),
bli_obj_width( &c_local ),
bli_obj_width( &a_local ),
rntm
);
// Invoke the internal back-end via the thread handler.
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_GEMM, // operation family id
alpha,
&a_local,
&b_local,
beta,
&c_local,
cntx,
rntm,
cntl
);
#if 1
//if ( is_md == TRUE )
{
//bli_projm( &cm, c_orig );
bli_castm( &cm, c_orig );
bli_obj_free( &am );
bli_obj_free( &bm );
bli_obj_free( &cm );
}
#endif
}
#endif
#endif

View File

@@ -92,6 +92,9 @@ void bli_gemmt_front
bli_obj_induce_trans( &c_local );
}
// Set the pack schemas within the objects, as appropriate.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -105,17 +108,6 @@ void bli_gemmt_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Invoke the internal back-end via the thread handler.
bli_l3_thread_decorator
(

View File

@@ -130,6 +130,9 @@ void bli_hemm_front
}
#endif
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
@@ -150,17 +153,6 @@ void bli_hemm_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -83,12 +83,6 @@ void bli_her2k_front
bli_obj_induce_trans( &ah_local );
bli_obj_toggle_conj( &ah_local );
// Initialize a conjugated copy of alpha.
bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ),
BLIS_CONJUGATE,
alpha,
&alpha_conj );
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
@@ -106,6 +100,16 @@ void bli_her2k_front
bli_obj_induce_trans( &c_local );
}
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &bh_local, &c_local, cntx );
bli_l3_set_schemas( &b_local, &ah_local, &c_local, cntx );
// Initialize a conjugated copy of alpha.
bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ),
BLIS_CONJUGATE,
alpha,
&alpha_conj );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -119,19 +123,6 @@ void bli_her2k_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &bh_local );
bli_obj_set_pack_schema( schema_a, &b_local );
bli_obj_set_pack_schema( schema_b, &ah_local );
// Invoke herk twice, using beta only the first time.
// Invoke the internal back-end.

View File

@@ -86,6 +86,9 @@ void bli_herk_front
bli_obj_induce_trans( &c_local );
}
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &ah_local, &c_local, cntx );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -99,17 +102,6 @@ void bli_herk_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &ah_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -129,6 +129,9 @@ void bli_symm_front
}
#endif
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
@@ -149,17 +152,6 @@ void bli_symm_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -87,6 +87,10 @@ void bli_syr2k_front
bli_obj_induce_trans( &c_local );
}
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &bt_local, &c_local, cntx );
bli_l3_set_schemas( &b_local, &at_local, &c_local, cntx );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -100,19 +104,6 @@ void bli_syr2k_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &bt_local );
bli_obj_set_pack_schema( schema_a, &b_local );
bli_obj_set_pack_schema( schema_b, &at_local );
// Invoke herk twice, using beta only the first time.
// Invoke the internal back-end.

View File

@@ -89,6 +89,9 @@ void bli_syrk_front
bli_obj_induce_trans( &c_local );
}
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &at_local, &c_local, cntx );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -102,17 +105,6 @@ void bli_syrk_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &at_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -148,6 +148,9 @@ void bli_trmm_front
#endif
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
@@ -168,17 +171,6 @@ void bli_trmm_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -140,6 +140,9 @@ void bli_trmm3_front
#endif
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
@@ -160,17 +163,6 @@ void bli_trmm3_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -122,6 +122,9 @@ void bli_trsm_front
#endif
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
@@ -142,17 +145,6 @@ void bli_trsm_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// to bli_trsm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -323,13 +323,14 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
// -----------------------------------------------------------------------------
void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... )
{
/* Example prototypes:
void bli_gks_cntx_set_ind_blkszs
(
ind_t method != BLIS_NAT,
num_t dt,
dim_t n_bs,
bszid_t bs0_id, dim_t def_scalr0, dim_t max_scalr0,
bszid_t bs1_id, dim_t def_scalr1, dim_t max_scalr1,
@@ -346,6 +347,9 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
dim_t i;
err_t r_val;
// Project the given datatype to the real domain. This will be used later on.
num_t dt_real = bli_dt_proj_to_real( dt );
// Return early if called with BLIS_NAT.
if ( method == BLIS_NAT ) return;
@@ -427,19 +431,17 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx );
blksz_t* cntx_bmult = bli_cntx_get_bmult( bs_id, cntx );
// Copy the real domain values of the blksz_t object into the
// complex domain slots of the same object.
bli_blksz_copy_dt( BLIS_FLOAT, cntx_blksz, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_copy_dt( BLIS_DOUBLE, cntx_blksz, BLIS_DCOMPLEX, cntx_blksz );
// Copy the real domain value of the blksz_t object into the
// corresponding complex domain slot of the same object.
bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_blksz );
// If the default blocksize scalar is non-unit, we need to scale
// the complex domain default blocksizes.
if ( dsclr != 1.0 )
{
// Scale the complex domain default blocksize values in the
// blocksize object.
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz );
// Scale the default blocksize value corresponding to the given
// datatype.
bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_blksz );
// Perform rounding to ensure the newly scaled values are still
// multiples of their register blocksize multiples. But only
@@ -450,9 +452,8 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
// such rounding.
if ( bs_id != bm_id && method != BLIS_1M )
{
// Round the newly-scaled blocksizes down to their multiple.
bli_blksz_reduce_def_to( BLIS_FLOAT, cntx_bmult, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_reduce_def_to( BLIS_DOUBLE, cntx_bmult, BLIS_DCOMPLEX, cntx_blksz );
// Round the newly-scaled blocksize down to its multiple.
bli_blksz_reduce_def_to( dt_real, cntx_bmult, dt, cntx_blksz );
}
}
@@ -460,10 +461,9 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
// to scale the complex domain maximum blocksizes.
if ( msclr != 1.0 )
{
// Scale the complex domain maximum blocksize values in the
// blocksize object.
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz );
// Scale the maximum blocksize value corresponding to the given
// datatype.
bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_blksz );
// Perform rounding to ensure the newly scaled values are still
// multiples of their register blocksize multiples. But only
@@ -474,9 +474,8 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
// such rounding.
if ( bs_id != bm_id && method != BLIS_1M )
{
// Round the newly-scaled blocksizes down to their multiple.
bli_blksz_reduce_max_to( BLIS_FLOAT, cntx_bmult, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_reduce_max_to( BLIS_DOUBLE, cntx_bmult, BLIS_DCOMPLEX, cntx_blksz );
// Round the newly-scaled blocksize down to its multiple.
bli_blksz_reduce_max_to( dt_real, cntx_bmult, dt, cntx_blksz );
}
}
}

View File

@@ -63,9 +63,6 @@ typedef struct cntx_s
func_t* unpackm_kers;
ind_t method;
pack_t schema_a;
pack_t schema_b;
pack_t schema_c;
} cntx_t;
*/
@@ -136,18 +133,6 @@ BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx )
{
return cntx->method;
}
// Return the value of the schema_a_block field of the given context.
BLIS_INLINE pack_t bli_cntx_schema_a_block( cntx_t* cntx )
{
return cntx->schema_a_block;
}
// Return the value of the schema_b_panel field of the given context.
BLIS_INLINE pack_t bli_cntx_schema_b_panel( cntx_t* cntx )
{
return cntx->schema_b_panel;
}
// Return the value of the schema_c_panel field of the given context.
BLIS_INLINE pack_t bli_cntx_schema_c_panel( cntx_t* cntx )
{
return cntx->schema_c_panel;
}
// -----------------------------------------------------------------------------
@@ -159,23 +144,6 @@ BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx )
{
cntx->method = method;
}
// Store the given pack schema in the schema_a_block field of the context.
BLIS_INLINE void bli_cntx_set_schema_a_block( pack_t schema, cntx_t* cntx )
{
cntx->schema_a_block = schema;
}
// Store the given pack schema in the schema_b_panel field of the context.
BLIS_INLINE void bli_cntx_set_schema_b_panel( pack_t schema, cntx_t* cntx )
{
cntx->schema_b_panel = schema;
}
// Store the given pack schema in the schema_c_panel field of the context.
BLIS_INLINE void bli_cntx_set_schema_c_panel( pack_t schema, cntx_t* cntx )
{
cntx->schema_c_panel = schema;
}
// Convenience setter: stores sa in schema_a_block and sb in schema_b_panel
// by calling the two individual setters above. schema_c_panel is not touched.
BLIS_INLINE void bli_cntx_set_schema_ab_blockpanel( pack_t sa, pack_t sb, cntx_t* cntx )
{
bli_cntx_set_schema_a_block( sa, cntx );
bli_cntx_set_schema_b_panel( sb, cntx );
}
// -----------------------------------------------------------------------------
@@ -735,7 +703,7 @@ BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx );
BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... );

View File

@@ -50,7 +50,7 @@ static void_fp cntx_ref_init[ BLIS_NUM_ARCHS ];
// Define a function pointer type for context initialization functions.
typedef void (*nat_cntx_init_ft)( cntx_t* cntx );
typedef void (*ref_cntx_init_ft)( cntx_t* cntx );
typedef void (*ind_cntx_init_ft)( ind_t method, num_t dt, cntx_t* cntx );
typedef void (*ind_cntx_init_ft)( ind_t method, cntx_t* cntx );
// -----------------------------------------------------------------------------
@@ -582,7 +582,7 @@ cntx_t* bli_gks_query_ind_cntx
// function for the current induced method. (That function assumes
// that the context is pre-initialized with values for native
// execution.)
f( ind, dt, gks_id_ind );
f( ind, gks_id_ind );
}
}
// END CRITICAL SECTION

View File

@@ -69,7 +69,6 @@ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \
void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \
( \
ind_t method, \
num_t dt, \
cntx_t* cntx \
);

View File

@@ -1523,9 +1523,6 @@ typedef struct cntx_s
func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ];
ind_t method;
pack_t schema_a_block;
pack_t schema_b_panel;
pack_t schema_c_panel;
} cntx_t;

View File

@@ -74,18 +74,18 @@ void bli_cntx_3mh_stage( dim_t stage, cntx_t* cntx )
// Set the pack_t schemas as a function of the stage of execution.
if ( stage == 0 )
{
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
}
else if ( stage == 1 )
{
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
}
else // if ( stage == 2 )
{
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RPI, cntx );
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx );
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RPI, cntx );
}
}
@@ -102,23 +102,23 @@ void bli_cntx_4mh_stage( dim_t stage, cntx_t* cntx )
// Set the pack_t schemas as a function of the stage of execution.
if ( stage == 0 )
{
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
}
else if ( stage == 1 )
{
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
}
else if ( stage == 2 )
{
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
}
else // if ( stage == 3 )
{
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
}
}

View File

@@ -64,12 +64,11 @@ void bli_dpackm_armsve512_asm_10xk
const bool unitk = bli_deq1( *kappa );
#ifdef _A64FX
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
{
// A twisted way to infer whether A or B is being packed.
if ( schema == bli_cntx_schema_a_block(cntx) )
// Infer whether A or B is being packed.
if ( schema == BLIS_PACKED_ROWS )
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
if ( schema == bli_cntx_schema_b_panel(cntx) )
if ( schema == BLIS_PACKED_COLUMNS )
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
}
#endif

View File

@@ -63,12 +63,11 @@ void bli_dpackm_armsve512_asm_16xk
const bool unitk = bli_deq1( *kappa );
#ifdef _A64FX
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
{
// A twisted way to infer whether A or B is being packed.
if ( schema == bli_cntx_schema_a_block(cntx) )
// Infer whether A or B is being packed.
if ( schema == BLIS_PACKED_ROWS )
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
if ( schema == bli_cntx_schema_b_panel(cntx) )
if ( schema == BLIS_PACKED_COLUMNS )
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
}
#endif

View File

@@ -334,7 +334,14 @@
PASTEMAC(c,opname), PASTEMAC(z,opname) ); \
}
// -- Helper function for 1m ---------------------------------------------------
void GENBAINAME(cntx_init_blkszs)
(
ind_t method,
num_t dt,
cntx_t* cntx
);
// -----------------------------------------------------------------------------
@@ -589,10 +596,6 @@ void GENBARNAME(cntx_init)
// -- Set miscellaneous fields ---------------------------------------------
bli_cntx_set_method( BLIS_NAT, cntx );
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
bli_cntx_set_schema_c_panel( BLIS_NOT_PACKED, cntx );
}
// -----------------------------------------------------------------------------
@@ -600,7 +603,6 @@ void GENBARNAME(cntx_init)
void GENBAINAME(cntx_init)
(
ind_t method,
num_t dt,
cntx_t* cntx
)
{
@@ -826,78 +828,12 @@ void GENBAINAME(cntx_init)
}
else if ( method == BLIS_1M )
{
const bool is_pb = FALSE;
//const bool is_pb = FALSE;
// We MUST set the induced method in the context prior to calling
// bli_cntx_l3_ukr_prefers_cols_dt() because that function queries
// the induced method. It needs the induced method value in order
// to determine whether to evaluate the "prefers column storage"
// predicate using the storage preference of the kernel for dt, or
// the storage preference of the kernel for the real projection of
// dt. Failing to set the induced method here can lead to strange
// undefined behavior at runtime if the native complex kernel's
// storage preference happens to not equal that of the native real
// kernel.
bli_cntx_set_method( method, cntx );
// Initialize the blocksizes according to the micro-kernel preference as
// well as the algorithm.
if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
{
// This branch is used for algorithms 1m_c_bp, 1m_r_pb.
// Set the pack_t schemas for the c_bp or r_pb algorithms.
if ( !is_pb )
{
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx );
}
else // if ( is_pb )
{
bli_cntx_set_schema_b_panel( BLIS_PACKED_ROW_PANELS_1R, cntx );
bli_cntx_set_schema_a_block( BLIS_PACKED_COL_PANELS_1E, cntx );
}
bli_cntx_set_ind_blkszs
(
method, 6,
BLIS_NC, 1.0, 1.0,
BLIS_KC, 2.0, 2.0, // halve kc...
BLIS_MC, 2.0, 2.0, // halve mc...
BLIS_NR, 1.0, 1.0,
BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr)
BLIS_KR, 1.0, 1.0,
cntx
);
}
else // if ( bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
{
// This branch is used for algorithms 1m_r_bp, 1m_c_pb.
// Set the pack_t schemas for the r_bp or c_pb algorithms.
if ( !is_pb )
{
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx );
}
else // if ( is_pb )
{
bli_cntx_set_schema_b_panel( BLIS_PACKED_ROW_PANELS_1E, cntx );
bli_cntx_set_schema_a_block( BLIS_PACKED_COL_PANELS_1R, cntx );
}
bli_cntx_set_ind_blkszs
(
method, 6,
BLIS_NC, 2.0, 2.0, // halve nc...
BLIS_KC, 2.0, 2.0, // halve kc...
BLIS_MC, 1.0, 1.0,
BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr)
BLIS_MR, 1.0, 1.0,
BLIS_KR, 1.0, 1.0,
cntx
);
}
// Call a helper function to initialize blocksizes for each complex
// datatype.
GENBAINAME(cntx_init_blkszs)( method, BLIS_SCOMPLEX, cntx );
GENBAINAME(cntx_init_blkszs)( method, BLIS_DCOMPLEX, cntx );
}
else // if ( method == BLIS_NAT )
{
@@ -913,8 +849,8 @@ void GENBAINAME(cntx_init)
}
else if ( method == BLIS_3M1 )
{
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx );
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx );
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx );
}
else if ( method == BLIS_4MH )
{
@@ -922,8 +858,8 @@ void GENBAINAME(cntx_init)
}
else if ( method == BLIS_4M1A || method == BLIS_4M1B )
{
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx );
//bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx );
//bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx );
}
else if ( method == BLIS_1M )
{
@@ -942,3 +878,60 @@ void GENBAINAME(cntx_init)
}
}
// -----------------------------------------------------------------------------
// Helper that sets the induced-method blocksize scalars in cntx for a single
// datatype.
//
// Parameters:
//   method - the induced method; it is recorded in cntx via
//            bli_cntx_set_method() and forwarded to bli_cntx_set_ind_blkszs().
//   dt     - the datatype whose blocksizes are being set. The 1m branch of
//            cntx_init calls this helper once with BLIS_SCOMPLEX and once
//            with BLIS_DCOMPLEX, so each datatype is handled independently.
//   cntx   - the context to update.
//
// The scalars passed to bli_cntx_set_ind_blkszs() depend on whether the
// virtual gemm microkernel for dt prefers column storage, as reported by
// bli_cntx_l3_vir_ukr_prefers_cols_dt(). Each blocksize id is followed by
// two values: a scalar for the default blocksize and a scalar for the
// maximum blocksize.
void GENBAINAME(cntx_init_blkszs)
(
ind_t method,
num_t dt,
cntx_t* cntx
)
{
// We MUST set the induced method in the context prior to calling
// bli_cntx_l3_vir_ukr_prefers_cols_dt() because that function queries
// the induced method. That function needs the induced method value in
// order to determine whether to evaluate the "prefers column storage"
// predicate using the storage preference of the kernel for dt, or
// the storage preference of the kernel for the real projection of
// dt. Failing to set the induced method here can lead to strange
// undefined behavior at runtime if the native complex kernel's
// storage preference happens to not equal that of the native real
// kernel.
bli_cntx_set_method( method, cntx );
// Initialize the blocksizes according to the micro-kernel preference as
// well as the algorithm.
if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
{
// This branch is used for algorithm 1m_c_bp.
// Here KC, MC, and MR receive a non-unit default scalar; MR's maximum
// scalar is left at 1.0.
bli_cntx_set_ind_blkszs
(
method, dt, 6,
BLIS_NC, 1.0, 1.0,
BLIS_KC, 2.0, 2.0, // halve kc...
BLIS_MC, 2.0, 2.0, // halve mc...
BLIS_NR, 1.0, 1.0,
BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr)
BLIS_KR, 1.0, 1.0,
cntx
);
}
else // if ( bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
{
// This branch is used for algorithm 1m_r_bp.
// Here NC, KC, and NR receive a non-unit default scalar; NR's maximum
// scalar is left at 1.0.
bli_cntx_set_ind_blkszs
(
method, dt, 6,
BLIS_NC, 2.0, 2.0, // halve nc...
BLIS_KC, 2.0, 2.0, // halve kc...
BLIS_MC, 1.0, 1.0,
BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr)
BLIS_MR, 1.0, 1.0,
BLIS_KR, 1.0, 1.0,
cntx
);
}
}

View File

@@ -78,7 +78,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
\
const dim_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
\
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
const pack_t schema_b = bli_auxinfo_schema_b( data ); \
\
const dim_t k2 = 2 * k; \
\

View File

@@ -67,7 +67,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
const inc_t ld_a = cs_a; \
const inc_t ld_b = rs_b; \
\
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
const pack_t schema_b = bli_auxinfo_schema_b( data ); \
\
dim_t iter, i, j, l; \
dim_t n_behind; \
@@ -277,7 +277,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
const inc_t ld_a = cs_a; \
const inc_t ld_b = rs_b; \
\
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
const pack_t schema_b = bli_auxinfo_schema_b( data ); \
\
dim_t iter, i, j, l; \
dim_t n_behind; \

View File

@@ -447,9 +447,12 @@ void libblis_test_gemm_impl
#if 0
//bli_printm( "alpha", alpha, "%5.2f", "" );
//bli_printm( "beta", beta, "%5.2f", "" );
if ( bli_obj_dt( c ) == BLIS_DCOMPLEX )
{
bli_printm( "a", a, "%5.2f", "" );
bli_printm( "b", b, "%5.2f", "" );
bli_printm( "c", c, "%5.2f", "" );
}
#endif
//if ( bli_obj_length( b ) == 16 &&
// bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR )
@@ -457,8 +460,7 @@ bli_printm( "c", c, "%5.2f", "" );
bli_gemm( alpha, a, b, beta, c );
//bls_gemm( alpha, a, b, beta, c );
#if 0
if ( bli_obj_length( c ) == 12 &&
bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR )
if ( bli_obj_dt( c ) == BLIS_DCOMPLEX )
bli_printm( "c after", c, "%6.3f", "" );
#endif
//bli_printm( "c after", c, "%5.2f", "" );

View File

@@ -1790,8 +1790,8 @@ void libblis_test_op_driver
}
}
// Enumerate all combinations of datatype domains requested, but only
// for the gemm operation.
// Enumerate all combinations of datatypes requested, but only for the
// gemm operation.
if ( !mixed_domain && mixed_precision && op->opid == BLIS_GEMM )
{