Simplify and rewrite reference packm kernels. (#610)

Details:
- Reorganized the way kernels are stored within the cntx_t structure so
  that rather than having a function pointer for every supported size of
  unrolled packm kernel (2xk, 3xk, 4xk, etc.), we store only two packm
  kernels per datatype: one to pack MRxk micropanels and one to pack
  NRxk micropanels.
  - NOTE: The "bb" (broadcast B) reference kernels have been merged into
    the "standard" kernels (packm [including 1er and unpackm], gemm,
    trsm, gemmtrsm). The broadcast replication factor used by these
    kernels is controlled by BLIS_BB[MN]_[sdcz] etc. Power9/10 needs
    testing since only a replication factor of 1 has been tested. armsve
    also needs testing since the MR value isn't available as a macro.
- Simplified the bli_cntx_*() APIs to conform to the new unified kernel
  array within the cntx_t. Updated existing bli_cntx_init_<subconfig>()
  function definitions for all subconfigurations.
- Consolidated all kernel id types (e.g. l1vkr_t, l1mkr_t, l3ukr_t,
  etc.) into one kernel id type: ukr_t.
- Various edits, updates, and rewrites of reference kernels pursuant to 
  the aforementioned changes.
- Define compile-time macro constants (BLIS_MR_[sdcz], BLIS_NR_[sdcz], 
  and friends) in bli_kernel_macro_defs.h, but only when the macro
  BLIS_IN_REF_KERNEL is defined by the build system.
- Loose ends:
  - Still need to update documentation, including:
    - docs/ConfigurationHowTo.md
    - docs/KernelsHowTo.md
    to reflect changes made in this commit.
This commit is contained in:
Devin Matthews
2022-04-06 20:31:11 -05:00
committed by GitHub
parent b3e674db3c
commit ae10d94954
214 changed files with 5299 additions and 10376 deletions

View File

@@ -164,7 +164,7 @@ void PASTECH2(bao_,ch,varname) \
function pointer type. */ \
/*
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
*/ \
\
/* Temporary C buffer for edge cases. Note that the strides of this
@@ -175,7 +175,7 @@ void PASTECH2(bao_,ch,varname) \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
*/ \
@@ -536,7 +536,7 @@ void PASTECH2(bao_,ch,varname) \
/* Query the context for the microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
@@ -545,7 +545,7 @@ void PASTECH2(bao_,ch,varname) \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\

View File

@@ -137,7 +137,7 @@ void bao_gemmd_ex
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
{
bli_obj_swap( &a_local, &b_local );

View File

@@ -163,7 +163,7 @@ void PASTECH2(bao_,ch,varname) \
/* Query the context for the microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Compute partitioning step values for each matrix of each loop. */ \
const inc_t jcstep_c = cs_c; \

View File

@@ -55,15 +55,15 @@ void PASTECH2(bao_,ch,opname) \
/* Note that we use panel_dim_max, not panel_dim, to query the packm
kernel function pointer. This means that we always use the same
kernel, even for edge cases. */ \
num_t dt = PASTEMAC(ch,type); \
l1mkr_t ker_id = panel_dim_max; \
num_t dt = PASTEMAC(ch,type); \
ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \
\
PASTECH2(ch,opname,_ker_ft) f; \
\
/* Query the context for the packm kernel corresponding to the current
panel dimension, or kernel id. If the id is invalid, the function will
return NULL. */ \
f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
\
/* If there exists a kernel implementation for the micro-panel dimension
provided, we invoke the implementation. Otherwise, we use scal2m. */ \

View File

@@ -120,6 +120,8 @@ get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
-DBLIS_CNAME=$(1) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
-DBLIS_IN_REF_KERNEL=1 \
-include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \
)
get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
@@ -129,6 +131,8 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
-DBLIS_CNAME=$(1) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
-DBLIS_IN_REF_KERNEL=1 \
-include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \
)
get-config-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \

View File

@@ -38,34 +38,42 @@
void bli_cntx_init_a64fx( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_a64fx_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
4,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed,
// packm
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
BLIS_VA_END
);
// Set SVE-512 packing routine.
bli_cntx_set_packm_kers
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
2,
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
// 12xk is not used and disabled for GCC 8-9 compatibility.
// BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_int_12xk,
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
cntx
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -80,66 +88,18 @@ void bli_cntx_init_a64fx( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
#if 0
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
4,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
#endif
// Set A64FX cache sector sizes for each PE/CMG
// SC Fugaku might disable users' setting cache sizes.
#if !defined(CACHE_SECTOR_SIZE_READONLY)

View File

@@ -0,0 +1,52 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
//
// Compile-time MR/NR register block sizes, one pair per datatype suffix
// (s, d, c, z).
// NOTE(review): presumably this header is the per-configuration
// bli_kernel_defs_<config>.h that the build force-includes when compiling
// reference kernels with BLIS_IN_REF_KERNEL defined -- confirm.
// NOTE(review): the MR values look like two 512-bit vectors' worth of each
// element type (matching the "2vx10" gemm microkernel names), and the d
// values match the 16xk/10xk SVE-512 packm kernels -- confirm.

// MR: micropanel dimension for the MRxk packm kernel.
#define BLIS_MR_s 32
#define BLIS_MR_d 16
#define BLIS_MR_c 16
#define BLIS_MR_z 8
// NR: micropanel dimension for the NRxk packm kernel.
#define BLIS_NR_s 10
#define BLIS_NR_d 10
#define BLIS_NR_c 10
#define BLIS_NR_z 10
//#endif

View File

@@ -45,9 +45,6 @@ void bli_cntx_init_armsve( cntx_t* cntx )
return;
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
#if 0
blksz_t thresh[ BLIS_NUM_THRESH ];
#endif
// Set default kernel blocksizes and functions.
bli_cntx_init_armsve_ref( cntx );
@@ -64,35 +61,55 @@ void bli_cntx_init_armsve( cntx_t* cntx )
bli_c_blksz_armsve(&m_r_c, &n_r_c, &k_c_c, &m_c_c, &n_c_c);
bli_z_blksz_armsve(&m_r_z, &n_r_z, &k_c_z, &m_c_z, &n_c_z);
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
4,
cntx,
// level-3
// These are vector-length agnostic kernels. Yet knowing mr is required at runtime.
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE,
cntx
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_VA_END
);
// Set VL-specific packing routines if applicable.
if (m_r_d==16)
bli_cntx_set_packm_kers
if ( m_r_d == 16 )
{
bli_cntx_set_ukrs
(
2,
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
cntx
cntx,
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
BLIS_VA_END
);
else if (m_r_d==8)
bli_cntx_set_packm_kers
}
else if ( m_r_d == 8 )
{
bli_cntx_set_ukrs
(
1,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk,
cntx
cntx,
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk,
BLIS_VA_END
);
}
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
@@ -106,64 +123,16 @@ void bli_cntx_init_armsve( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
#if 0
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
4,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
#endif
}

View File

@@ -0,0 +1,58 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
//
// The armsve configuration handles both 256-bit and 512-bit SVE vectors,
// so it is not possible to define specific register block sizes. Thus,
// armsve can't use reference kernels!
//
// MR depends on the vector length and so is not known at compile time;
// -1 is used for every datatype.
// NOTE(review): presumably -1 is an "invalid" placeholder that reference
// kernels must never actually consume -- confirm how it is handled.
#define BLIS_MR_s -1
#define BLIS_MR_d -1
#define BLIS_MR_c -1
#define BLIS_MR_z -1
// NR is fixed at 10 for all datatypes (cf. the "2vx10" gemm microkernel
// names), independent of the vector length.
#define BLIS_NR_s 10
#define BLIS_NR_d 10
#define BLIS_NR_c 10
#define BLIS_NR_z 10
//#endif

View File

@@ -43,14 +43,28 @@ void bli_cntx_init_bgq( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -65,13 +79,16 @@ void bli_cntx_init_bgq( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,48 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
//
// Compile-time MR/NR register block sizes. Only the d and z datatypes are
// defined here; the values mirror the dimensions in the optimized gemm
// microkernel names (bli_dgemm_bgq_int_8x8, bli_zgemm_bgq_int_4x4).
// NOTE(review): the s and c datatypes presumably fall back to defaults
// defined elsewhere -- confirm.
#define BLIS_MR_d 8
#define BLIS_MR_z 4
#define BLIS_NR_d 8
#define BLIS_NR_z 4
//#endif

View File

@@ -43,16 +43,32 @@ void bli_cntx_init_bulldozer( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
4,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -67,13 +83,16 @@ void bli_cntx_init_bulldozer( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,52 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
//
// Compile-time MR/NR register block sizes, one pair per datatype suffix
// (s, d, c, z). Each MRxNR pair mirrors the dimensions in the optimized
// gemm microkernel names: s 8x8, d 4x6, c 8x4, z 4x4
// (bli_[sdcz]gemm_bulldozer_asm_*_fma4).

// MR: micropanel dimension for the MRxk packm kernel.
#define BLIS_MR_s 8
#define BLIS_MR_d 4
#define BLIS_MR_c 8
#define BLIS_MR_z 4
// NR: micropanel dimension for the NRxk packm kernel.
#define BLIS_NR_s 8
#define BLIS_NR_d 6
#define BLIS_NR_c 4
#define BLIS_NR_z 4
//#endif

View File

@@ -43,14 +43,28 @@ void bli_cntx_init_cortexa15( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -73,13 +87,16 @@ void bli_cntx_init_cortexa15( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,48 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
//
// Compile-time MR/NR register block sizes. Only the s and d datatypes are
// defined here; both are 4x4, mirroring the optimized gemm microkernel
// names (bli_sgemm_armv7a_int_4x4, bli_dgemm_armv7a_int_4x4).
// NOTE(review): the c and z datatypes presumably fall back to defaults
// defined elsewhere -- confirm.
#define BLIS_MR_s 4
#define BLIS_MR_d 4
#define BLIS_NR_s 4
#define BLIS_NR_d 4
//#endif

View File

@@ -43,14 +43,28 @@ void bli_cntx_init_cortexa53( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa53( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,48 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
//
// Compile-time MR/NR register block sizes. Only the s and d datatypes are
// defined here; the values mirror the dimensions in the optimized gemm
// microkernel names (bli_sgemm_armv8a_asm_8x12, bli_dgemm_armv8a_asm_6x8).
// NOTE(review): the c and z datatypes presumably fall back to defaults
// defined elsewhere -- confirm.
#define BLIS_MR_s 8
#define BLIS_MR_d 6
#define BLIS_NR_s 12
#define BLIS_NR_d 8
//#endif

View File

@@ -43,14 +43,28 @@ void bli_cntx_init_cortexa57( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa57( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,48 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
#define BLIS_MR_s 8
#define BLIS_MR_d 6
#define BLIS_NR_s 12
#define BLIS_NR_d 8
//#endif

View File

@@ -43,14 +43,28 @@ void bli_cntx_init_cortexa9( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa9( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,48 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
#define BLIS_MR_s 4
#define BLIS_MR_d 4
#define BLIS_NR_s 4
#define BLIS_NR_d 4
//#endif

View File

@@ -43,16 +43,32 @@ void bli_cntx_init_excavator( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
4,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -67,13 +83,16 @@ void bli_cntx_init_excavator( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,52 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
#define BLIS_MR_s 16
#define BLIS_MR_d 8
#define BLIS_MR_c 4
#define BLIS_MR_z 2
#define BLIS_NR_s 3
#define BLIS_NR_d 3
#define BLIS_NR_c 2
#define BLIS_NR_z 2
//#endif

View File

@@ -37,32 +37,60 @@
void bli_cntx_init_firestorm( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_firestorm_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
// packm
BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk,
BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk,
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk,
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk,
// gemmsup
BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m,
BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m,
BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m,
BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n,
BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m,
BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n,
BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n,
BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n,
BLIS_VA_END
);
// Update the context with optimized packm kernels.
bli_cntx_set_packm_kers
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
4,
BLIS_PACKM_8XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk,
BLIS_PACKM_12XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk,
cntx
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
// gemmsup
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -73,72 +101,47 @@ void bli_cntx_init_firestorm( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 3072, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 8192, -1, -1 );
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MT ], -1, 99, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NT ], -1, 99, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KT ], -1, 99, -1, -1 );
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR_SUP ], -1, 6, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], -1, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], -1, 240, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], -1, 1024, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], -1, 3072, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
// -------------------------------------------------------------------------
// sup thresholds
BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 99, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 99, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 99, -1, -1 );
// level-3 sup
BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP,
BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP,
BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP,
BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
8,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 6, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 240, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1024, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 3072, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,48 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
#define BLIS_MR_s 8
#define BLIS_MR_d 6
#define BLIS_NR_s 12
#define BLIS_NR_d 8
//#endif

View File

@@ -0,0 +1,42 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
//#endif

View File

@@ -35,79 +35,58 @@
#include "blis.h"
//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )
void bli_cntx_init_haswell( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_haswell_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
8,
cntx,
// gemm
#if 1
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,
#else
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, FALSE,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3,
#endif
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
cntx
);
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8,
#if 1
// Update the context with optimized packm kernels.
bli_cntx_set_packm_kers
(
8,
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
cntx
);
// packm
BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
#endif
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
10,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
@@ -137,7 +116,74 @@ void bli_cntx_init_haswell( cntx_t* cntx )
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
cntx
// gemmsup
BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m,
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n,
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// gemm
#if 1
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
#else
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
#endif
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
// gemmsup
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -161,97 +207,54 @@ void bli_cntx_init_haswell( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 );
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MT ], 201, 201, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NT ], 201, 201, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KT ], 201, 201, -1, -1 );
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1,
9, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 168, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 4080, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 7,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
// -------------------------------------------------------------------------
// gemmsup thresholds
BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 201, 201, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 201, 201, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 201, 201, -1, -1 );
// level-3 sup
BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP,
BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP,
BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP,
BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
#if 0
// Initialize the context with the sup handlers.
bli_cntx_set_l3_sup_handlers
(
1,
BLIS_GEMM, bli_gemmsup_ref,
cntx
);
#endif
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
16,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
9, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,52 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
#define BLIS_MR_s 6
#define BLIS_MR_d 6
#define BLIS_MR_c 3
#define BLIS_MR_z 3
#define BLIS_NR_s 16
#define BLIS_NR_d 8
#define BLIS_NR_c 8
#define BLIS_NR_z 4
//#endif

View File

@@ -43,13 +43,26 @@ void bli_cntx_init_knc( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
1,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8, TRUE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -58,7 +71,7 @@ void bli_cntx_init_knc( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 8, 0, 0 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 120, 0, 0,
0, 160, 0, 0 );
bli_blksz_init ( &blkszs[ BLIS_KC ], 0, 240, 0, 0,
bli_blksz_init ( &blkszs[ BLIS_KC ], 0, 240, 0, 0,
0, 300, 0, 0 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 14400, 0, 0 );
@@ -66,13 +79,16 @@ void bli_cntx_init_knc( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,48 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
// Only double-precision values are defined here. MR x NR = 30 x 8 lines up
// with the bli_dgemm_knc_asm_30x8 micro-kernel name.
#define BLIS_MR_d 30
#define BLIS_NR_d 8
// NOTE(review): PACKMR is presumably the padded packing dimension used for
// MR-wide micropanels (32 > MR = 30) -- confirm against the packm code.
#define BLIS_PACKMR_d 32
//#endif

View File

@@ -43,47 +43,33 @@ void bli_cntx_init_knl( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, FALSE,
cntx
);
cntx,
// Update the context with optimized packm kernels.
bli_cntx_set_packm_kers
(
2,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk,
BLIS_PACKM_24XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk,
cntx
);
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8,
// packm
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk,
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk,
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
10,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
@@ -92,12 +78,15 @@ void bli_cntx_init_knl( cntx_t* cntx )
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
#if 0
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
@@ -106,7 +95,20 @@ void bli_cntx_init_knl( cntx_t* cntx )
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
cntx
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -125,17 +127,20 @@ void bli_cntx_init_knl( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 7,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,48 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
// Only single- and double-precision real values are defined here. The MR x NR
// pairs (24 x 16 for s, 24 x 8 for d) line up with the bli_sgemm_knl_asm_24x16
// and bli_dgemm_knl_asm_24x8 micro-kernel names.
#define BLIS_MR_s 24
#define BLIS_MR_d 24
#define BLIS_NR_s 16
#define BLIS_NR_d 8
//#endif

View File

@@ -66,7 +66,7 @@ void bli_cntx_init_armv7a( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
5,
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,

View File

@@ -203,7 +203,7 @@ void bli_cntx_init_haswell( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 7,
7,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,

View File

@@ -43,18 +43,36 @@ void bli_cntx_init_penryn( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
4,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4, FALSE,
//BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4, FALSE,
//BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4, FALSE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4, FALSE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4,
//BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4,
//BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
//BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
//BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -69,13 +87,16 @@ void bli_cntx_init_penryn( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,48 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
// Only single- and double-precision real values are defined here. The MR x NR
// pairs (8 x 4 for s, 4 x 4 for d) line up with the bli_sgemm_penryn_asm_8x4
// and bli_dgemm_penryn_asm_4x4 micro-kernel names.
#define BLIS_MR_s 8
#define BLIS_MR_d 4
#define BLIS_NR_s 4
#define BLIS_NR_d 4
//#endif

View File

@@ -43,16 +43,32 @@ void bli_cntx_init_piledriver( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
4,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -67,13 +83,16 @@ void bli_cntx_init_piledriver( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,52 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
// MR x NR per datatype suffix: s = 16 x 3, d = 8 x 3, c = 4 x 2, z = 2 x 2.
// These line up with the bli_[sdcz]gemm_piledriver_asm_* micro-kernel names
// (16x3, 8x3, 4x2, 2x2).
#define BLIS_MR_s 16
#define BLIS_MR_d 8
#define BLIS_MR_c 4
#define BLIS_MR_z 2
#define BLIS_NR_s 3
#define BLIS_NR_d 3
#define BLIS_NR_c 2
#define BLIS_NR_z 2
//#endif

View File

@@ -34,35 +34,6 @@
#include "blis.h"
// Instantiate prototypes for packm kernels.
PACKM_KER_PROT( float, s, packm_6xk_bb4_power10_ref )
PACKM_KER_PROT( double, d, packm_6xk_bb2_power10_ref )
// Instantiate prototypes for level-3 kernels.
GEMM_UKR_PROT( float, s, gemmbb_power10_ref )
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power10_ref )
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power10_ref )
TRSM_UKR_PROT( float, s, trsmbb_l_power10_ref )
TRSM_UKR_PROT( float, s, trsmbb_u_power10_ref )
GEMM_UKR_PROT( double, d, gemmbb_power10_ref )
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power10_ref )
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power10_ref )
TRSM_UKR_PROT( double, d, trsmbb_l_power10_ref )
TRSM_UKR_PROT( double, d, trsmbb_u_power10_ref )
GEMM_UKR_PROT( scomplex, c, gemmbb_power10_ref )
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power10_ref )
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power10_ref )
TRSM_UKR_PROT( scomplex, c, trsmbb_l_power10_ref )
TRSM_UKR_PROT( scomplex, c, trsmbb_u_power10_ref )
GEMM_UKR_PROT( dcomplex, z, gemmbb_power10_ref )
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power10_ref )
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power10_ref )
TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power10_ref )
TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power10_ref )
void bli_cntx_init_power10( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
@@ -72,51 +43,38 @@ void bli_cntx_init_power10( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
12,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, TRUE,
cntx,
BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power10_ref, FALSE,
BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power10_ref, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, TRUE,
BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power10_ref, FALSE,
BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power10_ref, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power10_ref, FALSE,
BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power10_ref, FALSE,
BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power10_ref, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power10_ref, FALSE,
BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power10_ref, FALSE,
BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power10_ref, FALSE,
cntx
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8,
BLIS_VA_END
);
// Update the context with customized virtual [gemm]trsm micro-kernels.
bli_cntx_set_l3_vir_ukrs
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
8,
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power10_ref,
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power10_ref,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power10_ref,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power10_ref,
BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power10_ref,
BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power10_ref,
BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power10_ref,
BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power10_ref,
cntx
);
cntx,
// Update the context with optimized packm kernels.
bli_cntx_set_packm_kers
(
2,
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power10_ref,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power10_ref,
cntx
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_VA_END
);
// s d c z
@@ -131,14 +89,16 @@ void bli_cntx_init_power10( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,51 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
// MR x NR for the real datatypes (8 x 16 for s, 8 x 8 for d); these line up
// with the bli_sgemm_power10_mma_8x16 and bli_dgemm_power10_mma_8x8
// micro-kernel names.
#define BLIS_MR_s 8
#define BLIS_MR_d 8
#define BLIS_NR_s 16
#define BLIS_NR_d 8
// NOTE(review): BLIS_BBN_* is presumably the "broadcast B" replication factor
// consumed by the reference kernels; 4 (s) and 2 (d) mirror the "bb4"/"bb2"
// suffixes of the former packm_6xk_bb*_power10_ref kernels -- confirm.
#define BLIS_BBN_s 4
#define BLIS_BBN_d 2
//#endif

View File

@@ -43,13 +43,26 @@ void bli_cntx_init_power7( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
1,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -64,13 +77,16 @@ void bli_cntx_init_power7( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
// Only double-precision values are defined here. MR x NR = 8 x 4 lines up
// with the bli_dgemm_power7_int_8x4 micro-kernel name.
#define BLIS_MR_d 8
#define BLIS_NR_d 4
//#endif

View File

@@ -34,35 +34,6 @@
#include "blis.h"
// Instantiate prototypes for packm kernels.
PACKM_KER_PROT( float, s, packm_6xk_bb4_power9_ref )
PACKM_KER_PROT( double, d, packm_6xk_bb2_power9_ref )
// Instantiate prototypes for level-3 kernels.
GEMM_UKR_PROT( float, s, gemmbb_power9_ref )
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power9_ref )
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power9_ref )
TRSM_UKR_PROT( float, s, trsmbb_l_power9_ref )
TRSM_UKR_PROT( float, s, trsmbb_u_power9_ref )
GEMM_UKR_PROT( double, d, gemmbb_power9_ref )
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power9_ref )
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power9_ref )
TRSM_UKR_PROT( double, d, trsmbb_l_power9_ref )
TRSM_UKR_PROT( double, d, trsmbb_u_power9_ref )
GEMM_UKR_PROT( scomplex, c, gemmbb_power9_ref )
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power9_ref )
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power9_ref )
TRSM_UKR_PROT( scomplex, c, trsmbb_l_power9_ref )
TRSM_UKR_PROT( scomplex, c, trsmbb_u_power9_ref )
GEMM_UKR_PROT( dcomplex, z, gemmbb_power9_ref )
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power9_ref )
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power9_ref )
TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power9_ref )
TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power9_ref )
void bli_cntx_init_power9( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
@@ -72,50 +43,37 @@ void bli_cntx_init_power9( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
12,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemmbb_power9_ref, FALSE,
BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power9_ref, FALSE,
BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power9_ref, FALSE,
cntx,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE,
BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power9_ref, FALSE,
BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power9_ref, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power9_ref, FALSE,
BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power9_ref, FALSE,
BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power9_ref, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power9_ref, FALSE,
BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power9_ref, FALSE,
BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power9_ref, FALSE,
cntx
// level-3
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6,
BLIS_VA_END
);
// Update the context with customized virtual [gemm]trsm micro-kernels.
bli_cntx_set_l3_vir_ukrs
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
8,
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power9_ref,
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power9_ref,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power9_ref,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power9_ref,
BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power9_ref,
BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power9_ref,
BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power9_ref,
BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power9_ref,
cntx
);
cntx,
// Update the context with optimized packm kernels.
bli_cntx_set_packm_kers
(
2,
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power9_ref,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref,
cntx
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_VA_END
);
@@ -131,14 +89,15 @@ void bli_cntx_init_power9( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
BLIS_VA_END
);
}

View File

@@ -0,0 +1,49 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
// MR/NR are defined for double precision only. MR x NR = 12 x 6 lines up
// with the bli_dgemm_power9_asm_12x6 micro-kernel name.
#define BLIS_MR_d 12
#define BLIS_NR_d 6
// NOTE(review): BLIS_BBN_* is presumably the "broadcast B" replication factor
// consumed by the reference kernels; 4 (s) and 2 (d) mirror the "bb4"/"bb2"
// suffixes of the former packm_6xk_bb*_power9_ref kernels -- confirm.
#define BLIS_BBN_s 4
#define BLIS_BBN_d 2
//#endif

View File

@@ -43,16 +43,32 @@ void bli_cntx_init_sandybridge( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
4,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sandybridge_asm_8x4, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sandybridge_asm_8x4,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -67,13 +83,16 @@ void bli_cntx_init_sandybridge( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,52 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
// Compile-time MR/NR register blocksizes, one macro per datatype suffix
// (s, d, c, z).
// NOTE(review): the values coincide with the MRxNR shapes in the names of
// the sandybridge gemm micro-kernels (sgemm 8x8, dgemm 8x4, cgemm 8x4,
// zgemm 4x4); presumably the two must be kept in sync -- confirm.
#define BLIS_MR_s 8
#define BLIS_MR_d 8
#define BLIS_MR_c 8
#define BLIS_MR_z 4
#define BLIS_NR_s 8
#define BLIS_NR_d 4
#define BLIS_NR_c 4
#define BLIS_NR_z 4
//#endif

View File

@@ -43,39 +43,29 @@ void bli_cntx_init_skx( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
2,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, FALSE,
cntx
);
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14,
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
10,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
@@ -84,12 +74,15 @@ void bli_cntx_init_skx( cntx_t* cntx )
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
#if 0
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
@@ -98,7 +91,20 @@ void bli_cntx_init_skx( cntx_t* cntx )
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
cntx
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT , FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -116,17 +122,20 @@ void bli_cntx_init_skx( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 7,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,48 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
// Compile-time MR/NR register blocksizes; only the real datatypes (s, d) are
// overridden here.
// NOTE(review): the values coincide with the MRxNR shapes in the names of
// the skx gemm micro-kernels (sgemm 32x12, dgemm 16x14); presumably the two
// must be kept in sync -- confirm.
#define BLIS_MR_s 32
#define BLIS_MR_d 16
#define BLIS_NR_s 12
#define BLIS_NR_d 14
//#endif

View File

@@ -43,16 +43,32 @@ void bli_cntx_init_steamroller( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
4,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -67,13 +83,16 @@ void bli_cntx_init_steamroller( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,52 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
// Compile-time MR/NR register blocksizes, one macro per datatype suffix
// (s, d, c, z).
// NOTE(review): the values coincide with the MRxNR shapes in the names of
// the piledriver gemm micro-kernels registered for steamroller (sgemm 16x3,
// dgemm 8x3, cgemm 4x2, zgemm 2x2); presumably the two must be kept in
// sync -- confirm.
#define BLIS_MR_s 16
#define BLIS_MR_d 8
#define BLIS_MR_c 4
#define BLIS_MR_z 2
#define BLIS_NR_s 3
#define BLIS_NR_d 3
#define BLIS_NR_c 2
#define BLIS_NR_z 2
//#endif

View File

@@ -45,34 +45,44 @@ void bli_cntx_init_template( cntx_t* cntx )
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
bli_cntx_set_ukrs
(
5,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt, FALSE,
BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, FALSE,
BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, FALSE,
BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt, FALSE,
BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt, FALSE,
cntx
);
cntx,
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
// level-3
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt,
BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt,
BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt,
BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt,
BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt,
// level-1f
BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_template_noopt,
BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_template_noopt,
BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_template_noopt,
BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_template_noopt,
BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_template_noopt,
cntx
);
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
// level-1v
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_template_noopt,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_template_noopt,
cntx
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -87,13 +97,16 @@ void bli_cntx_init_template( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,60 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
//
// Only defined for block sizes which are not taken as the default (i.e. when
// an optimized kernel is provided).
//
// NOTE(review): only the dcomplex (z) values are overridden, which lines up
// with the template context registering only BLIS_DCOMPLEX kernels --
// confirm that this is the intended pairing.
#define BLIS_MR_z 4
#define BLIS_NR_z 4
//
// PACKMR/PACKNR do not need to be defined unless they are different from the
// "normal" MR/NR.
//
//#define BLIS_PACKMR_z 4
//#define BLIS_PACKNR_z 4
//#endif

View File

@@ -43,14 +43,28 @@ void bli_cntx_init_thunderx2( cntx_t* cntx )
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE,
cntx
cntx,
// level-3
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// level-3
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -65,13 +79,16 @@ void bli_cntx_init_thunderx2( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
BLIS_VA_END
);
}

View File

@@ -0,0 +1,48 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
// Compile-time MR/NR register blocksizes; only the real datatypes (s, d) are
// overridden here.
// NOTE(review): the values coincide with the MRxNR shapes in the names of
// the armv8a gemm micro-kernels registered for thunderx2 (sgemm 8x12,
// dgemm 6x8); presumably the two must be kept in sync -- confirm.
#define BLIS_MR_s 8
#define BLIS_MR_d 6
#define BLIS_NR_s 12
#define BLIS_NR_d 8
//#endif

View File

@@ -40,92 +40,107 @@
void bli_cntx_init_zen( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_zen_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
8,
cntx,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
cntx
);
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8,
#if 1
// Update the context with optimized packm kernels.
bli_cntx_set_packm_kers
(
8,
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
cntx
);
// gemmsup
BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m,
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n,
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
#if 0
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m,
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n,
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
#endif
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
#if 0
// NOTE: This set of kernels is likely broken and therefore disabled.
BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
#endif
// packm
BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
16,
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
#if 1
// copyv
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
@@ -136,25 +151,76 @@ void bli_cntx_init_zen( cntx_t* cntx )
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
#if 0
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
#if 1
// setv
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
// swapv
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// gemm
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
// gemmsup
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
#if 0
// NOTE: This set of kernels is likely broken and therefore disabled.
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
#endif
cntx
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -195,131 +261,74 @@ void bli_cntx_init_zen( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KT ], 440, 220, -1, -1 );
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1,
9, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, -1, -1 );
#if 0
bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, 3, 3,
9, 9, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, 72, 36 );
bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 512, 256, 128, 64 );
bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, 2040, 1020 );
#endif
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 7,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
// sup thresholds
BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
// gemmsup
BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP,
BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP,
BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP,
BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
BLIS_VA_END
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
#if 0
// Initialize the context with the sup handlers.
bli_cntx_set_l3_sup_handlers
(
1,
cntx,
BLIS_GEMM, bli_gemmsup_ref,
//BLIS_GEMMT, bli_gemmtsup_ref,
cntx
BLIS_VA_END
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
16,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
#if 0
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
#endif
#if 0
// NOTE: This set of kernels is likely broken and therefore disabled.
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
#endif
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
9, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 );
#if 0
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
9, 9, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 );
#endif
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}

View File

@@ -0,0 +1,52 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE(review): the include guard below (and its matching #endif at the end
// of this block) is commented out in the original, so this header is
// currently unguarded. The reason is not visible from here -- confirm before
// re-enabling it.
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
// Compile-time MR/NR register block sizes, one macro per datatype. The
// trailing letter is the datatype character: s, d, c, z (presumably float,
// double, scomplex and dcomplex, as elsewhere in BLIS -- verify).
// MR register block size for each datatype.
#define BLIS_MR_s 6
#define BLIS_MR_d 6
#define BLIS_MR_c 3
#define BLIS_MR_z 3
// NR register block size for each datatype.
#define BLIS_NR_s 16
#define BLIS_NR_d 8
#define BLIS_NR_c 8
#define BLIS_NR_z 4
//#endif

View File

@@ -38,73 +38,94 @@
void bli_cntx_init_zen2( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_zen2_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
8,
cntx,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8,
cntx
);
// level-3 sup
BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
#if 1
// Update the context with optimized packm kernels.
bli_cntx_set_packm_kers
(
8,
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
cntx
);
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m,
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n,
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
#if 0
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m,
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n,
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
#endif
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
#if 0
// NOTE: This set of kernels is likely broken and therefore disabled.
BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
#endif
// packm
BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
16,
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
@@ -127,18 +148,59 @@ void bli_cntx_init_zen2( cntx_t* cntx )
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
//swap
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
//copy
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
//set
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
cntx
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// gemm
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
// level-3 sup
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -158,130 +220,73 @@ void bli_cntx_init_zen2( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
#if 1
bli_blksz_init_easy( &blkszs[ BLIS_MT ], 500, 249, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NT ], 500, 249, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KT ], 500, 249, -1, -1 );
#else
bli_blksz_init_easy( &blkszs[ BLIS_MT ], 100000, 100000, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NT ], 100000, 100000, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KT ], 100000, 100000, -1, -1 );
#endif
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1,
9, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 168, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 4080, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 7,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
// sup thresholds
BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
// level-3 sup
BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NC_SUP,
BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KC_SUP,
BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MC_SUP,
BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
BLIS_VA_END
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
#if 1
bli_blksz_init_easy( &thresh[ BLIS_MT ], 500, 249, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 500, 249, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 500, 249, -1, -1 );
#else
bli_blksz_init_easy( &thresh[ BLIS_MT ], 100000, 100000, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 100000, 100000, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 100000, 100000, -1, -1 );
#endif
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
#if 0
// Initialize the context with the sup handlers.
bli_cntx_set_l3_sup_handlers
(
1,
cntx,
BLIS_GEMM, bli_gemmsup_ref,
cntx
//BLIS_GEMMT, bli_gemmtsup_ref,
BLIS_VA_END
);
#endif
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
16,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
#if 0
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
#endif
#if 0
// NOTE: This set of kernels is likely broken and therefore disabled.
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
#endif
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
9, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}

View File

@@ -0,0 +1,52 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE(review): the include guard below (and its matching #endif at the end
// of this block) is commented out in the original, so this header is
// currently unguarded. The reason is not visible from here -- confirm before
// re-enabling it.
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
// Compile-time MR/NR register block sizes, one macro per datatype. The
// trailing letter is the datatype character: s, d, c, z (presumably float,
// double, scomplex and dcomplex, as elsewhere in BLIS -- verify).
// MR register block size for each datatype.
#define BLIS_MR_s 6
#define BLIS_MR_d 6
#define BLIS_MR_c 3
#define BLIS_MR_z 3
// NR register block size for each datatype.
#define BLIS_NR_s 16
#define BLIS_NR_d 8
#define BLIS_NR_c 8
#define BLIS_NR_z 4
//#endif

View File

@@ -37,83 +37,106 @@
void bli_cntx_init_zen3( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_zen3_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
// Update the context with optimized native gemm micro-kernels.
bli_cntx_set_ukrs
(
8,
cntx,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
cntx
);
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8,
// gemmsup
#if 0
// AMD: This will be enabled in other PRs.
// packm kernels
bli_cntx_set_packm_kers
(
2,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
cntx
);
// AMD: This should be enabled in the PR which has added these kernels
// Update the context with optimized small/unpacked gemm kernels.
BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m,
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n,
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
#else
// Update the context with optimized packm kernels.
bli_cntx_set_packm_kers
(
8,
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
cntx
);
BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m,
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n,
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
#endif
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
// packm
#if 0
// AMD: This will be enabled in other PRs.
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
#else
BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
#endif
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
16,
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
@@ -135,19 +158,75 @@ void bli_cntx_init_zen3( cntx_t* cntx )
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
//swap
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
// swapv
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
//copy
// copyv
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
//set
// setv
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
cntx
BLIS_VA_END
);
// Update the context with storage preferences.
bli_cntx_set_ukr_prefs
(
cntx,
// gemm
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
// gemmsup
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
#if 0
// AMD: This should be enabled in the PR which has added these kernels
// Update the context with optimized small/unpacked gemm kernels.
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
#endif
BLIS_VA_END
);
// Initialize level-3 blocksize objects with architecture-specific values.
@@ -164,138 +243,67 @@ void bli_cntx_init_zen3( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NT ], 200, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KT ], 240, 220, -1, -1 );
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, 3, 3,
9, 9, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, 72, 36 );
bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 512, 256, 128, 64 );
bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, 2040, 1020 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 7,
cntx,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
// sup thresholds
BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
// gemmsup
BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP,
BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP,
BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP,
BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
BLIS_VA_END
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// -------------------------------------------------------------------------
#if 0
// Initialize the context with the sup handlers.
bli_cntx_set_l3_sup_handlers
(
2,
cntx,
BLIS_GEMM, bli_gemmsup_ref,
BLIS_GEMMT, bli_gemmtsup_ref,
cntx
//BLIS_GEMMT, bli_gemmtsup_ref,
BLIS_VA_END
);
#endif
#if 0
// AMD: This should be enabled in the PR which has added these kernels
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
28,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
cntx
);
#else
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
16,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
cntx
);
#endif
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
9, 9, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}

View File

@@ -0,0 +1,52 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_KERNEL_DEFS_H
//#define BLIS_KERNEL_DEFS_H

// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
//
// Compile-time MR/NR register blocksizes, one pair per datatype suffix:
//   s = single-precision real      d = double-precision real
//   c = single-precision complex   z = double-precision complex
// NOTE(review): these presumably need to agree with the MR/NR values that
// the sub-configuration registers at runtime in its bli_cntx_init function;
// confirm whenever either side is changed.
// The include guard around this block is currently disabled (commented out).

// single-precision real
#define BLIS_MR_s  6
#define BLIS_NR_s  16

// double-precision real
#define BLIS_MR_d  6
#define BLIS_NR_d  8

// single-precision complex
#define BLIS_MR_c  3
#define BLIS_NR_c  8

// double-precision complex
#define BLIS_MR_z  3
#define BLIS_NR_z  4

//#endif

View File

@@ -47,7 +47,7 @@ $ ls config/haswell
bli_cntx_init_haswell.c bli_family_haswell.h make_defs.mk
```
A sub-configuration (`haswell`, in this case) usually contains just three files:
* `bli_cntx_init_haswell.c`. This file contains the initialization function for a context targeting the hardware in question, in this case, Intel Haswell. A context, or `cntx_t` object, in BLIS encapsulates all of the hardware-specific information--including kernel function pointers and cache and register blocksizes--necessary to support all of the main computational operations in BLIS. The initialization function inside this file should be named the same as the filename (excluding `.c` suffix), which should begin with prefix `bli_cntx_init_` and end with the (lowercase) name of the sub-configuration. The context initialization function (in this case, `bli_cntx_init_haswell()`) is used internally by BLIS when setting up the global kernel structure--a mechanism for managing and supporting multiple microarchitectures simultaneously, so that the choice of which context to use can be deferred until the computation is ready to execute.
* `bli_cntx_init_haswell.c`. This file contains the initialization function for a context targeting the hardware in question, in this case, Intel Haswell. A context, or `cntx_t` object, in BLIS encapsulates all of the hardware-specific information--including kernel function pointers and cache and register blocksizes--necessary to support all of the main computational operations in BLIS. The initialization function inside this file should be named the same as the filename (excluding `.c` suffix), which should begin with prefix `bli_cntx_init_` and end with the (lowercase) name of the sub-configuration. The context initialization function (in this case, `bli_cntx_init_haswell()`) is used internally by BLIS when setting up the global kernel structure--a mechanism for managing and supporting multiple microarchitectures simultaneously, so that the choice of which context to use can be deferred until the computation is ready to execute.
* `bli_family_haswell.h`. This header file is `#included` when the configuration in question, in this case `haswell`, was the target to `./configure`. This is where you would specify certain global parameters and settings. For example, if you wanted to specify custom implementations of `malloc()` and `free()`, this is where you would specify them. The file is oftentimes empty. (In the case of configuration families, the definitions in this file apply to the _entire_ build, and not any specific sub-configuration, but for consistency we support them for all configuration targets, whether they be singleton sub-configurations or configuration families.)
* `make_defs.mk`. This makefile fragment defines the compiler and compiler flags to use during compilation. Specifically, the values defined in this file are used whenever compiling source code specific to the sub-configuration (i.e., reference kernels and optimized kernels). If the sub-configuration is the target of `configure`, then these flags are also used to compile general framework code.
@@ -127,7 +127,7 @@ void bli_cntx_init_fooarch( cntx_t* cntx )
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
5,
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
@@ -143,7 +143,7 @@ _**Blocksize object array.**_ The `blkszs` array declaration is needed later in
_**Reference initialization.**_ The first function call, `bli_cntx_init_fooarch_ref()`, initializes the context `cntx` with function pointers to reference implementations of all of the kernels supported by BLIS (as well as cache and register blocksizes, and other fields). This function is automatically generated by BLIS for every sub-configuration enabled at configure-time. The function prototype is generated by a preprocessor macro in `frame/include/bli_arch_config.h`.
_**Level-3 microkernels.**_ The second function call is to a variable argument function, `bli_cntx_set_l3_nat_ukrs()`, which updates `cntx` with five optimized double-precision real level-3 microkernels. The first argument encodes the number of individual kernels being registered into the context. Every subsequent line, except for the last line, is associated with the registration of a single kernel, and each of these lines is independent of one another and can occur in any order, provided that the kernel parameters of each line occur in the same order--kernel ID, followed by datatype, followed by function name, followed by storage preference boolean (i.e., whether the microkernel prefers row storage). The last argument of the function call is the address of the context being updated, `cntx`. Notice that we are registering microkernels written for another type of hardware, `bararch`, because in our hypothetical universe `bararch` is very similar to `fooarch` and so we recycle the code between the two configurations. After the function returns, the context contains pointers to optimized double-precision level-3 real microkernels. Note that the context will still contain reference microkernels for single-precision real and complex, and double-precision complex computation, as those kernels were not updated.
_**Level-3 microkernels.**_ The second function call is to a variable argument function, `bli_cntx_set_l3_nat_ukrs()`, which updates `cntx` with five optimized double-precision real level-3 microkernels. The first argument encodes the number of individual kernels being registered into the context. Every subsequent line, except for the last line, is associated with the registration of a single kernel, and each of these lines is independent of one another and can occur in any order, provided that the kernel parameters of each line occur in the same order--kernel ID, followed by datatype, followed by function name, followed by storage preference boolean (i.e., whether the microkernel prefers row storage). The last argument of the function call is the address of the context being updated, `cntx`. Notice that we are registering microkernels written for another type of hardware, `bararch`, because in our hypothetical universe `bararch` is very similar to `fooarch` and so we recycle the code between the two configurations. After the function returns, the context contains pointers to optimized double-precision level-3 real microkernels. Note that the context will still contain reference microkernels for single-precision real and complex, and double-precision complex computation, as those kernels were not updated.
_Note:_ Currently, BLIS only allows the kernel developer to signal a preference (row or column) for `gemm` microkernels. The preferences of the `gemmtrsm` and `trsm` microkernels can (and must) be set, but they are ignored by the framework during execution.
@@ -236,7 +236,7 @@ _**Memory alignment.**_ BLIS implements memory alignment internally, rather than
```
The value `BLIS_STACK_BUF_ALIGN_SIZE` defines the alignment of stack memory used as temporary internal buffers, such as for output matrices to the microkernel when computing edge cases. (See [implementation notes](KernelsHowTo#implementation-notes-for-gemm) for the `gemm` microkernel for details.) This value defaults to `BLIS_SIMD_ALIGN_SIZE`, which defaults to `BLIS_SIMD_MAX_SIZE`.
The value `BLIS_HEAP_ADDR_ALIGN_SIZE` defines the alignment used when allocating memory via the `malloc()` function defined by `BLIS_MALLOC_USER`. Setting this value to `BLIS_SIMD_ALIGN_SIZE` may speed up certain level-1v and -1f kernels.
The value `BLIS_HEAP_ADDR_ALIGN_SIZE` defines the alignment used when allocating memory via the `malloc()` function defined by `BLIS_MALLOC_USER`. Setting this value to `BLIS_SIMD_ALIGN_SIZE` may speed up certain level-1v and -1f kernels.
The value `BLIS_HEAP_STRIDE_ALIGN_SIZE` defines the alignment used for so-called "leading dimensions" (i.e. column strides for column-stored matrices, and row strides for row-stored matrices) when creating BLIS matrices via the object-based API (e.g. `bli_obj_create()`). While setting `BLIS_HEAP_ADDR_ALIGN_SIZE` guarantees alignment for the first column (or row), creating a matrix with certain dimension values (_m_ and _n_) may cause subsequent columns (or rows) to be misaligned. Setting this value to `BLIS_SIMD_ALIGN_SIZE` is usually desirable. Additional alignment may or may not be beneficial.
@@ -246,7 +246,7 @@ The value `BLIS_POOL_ADDR_ALIGN_SIZE_*` define the alignments used when allocati
### make_defs.mk
The `make_defs.mk` file primarily contains compiler and compiler flag definitions used by `make` when building a BLIS library.
The `make_defs.mk` file primarily contains compiler and compiler flag definitions used by `make` when building a BLIS library.
The format of the file is mostly self-explanatory. However, we will expound on the contents here, using the `make_defs.mk` file for the `haswell` configuration as an example:
```make
@@ -304,7 +304,7 @@ _**Debugging flags.**_ The `CDBGFLAGS` variable should be assigned to contain fl
_**Optimization flags.**_ The `COPTFLAGS` variable should be assigned any flags relating to general compiler optimization. Usually this takes the form of `-O2` or `-O3`, but more specific optimization flags may be included as well, such as `-fomit-frame-pointer`. Note that, as with `CDBGFLAGS`, `COPTFLAGS` is conditionally assigned based on the value of `$(DEBUG_TYPE)`. A separate `CKOPTFLAGS` variable tracks optimization flags used when compiling kernels. For most configurations, `CKOPTFLAGS` is assigned as a copy of `COPTFLAGS`, but if the kernel developer needs different optimization flags to be applied when compiling kernel source code, `CKOPTFLAGS` should be set accordingly.
_**Vectorization flags.**_ The second-to-last block sets the `CVECFLAGS`, which should be assigned any flags that must be given to the compiler in order to enable use of a vector instruction set needed or assumed by the kernel source code. Also, if you wish to enable automatic use of certain instruction sets (e.g. `-mfpmath=sse` for many Intel architectures), this is where you should set those flags. These flags often differ among compiler families, especially between `icc` and `gcc`/`clang`.
_**Vectorization flags.**_ The second-to-last block sets the `CVECFLAGS`, which should be assigned any flags that must be given to the compiler in order to enable use of a vector instruction set needed or assumed by the kernel source code. Also, if you wish to enable automatic use of certain instruction sets (e.g. `-mfpmath=sse` for many Intel architectures), this is where you should set those flags. These flags often differ among compiler families, especially between `icc` and `gcc`/`clang`.
_**Variable storage/renaming.**_ Finally, the last statement commits the variables defined in the file to "storage". That is, they are copied to variable names that contain `THIS_CONFIG` as a suffix. This allows the variables for one configuration to co-exist with variables of another configuration.
@@ -406,7 +406,7 @@ Some sub-configurations, for various reasons, do not rely on their own set of ke
excavator: excavator/piledriver
steamroller: steamroller/piledriver
```
Here, the first line (reading from left-to-right) defines the `excavator` singleton family as containing only itself, the `excavator` sub-configuration, and also specifies that this sub-configuration must have access to the `piledriver` kernel set. The second line defines the `steamroller` singleton family in a similar manner.
Here, the first line (reading from left-to-right) defines the `excavator` singleton family as containing only itself, the `excavator` sub-configuration, and also specifies that this sub-configuration must have access to the `piledriver` kernel set. The second line defines the `steamroller` singleton family in a similar manner.
**Note:** Specifying non-native kernel sets via the `/` character is only allowed when defining singleton configuration families. They may NOT appear in the definitions of umbrella families! When an umbrella family includes a singleton family that is defined to require non-native kernels, this will be accounted for during the parsing of the `config_registry` file.
@@ -467,7 +467,7 @@ configure: skx: skx
configure: steamroller: steamroller
configure: x86_64: haswell sandybridge penryn zen excavator steamroller piledriver bulldozer generic
```
This simply lists the sub-configurations associated with each defined configuration family (singleton or umbrella). Note that they are sorted alphabetically.
This simply lists the sub-configurations associated with each defined configuration family (singleton or umbrella). Note that they are sorted alphabetically.
Next, the kernel list (actually, all kernel lists) is printed:
```
@@ -549,7 +549,7 @@ Adding support for a new set of kernels in BLIS is easy and can be done via the
2. _**Add support within the framework source code.**_ We also need to make a minor update to the framework to support the new kernels--specifically, to pull in the kernels' function prototypes.
**`frame/include/bli_arch_config.h`**. When adding support for the `knl` kernel set to the framework, we must modify this file to `#include` the `bli_kernels_knl.h` header file:
**`frame/include/bli_arch_config.h`**. When adding support for the `knl` kernel set to the framework, we must modify this file to `#include` the `bli_kernels_knl.h` header file:
```c
#ifdef BLIS_KERNELS_KNL
#include "bli_kernels_knl.h"
@@ -560,7 +560,7 @@ Adding support for a new set of kernels in BLIS is easy and can be done via the
## Adding a new configuration family
Adding support for a new umbrella configuration family in BLIS is fairly straightforward and can be done via the following steps. The hypothetical examples used in these steps assume you are trying to create a new configuration family `intelavx` that supports only Intel microarchitectures that support the Intel AVX instruction set.
Adding support for a new umbrella configuration family in BLIS is fairly straightforward and can be done via the following steps. The hypothetical examples used in these steps assume you are trying to create a new configuration family `intelavx` that supports only Intel microarchitectures that support the Intel AVX instruction set.
@@ -636,7 +636,7 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f
```
THIS_CONFIG := knl
```
and while we're editing the file, we can make any other changes to compiler flags we wish (if any). Similarly, the `bli_family_knl.h` header file should be updated as needed. Since the number of vector registers and the vector register size on `knl` differ from the defaults, we must explicitly set them. (The role of these parameters was explained in a [previous section](ConfigurationHowTo.md#bli_family_h).) Furthermore, provided that a macro `BLIS_NO_HBWMALLOC` is not set, we use a different implementation of `malloc()` and `free()` and `#include` that implementation's header file.
and while we're editing the file, we can make any other changes to compiler flags we wish (if any). Similarly, the `bli_family_knl.h` header file should be updated as needed. Since the number of vector registers and the vector register size on `knl` differ from the defaults, we must explicitly set them. (The role of these parameters was explained in a [previous section](ConfigurationHowTo.md#bli_family_h).) Furthermore, provided that a macro `BLIS_NO_HBWMALLOC` is not set, we use a different implementation of `malloc()` and `free()` and `#include` that implementation's header file.
```c
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
#define BLIS_SIMD_MAX_SIZE 64
@@ -714,7 +714,7 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f
#include "bli_family_knl.h"
#endif
```
As before with umbrella families, the `BLIS_FAMILY_KNL` macro is automatically defined by the build system for whatever family was targeted by `configure`. (That is, if the user runs `./configure foobar`, the C preprocessor macro `BLIS_FAMILY_FOOBAR` will be defined.)
As before with umbrella families, the `BLIS_FAMILY_KNL` macro is automatically defined by the build system for whatever family was targeted by `configure`. (That is, if the user runs `./configure foobar`, the C preprocessor macro `BLIS_FAMILY_FOOBAR` will be defined.)

View File

@@ -61,15 +61,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
conjx, \
n, \
x, incx, \
y, incy, \
cntx \
conjx, \
n, \
x, incx, \
y, incy, \
cntx \
); \
}
@@ -98,14 +98,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
n, \
x, incx, \
index, \
cntx \
n, \
x, incx, \
index, \
cntx \
); \
}
@@ -135,17 +135,17 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
conjx, \
n, \
alpha, \
x, incx, \
beta, \
y, incy, \
cntx \
conjx, \
n, \
alpha, \
x, incx, \
beta, \
y, incy, \
cntx \
); \
}
@@ -175,16 +175,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \
if ( cntx == NULL ) \
cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
conjx, \
n, \
alpha, \
x, incx, \
y, incy, \
cntx \
conjx, \
n, \
alpha, \
x, incx, \
y, incy, \
cntx \
); \
}
@@ -215,17 +215,17 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
conjx, \
conjy, \
n, \
x, incx, \
y, incy, \
rho, \
cntx \
conjx, \
conjy, \
n, \
x, incx, \
y, incy, \
rho, \
cntx \
); \
}
@@ -257,19 +257,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
conjx, \
conjy, \
n, \
alpha, \
x, incx, \
y, incy, \
beta, \
rho, \
cntx \
conjx, \
conjy, \
n, \
alpha, \
x, incx, \
y, incy, \
beta, \
rho, \
cntx \
); \
}
@@ -295,13 +295,13 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
n, \
x, incx, \
cntx \
n, \
x, incx, \
cntx \
); \
}
@@ -329,15 +329,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
conjalpha, \
n, \
alpha, \
x, incx, \
cntx \
conjalpha, \
n, \
alpha, \
x, incx, \
cntx \
); \
}
@@ -365,14 +365,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
n, \
x, incx, \
y, incy, \
cntx \
n, \
x, incx, \
y, incy, \
cntx \
); \
}
@@ -400,16 +400,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
conjx, \
n, \
x, incx, \
beta, \
y, incy, \
cntx \
conjx, \
n, \
x, incx, \
beta, \
y, incy, \
cntx \
); \
}

View File

@@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
\
copyv_p \
( \

View File

@@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
\
copyv_p \
( \

View File

@@ -85,32 +85,33 @@ void PASTEMAC2(ch,opname,EX_SUF) \
\
if ( bli_is_nonunit_diag( diagx ) ) \
{ \
x1 = x + offx; \
y1 = y + offy; \
x1 = x + offx; \
y1 = y + offy; \
} \
else /* if ( bli_is_unit_diag( diagx ) ) */ \
{ \
/* Simulate a unit diagonal for x with a zero increment over a unit
scalar. */ \
x1 = PASTEMAC(ch,1); \
incx = 0; \
y1 = y + offy; \
/* Simulate a unit diagonal for x with a zero increment over a unit
scalar. */ \
x1 = PASTEMAC(ch,1); \
incx = 0; \
y1 = y + offy; \
} \
\
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
/* Query the context for the operation's kernel address. */ \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
/* Invoke the kernel with the appropriate parameters. */ \
f( \
conjx, \
n_elem, \
x1, incx, \
y1, incy, \
cntx \
); \
f \
( \
conjx, \
n_elem, \
x1, incx, \
y1, incy, \
cntx \
); \
}
INSERT_GENTFUNC_BASIC2( addd, addv, BLIS_ADDV_KER )
@@ -164,33 +165,34 @@ void PASTEMAC2(ch,opname,EX_SUF) \
\
if ( bli_is_nonunit_diag( diagx ) ) \
{ \
x1 = x + offx; \
y1 = y + offy; \
x1 = x + offx; \
y1 = y + offy; \
} \
else /* if ( bli_is_unit_diag( diagx ) ) */ \
{ \
/* Simulate a unit diagonal for x with a zero increment over a unit
scalar. */ \
x1 = PASTEMAC(ch,1); \
incx = 0; \
y1 = y + offy; \
/* Simulate a unit diagonal for x with a zero increment over a unit
scalar. */ \
x1 = PASTEMAC(ch,1); \
incx = 0; \
y1 = y + offy; \
} \
\
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
/* Query the context for the operation's kernel address. */ \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
/* Invoke the kernel with the appropriate parameters. */ \
f( \
conjx, \
n_elem, \
alpha, \
x1, incx, \
y1, incy, \
cntx \
); \
f \
( \
conjx, \
n_elem, \
alpha, \
x1, incx, \
y1, incy, \
cntx \
); \
}
INSERT_GENTFUNC_BASIC2( axpyd, axpyv, BLIS_AXPYV_KER )
@@ -233,20 +235,21 @@ void PASTEMAC2(ch,opname,EX_SUF) \
&offx, &n_elem, &incx \
); \
\
x1 = x + offx; \
x1 = x + offx; \
\
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
/* Query the context for the operation's kernel address. */ \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
/* Invoke the kernel with the appropriate parameters. */ \
f( \
n_elem, \
x1, incx, \
cntx \
); \
f \
( \
n_elem, \
x1, incx, \
cntx \
); \
}
INSERT_GENTFUNC_BASIC2( invertd, invertv, BLIS_INVERTV_KER )
@@ -290,22 +293,23 @@ void PASTEMAC2(ch,opname,EX_SUF) \
&offx, &n_elem, &incx \
); \
\
x1 = x + offx; \
x1 = x + offx; \
\
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
/* Query the context for the operation's kernel address. */ \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
/* Invoke the kernel with the appropriate parameters. */ \
f( \
conjalpha, \
n_elem, \
alpha, \
x1, incx, \
cntx \
); \
f \
( \
conjalpha, \
n_elem, \
alpha, \
x1, incx, \
cntx \
); \
}
INSERT_GENTFUNC_BASIC2( scald, scalv, BLIS_SCALV_KER )
@@ -361,27 +365,28 @@ void PASTEMAC2(ch,opname,EX_SUF) \
PASTEMAC(ch,setis)( *alpha, *chi11 ); \
} */ \
\
/* Acquire the addres of the imaginary component of the first element,
/* Acquire the address of the imaginary component of the first element,
and scale the increment for use in the real domain. Note that the
indexing into the imaginary field only needs to work for complex
datatypes since we return early for real domain types. */ \
x1 = ( ctype_r* )( x + offx ) + 1; \
x1 = ( ctype_r* )( x + offx ) + 1; \
incx = 2*incx; \
\
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
/* Query the context for the operation's kernel address. */ \
PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt_r, kerid, cntx ); \
PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt_r, kerid, cntx ); \
\
/* Invoke the kernel with the appropriate parameters. */ \
f( \
BLIS_NO_CONJUGATE, \
n_elem, \
alpha, \
x1, incx, \
cntx \
); \
f \
( \
BLIS_NO_CONJUGATE, \
n_elem, \
alpha, \
x1, incx, \
cntx \
); \
}
INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER )
@@ -424,22 +429,23 @@ void PASTEMAC2(ch,opname,EX_SUF) \
&offx, &n_elem, &incx \
); \
\
x1 = x + offx; \
x1 = x + offx; \
\
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
/* Query the context for the operation's kernel address. */ \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
/* Invoke the kernel with the appropriate parameters. */ \
f( \
BLIS_NO_CONJUGATE, \
n_elem, \
alpha, 0, \
x1, incx, \
cntx \
); \
f \
( \
BLIS_NO_CONJUGATE, \
n_elem, \
alpha, 0, \
x1, incx, \
cntx \
); \
}
INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER )
@@ -491,33 +497,34 @@ void PASTEMAC2(ch,opname,EX_SUF) \
\
if ( bli_is_nonunit_diag( diagx ) ) \
{ \
x1 = x + offx; \
y1 = y + offy; \
x1 = x + offx; \
y1 = y + offy; \
} \
else /* if ( bli_is_unit_diag( diagx ) ) */ \
{ \
/* Simulate a unit diagonal for x with a zero increment over a unit
scalar. */ \
x1 = PASTEMAC(ch,1); \
incx = 0; \
y1 = y + offy; \
/* Simulate a unit diagonal for x with a zero increment over a unit
scalar. */ \
x1 = PASTEMAC(ch,1); \
incx = 0; \
y1 = y + offy; \
} \
\
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
/* Query the context for the operation's kernel address. */ \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
/* Invoke the kernel with the appropriate parameters. */ \
f( \
conjx, \
n_elem, \
x1, incx, \
beta, \
y1, incy, \
cntx \
); \
f \
( \
conjx, \
n_elem, \
x1, incx, \
beta, \
y1, incy, \
cntx \
); \
}
INSERT_GENTFUNC_BASIC2( xpbyd, xpbyv, BLIS_XPBYV_KER )

View File

@@ -65,19 +65,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
conjx, \
conjy, \
n, \
alphax, \
alphay, \
x, incx, \
y, incy, \
z, incz, \
cntx \
conjx, \
conjy, \
n, \
alphax, \
alphay, \
x, incx, \
y, incy, \
z, incz, \
cntx \
); \
}
@@ -109,19 +109,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
conja, \
conjx, \
m, \
b_n, \
alpha, \
a, inca, lda, \
x, incx, \
y, incy, \
cntx \
conja, \
conjx, \
m, \
b_n, \
alpha, \
a, inca, lda, \
x, incx, \
y, incy, \
cntx \
); \
}
@@ -154,20 +154,20 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
conjxt, \
conjx, \
conjy, \
n, \
alpha, \
x, incx, \
y, incy, \
rho, \
z, incz, \
cntx \
conjxt, \
conjx, \
conjy, \
n, \
alpha, \
x, incx, \
y, incy, \
rho, \
z, incz, \
cntx \
); \
}
@@ -204,24 +204,24 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
conjat, \
conja, \
conjw, \
conjx, \
m, \
b_n, \
alpha, \
a, inca, lda, \
w, incw, \
x, incx, \
beta, \
y, incy, \
z, incz, \
cntx \
conjat, \
conja, \
conjw, \
conjx, \
m, \
b_n, \
alpha, \
a, inca, lda, \
w, incw, \
x, incx, \
beta, \
y, incy, \
z, incz, \
cntx \
); \
}
@@ -254,20 +254,20 @@ void PASTEMAC2(ch,opname,EX_SUF) \
/* Obtain a valid context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
f \
( \
conjat, \
conjx, \
m, \
b_n, \
alpha, \
a, inca, lda, \
x, incx, \
beta, \
y, incy, \
cntx \
conjat, \
conjx, \
m, \
b_n, \
alpha, \
a, inca, lda, \
x, incx, \
beta, \
y, incy, \
cntx \
); \
}

View File

@@ -102,35 +102,40 @@ INSERT_GENTDEF( packm_cxk )
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
( \
conj_t conjp, \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
ctype* restrict kappa, \
ctype* restrict p, inc_t ldp, \
ctype* restrict a, inc_t inca, inc_t lda, \
cntx_t* restrict cntx \
cntx_t* restrict cntx \
);
INSERT_GENTDEF( unpackm_cxk )
// packm_1er_ker
// packm_diag_ker
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
( \
struc_t struca, \
diag_t diaga, \
uplo_t uploa, \
conj_t conja, \
pack_t schema, \
bool invdiag, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
ctype* restrict kappa, \
ctype* restrict a, inc_t inca, inc_t lda, \
ctype* restrict p, inc_t ldp, \
cntx_t* restrict cntx \
cntx_t* restrict cntx \
);
INSERT_GENTDEF( packm_cxk_1er )
INSERT_GENTDEF( packm_cxc_diag )
#endif

View File

@@ -47,16 +47,8 @@
#undef GENTPROT
#define GENTPROT PACKM_KER_PROT
INSERT_GENTPROT_BASIC0( packm_2xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_3xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_4xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_6xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_8xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_10xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_12xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_14xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_16xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_24xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_mrxk_ker_name )
INSERT_GENTPROT_BASIC0( packm_nrxk_ker_name )
// native unpackm kernels
@@ -64,27 +56,33 @@ INSERT_GENTPROT_BASIC0( packm_24xk_ker_name )
#undef GENTPROT
#define GENTPROT UNPACKM_KER_PROT
INSERT_GENTPROT_BASIC0( unpackm_2xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_4xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_6xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_8xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_10xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_12xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_14xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_16xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_mrxk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_nrxk_ker_name )
// 1e/1r packm kernels
#undef GENTPROT
#define GENTPROT PACKM_1ER_KER_PROT
#define GENTPROT PACKM_KER_PROT
INSERT_GENTPROT_BASIC0( packm_2xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_4xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_6xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_8xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_10xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_12xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_14xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_16xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_mrxk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_nrxk_1er_ker_name )
// packm kernels for diagonal blocks
#undef GENTPROT
#define GENTPROT PACKM_DIAG_KER_PROT
INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_ker_name )
INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_ker_name )
// 1e/1r packm kernels for diagonal blocks
#undef GENTPROT
#define GENTPROT PACKM_DIAG_KER_PROT
INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_1er_ker_name )

View File

@@ -37,7 +37,7 @@
// Define template prototypes for level-1m kernels.
//
// native packm kernels
// packm kernels
#define PACKM_KER_PROT( ctype, ch, varname ) \
\
@@ -55,35 +55,40 @@ void PASTEMAC(ch,varname) \
);
// native unpackm kernels
// unpackm kernels
#define UNPACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
dim_t n, \
ctype* restrict kappa, \
ctype* restrict p, inc_t ldp, \
ctype* restrict a, inc_t inca, inc_t lda, \
cntx_t* restrict cntx \
);
// 1e/1r packm kernels
#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
ctype* restrict kappa, \
ctype* restrict p, inc_t ldp, \
ctype* restrict a, inc_t inca, inc_t lda, \
cntx_t* restrict cntx \
);
// packm kernels for diagonal blocks
#define PACKM_DIAG_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t struca, \
diag_t diaga, \
uplo_t uploa, \
conj_t conja, \
pack_t schema, \
bool invdiag, \
dim_t cdim, \
dim_t n_max, \
ctype* restrict kappa, \
ctype* restrict a, inc_t inca, inc_t lda, \
ctype* restrict p, inc_t ldp, \
cntx_t* restrict cntx \
cntx_t* restrict cntx \
);

View File

@@ -80,7 +80,7 @@ void PASTEMAC(ch,opname) \
conjx = bli_extract_conj( transx ); \
\
/* Query the kernel needed for this operation. */ \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
/* Handle dense and upper/lower storage cases separately. */ \
if ( bli_is_dense( uplox_eff ) ) \
@@ -197,7 +197,7 @@ void PASTEMAC(ch,opname) \
conjx = bli_extract_conj( transx ); \
\
/* Query the kernel needed for this operation. */ \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
/* Handle dense and upper/lower storage cases separately. */ \
if ( bli_is_dense( uplox_eff ) ) \
@@ -310,7 +310,7 @@ void PASTEMAC(ch,opname) \
if ( bli_is_zeros( uplox_eff ) ) return; \
\
/* Query the kernel needed for this operation. */ \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
/* Handle dense and upper/lower storage cases separately. */ \
if ( bli_is_dense( uplox_eff ) ) \
@@ -423,7 +423,7 @@ void PASTEMAC(ch,opname) \
conjx = bli_extract_conj( transx ); \
\
/* Query the kernel needed for this operation. */ \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
\
/* Handle dense and upper/lower storage cases separately. */ \
if ( bli_is_dense( uplox_eff ) ) \

View File

@@ -54,15 +54,16 @@ void PASTEMAC(ch,opname) \
/* Note that the packm kernel is selected by the packing schema (the
NRxk kernel for column-packed panels, the MRxk kernel otherwise), not
by panel_dim. This means that we always use the same kernel, even for
edge cases. */ \
num_t dt = PASTEMAC(ch,type); \
l1mkr_t ker_id = panel_dim_max; \
num_t dt = PASTEMAC(ch,type); \
ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \
: BLIS_PACKM_MRXK_KER; \
\
PASTECH2(ch,opname,_ker_ft) f; \
\
/* Query the context for the packm kernel corresponding to the current
panel dimension, or kernel id. If the id is invalid, the function will
return NULL. */ \
f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
\
/* If there exists a kernel implementation for the micro-panel dimension
provided, we invoke the implementation. Otherwise, we use scal2m. */ \
@@ -91,30 +92,30 @@ void PASTEMAC(ch,opname) \
that happens, the packm kernel must have set the 0's added in
step (3) below.
packm kernel packm kernel packm kernel packm_tri_cxk
packm kernel packm kernel packm kernel packm_tri_cxk
step 1: step 2: step 3: step 4:
x x x x . . x x x x . . x x x x 0 0 x x x x 0 0
? x x x . . ? x x x . . ? x x x 0 0 ? x x x 0 0
? ? x x . . -> ? ? x x . . -> ? ? x x 0 0 -> ? ? x x 0 0
? ? ? x . . ? ? ? x . . ? ? ? x 0 0 ? ? ? x 0 0
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
x x x x . . x x x x . . x x x x 0 0 x x x x 0 0
? x x x . . ? x x x . . ? x x x 0 0 ? x x x 0 0
? ? x x . . -> ? ? x x . . -> ? ? x x 0 0 -> ? ? x x 0 0
? ? ? x . . ? ? ? x . . ? ? ? x 0 0 ? ? ? x 0 0
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
x Copied from A; valid element.
? Copied from A, but value is unknown and unused.
? Copied from A, but value is unknown and unused.
. Uninitialized.
0 Initialized to zero.
1 Initialized to one.
0 Initialized to zero.
1 Initialized to one.
NOTE: In step 5 (not shown), bli_packm_tri_cxk() sets the ?'s
to zero. This is not needed to support trsm, but rather to
support trmm. (Both use the same packing format and code.)
In this case, panel_dim will be 4 because four rows of data are
copied from A, panel_len will be 4 because those four rows span
four columns of A, and panel_len_max will be 6 because there are a
total of 6 columns that can be written to in the packed micropanel,
In this case, panel_dim will be 4 because four rows of data are
copied from A, panel_len will be 4 because those four rows span
four columns of A, and panel_len_max will be 6 because there are a
total of 6 columns that can be written to in the packed micropanel,
2 of which lie beyond the values copied from A. */ \
f \
( \

View File

@@ -54,15 +54,16 @@ void PASTEMAC(ch,opname) \
/* Note that the packm kernel is selected by the packing schema (the
NRxk 1e/1r kernel for column-packed panels, the MRxk 1e/1r kernel
otherwise), not by panel_dim. This means that we always use the same
kernel, even for edge cases. */ \
num_t dt = PASTEMAC(ch,type); \
l1mkr_t ker_id = panel_dim_max; \
num_t dt = PASTEMAC(ch,type); \
ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER \
: BLIS_PACKM_MRXK_1ER_KER; \
\
PASTECH2(ch,opname,_ker_ft) f; \
\
/* Query the context for the packm kernel corresponding to the current
panel dimension, or kernel id. If the id is invalid, the function will
return NULL. */ \
f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
\
/* If there exists a kernel implementation for the micro-panel dimension
provided, we invoke the implementation. Otherwise, we use scal2m. */ \

View File

@@ -40,6 +40,7 @@
void PASTEMAC(ch,opname) \
( \
conj_t conjp, \
pack_t schema, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* kappa, \
@@ -48,15 +49,16 @@ void PASTEMAC(ch,opname) \
cntx_t* cntx \
) \
{ \
num_t dt = PASTEMAC(ch,type); \
l1mkr_t ker_id = panel_dim; \
num_t dt = PASTEMAC(ch,type); \
ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER \
: BLIS_UNPACKM_MRXK_KER; \
\
PASTECH2(ch,opname,_ker_ft) f; \
\
/* Query the context for the unpackm kernel corresponding to the current
panel dimension, or kernel id. If the id is invalid, the function will
return NULL. */ \
f = bli_cntx_get_unpackm_ker_dt( dt, ker_id, cntx ); \
f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
\
/* If there exists a kernel implementation for the micro-panel dimension
provided, we invoke the implementation. Otherwise, we use scal2m. */ \

View File

@@ -39,6 +39,7 @@
void PASTEMAC(ch,varname) \
( \
conj_t conjp, \
pack_t schema, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* kappa, \

View File

@@ -43,10 +43,6 @@
#include "bli_packm_part.h"
#include "bli_packm_struc_cxk.h"
#include "bli_packm_struc_cxk_1er.h"
#include "bli_packm_cxk.h"
#include "bli_packm_cxk_1er.h"
// Mixed datatype support.
#ifdef BLIS_ENABLE_GEMM_MD

View File

@@ -43,11 +43,11 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
// 0001 row/col panels: 1m-expanded (1e)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
{ { NULL, bli_cpackm_struc_cxk,
NULL, bli_zpackm_struc_cxk, } },
// 0010 row/col panels: 1m-reordered (1r)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
{ { NULL, bli_cpackm_struc_cxk,
NULL, bli_zpackm_struc_cxk, } },
};
static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);

View File

@@ -34,8 +34,8 @@
#include "blis.h"
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, cxk_kername, cxc_kername ) \
\
void PASTEMAC(ch,varname) \
( \
@@ -58,460 +58,249 @@ void PASTEMAC(ch,varname) \
cntx_t* cntx \
) \
{ \
/* Handle micro-panel packing based on the structure of the matrix
being packed. */ \
if ( bli_is_general( strucc ) ) \
{ \
/* For micro-panels of general matrices, we can call the pack
kernel front-end directly. */ \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_len_max, \
kappa, \
c, incc, ldc, \
p, ldp, \
cntx \
); \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* Call a helper function for micro-panels of Hermitian/symmetric
matrices. */ \
PASTEMAC(ch,packm_herm_cxk) \
( \
strucc, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
panel_dim, \
panel_len, \
panel_dim_max, \
panel_len_max, \
panel_dim_off, \
panel_len_off, \
kappa, \
c, incc, ldc, \
p, ldp, \
is_p, \
cntx \
); \
} \
else /* ( bli_is_triangular( strucc ) ) */ \
{ \
/* Call a helper function for micro-panels of triangular
matrices. */ \
PASTEMAC(ch,packm_tri_cxk) \
( \
strucc, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
panel_dim, \
panel_len, \
panel_dim_max, \
panel_len_max, \
panel_dim_off, \
panel_len_off, \
kappa, \
c, incc, ldc, \
p, ldp, \
is_p, \
cntx \
); \
} \
}
INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
num_t dt = PASTEMAC(ch,type); \
num_t dt_r = PASTEMAC(chr,type); \
dim_t panel_len_pad = panel_len_max - panel_len; \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
) \
{ \
bszid_t bsz_id = bli_is_col_packed( schema ) ? BLIS_NR : BLIS_MR; \
dim_t packmrnr = bli_cntx_get_blksz_max_dt( dt, bsz_id, cntx ); \
dim_t packmrnr_r = bli_cntx_get_blksz_max_dt( dt_r, bsz_id, cntx ); \
\
ukr_t cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \
: BLIS_PACKM_MRXK_KER; \
ukr_t cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_KER \
: BLIS_PACKM_MRXMR_DIAG_KER; \
\
if ( bli_is_1m_packed( schema ) ) \
{ \
cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER \
: BLIS_PACKM_MRXK_1ER_KER; \
cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_1ER_KER \
: BLIS_PACKM_MRXMR_DIAG_1ER_KER; \
} \
\
PASTECH2(ch,cxk_kername,_ker_ft) f_cxk = bli_cntx_get_ukr_dt( dt, cxk_ker_id, cntx ); \
PASTECH2(ch,cxc_kername,_ker_ft) f_cxc = bli_cntx_get_ukr_dt( dt, cxc_ker_id, cntx ); \
\
/* For general matrices, pack and return early */ \
if ( bli_is_general( strucc ) ) \
{ \
f_cxk \
( \
conjc, \
schema, \
panel_dim, \
panel_len, \
panel_len_max, \
kappa, \
c, incc, ldc, \
p, ldp, \
cntx \
); \
return; \
} \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then the constraint that cache
blocksizes be whole multiples of the register blocksizes was
somehow violated. */ \
doff_t diagoffc = panel_dim_off - panel_len_off; \
doff_t diagoffc_abs; \
dim_t i, j; \
if ( ( -panel_dim < diagoffc && diagoffc < 0 ) || \
( panel_len-panel_dim < diagoffc && diagoffc < panel_len ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
/* Handle the case where the micro-panel does NOT intersect the
diagonal separately from the case where it does intersect. */ \
if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \
/* For triangular, symmetric, and hermitian matrices we need to consider
three parts. */ \
\
/* Pack to p10. */ \
if ( 0 < diagoffc ) \
{ \
/* If the current panel is unstored, we need to make a few
adjustments so we refer to the data where it is actually
stored, also taking conjugation into account. (Note this
implicitly assumes we are operating on a dense panel
within a larger symmetric or Hermitian matrix, since a
general matrix would not contain any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \
dim_t p10_dim = panel_dim; \
dim_t p10_len = bli_min( diagoffc, panel_len ); \
dim_t p10_len_max = p10_len == panel_len ? panel_len_max : p10_len; \
ctype* p10 = p; \
conj_t conjc10 = conjc; \
ctype* c10 = c; \
inc_t incc10 = incc; \
inc_t ldc10 = ldc; \
\
if ( bli_is_upper( uploc ) ) \
{ \
c = c + diagoffc * ( doff_t )ldc + \
-diagoffc * ( doff_t )incc; \
bli_swap_incs( &incc, &ldc ); \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc ); \
} \
\
/* Pack the full panel. */ \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_len_max, \
kappa, \
c, incc, ldc, \
p, ldp, \
cntx \
); \
} \
else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \
{ \
ctype* restrict c10; \
ctype* restrict p10; \
dim_t p10_dim, p10_len; \
inc_t incc10, ldc10; \
doff_t diagoffc10; \
conj_t conjc10; \
\
ctype* restrict c12; \
ctype* restrict p12; \
dim_t p12_dim, p12_len; \
inc_t incc12, ldc12; \
doff_t diagoffc12; \
conj_t conjc12; \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( diagoffc < 0 ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
if ( bli_is_lower( uploc ) ) \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
p10 = p; \
c10 = c; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
diagoffc12 = diagoffc_abs - j; \
p12 = p + (j )*ldp; \
c12 = c + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )ldc + \
-diagoffc12 * ( doff_t )incc; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc12 ); \
} \
else /* if ( bli_is_upper( uploc ) ) */ \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
diagoffc10 = diagoffc; \
p10 = p; \
c10 = c; \
c10 = c10 + diagoffc10 * ( doff_t )ldc + \
-diagoffc10 * ( doff_t )incc; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p + (j )*ldp; \
c12 = c + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
bli_reflect_to_stored_part( diagoffc, c10, incc10, ldc10 ); \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc10 ); \
} \
\
/* Pack to p10. For upper storage, this includes the unstored
triangle of c11. */ \
/* NOTE: Since we're only packing partial panels here, we pass in
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
fill the columns up to panel_len_max, which is not what we need
or want to happen. */ \
PASTEMAC(ch,kername) \
( \
conjc10, \
schema, \
p10_dim, \
panel_dim_max, \
p10_len, \
p10_len, \
kappa, \
c10, incc10, ldc10, \
p10, ldp, \
cntx \
); \
\
/* Pack to p12. For lower storage, this includes the unstored
triangle of c11. */ \
/* NOTE: Since we're only packing partial panels here, we pass in
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
fill the columns up to panel_len_max, which is not what we need
or want to happen. */ \
PASTEMAC(ch,kername) \
( \
conjc12, \
schema, \
p12_dim, \
panel_dim_max, \
p12_len, \
p12_len, \
kappa, \
c12, incc12, ldc12, \
p12, ldp, \
cntx \
); \
\
/* Pack the stored triangle of c11 to p11. */ \
/* If we are referencing the unstored part of a triangular matrix,
explicitly store zeros */ \
if ( bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) \
{ \
dim_t p11_m = panel_dim; \
dim_t p11_n = panel_dim; \
dim_t j2 = diagoffc_abs; \
ctype* restrict c11 = c + (j2 )*ldc; \
ctype* restrict p11 = p + (j2 )*ldp; \
trans_t transc = ( trans_t )conjc; \
\
PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
( \
0, \
BLIS_NONUNIT_DIAG, \
uploc, \
transc, \
p11_m, \
p11_n, \
c11, incc, ldc, \
p11, 1, ldp, \
cntx, \
NULL \
); \
\
/* If source matrix c is Hermitian, we have to zero out the
imaginary components of the diagonal of p11 in case the
corresponding elements in c11 were not already zero. */ \
if ( bli_is_hermitian( strucc ) ) \
if ( bli_is_1m_packed( schema ) ) \
{ \
ctype* restrict pi11 = p11; \
ctype_r* restrict zero = PASTEMAC(chr,0); \
\
for ( i = 0; i < p11_m; ++i ) \
{ \
PASTEMAC(ch,seti0s)( *pi11 ); \
\
pi11 += 1 + ldp; \
} \
PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
packmrnr_r, \
p10_len_max * 2, \
zero, \
( ctype_r* )p10, 1, ldp, \
cntx, \
NULL \
); \
} \
else \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
\
/* Now that the diagonal has been made explicitly Hermitian
(if applicable), we can now safely scale the stored
triangle specified by uploc. */ \
PASTEMAC2(ch,scalm,BLIS_TAPI_EX_SUF) \
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
packmrnr, \
p10_len_max, \
zero, \
p10, 1, ldp, \
cntx, \
NULL \
); \
} \
} \
else \
{ \
f_cxk \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
uploc, \
p11_m, \
p11_n, \
conjc10, \
schema, \
p10_dim, \
p10_len, \
p10_len_max, \
kappa, \
p11, 1, ldp, \
cntx, \
NULL \
c10, incc10, ldc10, \
p10, ldp, \
cntx \
); \
} \
} \
\
/* Pack to p11. */ \
if ( 0 <= diagoffc && diagoffc + panel_dim <= panel_len ) \
{ \
dim_t i = diagoffc; \
dim_t p11_dim = panel_dim; \
dim_t p11_len_max = panel_dim + ( diagoffc + panel_dim == panel_len \
? panel_len_pad : 0 ); \
ctype* p11 = p + i * ldp; \
conj_t conjc11 = conjc; \
ctype* c11 = c + i * ldc; \
inc_t incc11 = incc; \
inc_t ldc11 = ldc; \
\
f_cxc \
( \
strucc, \
diagc, \
uploc, \
conjc11, \
schema, \
invdiag, \
p11_dim, \
p11_len_max, \
kappa, \
c11, incc11, ldc11, \
p11, ldp, \
cntx \
); \
} \
\
/* Pack to p12. */ \
if ( diagoffc + panel_dim < panel_len ) \
{ \
dim_t i = bli_max( 0, diagoffc + panel_dim ); \
dim_t p12_dim = panel_dim; \
dim_t p12_len = panel_len - i; \
/* If we are packing p12, then it is always the last partial block \
and so we should make sure to pad with zeros if necessary. */ \
dim_t p12_len_max = p12_len + panel_len_pad; \
ctype* p12 = p + i * ldp; \
conj_t conjc12 = conjc; \
ctype* c12 = c + i * ldc; \
inc_t incc12 = incc; \
inc_t ldc12 = ldc; \
\
if ( bli_is_lower( uploc ) ) \
{ \
bli_reflect_to_stored_part( diagoffc - i, c12, incc12, ldc12 ); \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc12 ); \
} \
\
/* If we are referencing the unstored part of a triangular matrix,
explicitly store zeros */ \
if ( bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) \
{ \
if ( bli_is_1m_packed( schema ) ) \
{ \
ctype_r* restrict zero = PASTEMAC(chr,0); \
\
PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
packmrnr_r, \
p12_len_max * 2, \
zero, \
( ctype_r* )p12, 1, ldp, \
cntx, \
NULL \
); \
} \
else \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
\
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
packmrnr, \
p12_len_max, \
zero, \
p12, 1, ldp, \
cntx, \
NULL \
); \
} \
} \
else \
{ \
f_cxk \
( \
conjc12, \
schema, \
p12_dim, \
p12_len, \
p12_len_max, \
kappa, \
c12, incc12, ldc12, \
p12, ldp, \
cntx \
); \
} \
} \
}
INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
) \
{ \
doff_t diagoffc = panel_dim_off - panel_len_off; \
\
/* Pack the panel. */ \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_len_max, \
kappa, \
c, incc, ldc, \
p, ldp, \
cntx \
); \
\
\
/* If the diagonal of c is implicitly unit, explicitly set the
the diagonal of the packed panel to kappa. */ \
if ( bli_is_unit_diag( diagc ) ) \
{ \
PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
diagoffc, \
panel_dim, \
panel_len, \
kappa, \
p, 1, ldp, \
cntx, \
NULL \
); \
} \
\
/* If requested, invert the diagonal of the packed panel. */ \
if ( invdiag == TRUE ) \
{ \
PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \
( \
diagoffc, \
panel_dim, \
panel_len, \
p, 1, ldp, \
cntx, \
NULL \
); \
} \
\
/* Set the region opposite the diagonal of p to zero. To do this,
we need to reference the "unstored" region on the other side of
the diagonal. This amounts to toggling uploc and then shifting
the diagonal offset to shrink the newly referenced region (by
one diagonal). Note that this zero-filling is not needed for
trsm, since the unstored region is not referenced by the trsm
micro-kernel; however, zero-filling is needed for trmm, which
uses the gemm micro-kernel.*/ \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
uplo_t uplop = uploc; \
\
bli_toggle_uplo( &uplop ); \
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \
\
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
diagoffc, \
BLIS_NONUNIT_DIAG, \
uplop, \
panel_dim, \
panel_len, \
zero, \
p, 1, ldp, \
cntx, \
NULL \
); \
} \
\
/* If this panel is an edge case in both panel dimension and length,
then it must be a bottom-right corner case. Set the part of the
diagonal that extends into the zero-padded region to identity.
NOTE: This is actually only necessary when packing for trsm, as
it helps prevent NaNs and Infs from creeping into the computation.
However, we set the region to identity for trmm as well. Those
1.0's end up getting muliplied by the 0.0's in the zero-padded
region of the other matrix, so there is no harm in this. */ \
if ( panel_dim != panel_dim_max && \
panel_len != panel_len_max ) \
{ \
ctype* restrict one = PASTEMAC(ch,1); \
dim_t i = panel_dim; \
dim_t j = panel_len; \
dim_t m_br = panel_dim_max - i; \
dim_t n_br = panel_len_max - j; \
ctype* p_br = p + (i ) + (j )*ldp; \
\
PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
0, \
m_br, \
n_br, \
one, \
p_br, 1, ldp, \
cntx, \
NULL \
); \
} \
}
INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk )
INSERT_GENTFUNCR_BASIC2( packm_struc_cxk, packm_cxk, packm_cxc_diag )

View File

@@ -37,5 +37,3 @@
#include "bli_unpackm_int.h"
#include "bli_unpackm_blk_var1.h"
#include "bli_unpackm_cxk.h"

View File

@@ -36,21 +36,22 @@
#define FUNCPTR_T unpackm_fp
typedef void (*FUNCPTR_T)(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
dim_t m,
dim_t n,
dim_t m_panel,
dim_t n_panel,
void* p, inc_t rs_p, inc_t cs_p,
dim_t pd_p, inc_t ps_p,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
);
typedef void (*FUNCPTR_T)
(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
dim_t m,
dim_t n,
dim_t m_panel,
dim_t n_panel,
void* p, inc_t rs_p, inc_t cs_p,
dim_t pd_p, inc_t ps_p,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
);
static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1);
@@ -152,10 +153,10 @@ void PASTEMAC(ch,varname) \
dim_t iter_dim; \
dim_t num_iter; \
dim_t it, ic, ip; \
dim_t ic0, ip0; \
dim_t ic0, ip0; \
doff_t ic_inc, ip_inc; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
dim_t panel_len; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
@@ -164,6 +165,7 @@ void PASTEMAC(ch,varname) \
inc_t ldp; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
pack_t schema; \
\
\
/* If c needs a transposition, induce it so that we can more simply
@@ -182,6 +184,7 @@ void PASTEMAC(ch,varname) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to unpack from column panels. */ \
schema = BLIS_PACKED_COL_PANELS; \
iter_dim = n; \
panel_len = m; \
panel_dim_max = pd_p; \
@@ -196,6 +199,7 @@ void PASTEMAC(ch,varname) \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to unpack from row panels. */ \
schema = BLIS_PACKED_ROW_PANELS; \
iter_dim = m; \
panel_len = n; \
panel_dim_max = pd_p; \
@@ -207,6 +211,14 @@ void PASTEMAC(ch,varname) \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
} \
\
num_t dt = PASTEMAC(ch,type); \
ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER \
: BLIS_UNPACKM_MRXK_KER; \
\
/* Query the context for the unpackm kernel corresponding to the current
panel dimension, or kernel id. */ \
PASTECH2(ch,unpackm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
\
/* Compute the total number of iterations we'll need. */ \
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
@@ -253,9 +265,10 @@ void PASTEMAC(ch,varname) \
else \
{ \
/* Unpack the current panel. */ \
PASTEMAC(ch,unpackm_cxk) \
f \
( \
BLIS_NO_CONJUGATE, \
schema, \
panel_dim_i, \
panel_len, \
one, \

View File

@@ -70,7 +70,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
\
/* Query the context for the kernel function pointer. */ \
kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
\
for ( i = 0; i < n_iter; ++i ) \
{ \

View File

@@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
/* Query the context for the kernel function pointer. */ \
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
for ( i = 0; i < n_iter; ++i ) \
{ \

View File

@@ -71,7 +71,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,dotxf_ker_ft) kfp_df; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
\
for ( i = 0; i < n_iter; i += f ) \

View File

@@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,axpyf_ker_ft) kfp_af; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
\
for ( i = 0; i < n_iter; i += f ) \

View File

@@ -61,7 +61,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
/* Query the context for the kernel function pointer. */ \
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \

View File

@@ -61,7 +61,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
/* Query the context for the kernel function pointer. */ \
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
for ( j = 0; j < n; ++j ) \
{ \

View File

@@ -122,8 +122,8 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
\
/* Query the context for the kernel function pointers. */ \
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \

View File

@@ -123,7 +123,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
\
/* Query the context for the kernel function pointer. */ \
kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \

View File

@@ -122,8 +122,8 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
\
/* Query the context for the kernel function pointers. */ \
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \

View File

@@ -122,7 +122,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
/* Query the context for the kernel function pointers. */ \
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \

View File

@@ -130,7 +130,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
\
for ( i = 0; i < m; i += f ) \

View File

@@ -121,7 +121,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \
\
/* Query the context for the kernel function pointer. */ \
kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \

View File

@@ -130,7 +130,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
\
for ( i = 0; i < m; i += f ) \

View File

@@ -121,7 +121,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \
\
/* Query the context for the kernel function pointer. */ \
kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \

View File

@@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
/* Query the context for the kernel function pointer. */ \
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \

View File

@@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
/* Query the context for the kernel function pointer. */ \
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \

View File

@@ -106,7 +106,7 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
/* Query the context for the kernel function pointer. */ \
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \

Some files were not shown because too many files have changed in this diff Show More