diff --git a/addon/gemmd/attic/bao_gemmd_bp_var2.c b/addon/gemmd/attic/bao_gemmd_bp_var2.c index a0040fec0..9139e89b1 100644 --- a/addon/gemmd/attic/bao_gemmd_bp_var2.c +++ b/addon/gemmd/attic/bao_gemmd_bp_var2.c @@ -164,7 +164,7 @@ void PASTECH2(bao_,ch,varname) \ function pointer type. */ \ /* PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ */ \ \ /* Temporary C buffer for edge cases. Note that the strides of this @@ -175,7 +175,7 @@ void PASTECH2(bao_,ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ @@ -536,7 +536,7 @@ void PASTECH2(bao_,ch,varname) \ /* Query the context for the microkernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -545,7 +545,7 @@ void PASTECH2(bao_,ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/addon/gemmd/bao_gemmd.c b/addon/gemmd/bao_gemmd.c index fadc52691..01185a9d7 100644 --- a/addon/gemmd/bao_gemmd.c +++ b/addon/gemmd/bao_gemmd.c @@ -137,7 +137,7 @@ void bao_gemmd_ex // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c index 09e4df09e..689471367 100644 --- a/addon/gemmd/bao_gemmd_bp_var1.c +++ b/addon/gemmd/bao_gemmd_bp_var1.c @@ -163,7 +163,7 @@ void PASTECH2(bao_,ch,varname) \ /* Query the context for the microkernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Compute partitioning step values for each matrix of each loop. */ \ const inc_t jcstep_c = cs_c; \ diff --git a/addon/gemmd/bao_packm_cxk.c b/addon/gemmd/bao_packm_cxk.c index 645f09d79..8680c5332 100644 --- a/addon/gemmd/bao_packm_cxk.c +++ b/addon/gemmd/bao_packm_cxk.c @@ -55,15 +55,15 @@ void PASTECH2(bao_,ch,opname) \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim_max; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. 
*/ \ - f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ diff --git a/common.mk b/common.mk index 5f2d30c9b..a93f8ab24 100644 --- a/common.mk +++ b/common.mk @@ -120,6 +120,8 @@ get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ -DBLIS_CNAME=$(1) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ + -DBLIS_IN_REF_KERNEL=1 \ + -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ ) get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ @@ -129,6 +131,8 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ -DBLIS_CNAME=$(1) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ + -DBLIS_IN_REF_KERNEL=1 \ + -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ ) get-config-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ diff --git a/config/a64fx/bli_cntx_init_a64fx.c b/config/a64fx/bli_cntx_init_a64fx.c index 5132b2824..dd920bcec 100644 --- a/config/a64fx/bli_cntx_init_a64fx.c +++ b/config/a64fx/bli_cntx_init_a64fx.c @@ -38,34 +38,42 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_a64fx_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels and packm kernels. 
+ bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, + + // packm + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, + + BLIS_VA_END ); - // Set SVE-512 packing routine. - bli_cntx_set_packm_kers + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs ( - 2, - BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, - // 12xk is not used and disabled for GCC 8-9 compatibility. - // BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_int_12xk, - BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -80,66 +88,18 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); -#if 0 - // Initialize sup thresholds with architecture-appropriate values. 
- // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 ); - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 4, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); -#endif - // Set A64FX cache sector sizes for each PE/CMG // SC Fugaku might disable users' setting cache sizes. 
#if !defined(CACHE_SECTOR_SIZE_READONLY) diff --git a/config/a64fx/bli_kernel_defs_a64fx.h b/config/a64fx/bli_kernel_defs_a64fx.h new file mode 100644 index 000000000..2c5c97204 --- /dev/null +++ b/config/a64fx/bli_kernel_defs_a64fx.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 32 +#define BLIS_MR_d 16 +#define BLIS_MR_c 16 +#define BLIS_MR_z 8 + +#define BLIS_NR_s 10 +#define BLIS_NR_d 10 +#define BLIS_NR_c 10 +#define BLIS_NR_z 10 + +//#endif + diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c index ad0e68219..6339ba381 100644 --- a/config/armsve/bli_cntx_init_armsve.c +++ b/config/armsve/bli_cntx_init_armsve.c @@ -45,9 +45,6 @@ void bli_cntx_init_armsve( cntx_t* cntx ) return; blksz_t blkszs[ BLIS_NUM_BLKSZS ]; -#if 0 - blksz_t thresh[ BLIS_NUM_THRESH ]; -#endif // Set default kernel blocksizes and functions. bli_cntx_init_armsve_ref( cntx ); @@ -64,35 +61,55 @@ void bli_cntx_init_armsve( cntx_t* cntx ) bli_c_blksz_armsve(&m_r_c, &n_r_c, &k_c_c, &m_c_c, &n_c_c); bli_z_blksz_armsve(&m_r_z, &n_r_z, &k_c_z, &m_c_z, &n_c_z); - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, + cntx, + + // level-3 // These are vector-length agnostic kernels. Yet knowing mr is required at runtime. - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, - cntx + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Set VL-specific packing routines if applicable. - if (m_r_d==16) - bli_cntx_set_packm_kers + if ( m_r_d == 16 ) + { + bli_cntx_set_ukrs ( - 2, - BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, - BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, - cntx + cntx, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, + BLIS_VA_END ); - else if (m_r_d==8) - bli_cntx_set_packm_kers + } + else if ( m_r_d == 8 ) + { + bli_cntx_set_ukrs ( - 1, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk, - cntx + cntx, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk, + BLIS_VA_END ); + } // Initialize level-3 blocksize objects with architecture-specific values. // s d c z @@ -106,64 +123,16 @@ void bli_cntx_init_armsve( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); - -#if 0 - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 ); - - // Initialize the context with the sup thresholds. 
- bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 4, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); -#endif } diff --git a/config/armsve/bli_kernel_defs_armsve.h b/config/armsve/bli_kernel_defs_armsve.h new file mode 100644 index 000000000..8c9c0b0dd --- /dev/null +++ b/config/armsve/bli_kernel_defs_armsve.h @@ -0,0 +1,58 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +// +// The armsve configuration handles both 256-bit and 512-bit SVE vectors, +// so it is not possible to define specific register block sizes. Thus, +// armsve can't use reference kernels! 
+// + +#define BLIS_MR_s -1 +#define BLIS_MR_d -1 +#define BLIS_MR_c -1 +#define BLIS_MR_z -1 + +#define BLIS_NR_s 10 +#define BLIS_NR_d 10 +#define BLIS_NR_c 10 +#define BLIS_NR_z 10 + +//#endif + diff --git a/config/bgq/bli_cntx_init_bgq.c b/config/bgq/bli_cntx_init_bgq.c index 782c441b9..d3871d8f7 100644 --- a/config/bgq/bli_cntx_init_bgq.c +++ b/config/bgq/bli_cntx_init_bgq.c @@ -43,14 +43,28 @@ void bli_cntx_init_bgq( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -65,13 +79,16 @@ void bli_cntx_init_bgq( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/bgq/bli_kernel_defs_bgq.h b/config/bgq/bli_kernel_defs_bgq.h new file mode 100644 index 000000000..bd3962e45 --- /dev/null +++ b/config/bgq/bli_kernel_defs_bgq.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_d 8 +#define BLIS_MR_z 4 + +#define BLIS_NR_d 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/bulldozer/bli_cntx_init_bulldozer.c b/config/bulldozer/bli_cntx_init_bulldozer.c index 9f6e83d6b..5b056f591 100644 --- a/config/bulldozer/bli_cntx_init_bulldozer.c +++ b/config/bulldozer/bli_cntx_init_bulldozer.c @@ -43,16 +43,32 @@ void bli_cntx_init_bulldozer( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_bulldozer( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/bulldozer/bli_kernel_defs_bulldozer.h b/config/bulldozer/bli_kernel_defs_bulldozer.h new file mode 100644 index 000000000..ea1e58e66 --- /dev/null +++ b/config/bulldozer/bli_kernel_defs_bulldozer.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 4 +#define BLIS_MR_c 8 +#define BLIS_MR_z 4 + +#define BLIS_NR_s 8 +#define BLIS_NR_d 6 +#define BLIS_NR_c 4 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/cortexa15/bli_cntx_init_cortexa15.c b/config/cortexa15/bli_cntx_init_cortexa15.c index 7c6134ff0..28ebdef71 100644 --- a/config/cortexa15/bli_cntx_init_cortexa15.c +++ b/config/cortexa15/bli_cntx_init_cortexa15.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa15( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -73,13 +87,16 @@ void bli_cntx_init_cortexa15( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/cortexa15/bli_kernel_defs_cortexa15.h b/config/cortexa15/bli_kernel_defs_cortexa15.h new file mode 100644 index 000000000..9c413f7f8 --- /dev/null +++ b/config/cortexa15/bli_kernel_defs_cortexa15.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 4 +#define BLIS_MR_d 4 + +#define BLIS_NR_s 4 +#define BLIS_NR_d 4 + +//#endif + diff --git a/config/cortexa53/bli_cntx_init_cortexa53.c b/config/cortexa53/bli_cntx_init_cortexa53.c index d7d786f8c..4957de04e 100644 --- a/config/cortexa53/bli_cntx_init_cortexa53.c +++ b/config/cortexa53/bli_cntx_init_cortexa53.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa53( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa53( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/cortexa53/bli_kernel_defs_cortexa53.h b/config/cortexa53/bli_kernel_defs_cortexa53.h new file mode 100644 index 000000000..60292099c --- /dev/null +++ b/config/cortexa53/bli_kernel_defs_cortexa53.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/cortexa57/bli_cntx_init_cortexa57.c b/config/cortexa57/bli_cntx_init_cortexa57.c index 57d18792d..28558bc52 100644 --- a/config/cortexa57/bli_cntx_init_cortexa57.c +++ b/config/cortexa57/bli_cntx_init_cortexa57.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa57( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa57( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/cortexa57/bli_kernel_defs_cortexa57.h b/config/cortexa57/bli_kernel_defs_cortexa57.h new file mode 100644 index 000000000..60292099c --- /dev/null +++ b/config/cortexa57/bli_kernel_defs_cortexa57.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/cortexa9/bli_cntx_init_cortexa9.c b/config/cortexa9/bli_cntx_init_cortexa9.c index d38e12ebb..6af3ff91c 100644 --- a/config/cortexa9/bli_cntx_init_cortexa9.c +++ b/config/cortexa9/bli_cntx_init_cortexa9.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa9( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa9( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/cortexa9/bli_kernel_defs_cortexa9.h b/config/cortexa9/bli_kernel_defs_cortexa9.h new file mode 100644 index 000000000..9c413f7f8 --- /dev/null +++ b/config/cortexa9/bli_kernel_defs_cortexa9.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 4 +#define BLIS_MR_d 4 + +#define BLIS_NR_s 4 +#define BLIS_NR_d 4 + +//#endif + diff --git a/config/excavator/bli_cntx_init_excavator.c b/config/excavator/bli_cntx_init_excavator.c index adae152d5..d36865b21 100644 --- a/config/excavator/bli_cntx_init_excavator.c +++ b/config/excavator/bli_cntx_init_excavator.c @@ -43,16 +43,32 @@ void bli_cntx_init_excavator( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_excavator( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/excavator/bli_kernel_defs_excavator.h b/config/excavator/bli_kernel_defs_excavator.h new file mode 100644 index 000000000..df4a8c411 --- /dev/null +++ b/config/excavator/bli_kernel_defs_excavator.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 +#define BLIS_MR_c 4 +#define BLIS_MR_z 2 + +#define BLIS_NR_s 3 +#define BLIS_NR_d 3 +#define BLIS_NR_c 2 +#define BLIS_NR_z 2 + +//#endif + diff --git a/config/firestorm/bli_cntx_init_firestorm.c b/config/firestorm/bli_cntx_init_firestorm.c index a15ce0344..8e4d0088d 100644 --- a/config/firestorm/bli_cntx_init_firestorm.c +++ b/config/firestorm/bli_cntx_init_firestorm.c @@ -37,32 +37,60 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_firestorm_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + // packm + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, + + BLIS_VA_END ); - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs ( - 4, - BLIS_PACKM_8XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, - BLIS_PACKM_12XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -73,72 +101,47 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 3072, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 8192, -1, -1 ); + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], -1, 99, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], -1, 99, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], -1, 99, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR_SUP ], -1, 6, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], -1, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], -1, 240, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], -1, 1024, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], -1, 3072, -1, -1 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); - // ------------------------------------------------------------------------- + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 99, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 99, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 99, -1, -1 ); + // level-3 sup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 8, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. 
- // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 6, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 240, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1024, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 3072, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + BLIS_VA_END ); } diff --git a/config/firestorm/bli_kernel_defs_firestorm.h b/config/firestorm/bli_kernel_defs_firestorm.h new file mode 100644 index 000000000..60292099c --- /dev/null +++ b/config/firestorm/bli_kernel_defs_firestorm.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/generic/bli_kernel_defs_generic.h b/config/generic/bli_kernel_defs_generic.h new file mode 100644 index 000000000..db2f32947 --- /dev/null +++ b/config/generic/bli_kernel_defs_generic.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +//#endif + diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c index f2dc900ea..fe3b45147 100644 --- a/config/haswell/bli_cntx_init_haswell.c +++ b/config/haswell/bli_cntx_init_haswell.c @@ -35,79 +35,58 @@ #include "blis.h" -//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) - void bli_cntx_init_haswell( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_haswell_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 8, + cntx, + // gemm #if 1 - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, #else - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, FALSE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, #endif // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, #if 1 - // Update the context with optimized packm kernels. 
- bli_cntx_set_packm_kers - ( - 8, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); + // packm + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, #endif - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); - - // Update the context with optimized level-1v kernels. 
- bli_cntx_set_l1v_kers - ( - 10, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -137,7 +116,74 @@ void bli_cntx_init_haswell( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - cntx + + // gemmsup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm +#if 1 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, +#else + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, +#endif + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -161,97 +207,54 @@ void bli_cntx_init_haswell( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 ); + // ------------------------------------------------------------------------- + + // Initialize sup thresholds with architecture-appropriate values. 
+ // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 201, 201, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 201, 201, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 201, 201, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, + 9, 9, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 168, 72, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 4080, 4080, -1, -1 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); - // ------------------------------------------------------------------------- + // gemmsup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 201, 201, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 201, 201, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 201, 201, -1, -1 ); + // level-3 sup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - // Initialize the context with the sup thresholds. 
- bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - -#if 0 - // Initialize the context with the sup handlers. - bli_cntx_set_l3_sup_handlers - ( - 1, - BLIS_GEMM, bli_gemmsup_ref, - cntx - ); -#endif - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. 
- // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, - 9, 9, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + BLIS_VA_END ); } diff --git a/config/haswell/bli_kernel_defs_haswell.h b/config/haswell/bli_kernel_defs_haswell.h new file mode 100644 index 000000000..c5bc8d63f --- /dev/null +++ b/config/haswell/bli_kernel_defs_haswell.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/knc/bli_cntx_init_knc.c b/config/knc/bli_cntx_init_knc.c index 198f08827..8f615588c 100644 --- a/config/knc/bli_cntx_init_knc.c +++ b/config/knc/bli_cntx_init_knc.c @@ -43,13 +43,26 @@ void bli_cntx_init_knc( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 1, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8, TRUE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -58,7 +71,7 @@ void bli_cntx_init_knc( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 8, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 120, 0, 0, 0, 160, 0, 0 ); - bli_blksz_init ( &blkszs[ BLIS_KC ], 0, 240, 0, 0, + bli_blksz_init ( &blkszs[ BLIS_KC ], 0, 240, 0, 0, 0, 300, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 14400, 0, 0 ); @@ -66,13 +79,16 @@ void bli_cntx_init_knc( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/knc/bli_kernel_defs_knc.h b/config/knc/bli_kernel_defs_knc.h new file mode 100644 index 000000000..0ae6d1b75 --- /dev/null +++ b/config/knc/bli_kernel_defs_knc.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_d 30 + +#define BLIS_NR_d 8 + +#define BLIS_PACKMR_d 32 + +//#endif + diff --git a/config/knl/bli_cntx_init_knl.c b/config/knl/bli_cntx_init_knl.c index 6da3b7a3a..87fa3176a 100644 --- a/config/knl/bli_cntx_init_knl.c +++ b/config/knl/bli_cntx_init_knl.c @@ -43,47 +43,33 @@ void bli_cntx_init_knl( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, FALSE, - cntx - ); + cntx, - // Update the context with optimized packm kernels. 
- bli_cntx_set_packm_kers - ( - 2, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk, - BLIS_PACKM_24XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk, - cntx - ); + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, + + // packm + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk, - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, - // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + + // dotxf + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 10, #if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, #endif + // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, @@ -92,12 +78,15 @@ void bli_cntx_init_knl( cntx_t* cntx ) BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, #endif + // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, + // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + // scalv #if 0 BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, @@ -106,7 +95,20 @@ void bli_cntx_init_knl( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - cntx + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -125,17 +127,20 @@ void bli_cntx_init_knl( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx + + BLIS_VA_END ); } diff --git a/config/knl/bli_kernel_defs_knl.h b/config/knl/bli_kernel_defs_knl.h new file mode 100644 index 000000000..ce514bb21 --- /dev/null +++ b/config/knl/bli_kernel_defs_knl.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 24 +#define BLIS_MR_d 24 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/old/armv7a/bli_cntx_init_armv7a.c b/config/old/armv7a/bli_cntx_init_armv7a.c index d4cc9e91d..acd8e6c18 100644 --- a/config/old/armv7a/bli_cntx_init_armv7a.c +++ b/config/old/armv7a/bli_cntx_init_armv7a.c @@ -66,7 +66,7 @@ void bli_cntx_init_armv7a( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, diff --git a/config/old/haswellbb/bli_cntx_init_haswell.c b/config/old/haswellbb/bli_cntx_init_haswell.c index 9e1d03503..88bd14a07 100644 --- a/config/old/haswellbb/bli_cntx_init_haswell.c +++ b/config/old/haswellbb/bli_cntx_init_haswell.c @@ -203,7 +203,7 @@ void bli_cntx_init_haswell( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 7, + 7, // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, diff --git a/config/penryn/bli_cntx_init_penryn.c b/config/penryn/bli_cntx_init_penryn.c index 1576bf944..964438e83 100644 --- a/config/penryn/bli_cntx_init_penryn.c +++ b/config/penryn/bli_cntx_init_penryn.c @@ -43,18 +43,36 @@ void bli_cntx_init_penryn( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4, FALSE, - //BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4, FALSE, - //BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4, FALSE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4, FALSE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4, FALSE, - cntx + cntx, + + //level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4, + //BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4, + //BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + //level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + //BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + //BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -69,13 +87,16 @@ void bli_cntx_init_penryn( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/penryn/bli_kernel_defs_penryn.h b/config/penryn/bli_kernel_defs_penryn.h new file mode 100644 index 000000000..f1e483646 --- /dev/null +++ b/config/penryn/bli_kernel_defs_penryn.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 4 + +#define BLIS_NR_s 4 +#define BLIS_NR_d 4 + +//#endif + diff --git a/config/piledriver/bli_cntx_init_piledriver.c b/config/piledriver/bli_cntx_init_piledriver.c index 4ed15e322..1c9a96fd9 100644 --- a/config/piledriver/bli_cntx_init_piledriver.c +++ b/config/piledriver/bli_cntx_init_piledriver.c @@ -43,16 +43,32 @@ void bli_cntx_init_piledriver( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_piledriver( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/piledriver/bli_kernel_defs_piledriver.h b/config/piledriver/bli_kernel_defs_piledriver.h new file mode 100644 index 000000000..df4a8c411 --- /dev/null +++ b/config/piledriver/bli_kernel_defs_piledriver.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 +#define BLIS_MR_c 4 +#define BLIS_MR_z 2 + +#define BLIS_NR_s 3 +#define BLIS_NR_d 3 +#define BLIS_NR_c 2 +#define BLIS_NR_z 2 + +//#endif + diff --git a/config/power10/bli_cntx_init_power10.c b/config/power10/bli_cntx_init_power10.c index 14c940f99..12d9f51c6 100644 --- a/config/power10/bli_cntx_init_power10.c +++ b/config/power10/bli_cntx_init_power10.c @@ -34,35 +34,6 @@ #include "blis.h" -// Instantiate prototypes for packm kernels. -PACKM_KER_PROT( float, s, packm_6xk_bb4_power10_ref ) -PACKM_KER_PROT( double, d, packm_6xk_bb2_power10_ref ) - -// Instantiate prototypes for level-3 kernels. -GEMM_UKR_PROT( float, s, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( float, s, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( float, s, trsmbb_u_power10_ref ) - -GEMM_UKR_PROT( double, d, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( double, d, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( double, d, trsmbb_u_power10_ref ) - -GEMM_UKR_PROT( scomplex, c, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_u_power10_ref ) - -GEMM_UKR_PROT( dcomplex, z, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power10_ref ) - void bli_cntx_init_power10( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; @@ -72,51 +43,38 @@ void 
bli_cntx_init_power10( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 12, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, TRUE, + cntx, - BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power10_ref, FALSE, - - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, TRUE, - - BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power10_ref, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power10_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power10_ref, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power10_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power10_ref, FALSE, - cntx + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, + + BLIS_VA_END ); - // Update the context with customized virtual [gemm]trsm micro-kernels. - bli_cntx_set_l3_vir_ukrs + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs ( - 8, - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power10_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power10_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power10_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power10_ref, - cntx - ); + cntx, - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers - ( - 2, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power10_ref, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power10_ref, - cntx + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // s d c z @@ -131,14 +89,16 @@ void bli_cntx_init_power10( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/power10/bli_kernel_defs_power10.h b/config/power10/bli_kernel_defs_power10.h new file mode 100644 index 000000000..4e32f1173 --- /dev/null +++ b/config/power10/bli_kernel_defs_power10.h @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 8 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 + +#define BLIS_BBN_s 4 +#define BLIS_BBN_d 2 + +//#endif + diff --git a/config/power7/bli_cntx_init_power7.c b/config/power7/bli_cntx_init_power7.c index c9caf62a6..d5ffe7dcf 100644 --- a/config/power7/bli_cntx_init_power7.c +++ b/config/power7/bli_cntx_init_power7.c @@ -43,13 +43,26 @@ void bli_cntx_init_power7( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 1, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -64,13 +77,16 @@ void bli_cntx_init_power7( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/power7/bli_kernel_defs_power7.h b/config/power7/bli_kernel_defs_power7.h new file mode 100644 index 000000000..ceec01df3 --- /dev/null +++ b/config/power7/bli_kernel_defs_power7.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_d 8 + +#define BLIS_NR_d 4 + +//#endif + diff --git a/config/power9/bli_cntx_init_power9.c b/config/power9/bli_cntx_init_power9.c index 4370ce26c..9f2d67632 100644 --- a/config/power9/bli_cntx_init_power9.c +++ b/config/power9/bli_cntx_init_power9.c @@ -34,35 +34,6 @@ #include "blis.h" -// Instantiate prototypes for packm kernels. -PACKM_KER_PROT( float, s, packm_6xk_bb4_power9_ref ) -PACKM_KER_PROT( double, d, packm_6xk_bb2_power9_ref ) - -// Instantiate prototypes for level-3 kernels. 
-GEMM_UKR_PROT( float, s, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( float, s, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( float, s, trsmbb_u_power9_ref ) - -GEMM_UKR_PROT( double, d, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( double, d, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( double, d, trsmbb_u_power9_ref ) - -GEMM_UKR_PROT( scomplex, c, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_u_power9_ref ) - -GEMM_UKR_PROT( dcomplex, z, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power9_ref ) - void bli_cntx_init_power9( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; @@ -72,50 +43,37 @@ void bli_cntx_init_power9( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 12, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemmbb_power9_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power9_ref, FALSE, + cntx, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE, - - BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power9_ref, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power9_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power9_ref, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power9_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power9_ref, FALSE, - cntx + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, + + BLIS_VA_END ); - // Update the context with customized virtual [gemm]trsm micro-kernels. - bli_cntx_set_l3_vir_ukrs + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs ( - 8, - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power9_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power9_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power9_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power9_ref, - cntx - ); + cntx, - // Update the context with optimized packm kernels. 
- bli_cntx_set_packm_kers - ( - 2, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power9_ref, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref, - cntx + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); @@ -131,14 +89,15 @@ void bli_cntx_init_power9( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); + BLIS_VA_END + ); } diff --git a/config/power9/bli_kernel_defs_power9.h b/config/power9/bli_kernel_defs_power9.h new file mode 100644 index 000000000..debfeac5f --- /dev/null +++ b/config/power9/bli_kernel_defs_power9.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_d 12 + +#define BLIS_NR_d 6 + +#define BLIS_BBN_s 4 +#define BLIS_BBN_d 2 + +//#endif + diff --git a/config/sandybridge/bli_cntx_init_sandybridge.c b/config/sandybridge/bli_cntx_init_sandybridge.c index 1ffa5bf8b..0697a3351 100644 --- a/config/sandybridge/bli_cntx_init_sandybridge.c +++ b/config/sandybridge/bli_cntx_init_sandybridge.c @@ -43,16 +43,32 @@ void bli_cntx_init_sandybridge( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. 
- bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sandybridge_asm_8x4, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sandybridge_asm_8x4, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_sandybridge( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/sandybridge/bli_kernel_defs_sandybridge.h b/config/sandybridge/bli_kernel_defs_sandybridge.h new file mode 100644 index 000000000..dc1b843f6 --- /dev/null +++ b/config/sandybridge/bli_kernel_defs_sandybridge.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 8 +#define BLIS_MR_c 8 +#define BLIS_MR_z 4 + +#define BLIS_NR_s 8 +#define BLIS_NR_d 4 +#define BLIS_NR_c 4 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/skx/bli_cntx_init_skx.c b/config/skx/bli_cntx_init_skx.c index f18503a7a..3af58b38d 100644 --- a/config/skx/bli_cntx_init_skx.c +++ b/config/skx/bli_cntx_init_skx.c @@ -43,39 +43,29 @@ void bli_cntx_init_skx( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels and level-1f/1v kernels. + bli_cntx_set_ukrs ( - 2, - // gemm - BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, FALSE, - cntx - ); + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); - // Update the context with optimized level-1v kernels.
- bli_cntx_set_l1v_kers - ( - 10, #if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, #endif + // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, @@ -84,12 +74,15 @@ void bli_cntx_init_skx( cntx_t* cntx ) BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, #endif + // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, + // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + // scalv #if 0 BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, @@ -98,7 +91,20 @@ void bli_cntx_init_skx( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - cntx + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT , FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -116,17 +122,20 @@ void bli_cntx_init_skx( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx + + BLIS_VA_END ); } diff --git a/config/skx/bli_kernel_defs_skx.h b/config/skx/bli_kernel_defs_skx.h new file mode 100644 index 000000000..2aaf477ad --- /dev/null +++ b/config/skx/bli_kernel_defs_skx.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 32 +#define BLIS_MR_d 16 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 14 + +//#endif + diff --git a/config/steamroller/bli_cntx_init_steamroller.c b/config/steamroller/bli_cntx_init_steamroller.c index 13e7f6495..4b4ecdf4e 100644 --- a/config/steamroller/bli_cntx_init_steamroller.c +++ b/config/steamroller/bli_cntx_init_steamroller.c @@ -43,16 +43,32 @@ void bli_cntx_init_steamroller( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_steamroller( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/steamroller/bli_kernel_defs_steamroller.h b/config/steamroller/bli_kernel_defs_steamroller.h new file mode 100644 index 000000000..df4a8c411 --- /dev/null +++ b/config/steamroller/bli_kernel_defs_steamroller.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 +#define BLIS_MR_c 4 +#define BLIS_MR_z 2 + +#define BLIS_NR_s 3 +#define BLIS_NR_d 3 +#define BLIS_NR_c 2 +#define BLIS_NR_z 2 + +//#endif + diff --git a/config/template/bli_cntx_init_template.c b/config/template/bli_cntx_init_template.c index f2b1c8d17..4bacc5d63 100644 --- a/config/template/bli_cntx_init_template.c +++ b/config/template/bli_cntx_init_template.c @@ -45,34 +45,44 @@ void bli_cntx_init_template( cntx_t* cntx ) // Update the context with optimized native gemm micro-kernels and // their storage preferences. - bli_cntx_set_l3_nat_ukrs + bli_cntx_set_ukrs ( - 5, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt, FALSE, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, FALSE, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, FALSE, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt, FALSE, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt, FALSE, - cntx - ); + cntx, - // Update the context with optimized level-1f kernels. 
- bli_cntx_set_l1f_kers - ( + // level-3 + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, + BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt, + BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt, + + // level-1f BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_template_noopt, BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_template_noopt, BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_template_noopt, BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_template_noopt, BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_template_noopt, - cntx - ); - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( + // level-1v BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_template_noopt, BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_template_noopt, - cntx + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -87,13 +97,16 @@ void bli_cntx_init_template( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/template/bli_kernel_defs_template.h b/config/template/bli_kernel_defs_template.h new file mode 100644 index 000000000..86a33d8d8 --- /dev/null +++ b/config/template/bli_kernel_defs_template.h @@ -0,0 +1,60 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +// +// Only defined for block sizes which are not taken as the default (i.e. when +// an optimized kernel is provided). +// + +#define BLIS_MR_z 4 + +#define BLIS_NR_z 4 + +// +// PACKMR/PACKNR do not need to be defined unless they are different from the +// "normal" MR/NR. +// + +//#define BLIS_PACKMR_z 4 + +//#define BLIS_PACKNR_z 4 + +//#endif + diff --git a/config/thunderx2/bli_cntx_init_thunderx2.c b/config/thunderx2/bli_cntx_init_thunderx2.c index f2b7b633d..9d1af2c99 100644 --- a/config/thunderx2/bli_cntx_init_thunderx2.c +++ b/config/thunderx2/bli_cntx_init_thunderx2.c @@ -43,14 +43,28 @@ void bli_cntx_init_thunderx2( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -65,13 +79,16 @@ void bli_cntx_init_thunderx2( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/thunderx2/bli_kernel_defs_thunderx2.h b/config/thunderx2/bli_kernel_defs_thunderx2.h new file mode 100644 index 000000000..60292099c --- /dev/null +++ b/config/thunderx2/bli_kernel_defs_thunderx2.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 1b16cd06f..a10986b23 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -40,92 +40,107 @@ void bli_cntx_init_zen( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_zen_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized kernels (gemm, gemmtrsm, gemmsup, packm, level-1f, ...).
+ bli_cntx_set_ukrs ( - 8, + cntx, // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, -#if 1 - // Update the context with optimized packm kernels. 
- bli_cntx_set_packm_kers - ( - 8, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); + // gemmsup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, +#if 0 + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + 
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, #endif - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, +#if 0 + // NOTE: This set of kernels is likely broken and therefore disabled. + BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, +#endif + + // packm + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, + BLIS_AXPYF_KER, 
BLIS_DOUBLE, bli_daxpyf_zen_int_8, // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - - cntx - ); - - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 16, + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, // axpyv -#if 0 - BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, - BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int, -#else BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, -#endif -#if 1 // copyv BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, -#endif // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, @@ -136,25 +151,76 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, // scalv -#if 0 - BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, - BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int, -#else BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, -#endif -#if 1 // setv - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, // swapv BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + +#if 0 + // NOTE: This set of kernels is likely broken and therefore disabled. 
+ BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, #endif - - cntx + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -195,131 +261,74 @@ void bli_cntx_init_zen( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 440, 220, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. 
+ // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, + 9, 9, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, -1, -1 ); +#if 0 + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, 2040, 1020 ); +#endif + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx + + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + + // gemmsup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, + + BLIS_VA_END ); // ------------------------------------------------------------------------- - // Initialize sup thresholds with architecture-appropriate values. 
- // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 ); - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - +#if 0 // Initialize the context with the sup handlers. bli_cntx_set_l3_sup_handlers ( - 1, + cntx, + BLIS_GEMM, bli_gemmsup_ref, //BLIS_GEMMT, bli_gemmtsup_ref, - cntx + + BLIS_VA_END ); - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, -#if 0 - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, 
BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, #endif - -#if 0 - // NOTE: This set of kernels is likely broken and therefore disabled. - BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, -#endif - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. 
- // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, - 9, 9, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 ); -#if 0 - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, - 9, 9, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); -#endif - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); } - diff --git a/config/zen/bli_kernel_defs_zen.h b/config/zen/bli_kernel_defs_zen.h new file mode 100644 index 000000000..c5bc8d63f --- /dev/null +++ b/config/zen/bli_kernel_defs_zen.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index ba728602b..c7e40b4d0 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -38,73 +38,94 @@ void bli_cntx_init_zen2( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_zen2_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 8, + cntx, // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, - cntx - ); + // level-3 sup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, -#if 1 - // Update the context with optimized packm kernels. 
- bli_cntx_set_packm_kers - ( - 8, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, +#if 0 + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, #endif - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, +#if 0 + // NOTE: This set of kernels is likely broken and therefore disabled. 
+ BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, +#endif + + // packm + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - - cntx - ); - - // Update the context with optimized level-1v kernels. 
- bli_cntx_set_l1v_kers - ( - 16, + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -127,18 +148,59 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, //swap - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, //copy BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, //set - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - cntx + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // level-3 sup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, 
BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -158,130 +220,73 @@ void bli_cntx_init_zen2( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + // Initialize sup thresholds with architecture-appropriate values. + // s d c z +#if 1 + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 500, 249, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 500, 249, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 500, 249, -1, -1 ); +#else + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 100000, 100000, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 100000, 100000, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 100000, 100000, -1, -1 ); +#endif + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, + 9, 9, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 168, 72, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 4080, 4080, -1, -1 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx + + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + + // level-3 sup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NC_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KC_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MC_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, + + BLIS_VA_END ); // ------------------------------------------------------------------------- - // Initialize sup thresholds with architecture-appropriate values. - // s d c z -#if 1 - bli_blksz_init_easy( &thresh[ BLIS_MT ], 500, 249, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 500, 249, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 500, 249, -1, -1 ); -#else - bli_blksz_init_easy( &thresh[ BLIS_MT ], 100000, 100000, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 100000, 100000, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 100000, 100000, -1, -1 ); -#endif - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - #if 0 // Initialize the context with the sup handlers. bli_cntx_set_l3_sup_handlers ( - 1, + cntx, + BLIS_GEMM, bli_gemmsup_ref, - cntx + //BLIS_GEMMT, bli_gemmtsup_ref, + + BLIS_VA_END ); #endif - - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, -#if 0 - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, -#endif - -#if 0 - // NOTE: This set of kernels is likely broken and therefore disabled. 
- BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, -#endif - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, - 9, 9, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); } diff --git a/config/zen2/bli_kernel_defs_zen2.h b/config/zen2/bli_kernel_defs_zen2.h new file mode 100644 index 000000000..c5bc8d63f --- /dev/null +++ b/config/zen2/bli_kernel_defs_zen2.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c index 0336ddc95..3ee385ed6 100644 --- a/config/zen3/bli_cntx_init_zen3.c +++ b/config/zen3/bli_cntx_init_zen3.c @@ -37,83 +37,106 @@ void bli_cntx_init_zen3( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_zen3_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 8, + cntx, // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, + // gemmsup #if 0 - // AMD: This will be enabled in other PRs. - // packm kernels - bli_cntx_set_packm_kers - ( - 2, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen, - cntx - ); + // AMD: This should be enabled in the PR which has added these kernels + // Update the context with optimized small/unpacked gemm kernels. 
+ BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, #else - // Update the context with optimized packm kernels. 
- bli_cntx_set_packm_kers - ( - 8, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, #endif - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, + // packm +#if 0 + // AMD: This will be enabled in other PRs. 
+ BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen, +#else + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, +#endif // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - - cntx - ); - - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 16, + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -135,19 +158,75 @@ void bli_cntx_init_zen3( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, - //swap - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + // swapv + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, - //copy + // copyv BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, - //set + // setv BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - cntx + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, +#if 0 + // AMD: This should be enabled in the PR which has added these kernels + // Update the context with optimized small/unpacked gemm kernels. 
+ BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, +#endif + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -164,138 +243,67 @@ void bli_cntx_init_zen3( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 200, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 240, 220, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, 2040, 1020 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx + + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + + // gemmsup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, + + BLIS_VA_END ); -// ------------------------------------------------------------------------- - - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, -1, -1 ); - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); + // ------------------------------------------------------------------------- #if 0 // Initialize the context with the sup handlers. bli_cntx_set_l3_sup_handlers ( - 2, + cntx, + BLIS_GEMM, bli_gemmsup_ref, - BLIS_GEMMT, bli_gemmtsup_ref, - cntx + //BLIS_GEMMT, bli_gemmtsup_ref, + + BLIS_VA_END ); #endif - -#if 0 - // AMD: This should be enabled in the PR which has added these kernels - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 28, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - cntx - ); -#else - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - cntx - ); - -#endif - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, - 9, 9, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. 
- bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); } diff --git a/config/zen3/bli_kernel_defs_zen3.h b/config/zen3/bli_kernel_defs_zen3.h new file mode 100644 index 000000000..c5bc8d63f --- /dev/null +++ b/config/zen3/bli_kernel_defs_zen3.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/docs/ConfigurationHowTo.md b/docs/ConfigurationHowTo.md index dcec7754c..cc1224182 100644 --- a/docs/ConfigurationHowTo.md +++ b/docs/ConfigurationHowTo.md @@ -47,7 +47,7 @@ $ ls config/haswell bli_cntx_init_haswell.c bli_family_haswell.h make_defs.mk ``` A sub-configuration (`haswell`, in this case) usually contains just three files: - * `bli_cntx_init_haswell.c`. This file contains the initialization function for a context targeting the hardware in question, in this case, Intel Haswell. A context, or `cntx_t` object, in BLIS encapsulates all of the hardware-specific information--including kernel function pointers and cache and register blocksizes--necessary to support all of the main computational operations in BLIS. The initialization function inside this file should be named the same as the filename (excluding `.c` suffix), which should begin with prefix `bli_cntx_init_` and end with the (lowercase) name of the sub-configuration. 
The context initialization function (in this case, `bli_cntx_init_haswell()`) is used internally by BLIS when setting up the global kernel structure--a mechanism for managing and supporting multiple microarchitectures simultaneously, so that the choice of which context to use can be deferred until the computation is ready to execute. + * `bli_cntx_init_haswell.c`. This file contains the initialization function for a context targeting the hardware in question, in this case, Intel Haswell. A context, or `cntx_t` object, in BLIS encapsulates all of the hardware-specific information--including kernel function pointers and cache and register blocksizes--necessary to support all of the main computational operations in BLIS. The initialization function inside this file should be named the same as the filename (excluding `.c` suffix), which should begin with prefix `bli_cntx_init_` and end with the (lowercase) name of the sub-configuration. The context initialization function (in this case, `bli_cntx_init_haswell()`) is used internally by BLIS when setting up the global kernel structure--a mechanism for managing and supporting multiple microarchitectures simultaneously, so that the choice of which context to use can be deferred until the computation is ready to execute. * `bli_family_haswell.h`. This header file is `#included` when the configuration in question, in this case `haswell`, was the target to `./configure`. This is where you would specify certain global parameters and settings. For example, if you wanted to specify custom implementations of `malloc()` and `free()`, this is where you would specify them. The file is oftentimes empty. (In the case of configuration families, the definitions in this file apply to the _entire_ build, and not any specific sub-configuration, but for consistency we support them for all configuration targets, whether they be singleton sub-configurations or configuration families.) * `make_defs.mk`. 
This makefile fragment defines the compiler and compiler flags to use during compilation. Specifically, the values defined in this file are used whenever compiling source code specific to the sub-configuration (i.e., reference kernels and optimized kernels). If the sub-configuration is the target of `configure`, then these flags are also used to compile general framework code. @@ -127,7 +127,7 @@ void bli_cntx_init_fooarch( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, @@ -143,7 +143,7 @@ _**Blocksize object array.**_ The `blkszs` array declaration is needed later in _**Reference initialization.**_ The first function call, `bli_cntx_init_fooarch_ref()`, initializes the context `cntx` with function pointers to reference implementations of all of the kernels supported by BLIS (as well as cache and register blocksizes, and other fields). This function is automatically generated by BLIS for every sub-configuration enabled at configure-time. The function prototype is generated by a preprocessor macro in `frame/include/bli_arch_config.h`. -_**Level-3 microkernels.**_ The second function call is to a variable argument function, `bli_cntx_set_l3_nat_ukrs()`, which updates `cntx` with five optimized double-precision complex level-3 microkernels. The first argument encodes the number of individual kernels being registered into the context. Every subsequent line, except for the last line, is associated with the registration of a single kernel, and each of these lines is independent of one another and can occur in any order, provided that the kernel parameters of each line occur in the same order--kernel ID, followed by datatype, followed by function name, followed by storage preference boolean (i.e., whether the microkernel prefers row storage). 
The last argument of the function call is the address of the context being updated, `cntx`. Notice that we are registering microkernels written for another type of hardware, `bararch`, because in our hypothetical universe `bararch` is very similar to `fooarch` and so we recycle the code between the two configurations. After the function returns, the context contains pointers to optimized double-precision level-3 real microkernels. Note that the context will still contain reference microkernels for single-precision real and complex, and double-precision complex computation, as those kernels were not updated. +_**Level-3 microkernels.**_ The second function call is to a variable argument function, `bli_cntx_set_l3_nat_ukrs()`, which updates `cntx` with five optimized double-precision real level-3 microkernels. The first argument encodes the number of individual kernels being registered into the context. Every subsequent line, except for the last line, is associated with the registration of a single kernel, and each of these lines is independent of one another and can occur in any order, provided that the kernel parameters of each line occur in the same order--kernel ID, followed by datatype, followed by function name, followed by storage preference boolean (i.e., whether the microkernel prefers row storage). The last argument of the function call is the address of the context being updated, `cntx`. Notice that we are registering microkernels written for another type of hardware, `bararch`, because in our hypothetical universe `bararch` is very similar to `fooarch` and so we recycle the code between the two configurations. After the function returns, the context contains pointers to optimized double-precision level-3 real microkernels. Note that the context will still contain reference microkernels for single-precision real and complex, and double-precision complex computation, as those kernels were not updated.
_Note:_ Currently, BLIS only allows the kernel developer to signal a preference (row or column) for `gemm` microkernels. The preference of the `gemmtrsm` and `trsm` microkernels can (and must) be set, but are ignored by the framework during execution. @@ -236,7 +236,7 @@ _**Memory alignment.**_ BLIS implements memory alignment internally, rather than ``` The value `BLIS_STACK_BUF_ALIGN_SIZE` defines the alignment of stack memory used as temporary internal buffers, such as for output matrices to the microkernel when computing edge cases. (See [implementation notes](KernelsHowTo#implementation-notes-for-gemm) for the `gemm` microkernel for details.) This value defaults to `BLIS_SIMD_ALIGN_SIZE`, which defaults to `BLIS_SIMD_MAX_SIZE`. -The value `BLIS_HEAP_ADDR_ALIGN_SIZE` defines the alignment used when allocating memory via the `malloc()` function defined by `BLIS_MALLOC_USER`. Setting this value to `BLIS_SIMD_ALIGN_SIZE` may speed up certain level-1v and -1f kernels. +The value `BLIS_HEAP_ADDR_ALIGN_SIZE` defines the alignment used when allocating memory via the `malloc()` function defined by `BLIS_MALLOC_USER`. Setting this value to `BLIS_SIMD_ALIGN_SIZE` may speed up certain level-1v and -1f kernels. The value `BLIS_HEAP_STRIDE_ALIGN_SIZE` defines the alignment used for so-called "leading dimensions" (i.e. column strides for column-stored matrices, and row strides for row-stored matrices) when creating BLIS matrices via the object-based API (e.g. `bli_obj_create()`). While setting `BLIS_HEAP_ADDR_ALIGN_SIZE` guarantees alignment for the first column (or row), creating a matrix with certain dimension values (_m_ and _n_) may cause subsequent columns (or rows) to be misaligned. Setting this value to `BLIS_SIMD_ALIGN_SIZE` is usually desirable. Additional alignment may or may not be beneficial. 
@@ -246,7 +246,7 @@ The value `BLIS_POOL_ADDR_ALIGN_SIZE_*` define the alignments used when allocati ### make_defs.mk -The `make_defs.mk` file primarily contains compiler and compiler flag definitions used by `make` when building a BLIS library. +The `make_defs.mk` file primarily contains compiler and compiler flag definitions used by `make` when building a BLIS library. The format of the file is mostly self-explanatory. However, we will expound on the contents here, using the `make_defs.mk` file for the `haswell` configuration as an example: ```make @@ -304,7 +304,7 @@ _**Debugging flags.**_ The `CDBGFLAGS` variable should be assigned to contain fl _**Optimization flags.**_ The `COPTFLAGS` variable should be assigned any flags relating to general compiler optimization. Usually this takes the form of `-O2` or `-O3`, but more specific optimization flags may be included as well, such as `-fomit-frame-pointer`. Note that, as with `CDBGFLAGS`, `COPTFLAGS` is conditionally assigned based on the value of `$(DEBUG_TYPE)`. A separate `CKOPTFLAGS` variable tracks optimizations flags used when compiling kernels. For most configurations, `CKOPTFLAGS` is assigned as a copy of `COPTFLAGS`, but if the kernel developer needs different optimization flags to be applied when compiling kernel source code, `CKOPTFLAGS` should be set accordingly. -_**Vectorization flags.**_ The second-to-last block sets the `CVECFLAGS`, which should be assigned any flags that must be given to the compiler in order to enable use of a vector instruction set needed or assumed by the kernel source code. Also, if you wish to enable automatic use of certain instruction sets (e.g. `-mfpmath=sse` for many Intel architectures), this is where you should set those flags. These flags often differ among compiler families, especially between `icc` and `gcc`/`clang`. 
+_**Vectorization flags.**_ The second-to-last block sets the `CVECFLAGS`, which should be assigned any flags that must be given to the compiler in order to enable use of a vector instruction set needed or assumed by the kernel source code. Also, if you wish to enable automatic use of certain instruction sets (e.g. `-mfpmath=sse` for many Intel architectures), this is where you should set those flags. These flags often differ among compiler families, especially between `icc` and `gcc`/`clang`. _**Variable storage/renaming.**_ Finally, the last statement commits the variables defined in the file to "storage". That is, they are copied to variable names that contain `THIS_CONFIG` as a suffix. This allows the variables for one configuration to co-exist with variables of another configuration. @@ -406,7 +406,7 @@ Some sub-configurations, for various reasons, do not rely on their own set of ke excavator: excavator/piledriver steamroller: steamroller/piledriver ``` -Here, the first line (reading from left-to-right) defines the `excavator` singleton family as containing only itself, the `excavator` sub-configuration, and also specifies that this sub-configuration must have access to the `piledriver` kernel set. The second line defines the `steamroller` singleton family in a similar manner. +Here, the first line (reading from left-to-right) defines the `excavator` singleton family as containing only itself, the `excavator` sub-configuration, and also specifies that this sub-configuration must have access to the `piledriver` kernel set. The second line defines the `steamroller` singleton family in a similar manner. **Note:** Specifying non-native kernel sets via the `/` character is only allowed when defining singleton configuration families. They may NOT appear in the definitions of umbrella families! When an umbrella family includes a singleton family that is defined to require non-native kernels, this will be accounted for during the parsing of the `config_registry` file. 
@@ -467,7 +467,7 @@ configure: skx: skx configure: steamroller: steamroller configure: x86_64: haswell sandybridge penryn zen excavator steamroller piledriver bulldozer generic ``` -This simply lists the sub-configurations associated with each defined configuration family (singleton or umbrella). Note that they are sorted alphabetically. +This simply lists the sub-configurations associated with each defined configuration family (singleton or umbrella). Note that they are sorted alphabetically. Next, the kernel list (actually, all kernel lists) is printed: ``` @@ -549,7 +549,7 @@ Adding support for a new set of kernels in BLIS is easy and can be done via the 2. _**Add support within the framework source code.**_ We also need to make a minor update to the framework to support the new kernels--specifically, to pull in the kernels' function prototypes. - **`frame/include/bli_arch_config.h`**. When adding support for the `knl` kernel set to the framework, we must modify this file to `#include` the `bli_kernels_knl.h` header file: + **`frame/include/bli_arch_config.h`**. When adding support for the `knl` kernel set to the framework, we must modify this file to `#include` the `bli_kernels_knl.h` header file: ```c #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" @@ -560,7 +560,7 @@ Adding support for a new set of kernels in BLIS is easy and can be done via the ## Adding a new configuration family -Adding support for a new umbrella configuration family in BLIS is fairly straightforward and can be done via the following steps. The hypothetical examples used in these steps assume you are trying to create a new configuration family `intelavx` that supports only Intel microarchitectures that support the Intel AVX instruction set. +Adding support for a new umbrella configuration family in BLIS is fairly straightforward and can be done via the following steps. 
The hypothetical examples used in these steps assume you are trying to create a new configuration family `intelavx` that supports only Intel microarchitectures that support the Intel AVX instruction set. @@ -636,7 +636,7 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f ``` THIS_CONFIG := knl ``` - and while we're editing the file, we can make any other changes to compiler flags we wish (if any). Similarly, the `bli_family_knl.h` header file should be updated as needed. Since the number of vector registers and the vector register size on `knl` differ from the defaults, we must explicitly set them. (The role of these parameters was explained in a [previous section](ConfigurationHowTo.md#bli_family_h).) Furthermore, provided that a macro `BLIS_NO_HBWMALLOC` is not set, we use a different implementation of `malloc()` and `free()` and `#include` that implementation's header file. + and while we're editing the file, we can make any other changes to compiler flags we wish (if any). Similarly, the `bli_family_knl.h` header file should be updated as needed. Since the number of vector registers and the vector register size on `knl` differ from the defaults, we must explicitly set them. (The role of these parameters was explained in a [previous section](ConfigurationHowTo.md#bli_family_h).) Furthermore, provided that a macro `BLIS_NO_HBWMALLOC` is not set, we use a different implementation of `malloc()` and `free()` and `#include` that implementation's header file. ```c #define BLIS_SIMD_MAX_NUM_REGISTERS 32 #define BLIS_SIMD_MAX_SIZE 64 @@ -714,7 +714,7 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f #include "bli_family_knl.h" #endif ``` - As before with umbrella families, the `BLIS_FAMILY_KNL` macro is automatically defined by the build system for whatever family was targeted by `configure`. (That is, if the user runs `./configure foobar`, the C preprocessor macro `BLIS_FAMILY_FOOBAR` will be defined.) 
+ As before with umbrella families, the `BLIS_FAMILY_KNL` macro is automatically defined by the build system for whatever family was targeted by `configure`. (That is, if the user runs `./configure foobar`, the C preprocessor macro `BLIS_FAMILY_FOOBAR` will be defined.) diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index 5fdfdb91e..1d12b42eb 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -61,15 +61,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - n, \ - x, incx, \ - y, incy, \ - cntx \ + conjx, \ + n, \ + x, incx, \ + y, incy, \ + cntx \ ); \ } @@ -98,14 +98,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - n, \ - x, incx, \ - index, \ - cntx \ + n, \ + x, incx, \ + index, \ + cntx \ ); \ } @@ -135,17 +135,17 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - n, \ - alpha, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + conjx, \ + n, \ + alpha, \ + x, incx, \ + beta, \ + y, incy, \ + cntx \ ); \ } @@ -175,16 +175,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \ if ( cntx == NULL ) \ cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - n, \ - alpha, \ - x, incx, \ - y, incy, \ - cntx \ + conjx, \ + n, \ + alpha, \ + x, incx, \ + y, incy, \ + cntx \ ); \ } @@ -215,17 +215,17 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - conjy, \ - n, \ - x, incx, \ - y, incy, \ - rho, \ - cntx \ + conjx, \ + conjy, \ + n, \ + x, incx, \ + y, incy, \ + rho, \ + cntx \ ); \ } @@ -257,19 +257,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - conjy, \ - n, \ - alpha, \ - x, incx, \ - y, incy, \ - beta, \ - rho, \ - cntx \ + conjx, \ + conjy, \ + n, \ + alpha, \ + x, incx, \ + y, incy, \ + beta, \ + rho, \ + cntx \ ); \ } @@ -295,13 +295,13 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - n, \ - x, incx, \ - cntx \ + n, \ + x, incx, \ + cntx \ ); \ } @@ -329,15 +329,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjalpha, \ - n, \ - alpha, \ - x, incx, \ - cntx \ + conjalpha, \ + n, \ + alpha, \ + x, incx, \ + cntx \ ); \ } @@ -365,14 +365,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - n, \ - x, incx, \ - y, incy, \ - cntx \ + n, \ + x, incx, \ + y, incy, \ + cntx \ ); \ } @@ -400,16 +400,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - n, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + conjx, \ + n, \ + x, incx, \ + beta, \ + y, incy, \ + cntx \ ); \ } diff --git a/frame/1/other/packv/bli_packv_unb_var1.c b/frame/1/other/packv/bli_packv_unb_var1.c index 23b370949..ca1323b58 100644 --- a/frame/1/other/packv/bli_packv_unb_var1.c +++ b/frame/1/other/packv/bli_packv_unb_var1.c @@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ - PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ diff --git a/frame/1/other/unpackv/bli_unpackv_unb_var1.c b/frame/1/other/unpackv/bli_unpackv_unb_var1.c index 5dc1101b6..43c9a266c 100644 --- a/frame/1/other/unpackv/bli_unpackv_unb_var1.c +++ b/frame/1/other/unpackv/bli_unpackv_unb_var1.c @@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ - PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c index a8f9e844a..cfaf5150f 100644 --- a/frame/1d/bli_l1d_tapi.c +++ b/frame/1d/bli_l1d_tapi.c @@ -85,32 +85,33 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ if ( bli_is_nonunit_diag( diagx ) ) \ { \ - x1 = x + offx; \ - y1 = y + offy; \ + x1 = x + offx; \ + y1 = y + offy; \ } \ else /* if ( bli_is_unit_diag( diagx ) ) */ \ { \ - /* Simulate a unit diagonal for x with a zero increment over a unit - scalar. */ \ - x1 = PASTEMAC(ch,1); \ - incx = 0; \ - y1 = y + offy; \ + /* Simulate a unit diagonal for x with a zero increment over a unit + scalar. 
*/ \ + x1 = PASTEMAC(ch,1); \ + incx = 0; \ + y1 = y + offy; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjx, \ - n_elem, \ - x1, incx, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( addd, addv, BLIS_ADDV_KER ) @@ -164,33 +165,34 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ if ( bli_is_nonunit_diag( diagx ) ) \ { \ - x1 = x + offx; \ - y1 = y + offy; \ + x1 = x + offx; \ + y1 = y + offy; \ } \ else /* if ( bli_is_unit_diag( diagx ) ) */ \ { \ - /* Simulate a unit diagonal for x with a zero increment over a unit - scalar. */ \ - x1 = PASTEMAC(ch,1); \ - incx = 0; \ - y1 = y + offy; \ + /* Simulate a unit diagonal for x with a zero increment over a unit + scalar. */ \ + x1 = PASTEMAC(ch,1); \ + incx = 0; \ + y1 = y + offy; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjx, \ - n_elem, \ - alpha, \ - x1, incx, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + alpha, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( axpyd, axpyv, BLIS_AXPYV_KER ) @@ -233,20 +235,21 @@ void PASTEMAC2(ch,opname,EX_SUF) \ &offx, &n_elem, &incx \ ); \ \ - x1 = x + offx; \ + x1 = x + offx; \ \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - n_elem, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + n_elem, \ + x1, incx, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( invertd, invertv, BLIS_INVERTV_KER ) @@ -290,22 +293,23 @@ void PASTEMAC2(ch,opname,EX_SUF) \ &offx, &n_elem, &incx \ ); \ \ - x1 = x + offx; \ + x1 = x + offx; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjalpha, \ - n_elem, \ - alpha, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + conjalpha, \ + n_elem, \ + alpha, \ + x1, incx, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( scald, scalv, BLIS_SCALV_KER ) @@ -361,27 +365,28 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTEMAC(ch,setis)( *alpha, *chi11 ); \ } */ \ \ - /* Acquire the addres of the imaginary component of the first element, + /* Acquire the address of the imaginary component of the first element, and scale the increment for use in the real domain. Note that the indexing into the imaginary field only needs to work for complex datatypes since we return early for real domain types. */ \ - x1 = ( ctype_r* )( x + offx ) + 1; \ + x1 = ( ctype_r* )( x + offx ) + 1; \ incx = 2*incx; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. 
*/ \ - PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt_r, kerid, cntx ); \ + PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt_r, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - BLIS_NO_CONJUGATE, \ - n_elem, \ - alpha, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + BLIS_NO_CONJUGATE, \ + n_elem, \ + alpha, \ + x1, incx, \ + cntx \ + ); \ } INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER ) @@ -424,22 +429,23 @@ void PASTEMAC2(ch,opname,EX_SUF) \ &offx, &n_elem, &incx \ ); \ \ - x1 = x + offx; \ + x1 = x + offx; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - BLIS_NO_CONJUGATE, \ - n_elem, \ - alpha, 0, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + BLIS_NO_CONJUGATE, \ + n_elem, \ + alpha, 0, \ + x1, incx, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER ) @@ -491,33 +497,34 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ if ( bli_is_nonunit_diag( diagx ) ) \ { \ - x1 = x + offx; \ - y1 = y + offy; \ + x1 = x + offx; \ + y1 = y + offy; \ } \ else /* if ( bli_is_unit_diag( diagx ) ) */ \ { \ - /* Simulate a unit diagonal for x with a zero increment over a unit - scalar. */ \ - x1 = PASTEMAC(ch,1); \ - incx = 0; \ - y1 = y + offy; \ + /* Simulate a unit diagonal for x with a zero increment over a unit + scalar. */ \ + x1 = PASTEMAC(ch,1); \ + incx = 0; \ + y1 = y + offy; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. 
*/ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjx, \ - n_elem, \ - x1, incx, \ - beta, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + x1, incx, \ + beta, \ + y1, incy, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( xpbyd, xpbyv, BLIS_XPBYV_KER ) diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c index 332ff5af2..a54379299 100644 --- a/frame/1f/bli_l1f_tapi.c +++ b/frame/1f/bli_l1f_tapi.c @@ -65,19 +65,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - conjy, \ - n, \ - alphax, \ - alphay, \ - x, incx, \ - y, incy, \ - z, incz, \ - cntx \ + conjx, \ + conjy, \ + n, \ + alphax, \ + alphay, \ + x, incx, \ + y, incy, \ + z, incz, \ + cntx \ ); \ } @@ -109,19 +109,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conja, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - y, incy, \ - cntx \ + conja, \ + conjx, \ + m, \ + b_n, \ + alpha, \ + a, inca, lda, \ + x, incx, \ + y, incy, \ + cntx \ ); \ } @@ -154,20 +154,20 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjxt, \ - conjx, \ - conjy, \ - n, \ - alpha, \ - x, incx, \ - y, incy, \ - rho, \ - z, incz, \ - cntx \ + conjxt, \ + conjx, \ + conjy, \ + n, \ + alpha, \ + x, incx, \ + y, incy, \ + rho, \ + z, incz, \ + cntx \ ); \ } @@ -204,24 +204,24 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjat, \ - conja, \ - conjw, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - w, incw, \ - x, incx, \ - beta, \ - y, incy, \ - z, incz, \ - cntx \ + conjat, \ + conja, \ + conjw, \ + conjx, \ + m, \ + b_n, \ + alpha, \ + a, inca, lda, \ + w, incw, \ + x, incx, \ + beta, \ + y, incy, \ + z, incz, \ + cntx \ ); \ } @@ -254,20 +254,20 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjat, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + conjat, \ + conjx, \ + m, \ + b_n, \ + alpha, \ + a, inca, lda, \ + x, incx, \ + beta, \ + y, incy, \ + cntx \ ); \ } diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ft_ker.h index 2e813cf4a..41d80e217 100644 --- a/frame/1m/bli_l1m_ft_ker.h +++ b/frame/1m/bli_l1m_ft_ker.h @@ -102,35 +102,40 @@ INSERT_GENTDEF( packm_cxk ) \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ - conj_t conjp, \ + conj_t conja, \ + pack_t schema, \ + dim_t cdim, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ + cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) -// packm_1er_ker +// packm_diag_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ + struc_t struca, \ + diag_t diaga, \ + uplo_t uploa, \ conj_t conja, \ pack_t schema, \ + bool invdiag, \ dim_t cdim, \ - dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* restrict cntx \ ); -INSERT_GENTDEF( packm_cxk_1er ) +INSERT_GENTDEF( packm_cxc_diag ) #endif diff --git a/frame/1m/bli_l1m_ker.h b/frame/1m/bli_l1m_ker.h index 76d51af2b..970c5f040 100644 --- a/frame/1m/bli_l1m_ker.h +++ b/frame/1m/bli_l1m_ker.h @@ -47,16 +47,8 @@ #undef GENTPROT #define GENTPROT PACKM_KER_PROT -INSERT_GENTPROT_BASIC0( packm_2xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_3xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_4xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_6xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_8xk_ker_name ) -INSERT_GENTPROT_BASIC0( 
packm_10xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_12xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_14xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_16xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_24xk_ker_name ) +INSERT_GENTPROT_BASIC0( packm_mrxk_ker_name ) +INSERT_GENTPROT_BASIC0( packm_nrxk_ker_name ) // native unpackm kernels @@ -64,27 +56,33 @@ INSERT_GENTPROT_BASIC0( packm_24xk_ker_name ) #undef GENTPROT #define GENTPROT UNPACKM_KER_PROT -INSERT_GENTPROT_BASIC0( unpackm_2xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_4xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_6xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_8xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_10xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_12xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_14xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_16xk_ker_name ) +INSERT_GENTPROT_BASIC0( unpackm_mrxk_ker_name ) +INSERT_GENTPROT_BASIC0( unpackm_nrxk_ker_name ) // 1e/1r packm kernels #undef GENTPROT -#define GENTPROT PACKM_1ER_KER_PROT +#define GENTPROT PACKM_KER_PROT -INSERT_GENTPROT_BASIC0( packm_2xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_4xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_6xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_8xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_10xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_12xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_14xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_16xk_1er_ker_name ) +INSERT_GENTPROT_BASIC0( packm_mrxk_1er_ker_name ) +INSERT_GENTPROT_BASIC0( packm_nrxk_1er_ker_name ) + + +// packm kernels for diagonal blocks + +#undef GENTPROT +#define GENTPROT PACKM_DIAG_KER_PROT + +INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_ker_name ) +INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_ker_name ) + + +// 1e/1r packm kernels for diagonal blocks + +#undef GENTPROT +#define GENTPROT PACKM_DIAG_KER_PROT + +INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_1er_ker_name ) +INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_1er_ker_name ) diff --git 
a/frame/1m/bli_l1m_ker_prot.h b/frame/1m/bli_l1m_ker_prot.h index 02d329622..80284ea22 100644 --- a/frame/1m/bli_l1m_ker_prot.h +++ b/frame/1m/bli_l1m_ker_prot.h @@ -37,7 +37,7 @@ // Define template prototypes for level-1m kernels. // -// native packm kernels +// packm kernels #define PACKM_KER_PROT( ctype, ch, varname ) \ \ @@ -55,35 +55,40 @@ void PASTEMAC(ch,varname) \ ); -// native unpackm kernels +// unpackm kernels #define UNPACKM_KER_PROT( ctype, ch, varname ) \ \ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conja, \ - dim_t n, \ - ctype* restrict kappa, \ - ctype* restrict p, inc_t ldp, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ); - - -// 1e/1r packm kernels - -#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \ -\ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ + ctype* restrict kappa, \ + ctype* restrict p, inc_t ldp, \ + ctype* restrict a, inc_t inca, inc_t lda, \ + cntx_t* restrict cntx \ + ); + + +// packm kernels for diagonal blocks + +#define PACKM_DIAG_KER_PROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t struca, \ + diag_t diaga, \ + uplo_t uploa, \ + conj_t conja, \ + pack_t schema, \ + bool invdiag, \ + dim_t cdim, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* restrict cntx \ ); diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c index f2ce3c8d7..c979f082a 100644 --- a/frame/1m/bli_l1m_unb_var1.c +++ b/frame/1m/bli_l1m_unb_var1.c @@ -80,7 +80,7 @@ void PASTEMAC(ch,opname) \ conjx = bli_extract_conj( transx ); \ \ /* Query the kernel needed for this operation. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Handle dense and upper/lower storage cases separately. 
*/ \ if ( bli_is_dense( uplox_eff ) ) \ @@ -197,7 +197,7 @@ void PASTEMAC(ch,opname) \ conjx = bli_extract_conj( transx ); \ \ /* Query the kernel needed for this operation. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ @@ -310,7 +310,7 @@ void PASTEMAC(ch,opname) \ if ( bli_is_zeros( uplox_eff ) ) return; \ \ /* Query the kernel needed for this operation. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ @@ -423,7 +423,7 @@ void PASTEMAC(ch,opname) \ conjx = bli_extract_conj( transx ); \ \ /* Query the kernel needed for this operation. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ diff --git a/frame/1m/packm/bli_packm_cxk.c b/frame/1m/other/bli_packm_cxk.c similarity index 84% rename from frame/1m/packm/bli_packm_cxk.c rename to frame/1m/other/bli_packm_cxk.c index ea0418cae..612b37f78 100644 --- a/frame/1m/packm/bli_packm_cxk.c +++ b/frame/1m/other/bli_packm_cxk.c @@ -54,15 +54,16 @@ void PASTEMAC(ch,opname) \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim_max; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? 
BLIS_PACKM_NRXK_KER \ + : BLIS_PACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. */ \ - f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ @@ -91,30 +92,30 @@ void PASTEMAC(ch,opname) \ that happens, the packm kernel must have set the 0's added in step (3) below. - packm kernel packm kernel packm kernel packm_tri_cxk + packm kernel packm kernel packm kernel packm_tri_cxk step 1: step 2: step 3: step 4: - x x x x . . x x x x . . x x x x 0 0 x x x x 0 0 - ? x x x . . ? x x x . . ? x x x 0 0 ? x x x 0 0 - ? ? x x . . -> ? ? x x . . -> ? ? x x 0 0 -> ? ? x x 0 0 - ? ? ? x . . ? ? ? x . . ? ? ? x 0 0 ? ? ? x 0 0 - . . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 - . . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 + x x x x . . x x x x . . x x x x 0 0 x x x x 0 0 + ? x x x . . ? x x x . . ? x x x 0 0 ? x x x 0 0 + ? ? x x . . -> ? ? x x . . -> ? ? x x 0 0 -> ? ? x x 0 0 + ? ? ? x . . ? ? ? x . . ? ? ? x 0 0 ? ? ? x 0 0 + . . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + . . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 x Copied from A; valid element. - ? Copied from A, but value is unknown and unused. + ? Copied from A, but value is unknown and unused. . Uninitialized. - 0 Initialized to zero. - 1 Initialized to one. + 0 Initialized to zero. + 1 Initialized to one. NOTE: In step 5 (not shown), bli_packm_tri_cxk() sets the ?'s to zero. This is not needed to support trsm, but rather to support trmm. (Both use the same packing format and code.) 
- In this case, panel_dim will be 4 because four rows of data are - copied from A, panel_len will be 4 because those four rows span - four columns of A, and panel_len_max will be 6 because there are a - total of 6 columns that can be written to in the packed micropanel, + In this case, panel_dim will be 4 because four rows of data are + copied from A, panel_len will be 4 because those four rows span + four columns of A, and panel_len_max will be 6 because there are a + total of 6 columns that can be written to in the packed micropanel, 2 of which lie beyond the values copied from A. */ \ f \ ( \ diff --git a/frame/1m/packm/bli_packm_cxk.h b/frame/1m/other/bli_packm_cxk.h similarity index 100% rename from frame/1m/packm/bli_packm_cxk.h rename to frame/1m/other/bli_packm_cxk.h diff --git a/frame/1m/packm/bli_packm_cxk_1er.c b/frame/1m/other/bli_packm_cxk_1er.c similarity index 94% rename from frame/1m/packm/bli_packm_cxk_1er.c rename to frame/1m/other/bli_packm_cxk_1er.c index e583c8a82..22598dbac 100644 --- a/frame/1m/packm/bli_packm_cxk_1er.c +++ b/frame/1m/other/bli_packm_cxk_1er.c @@ -54,15 +54,16 @@ void PASTEMAC(ch,opname) \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim_max; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER \ + : BLIS_PACKM_MRXK_1ER_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. */ \ - f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. 
*/ \ diff --git a/frame/1m/packm/bli_packm_cxk_1er.h b/frame/1m/other/bli_packm_cxk_1er.h similarity index 100% rename from frame/1m/packm/bli_packm_cxk_1er.h rename to frame/1m/other/bli_packm_cxk_1er.h diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.c b/frame/1m/other/bli_packm_struc_cxk_1er.c similarity index 100% rename from frame/1m/packm/bli_packm_struc_cxk_1er.c rename to frame/1m/other/bli_packm_struc_cxk_1er.c diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.h b/frame/1m/other/bli_packm_struc_cxk_1er.h similarity index 100% rename from frame/1m/packm/bli_packm_struc_cxk_1er.h rename to frame/1m/other/bli_packm_struc_cxk_1er.h diff --git a/frame/1m/unpackm/bli_unpackm_cxk.c b/frame/1m/other/bli_unpackm_cxk.c similarity index 92% rename from frame/1m/unpackm/bli_unpackm_cxk.c rename to frame/1m/other/bli_unpackm_cxk.c index 4423c41a2..4b7977e86 100644 --- a/frame/1m/unpackm/bli_unpackm_cxk.c +++ b/frame/1m/other/bli_unpackm_cxk.c @@ -40,6 +40,7 @@ void PASTEMAC(ch,opname) \ ( \ conj_t conjp, \ + pack_t schema, \ dim_t panel_dim, \ dim_t panel_len, \ ctype* kappa, \ @@ -48,15 +49,16 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER \ + : BLIS_UNPACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the unpackm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. */ \ - f = bli_cntx_get_unpackm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. 
*/ \ diff --git a/frame/1m/unpackm/bli_unpackm_cxk.h b/frame/1m/other/bli_unpackm_cxk.h similarity index 98% rename from frame/1m/unpackm/bli_unpackm_cxk.h rename to frame/1m/other/bli_unpackm_cxk.h index 53c3c0c44..d828a9b8e 100644 --- a/frame/1m/unpackm/bli_unpackm_cxk.h +++ b/frame/1m/other/bli_unpackm_cxk.h @@ -39,6 +39,7 @@ void PASTEMAC(ch,varname) \ ( \ conj_t conjp, \ + pack_t schema, \ dim_t panel_dim, \ dim_t panel_len, \ ctype* kappa, \ diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 88657a712..7d73bf903 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -43,10 +43,6 @@ #include "bli_packm_part.h" #include "bli_packm_struc_cxk.h" -#include "bli_packm_struc_cxk_1er.h" - -#include "bli_packm_cxk.h" -#include "bli_packm_cxk_1er.h" // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index edeeae2b9..e13391151 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -43,11 +43,11 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = { { bli_spackm_struc_cxk, bli_cpackm_struc_cxk, bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } }, // 0001 row/col panels: 1m-expanded (1e) - { { NULL, bli_cpackm_struc_cxk_1er, - NULL, bli_zpackm_struc_cxk_1er, } }, + { { NULL, bli_cpackm_struc_cxk, + NULL, bli_zpackm_struc_cxk, } }, // 0010 row/col panels: 1m-reordered (1r) - { { NULL, bli_cpackm_struc_cxk_1er, - NULL, bli_zpackm_struc_cxk_1er, } }, + { { NULL, bli_cpackm_struc_cxk, + NULL, bli_zpackm_struc_cxk, } }, }; static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md); diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index 2a52c42de..dbdaf4738 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -34,8 +34,8 @@ #include "blis.h" -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) 
\ +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, cxk_kername, cxc_kername ) \ \ void PASTEMAC(ch,varname) \ ( \ @@ -58,460 +58,249 @@ void PASTEMAC(ch,varname) \ cntx_t* cntx \ ) \ { \ - /* Handle micro-panel packing based on the structure of the matrix - being packed. */ \ - if ( bli_is_general( strucc ) ) \ - { \ - /* For micro-panels of general matrices, we can call the pack - kernel front-end directly. */ \ - PASTEMAC(ch,kername) \ - ( \ - conjc, \ - schema, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - cntx \ - ); \ - } \ - else if ( bli_is_herm_or_symm( strucc ) ) \ - { \ - /* Call a helper function for micro-panels of Hermitian/symmetric - matrices. */ \ - PASTEMAC(ch,packm_herm_cxk) \ - ( \ - strucc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - panel_dim, \ - panel_len, \ - panel_dim_max, \ - panel_len_max, \ - panel_dim_off, \ - panel_len_off, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - is_p, \ - cntx \ - ); \ - } \ - else /* ( bli_is_triangular( strucc ) ) */ \ - { \ - /* Call a helper function for micro-panels of triangular - matrices. 
*/ \ - PASTEMAC(ch,packm_tri_cxk) \ - ( \ - strucc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - panel_dim, \ - panel_len, \ - panel_dim_max, \ - panel_len_max, \ - panel_dim_off, \ - panel_len_off, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - is_p, \ - cntx \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) \ + num_t dt = PASTEMAC(ch,type); \ + num_t dt_r = PASTEMAC(chr,type); \ + dim_t panel_len_pad = panel_len_max - panel_len; \ \ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t panel_dim, \ - dim_t panel_len, \ - dim_t panel_dim_max, \ - dim_t panel_len_max, \ - dim_t panel_dim_off, \ - dim_t panel_len_off, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t ldp, \ - inc_t is_p, \ - cntx_t* cntx \ - ) \ -{ \ + bszid_t bsz_id = bli_is_col_packed( schema ) ? BLIS_NR : BLIS_MR; \ + dim_t packmrnr = bli_cntx_get_blksz_max_dt( dt, bsz_id, cntx ); \ + dim_t packmrnr_r = bli_cntx_get_blksz_max_dt( dt_r, bsz_id, cntx ); \ +\ + ukr_t cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \ + : BLIS_PACKM_MRXK_KER; \ + ukr_t cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_KER \ + : BLIS_PACKM_MRXMR_DIAG_KER; \ +\ + if ( bli_is_1m_packed( schema ) ) \ + { \ + cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER \ + : BLIS_PACKM_MRXK_1ER_KER; \ + cxc_ker_id = bli_is_col_packed( schema ) ? 
BLIS_PACKM_NRXNR_DIAG_1ER_KER \ + : BLIS_PACKM_MRXMR_DIAG_1ER_KER; \ + } \ +\ + PASTECH2(ch,cxk_kername,_ker_ft) f_cxk = bli_cntx_get_ukr_dt( dt, cxk_ker_id, cntx ); \ + PASTECH2(ch,cxc_kername,_ker_ft) f_cxc = bli_cntx_get_ukr_dt( dt, cxc_ker_id, cntx ); \ +\ + /* For general matrices, pack and return early */ \ + if ( bli_is_general( strucc ) ) \ + { \ + f_cxk \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + cntx \ + ); \ + return; \ + } \ +\ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then somehow the constraints on + cache blocksizes being a whole multiple of the register + blocksizes was somehow violated. */ \ doff_t diagoffc = panel_dim_off - panel_len_off; \ - doff_t diagoffc_abs; \ - dim_t i, j; \ + if ( ( -panel_dim < diagoffc && diagoffc < 0 ) || \ + ( panel_len-panel_dim < diagoffc && diagoffc < panel_len ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ - /* Handle the case where the micro-panel does NOT intersect the - diagonal separately from the case where it does intersect. */ \ - if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \ + /* For triangular, symmetric, and hermitian matrices we need to consider + three parts. */ \ +\ + /* Pack to p10. */ \ + if ( 0 < diagoffc ) \ { \ - /* If the current panel is unstored, we need to make a few - adjustments so we refer to the data where it is actually - stored, also taking conjugation into account. (Note this - implicitly assumes we are operating on a dense panel - within a larger symmetric or Hermitian matrix, since a - general matrix would not contain any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \ + dim_t p10_dim = panel_dim; \ + dim_t p10_len = bli_min( diagoffc, panel_len ); \ + dim_t p10_len_max = p10_len == panel_len ? 
panel_len_max : p10_len; \ + ctype* p10 = p; \ + conj_t conjc10 = conjc; \ + ctype* c10 = c; \ + inc_t incc10 = incc; \ + inc_t ldc10 = ldc; \ +\ + if ( bli_is_upper( uploc ) ) \ { \ - c = c + diagoffc * ( doff_t )ldc + \ - -diagoffc * ( doff_t )incc; \ - bli_swap_incs( &incc, &ldc ); \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( &conjc ); \ - } \ -\ - /* Pack the full panel. */ \ - PASTEMAC(ch,kername) \ - ( \ - conjc, \ - schema, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - cntx \ - ); \ - } \ - else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \ - { \ - ctype* restrict c10; \ - ctype* restrict p10; \ - dim_t p10_dim, p10_len; \ - inc_t incc10, ldc10; \ - doff_t diagoffc10; \ - conj_t conjc10; \ -\ - ctype* restrict c12; \ - ctype* restrict p12; \ - dim_t p12_dim, p12_len; \ - inc_t incc12, ldc12; \ - doff_t diagoffc12; \ - conj_t conjc12; \ -\ - /* Sanity check. Diagonals should not intersect the short end of - a micro-panel. If they do, then somehow the constraints on - cache blocksizes being a whole multiple of the register - blocksizes was somehow violated. 
*/ \ - if ( diagoffc < 0 ) \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ -\ - diagoffc_abs = bli_abs( diagoffc ); \ -\ - if ( bli_is_lower( uploc ) ) \ - { \ - p10_dim = panel_dim; \ - p10_len = diagoffc_abs; \ - p10 = p; \ - c10 = c; \ - incc10 = incc; \ - ldc10 = ldc; \ - conjc10 = conjc; \ -\ - p12_dim = panel_dim; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - diagoffc12 = diagoffc_abs - j; \ - p12 = p + (j )*ldp; \ - c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )ldc + \ - -diagoffc12 * ( doff_t )incc; \ - incc12 = ldc; \ - ldc12 = incc; \ - conjc12 = conjc; \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( &conjc12 ); \ - } \ - else /* if ( bli_is_upper( uploc ) ) */ \ - { \ - p10_dim = panel_dim; \ - p10_len = diagoffc_abs + panel_dim; \ - diagoffc10 = diagoffc; \ - p10 = p; \ - c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )ldc + \ - -diagoffc10 * ( doff_t )incc; \ - incc10 = ldc; \ - ldc10 = incc; \ - conjc10 = conjc; \ -\ - p12_dim = panel_dim; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - p12 = p + (j )*ldp; \ - c12 = c + (j )*ldc; \ - incc12 = incc; \ - ldc12 = ldc; \ - conjc12 = conjc; \ + bli_reflect_to_stored_part( diagoffc, c10, incc10, ldc10 ); \ \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( &conjc10 ); \ } \ \ - /* Pack to p10. For upper storage, this includes the unstored - triangle of c11. */ \ - /* NOTE: Since we're only packing partial panels here, we pass in - p1x_len as panel_len_max; otherwise, the packm kernel will zero- - fill the columns up to panel_len_max, which is not what we need - or want to happen. */ \ - PASTEMAC(ch,kername) \ - ( \ - conjc10, \ - schema, \ - p10_dim, \ - panel_dim_max, \ - p10_len, \ - p10_len, \ - kappa, \ - c10, incc10, ldc10, \ - p10, ldp, \ - cntx \ - ); \ -\ - /* Pack to p12. For lower storage, this includes the unstored - triangle of c11. 
*/ \ - /* NOTE: Since we're only packing partial panels here, we pass in - p1x_len as panel_len_max; otherwise, the packm kernel will zero- - fill the columns up to panel_len_max, which is not what we need - or want to happen. */ \ - PASTEMAC(ch,kername) \ - ( \ - conjc12, \ - schema, \ - p12_dim, \ - panel_dim_max, \ - p12_len, \ - p12_len, \ - kappa, \ - c12, incc12, ldc12, \ - p12, ldp, \ - cntx \ - ); \ -\ - /* Pack the stored triangle of c11 to p11. */ \ + /* If we are referencing the unstored part of a triangular matrix, + explicitly store zeros */ \ + if ( bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) \ { \ - dim_t p11_m = panel_dim; \ - dim_t p11_n = panel_dim; \ - dim_t j2 = diagoffc_abs; \ - ctype* restrict c11 = c + (j2 )*ldc; \ - ctype* restrict p11 = p + (j2 )*ldp; \ - trans_t transc = ( trans_t )conjc; \ -\ - PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - transc, \ - p11_m, \ - p11_n, \ - c11, incc, ldc, \ - p11, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* If source matrix c is Hermitian, we have to zero out the - imaginary components of the diagonal of p11 in case the - corresponding elements in c11 were not already zero. */ \ - if ( bli_is_hermitian( strucc ) ) \ + if ( bli_is_1m_packed( schema ) ) \ { \ - ctype* restrict pi11 = p11; \ + ctype_r* restrict zero = PASTEMAC(chr,0); \ \ - for ( i = 0; i < p11_m; ++i ) \ - { \ - PASTEMAC(ch,seti0s)( *pi11 ); \ -\ - pi11 += 1 + ldp; \ - } \ + PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr_r, \ + p10_len_max * 2, \ + zero, \ + ( ctype_r* )p10, 1, ldp, \ + cntx, \ + NULL \ + ); \ } \ + else \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ \ - /* Now that the diagonal has been made explicitly Hermitian - (if applicable), we can now safely scale the stored - triangle specified by uploc. 
*/ \ - PASTEMAC2(ch,scalm,BLIS_TAPI_EX_SUF) \ + PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr, \ + p10_len_max, \ + zero, \ + p10, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + } \ + else \ + { \ + f_cxk \ ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - p11_m, \ - p11_n, \ + conjc10, \ + schema, \ + p10_dim, \ + p10_len, \ + p10_len_max, \ kappa, \ - p11, 1, ldp, \ - cntx, \ - NULL \ + c10, incc10, ldc10, \ + p10, ldp, \ + cntx \ + ); \ + } \ + } \ +\ + /* Pack to p11. */ \ + if ( 0 <= diagoffc && diagoffc + panel_dim <= panel_len ) \ + { \ + dim_t i = diagoffc; \ + dim_t p11_dim = panel_dim; \ + dim_t p11_len_max = panel_dim + ( diagoffc + panel_dim == panel_len \ + ? panel_len_pad : 0 ); \ + ctype* p11 = p + i * ldp; \ + conj_t conjc11 = conjc; \ + ctype* c11 = c + i * ldc; \ + inc_t incc11 = incc; \ + inc_t ldc11 = ldc; \ +\ + f_cxc \ + ( \ + strucc, \ + diagc, \ + uploc, \ + conjc11, \ + schema, \ + invdiag, \ + p11_dim, \ + p11_len_max, \ + kappa, \ + c11, incc11, ldc11, \ + p11, ldp, \ + cntx \ + ); \ + } \ +\ + /* Pack to p12. */ \ + if ( diagoffc + panel_dim < panel_len ) \ + { \ + dim_t i = bli_max( 0, diagoffc + panel_dim ); \ + dim_t p12_dim = panel_dim; \ + dim_t p12_len = panel_len - i; \ + /* If we are packing p12, then it is always the last partial block \ + and so we should make sure to pad with zeros if necessary. 
*/ \ + dim_t p12_len_max = p12_len + panel_len_pad; \ + ctype* p12 = p + i * ldp; \ + conj_t conjc12 = conjc; \ + ctype* c12 = c + i * ldc; \ + inc_t incc12 = incc; \ + inc_t ldc12 = ldc; \ +\ + if ( bli_is_lower( uploc ) ) \ + { \ + bli_reflect_to_stored_part( diagoffc - i, c12, incc12, ldc12 ); \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( &conjc12 ); \ + } \ +\ + /* If we are referencing the unstored part of a triangular matrix, + explicitly store zeros */ \ + if ( bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) \ + { \ + if ( bli_is_1m_packed( schema ) ) \ + { \ + ctype_r* restrict zero = PASTEMAC(chr,0); \ +\ + PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr_r, \ + p12_len_max * 2, \ + zero, \ + ( ctype_r* )p12, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + else \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ +\ + PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr, \ + p12_len_max, \ + zero, \ + p12, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + } \ + else \ + { \ + f_cxk \ + ( \ + conjc12, \ + schema, \ + p12_dim, \ + p12_len, \ + p12_len_max, \ + kappa, \ + c12, incc12, ldc12, \ + p12, ldp, \ + cntx \ ); \ } \ } \ } -INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk ) - - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t panel_dim, \ - dim_t panel_len, \ - dim_t panel_dim_max, \ - dim_t panel_len_max, \ - dim_t panel_dim_off, \ - dim_t panel_len_off, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t ldp, \ - inc_t is_p, \ - cntx_t* cntx \ - ) \ -{ \ - doff_t diagoffc = panel_dim_off - panel_len_off; \ -\ - /* Pack the panel. 
*/ \ - PASTEMAC(ch,kername) \ - ( \ - conjc, \ - schema, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - cntx \ - ); \ -\ -\ - /* If the diagonal of c is implicitly unit, explicitly set the - the diagonal of the packed panel to kappa. */ \ - if ( bli_is_unit_diag( diagc ) ) \ - { \ - PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - diagoffc, \ - panel_dim, \ - panel_len, \ - kappa, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ -\ - /* If requested, invert the diagonal of the packed panel. */ \ - if ( invdiag == TRUE ) \ - { \ - PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \ - ( \ - diagoffc, \ - panel_dim, \ - panel_len, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ -\ - /* Set the region opposite the diagonal of p to zero. To do this, - we need to reference the "unstored" region on the other side of - the diagonal. This amounts to toggling uploc and then shifting - the diagonal offset to shrink the newly referenced region (by - one diagonal). Note that this zero-filling is not needed for - trsm, since the unstored region is not referenced by the trsm - micro-kernel; however, zero-filling is needed for trmm, which - uses the gemm micro-kernel.*/ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - uplo_t uplop = uploc; \ -\ - bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \ -\ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - diagoffc, \ - BLIS_NONUNIT_DIAG, \ - uplop, \ - panel_dim, \ - panel_len, \ - zero, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ -\ - /* If this panel is an edge case in both panel dimension and length, - then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. - NOTE: This is actually only necessary when packing for trsm, as - it helps prevent NaNs and Infs from creeping into the computation. 
- However, we set the region to identity for trmm as well. Those - 1.0's end up getting muliplied by the 0.0's in the zero-padded - region of the other matrix, so there is no harm in this. */ \ - if ( panel_dim != panel_dim_max && \ - panel_len != panel_len_max ) \ - { \ - ctype* restrict one = PASTEMAC(ch,1); \ - dim_t i = panel_dim; \ - dim_t j = panel_len; \ - dim_t m_br = panel_dim_max - i; \ - dim_t n_br = panel_len_max - j; \ - ctype* p_br = p + (i ) + (j )*ldp; \ -\ - PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - m_br, \ - n_br, \ - one, \ - p_br, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk ) +INSERT_GENTFUNCR_BASIC2( packm_struc_cxk, packm_cxk, packm_cxc_diag ) diff --git a/frame/1m/unpackm/bli_unpackm.h b/frame/1m/unpackm/bli_unpackm.h index 5e4542841..80fa3804a 100644 --- a/frame/1m/unpackm/bli_unpackm.h +++ b/frame/1m/unpackm/bli_unpackm.h @@ -37,5 +37,3 @@ #include "bli_unpackm_int.h" #include "bli_unpackm_blk_var1.h" - -#include "bli_unpackm_cxk.h" diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c index b2c862045..b6165f516 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var1.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c @@ -36,21 +36,22 @@ #define FUNCPTR_T unpackm_fp -typedef void (*FUNCPTR_T)( - struc_t strucc, - doff_t diagoffc, - diag_t diagc, - uplo_t uploc, - trans_t transc, - dim_t m, - dim_t n, - dim_t m_panel, - dim_t n_panel, - void* p, inc_t rs_p, inc_t cs_p, - dim_t pd_p, inc_t ps_p, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx - ); +typedef void (*FUNCPTR_T) + ( + struc_t strucc, + doff_t diagoffc, + diag_t diagc, + uplo_t uploc, + trans_t transc, + dim_t m, + dim_t n, + dim_t m_panel, + dim_t n_panel, + void* p, inc_t rs_p, inc_t cs_p, + dim_t pd_p, inc_t ps_p, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx + ); static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1); @@ -152,10 +153,10 @@ void 
PASTEMAC(ch,varname) \ dim_t iter_dim; \ dim_t num_iter; \ dim_t it, ic, ip; \ - dim_t ic0, ip0; \ + dim_t ic0, ip0; \ doff_t ic_inc, ip_inc; \ - doff_t diagoffc_i; \ - doff_t diagoffc_inc; \ + doff_t diagoffc_i; \ + doff_t diagoffc_inc; \ dim_t panel_len; \ dim_t panel_dim_i; \ dim_t panel_dim_max; \ @@ -164,6 +165,7 @@ void PASTEMAC(ch,varname) \ inc_t ldp; \ dim_t* m_panel_full; \ dim_t* n_panel_full; \ + pack_t schema; \ \ \ /* If c needs a transposition, induce it so that we can more simply @@ -182,6 +184,7 @@ void PASTEMAC(ch,varname) \ if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to unpack from column panels. */ \ + schema = BLIS_PACKED_COL_PANELS; \ iter_dim = n; \ panel_len = m; \ panel_dim_max = pd_p; \ @@ -196,6 +199,7 @@ void PASTEMAC(ch,varname) \ else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to unpack from row panels. */ \ + schema = BLIS_PACKED_ROW_PANELS; \ iter_dim = m; \ panel_len = n; \ panel_dim_max = pd_p; \ @@ -207,6 +211,14 @@ void PASTEMAC(ch,varname) \ m_panel_full = &panel_dim_i; \ n_panel_full = &n; \ } \ +\ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER \ + : BLIS_UNPACKM_MRXK_KER; \ +\ + /* Query the context for the unpackm kernel corresponding to the current + panel dimension, or kernel id. */ \ + PASTECH2(ch,unpackm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* Compute the total number of iterations we'll need. */ \ num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ @@ -253,9 +265,10 @@ void PASTEMAC(ch,varname) \ else \ { \ /* Pack the current panel. 
*/ \ - PASTEMAC(ch,unpackm_cxk) \ + f \ ( \ BLIS_NO_CONJUGATE, \ + schema, \ panel_dim_i, \ panel_len, \ one, \ diff --git a/frame/2/gemv/bli_gemv_unb_var1.c b/frame/2/gemv/bli_gemv_unb_var1.c index 3f5681d2b..840b96901 100644 --- a/frame/2/gemv/bli_gemv_unb_var1.c +++ b/frame/2/gemv/bli_gemv_unb_var1.c @@ -70,7 +70,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < n_iter; ++i ) \ { \ diff --git a/frame/2/gemv/bli_gemv_unb_var2.c b/frame/2/gemv/bli_gemv_unb_var2.c index 8166aa417..7fc4fcfe4 100644 --- a/frame/2/gemv/bli_gemv_unb_var2.c +++ b/frame/2/gemv/bli_gemv_unb_var2.c @@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < n_iter; ++i ) \ { \ diff --git a/frame/2/gemv/bli_gemv_unf_var1.c b/frame/2/gemv/bli_gemv_unf_var1.c index e392e830e..0dceed4cf 100644 --- a/frame/2/gemv/bli_gemv_unf_var1.c +++ b/frame/2/gemv/bli_gemv_unf_var1.c @@ -71,7 +71,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxf_ker_ft) kfp_df; \ \ /* Query the context for the kernel function pointer and fusing factor. 
*/ \ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ \ for ( i = 0; i < n_iter; i += f ) \ diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c index fe7702e4c..4c43657ad 100644 --- a/frame/2/gemv/bli_gemv_unf_var2.c +++ b/frame/2/gemv/bli_gemv_unf_var2.c @@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ \ for ( i = 0; i < n_iter; i += f ) \ diff --git a/frame/2/ger/bli_ger_unb_var1.c b/frame/2/ger/bli_ger_unb_var1.c index d6cda277e..d8ddd1247 100644 --- a/frame/2/ger/bli_ger_unb_var1.c +++ b/frame/2/ger/bli_ger_unb_var1.c @@ -61,7 +61,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/ger/bli_ger_unb_var2.c b/frame/2/ger/bli_ger_unb_var2.c index 1590bfe5e..9c49e336b 100644 --- a/frame/2/ger/bli_ger_unb_var2.c +++ b/frame/2/ger/bli_ger_unb_var2.c @@ -61,7 +61,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( j = 0; j < n; ++j ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unb_var1.c b/frame/2/hemv/bli_hemv_unb_var1.c index ea5d478be..71c27a326 100644 --- a/frame/2/hemv/bli_hemv_unb_var1.c +++ b/frame/2/hemv/bli_hemv_unb_var1.c @@ -122,8 +122,8 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointers. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unb_var2.c b/frame/2/hemv/bli_hemv_unb_var2.c index 1f7346517..3753c8d3b 100644 --- a/frame/2/hemv/bli_hemv_unb_var2.c +++ b/frame/2/hemv/bli_hemv_unb_var2.c @@ -123,7 +123,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unb_var3.c b/frame/2/hemv/bli_hemv_unb_var3.c index 6573e59fc..d592251d5 100644 --- a/frame/2/hemv/bli_hemv_unb_var3.c +++ b/frame/2/hemv/bli_hemv_unb_var3.c @@ -122,8 +122,8 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointers. 
*/ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unb_var4.c b/frame/2/hemv/bli_hemv_unb_var4.c index deabc3ab4..10cf953b6 100644 --- a/frame/2/hemv/bli_hemv_unb_var4.c +++ b/frame/2/hemv/bli_hemv_unb_var4.c @@ -122,7 +122,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointers. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unf_var1.c b/frame/2/hemv/bli_hemv_unf_var1.c index d36dc0098..a449909a5 100644 --- a/frame/2/hemv/bli_hemv_unf_var1.c +++ b/frame/2/hemv/bli_hemv_unf_var1.c @@ -130,7 +130,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ + kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \ \ for ( i = 0; i < m; i += f ) \ diff --git a/frame/2/hemv/bli_hemv_unf_var1a.c b/frame/2/hemv/bli_hemv_unf_var1a.c index 31ab1515f..d0af57393 100644 --- a/frame/2/hemv/bli_hemv_unf_var1a.c +++ b/frame/2/hemv/bli_hemv_unf_var1a.c @@ -121,7 +121,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \ + kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unf_var3.c b/frame/2/hemv/bli_hemv_unf_var3.c index d8db9bc78..baaff098d 100644 --- a/frame/2/hemv/bli_hemv_unf_var3.c +++ b/frame/2/hemv/bli_hemv_unf_var3.c @@ -130,7 +130,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ + kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \ \ for ( i = 0; i < m; i += f ) \ diff --git a/frame/2/hemv/bli_hemv_unf_var3a.c b/frame/2/hemv/bli_hemv_unf_var3a.c index 54ab0f6ce..55c1929ff 100644 --- a/frame/2/hemv/bli_hemv_unf_var3a.c +++ b/frame/2/hemv/bli_hemv_unf_var3a.c @@ -121,7 +121,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \ + kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her/bli_her_unb_var1.c b/frame/2/her/bli_her_unb_var1.c index e7f718680..8cd6bd397 100644 --- a/frame/2/her/bli_her_unb_var1.c +++ b/frame/2/her/bli_her_unb_var1.c @@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her/bli_her_unb_var2.c b/frame/2/her/bli_her_unb_var2.c index 4b39e1df0..f68798dce 100644 --- a/frame/2/her/bli_her_unb_var2.c +++ b/frame/2/her/bli_her_unb_var2.c @@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unb_var1.c b/frame/2/her2/bli_her2_unb_var1.c index 37423bfcb..b5c182639 100644 --- a/frame/2/her2/bli_her2_unb_var1.c +++ b/frame/2/her2/bli_her2_unb_var1.c @@ -106,7 +106,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unb_var2.c b/frame/2/her2/bli_her2_unb_var2.c index 22d6de07a..602e922a8 100644 --- a/frame/2/her2/bli_her2_unb_var2.c +++ b/frame/2/her2/bli_her2_unb_var2.c @@ -113,7 +113,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unb_var3.c b/frame/2/her2/bli_her2_unb_var3.c index 297b9b702..1d5872d5d 100644 --- a/frame/2/her2/bli_her2_unb_var3.c +++ b/frame/2/her2/bli_her2_unb_var3.c @@ -113,7 +113,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unb_var4.c b/frame/2/her2/bli_her2_unb_var4.c index 58adb0e70..922fe7db7 100644 --- a/frame/2/her2/bli_her2_unb_var4.c +++ b/frame/2/her2/bli_her2_unb_var4.c @@ -114,7 +114,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unf_var1.c b/frame/2/her2/bli_her2_unf_var1.c index a0aec48f7..3824880c6 100644 --- a/frame/2/her2/bli_her2_unf_var1.c +++ b/frame/2/her2/bli_her2_unf_var1.c @@ -106,7 +106,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpy2v_ker_ft) kfp_2v; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \ + kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unf_var4.c b/frame/2/her2/bli_her2_unf_var4.c index 3dea31d53..6b2b0e9ac 100644 --- a/frame/2/her2/bli_her2_unf_var4.c +++ b/frame/2/her2/bli_her2_unf_var4.c @@ -114,7 +114,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpy2v_ker_ft) kfp_2v; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \ + kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/trmv/bli_trmv_unb_var1.c b/frame/2/trmv/bli_trmv_unb_var1.c index 31bfa6a83..367a34e6c 100644 --- a/frame/2/trmv/bli_trmv_unb_var1.c +++ b/frame/2/trmv/bli_trmv_unb_var1.c @@ -83,7 +83,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ if ( bli_is_upper( uploa_trans ) ) \ diff --git a/frame/2/trmv/bli_trmv_unb_var2.c b/frame/2/trmv/bli_trmv_unb_var2.c index 00d4d95f3..fa21776b3 100644 --- a/frame/2/trmv/bli_trmv_unb_var2.c +++ b/frame/2/trmv/bli_trmv_unb_var2.c @@ -83,7 +83,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ if ( bli_is_upper( uploa_trans ) ) \ diff --git a/frame/2/trmv/bli_trmv_unf_var1.c b/frame/2/trmv/bli_trmv_unf_var1.c index 6dc3cea36..9e576fc77 100644 --- a/frame/2/trmv/bli_trmv_unf_var1.c +++ b/frame/2/trmv/bli_trmv_unf_var1.c @@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxf_ker_ft) kfp_df; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ diff --git a/frame/2/trmv/bli_trmv_unf_var2.c b/frame/2/trmv/bli_trmv_unf_var2.c index 8bbd51820..052595935 100644 --- a/frame/2/trmv/bli_trmv_unf_var2.c +++ b/frame/2/trmv/bli_trmv_unf_var2.c @@ -90,7 +90,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ /* Query the context for the kernel function pointer and fusing factor. 
*/ \ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ diff --git a/frame/2/trsv/bli_trsv_unb_var1.c b/frame/2/trsv/bli_trsv_unb_var1.c index c7493e33d..2f24b10a8 100644 --- a/frame/2/trsv/bli_trsv_unb_var1.c +++ b/frame/2/trsv/bli_trsv_unb_var1.c @@ -94,7 +94,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotv_ker_ft) kfp_tv; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_tv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \ + kfp_tv = bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ if ( bli_is_upper( uploa_trans ) ) \ diff --git a/frame/2/trsv/bli_trsv_unb_var2.c b/frame/2/trsv/bli_trsv_unb_var2.c index a78e7eef0..1a8e81634 100644 --- a/frame/2/trsv/bli_trsv_unb_var2.c +++ b/frame/2/trsv/bli_trsv_unb_var2.c @@ -94,7 +94,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ if ( bli_is_upper( uploa_trans ) ) \ diff --git a/frame/2/trsv/bli_trsv_unf_var1.c b/frame/2/trsv/bli_trsv_unf_var1.c index 3b03b43e5..824f26d15 100644 --- a/frame/2/trsv/bli_trsv_unf_var1.c +++ b/frame/2/trsv/bli_trsv_unf_var1.c @@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxf_ker_ft) kfp_df; \ \ /* Query the context for the kernel function pointer and fusing factor. 
*/ \ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ diff --git a/frame/2/trsv/bli_trsv_unf_var2.c b/frame/2/trsv/bli_trsv_unf_var2.c index 10741d291..bd1f8e3b0 100644 --- a/frame/2/trsv/bli_trsv_unf_var2.c +++ b/frame/2/trsv/bli_trsv_unf_var2.c @@ -102,7 +102,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ diff --git a/frame/3/bli_l3_schema.c b/frame/3/bli_l3_schema.c index bde30c527..1d4608799 100644 --- a/frame/3/bli_l3_schema.c +++ b/frame/3/bli_l3_schema.c @@ -57,7 +57,7 @@ void bli_l3_set_schemas // projection of dt to query the preference of the corresponding native // real-domain microkernel. This is what ultimately determines which // variant of 1m is applicable. - if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ) ) { schema_a = BLIS_PACKED_ROW_PANELS_1E; schema_b = BLIS_PACKED_COL_PANELS_1R; diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c index 72ec405ab..7e37e1f22 100644 --- a/frame/3/bli_l3_sup.c +++ b/frame/3/bli_l3_sup.c @@ -63,7 +63,7 @@ err_t bli_gemmsup // Return early if a microkernel preference-induced transposition would // have been performed and shifted the dimensions outside of the space // of sup-handled problems. 
- if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( c, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( c, BLIS_GEMM_VIR_UKR, cntx ) ) { const num_t dt = bli_obj_dt( c ); const dim_t m = bli_obj_length( c ); diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c index e54e01d7c..3da3954fa 100644 --- a/frame/3/bli_l3_sup_int.c +++ b/frame/3/bli_l3_sup_int.c @@ -85,7 +85,7 @@ err_t bli_gemmsup_int const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; const num_t dt = bli_obj_dt( c ); - const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); + const bool row_pref = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( stor_id ), cntx ); const bool is_primary = ( row_pref ? is_rrr_rrc_rcr_crr : is_rcc_crc_ccr_ccc ); @@ -259,7 +259,7 @@ err_t bli_gemmtsup_int const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; const num_t dt = bli_obj_dt( c ); - const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); + const bool row_pref = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( stor_id ), cntx ); const bool is_primary = ( row_pref ? is_rrr_rrc_rcr_crr : is_rcc_crc_ccr_ccc ); diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index 85fb246f0..519dc5ccd 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -122,6 +122,14 @@ void PASTEMAC(ch,varname) \ ldc = cs_c; \ ldp = cs_p; \ } \ +\ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \ + : BLIS_PACKM_MRXK_KER; \ +\ + /* Query the context for the packm kernel corresponding to the current + panel dimension, or kernel id. */ \ + PASTECH2(ch,packm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* Compute the total number of iterations we'll need. */ \ n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ @@ -171,12 +179,11 @@ void PASTEMAC(ch,varname) \ or round-robin partitioning was requested at configure-time.
*/ \ if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ - PASTEMAC(ch,packm_cxk) \ + f \ ( \ conjc, \ schema, \ panel_dim_i, \ - panel_dim_max, \ panel_len_i, \ panel_len_max_i, \ kappa_cast, \ diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h index 7c315192d..ead9925e6 100644 --- a/frame/3/bli_l3_sup_vars.h +++ b/frame/3/bli_l3_sup_vars.h @@ -127,7 +127,7 @@ BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases cntx_t* cntx ) { - const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); + const bool row_pref = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( *eff_id ), cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 4ff45036f..cd8827bd9 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -99,7 +99,7 @@ void bli_gemm_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 6de361194..874a12439 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -201,7 +201,7 @@ void bli_gemm_ker_var2 // column-stored as well. char ct[ BLIS_STACK_BUF_MAX_SIZE ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); const inc_t rs_ct = ( col_pref ? 1 : NR ); const inc_t cs_ct = ( col_pref ? 
MR : 1 ); char* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c index e257cdf28..6202cfffd 100644 --- a/frame/3/gemm/bli_gemm_md.c +++ b/frame/3/gemm/bli_gemm_md.c @@ -173,7 +173,7 @@ mddm_t bli_gemm_md_ccr // preference. const num_t dt = BLIS_REAL | bli_obj_comp_prec( c ); const bool row_pref - = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, *cntx ); + = bli_cntx_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, *cntx ); // We can only perform this case of mixed-domain gemm, C += A*B where // B is real, if the microkernel prefers column output. If it prefers @@ -236,8 +236,8 @@ mddm_t bli_gemm_md_ccr // Use the default pack schemas in the objects. - // static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) - func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx ); + // static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) + func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx ); // Rather than check which complex datatype dt_comp refers to, we set // the mixed-domain virtual microkernel for both types. @@ -278,7 +278,7 @@ mddm_t bli_gemm_md_crc // preference. const num_t dt = BLIS_REAL | bli_obj_comp_prec( c ); const bool col_pref - = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, *cntx ); + = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, *cntx ); // We can only perform this case of mixed-domain gemm, C += A*B where // A is real, if the microkernel prefers row output. If it prefers @@ -341,8 +341,8 @@ mddm_t bli_gemm_md_crc // Use the default pack schemas in the objects. 
- // static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) - func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx ); + // static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) + func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx ); // Rather than check which complex datatype dt_comp refers to, we set // the mixed-domain virtual microkernel for both types. @@ -430,13 +430,11 @@ mddm_t bli_gemm_md_rcc const num_t dt_complex = bli_obj_dt( a ); cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_complex ); - func_t* cntx_funcs = bli_cntx_packm_kers_buf( *cntx ); - func_t* cntx_1m_funcs = bli_cntx_packm_kers_buf( cntx_1m ); + func_t* cntx_funcs = bli_cntx_ukrs_buf( *cntx ); + func_t* cntx_1m_funcs = bli_cntx_ukrs_buf( cntx_1m ); - for ( dim_t i = 0; i <= BLIS_PACKM_31XK_KER; ++i ) - { - cntx_funcs[ i ] = cntx_1m_funcs[ i ]; - } + cntx_funcs[ BLIS_PACKM_MRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_MRXK_KER ]; + cntx_funcs[ BLIS_PACKM_NRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_NRXK_KER ]; // Return the computation and execution domains. 
return doms; diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c index bbd9190a9..a4797ad4f 100644 --- a/frame/3/gemm/bli_gemm_md_c2r_ref.c +++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c @@ -57,8 +57,8 @@ void PASTEMAC2(ch,opname,suf) \ const num_t dt_r = PASTEMAC(chr,type); \ \ PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ const bool row_pref = !col_pref; \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2.c b/frame/3/gemm/other/bli_gemm_ker_var2.c index 62d2a9e04..c5cf935b8 100644 --- a/frame/3/gemm/other/bli_gemm_ker_var2.c +++ b/frame/3/gemm/other/bli_gemm_ker_var2.c @@ -198,7 +198,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2rr.c b/frame/3/gemm/other/bli_gemm_ker_var2rr.c index 289e4ddf5..946e3048c 100644 --- a/frame/3/gemm/other/bli_gemm_ker_var2rr.c +++ b/frame/3/gemm/other/bli_gemm_ker_var2rr.c @@ -199,7 +199,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 
1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2sl.c b/frame/3/gemm/other/bli_gemm_ker_var2sl.c index d75838fb4..f5159bbb9 100644 --- a/frame/3/gemm/other/bli_gemm_ker_var2sl.c +++ b/frame/3/gemm/other/bli_gemm_ker_var2sl.c @@ -199,7 +199,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index 2a9d91759..d53838470 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -86,7 +86,7 @@ void bli_gemmt_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c index fea4efec0..3aedc6e9a 100644 --- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c index 4b849bbc6..b3a9fe8a1 100644 --- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c index 0bf4b1a0f..ece351ef7 100644 --- a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c @@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c index 1655bea55..f00e769b5 100644 --- a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c @@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 9835de9c1..15460125d 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -117,7 +117,7 @@ void bli_hemm_front // micro-kernel to access elements of C in its preferred manner. //if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT // be enabled. See issue #342 comments. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_toggle_conj( &a_local ); diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index be94c44c1..8108b607f 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -117,7 +117,7 @@ void bli_symm_front // micro-kernel to access elements of C in its preferred manner. //if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT // be enabled. See issue #342 comments. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &b_local ); diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 1de28958e..d973b6eb6 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -135,7 +135,7 @@ void bli_trmm_front // of row- vs. column storage breaks down. //if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT // be enabled. See issue #342 comments. 
- if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c index 9ab64e470..706e14d43 100644 --- a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c @@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c index 6fef4e0c9..699892635 100644 --- a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ @@ -337,7 +337,7 @@ void PASTEMAC(ch,varname) \ dim_t jr_inc; \ \ /* Use round-robin assignment of micropanels to threads in the 2nd loop for - the initial rectangular region of C (if it exists). + the initial rectangular region of C (if it exists). NOTE: Parallelism in the 1st loop is disabled for now. 
*/ \ bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c index e0d9cc75f..eb5577593 100644 --- a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c index 0abcfd77a..738711f58 100644 --- a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c @@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c index 8c505f88a..df53b2011 100644 --- a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c index 3bb0deaa3..fbcd4f9aa 100644 --- a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c index 672caaa05..7775d9217 100644 --- a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c @@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c index 9d9e3809c..c1354a962 100644 --- a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c index 8bac0ec4a..7cf8eeef0 100644 --- a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c index fc2991b13..1d0f31708 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c @@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c index 00a0dc3f0..d8ae4f8bb 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c index 889fa49fa..c05a082d4 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 3b9753960..9cd04963b 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -127,7 +127,7 @@ void bli_trmm3_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index f50f739e7..7b1133c2a 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -180,7 +180,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 4f3514143..2059d1c9f 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -180,7 +180,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ */ \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index b4937134f..cace3622a 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -185,7 +185,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 09942d311..4b0c7f083 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -185,7 +185,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c index dc57eac5f..26da1b004 100644 --- a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c @@ -179,7 +179,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c index 38768242e..607b40e54 100644 --- a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c @@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c index 78ffe1758..3299b5f8e 100644 --- a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c @@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c index 7c4cea976..b02ff0955 100644 --- a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c @@ -179,7 +179,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c index 8d050c62b..e78cef477 100644 --- a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c @@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c index b49a1144e..93cac371a 100644 --- a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c @@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c index a11936389..1e903c3c1 100644 --- a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c @@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c index 7ad1e4271..a44d64f45 100644 --- a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c @@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 3a698871b..218325d5a 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -43,253 +43,76 @@ void bli_cntx_clear( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) +void bli_cntx_set_blkszs( cntx_t* cntx, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default blocksizes. It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // blocksizes across all datatypes. + // bli_cntx_init_<subconfig>_ref() so that the context begins with + // default blocksizes across all datatypes. /* Example prototypes: void bli_cntx_set_blkszs ( - ind_t method = BLIS_NAT, - dim_t n_bs, + cntx_t* cntx, bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, - ...
- cntx_t* cntx - ); - - void bli_cntx_set_blkszs - ( - ind_t method != BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, dim_t def_scalr0, dim_t max_scalr0, - bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, dim_t def_scalr1, dim_t max_scalr1, - bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, dim_t def_scalr2, dim_t max_scalr2, - ... - cntx_t* cntx + ..., + BLIS_VA_END ); */ - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bszid_t* bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - double* msclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_bs ); - - // Handle native and induced method cases separately. - if ( method == BLIS_NAT ) - { - // Process n_bs tuples. - for ( i = 0; i < n_bs; ++i ) - { - // Here, we query the variable argument list for: - // - the bszid_t of the blocksize we're about to process, - // - the address of the blksz_t object, - // - the bszid_t of the multiple we need to associate with - // the blksz_t object. - bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); - blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); - bszid_t bm_id = ( bszid_t )va_arg( args, bszid_t ); - - // Store the values in our temporary arrays. 
- bszids[ i ] = bs_id; - blkszs[ i ] = blksz; - bmults[ i ] = bm_id; - } - } - else // if induced method execution was indicated - { - // Process n_bs tuples. - for ( i = 0; i < n_bs; ++i ) - { - // Here, we query the variable argument list for: - // - the bszid_t of the blocksize we're about to process, - // - the address of the blksz_t object, - // - the bszid_t of the multiple we need to associate with - // the blksz_t object, - // - the scalars we wish to apply to the real blocksizes to - // come up with the induced complex blocksizes (for default - // and maximum blocksizes). - bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); - blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); - bszid_t bm_id = ( bszid_t )va_arg( args, bszid_t ); - double dsclr = ( double )va_arg( args, double ); - double msclr = ( double )va_arg( args, double ); - - // Store the values in our temporary arrays. - bszids[ i ] = bs_id; - blkszs[ i ] = blksz; - bmults[ i ] = bm_id; - dsclrs[ i ] = dsclr; - msclrs[ i ] = msclr; - } - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - // Save the execution type into the context. - bli_cntx_set_method( method, cntx ); + bli_cntx_set_method( BLIS_NAT, cntx ); // Query the context for the addresses of: // - the blocksize object array // - the blocksize multiple array - blksz_t* cntx_blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* cntx_bmults = bli_cntx_bmults_buf( cntx ); - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. Notice that the blksz_t* pointers were saved, rather than - // the objects themselves, but we copy the contents of the objects - // when copying into the context. + // Initialize variable argument environment. 
+ va_list args; + va_start( args, cntx ); - // Handle native and induced method cases separately. - if ( method == BLIS_NAT ) + // Process blocksizes until we get a BLIS_VA_END. + while ( true ) { - // Process each blocksize id tuple provided. - for ( i = 0; i < n_bs; ++i ) - { - // Read the current blocksize id, blksz_t* pointer, blocksize - // multiple id, and blocksize scalar. - bszid_t bs_id = bszids[ i ]; - bszid_t bm_id = bmults[ i ]; + int bs_id0 = va_arg( args, int ); - blksz_t* blksz = blkszs[ i ]; + // If we find a bszid_t id of BLIS_VA_END, then we are done. + if ( bs_id0 == BLIS_VA_END ) break; - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + // Here, we query the variable argument list for: + // - the bszid_t of the blocksize we're about to process (already done), + // - the address of the blksz_t object, + // - the bszid_t of the multiple we need to associate with + // the blksz_t object. + bszid_t bs_id = ( bszid_t )bs_id0; + blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); + bszid_t bm_id = ( bszid_t )va_arg( args, bszid_t ); - // Copy the blksz_t object contents into the appropriate - // location within the context's blksz_t array. Do the same - // for the blocksize multiple id. - //cntx_blkszs[ bs_id ] = *blksz; - //bli_blksz_copy( blksz, cntx_blksz ); - bli_blksz_copy_if_pos( blksz, cntx_blksz ); + // Copy the blksz_t object contents into the appropriate + // location within the context's blksz_t array. Do the same + // for the blocksize multiple id. + //cntx_blkszs[ bs_id ] = *blksz; + //bli_blksz_copy( blksz, cntx_blksz ); + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + bli_blksz_copy_if_pos( blksz, cntx_blksz ); - // Copy the blocksize multiple id into the context. - cntx_bmults[ bs_id ] = bm_id; - } - } - else - { - // Process each blocksize id tuple provided. - for ( i = 0; i < n_bs; ++i ) - { - // Read the current blocksize id, blksz_t pointer, blocksize - // multiple id, and blocksize scalar. 
- bszid_t bs_id = bszids[ i ]; - bszid_t bm_id = bmults[ i ]; - double dsclr = dsclrs[ i ]; - double msclr = msclrs[ i ]; - - blksz_t* blksz = blkszs[ i ]; - - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; - - // Copy the real domain values of the source blksz_t object into - // the context, duplicating into the complex domain fields. - bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_FLOAT, cntx_blksz ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DOUBLE, cntx_blksz ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz ); - - // If the default blocksize scalar is non-unit, we need to scale - // the complex domain default blocksizes. - if ( dsclr != 1.0 ) - { - // Scale the complex domain default blocksize values in the - // blocksize object. - bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz ); - } - - // Similarly, if the maximum blocksize scalar is non-unit, we need - // to scale the complex domain maximum blocksizes. - if ( msclr != 1.0 ) - { - // Scale the complex domain maximum blocksize values in the - // blocksize object. - bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz ); - } - - // Copy the blocksize multiple id into the context. - cntx_bmults[ bs_id ] = bm_id; - } + // Copy the blocksize multiple id into the context. + cntx_bmults[ bs_id ] = bm_id; } - // Free the temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( blkszs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( bszids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( bmults ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( dsclrs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( msclrs ); + // Shutdown variable argument environment and clean up stack. + va_end( args ); } // ----------------------------------------------------------------------------- -void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ) +void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... ) { /* Example prototypes: @@ -297,1269 +120,268 @@ void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ) ( ind_t method != BLIS_NAT, num_t dt, - dim_t n_bs, + cntx_t* cntx, bszid_t bs0_id, dim_t def_scalr0, dim_t max_scalr0, bszid_t bs1_id, dim_t def_scalr1, dim_t max_scalr1, bszid_t bs2_id, dim_t def_scalr2, dim_t max_scalr2, - ... - cntx_t* cntx + ..., + BLIS_VA_END ); - + NOTE: This function modifies an existing context that is presumed to have been initialized for native execution. */ - va_list args; - dim_t i; - err_t r_val; - // Project the given datatype to the real domain. This will be used later on. num_t dt_real = bli_dt_proj_to_real( dt ); // Return early if called with BLIS_NAT. if ( method == BLIS_NAT ) return; - // Allocate some temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - double* msclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_bs ); - - { - // Process n_bs tuples. - for ( i = 0; i < n_bs; ++i ) - { - // Here, we query the variable argument list for: - // - the bszid_t of the blocksize we're about to process, - // - the scalars we wish to apply to the real blocksizes to - // come up with the induced complex blocksizes (for default - // and maximum blocksizes). - bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); - double dsclr = ( double )va_arg( args, double ); - double msclr = ( double )va_arg( args, double ); - - // Store the values in our temporary arrays. - bszids[ i ] = bs_id; - dsclrs[ i ] = dsclr; - msclrs[ i ] = msclr; - } - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - // Save the execution type into the context. bli_cntx_set_method( method, cntx ); - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. + // Initialize variable argument environment. + va_list args; + va_start( args, cntx ); + // Process blocksizes until we get a BLIS_VA_END. + while ( true ) { - // Process each blocksize id tuple provided. 
- for ( i = 0; i < n_bs; ++i ) + int bs_id0 = va_arg( args, int ); + + // If we find a bszid_t id of BLIS_VA_END, then we are done. + if ( bs_id0 == BLIS_VA_END ) break; + + // Here, we query the variable argument list for: + // - the bszid_t of the blocksize we're about to process (already done), + // - the scalars we wish to apply to the real blocksizes to + // come up with the induced complex blocksizes (for default + // and maximum blocksizes). + bszid_t bs_id = ( bszid_t )bs_id0; + double dsclr = ( double )va_arg( args, double ); + double msclr = ( double )va_arg( args, double ); + + // Query the context for the blksz_t object associated with the + // current blocksize id, and also query the object corresponding + // to the blocksize multiple. + blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx ); + + // Copy the real domain value of the blksz_t object into the + // corresponding complex domain slot of the same object. + bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_blksz ); + + // If the default blocksize scalar is non-unit, we need to scale + // the complex domain default blocksizes. + if ( dsclr != 1.0 ) { - // Read the current blocksize id, blocksize multiple id, - // and blocksize scalar. - bszid_t bs_id = bszids[ i ]; - double dsclr = dsclrs[ i ]; - double msclr = msclrs[ i ]; + // Scale the default blocksize value corresponding to the given + // datatype. + bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_blksz ); + } - //blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; - - // Query the context for the blksz_t object assoicated with the - // current blocksize id, and also query the object corresponding - // to the blocksize multiple. - blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx ); - - // Copy the real domain value of the blksz_t object into the - // corresponding complex domain slot of the same object.
- bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_blksz ); - - // If the default blocksize scalar is non-unit, we need to scale - // the complex domain default blocksizes. - if ( dsclr != 1.0 ) - { - // Scale the default blocksize value corresponding to the given - // datatype. - bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_blksz ); - } - - // Similarly, if the maximum blocksize scalar is non-unit, we need - // to scale the complex domain maximum blocksizes. - if ( msclr != 1.0 ) - { - // Scale the maximum blocksize value corresponding to the given - // datatype. - bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_blksz ); - } + // Similarly, if the maximum blocksize scalar is non-unit, we need + // to scale the complex domain maximum blocksizes. + if ( msclr != 1.0 ) + { + // Scale the maximum blocksize value corresponding to the given + // datatype. + bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_blksz ); } } - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - bli_free_intl( bszids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - bli_free_intl( dsclrs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - bli_free_intl( msclrs ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default level-3 microkernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // microkernels across all datatypes. 
- - /* Example prototypes: - - void bli_cntx_set_l3_nat_ukrs - ( - dim_t n_ukrs, - l3ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp, bool pref0, - l3ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp, bool pref1, - l3ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp, bool pref2, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_ukrs ); - - // Process n_ukrs tuples. - for ( i = 0; i < n_ukrs; ++i ) - { - // Here, we query the variable argument list for: - // - the l3ukr_t of the kernel we're about to process, - // - the datatype of the kernel, - // - the kernel function pointer, and - // - the kernel function storage preference - // that we need to store to the context. - - // NOTE: Though bool_t is no longer used, the following comment is - // being kept for historical reasons. - // The type that we pass into the va_arg() macro for the ukr - // preference matters. Using 'bool_t' may cause breakage on 64-bit - // systems that define int as 32 bits and long int and pointers as - // 64 bits. The problem is that TRUE or FALSE are defined as 1 and - // 0, respectively, and when "passed" into the variadic function - // they come with no contextual typecast. 
Thus, default rules of - // argument promotion kick in to treat these integer literals as - // being of type int. Thus, we need to let va_arg() treat the TRUE - // or FALSE value as an int, even if we cast it to and store it - // within a bool_t afterwards. - const l3ukr_t ukr_id = ( l3ukr_t )va_arg( args, l3ukr_t ); - const num_t ukr_dt = ( num_t )va_arg( args, num_t ); - void_fp ukr_fp = ( void_fp )va_arg( args, void_fp ); - const bool ukr_pref = ( bool )va_arg( args, int ); - - // Store the values in our temporary arrays. - ukr_ids[ i ] = ukr_id; - ukr_dts[ i ] = ukr_dt; - ukr_fps[ i ] = ukr_fp; - ukr_prefs[ i ] = ukr_pref; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - // Shutdown variable argument environment and clean up stack. va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the l3 virtual ukernel func_t array - // - the l3 native ukernel func_t array - // - the l3 native ukernel preferences array - func_t* cntx_l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); - func_t* cntx_l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); - mbool_t* cntx_l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_ukrs; ++i ) - { - // Read the current ukernel id, ukernel datatype, ukernel function - // pointer, and ukernel preference. - const l3ukr_t ukr_id = ukr_ids[ i ]; - const num_t ukr_dt = ukr_dts[ i ]; - void_fp ukr_fp = ukr_fps[ i ]; - const bool ukr_pref = ukr_prefs[ i ]; - - // Index into the func_t and mbool_t for the current kernel id - // being processed. 
- func_t* vukrs = &cntx_l3_vir_ukrs[ ukr_id ]; - func_t* ukrs = &cntx_l3_nat_ukrs[ ukr_id ]; - mbool_t* prefs = &cntx_l3_nat_ukrs_prefs[ ukr_id ]; - - // Store the ukernel function pointer and preference values into - // the context. Notice that we redundantly store the native - // ukernel address in both the native and virtual ukernel slots - // in the context. This is standard practice when creating a - // native context. (Induced method contexts will overwrite the - // virtual function pointer with the address of the appropriate - // virtual ukernel.) - bli_func_set_dt( ukr_fp, ukr_dt, vukrs ); - bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); - bli_mbool_set_dt( ukr_pref, ukr_dt, prefs ); - } - - // Free the temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bli_free_intl( ukr_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bli_free_intl( ukr_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bli_free_intl( ukr_fps ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bli_free_intl( ukr_prefs ); } // ----------------------------------------------------------------------------- -void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ) +void bli_cntx_set_ukrs( cntx_t* cntx , ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use - // non-default level-3 virtual microkernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // microkernels across all datatypes. + // non-default microkernels. It should be called after + // bli_cntx_init_<subconfig>_ref() so that the context begins with + // default microkernels across all datatypes.
/* Example prototypes: - void bli_cntx_set_l3_vir_ukrs + void bli_cntx_set_ukrs ( - dim_t n_ukrs, - l3ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp, - l3ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp, - l3ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp, - ... - cntx_t* cntx + cntx_t* cntx, + ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp, + ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp, + ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp, + ..., + BLIS_VA_END ); */ - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val ); - - // -- Begin variable argument section -- + // Query the context for the address of the ukernel func_t array + func_t* cntx_ukrs = bli_cntx_ukrs_buf( cntx ); // Initialize variable argument environment. - va_start( args, n_ukrs ); + va_list args; + va_start( args, cntx ); - // Process n_ukrs tuples. - for ( i = 0; i < n_ukrs; ++i ) + // Process ukernels until BLIS_VA_END is reached. + while ( true ) { + const int ukr_id0 = va_arg( args, int ); + + // If we find a ukernel id of BLIS_VA_END, then we are done. + if ( ukr_id0 == BLIS_VA_END ) break; + // Here, we query the variable argument list for: - // - the l3ukr_t of the kernel we're about to process, + // - the ukr_t of the kernel we're about to process (already done), // - the datatype of the kernel, and - // - the kernel function pointer. - // that we need to store to the context. 
- const l3ukr_t ukr_id = ( l3ukr_t )va_arg( args, l3ukr_t ); - const num_t ukr_dt = ( num_t )va_arg( args, num_t ); - void_fp ukr_fp = ( void_fp )va_arg( args, void_fp ); - - // Store the values in our temporary arrays. - ukr_ids[ i ] = ukr_id; - ukr_dts[ i ] = ukr_dt; - ukr_fps[ i ] = ukr_fp; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the l3 virtual ukernel func_t array - func_t* cntx_l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_ukrs; ++i ) - { - // Read the current ukernel id, ukernel datatype, ukernel function - // pointer, and ukernel preference. - const l3ukr_t ukr_id = ukr_ids[ i ]; - const num_t ukr_dt = ukr_dts[ i ]; - void_fp ukr_fp = ukr_fps[ i ]; + // - the kernel function pointer + const ukr_t ukr_id = ( ukr_t )ukr_id0; + const num_t ukr_dt = ( num_t )va_arg( args, num_t ); + void_fp ukr_fp = ( void_fp )va_arg( args, void_fp ); // Index into the func_t and mbool_t for the current kernel id // being processed. - func_t* vukrs = &cntx_l3_vir_ukrs[ ukr_id ]; + func_t* ukrs = &cntx_ukrs[ ukr_id ]; - // Store the ukernel function pointer and preference values into - // the context. Notice that we redundantly store the native + // Store the ukernel function pointer into the context. + // Notice that we redundantly store the native // ukernel address in both the native and virtual ukernel slots // in the context. This is standard practice when creating a // native context. 
(Induced method contexts will overwrite the // virtual function pointer with the address of the appropriate // virtual ukernel.) - bli_func_set_dt( ukr_fp, ukr_dt, vukrs ); + bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); + + // Locate the virtual ukernel func_t pointer that corresponds to the + // ukernel id provided by the caller. + switch ( ukr_id ) + { + case BLIS_GEMM_UKR: ukrs = &cntx_ukrs[ BLIS_GEMM_VIR_UKR ]; break; + case BLIS_GEMMTRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_L_VIR_UKR ]; break; + case BLIS_GEMMTRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_U_VIR_UKR ]; break; + case BLIS_TRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_L_VIR_UKR ]; break; + case BLIS_TRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_U_VIR_UKR ]; break; + default: ukrs = NULL; break; + }; + + if ( ukrs ) + bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); } - // Free the temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - bli_free_intl( ukr_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - bli_free_intl( ukr_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - bli_free_intl( ukr_fps ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default thresholds for small/unpacked matrix handling. It should - // be called after bli_cntx_init_defaults() so that the context begins - // with default thresholds. - - /* Example prototypes: - - void bli_cntx_set_l3_sup_thresh - ( - dim_t n_thresh, - threshid_t th0_id, blksz_t* blksz0, - threshid_t th1_id, blksz_t* blksz1, - ... - cntx_t* cntx - ); - - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_thresh(): " ); - #endif - threshid_t* threshids = bli_malloc_intl( n_thresh * sizeof( threshid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_thresh(): " ); - #endif - blksz_t** threshs = bli_malloc_intl( n_thresh * sizeof( blksz_t* ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_thresh ); - - // Process n_thresh tuples. - for ( i = 0; i < n_thresh; ++i ) - { - // Here, we query the variable argument list for: - // - the threshid_t of the threshold we're about to process, - // - the address of the blksz_t object, - threshid_t th_id = ( threshid_t )va_arg( args, threshid_t ); - blksz_t* thresh = ( blksz_t* )va_arg( args, blksz_t* ); - - // Store the values in our temporary arrays. - threshids[ i ] = th_id; - threshs[ i ] = thresh; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - // Shutdown variable argument environment and clean up stack. va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the threshold array - blksz_t* cntx_threshs = bli_cntx_l3_sup_thresh_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. Notice that the blksz_t* pointers were saved, rather than - // the objects themselves, but we copy the contents of the objects - // when copying into the context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_thresh; ++i ) - { - // Read the current blocksize id, blksz_t* pointer, blocksize - // multiple id, and blocksize scalar. 
- threshid_t th_id = threshids[ i ]; - blksz_t* thresh = threshs[ i ]; - - blksz_t* cntx_thresh = &cntx_threshs[ th_id ]; - - // Copy the blksz_t object contents into the appropriate - // location within the context's blksz_t array. - //cntx_threshs[ th_id ] = *thresh; - //bli_blksz_copy( thresh, cntx_thresh ); - bli_blksz_copy_if_pos( thresh, cntx_thresh ); - } - - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_thresh(): " ); - #endif - bli_free_intl( threshs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_thresh(): " ); - #endif - bli_free_intl( threshids ); } // ----------------------------------------------------------------------------- -void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ) +void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... ) +{ + // This function can be called from the bli_cntx_init_*() function for + // a particular architecture if the kernel developer wishes to use + // non-default microkernel preferences. It should be called after + // bli_cntx_init_*_ref() so that the context begins with + // default preferences across all datatypes. + + /* Example prototypes: + + void bli_cntx_set_ukr_prefs + ( + cntx_t* cntx, + ukr_pref_t ukr_pref0_id, num_t dt0, bool ukr_pref0, + ukr_pref_t ukr_pref1_id, num_t dt1, bool ukr_pref1, + ukr_pref_t ukr_pref2_id, num_t dt2, bool ukr_pref2, + ..., + BLIS_VA_END + ); + */ + + // Query the context for the address of the ukernel preference mbool_t array + mbool_t* cntx_ukr_prefs = bli_cntx_ukr_prefs_buf( cntx ); + + // Initialize variable argument environment. + va_list args; + va_start( args, cntx ); + + // Process (id, datatype, preference) tuples until BLIS_VA_END is reached. + while ( true ) + { + const int ukr_pref_id0 = va_arg( args, int ); + + // If we find a ukernel pref id of BLIS_VA_END, then we are done.
+ if ( ukr_pref_id0 == BLIS_VA_END ) break; + + // Here, we query the variable argument list for: + // - the ukr_pref_t id of the preference being set (already done), + // - the datatype the preference applies to (kept as a num_t), and + // - the preference value (a bool, which arrives promoted to int). + const ukr_pref_t ukr_pref_id = ( ukr_pref_t )ukr_pref_id0; + const num_t ukr_pref_dt = ( num_t )va_arg( args, num_t ); + const bool ukr_pref = ( bool )va_arg( args, int ); + + // Index into the mbool_t array for the preference id currently + // being processed. + mbool_t* ukr_prefs = &cntx_ukr_prefs[ ukr_pref_id ]; + + // Store the ukernel preference value into the context. + bli_mbool_set_dt( ukr_pref, ukr_pref_dt, ukr_prefs ); + } + + // Shutdown variable argument environment and clean up stack. + va_end( args ); +} + +// ----------------------------------------------------------------------------- + +void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default level-3 operation handler for small/unpacked matrices. It - // should be called after bli_cntx_init_defaults() so that the context - // begins with default sup handlers across all datatypes. + // should be called after bli_cntx_init_*_ref() so that the + // context begins with default sup handlers across all datatypes. /* Example prototypes: void bli_cntx_set_l3_sup_handlers ( - dim_t n_ops, - opid_t op0_id, void* handler0_fp, - opid_t op1_id, void* handler1_fp, - opid_t op2_id, void* handler2_fp, - ... cntx_t* cntx + opid_t op0_id, void_fp handler0_fp, + opid_t op1_id, void_fp handler1_fp, + opid_t op2_id, void_fp handler2_fp, + ..., + BLIS_VA_END ); */ - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays.
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - opid_t* op_ids = bli_malloc_intl( n_ops * sizeof( opid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - void** op_fps = bli_malloc_intl( n_ops * sizeof( void* ), &r_val ); - - // -- Begin variable argument section -- + // Query the context for the address of the l3 sup handlers array. + void_fp* cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx ); // Initialize variable argument environment. - va_start( args, n_ops ); + va_list args; + va_start( args, cntx ); - // Process n_ukrs tuples. - for ( i = 0; i < n_ops; ++i ) + // Process sup handlers until BLIS_VA_END is reached. + while ( true ) { + const int op_id0 = va_arg( args, int ); + + // If we find an operation id of BLIS_VA_END, then we are done. + if ( op_id0 == BLIS_VA_END ) break; + // Here, we query the variable argument list for: // - the opid_t of the operation we're about to process, // - the sup handler function pointer - // that we need to store to the context. - const opid_t op_id = ( opid_t )va_arg( args, opid_t ); - void* op_fp = ( void* )va_arg( args, void* ); - - // Store the values in our temporary arrays. - op_ids[ i ] = op_id; - op_fps[ i ] = op_fp; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the l3 small/unpacked handlers array - void** cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each operation id tuple provided. 
- for ( i = 0; i < n_ops; ++i ) - { - // Read the current operation id and handler function pointer. - const opid_t op_id = op_ids[ i ]; - void* op_fp = op_fps[ i ]; + const opid_t op_id = ( opid_t )op_id0; + void_fp op_fp = ( void_fp )va_arg( args, void_fp ); // Store the sup handler function pointer into the slot for the // specified operation id. cntx_l3_sup_handlers[ op_id ] = op_fp; } - // Free the temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - bli_free_intl( op_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - bli_free_intl( op_fps ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default l3 sup blocksizes. It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // blocksizes across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_blkszs - ( - dim_t n_bs, - bszid_t bs0_id, blksz_t* blksz0, - bszid_t bs1_id, blksz_t* blksz1, - bszid_t bs2_id, blksz_t* blksz2, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_bs ); - - // Process n_bs tuples. 
- for ( i = 0; i < n_bs; ++i ) - { - // Here, we query the variable argument list for: - // - the bszid_t of the blocksize we're about to process, - // - the address of the blksz_t object. - bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); - blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); - - // Store the values in our temporary arrays. - bszids[ i ] = bs_id; - blkszs[ i ] = blksz; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - // Shutdown variable argument environment and clean up stack. va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the blocksize object array - blksz_t* cntx_l3_sup_blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. Notice that the blksz_t* pointers were saved, rather than - // the objects themselves, but we copy the contents of the objects - // when copying into the context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_bs; ++i ) - { - // Read the current blocksize id, blksz_t* pointer, blocksize - // multiple id, and blocksize scalar. - bszid_t bs_id = bszids[ i ]; - blksz_t* blksz = blkszs[ i ]; - - blksz_t* cntx_l3_sup_blksz = &cntx_l3_sup_blkszs[ bs_id ]; - - // Copy the blksz_t object contents into the appropriate - // location within the context's blksz_t array. - //cntx_l3_sup_blkszs[ bs_id ] = *blksz; - //bli_blksz_copy( blksz, cntx_l3_sup_blksz ); - bli_blksz_copy_if_pos( blksz, cntx_l3_sup_blksz ); - } - - // Free the temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( blkszs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( bszids ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default level-3 microkernels for small/unpacked matrices. It - // should be called after bli_cntx_init_defaults() so that the context - // begins with default sup micro/millikernels across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_l3_sup_kers - ( - dim_t n_ukrs, - stor3_t stor_id0, num_t dt0, void* ukr0_fp, bool pref0, - stor3_t stor_id1, num_t dt1, void* ukr1_fp, bool pref1, - stor3_t stor_id2, num_t dt2, void* ukr2_fp, bool pref2, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - stor3_t* st3_ids = bli_malloc_intl( n_ukrs * sizeof( stor3_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - void** ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void* ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_ukrs ); - - // Process n_ukrs tuples. 
- for ( i = 0; i < n_ukrs; ++i ) - { - // Here, we query the variable argument list for: - // - the stor3_t storage case being assigned to the kernel we're - // about to process, - // - the datatype of the kernel, - // - the kernel function pointer, and - // - the kernel function storage preference - // that we need to store to the context. - const stor3_t st3_id = ( stor3_t )va_arg( args, stor3_t ); - const num_t ukr_dt = ( num_t )va_arg( args, num_t ); - void* ukr_fp = ( void* )va_arg( args, void* ); - const bool ukr_pref = ( bool )va_arg( args, int ); - - // Store the values in our temporary arrays. - st3_ids[ i ] = st3_id; - ukr_dts[ i ] = ukr_dt; - ukr_fps[ i ] = ukr_fp; - ukr_prefs[ i ] = ukr_pref; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the l3 small/unpacked ukernel func_t array - // - the l3 small/unpacked ukernel preferences array - func_t* cntx_l3_sup_kers = bli_cntx_l3_sup_kers_buf( cntx ); - mbool_t* cntx_l3_sup_kers_prefs = bli_cntx_l3_sup_kers_prefs_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. 
- -#if 0 - dim_t sup_map[ BLIS_NUM_LEVEL3_SUP_UKRS ][2]; - - // Create the small/unpacked ukernel mappings: - // - rv -> rrr 0, rcr 2 - // - rg -> rrc 1, rcc 3 - // - cv -> ccr 6, ccc 7 - // - cg -> crr 4, crc 5 - // - rd -> rrc 1 - // - cd -> crc 5 - // - rc -> rcc 3 - // - cr -> crr 4 - // - gx -> xxx 8 - // NOTE: We only need to set one slot in the context l3_sup_kers array - // for the general-stride/generic ukernel type, but since the loop below - // needs to be set up to set two slots to accommodate the RV, RG, CV, and - // CG, ukernel types, we will just be okay with the GX ukernel being set - // redundantly. (The RD, CD, CR, and RC ukernel types are set redundantly - // for the same reason.) - sup_map[ BLIS_GEMMSUP_RV_UKR ][0] = BLIS_RRR; - sup_map[ BLIS_GEMMSUP_RV_UKR ][1] = BLIS_RCR; - sup_map[ BLIS_GEMMSUP_RG_UKR ][0] = BLIS_RRC; - sup_map[ BLIS_GEMMSUP_RG_UKR ][1] = BLIS_RCC; - sup_map[ BLIS_GEMMSUP_CV_UKR ][0] = BLIS_CCR; - sup_map[ BLIS_GEMMSUP_CV_UKR ][1] = BLIS_CCC; - sup_map[ BLIS_GEMMSUP_CG_UKR ][0] = BLIS_CRR; - sup_map[ BLIS_GEMMSUP_CG_UKR ][1] = BLIS_CRC; - - sup_map[ BLIS_GEMMSUP_RD_UKR ][0] = BLIS_RRC; - sup_map[ BLIS_GEMMSUP_RD_UKR ][1] = BLIS_RRC; - sup_map[ BLIS_GEMMSUP_CD_UKR ][0] = BLIS_CRC; - sup_map[ BLIS_GEMMSUP_CD_UKR ][1] = BLIS_CRC; - - sup_map[ BLIS_GEMMSUP_RC_UKR ][0] = BLIS_RCC; - sup_map[ BLIS_GEMMSUP_RC_UKR ][1] = BLIS_RCC; - sup_map[ BLIS_GEMMSUP_CR_UKR ][0] = BLIS_CRR; - sup_map[ BLIS_GEMMSUP_CR_UKR ][1] = BLIS_CRR; - - sup_map[ BLIS_GEMMSUP_GX_UKR ][0] = BLIS_XXX; - sup_map[ BLIS_GEMMSUP_GX_UKR ][1] = BLIS_XXX; -#endif - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_ukrs; ++i ) - { - // Read the current stor3_t id, ukernel datatype, ukernel function - // pointer, and ukernel preference. 
- const stor3_t st3_id = st3_ids[ i ]; - const num_t ukr_dt = ukr_dts[ i ]; - void* ukr_fp = ukr_fps[ i ]; - const bool ukr_pref = ukr_prefs[ i ]; - - // Index to the func_t and mbool_t for the current stor3_t id - // being processed. - func_t* ukrs = &cntx_l3_sup_kers[ st3_id ]; - mbool_t* prefs = &cntx_l3_sup_kers_prefs[ st3_id ]; - - // Store the ukernel function pointer and preference values into - // the stor3_t location in the context. - bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); - bli_mbool_set_dt( ukr_pref, ukr_dt, prefs ); - } - - // Free the temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bli_free_intl( st3_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bli_free_intl( ukr_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bli_free_intl( ukr_fps ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bli_free_intl( ukr_prefs ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l1f_kers( dim_t n_kers, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default level-1f kernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default l1f - // kernels across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_l1f_kers - ( - dim_t n_ukrs, - l1fkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp, - l1fkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp, - l1fkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - l1fkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1fkr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_kers ); - - // Process n_kers tuples. - for ( i = 0; i < n_kers; ++i ) - { - // Here, we query the variable argument list for: - // - the l1fkr_t of the kernel we're about to process, - // - the datatype of the kernel, and - // - the kernel function pointer - // that we need to store to the context. - const l1fkr_t ker_id = ( l1fkr_t )va_arg( args, l1fkr_t ); - const num_t ker_dt = ( num_t )va_arg( args, num_t ); - void_fp ker_fp = ( void_fp )va_arg( args, void_fp ); - - // Store the values in our temporary arrays. - ker_ids[ i ] = ker_id; - ker_dts[ i ] = ker_dt; - ker_fps[ i ] = ker_fp; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the address of: - // - the level-1f kernels func_t array - func_t* cntx_l1f_kers = bli_cntx_l1f_kers_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_kers; ++i ) - { - // Read the current kernel id, kernel datatype, and kernel function - // pointer. 
- const l1fkr_t ker_id = ker_ids[ i ]; - const num_t ker_dt = ker_dts[ i ]; - void_fp ker_fp = ker_fps[ i ]; - - // Index into the func_t and mbool_t for the current kernel id - // being processed. - func_t* kers = &cntx_l1f_kers[ ker_id ]; - - // Store the ukernel function pointer and preference values into - // the context. - bli_func_set_dt( ker_fp, ker_dt, kers ); - } - - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - bli_free_intl( ker_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - bli_free_intl( ker_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - bli_free_intl( ker_fps ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l1v_kers( dim_t n_kers, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default level-1v kernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default l1v - // kernels across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_l1v_kers - ( - dim_t n_ukrs, - l1vkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp, - l1vkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp, - l1vkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - l1vkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1vkr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_kers ); - - // Process n_kers tuples. - for ( i = 0; i < n_kers; ++i ) - { - // Here, we query the variable argument list for: - // - the l1vkr_t of the kernel we're about to process, - // - the datatype of the kernel, and - // - the kernel function pointer - // that we need to store to the context. - const l1vkr_t ker_id = ( l1vkr_t )va_arg( args, l1vkr_t ); - const num_t ker_dt = ( num_t )va_arg( args, num_t ); - void_fp ker_fp = ( void_fp )va_arg( args, void_fp ); - - // Store the values in our temporary arrays. - ker_ids[ i ] = ker_id; - ker_dts[ i ] = ker_dt; - ker_fps[ i ] = ker_fp; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the address of: - // - the level-1v kernels func_t array - func_t* cntx_l1v_kers = bli_cntx_l1v_kers_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_kers; ++i ) - { - // Read the current kernel id, kernel datatype, and kernel function - // pointer. 
- const l1vkr_t ker_id = ker_ids[ i ]; - const num_t ker_dt = ker_dts[ i ]; - void_fp ker_fp = ker_fps[ i ]; - - // Index into the func_t and mbool_t for the current kernel id - // being processed. - func_t* kers = &cntx_l1v_kers[ ker_id ]; - - // Store the ukernel function pointer and preference values into - // the context. - bli_func_set_dt( ker_fp, ker_dt, kers ); - } - - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - bli_free_intl( ker_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - bli_free_intl( ker_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - bli_free_intl( ker_fps ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_packm_kers( dim_t n_kers, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default packing kernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default packm - // kernels across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_packm_kers - ( - dim_t n_ukrs, - l1mkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp, - l1mkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp, - l1mkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - l1mkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1mkr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_kers ); - - // Process n_kers tuples. - for ( i = 0; i < n_kers; ++i ) - { - // Here, we query the variable argument list for: - // - the l1mkr_t of the kernel we're about to process, - // - the datatype of the kernel, and - // - the kernel function pointer - // that we need to store to the context. - const l1mkr_t ker_id = ( l1mkr_t )va_arg( args, l1mkr_t ); - const num_t ker_dt = ( num_t )va_arg( args, num_t ); - void_fp ker_fp = ( void_fp )va_arg( args, void_fp ); - - // Store the values in our temporary arrays. - ker_ids[ i ] = ker_id; - ker_dts[ i ] = ker_dt; - ker_fps[ i ] = ker_fp; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the address of: - // - the packm kernels func_t array - func_t* cntx_packm_kers = bli_cntx_packm_kers_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_kers; ++i ) - { - // Read the current kernel id, kernel datatype, and kernel function - // pointer. 
- const l1mkr_t ker_id = ker_ids[ i ]; - const num_t ker_dt = ker_dts[ i ]; - void_fp ker_fp = ker_fps[ i ]; - - // Index into the func_t and mbool_t for the current kernel id - // being processed. - func_t* kers = &cntx_packm_kers[ ker_id ]; - - // Store the ukernel function pointer and preference values into - // the context. - bli_func_set_dt( ker_fp, ker_dt, kers ); - } - - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - bli_free_intl( ker_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - bli_free_intl( ker_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - bli_free_intl( ker_fps ); } // ----------------------------------------------------------------------------- @@ -1586,11 +408,11 @@ void bli_cntx_print( cntx_t* cntx ) ); } - for ( i = 0; i < BLIS_NUM_LEVEL3_UKRS; ++i ) + for ( i = 0; i < BLIS_NUM_UKRS; ++i ) { - func_t* ukr = bli_cntx_get_l3_vir_ukrs( i, cntx ); + func_t* ukr = bli_cntx_get_ukrs( i, cntx ); - printf( "l3 vir ukr %2lu: %16p %16p %16p %16p\n", + printf( "ukr %2lu: %16p %16p %16p %16p\n", ( unsigned long )i, bli_func_get_dt( BLIS_FLOAT, ukr ), bli_func_get_dt( BLIS_DOUBLE, ukr ), @@ -1599,42 +421,16 @@ void bli_cntx_print( cntx_t* cntx ) ); } - for ( i = 0; i < BLIS_NUM_3OP_RC_COMBOS; ++i ) + for ( i = 0; i < BLIS_NUM_UKR_PREFS; ++i ) { - func_t* ukr = bli_cntx_get_l3_sup_kers( i, cntx ); + mbool_t* ukr_pref = bli_cntx_get_ukr_prefs( i, cntx ); - printf( "l3 sup ukr %2lu: %16p %16p %16p %16p\n", + printf( "ukr pref %2lu: %d %d %d %d\n", ( unsigned long )i, - bli_func_get_dt( BLIS_FLOAT, ukr ), - bli_func_get_dt( BLIS_DOUBLE, ukr ), - bli_func_get_dt( BLIS_SCOMPLEX, ukr ), - bli_func_get_dt( BLIS_DCOMPLEX, ukr ) - ); - } - - for ( i = 0; i < BLIS_NUM_LEVEL1F_KERS; ++i ) - { - func_t* ker = bli_cntx_get_l1f_kers( i, cntx ); - - printf( "l1f ker %2lu: %16p %16p %16p %16p\n", - ( 
unsigned long )i, - bli_func_get_dt( BLIS_FLOAT, ker ), - bli_func_get_dt( BLIS_DOUBLE, ker ), - bli_func_get_dt( BLIS_SCOMPLEX, ker ), - bli_func_get_dt( BLIS_DCOMPLEX, ker ) - ); - } - - for ( i = 0; i < BLIS_NUM_LEVEL1V_KERS; ++i ) - { - func_t* ker = bli_cntx_get_l1v_kers( i, cntx ); - - printf( "l1v ker %2lu: %16p %16p %16p %16p\n", - ( unsigned long )i, - bli_func_get_dt( BLIS_FLOAT, ker ), - bli_func_get_dt( BLIS_DOUBLE, ker ), - bli_func_get_dt( BLIS_SCOMPLEX, ker ), - bli_func_get_dt( BLIS_DCOMPLEX, ker ) + bli_mbool_get_dt( BLIS_FLOAT, ukr_pref ), + bli_mbool_get_dt( BLIS_DOUBLE, ukr_pref ), + bli_mbool_get_dt( BLIS_SCOMPLEX, ukr_pref ), + bli_mbool_get_dt( BLIS_DCOMPLEX, ukr_pref ) ); } diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 76350f6bc..412430e9b 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -43,24 +43,13 @@ /* typedef struct cntx_s { - blksz_t* blkszs; - bszid_t* bmults; + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + bszid_t bmults[ BLIS_NUM_BLKSZS ]; - func_t* l3_vir_ukrs; - func_t* l3_nat_ukrs; - mbool_t* l3_nat_ukrs_prefs; + func_t ukrs[ BLIS_NUM_UKRS ]; + mbool_t ukr_prefs[ BLIS_NUM_UKR_PREFS ]; - blksz_t* l3_sup_thresh; - void** l3_sup_handlers; - blksz_t* l3_sup_blkszs; - func_t* l3_sup_kers; - mbool_t* l3_sup_kers_prefs; - - func_t* l1f_kers; - func_t* l1v_kers; - - func_t* packm_kers; - func_t* unpackm_kers; + void_fp l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; ind_t method; @@ -81,54 +70,18 @@ BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } -BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) +BLIS_INLINE func_t* bli_cntx_ukrs_buf( cntx_t* cntx ) { - return cntx->l3_vir_ukrs; + return cntx->ukrs; } -BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) +BLIS_INLINE mbool_t* bli_cntx_ukr_prefs_buf( cntx_t* cntx ) { - return cntx->l3_nat_ukrs; + return cntx->ukr_prefs; } -BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) -{ - return 
cntx->l3_nat_ukrs_prefs; -} -BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_thresh; -} -BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) +BLIS_INLINE void_fp* bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } -BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_blkszs; -} -BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_kers; -} -BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_kers_prefs; -} -BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) -{ - return cntx->l1f_kers; -} -BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) -{ - return cntx->l1v_kers; -} -BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) -{ - return cntx->packm_kers; -} -BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) -{ - return cntx->unpackm_kers; -} BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; @@ -204,402 +157,147 @@ BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) // ----------------------------------------------------------------------------- -BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) { - func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); + func_t* funcs = bli_cntx_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } -BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) { - func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); + func_t* func = bli_cntx_get_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } -BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, 
cntx_t* cntx ) { - func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); - func_t* func = &funcs[ ukr_id ]; + switch ( ukr_id ) + { + case BLIS_GEMM_UKR: ukr_id = BLIS_GEMM_VIR_UKR; break; + case BLIS_TRSM_L_UKR: ukr_id = BLIS_TRSM_L_VIR_UKR; break; + case BLIS_TRSM_U_UKR: ukr_id = BLIS_TRSM_U_VIR_UKR; break; + case BLIS_GEMMTRSM_L_UKR: ukr_id = BLIS_GEMMTRSM_L_VIR_UKR; break; + case BLIS_GEMMTRSM_U_UKR: ukr_id = BLIS_GEMMTRSM_U_VIR_UKR; break; + default: break; + }; - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); - - return bli_func_get_dt( dt, func ); + return bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); } // ----------------------------------------------------------------------------- -BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t ukr_id, cntx_t* cntx ) { - mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); + mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } -BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, ukr_pref_t ukr_id, cntx_t* cntx ) { - mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); + mbool_t* mbool = bli_cntx_get_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- -BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) -{ - blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); - blksz_t* thresh = &threshs[ thresh_id ]; - - // Return the address of the blksz_t identified by thresh_id. 
- return thresh; -} - -BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) -{ - blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); - dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); - - // Return the main (default) threshold value for the datatype given. - return thresh_dt; -} - BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { - if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; - if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; - if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; + if ( m < bli_cntx_get_blksz_def_dt( dt, BLIS_MT, cntx ) ) return TRUE; + if ( n < bli_cntx_get_blksz_def_dt( dt, BLIS_NT, cntx ) ) return TRUE; + if ( k < bli_cntx_get_blksz_def_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- -BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { - void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); - void* func = funcs[ op ]; + void_fp* funcs = bli_cntx_l3_sup_handlers_buf( cntx ); + void_fp func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- -BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) { - blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; + // This initial value will get overwritten during the switch statement below. + ukr_pref_t ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; - // Return the address of the blksz_t identified by bs_id. 
- return blksz; -} - -BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_def( dt, blksz ); - - // Return the main (default) blocksize value for the datatype given. - return bs_dt; -} - -BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_max( dt, blksz ); - - // Return the auxiliary (maximum) blocksize value for the datatype given. - return bs_dt; -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); - func_t* func = &funcs[ stor_id ]; - - return func; -} - -BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) -{ - mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); - mbool_t* mbool = &mbools[ stor_id ]; - - return mbool; -} - -BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); - - return ( bool )bli_mbool_get_dt( dt, mbool ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); - func_t* func = &funcs[ ker_id ]; - - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = 
bli_cntx_get_l1f_kers( ker_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); - func_t* func = &funcs[ ker_id ]; - - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = NULL; - - // Only index to the requested packm func_t if the packm kernel being - // requested is one that is explicitly supported. - if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) + // Get the correct preference from the kernel ID. + switch ( ukr_id ) { - func_t* funcs = bli_cntx_packm_kers_buf( cntx ); - - func = &funcs[ ker_id ]; + case BLIS_GEMM_VIR_UKR: // fallthrough + case BLIS_GEMM_UKR: ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; break; + case BLIS_TRSM_L_VIR_UKR: // fallthrough + case BLIS_TRSM_L_UKR: ukr_pref_id = BLIS_TRSM_L_UKR_ROW_PREF; break; + case BLIS_TRSM_U_VIR_UKR: // fallthrough + case BLIS_TRSM_U_UKR: ukr_pref_id = BLIS_TRSM_U_UKR_ROW_PREF; break; + case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_L_UKR: ukr_pref_id = BLIS_GEMMTRSM_L_UKR_ROW_PREF; break; + case BLIS_GEMMTRSM_U_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_U_UKR: ukr_pref_id = BLIS_GEMMTRSM_U_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RRR_UKR: ukr_pref_id = BLIS_GEMMSUP_RRR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RRC_UKR: ukr_pref_id = BLIS_GEMMSUP_RRC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RCR_UKR: ukr_pref_id = BLIS_GEMMSUP_RCR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RCC_UKR: ukr_pref_id = BLIS_GEMMSUP_RCC_UKR_ROW_PREF; break; + 
case BLIS_GEMMSUP_CRR_UKR: ukr_pref_id = BLIS_GEMMSUP_CRR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CRC_UKR: ukr_pref_id = BLIS_GEMMSUP_CRC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CCR_UKR: ukr_pref_id = BLIS_GEMMSUP_CCR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CCC_UKR: ukr_pref_id = BLIS_GEMMSUP_CCC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_XXX_UKR: ukr_pref_id = BLIS_GEMMSUP_XXX_UKR_ROW_PREF; break; + default: break; // TODO: should be an error condition } - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - void_fp fp = NULL; - - // Only query the context for the packm func_t (and then extract the - // datatype-specific function pointer) if the packm kernel being - // requested is one that is explicitly supported. - if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) - { - func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); - - fp = bli_func_get_dt( dt, func ); - } - - return fp; -} - -BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = NULL; - - // Only index to the requested unpackm func_t if the unpackm kernel being - // requested is one that is explicitly supported. - if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) - { - func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); - - func = &funcs[ ker_id ]; - } - - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - void_fp fp = NULL; - - // Only query the context for the unpackm func_t (and then extract the - // datatype-specific function pointer) if the unpackm kernel being - // requested is one that is explicitly supported. 
- if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) - { - func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); - - fp = bli_func_get_dt( dt, func ); - } - - return fp; -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); - - // A ukernel preference of TRUE means the ukernel prefers row storage. - return ( bool ) - ( prefs == TRUE ); -} - -BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); - - // A ukernel preference of FALSE means the ukernel prefers column storage. - return ( bool ) - ( prefs == FALSE ); -} - -BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // Note that we use the computation datatype, which may differ from the - // storage datatype of C (when performing a mixed datatype operation). 
- const num_t dt = bli_obj_comp_dt( obj ); - const bool ukr_prefers_rows - = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); - const bool ukr_prefers_cols - = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); - bool r_val = FALSE; - - if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; - else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; - - return r_val; -} - -BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - return ( bool ) - !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // For induced methods, return the ukernel storage preferences of the - // corresponding real micro-kernel. - // NOTE: This projection to real domain becomes unnecessary if you - // set the exec_dt for 1m to the real projection of the storage - // datatype. + // For virtual ukernels during non-native execution, use the real projection of + // the datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) - dt = bli_dt_proj_to_real( dt ); + { + switch ( ukr_id ) + { + case BLIS_GEMM_VIR_UKR: // fallthrough + case BLIS_TRSM_L_VIR_UKR: // fallthrough + case BLIS_TRSM_U_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_U_VIR_UKR: dt = bli_dt_proj_to_real( dt ); break; + default: break; + } + } - return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); + return bli_cntx_get_ukr_prefs_dt( dt, ukr_pref_id, cntx ); } -BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_ukr_prefers_cols_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) { - // For induced methods, return the ukernel storage preferences of the - // corresponding real micro-kernel. 
- // NOTE: This projection to real domain becomes unnecessary if you - // set the exec_dt for 1m to the real projection of the storage - // datatype. - if ( bli_cntx_method( cntx ) != BLIS_NAT ) - dt = bli_dt_proj_to_real( dt ); - - return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); + return ! bli_cntx_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } -BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx ) { - // Note that we use the computation datatype, which may differ from the - // storage datatype of C (when performing a mixed datatype operation). - const num_t dt = bli_obj_comp_dt( obj ); - const bool ukr_prefers_rows - = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); - const bool ukr_prefers_cols - = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); - bool r_val = FALSE; + const bool ukr_prefers_rows + = bli_cntx_ukr_prefers_rows_dt( bli_obj_dt( obj ), ukr_id, cntx ); - if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; - else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; + if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) return TRUE; + else if ( bli_obj_is_col_stored( obj ) && !ukr_prefers_rows ) return TRUE; - return r_val; + return FALSE; } -BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_dislikes_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx ) { - return ( bool ) - !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); + return ! 
bli_cntx_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- -BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); - - // A ukernel preference of TRUE means the ukernel prefers row storage. - return ( bool ) - ( prefs == TRUE ); -} - -BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); - - // A ukernel preference of FALSE means the ukernel prefers column storage. - return ( bool ) - ( prefs == FALSE ); -} - -#if 0 -// NOTE: These static functions aren't needed yet. - -BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) -{ - const num_t dt = bli_obj_dt( obj ); - const bool ukr_prefers_rows - = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); - const bool ukr_prefers_cols - = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); - bool r_val = FALSE; - - if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; - else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; - - return r_val; -} - -BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) -{ - return ( bool ) - !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); -} -#endif - -// ----------------------------------------------------------------------------- - // // -- cntx_t modification (complex) -------------------------------------------- // @@ -632,67 +330,64 @@ BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, c bli_blksz_set_max( bs, dt, blksz ); } -BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE void bli_cntx_set_ukr( ukr_t ukr_id, func_t* func, cntx_t* cntx ) { - 
func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); + func_t* funcs = bli_cntx_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } -BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, ukr_t ker_id, cntx_t* cntx ) { - func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); + func_t* func = bli_cntx_get_ukrs( ker_id, cntx ); - funcs[ ukr_id ] = *func; + bli_func_set_dt( fp, dt, func ); } -BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) +BLIS_INLINE void bli_cntx_set_ukr_pref( ukr_pref_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { - mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); + mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } -BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { - func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); + ukr_t ukr_id = bli_stor3_ukr( stor_id ); - funcs[ ker_id ] = *func; + return bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); } -BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { - func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); + switch ( bs_id ) + { + case BLIS_MR: bs_id = BLIS_MR_SUP; break; + case BLIS_NR: bs_id = BLIS_NR_SUP; break; + case BLIS_KR: bs_id = BLIS_KR_SUP; break; + case BLIS_MC: bs_id = BLIS_MC_SUP; break; + case BLIS_NC: bs_id = BLIS_NC_SUP; break; + case BLIS_KC: bs_id = BLIS_KC_SUP; break; + default: break; + }; - funcs[ ker_id ] = *func; + return bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ); } -BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { - func_t* funcs = bli_cntx_get_packm_kers( 
ker_id, cntx ); + switch ( bs_id ) + { + case BLIS_MR: bs_id = BLIS_MR_SUP; break; + case BLIS_NR: bs_id = BLIS_NR_SUP; break; + case BLIS_KR: bs_id = BLIS_KR_SUP; break; + case BLIS_MC: bs_id = BLIS_MC_SUP; break; + case BLIS_NC: bs_id = BLIS_NC_SUP; break; + case BLIS_KC: bs_id = BLIS_KC_SUP; break; + default: break; + }; - funcs[ ker_id ] = *func; -} - -BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); - - bli_func_set_dt( fp, dt, func ); -} - -BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_get_unpackm_kers( ker_id, cntx ); - - funcs[ ker_id ] = *func; -} - -BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); - - bli_func_set_dt( fp, dt, func ); + return bli_cntx_get_blksz_max_dt( dt, bs_id, cntx ); } // ----------------------------------------------------------------------------- @@ -701,24 +396,17 @@ BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_ BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); -BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( cntx_t* cntx, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); - -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... 
); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); - -BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_ukrs( cntx_t* cntx, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_ukr_prefs( cntx_t* cntx, ... ); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); +BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... ); + #endif diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index cc17b33ff..1372a055a 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -185,7 +185,7 @@ void bli_gks_init( void ) bli_gks_register_cntx( BLIS_ARCH_POWER10, bli_cntx_init_power10, bli_cntx_init_power10_ref, bli_cntx_init_power10_ind ); -#endif +#endif #ifdef BLIS_CONFIG_POWER9 bli_gks_register_cntx( BLIS_ARCH_POWER9, bli_cntx_init_power9, bli_cntx_init_power9_ref, @@ -267,7 +267,7 @@ void bli_gks_finalize( void ) void bli_gks_init_index( void ) { // This function is called by bli_gks_init(). It simply initializes all - // architecture id elements of the internal arrays to NULL. + // architecture id elements of the internal arrays to NULL. const size_t gks_size = sizeof( cntx_t* ) * BLIS_NUM_ARCHS; const size_t fpa_size = sizeof( void_fp ) * BLIS_NUM_ARCHS; @@ -382,7 +382,7 @@ void bli_gks_register_cntx // functions for reference kernels and induced method execution. The // former will be used whenever we need to obtain reference kernels and // latter will be used later on if the user calls a level-3 function - // with induced execution enabled. + // with induced execution enabled. cntx_ref_init[ id ] = ref_fp; cntx_ind_init[ id ] = ind_fp; @@ -582,7 +582,7 @@ cntx_t* bli_gks_query_ind_cntx // function on the newly allocated structure, we must first copy // over the contents of the native context. 
*gks_id_ind = *gks_id_nat; - + // Use the architecture id to look up the function pointer to the // context initialization function for induced methods. ind_cntx_init_ft f = cntx_ind_init[ id ]; @@ -635,7 +635,7 @@ void bli_gks_init_ref_cntx bool bli_gks_cntx_l3_nat_ukr_is_ref ( num_t dt, - l3ukr_t ukr_id, + ukr_t ukr_id, cntx_t* cntx ) { @@ -647,8 +647,8 @@ bool bli_gks_cntx_l3_nat_ukr_is_ref // Query each context for the micro-kernel function pointer for the // specified datatype. - void_fp ref_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr_id, &ref_cntx ); - void_fp fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr_id, cntx ); + void_fp ref_fp = bli_cntx_get_ukr_dt( dt, ukr_id, &ref_cntx ); + void_fp fp = bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); // Return the result. return fp == ref_fp; @@ -668,7 +668,7 @@ static char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] = // ----------------------------------------------------------------------------- -char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ) +char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ) { kimpl_t ki; @@ -676,7 +676,7 @@ char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ) // then query the ukernel function pointer for the given datatype from // that context. cntx_t* cntx = bli_gks_query_ind_cntx( method, dt ); - void_fp fp = bli_cntx_get_l3_vir_ukr_dt( dt, ukr, cntx ); + void_fp fp = bli_cntx_get_ukr_dt( dt, ukr, cntx ); // Check whether the ukernel function pointer is NULL for the given // datatype. If it is NULL, return the string for not applicable. 
@@ -691,7 +691,7 @@ char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ) } #if 0 -char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ) +char* bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt ) { opid_t oper; ind_t method; @@ -716,7 +716,7 @@ char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ) } #endif -kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ) +kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ) { // If the current available induced method is not native, it // must be virtual. @@ -731,8 +731,6 @@ kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ) // method to the typed function pointer within the known // reference ukrs object. - cntx_t ref_cntx_l; - // Query the architecture id. arch_t id = bli_arch_query_id(); @@ -743,23 +741,13 @@ kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ) bli_check_error_code( e_val ); } - // Obtain the function pointer to the context initialization function - // for reference kernels. - ref_cntx_init_ft f = cntx_ref_init[ id ]; - - // Initialize a local context with reference kernels and related values. - f( &ref_cntx_l ); - // Query the native context from the gks. cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id ); - // Query the native ukernel func_t from both the native and reference - // contexts. 
- void_fp nat_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr, nat_cntx ); - void_fp ref_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr, &ref_cntx_l ); - - if ( nat_fp == ref_fp ) return BLIS_REFERENCE_UKERNEL; - else return BLIS_OPTIMIZED_UKERNEL; + if ( bli_gks_cntx_l3_nat_ukr_is_ref( dt, ukr, nat_cntx ) ) + return BLIS_REFERENCE_UKERNEL; + else + return BLIS_OPTIMIZED_UKERNEL; } } diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h index 188dcd507..b8e4c4fe0 100644 --- a/frame/base/bli_gks.h +++ b/frame/base/bli_gks.h @@ -54,12 +54,12 @@ BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); -bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); +bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, ukr_t ukr_id, cntx_t* cntx ); -BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); -BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); +BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ); +BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ); -//char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); +//char* bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt ); #endif diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h index 011ebcdfb..e863f7dcf 100644 --- a/frame/include/bli_gentfunc_macro_defs.h +++ b/frame/include/bli_gentfunc_macro_defs.h @@ -289,6 +289,13 @@ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) +// -- (four auxiliary arguments) -- + +#define INSERT_GENTFUNCCO_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ +\ +GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, 
varname3, varname4 ) \ +GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) + // -- Basic one-operand macro with integer instance -- diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index 4de624f98..d273c353a 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -245,7 +245,111 @@ #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif +// -- MR and NR blocksizes (only for reference kernels) ------------------------ +// The build system defines BLIS_IN_REF_KERNEL, but only when compiling +// reference kernels. By using compile-time constants for MR and NR, the +// compiler can perform certain optimizations, such as unrolling and +// vectorization, that would not otherwise be possible. +#ifdef BLIS_IN_REF_KERNEL + +#ifndef BLIS_MR_s +#define BLIS_MR_s 4 +#endif + +#ifndef BLIS_MR_d +#define BLIS_MR_d 4 +#endif + +#ifndef BLIS_MR_c +#define BLIS_MR_c 4 +#endif + +#ifndef BLIS_MR_z +#define BLIS_MR_z 4 +#endif + +#ifndef BLIS_NR_s +#define BLIS_NR_s 16 +#endif + +#ifndef BLIS_NR_d +#define BLIS_NR_d 8 +#endif + +#ifndef BLIS_NR_c +#define BLIS_NR_c 8 +#endif + +#ifndef BLIS_NR_z +#define BLIS_NR_z 4 +#endif + +#ifndef BLIS_BBM_s +#define BLIS_BBM_s 1 +#endif + +#ifndef BLIS_BBM_d +#define BLIS_BBM_d 1 +#endif + +#ifndef BLIS_BBM_c +#define BLIS_BBM_c 1 +#endif + +#ifndef BLIS_BBM_z +#define BLIS_BBM_z 1 +#endif + +#ifndef BLIS_BBN_s +#define BLIS_BBN_s 1 +#endif + +#ifndef BLIS_BBN_d +#define BLIS_BBN_d 1 +#endif + +#ifndef BLIS_BBN_c +#define BLIS_BBN_c 1 +#endif + +#ifndef BLIS_BBN_z +#define BLIS_BBN_z 1 +#endif + +#ifndef BLIS_PACKMR_s +#define BLIS_PACKMR_s (BLIS_MR_s*BLIS_BBM_s) +#endif + +#ifndef BLIS_PACKMR_d +#define BLIS_PACKMR_d (BLIS_MR_d*BLIS_BBM_d) +#endif + +#ifndef BLIS_PACKMR_c +#define BLIS_PACKMR_c (BLIS_MR_c*BLIS_BBM_c) +#endif + +#ifndef BLIS_PACKMR_z +#define BLIS_PACKMR_z (BLIS_MR_z*BLIS_BBM_z) +#endif + +#ifndef BLIS_PACKNR_s 
+#define BLIS_PACKNR_s (BLIS_NR_s*BLIS_BBN_s) +#endif + +#ifndef BLIS_PACKNR_d +#define BLIS_PACKNR_d (BLIS_NR_d*BLIS_BBN_d) +#endif + +#ifndef BLIS_PACKNR_c +#define BLIS_PACKNR_c (BLIS_NR_c*BLIS_BBN_c) +#endif + +#ifndef BLIS_PACKNR_z +#define BLIS_PACKNR_z (BLIS_NR_z*BLIS_BBN_z) +#endif + +#endif #endif diff --git a/frame/include/bli_misc_macro_defs.h b/frame/include/bli_misc_macro_defs.h index 120338beb..903b4ece6 100644 --- a/frame/include/bli_misc_macro_defs.h +++ b/frame/include/bli_misc_macro_defs.h @@ -164,5 +164,11 @@ BLIS_INLINE void bli_toggle_bool( bool* b ) #define bli_iformatspec() "%6d" +// Sentinel constant used to indicate the end of a variable argument function +// (See bli_cntx.c) + +#define BLIS_VA_END (-1) + + #endif diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 286e79e2b..1822065da 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -754,7 +754,7 @@ BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, *offm_inc = 0; // If the diagonal intersects the right side of the matrix, - // ignore the area below that intersection. + // ignore the area below that intersection. 
if ( *m > -(*diagoff) + *n ) { *m = -(*diagoff) + *n; @@ -777,6 +777,14 @@ BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m bli_toggle_uplo( uplo ); } +// we don't know the type of a, so this must be a macro +// rs_a and cs_a must be variables and not expressions +#define bli_reflect_to_stored_part( diagoff, a, rs_a, cs_a ) \ +do { \ + a += ( diagoff ) * ( cs_a - rs_a ); \ + bli_swap_incs( &rs_a, &cs_a ); \ +} while (0) \ + BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) { dim_t start2 = n - *start; @@ -858,6 +866,22 @@ BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) #endif } +BLIS_INLINE ukr_t bli_stor3_ukr( stor3_t id ) +{ + switch ( id ) + { + case BLIS_RRR: return BLIS_GEMMSUP_RRR_UKR; + case BLIS_RRC: return BLIS_GEMMSUP_RRC_UKR; + case BLIS_RCR: return BLIS_GEMMSUP_RCR_UKR; + case BLIS_RCC: return BLIS_GEMMSUP_RCC_UKR; + case BLIS_CRR: return BLIS_GEMMSUP_CRR_UKR; + case BLIS_CRC: return BLIS_GEMMSUP_CRC_UKR; + case BLIS_CCR: return BLIS_GEMMSUP_CCR_UKR; + case BLIS_CCC: return BLIS_GEMMSUP_CCC_UKR; + default: return BLIS_GEMMSUP_XXX_UKR; + } +} + BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index 293c80f91..f567e7ef3 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -49,8 +49,8 @@ // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. 
-#include "bli_setrs.h" // sets real component only -#include "bli_setis.h" // sets imaginary component only +#include "bli_setrs.h" // sets real component only +#include "bli_setis.h" // sets imaginary component only // NOTE: This macro also needs to be defined early on since it determines // how real and imaginary components are accessed (ie: whether the fields @@ -194,6 +194,7 @@ #include "bli_adds_mxn.h" #include "bli_adds_mxn_uplo.h" #include "bli_set0s_mxn.h" +#include "bli_set0s_edge.h" #include "bli_copys_mxn.h" #include "bli_scal2s_mxn.h" #include "bli_xpbys_mxn.h" @@ -230,7 +231,7 @@ #include "bli_scal21rs.h" #include "bli_scal2j1rs.h" -// 1m (1e or 1r) +// 1m (1e or 1r) #include "bli_invert1ms_mxn_diag.h" #include "bli_scal1ms_mxn.h" diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index c66505bde..4e64f3711 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -626,7 +626,8 @@ typedef enum typedef enum { - BLIS_ADDV_KER = 0, + // l1v kernels + BLIS_ADDV_KER, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, @@ -639,108 +640,82 @@ typedef enum BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, - BLIS_XPBYV_KER -} l1vkr_t; - -#define BLIS_NUM_LEVEL1V_KERS 14 - - -typedef enum -{ - BLIS_AXPY2V_KER = 0, + BLIS_XPBYV_KER, + BLIS_AXPY2V_KER, BLIS_DOTAXPYV_KER, + + // l1f kernels BLIS_AXPYF_KER, BLIS_DOTXF_KER, - BLIS_DOTXAXPYF_KER -} l1fkr_t; + BLIS_DOTXAXPYF_KER, -#define BLIS_NUM_LEVEL1F_KERS 5 + // pack kernels + BLIS_PACKM_MRXK_KER, + BLIS_PACKM_NRXK_KER, + BLIS_PACKM_MRXK_1ER_KER, + BLIS_PACKM_NRXK_1ER_KER, + BLIS_PACKM_MRXMR_DIAG_KER, + BLIS_PACKM_NRXNR_DIAG_KER, + BLIS_PACKM_MRXMR_DIAG_1ER_KER, + BLIS_PACKM_NRXNR_DIAG_1ER_KER, + // unpack kernels + BLIS_UNPACKM_MRXK_KER, + BLIS_UNPACKM_NRXK_KER, -typedef enum -{ - BLIS_PACKM_0XK_KER = 0, - BLIS_PACKM_1XK_KER = 1, - BLIS_PACKM_2XK_KER = 2, - BLIS_PACKM_3XK_KER = 3, - BLIS_PACKM_4XK_KER = 4, - BLIS_PACKM_5XK_KER = 5, - BLIS_PACKM_6XK_KER = 6, - 
BLIS_PACKM_7XK_KER = 7, - BLIS_PACKM_8XK_KER = 8, - BLIS_PACKM_9XK_KER = 9, - BLIS_PACKM_10XK_KER = 10, - BLIS_PACKM_11XK_KER = 11, - BLIS_PACKM_12XK_KER = 12, - BLIS_PACKM_13XK_KER = 13, - BLIS_PACKM_14XK_KER = 14, - BLIS_PACKM_15XK_KER = 15, - BLIS_PACKM_16XK_KER = 16, - BLIS_PACKM_17XK_KER = 17, - BLIS_PACKM_18XK_KER = 18, - BLIS_PACKM_19XK_KER = 19, - BLIS_PACKM_20XK_KER = 20, - BLIS_PACKM_21XK_KER = 21, - BLIS_PACKM_22XK_KER = 22, - BLIS_PACKM_23XK_KER = 23, - BLIS_PACKM_24XK_KER = 24, - BLIS_PACKM_25XK_KER = 25, - BLIS_PACKM_26XK_KER = 26, - BLIS_PACKM_27XK_KER = 27, - BLIS_PACKM_28XK_KER = 28, - BLIS_PACKM_29XK_KER = 29, - BLIS_PACKM_30XK_KER = 30, - BLIS_PACKM_31XK_KER = 31, - - BLIS_UNPACKM_0XK_KER = 0, - BLIS_UNPACKM_1XK_KER = 1, - BLIS_UNPACKM_2XK_KER = 2, - BLIS_UNPACKM_3XK_KER = 3, - BLIS_UNPACKM_4XK_KER = 4, - BLIS_UNPACKM_5XK_KER = 5, - BLIS_UNPACKM_6XK_KER = 6, - BLIS_UNPACKM_7XK_KER = 7, - BLIS_UNPACKM_8XK_KER = 8, - BLIS_UNPACKM_9XK_KER = 9, - BLIS_UNPACKM_10XK_KER = 10, - BLIS_UNPACKM_11XK_KER = 11, - BLIS_UNPACKM_12XK_KER = 12, - BLIS_UNPACKM_13XK_KER = 13, - BLIS_UNPACKM_14XK_KER = 14, - BLIS_UNPACKM_15XK_KER = 15, - BLIS_UNPACKM_16XK_KER = 16, - BLIS_UNPACKM_17XK_KER = 17, - BLIS_UNPACKM_18XK_KER = 18, - BLIS_UNPACKM_19XK_KER = 19, - BLIS_UNPACKM_20XK_KER = 20, - BLIS_UNPACKM_21XK_KER = 21, - BLIS_UNPACKM_22XK_KER = 22, - BLIS_UNPACKM_23XK_KER = 23, - BLIS_UNPACKM_24XK_KER = 24, - BLIS_UNPACKM_25XK_KER = 25, - BLIS_UNPACKM_26XK_KER = 26, - BLIS_UNPACKM_27XK_KER = 27, - BLIS_UNPACKM_28XK_KER = 28, - BLIS_UNPACKM_29XK_KER = 29, - BLIS_UNPACKM_30XK_KER = 30, - BLIS_UNPACKM_31XK_KER = 31 - -} l1mkr_t; - -#define BLIS_NUM_PACKM_KERS 32 -#define BLIS_NUM_UNPACKM_KERS 32 - - -typedef enum -{ - BLIS_GEMM_UKR = 0, + // l3 native kernels + BLIS_GEMM_UKR, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, - BLIS_TRSM_U_UKR -} l3ukr_t; + BLIS_TRSM_U_UKR, -#define BLIS_NUM_LEVEL3_UKRS 5 + // l3 virtual kernels + BLIS_GEMM_VIR_UKR, + 
BLIS_GEMMTRSM_L_VIR_UKR, + BLIS_GEMMTRSM_U_VIR_UKR, + BLIS_TRSM_L_VIR_UKR, + BLIS_TRSM_U_VIR_UKR, + + // gemmsup kernels + BLIS_GEMMSUP_RRR_UKR, + BLIS_GEMMSUP_RRC_UKR, + BLIS_GEMMSUP_RCR_UKR, + BLIS_GEMMSUP_RCC_UKR, + BLIS_GEMMSUP_CRR_UKR, + BLIS_GEMMSUP_CRC_UKR, + BLIS_GEMMSUP_CCR_UKR, + BLIS_GEMMSUP_CCC_UKR, + BLIS_GEMMSUP_XXX_UKR, + + // BLIS_NUM_UKRS must be last! + BLIS_NUM_UKRS +} ukr_t; + + +typedef enum +{ + // l3 kernel row preferences + BLIS_GEMM_UKR_ROW_PREF, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, + BLIS_TRSM_L_UKR_ROW_PREF, + BLIS_TRSM_U_UKR_ROW_PREF, + + // gemmsup kernel row preferences + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, + BLIS_GEMMSUP_XXX_UKR_ROW_PREF, + + // BLIS_NUM_UKR_PREFS must be last! + BLIS_NUM_UKR_PREFS +} ukr_pref_t; typedef enum @@ -884,39 +859,45 @@ typedef enum // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. - - BLIS_KR = 0, + BLIS_KR, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, + // broadcast factors for packing + BLIS_BBM, + BLIS_BBN, + + // level-2 blocksizes BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension + // level-1f blocksizes BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor - BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
+ // gemmsup thresholds + BLIS_MT, // level-3 small/unpacked matrix threshold in m dimension + BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension + BLIS_KT, // level-3 small/unpacked matrix threshold in k dimension + + // gemmsup block sizes + BLIS_KR_SUP, + BLIS_MR_SUP, + BLIS_NR_SUP, + BLIS_MC_SUP, + BLIS_KC_SUP, + BLIS_NC_SUP, + + // BLIS_NO_PART (= BLIS_NUM_BLKSZS) must be last! + BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable, + // such as when characterizing a packm operation. + BLIS_NUM_BLKSZS = BLIS_NO_PART } bszid_t; -#define BLIS_NUM_BLKSZS 11 - - -// -- Threshold ID type -- - -typedef enum -{ - BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension - BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension - BLIS_KT // level-3 small/unpacked matrix threshold in k dimension - -} threshid_t; - -#define BLIS_NUM_THRESH 3 - // -- Architecture ID type -- @@ -1430,21 +1411,10 @@ typedef struct cntx_s blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; - func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; - func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; - mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; + func_t ukrs[ BLIS_NUM_UKRS ]; + mbool_t ukr_prefs[ BLIS_NUM_UKR_PREFS ]; - blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; - void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; - blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; - func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; - mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; - - func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; - func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; - - func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; - func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; + void_fp l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; ind_t method; @@ -1577,6 +1547,7 @@ typedef enum // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), + BLIS_INVALID_UKR_ID = (-152), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), 
diff --git a/frame/include/level0/bli_set0s_edge.h b/frame/include/level0/bli_set0s_edge.h new file mode 100644 index 000000000..2c436812e --- /dev/null +++ b/frame/include/level0/bli_set0s_edge.h @@ -0,0 +1,79 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET0S_EDGE_H +#define BLIS_SET0S_EDGE_H + +// set0s_edge + +// Notes: +// - The first char encodes the type of x. 
+// - The second char encodes the type of y. + +#define GENTFUNC(ctype,ch,op) \ +\ +BLIS_INLINE void PASTEMAC(ch,op) \ + ( \ + const dim_t i, \ + const dim_t m, \ + const dim_t j, \ + const dim_t n, \ + ctype* restrict p, \ + const inc_t ldp \ + ) \ +{ \ + if ( i < m ) \ + { \ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m - i, \ + j, \ + p + i*1, 1, ldp \ + ); \ + } \ +\ + if ( j < n ) \ + { \ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m, \ + n - j, \ + p + j*ldp, 1, ldp \ + ); \ + } \ +} + +INSERT_GENTFUNC_BASIC0(set0s_edge) + +#endif diff --git a/kernels/penryn/1/bli_axpyv_penryn_int.c b/kernels/penryn/1/bli_axpyv_penryn_int.c index 53904b645..2dd7c7324 100644 --- a/kernels/penryn/1/bli_axpyv_penryn_int.c +++ b/kernels/penryn/1/bli_axpyv_penryn_int.c @@ -102,7 +102,7 @@ void bli_daxpyv_penryn_int // Call the reference implementation if needed. if ( use_ref == TRUE ) { - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); f ( diff --git a/kernels/penryn/1/bli_dotv_penryn_int.c b/kernels/penryn/1/bli_dotv_penryn_int.c index 4d39b3641..2e88a577a 100644 --- a/kernels/penryn/1/bli_dotv_penryn_int.c +++ b/kernels/penryn/1/bli_dotv_penryn_int.c @@ -104,7 +104,7 @@ void bli_ddotv_penryn_int // Call the reference implementation if needed. if ( use_ref == TRUE ) { - ddotv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_DOTV_KER, cntx ); + ddotv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTV_KER, cntx ); f ( diff --git a/kernels/penryn/1f/bli_axpy2v_penryn_int.c b/kernels/penryn/1f/bli_axpy2v_penryn_int.c index 5e8a2a9a1..c809ebb41 100644 --- a/kernels/penryn/1f/bli_axpy2v_penryn_int.c +++ b/kernels/penryn/1f/bli_axpy2v_penryn_int.c @@ -110,7 +110,7 @@ void bli_daxpy2v_penryn_int // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { - daxpy2v_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPY2V_KER, cntx ); + daxpy2v_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPY2V_KER, cntx ); f ( diff --git a/kernels/penryn/1f/bli_axpyf_penryn_int.c b/kernels/penryn/1f/bli_axpyf_penryn_int.c index 66bb88ec6..ce4c4f786 100644 --- a/kernels/penryn/1f/bli_axpyf_penryn_int.c +++ b/kernels/penryn/1f/bli_axpyf_penryn_int.c @@ -115,7 +115,7 @@ void bli_daxpyf_penryn_int // Call the reference implementation if needed. if ( use_ref == TRUE ) { - daxpyf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx ); + daxpyf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx ); f ( diff --git a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c index 7602a7f28..6b9dab773 100644 --- a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c +++ b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c @@ -112,7 +112,7 @@ void bli_ddotaxpyv_penryn_int // Call the reference implementation if needed. if ( use_ref == TRUE ) { - ddotaxpyv_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTAXPYV_KER, cntx ); + ddotaxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTAXPYV_KER, cntx ); f ( diff --git a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c index 2deb4a457..fe102d427 100644 --- a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c +++ b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c @@ -104,7 +104,7 @@ void bli_ddotxaxpyf_penryn_int // If the vector lengths are zero, scale y by beta and return. 
if ( bli_zero_dim1( m ) ) { - dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); + dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); f ( @@ -149,7 +149,7 @@ void bli_ddotxaxpyf_penryn_int if ( use_ref == TRUE ) { - ddotxaxpyf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTXAXPYF_KER, cntx ); + ddotxaxpyf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXAXPYF_KER, cntx ); f ( conjat, diff --git a/kernels/penryn/1f/bli_dotxf_penryn_int.c b/kernels/penryn/1f/bli_dotxf_penryn_int.c index ad9dc5fbd..ac9887d59 100644 --- a/kernels/penryn/1f/bli_dotxf_penryn_int.c +++ b/kernels/penryn/1f/bli_dotxf_penryn_int.c @@ -90,7 +90,7 @@ void bli_ddotxf_penryn_int // If the vector lengths are zero, scale r by beta and return. if ( bli_zero_dim1( m ) ) { - dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); + dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); f ( @@ -134,7 +134,7 @@ void bli_ddotxf_penryn_int // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { - ddotxf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTXF_KER, cntx ); + ddotxf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXF_KER, cntx ); f ( conjat, diff --git a/kernels/zen/1/bli_scalv_zen_int.c b/kernels/zen/1/bli_scalv_zen_int.c index 9f76e88e1..fb17dd4b3 100644 --- a/kernels/zen/1/bli_scalv_zen_int.c +++ b/kernels/zen/1/bli_scalv_zen_int.c @@ -83,7 +83,7 @@ void bli_sscalv_zen_int if ( PASTEMAC(s,eq0)( *alpha ) ) { float* zero = bli_s0; - ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); + ssetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); f ( @@ -182,7 +182,7 @@ void bli_dscalv_zen_int if ( PASTEMAC(d,eq0)( *alpha ) ) { double* zero = bli_d0; - dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); + dsetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); f ( diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c index c8488890f..9f31b7200 100644 --- a/kernels/zen/1/bli_scalv_zen_int10.c +++ b/kernels/zen/1/bli_scalv_zen_int10.c @@ -84,7 +84,8 @@ void bli_sscalv_zen_int10 if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); + ssetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); + f ( BLIS_NO_CONJUGATE, @@ -93,7 +94,7 @@ void bli_sscalv_zen_int10 x, incx, cntx ); - + return; } @@ -275,9 +276,9 @@ void bli_dscalv_zen_int10 { double* zero = bli_d0; - if( cntx == NULL ) cntx = bli_gks_query_cntx(); + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); + dsetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); f ( @@ -287,7 +288,7 @@ void bli_dscalv_zen_int10 x, incx, cntx ); - + return; } @@ -458,7 +459,7 @@ void bli_cscalv_zen_int10 { const num_t dt = BLIS_SCOMPLEX; - cscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( 
dt, BLIS_SCALV_KER, cntx ); + cscalv_ker_ft f = bli_cntx_get_ukr_dt( dt, BLIS_SCALV_KER, cntx ); f ( @@ -469,4 +470,3 @@ void bli_cscalv_zen_int10 cntx ); } - diff --git a/kernels/zen/1f/bli_axpyf_zen_int_4.c b/kernels/zen/1f/bli_axpyf_zen_int_4.c index 5ddb56ac5..0ec5f44f5 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_4.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c @@ -36,7 +36,7 @@ #include "blis.h" - void bli_caxpyf_zen_int_4 +void bli_caxpyf_zen_int_4 ( conj_t conja, conj_t conjx, @@ -81,7 +81,7 @@ { if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx ); + caxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { diff --git a/kernels/zen/1f/bli_axpyf_zen_int_5.c b/kernels/zen/1f/bli_axpyf_zen_int_5.c index 15a64d596..1566f9809 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_5.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_5.c @@ -108,8 +108,9 @@ void bli_saxpyf_zen_int_5 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - if(cntx == NULL) cntx = bli_gks_query_cntx(); - saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { @@ -131,7 +132,7 @@ void bli_saxpyf_zen_int_5 cntx ); } - + return; } @@ -359,7 +360,9 @@ void bli_daxpyf_zen_int_5 // operation as a loop over axpyv. 
if ( b_n != fuse_fac ) { - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { @@ -381,7 +384,7 @@ void bli_daxpyf_zen_int_5 cntx ); } - + return; } @@ -559,7 +562,7 @@ void bli_daxpyf_zen_int_5 // ----------------------------------------------------------------------------- -static void bli_daxpyf_zen_int_16x2 +void bli_daxpyf_zen_int_16x2 ( conj_t conja, conj_t conjx, @@ -608,7 +611,7 @@ static void bli_daxpyf_zen_int_16x2 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { @@ -843,6 +846,7 @@ static void bli_daxpyf_zen_int_16x2 } // ----------------------------------------------------------------------------- + void bli_daxpyf_zen_int_16x4 ( conj_t conja, @@ -895,8 +899,9 @@ void bli_daxpyf_zen_int_16x4 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - if(cntx == NULL) cntx = bli_gks_query_cntx(); - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { diff --git a/kernels/zen/1f/bli_axpyf_zen_int_8.c b/kernels/zen/1f/bli_axpyf_zen_int_8.c index b958600ce..15fdf4651 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_8.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_8.c @@ -104,7 +104,7 @@ void bli_saxpyf_zen_int_8 // operation as a loop over axpyv. 
if ( b_n != fuse_fac ) { - saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); + saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { @@ -313,7 +313,7 @@ void bli_daxpyf_zen_int_8 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { diff --git a/kernels/zen/1f/bli_dotxf_zen_int_8.c b/kernels/zen/1f/bli_dotxf_zen_int_8.c index e40c785d8..1f4a671b6 100644 --- a/kernels/zen/1f/bli_dotxf_zen_int_8.c +++ b/kernels/zen/1f/bli_dotxf_zen_int_8.c @@ -78,8 +78,8 @@ void bli_sdotxf_zen_int_8 // simplifies to updating y. if ( bli_zero_dim1( m ) || PASTEMAC(s,eq0)( *alpha ) ) { - sscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SCALV_KER, cntx ); - + sscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SCALV_KER, cntx ); + f ( BLIS_NO_CONJUGATE, @@ -95,7 +95,7 @@ void bli_sdotxf_zen_int_8 // operation as a loop over dotxv. if ( b_n != fuse_fac ) { - sdotxv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_DOTXV_KER, cntx ); + sdotxv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_DOTXV_KER, cntx ); for ( dim_t i = 0; i < b_n; ++i ) { @@ -468,8 +468,8 @@ void bli_ddotxf_zen_int_8 // simplifies to updating y. if ( bli_zero_dim1( m ) || PASTEMAC(d,eq0)( *alpha ) ) { - dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); - + dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); + f ( BLIS_NO_CONJUGATE, @@ -485,7 +485,7 @@ void bli_ddotxf_zen_int_8 // operation as a loop over dotxv. 
if ( b_n != fuse_fac ) { - ddotxv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_DOTXV_KER, cntx ); + ddotxv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXV_KER, cntx ); for ( dim_t i = 0; i < b_n; ++i ) { diff --git a/ref_kernels/1/bli_axpbyv_ref.c b/ref_kernels/1/bli_axpbyv_ref.c index 2e648bbd6..2da4bc928 100644 --- a/ref_kernels/1/bli_axpbyv_ref.c +++ b/ref_kernels/1/bli_axpbyv_ref.c @@ -60,7 +60,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \ + PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \ \ setv_p \ ( \ @@ -83,7 +83,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,scalv_ker_ft) scalv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCALV_KER, cntx ); \ + PASTECH(ch,scalv_ker_ft) scalv_p = bli_cntx_get_ukr_dt( dt, BLIS_SCALV_KER, cntx ); \ \ scalv_p \ ( \ @@ -105,7 +105,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ @@ -123,7 +123,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ + PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ @@ -141,7 +141,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,xpbyv_ker_ft) xpbyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_XPBYV_KER, cntx ); \ + PASTECH(ch,xpbyv_ker_ft) xpbyv_p = bli_cntx_get_ukr_dt( dt, BLIS_XPBYV_KER, cntx ); \ \ xpbyv_p \ ( \ @@ -163,7 +163,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,scal2v_ker_ft) scal2v_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCAL2V_KER, cntx ); \ + PASTECH(ch,scal2v_ker_ft) scal2v_p = bli_cntx_get_ukr_dt( dt, BLIS_SCAL2V_KER, cntx ); \ \ scal2v_p \ ( \ @@ -182,7 +182,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,axpyv_ker_ft) axpyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + PASTECH(ch,axpyv_ker_ft) axpyv_p = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ axpyv_p \ ( \ diff --git a/ref_kernels/1/bli_axpyv_ref.c b/ref_kernels/1/bli_axpyv_ref.c index 31fece0a0..30076ddaf 100644 --- a/ref_kernels/1/bli_axpyv_ref.c +++ b/ref_kernels/1/bli_axpyv_ref.c @@ -58,7 +58,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ + PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ @@ -148,7 +148,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ + PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c index 1dcb03839..ba0595990 100644 --- a/ref_kernels/1/bli_scal2v_ref.c +++ b/ref_kernels/1/bli_scal2v_ref.c @@ -57,7 +57,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \ + PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \ \ setv_p \ ( \ @@ -75,7 +75,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c index 4945b637b..3e6be7492 100644 --- a/ref_kernels/1/bli_scalv_ref.c +++ b/ref_kernels/1/bli_scalv_ref.c @@ -58,7 +58,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \ + PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \ \ setv_p \ ( \ diff --git a/ref_kernels/1/bli_xpbyv_ref.c b/ref_kernels/1/bli_xpbyv_ref.c index 8101023d4..28286a5f8 100644 --- a/ref_kernels/1/bli_xpbyv_ref.c +++ b/ref_kernels/1/bli_xpbyv_ref.c @@ -54,7 +54,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ @@ -71,7 +71,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ + PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ diff --git a/ref_kernels/1f/bli_axpy2v_ref.c b/ref_kernels/1f/bli_axpy2v_ref.c index 9c08c96f1..6439ff8b0 100644 --- a/ref_kernels/1f/bli_axpy2v_ref.c +++ b/ref_kernels/1f/bli_axpy2v_ref.c @@ -110,7 +110,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,axpyv_ker_ft) kfp_av \ = \ - bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ kfp_av \ ( \ diff --git a/ref_kernels/1f/bli_axpyf_ref.c b/ref_kernels/1f/bli_axpyf_ref.c index f001108e2..5799a03a6 100644 --- a/ref_kernels/1f/bli_axpyf_ref.c +++ b/ref_kernels/1f/bli_axpyf_ref.c @@ -97,7 +97,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,axpyv_ker_ft) kfp_av \ = \ - bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( dim_t i = 0; i < b_n; ++i ) \ { \ diff --git a/ref_kernels/1f/bli_dotaxpyv_ref.c b/ref_kernels/1f/bli_dotaxpyv_ref.c index faeef5dea..42936c650 100644 --- a/ref_kernels/1f/bli_dotaxpyv_ref.c +++ b/ref_kernels/1f/bli_dotaxpyv_ref.c @@ -132,10 +132,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,dotv_ker_ft) kfp_dv \ = \ - bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \ PASTECH(ch,axpyv_ker_ft) kfp_av \ = \ - 
bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ kfp_dv \ ( \ diff --git a/ref_kernels/1f/bli_dotxaxpyf_ref.c b/ref_kernels/1f/bli_dotxaxpyf_ref.c index c61217941..990133621 100644 --- a/ref_kernels/1f/bli_dotxaxpyf_ref.c +++ b/ref_kernels/1f/bli_dotxaxpyf_ref.c @@ -165,10 +165,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,dotxf_ker_ft) kfp_df \ = \ - bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \ PASTECH(ch,axpyf_ker_ft) kfp_af \ = \ - bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \ \ kfp_df \ ( \ diff --git a/ref_kernels/1f/bli_dotxf_ref.c b/ref_kernels/1f/bli_dotxf_ref.c index 33f5d1ba5..86781fd58 100644 --- a/ref_kernels/1f/bli_dotxf_ref.c +++ b/ref_kernels/1f/bli_dotxf_ref.c @@ -113,7 +113,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,dotxv_ker_ft) kfp_dv \ = \ - bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( dim_t i = 0; i < b_n; ++i ) \ { \ diff --git a/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c b/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c index cc5852b37..e07090754 100644 --- a/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c +++ b/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c @@ -67,8 +67,8 @@ void PASTEMAC(ch,varname) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,dotxv_ft) kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ - PASTECH(ch,axpyv_ft) kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + PASTECH(ch,dotxv_ft) kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ + PASTECH(ch,axpyv_ft) kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ /* A is m x n. 
*/ \ /* y = beta * y + alpha * A^T w; */ \ diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c new file mode 100644 index 000000000..5cee5535b --- /dev/null +++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c @@ -0,0 +1,336 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define PACKM_SET1_1E( chr, mnk ) \ +do { \ + PASTEMAC(chr,set1s)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set0s)( *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set0s)( *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set1s)( *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ +} while (0) + +#define PACKM_SET1_1R( chr, mnk ) \ +do { \ + PASTEMAC(chr,set1s)( *(pi1_r + mnk*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set0s)( *(pi1_i + mnk*dfac + d + mnk*ldp2) ); \ +} while (0) + +#define PACKM_SCAL_1E( ch, mn, k, op ) \ +do { \ + PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn *inca2 + 0 + k*lda2), \ + *(alpha1 + mn *inca2 + 1 + k*lda2), \ + *(pi1_ri + (mn*2 + 0)*dfac + d + k*ldp2), \ + *(pi1_ri + (mn*2 + 1)*dfac + d + k*ldp2) ); \ + PASTEMAC(ch,op)( -kappa_i, kappa_r, *(alpha1 + mn *inca2 + 0 + k*lda2), \ + *(alpha1 + mn *inca2 + 1 + k*lda2), \ + *(pi1_ir + (mn*2 + 0)*dfac + d + k*ldp2), \ + *(pi1_ir + (mn*2 + 1)*dfac + d + k*ldp2) ); \ +} while (0) + +#define PACKM_SCAL_1R( ch, mn, k, op ) \ +do { \ + PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0 + k*lda2), \ + *(alpha1 + mn*inca2 + 1 + k*lda2), \ + *(pi1_r + mn*dfac + d + k*ldp2), \ + *(pi1_i + mn*dfac + d + k*ldp2) ); \ +} while (0) + +#define PACKM_DIAG_1E_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \ +\ +do \ +{ \ + /* PACKM_SCAL_1E assumes inca2 and lda2 are the strides to use. 
*/ \ + dim_t inca2 = inca2_lu; \ + dim_t lda2 = lda2_lu; \ + for ( dim_t k = 0; k < cdim; k++ ) \ + for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PACKM_SCAL_1E( ch, mn, k, op ); \ +} while(0) + +#define PACKM_DIAG_BODY_1E_L( ch, op ) \ + PACKM_DIAG_1E_BODY( ch, k+1, cdim, inca_l2, lda_l2, op ) + +#define PACKM_DIAG_BODY_1E_U( ch, op ) \ + PACKM_DIAG_1E_BODY( ch, 0, k, inca_u2, lda_u2, op ) + +#define PACKM_DIAG_1R_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \ +\ +do \ +{ \ + /* PACKM_SCAL_1R assumes inca2 and lda2 are the strides to use. */ \ + dim_t inca2 = inca2_lu; \ + dim_t lda2 = lda2_lu; \ + for ( dim_t k = 0; k < cdim; k++ ) \ + for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PACKM_SCAL_1R( ch, mn, k, op ); \ +} while(0) + +#define PACKM_DIAG_BODY_1R_L( ch, op ) \ + PACKM_DIAG_1R_BODY( ch, k+1, cdim, inca_l2, lda_l2, op ) + +#define PACKM_DIAG_BODY_1R_U( ch, op ) \ + PACKM_DIAG_1R_BODY( ch, 0, k, inca_u2, lda_u2, op ) + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \ +\ +void PASTEMAC3(ch,opname,arch,suf) \ + ( \ + struc_t struca, \ + diag_t diaga, \ + uplo_t uploa, \ + conj_t conja, \ + pack_t schema, \ + bool invdiag, \ + dim_t cdim, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict a, inc_t inca, inc_t lda, \ + ctype* restrict p, inc_t ldp, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt_r = PASTEMAC(chr,type); \ + const dim_t cdim_pack = bli_cntx_get_blksz_max_dt( dt_r, mnr0, cntx ); \ + const dim_t dfac = bli_cntx_get_blksz_def_dt( dt_r, bb0, cntx ); \ +\ + /* start by zeroing out the whole block */ \ + PASTEMAC(chr,set0s_mxn) \ + ( \ + cdim_pack, \ + 2*n_max, \ + ( ctype_r* )p, 1, ldp \ + ); \ +\ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ + ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ + 
ctype_r* restrict alpha1 = ( ctype_r* )a; \ +\ + if ( bli_is_1e_packed( schema ) ) \ + { \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \ +\ + ctype_r* restrict pi1_ri = ( ctype_r* )p; \ + ctype_r* restrict pi1_ir = ( ctype_r* )p + ldp; \ +\ + /* write the strictly lower part if it exists */ \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_l2 = inca2; \ + dim_t lda_l2 = lda2; \ + conj_t conja_l = conja; \ +\ + if ( bli_is_upper( uploa ) ) \ + { \ + bli_swap_incs( &inca_l2, &lda_l2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_l ); \ + } \ +\ + if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1E_L( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1E_L( ch, scal2ris ); \ + } \ +\ + /* write the strictly upper part if it exists */ \ + /* assume either symmetric, hermitian, or triangular */ \ + if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_u2 = inca2; \ + dim_t lda_u2 = lda2; \ + conj_t conja_u = conja; \ +\ + if ( bli_is_lower( uploa ) ) \ + { \ + bli_swap_incs( &inca_u2, &lda_u2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_u ); \ + } \ +\ + if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1E_U( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1E_U( ch, scal2ris ); \ + } \ +\ + /* write the diagonal */ \ + if ( bli_is_unit_diag( diaga ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1E( chr, mnk ); \ + } \ + else if ( bli_is_hermitian( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + ctype_r mu_r = *(alpha1 + mnk*(inca2 + lda2)); \ + PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,scal2s)( kappa_i, mu_r, *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,scal2s)( -kappa_i, mu_r, *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,scal2s)( 
kappa_r, mu_r, *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + } \ + } \ + else if ( bli_is_conj( conja )) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1E( ch, mnk, mnk, scal2jris ); \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1E( ch, mnk, mnk, scal2ris ); \ + } \ +\ + /* invert the diagonal if requested */ \ + if ( invdiag ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + PASTEMAC(ch,invertris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \ + *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(ch,copyjris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \ + *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2), \ + *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2), \ + *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + } \ + } \ +\ + /* if this an edge case in both directions, extend the diagonal with ones */ \ + for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1E( chr, mnk ); \ + } \ + else /* bli_is_1r_packed( schema ) */ \ + { \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \ +\ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + /* write the strictly lower part if it exists */ \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_l2 = inca2; \ + dim_t lda_l2 = lda2; \ + conj_t conja_l = conja; \ +\ + if ( bli_is_upper( uploa ) ) \ + { \ + bli_swap_incs( &inca_l2, &lda_l2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_l ); \ + } \ +\ + if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1R_L( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1R_L( ch, scal2ris ); \ + } \ +\ + /* write the strictly upper part if it exists */ \ + /* assume either symmetric, hermitian, or triangular */ \ + if ( 
bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_u2 = inca2; \ + dim_t lda_u2 = lda2; \ + conj_t conja_u = conja; \ +\ + if ( bli_is_lower( uploa ) ) \ + { \ + bli_swap_incs( &inca_u2, &lda_u2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_u ); \ + } \ +\ + if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1R_U( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1R_U( ch, scal2ris ); \ + } \ +\ + /* write the diagonal */ \ + if ( bli_is_unit_diag( diaga ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1R( chr, mnk ); \ + } \ + else if ( bli_is_hermitian( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + ctype_r mu_r = *(alpha1 + mnk*(inca2 + lda2)); \ + PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_r + mnk*(dfac + ldp2) + d) ); \ + PASTEMAC(chr,scal2s)( kappa_i, mu_r, *(pi1_i + mnk*(dfac + ldp2) + d) ); \ + } \ + } \ + else if ( bli_is_conj( conja ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1R( ch, mnk, mnk, scal2jris ); \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1R( ch, mnk, mnk, scal2ris ); \ + } \ +\ + /* invert the diagonal if requested */ \ + if ( invdiag ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,invertris)( *(pi1_r + mnk*(dfac + ldp2) + d), \ + *(pi1_i + mnk*(dfac + ldp2) + d) ); \ + } \ +\ + /* if this an edge case in both directions, extend the diagonal with ones */ \ + for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1R( chr, mnk ); \ + } \ +} + +INSERT_GENTFUNCCO_BASIC4( packm_mrxmr_diag_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC4( packm_nrxnr_diag_1er, BLIS_NR, BLIS_BBN, 
BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) + diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c new file mode 100644 index 000000000..80ffcbc14 --- /dev/null +++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c @@ -0,0 +1,173 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define PACKM_DIAG_BODY( ctype, ch, mn_min, mn_max, inca, lda, op ) \ +\ +do \ +{ \ + for ( dim_t k = 0; k < cdim; k++ ) \ + for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PASTEMAC(ch,op)( kappa_cast, *(alpha1 + mn*inca + k*lda), *(pi1 + mn*dfac + d + k*ldp) ); \ +} while(0) + +#define PACKM_DIAG_BODY_L( ctype, ch, op ) \ + PACKM_DIAG_BODY( ctype, ch, k+1, cdim, inca_l, lda_l, op ) + +#define PACKM_DIAG_BODY_U( ctype, ch, op ) \ + PACKM_DIAG_BODY( ctype, ch, 0, k, inca_u, lda_u, op ) + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \ +\ +void PASTEMAC3(ch,opname,arch,suf) \ + ( \ + struc_t struca, \ + diag_t diaga, \ + uplo_t uploa, \ + conj_t conja, \ + pack_t schema, \ + bool invdiag, \ + dim_t cdim, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict a, inc_t inca, inc_t lda, \ + ctype* restrict p, inc_t ldp, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt, mnr0, cntx ); \ + const dim_t cdim_pack = bli_cntx_get_blksz_max_dt( dt, mnr0, cntx ); \ + const dim_t dfac = bli_cntx_get_blksz_def_dt( dt, bb0, cntx ); \ +\ + /* start by zeroing out the whole block */ \ + PASTEMAC(ch,set0s_mxn) \ + ( \ + cdim_pack, \ + n_max, \ + p, 1, ldp \ + ); \ +\ + ctype kappa_cast = *( ctype* )kappa; \ + ctype* restrict alpha1 = a; \ + ctype* restrict pi1 = p; \ +\ + /* write the strictly lower part if it exists */ \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_l = inca; \ + dim_t lda_l = lda; \ + conj_t conja_l = conja; \ +\ + if ( bli_is_upper( uploa ) ) \ + { \ + bli_swap_incs( &inca_l, &lda_l ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_l ); \ + } \ +\ + if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_L( ctype, ch, scal2js ); \ + else PACKM_DIAG_BODY_L( ctype, ch, scal2s ); \ + } \ +\ + /* write the strictly upper part if 
it exists */ \ + /* assume either symmetric, hermitian, or triangular */ \ + if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_u = inca; \ + dim_t lda_u = lda; \ + conj_t conja_u = conja; \ +\ + if ( bli_is_lower( uploa ) ) \ + { \ + bli_swap_incs( &inca_u, &lda_u ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_u ); \ + } \ +\ + if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_U( ctype, ch, scal2js ); \ + else PACKM_DIAG_BODY_U( ctype, ch, scal2s ); \ + } \ +\ + /* write the diagonal */ \ + if ( bli_is_unit_diag( diaga ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ + else if ( bli_is_hermitian( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + ctype mu; \ + PASTEMAC(ch,copys)( *(alpha1 + mnk*(inca + lda)), mu ); \ + PASTEMAC(ch,seti0s)( mu ); \ + PASTEMAC(ch,scal2s)( kappa_cast, mu, *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ + } \ + else if ( bli_is_conj( conja )) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,scal2js)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,scal2s)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ +\ + /* invert the diagonal if requested */ \ + if ( invdiag ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,inverts)( *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ +\ + /* if this an edge case in both directions, extend the diagonal with ones */ \ + for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \ +} + 
+INSERT_GENTFUNC_BASIC4( packm_mrxmr_diag, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( packm_nrxnr_diag, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) + diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c index 03ec46d14..56d8379be 100644 --- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c @@ -34,8 +34,48 @@ #include "blis.h" +#define PACKM_1E_BODY( ctype, ch, pragma, cdim, inca2, op ) \ +\ +do \ +{ \ + for ( dim_t k = n; k != 0; --k ) \ + { \ + pragma \ + for ( dim_t mn = 0; mn < cdim; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \ + PASTEMAC(ch,op)( -kappa_i, kappa_r, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \ + } \ +\ + alpha1 += lda2; \ + pi1_ri += ldp2; \ + pi1_ir += ldp2; \ + } \ +} while(0) + +#define PACKM_1R_BODY( ctype, ch, pragma, cdim, inca2, op ) \ +\ +do \ +{ \ + for ( dim_t k = n; k != 0; --k ) \ + { \ + pragma \ + for ( dim_t mn = 0; mn < cdim; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \ +\ + alpha1 += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ +} while(0) + #undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ +#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ @@ -50,2169 +90,94 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx_t* restrict cntx \ ) \ { \ - if ( cdim == mnr ) \ + const dim_t dfac = PASTECH2(bb0, _, chr); \ + const num_t dt_r = PASTEMAC(chr,type); \ +\ + if ( bli_is_1e_packed( schema ) ) \ { \ - if ( 
bli_is_1e_packed( schema ) ) \ + /* cdim and mnr are in units of complex values */ \ + const dim_t mnr = PASTECH2(mnr0, _, chr) == -1 ? -1 : PASTECH2(mnr0, _, chr) / 2; \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \ +\ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ + ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ + ctype_r* restrict alpha1 = ( ctype_r* )a; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p; \ + ctype_r* restrict pi1_ir = ( ctype_r* )p + ldp; \ +\ + if ( cdim == mnr && mnr != -1 ) \ { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + if ( inca == 1 ) \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2jris ); \ + else PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2ris ); \ } \ else \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( 
*kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2jris ); \ + else PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2ris ); \ } \ } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ + else \ { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - 
{ \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, , cdim, inca2, scal2jris ); \ + else PACKM_1E_BODY( ctype, ch, , cdim, inca2, scal2ris ); \ } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ +\ + PASTEMAC(chr,set0s_edge) \ ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ + 2*cdim*dfac, 2*cdim_max*dfac, \ + 2*n, 2*n_max, \ + ( ctype_r* )p, ldp \ ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ } \ -\ - if ( n < n_max ) \ + else /* ( bli_is_1r_packed( schema ) ) */ \ { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ + const dim_t mnr = PASTECH2(mnr0, _, chr); \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \ \ - PASTEMAC(ch,set1ms_mxn) \ + const inc_t inca2 = 2 * inca; \ + const 
inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ + ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ + ctype_r* restrict alpha1 = ( ctype_r* )a; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( cdim == mnr && mnr != -1 ) \ + { \ + if ( inca == 1 ) \ + { \ + if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2jris ); \ + else PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2ris ); \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2jris ); \ + else PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2ris ); \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, , cdim, inca2, scal2jris ); \ + else PACKM_1R_BODY( ctype, ch, , cdim, inca2, scal2ris ); \ + } \ +\ + PASTEMAC(chr,set0s_edge) \ ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ + cdim*dfac, cdim_max*dfac, \ + 2*n, 2*n_max, \ + ( ctype_r* )p, ldp \ ); \ } \ } -INSERT_GENTFUNCCO_BASIC3( packm_2xk_1er, 2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - 
for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - 
} \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 
1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_4xk_1er, 4, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX 
) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ -\ - alpha1_ri 
+= lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* 
restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) 
); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, 
ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_6xk_1er, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; 
\ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), 
*(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i 
+ 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 
5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ 
- offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_8xk_1er, 8, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), 
*(pi1_ir + 7) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ 
- PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( 
*kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) 
); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), 
*(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - 
schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_10xk_1er, 10, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri 
+11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri 
+ 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 
* lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) 
); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, 
*(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, 
*kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_12xk_1er, 12, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = 
lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - 
PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal2j1es)( 
*kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const 
inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); 
\ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), 
*(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i 
+ 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, 
ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_14xk_1er, 14, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri 
+ 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ - 
PASTEMAC(ch,copy1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 
0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( 
ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { 
\ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, 
*kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - 
PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ -\ - alpha1_r += lda2; \ - 
alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_16xk_1er, 16, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC4( packm_mrxk_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC4( packm_nrxk_1er, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/1m/bli_packm_cxk_bb_ref.c b/ref_kernels/1m/bli_packm_cxk_bb_ref.c deleted file mode 100644 index e7498a735..000000000 --- a/ref_kernels/1m/bli_packm_cxk_bb_ref.c +++ /dev/null @@ -1,656 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// -- 6xk, duplication factor 2 ------------------------------------------------ - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - const dim_t dfac = 2; \ -\ - /* Handle the packing of B (column panel schemas) separately from packing - of A (row panel schemas). 
*/ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, 
*(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal2bbs_mxn) \ - ( \ - conja, \ 
- cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, dfac, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*dfac; \ -\ - PASTEMAC(ch,set0bbs_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, dfac, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0bbs_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, dfac, ldp \ - ); \ - } \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 
0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal2s_mxn) \ - ( \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_6xk_bb2, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - -// -- 6xk, duplication factor 4 ------------------------------------------------ - -#undef GENTFUNC -#define GENTFUNC( ctype, 
ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - const dim_t dfac = 4; \ -\ - /* Handle the packing of B (column panel schemas) separately from packing - of A (row panel schemas). */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 13) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 14) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 15) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 16) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 17) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 18) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 19) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 20) ); 
\ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 21) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 22) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 11) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 12) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 13) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 14) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 15) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 16) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 17) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 18) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 19) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 20) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 21) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 22) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) 
); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 11) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 12) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 13) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 14) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 15) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 16) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 17) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 18) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 19) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 20) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 21) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 22) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ - 
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 11) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 12) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 13) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 14) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 15) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 16) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 17) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 18) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 19) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 20) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 21) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 22) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal2bbs_mxn) \ - ( \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, dfac, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict 
p_edge = p_cast + (i )*dfac; \ -\ - PASTEMAC(ch,set0bbs_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, dfac, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0bbs_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, dfac, ldp \ - ); \ - } \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - 
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal2s_mxn) \ - ( \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_6xk_bb4, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c index c98f1b250..eefdb464b 100644 --- a/ref_kernels/1m/bli_packm_cxk_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_ref.c @@ -34,8 +34,24 @@ #include "blis.h" +#define PACKM_BODY( ctype, ch, pragma, cdim, inca, op ) \ +\ +do \ +{ \ + for ( dim_t k = n; k != 0; --k ) \ + { \ + pragma \ + for ( 
dim_t mn = 0; mn < cdim; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PASTEMAC(ch,op)( kappa_cast, *(alpha1 + mn*inca), *(pi1 + mn*dfac + d) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ +} while(0) + #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ +#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ @@ -50,1673 +66,42 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx_t* restrict cntx \ ) \ { \ - ctype* restrict kappa_cast = kappa; \ + const dim_t mnr = PASTECH2(mnr0, _, ch); \ + const num_t dt = PASTEMAC(ch,type); \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt, mnr0, cntx ); \ + const dim_t dfac = PASTECH2(bb0, _, ch); \ +\ + ctype kappa_cast = *( ctype* )kappa; \ ctype* restrict alpha1 = a; \ ctype* restrict pi1 = p; \ \ - dim_t n_iter = n / 4; \ - dim_t n_left = n % 4; \ -\ - if ( cdim == mnr ) \ + if ( cdim == mnr && mnr != -1 ) \ { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + if ( inca == 1 ) \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n_iter != 0; --n_iter ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 2*lda), *(pi1 + 0 + 2*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 2*lda), *(pi1 + 1 + 2*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 3*lda), *(pi1 + 0 + 3*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 3*lda), *(pi1 + 1 + 3*ldp) ); \ -\ - alpha1 += 4*lda; \ - pi1 += 4*ldp; \ - } \ -\ 
- for ( ; n_left != 0; --n_left ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2js ); \ + else PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2s ); \ } \ else \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2js ); \ + else PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2s ); \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, , cdim, inca, scal2js ); \ + else PACKM_BODY( ctype, ch, , cdim, inca, scal2s ); \ } \ \ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ 
- n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ + PASTEMAC(ch,set0s_edge) \ + ( \ + cdim*dfac, cdim_max*dfac, \ + n, n_max, \ + p, ldp \ + ); \ } -INSERT_GENTFUNC_BASIC3( packm_2xk, 2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - dim_t n_iter = n / 4; \ - dim_t n_left = n % 4; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n_iter != 0; --n_iter ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 2*lda), *(pi1 + 0 + 2*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 2*lda), *(pi1 + 1 + 2*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 2*lda), *(pi1 + 2 + 2*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 3*lda), *(pi1 + 0 + 3*ldp) ); \ - PASTEMAC2(ch,ch,copys)( 
*(alpha1 + 1*inca + 3*lda), *(pi1 + 1 + 3*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 3*lda), *(pi1 + 2 + 3*ldp) ); \ -\ - alpha1 += 4*lda; \ - pi1 += 4*ldp; \ - } \ -\ - for ( ; n_left != 0; --n_left ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - 
m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_3xk, 3, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - dim_t n_iter = n / 2; \ - dim_t n_left = n % 2; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n_iter != 0; --n_iter ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca + 0*lda), *(pi1 + 3 + 0*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca + 1*lda), *(pi1 + 3 + 1*ldp) ); \ -\ - alpha1 += 2*lda; \ - pi1 += 2*ldp; \ - } \ -\ - for ( ; n_left != 0; --n_left ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 
+ 1) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_4xk, 4, BLIS_CNAME_INFIX, 
BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, 
*(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_6xk, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* 
restrict pi1 = p; \ -\ - dim_t n_iter = n / 2; \ - dim_t n_left = n % 2; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n_iter != 0; --n_iter ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca + 0*lda), *(pi1 + 3 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca + 0*lda), *(pi1 + 4 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca + 0*lda), *(pi1 + 5 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca + 0*lda), *(pi1 + 6 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca + 0*lda), *(pi1 + 7 + 0*ldp) ); \ -\ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca + 1*lda), *(pi1 + 3 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca + 1*lda), *(pi1 + 4 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca + 1*lda), *(pi1 + 5 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca + 1*lda), *(pi1 + 6 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca + 1*lda), *(pi1 + 7 + 1*ldp) 
); \ -\ - alpha1 += 2*lda; \ - pi1 += 2*ldp; \ - } \ -\ - for ( ; n_left != 0; --n_left ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += 
lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_8xk, 8, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - 
PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - 
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_10xk, 10, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* 
restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - 
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - 
); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_12xk, 12, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), 
*(pi1 +11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 
+10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const 
dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_14xk, 14, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - 
{ \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, 
*(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* 
restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_16xk, 16, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 
+14) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +16*inca), *(pi1 +16) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +17*inca), *(pi1 +17) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +18*inca), *(pi1 +18) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +19*inca), *(pi1 +19) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +20*inca), *(pi1 +20) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +21*inca), *(pi1 +21) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +22*inca), *(pi1 +22) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +23*inca), *(pi1 +23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \ - PASTEMAC(ch,copys)( *(alpha1 +16*inca), *(pi1 +16) ); \ - PASTEMAC(ch,copys)( *(alpha1 +17*inca), *(pi1 +17) ); \ - PASTEMAC(ch,copys)( *(alpha1 +18*inca), *(pi1 +18) ); \ - PASTEMAC(ch,copys)( *(alpha1 +19*inca), *(pi1 +19) ); \ - PASTEMAC(ch,copys)( *(alpha1 +20*inca), *(pi1 +20) ); \ - PASTEMAC(ch,copys)( *(alpha1 +21*inca), *(pi1 +21) ); \ - PASTEMAC(ch,copys)( *(alpha1 +22*inca), *(pi1 +22) ); \ - 
PASTEMAC(ch,copys)( *(alpha1 +23*inca), *(pi1 +23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +16*inca), *(pi1 +16) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +17*inca), *(pi1 +17) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +18*inca), *(pi1 +18) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +19*inca), *(pi1 +19) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +20*inca), *(pi1 +20) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +21*inca), *(pi1 +21) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +22*inca), *(pi1 +22) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +23*inca), *(pi1 +23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - 
} \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +16*inca), *(pi1 +16) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +17*inca), *(pi1 +17) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +18*inca), *(pi1 +18) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +19*inca), *(pi1 +19) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +20*inca), *(pi1 +20) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +21*inca), *(pi1 +21) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +22*inca), *(pi1 +22) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +23*inca), *(pi1 +23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( 
trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_24xk, 24, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( packm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( packm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c index 00dc02eb4..73d98e268 100644 --- a/ref_kernels/1m/bli_unpackm_cxk_ref.c +++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c @@ -34,816 +34,64 @@ #include "blis.h" +#define UNPACKM_BODY( ctype, ch, pragma, cdim, inca, op ) \ +\ +do \ +{ \ + for ( dim_t k = n; k != 0; --k ) \ + { \ + pragma \ + for ( dim_t mn = 0; mn < cdim; mn++ ) \ + PASTEMAC(ch,op)( *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*inca) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ +} while(0) + #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ +#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ - conj_t conjp, \ + conj_t conja, \ + pack_t schema, \ + dim_t cdim, \ dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ + ctype* restrict kappa, \ + ctype* restrict p, inc_t ldp, \ + ctype* restrict a, inc_t 
inca, inc_t lda, \ + cntx_t* restrict cntx \ ) \ { \ + const dim_t mnr = PASTECH2(mnr0, _, ch); \ + /* It's not clear if unpack needs to care about BB storage... */ \ + const dim_t dfac = PASTECH2(bb0, _, ch); \ +\ ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ ctype* restrict alpha1 = a; \ + ctype* restrict pi1 = p; \ \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + if ( cdim == mnr && mnr != -1 ) \ { \ - if ( bli_is_conj( conjp ) ) \ + if ( inca == 1 ) \ { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ + if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2js ); \ + else UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2s ); \ } \ else \ { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ + if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2js ); \ + else UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2s ); \ } \ } \ - else \ + else /* if ( cdim < mnr ) */ \ { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ + if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, , cdim, inca, scal2js ); \ + else UNPACKM_BODY( ctype, ch, , cdim, inca, scal2s ); \ } \ } -INSERT_GENTFUNC_BASIC2( unpackm_2xk, 
BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 
3), *(alpha1 + 3*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_4xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - 
PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_6xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - 
PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) 
); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_8xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 
9*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_10xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef 
GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - 
PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( 
*kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_12xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - 
PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ 
- PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_14xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 
0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 14), *(alpha1 + 14*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 15), *(alpha1 + 15*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 14), *(alpha1 + 
14*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 15), *(alpha1 + 15*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 14), *(alpha1 + 14*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 15), *(alpha1 + 15*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) 
); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 14), *(alpha1 + 14*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 15), *(alpha1 + 15*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_16xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( unpackm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( unpackm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/3/bb/bli_gemmbb_ref.c b/ref_kernels/3/bb/bli_gemmbb_ref.c deleted file mode 100644 index 4c75c064c..000000000 --- a/ref_kernels/3/bb/bli_gemmbb_ref.c +++ /dev/null @@ -1,141 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// An implementation that indexes through B with the assumption that all -// elements were broadcast (duplicated) by a factor of NP/NR. 
- -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ -\ - /* Assume that the degree of duplication is equal to packnr / nr. */ \ - const inc_t cs_b = packnr / nr; \ -\ - ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = mr; \ -\ - dim_t l, j, i; \ -\ - ctype ai; \ - ctype bj; \ -\ -\ - /* Initialize the accumulator elements in ab to zero. */ \ - for ( i = 0; i < m * n; ++i ) \ - { \ - PASTEMAC(ch,set0s)( *(ab + i) ); \ - } \ -\ - /* Perform a series of k rank-1 updates into ab. */ \ - for ( l = 0; l < k; ++l ) \ - { \ - ctype* restrict abij = ab; \ -\ - /* In an optimized implementation, these two loops over MR and NR - are typically fully unrolled. */ \ - for ( j = 0; j < n; ++j ) \ - { \ - bj = *(b + j*cs_b); \ -\ - for ( i = 0; i < m; ++i ) \ - { \ - ai = *(a + i); \ -\ - PASTEMAC(ch,dots)( ai, bj, *abij ); \ -\ - abij += rs_ab; \ - } \ - } \ -\ - a += cs_a; \ - b += rs_b; \ - } \ -\ - /* Scale the result in ab by alpha. */ \ - for ( i = 0; i < m * n; ++i ) \ - { \ - PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \ - } \ -\ - /* If beta is zero, overwrite c with the scaled result in ab. Otherwise, - scale by beta and then add the scaled redult in ab. 
*/ \ - if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,copys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - c, rs_c, cs_c ); \ - } \ - else \ - { \ - PASTEMAC(ch,xpbys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - beta, \ - c, rs_c, cs_c ); \ - } \ -} - -INSERT_GENTFUNC_BASIC2( gemmbb, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - diff --git a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c b/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c deleted file mode 100644 index dd4e1f153..000000000 --- a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// An implementation that indexes through B with the assumption that all -// elements were broadcast (duplicated) by a factor of NP/NR. - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a1x, \ - ctype* restrict a11, \ - ctype* restrict bx1, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t rs_b = packnr; \ -\ - /* Assume that the degree of duplication is equal to packnr / nr. 
*/ \ - const inc_t cs_b = packnr / nr; \ -/* -printf( "bli_gemmtrsmbb_ref(): cs_b = %d\n", (int)cs_b ); \ -printf( "bli_gemmtrsmbb_ref(): k nr = %d %d\n", (int)k, (int)nr ); \ -*/ \ -\ - ctype* minus_one = PASTEMAC(ch,m1); \ -\ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ - PASTECH(ch,trsm_ukr_ft) \ - trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \ -\ -/* -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \ - (double*)bx1, rs_b, cs_b, "%5.2f", "" ); \ -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \ - (double*)b11, rs_b, 1, "%5.2f", "" ); \ -*/ \ -\ - /* lower: b11 = alpha * b11 - a10 * b01; */ \ - /* upper: b11 = alpha * b11 - a12 * b21; */ \ - gemm_ukr \ - ( \ - mr, \ - nr, \ - k, \ - minus_one, \ - a1x, \ - bx1, \ - alpha, \ - b11, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -/* -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \ - (double*)b11, rs_b, 1, "%5.2f", "" ); \ -*/ \ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - trsm_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, \ - cntx \ - ); \ -/* -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \ - (double*)b11, rs_b, 1, "%5.2f", "" ); \ -*/ \ -\ - /* Broadcast the elements of the updated b11 submatrix to their - duplicated neighbors. 
*/ \ - PASTEMAC(ch,bcastbbs_mxn) \ - ( \ - mr, \ - nr, \ - b11, rs_b, cs_b \ - ); \ -\ -/* -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_r after", k+3, 8, \ - ( double* )b01, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \ -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_i after", k+3, 8, \ - ( double* )b01 + 1, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNC_BASIC3( gemmtrsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR ) -INSERT_GENTFUNC_BASIC3( gemmtrsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR ) - diff --git a/ref_kernels/3/bb/bli_trsmbb_ref.c b/ref_kernels/3/bb/bli_trsmbb_ref.c deleted file mode 100644 index e3f5500cc..000000000 --- a/ref_kernels/3/bb/bli_trsmbb_ref.c +++ /dev/null @@ -1,214 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// An implementation that indexes through B with the assumption that all -// elements were broadcast (duplicated) by a factor of NP/NR. - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ -\ - /* Assume that the degree of duplication is equal to packnr / nr. 
*/ \ - const inc_t cs_b = packnr / nr; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = iter; \ - n_behind = i; \ -\ - ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ - ctype* restrict a10t = a + (i )*rs_a + (0 )*cs_a; \ - ctype* restrict B0 = b + (0 )*rs_b + (0 )*cs_b; \ - ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a10t * B0; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype* restrict b01 = B0 + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype beta11c = *beta11; \ - ctype rho11; \ -\ - /* beta11 = beta11 - a10t * b01; */ \ - PASTEMAC(ch,set0s)( rho11 ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype* restrict alpha10 = a10t + (l )*cs_a; \ - ctype* restrict beta01 = b01 + (l )*rs_b; \ -\ - PASTEMAC(ch,axpys)( *alpha10, *beta01, rho11 ); \ - } \ - PASTEMAC(ch,subs)( rho11, beta11c ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: When preinversion is enabled, the INVERSE of alpha11 - (1.0/alpha11) is stored during packing instead alpha11 so we - can multiply rather than divide. When preinversion is disabled, - alpha11 is stored and division happens below explicitly. */ \ - PASTEMAC(ch,scals)( *alpha11, beta11c ); \ -\ - /* Output final result to matrix c. */ \ - PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ -\ - /* Store the local value back to b11. 
*/ \ - PASTEMAC(ch,copys)( beta11c, *beta11 ); \ - } \ - } \ -} - -#ifdef BLIS_ENABLE_TRSM_PREINVERSION -INSERT_GENTFUNC_BASIC3( trsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals ) -#else -INSERT_GENTFUNC_BASIC3( trsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals ) -#endif - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ -\ - /* Assume that the degree of duplication is equal to packnr / nr. 
*/ \ - const inc_t cs_b = packnr / nr; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = m - iter - 1; \ - n_behind = iter; \ -\ - ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ - ctype* restrict a12t = a + (i )*rs_a + (i+1)*cs_a; \ - ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict B2 = b + (i+1)*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a12t * B2; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict b21 = B2 + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype beta11c = *beta11; \ - ctype rho11; \ -\ - /* beta11 = beta11 - a12t * b21; */ \ - PASTEMAC(ch,set0s)( rho11 ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype* restrict alpha12 = a12t + (l )*cs_a; \ - ctype* restrict beta21 = b21 + (l )*rs_b; \ -\ - PASTEMAC(ch,axpys)( *alpha12, *beta21, rho11 ); \ - } \ - PASTEMAC(ch,subs)( rho11, beta11c ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: When preinversion is enabled, the INVERSE of alpha11 - (1.0/alpha11) is stored during packing instead alpha11 so we - can multiply rather than divide. When preinversion is disabled, - alpha11 is stored and division happens below explicitly. */ \ - PASTEMAC(ch,diagop)( *alpha11, beta11c ); \ -\ - /* Output final result to matrix c. */ \ - PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ -\ - /* Store the local value back to b11. 
*/ \ - PASTEMAC(ch,copys)( beta11c, *beta11 ); \ - } \ - } \ -} - -#ifdef BLIS_ENABLE_TRSM_PREINVERSION -INSERT_GENTFUNC_BASIC3( trsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals ) -#else -INSERT_GENTFUNC_BASIC3( trsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals ) -#endif - diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c index 51ff9df4b..f284acb98 100644 --- a/ref_kernels/3/bli_gemm_ref.c +++ b/ref_kernels/3/bli_gemm_ref.c @@ -34,13 +34,114 @@ #include "blis.h" -#if 1 +// Completely generic gemm ukr implementation which checks MR/NR at +// runtime. Very slow, but has to be used in certain cases. + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, arch, suf ) \ +\ +static void PASTEMAC3(ch,opname,arch,suf) \ + ( \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ + const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \ + const inc_t cs_a = packmr; \ +\ + const inc_t rs_b = packnr; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ +\ + ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t rs_ab = 1; \ + const inc_t cs_ab = m; \ +\ + dim_t l, j, i; \ +\ + ctype ai; \ + ctype bj; \ +\ +\ + /* Initialize the accumulator elements in ab to zero. */ \ + for ( i = 0; i < m * n; ++i ) \ + { \ + PASTEMAC(ch,set0s)( *(ab + i) ); \ + } \ +\ + /* Perform a series of k rank-1 updates into ab. 
*/ \ + for ( l = 0; l < k; ++l ) \ + { \ + ctype* restrict abij = ab; \ +\ + /* In an optimized implementation, these two loops over MR and NR + are typically fully unrolled. */ \ + for ( j = 0; j < n; ++j ) \ + { \ + bj = *(b + j*cs_b); \ +\ + for ( i = 0; i < m; ++i ) \ + { \ + ai = *(a + i*rs_a); \ +\ + PASTEMAC(ch,dots)( ai, bj, *abij ); \ +\ + abij += rs_ab; \ + } \ + } \ +\ + a += cs_a; \ + b += rs_b; \ + } \ +\ + /* Scale the result in ab by alpha. */ \ + for ( i = 0; i < m * n; ++i ) \ + { \ + PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \ + } \ +\ + /* If beta is zero, overwrite c with the scaled result in ab. Otherwise, + scale by beta and then add the scaled redult in ab. */ \ + if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + PASTEMAC(ch,copys_mxn)( m, \ + n, \ + ab, rs_ab, cs_ab, \ + c, rs_c, cs_c ); \ + } \ + else \ + { \ + PASTEMAC(ch,xpbys_mxn)( m, \ + n, \ + ab, rs_ab, cs_ab, \ + beta, \ + c, rs_c, cs_c ); \ + } \ +} + +INSERT_GENTFUNC_BASIC2( gemm_gen, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) // An implementation that attempts to facilitate emission of vectorized // instructions via constant loop bounds + #pragma omp simd directives. +// If compile-time MR/NR are not available (indicated by BLIS_[MN]R_x = -1), +// then the non-unrolled version (above) is used. 
#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf, mr, nr ) \ +#define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ @@ -56,14 +157,38 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx_t* restrict cntx \ ) \ { \ - ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = nr; \ - const inc_t cs_ab = 1; \ \ - const inc_t cs_a = mr; \ - const inc_t rs_b = nr; \ + const dim_t mr = PASTECH(BLIS_MR_,ch); \ + const dim_t nr = PASTECH(BLIS_NR_,ch); \ +\ + if ( mr == -1 || nr == -1 ) \ + { \ + PASTEMAC3(ch,gemm_gen,arch,suf) \ + ( \ + m, \ + n, \ + k, \ + alpha, \ + a, \ + b, \ + beta, \ + c, rs_c, cs_c, \ + data, \ + cntx \ + ); \ + return; \ + } \ +\ + ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t rs_ab = nr; \ + const inc_t cs_ab = 1; \ +\ + const inc_t rs_a = PASTECH(BLIS_BBM_,ch); \ + const inc_t cs_a = PASTECH(BLIS_PACKMR_,ch); \ + const inc_t rs_b = PASTECH(BLIS_PACKNR_,ch); \ + const inc_t cs_b = PASTECH(BLIS_BBN_,ch); \ \ \ /* Initialize the accumulator elements in ab to zero. */ \ @@ -83,8 +208,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ PASTEMAC(ch,dots) \ ( \ - a[ i ], \ - b[ j ], \ + a[ i*rs_a ], \ + b[ j*cs_b ], \ ab[ i*rs_ab + j*cs_ab ] \ ); \ } \ @@ -157,115 +282,6 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ } -//INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) -GENTFUNC( float, s, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 16 ) -GENTFUNC( double, d, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) -GENTFUNC( scomplex, c, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) -GENTFUNC( dcomplex, z, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 ) - -#else - -// An implementation that uses variable loop bounds (queried from the context) -// and makes no use of #pragma omp simd. 
- -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ -\ - ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = mr; \ -\ - dim_t l, j, i; \ -\ - ctype ai; \ - ctype bj; \ -\ -\ - /* Initialize the accumulator elements in ab to zero. */ \ - for ( i = 0; i < m * n; ++i ) \ - { \ - PASTEMAC(ch,set0s)( *(ab + i) ); \ - } \ -\ - /* Perform a series of k rank-1 updates into ab. */ \ - for ( l = 0; l < k; ++l ) \ - { \ - ctype* restrict abij = ab; \ -\ - /* In an optimized implementation, these two loops over MR and NR - are typically fully unrolled. */ \ - for ( j = 0; j < n; ++j ) \ - { \ - bj = *(b + j); \ -\ - for ( i = 0; i < m; ++i ) \ - { \ - ai = *(a + i); \ -\ - PASTEMAC(ch,dots)( ai, bj, *abij ); \ -\ - abij += rs_ab; \ - } \ - } \ -\ - a += cs_a; \ - b += rs_b; \ - } \ -\ - /* Scale the result in ab by alpha. */ \ - for ( i = 0; i < m * n; ++i ) \ - { \ - PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \ - } \ -\ - /* If beta is zero, overwrite c with the scaled result in ab. Otherwise, - scale by beta and then add the scaled redult in ab. 
*/ \ - if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,copys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - c, rs_c, cs_c ); \ - } \ - else \ - { \ - PASTEMAC(ch,xpbys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - beta, \ - c, rs_c, cs_c ); \ - } \ -} - INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) -#endif diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c index 30fc3fcd6..046aa5617 100644 --- a/ref_kernels/3/bli_gemmtrsm_ref.c +++ b/ref_kernels/3/bli_gemmtrsm_ref.c @@ -34,6 +34,9 @@ #include "blis.h" +// An implementation that indexes through B with the assumption that all +// elements were broadcast (duplicated) by a factor of NP/NR. + #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \ \ @@ -60,21 +63,38 @@ void PASTEMAC3(ch,opname,arch,suf) \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ +/* +printf( "bli_gemmtrsm_ref(): cs_b = %d\n", (int)cs_b ); \ +printf( "bli_gemmtrsm_ref(): k nr = %d %d\n", (int)k, (int)nr ); \ +*/ \ \ ctype* minus_one = PASTEMAC(ch,m1); \ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ PASTECH(ch,trsm_ukr_ft) \ - trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \ + trsm_ukr = bli_cntx_get_ukr_dt( dt, trsmkerid, cntx ); \ +\ +/* +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \ + (double*)bx1, rs_b, cs_b, "%5.2f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \ + (double*)b11, rs_b, 1, "%5.2f", "" ); \ +*/ \ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - /* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR - instead? 
*/ \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + /* to FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR + instead? + + to DAM: Given that this reference kernel is implemented in terms of gemm, + I think that is the preference we want to query. There might be other + circumstances where we would want the gemmtrsm_? operations to have + and exercise their own IO preferences -- I'd have to think about it -- + but this doesn't seem to be one of them. */ \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : nr ); \ const inc_t cs_ct = ( col_pref ? mr : 1 ); \ \ @@ -106,6 +126,19 @@ void PASTEMAC3(ch,opname,arch,suf) \ data, \ cntx \ ); \ +/* +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \ + (double*)b11, rs_b, 1, "%5.2f", "" ); \ +*/ \ +\ + /* Broadcast the elements of the updated b11 submatrix to their + duplicated neighbors. */ \ + PASTEMAC(ch,bcastbbs_mxn) \ + ( \ + m, \ + n, \ + b11, rs_b, cs_b \ + ); \ \ /* b11 = inv(a11) * b11; c11 = b11; */ \ @@ -117,6 +150,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ data, \ cntx \ ); \ +/* +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \ + (double*)b11, rs_b, 1, "%5.2f", "" ); \ +*/ \ \ if ( use_ct ) \ { \ diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c index 786f1129d..8234a84cc 100644 --- a/ref_kernels/3/bli_trsm_ref.c +++ b/ref_kernels/3/bli_trsm_ref.c @@ -34,17 +34,8 @@ #include "blis.h" -#if 0 - -// An implementation that attempts to facilitate emission of vectorized -// instructions via constant loop bounds + #pragma omp simd directives. - -// (Deleted. See 'old' directory.) - -#else - -// An implementation that uses variable loop bounds (queried from the context) -// and makes no use of #pragma omp simd. 
+// An implementation that indexes through B with the assumption that all +// elements were broadcast (duplicated) by a factor of NP/NR. #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \ @@ -69,11 +60,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t m = mr; \ const dim_t n = nr; \ \ - const inc_t rs_a = 1; \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \ const inc_t cs_a = packmr; \ \ const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ @@ -114,13 +105,14 @@ void PASTEMAC3(ch,opname,arch,suf) \ (1.0/alpha11) is stored during packing instead alpha11 so we can multiply rather than divide. When preinversion is disabled, alpha11 is stored and division happens below explicitly. */ \ - PASTEMAC(ch,diagop)( *alpha11, beta11c ); \ + PASTEMAC(ch,scals)( *alpha11, beta11c ); \ \ /* Output final result to matrix c. */ \ PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ \ /* Store the local value back to b11. 
*/ \ - PASTEMAC(ch,copys)( beta11c, *beta11 ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \ } \ } \ } @@ -155,19 +147,16 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t m = mr; \ const dim_t n = nr; \ \ - const inc_t rs_a = 1; \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \ const inc_t cs_a = packmr; \ \ const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ \ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ - for ( iter = 0; iter < m; ++iter ) \ + for ( dim_t iter = 0; iter < m; ++iter ) \ { \ - i = m - iter - 1; \ - n_behind = iter; \ + dim_t i = m - iter - 1; \ + dim_t n_behind = iter; \ \ ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ ctype* restrict a12t = a + (i )*rs_a + (i+1)*cs_a; \ @@ -176,7 +165,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* b1 = b1 - a12t * B2; */ \ /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ + for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict b21 = B2 + (0 )*rs_b + (j )*cs_b; \ @@ -186,7 +175,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* beta11 = beta11 - a12t * b21; */ \ PASTEMAC(ch,set0s)( rho11 ); \ - for ( l = 0; l < n_behind; ++l ) \ + for ( dim_t l = 0; l < n_behind; ++l ) \ { \ ctype* restrict alpha12 = a12t + (l )*cs_a; \ ctype* restrict beta21 = b21 + (l )*rs_b; \ @@ -206,7 +195,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ \ /* Store the local value back to b11. 
*/ \ - PASTEMAC(ch,copys)( beta11c, *beta11 ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \ } \ } \ } @@ -217,4 +207,3 @@ INSERT_GENTFUNC_BASIC3( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals ) INSERT_GENTFUNC_BASIC3( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals ) #endif -#endif diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index 33e74ecaa..69c546cd4 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -107,60 +107,30 @@ // -- Level-1m (packm/unpackm) kernel prototype redefinitions ------------------ -#undef packm_2xk_ker_name -#define packm_2xk_ker_name GENARNAME(packm_2xk) -#undef packm_3xk_ker_name -#define packm_3xk_ker_name GENARNAME(packm_3xk) -#undef packm_4xk_ker_name -#define packm_4xk_ker_name GENARNAME(packm_4xk) -#undef packm_6xk_ker_name -#define packm_6xk_ker_name GENARNAME(packm_6xk) -#undef packm_8xk_ker_name -#define packm_8xk_ker_name GENARNAME(packm_8xk) -#undef packm_10xk_ker_name -#define packm_10xk_ker_name GENARNAME(packm_10xk) -#undef packm_12xk_ker_name -#define packm_12xk_ker_name GENARNAME(packm_12xk) -#undef packm_14xk_ker_name -#define packm_14xk_ker_name GENARNAME(packm_14xk) -#undef packm_16xk_ker_name -#define packm_16xk_ker_name GENARNAME(packm_16xk) -#undef packm_24xk_ker_name -#define packm_24xk_ker_name GENARNAME(packm_24xk) +#undef packm_mrxk_ker_name +#define packm_mrxk_ker_name GENARNAME(packm_mrxk) +#undef packm_nrxk_ker_name +#define packm_nrxk_ker_name GENARNAME(packm_nrxk) -#undef unpackm_2xk_ker_name -#define unpackm_2xk_ker_name GENARNAME(unpackm_2xk) -#undef unpackm_4xk_ker_name -#define unpackm_4xk_ker_name GENARNAME(unpackm_4xk) -#undef unpackm_6xk_ker_name -#define unpackm_6xk_ker_name GENARNAME(unpackm_6xk) -#undef unpackm_8xk_ker_name -#define unpackm_8xk_ker_name GENARNAME(unpackm_8xk) -#undef unpackm_10xk_ker_name -#define unpackm_10xk_ker_name GENARNAME(unpackm_10xk) -#undef unpackm_12xk_ker_name -#define 
unpackm_12xk_ker_name GENARNAME(unpackm_12xk) -#undef unpackm_14xk_ker_name -#define unpackm_14xk_ker_name GENARNAME(unpackm_14xk) -#undef unpackm_16xk_ker_name -#define unpackm_16xk_ker_name GENARNAME(unpackm_16xk) +#undef packm_mrxk_1er_ker_name +#define packm_mrxk_1er_ker_name GENARNAME(packm_mrxk_1er) +#undef packm_nrxk_1er_ker_name +#define packm_nrxk_1er_ker_name GENARNAME(packm_nrxk_1er) -#undef packm_2xk_1er_ker_name -#define packm_2xk_1er_ker_name GENARNAME(packm_2xk_1er) -#undef packm_4xk_1er_ker_name -#define packm_4xk_1er_ker_name GENARNAME(packm_4xk_1er) -#undef packm_6xk_1er_ker_name -#define packm_6xk_1er_ker_name GENARNAME(packm_6xk_1er) -#undef packm_8xk_1er_ker_name -#define packm_8xk_1er_ker_name GENARNAME(packm_8xk_1er) -#undef packm_10xk_1er_ker_name -#define packm_10xk_1er_ker_name GENARNAME(packm_10xk_1er) -#undef packm_12xk_1er_ker_name -#define packm_12xk_1er_ker_name GENARNAME(packm_12xk_1er) -#undef packm_14xk_1er_ker_name -#define packm_14xk_1er_ker_name GENARNAME(packm_14xk_1er) -#undef packm_16xk_1er_ker_name -#define packm_16xk_1er_ker_name GENARNAME(packm_16xk_1er) +#undef packm_mrxmr_diag_ker_name +#define packm_mrxmr_diag_ker_name GENARNAME(packm_mrxmr_diag) +#undef packm_nrxnr_diag_ker_name +#define packm_nrxnr_diag_ker_name GENARNAME(packm_nrxnr_diag) + +#undef packm_mrxmr_diag_1er_ker_name +#define packm_mrxmr_diag_1er_ker_name GENARNAME(packm_mrxmr_diag_1er) +#undef packm_nrxnr_diag_1er_ker_name +#define packm_nrxnr_diag_1er_ker_name GENARNAME(packm_nrxnr_diag_1er) + +#undef unpackm_mrxk_ker_name +#define unpackm_mrxk_ker_name GENARNAME(unpackm_mrxk) +#undef unpackm_nrxk_ker_name +#define unpackm_nrxk_ker_name GENARNAME(unpackm_nrxk) // Instantiate prototypes for above functions via the level-1m kernel API // template. 
@@ -259,11 +229,10 @@ void GENBARNAME(cntx_init) ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; func_t* funcs; mbool_t* mbools; dim_t i; - void** vfuncs; + void_fp* vfuncs; // -- Clear the context ---------------------------------------------------- @@ -273,73 +242,27 @@ void GENBARNAME(cntx_init) // -- Set blocksizes ------------------------------------------------------- - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_KR ], 1, 1, 1, 1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 4, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 ); - bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 ); - bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 ); - bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); - bli_blksz_init_easy( &blkszs[ BLIS_DF ], 6, 6, 6, 6 ); - bli_blksz_init_easy( &blkszs[ BLIS_XF ], 4, 4, 4, 4 ); - - // Initialize the context with the default blocksize objects and their - // multiples. - bli_cntx_set_blkszs - ( - BLIS_NAT, 11, - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - BLIS_KR, &blkszs[ BLIS_KR ], BLIS_KR, - BLIS_M2, &blkszs[ BLIS_M2 ], BLIS_M2, - BLIS_N2, &blkszs[ BLIS_N2 ], BLIS_N2, - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - BLIS_XF, &blkszs[ BLIS_XF ], BLIS_XF, - cntx - ); - - - // -- Set level-3 virtual micro-kernels ------------------------------------ - - funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); - - // NOTE: We set the virtual micro-kernel slots to contain the addresses - // of the native micro-kernels. 
In general, the ukernels in the virtual - // ukernel slots are always called, and if the function called happens to - // be a virtual micro-kernel, it will then know to find its native ukernel - // (i.e., in the native ukernel slots). - gen_func_init( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name ); - gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name ); - gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name ); - gen_func_init( &funcs[ BLIS_TRSM_L_UKR ], trsm_l_ukr_name ); - gen_func_init( &funcs[ BLIS_TRSM_U_UKR ], trsm_u_ukr_name ); - - - // -- Set level-3 native micro-kernels and preferences --------------------- - - funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); - mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); - - gen_func_init( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name ); - gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name ); - gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name ); - gen_func_init( &funcs[ BLIS_TRSM_L_UKR ], trsm_l_ukr_name ); - gen_func_init( &funcs[ BLIS_TRSM_U_UKR ], trsm_u_ukr_name ); - - // s d c z - bli_mbool_init( &mbools[ BLIS_GEMM_UKR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_GEMMTRSM_L_UKR ], FALSE, FALSE, FALSE, FALSE ); - bli_mbool_init( &mbools[ BLIS_GEMMTRSM_U_UKR ], FALSE, FALSE, FALSE, FALSE ); - bli_mbool_init( &mbools[ BLIS_TRSM_L_UKR ], FALSE, FALSE, FALSE, FALSE ); - bli_mbool_init( &mbools[ BLIS_TRSM_U_UKR ], FALSE, FALSE, FALSE, FALSE ); - + // NOTE: The macro values for register blocksizes and packm broadcast factors are + // used here as defined in the bli_kernel_defs_.h or generic values from + // bli_kernel_macro_defs.h otherwise. Configurations should also initialize the + // blocksizes in the context explicitly, but using the correct values here helps + // to prevent accidents. 
+ // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_KR ], 1, 1, 1, 1 ); + bli_blksz_init ( &blkszs[ BLIS_MR ], BLIS_MR_s, BLIS_MR_d, BLIS_MR_c, BLIS_MR_z, + BLIS_PACKMR_s, BLIS_PACKMR_d, BLIS_PACKMR_c, BLIS_PACKMR_z ); + bli_blksz_init ( &blkszs[ BLIS_NR ], BLIS_NR_s, BLIS_NR_d, BLIS_NR_c, BLIS_NR_z, + BLIS_PACKNR_s, BLIS_PACKNR_d, BLIS_PACKNR_c, BLIS_PACKNR_z ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 ); + bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 ); + bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 ); + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 6, 6, 6, 6 ); + bli_blksz_init_easy( &blkszs[ BLIS_XF ], 4, 4, 4, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_BBM ], BLIS_BBM_s, BLIS_BBM_d, BLIS_BBM_c, BLIS_BBM_z ); + bli_blksz_init_easy( &blkszs[ BLIS_BBN ], BLIS_BBN_s, BLIS_BBN_d, BLIS_BBN_c, BLIS_BBN_z ); // -- Set level-3 small/unpacked thresholds -------------------------------- @@ -352,89 +275,102 @@ void GENBARNAME(cntx_init) // chosen over "less than or equal to" so that threshold values of 0 would // effectively disable sup (even for matrix dimensions of 0). // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 0, 0, 0, 0 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 0, 0, 0, 0 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 0, 0, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 0, 0, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 0, 0, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 0, 0, 0, 0 ); - // Initialize the context with the default thresholds. - bli_cntx_set_l3_sup_thresh + // Initialize the context with the default blocksize objects and their + // multiples. 
+ bli_cntx_set_blkszs ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx + cntx, + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_KR, &blkszs[ BLIS_KR ], BLIS_KR, + BLIS_M2, &blkszs[ BLIS_M2 ], BLIS_M2, + BLIS_N2, &blkszs[ BLIS_N2 ], BLIS_N2, + BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + BLIS_XF, &blkszs[ BLIS_XF ], BLIS_XF, + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM, + BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN, + BLIS_VA_END ); - // -- Set level-3 small/unpacked handlers ---------------------------------- + // -- Set level-3 virtual micro-kernels ------------------------------------ - vfuncs = bli_cntx_l3_sup_handlers_buf( cntx ); + funcs = bli_cntx_ukrs_buf( cntx ); - // Initialize all of the function pointers to NULL; - for ( i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i ) vfuncs[ i ] = NULL; + // NOTE: We set the virtual micro-kernel slots to contain the addresses + // of the native micro-kernels. In general, the ukernels in the virtual + // ukernel slots are always called, and if the function called happens to + // be a virtual micro-kernel, it will then know to find its native ukernel + // (i.e., in the native ukernel slots). + gen_func_init( &funcs[ BLIS_GEMM_VIR_UKR ], gemm_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm_l_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm_u_ukr_name ); + gen_func_init( &funcs[ BLIS_TRSM_L_VIR_UKR ], trsm_l_ukr_name ); + gen_func_init( &funcs[ BLIS_TRSM_U_VIR_UKR ], trsm_u_ukr_name ); - // The level-3 sup handlers are oapi-based, so we only set one slot per - // operation. - // Set the gemm slot to the default gemm sup handler. 
- vfuncs[ BLIS_GEMM ] = bli_gemmsup_ref; - vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref; + // -- Set level-3 native micro-kernels and preferences --------------------- + + mbools = bli_cntx_ukr_prefs_buf( cntx ); + + gen_func_init( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name ); + gen_func_init( &funcs[ BLIS_TRSM_L_UKR ], trsm_l_ukr_name ); + gen_func_init( &funcs[ BLIS_TRSM_U_UKR ], trsm_u_ukr_name ); + + // s d c z + bli_mbool_init( &mbools[ BLIS_GEMM_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMTRSM_L_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE ); + bli_mbool_init( &mbools[ BLIS_GEMMTRSM_U_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE ); + bli_mbool_init( &mbools[ BLIS_TRSM_L_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE ); + bli_mbool_init( &mbools[ BLIS_TRSM_U_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE ); // -- Set level-3 small/unpacked micro-kernels and preferences ------------- - funcs = bli_cntx_l3_sup_kers_buf( cntx ); - mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); - -#if 0 - // Adhere to the small/unpacked ukernel mappings: - // - rv -> rrr, rcr - // - rg -> rrc, rcc - // - cv -> ccr, ccc - // - cg -> crr, crc - gen_sup_func_init( &funcs[ BLIS_RRR ], - &funcs[ BLIS_RCR ], gemmsup_rv_ukr_name ); - gen_sup_func_init( &funcs[ BLIS_RRC ], - &funcs[ BLIS_RCC ], gemmsup_rg_ukr_name ); - gen_sup_func_init( &funcs[ BLIS_CCR ], - &funcs[ BLIS_CCC ], gemmsup_cv_ukr_name ); - gen_sup_func_init( &funcs[ BLIS_CRR ], - &funcs[ BLIS_CRC ], gemmsup_cg_ukr_name ); -#endif - gen_func_init( &funcs[ BLIS_RRR ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_RRC ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_RCR ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_RCC ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_CRR ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_CRC ], 
gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_CCR ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_CCC ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_RRR_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_RRC_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_RCR_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_RCC_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_CRR_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_CRC_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_CCR_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_CCC_UKR ], gemmsup_rv_ukr_name ); // Register the general-stride/generic ukernel to the "catch-all" slot // associated with the BLIS_XXX enum value. This slot will be queried if // *any* operand is stored with general stride. - gen_func_init( &funcs[ BLIS_XXX ], gemmsup_gx_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_XXX_UKR ], gemmsup_gx_ukr_name ); // Set the l3 sup ukernel storage preferences. 
- // s d c z - bli_mbool_init( &mbools[ BLIS_RRR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_RRC ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_RCR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_RCC ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_CRR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_CRC ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_CCR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_CCC ], TRUE, TRUE, TRUE, TRUE ); + // s d c z + bli_mbool_init( &mbools[ BLIS_GEMMSUP_RRR_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_RRC_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_RCR_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_RCC_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_CRR_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_CRC_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_CCR_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_CCC_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_XXX ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_XXX_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); // -- Set level-1f kernels ------------------------------------------------- - funcs = bli_cntx_l1f_kers_buf( cntx ); - gen_func_init( &funcs[ BLIS_AXPY2V_KER ], axpy2v_ker_name ); gen_func_init( &funcs[ BLIS_DOTAXPYV_KER ], dotaxpyv_ker_name ); gen_func_init( &funcs[ BLIS_AXPYF_KER ], axpyf_ker_name ); @@ -444,8 +380,6 @@ void GENBARNAME(cntx_init) // -- Set level-1v kernels ------------------------------------------------- - funcs = bli_cntx_l1v_kers_buf( cntx ); - gen_func_init( &funcs[ BLIS_ADDV_KER ], addv_ker_name ); gen_func_init( &funcs[ BLIS_AMAXV_KER ], amaxv_ker_name ); gen_func_init( &funcs[ 
BLIS_AXPBYV_KER ], axpbyv_ker_name ); @@ -464,41 +398,35 @@ void GENBARNAME(cntx_init) // -- Set level-1m (packm/unpackm) kernels --------------------------------- - funcs = bli_cntx_packm_kers_buf( cntx ); + gen_func_init( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_ker_name ); - // Initialize all packm kernel func_t entries to NULL. - for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i ) - { - bli_func_init_null( &funcs[ i ] ); - } + gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_1ER_KER ], packm_mrxk_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_1ER_KER ], packm_nrxk_1er_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_2XK_KER ], packm_2xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_3XK_KER ], packm_3xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_4XK_KER ], packm_4xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_6XK_KER ], packm_6xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_8XK_KER ], packm_8xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_MRXMR_DIAG_KER ], packm_mrxmr_diag_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_NRXNR_DIAG_KER ], packm_nrxnr_diag_ker_name ); - funcs = bli_cntx_unpackm_kers_buf( cntx ); + gen_func_init_co( &funcs[ BLIS_PACKM_MRXMR_DIAG_1ER_KER ], packm_mrxmr_diag_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_NRXNR_DIAG_1ER_KER ], packm_nrxnr_diag_1er_ker_name ); - // Initialize all packm kernel func_t entries to NULL. 
- for ( i = BLIS_UNPACKM_0XK_KER; i <= BLIS_UNPACKM_31XK_KER; ++i ) - { - bli_func_init_null( &funcs[ i ] ); - } + gen_func_init( &funcs[ BLIS_UNPACKM_MRXK_KER ], unpackm_mrxk_ker_name ); + gen_func_init( &funcs[ BLIS_UNPACKM_NRXK_KER ], unpackm_nrxk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_2XK_KER ], unpackm_2xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_4XK_KER ], unpackm_4xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_6XK_KER ], unpackm_6xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_8XK_KER ], unpackm_8xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_10XK_KER ], unpackm_10xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_12XK_KER ], unpackm_12xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_14XK_KER ], unpackm_14xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_16XK_KER ], unpackm_16xk_ker_name ); + + // -- Set level-3 small/unpacked handlers ---------------------------------- + + vfuncs = bli_cntx_l3_sup_handlers_buf( cntx ); + + // Initialize all of the function pointers to NULL; + for ( i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i ) vfuncs[ i ] = NULL; + + // The level-3 sup handlers are oapi-based, so we only set one slot per + // operation. + + // Set the gemm slot to the default gemm sup handler. 
+ vfuncs[ BLIS_GEMM ] = bli_gemmsup_ref; + vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref; // -- Set miscellaneous fields --------------------------------------------- @@ -515,7 +443,6 @@ void GENBAINAME(cntx_init) ) { func_t* funcs; - dim_t i; // This function is designed to modify a copy of an existing native // context to enable computation via an induced method for complex @@ -525,23 +452,23 @@ void GENBAINAME(cntx_init) // -- Set induced method level-3 virtual micro-kernels --------------------- - funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); + funcs = bli_cntx_ukrs_buf( cntx ); if ( method == BLIS_1M ) { - gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm1m_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm1m_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm1m_u_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ], trsm1m_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ], trsm1m_u_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMM_VIR_UKR ], gemm1m_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm1m_l_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm1m_u_ukr_name ); + gen_func_init_co( &funcs[ BLIS_TRSM_L_VIR_UKR ], trsm1m_l_ukr_name ); + gen_func_init_co( &funcs[ BLIS_TRSM_U_VIR_UKR ], trsm1m_u_ukr_name ); } else // if ( method == BLIS_NAT ) { - gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ], trsm_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ], trsm_u_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMM_VIR_UKR ], gemm_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm_l_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm_u_ukr_name ); + gen_func_init_co( &funcs[ BLIS_TRSM_L_VIR_UKR ], trsm_l_ukr_name 
); + gen_func_init_co( &funcs[ BLIS_TRSM_U_VIR_UKR ], trsm_u_ukr_name ); } // For 1m, we employ an optimization which requires that we copy the native @@ -556,8 +483,8 @@ void GENBAINAME(cntx_init) // beta has a zero imaginary component and C is either row- or column-stored). if ( method == BLIS_1M ) { - func_t* gemm_nat_ukrs = bli_cntx_get_l3_nat_ukrs( BLIS_GEMM_UKR, cntx ); - func_t* gemm_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, cntx ); + func_t* gemm_nat_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_UKR, cntx ); + func_t* gemm_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, cntx ); bli_func_copy_dt( BLIS_FLOAT, gemm_nat_ukrs, BLIS_FLOAT, gemm_vir_ukrs ); bli_func_copy_dt( BLIS_DOUBLE, gemm_nat_ukrs, BLIS_DOUBLE, gemm_vir_ukrs ); @@ -566,39 +493,23 @@ void GENBAINAME(cntx_init) // -- Set induced method packm kernels ------------------------------------- - funcs = bli_cntx_packm_kers_buf( cntx ); - - // Initialize all packm kernel func_t entries to NULL. - for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i ) - { - bli_func_init_null( &funcs[ i ] ); - } - if ( method == BLIS_1M ) { - gen_func_init_co( &funcs[ BLIS_PACKM_2XK_KER ], packm_2xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_4XK_KER ], packm_4xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_6XK_KER ], packm_6xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_8XK_KER ], packm_8xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_1er_ker_name ); } else // if ( method == BLIS_NAT ) { - gen_func_init( &funcs[ BLIS_PACKM_2XK_KER ], packm_2xk_ker_name ); - 
gen_func_init( &funcs[ BLIS_PACKM_3XK_KER ], packm_3xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_4XK_KER ], packm_4xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_6XK_KER ], packm_6xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_8XK_KER ], packm_8xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_ker_name ); } + gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_1ER_KER ], packm_mrxk_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_1ER_KER ], packm_nrxk_1er_ker_name ); + + gen_func_init( &funcs[ BLIS_UNPACKM_MRXK_KER ], unpackm_mrxk_ker_name ); + gen_func_init( &funcs[ BLIS_UNPACKM_NRXK_KER ], unpackm_nrxk_ker_name ); + // -- Set induced method cache and register blocksizes --------------------- @@ -628,50 +539,44 @@ void GENBAINAME(cntx_init_blkszs) cntx_t* cntx ) { - // We MUST set the induced method in the context prior to calling - // bli_cntx_l3_vir_ukr_prefers_cols_dt() because that function queries - // the induced method. That function needs the induced method value in - // order to determine whether to evaluate the "prefers column storage" - // predicate using the storage preference of the kernel for dt, or - // the storage preference of the kernel for the real projection of - // dt. Failing to set the induced method here can lead to strange - // undefined behavior at runtime if the native complex kernel's - // storage preference happens to not equal that of the native real - // kernel. + // Set the induced method in the context. 
bli_cntx_set_method( method, cntx ); + num_t dt_r = bli_dt_proj_to_real( dt ); + // Initialize the blocksizes according to the micro-kernel preference as // well as the algorithm. - if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + //if ( bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + if ( ! bli_cntx_get_ukr_prefs_dt( dt_r, BLIS_GEMM_UKR_ROW_PREF, cntx ) ) { // This branch is used for algorithm 1m_c_bp. bli_cntx_set_ind_blkszs ( - method, dt, 6, + method, dt, cntx, BLIS_NC, 1.0, 1.0, BLIS_KC, 2.0, 2.0, // halve kc... BLIS_MC, 2.0, 2.0, // halve mc... BLIS_NR, 1.0, 1.0, BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr) BLIS_KR, 1.0, 1.0, - cntx + BLIS_VA_END ); } - else // if ( bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) + else // if ( bli_cntx_get_ukr_prefs_dt( dt, BLIS_GEMM_UKR_ROW_PREF, cntx ) ) { // This branch is used for algorithm 1m_r_bp. bli_cntx_set_ind_blkszs ( - method, dt, 6, + method, dt, cntx, BLIS_NC, 2.0, 2.0, // halve nc... BLIS_KC, 2.0, 2.0, // halve kc... 
BLIS_MC, 1.0, 1.0, BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr) BLIS_MR, 1.0, 1.0, BLIS_KR, 1.0, 1.0, - cntx + BLIS_VA_END ); } } diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c index fbd15d695..2f0808389 100644 --- a/ref_kernels/ind/bli_gemm1m_ref.c +++ b/ref_kernels/ind/bli_gemm1m_ref.c @@ -55,8 +55,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt_r = PASTEMAC(chr,type); \ \ PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ const bool row_pref = !col_pref; \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c index 08823f073..6cfb83cae 100644 --- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c +++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c @@ -56,12 +56,12 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt_r = PASTEMAC(chr,type); \ \ PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ \ PASTECH(ch,trsm_ukr_ft) \ ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ \ - const bool col_pref_r = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref_r = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -89,7 +89,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype_r* restrict bx1_r = ( ctype_r* )bx1; \ \ const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \ \ 
ctype_r* restrict zero_r = PASTEMAC(chr,0); \ ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ @@ -106,7 +106,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ /* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR instead? */ \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : nr ); \ const inc_t cs_ct = ( col_pref ? mr : 1 ); \ \ @@ -192,24 +192,25 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ if ( bli_is_1e_packed( schema_b ) ) \ { \ - const inc_t ld_b = rs_b; \ + const inc_t ld_b = rs_b; \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 2 * cs_b; \ \ - ctype* restrict b11_ri = ( ctype* )b11; \ - ctype* restrict b11_ir = ( ctype* )b11 + ld_b/2; \ -\ - dim_t i, j; \ + ctype_r* restrict b11_ri = ( ctype_r* )b11; \ + ctype_r* restrict b11_ir = ( ctype_r* )b11 + ld_b; \ \ /* b11 = alpha * b11 + bt; */ \ - for ( j = 0; j < nr; ++j ) \ - for ( i = 0; i < mr; ++i ) \ + for ( dim_t j = 0; j < nr; ++j ) \ + for ( dim_t i = 0; i < mr; ++i ) \ + for ( dim_t d = 0; d < cs_b; ++d ) \ { \ - ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ - ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ - ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ - ctype* restrict beta11_ri = b11_ri + i*rs_b + j*cs_b; \ - ctype_r* restrict beta11_r = &PASTEMAC(ch,real)( *beta11_ri ); \ - ctype_r* restrict beta11_i = &PASTEMAC(ch,imag)( *beta11_ri ); \ - ctype* restrict beta11_ir = b11_ir + i*rs_b + j*cs_b; \ + ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ + ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ + ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ + ctype_r* restrict beta11_ri_r = b11_ri + i*rs_b2 + j*cs_b2 + 0*cs_b + d; \ + ctype_r* restrict beta11_ri_i = b11_ri + i*rs_b2 + j*cs_b2 + 1*cs_b + d; 
\ + ctype_r* restrict beta11_ir_r = b11_ir + i*rs_b2 + j*cs_b2 + 0*cs_b + d; \ + ctype_r* restrict beta11_ir_i = b11_ir + i*rs_b2 + j*cs_b2 + 1*cs_b + d; \ \ PASTEMAC3(ch,chr,ch,xpbyris) \ ( \ @@ -217,12 +218,12 @@ void PASTEMAC3(ch,opname,arch,suf) \ *beta11t_i, \ alpha_r, \ alpha_i, /* alpha_i not referenced */ \ - *beta11_r, \ - *beta11_i \ + *beta11_ri_r, \ + *beta11_ri_i \ ); \ \ - PASTEMAC(ch,sets)( -*beta11_i, \ - *beta11_r, *beta11_ir ); \ + PASTEMAC(ch,copyris)( -*beta11_ri_i, *beta11_ri_r, \ + *beta11_ir_r, *beta11_ir_i ); \ } \ } \ else /* if ( bli_is_1r_packed( schema_b ) ) */ \ @@ -233,18 +234,17 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ ctype_r* restrict b11_r = ( ctype_r* )b11; \ ctype_r* restrict b11_i = ( ctype_r* )b11 + ld_b; \ -\ - dim_t i, j; \ \ /* b11 = alpha * b11 + bt; */ \ - for ( j = 0; j < nr; ++j ) \ - for ( i = 0; i < mr; ++i ) \ + for ( dim_t j = 0; j < nr; ++j ) \ + for ( dim_t i = 0; i < mr; ++i ) \ + for ( dim_t d = 0; d < cs_b; ++d ) \ { \ ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ - ctype_r* restrict beta11_r = b11_r + i*rs_b2 + j*cs_b2; \ - ctype_r* restrict beta11_i = b11_i + i*rs_b2 + j*cs_b2; \ + ctype_r* restrict beta11_r = b11_r + i*rs_b2 + j*cs_b2 + d; \ + ctype_r* restrict beta11_i = b11_i + i*rs_b2 + j*cs_b2 + d; \ \ PASTEMAC3(ch,chr,ch,xpbyris) \ ( \ diff --git a/ref_kernels/ind/bli_trsm1m_ref.c b/ref_kernels/ind/bli_trsm1m_ref.c index 68717f7a6..5eda20f20 100644 --- a/ref_kernels/ind/bli_trsm1m_ref.c +++ b/ref_kernels/ind/bli_trsm1m_ref.c @@ -48,6 +48,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -58,11 +59,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t m = 
mr; \ const dim_t n = nr; \ \ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBM, cntx ); \ + const inc_t cs_a = packmr; \ \ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t rs_b = packnr; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \ \ const inc_t ld_a = cs_a; \ const inc_t ld_b = rs_b; \ @@ -77,12 +78,14 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ const inc_t rs_a2 = 1 * rs_a; \ const inc_t cs_a2 = 2 * cs_a; \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 2 * cs_b; \ \ ctype_r* restrict a_r = ( ctype_r* )a; \ ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ \ - ctype* restrict b_ri = ( ctype* )b; \ - ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ + ctype_r* restrict b_ri = ( ctype_r* )b; \ + ctype_r* restrict b_ir = ( ctype_r* )b + ld_b; \ \ for ( iter = 0; iter < m; ++iter ) \ { \ @@ -93,20 +96,22 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ ctype_r* restrict a10t_r = a_r + (i )*rs_a2 + (0 )*cs_a2; \ ctype_r* restrict a10t_i = a_i + (i )*rs_a2 + (0 )*cs_a2; \ - ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict B0_ri = b_ri + (0 )*rs_b + (0 )*cs_b; \ + ctype_r* restrict b1_ri = b_ri + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict b1_ir = b_ir + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B0_ri = b_ri + (0 )*rs_b2 + (0 )*cs_b2; \ \ /* b1 = b1 - a10t * B0; */ \ /* b1 = b1 / alpha11; */ \ for ( j = 0; j < n; ++j ) \ { \ - ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict b01_ri = B0_ri + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ - ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype_r* restrict 
beta11_ri_r = b1_ri + (0 )*rs_b2 + (j )*cs_b2 + 0*cs_b; \ + ctype_r* restrict beta11_ri_i = b1_ri + (0 )*rs_b2 + (j )*cs_b2 + 1*cs_b; \ + ctype_r* restrict beta11_ir_r = b1_ir + (0 )*rs_b2 + (j )*cs_b2 + 0*cs_b; \ + ctype_r* restrict beta11_ir_i = b1_ir + (0 )*rs_b2 + (j )*cs_b2 + 1*cs_b; \ + ctype_r* restrict b01_ri = B0_ri + (0 )*rs_b2 + (j )*cs_b2; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_ri_r; \ + ctype_r beta11c_i = *beta11_ri_i; \ ctype_r rho11_r; \ ctype_r rho11_i; \ \ @@ -117,9 +122,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a2; \ ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a2; \ - ctype* restrict beta01_ri = b01_ri + (l )*rs_b; \ - ctype_r* restrict beta01_r = &PASTEMAC(ch,real)( *beta01_ri ); \ - ctype_r* restrict beta01_i = &PASTEMAC(ch,imag)( *beta01_ri ); \ + ctype_r* restrict beta01_r = b01_ri + (l )*rs_b2 + 0*cs_b; \ + ctype_r* restrict beta01_i = b01_ri + (l )*rs_b2 + 1*cs_b; \ \ PASTEMAC(ch,axpyris)( *alpha10_r, \ *alpha10_i, \ @@ -147,8 +151,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. */ \ - PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ - PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + { \ + PASTEMAC(ch,copyris)( beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \ + PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \ + } \ } \ } \ } \ @@ -229,10 +236,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. 
*/ \ - PASTEMAC(ch,copyris)( beta11c_r, \ - beta11c_i, \ - *beta11_r, \ - *beta11_i ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *(beta11_r + d), \ + *(beta11_i + d) ); \ } \ } \ } \ @@ -258,6 +266,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -268,11 +277,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t m = mr; \ const dim_t n = nr; \ \ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBM, cntx ); \ + const inc_t cs_a = packmr; \ \ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t rs_b = packnr; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \ \ const inc_t ld_a = cs_a; \ const inc_t ld_b = rs_b; \ @@ -287,12 +296,14 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ const inc_t rs_a2 = 1 * rs_a; \ const inc_t cs_a2 = 2 * cs_a; \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 2 * cs_b; \ \ ctype_r* restrict a_r = ( ctype_r* )a; \ ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ \ - ctype* restrict b_ri = ( ctype* )b; \ - ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ + ctype_r* restrict b_ri = ( ctype_r* )b; \ + ctype_r* restrict b_ir = ( ctype_r* )b + ld_b; \ \ for ( iter = 0; iter < m; ++iter ) \ { \ @@ -303,20 +314,22 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ ctype_r* restrict a12t_r = a_r + (i )*rs_a2 + (i+1)*cs_a2; \ ctype_r* restrict a12t_i = a_i + (i )*rs_a2 + (i+1)*cs_a2; \ - ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict B2_ri = b_ri + (i+1)*rs_b + (0 )*cs_b; \ + ctype_r* restrict b1_ri = b_ri + (i )*rs_b2 + (0 )*cs_b2; 
\ + ctype_r* restrict b1_ir = b_ir + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B2_ri = b_ri + (i+1)*rs_b2 + (0 )*cs_b2; \ \ /* b1 = b1 - a12t * B2; */ \ /* b1 = b1 / alpha11; */ \ for ( j = 0; j < n; ++j ) \ { \ - ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict b21_ri = B2_ri + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ - ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype_r* restrict beta11_ri_r = b1_ri + (0 )*rs_b2 + (j )*cs_b2 + 0*cs_b; \ + ctype_r* restrict beta11_ri_i = b1_ri + (0 )*rs_b2 + (j )*cs_b2 + 1*cs_b; \ + ctype_r* restrict beta11_ir_r = b1_ir + (0 )*rs_b2 + (j )*cs_b2 + 0*cs_b; \ + ctype_r* restrict beta11_ir_i = b1_ir + (0 )*rs_b2 + (j )*cs_b2 + 1*cs_b; \ + ctype_r* restrict b21_ri = B2_ri + (0 )*rs_b2 + (j )*cs_b2; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_ri_r; \ + ctype_r beta11c_i = *beta11_ri_i; \ ctype_r rho11_r; \ ctype_r rho11_i; \ \ @@ -325,11 +338,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ rho11_i ); \ for ( l = 0; l < n_behind; ++l ) \ { \ - ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a2; \ - ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a2; \ - ctype* restrict beta21_ri = b21_ri + (l )*rs_b; \ - ctype_r* restrict beta21_r = &PASTEMAC(ch,real)( *beta21_ri ); \ - ctype_r* restrict beta21_i = &PASTEMAC(ch,imag)( *beta21_ri ); \ + ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a2; \ + ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a2; \ + ctype_r* restrict beta21_r = b21_ri + (l )*rs_b2 + 0*cs_b; \ + ctype_r* restrict beta21_i = b21_ri + (l )*rs_b2 + 1*cs_b; \ \ PASTEMAC(ch,axpyris)( *alpha12_r, \ *alpha12_i, \ @@ -357,8 +369,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. 
*/ \ - PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ - PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + { \ + PASTEMAC(ch,copyris)( beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \ + PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \ + } \ } \ } \ } \ @@ -439,10 +454,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. */ \ - PASTEMAC(ch,copyris)( beta11c_r, \ - beta11c_i, \ - *beta11_r, \ - *beta11_i ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *(beta11_r + d), \ + *(beta11_i + d) ); \ } \ } \ } \ diff --git a/sandbox/gemmlike/attic/bls_gemm_bp_var2.c b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c index 957cd5794..8caccf923 100644 --- a/sandbox/gemmlike/attic/bls_gemm_bp_var2.c +++ b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c @@ -157,7 +157,7 @@ void PASTECH2(bls_,ch,varname) \ function pointer type. */ \ /* PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ */ \ \ /* Temporary C buffer for edge cases. Note that the strides of this @@ -168,7 +168,7 @@ void PASTECH2(bls_,ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ @@ -524,7 +524,7 @@ void PASTECH2(bls_,ch,varname) \ /* Query the context for the microkernel address and cast it to its function pointer type. 
*/ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -533,7 +533,7 @@ void PASTECH2(bls_,ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c index f2f8b7e25..ec5d8d5b1 100644 --- a/sandbox/gemmlike/bls_gemm.c +++ b/sandbox/gemmlike/bls_gemm.c @@ -134,7 +134,7 @@ void bls_gemm_ex // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c index 62dc462d5..1e3e5ea03 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var1.c +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -156,7 +156,7 @@ void PASTECH2(bls_,ch,varname) \ /* Query the context for the microkernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Compute partitioning step values for each matrix of each loop. 
*/ \ const inc_t jcstep_c = cs_c; \ diff --git a/sandbox/gemmlike/bls_packm_cxk.c b/sandbox/gemmlike/bls_packm_cxk.c index ca11c207c..2ed178c65 100644 --- a/sandbox/gemmlike/bls_packm_cxk.c +++ b/sandbox/gemmlike/bls_packm_cxk.c @@ -54,15 +54,16 @@ void PASTECH2(bls_,ch,opname) \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim_max; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \ + : BLIS_PACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. */ \ - f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index b07da91cc..9568dfee7 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -305,7 +305,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); if ( bli_obj_is_complex( &b ) ) *perf *= 4.0; // Perform checks. - libblis_test_trsm_ukr_check( params, side, &ap, &c, &b, resid ); + libblis_test_trsm_ukr_check( params, side, &a, &c, &b, resid ); // Zero out performance and residual if output matrix is empty. //libblis_test_check_empty_problem( &c, perf, resid ); @@ -418,9 +418,11 @@ void libblis_test_trsm_ukr_check bli_printm( "a11", a, "%5.2f", "" ); #endif +#if 0 // Restore the diagonal of a11 to its original, un-inverted state // (needed for trsv). bli_invertd( a ); +#endif if ( bli_is_left( side ) ) {