mirror of
https://github.com/amd/blis.git
synced 2026-04-19 23:28:52 +00:00
Added support for pre-broadcast when packing B.
Details: - Added support for being able to duplicate (broadcast) elements in memory when packing matrix B (ie: the right-hand operand) in level-3 operations. This turns out to be advantageous for some architectures that can afford the cost of the extra bandwidth and somehow benefit from the pre-broadcast elements (and thus being able to avoid using broadcast-style load instructions on micro-rows of B in the gemm microkernel). - Support optionally disabling right-side hemm and symm. If this occurs, hemm_r is implemented in terms of hemm_l (and symm_r in terms of symm_l). This is needed when broadcasting during packing because the alternative--supporting the broadcast of B while also allowing matrix B to be Hermitian/symmetric--would be an absolute mess. - Support alignment factors for packed blocks of A, B, and C separately (as well as for general-purpose buffers). In addition, we support byte offsets from those alignment values (which is different from aligning by align+offset bytes to begin with). The default alignment values are BLIS_PAGE_SIZE in all four cases, with the offset values defaulting to zero. - Pass pack_t schema into bli_?packm_cxk() so that it can be then passed into the packm kernel, where it will be needed by packm kernels that perform broadcasts of B, since the idea is that we *only* want to broadcast when packing micropanels of B and not A. - Added definition for variadic bli_cntx_set_l3_vir_ukrs(), which can be used to set custom virtual level-3 microkernels in the cntx_t, which would typically be done in the bli_cntx_init_*() function defined in the subconfiguration of interest. - Added a "broadcast B" kernel function for use with NP/NR = 12/6, defined in ref_kernels/1m/bli_packm_cxk_bb_ref.c. - Added gemm, gemmtrsm, and trsm "broadcast B" reference kernels defined in ref_kernels/3/bb. (These kernels have been tested with double real with NP/NR = 12/6.) - Added #ifndef ... 
#endif guards around several macro constants defined in frame/include/bli_kernel_macro_defs.h. - Defined a few "broadcast B" static functions in frame/include/level0/bb for use by "broadcast B"-style packm reference kernels. For now, only the real domain kernels are tested and fully defined. - Output the alignment and offset values for packed blocks of A and B in the testsuite's "BLIS configuration info" section. - Comment updates to various files. - Bumped so_version to 3.0.0.
This commit is contained in:
239
config/old/haswellbb/bli_cntx_init_haswell.c
Normal file
239
config/old/haswellbb/bli_cntx_init_haswell.c
Normal file
@@ -0,0 +1,239 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// Instantiate prototypes for packm kernels.
|
||||
PACKM_KER_PROT( double, d, packm_6xk_bb2_haswell_ref )
|
||||
|
||||
// Instantiate prototypes for level-3 kernels.
|
||||
GEMM_UKR_PROT( double, d, gemmbb_haswell_ref )
|
||||
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_haswell_ref )
|
||||
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_haswell_ref )
|
||||
TRSM_UKR_PROT( double, d, trsmbb_l_haswell_ref )
|
||||
TRSM_UKR_PROT( double, d, trsmbb_u_haswell_ref )
|
||||
|
||||
// Populates cntx for this haswell sub-configuration variant.
//
// The function first loads the reference defaults via
// bli_cntx_init_haswell_ref() and then overrides selected entries: the
// double-precision level-3 microkernels and the 6xk packm kernel are replaced
// with the "bb" reference kernels prototyped above, zen level-1f/level-1v
// kernels are registered, and the native and sup blocksizes, sup thresholds,
// and sup kernels are set. The "#if 0" branches keep alternative settings
// around for reference and are not compiled.
//
// cntx: the context to initialize; it is passed to every setter below.
void bli_cntx_init_haswell( cntx_t* cntx )
|
||||
{
|
||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
||||
|
||||
// Set default kernel blocksizes and functions.
|
||||
bli_cntx_init_haswell_ref( cntx );
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Update the context with optimized native gemm micro-kernels and
|
||||
// their storage preferences.
// NOTE: only the #else branch is compiled; it registers three
// double-precision "bb" reference kernels (gemm, trsm_l, trsm_u). The leading
// integer is the number of kernel entries that follow.
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
#if 0
|
||||
8,
|
||||
// gemm
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
|
||||
// gemmtrsm_l
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
|
||||
// gemmtrsm_u
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
|
||||
#else
|
||||
3,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemmbb_haswell_ref, FALSE,
|
||||
BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_haswell_ref, FALSE,
|
||||
BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_haswell_ref, FALSE,
|
||||
#endif
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with customized virtual [gemm]trsm micro-kernels.
|
||||
bli_cntx_set_l3_vir_ukrs
|
||||
(
|
||||
2,
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_haswell_ref,
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_haswell_ref,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized packm kernels.
// Only the double-precision 6xk slot is overridden, with the "bb2" kernel
// prototyped at the top of this file.
|
||||
bli_cntx_set_packm_kers
|
||||
(
|
||||
1,
|
||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_haswell_ref,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized level-1f kernels.
|
||||
bli_cntx_set_l1f_kers
|
||||
(
|
||||
4,
|
||||
// axpyf
|
||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
||||
// dotxf
|
||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized level-1v kernels.
// The count of 10 matches the compiled entries: two (float, double) for each
// of amaxv, axpyv, dotv, dotxv, and scalv; each "#if 0" alternative is
// excluded.
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
10,
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
// axpyv
|
||||
#if 0
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
|
||||
#else
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
#endif
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||
// scalv
|
||||
#if 0
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
|
||||
#else
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
#endif
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||
// s d c z
|
||||
#if 0
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 75, 192 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
|
||||
#else
|
||||
// Only the double-precision column carries a value in this branch; the
// other datatypes are passed -1.
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 12, -1, -1 );
|
||||
// NOTE(review): the second row passed to bli_blksz_init() is presumably the
// maximum/packing value (NR = 6 stored with room for 12 because of the
// broadcast) -- confirm against bli_blksz_init().
bli_blksz_init ( &blkszs[ BLIS_NR ], -1, 6, -1, -1,
|
||||
-1, 12, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
|
||||
#endif
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
// Each entry is: blocksize id, pointer to its values, and the id of the
// blocksize it is paired with as a multiple (per the comment above).
|
||||
bli_cntx_set_blkszs
|
||||
(
|
||||
BLIS_NAT, 7,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
// level-1f
|
||||
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||
cntx
|
||||
);
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values.
|
||||
// s d c z
// Only the double-precision thresholds are given (all 1); the other
// datatypes are passed -1.
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 1, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 1, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 1, -1, -1 );
|
||||
|
||||
// Initialize the context with the sup thresholds.
|
||||
bli_cntx_set_l3_sup_thresh
|
||||
(
|
||||
3,
|
||||
BLIS_MT, &thresh[ BLIS_MT ],
|
||||
BLIS_NT, &thresh[ BLIS_NT ],
|
||||
BLIS_KT, &thresh[ BLIS_KT ],
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized small/unpacked gemm kernels.
// One double-precision kernel is registered for each of the eight storage
// combinations (RRR through CCC).
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
8,
|
||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||
// values.
|
||||
// s d c z
// NOTE(review): this overwrites blkszs[] entries that were already handed to
// bli_cntx_set_blkszs() above; it assumes that call copied the values rather
// than keeping the pointers -- confirm.
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1,
|
||||
-1, 9, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for small/unpacked level-3 problems.
|
||||
bli_cntx_set_l3_sup_blkszs
|
||||
(
|
||||
5,
|
||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
163
config/old/haswellbb/bli_family_haswell.h
Normal file
163
config/old/haswellbb/bli_family_haswell.h
Normal file
@@ -0,0 +1,163 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
//#ifndef BLIS_FAMILY_H
|
||||
//#define BLIS_FAMILY_H
|
||||
|
||||
#define BLIS_POOL_ADDR_ALIGN_SIZE_A 4096
|
||||
#define BLIS_POOL_ADDR_ALIGN_SIZE_B 4096
|
||||
|
||||
#define BLIS_POOL_ADDR_OFFSET_SIZE_A 32
|
||||
#define BLIS_POOL_ADDR_OFFSET_SIZE_B 64
|
||||
|
||||
|
||||
#if 0
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------
|
||||
|
||||
// -- sgemm micro-kernel --
|
||||
|
||||
#if 0
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24
|
||||
#define BLIS_DEFAULT_MC_S 256
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 4080
|
||||
#define BLIS_DEFAULT_MR_S 4
|
||||
#define BLIS_DEFAULT_NR_S 24
|
||||
|
||||
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16
|
||||
#define BLIS_DEFAULT_MC_S 144
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 4080
|
||||
#define BLIS_DEFAULT_MR_S 6
|
||||
#define BLIS_DEFAULT_NR_S 16
|
||||
|
||||
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6
|
||||
#define BLIS_DEFAULT_MC_S 144
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 4080
|
||||
#define BLIS_DEFAULT_MR_S 16
|
||||
#define BLIS_DEFAULT_NR_S 6
|
||||
#endif
|
||||
|
||||
// -- dgemm micro-kernel --
|
||||
|
||||
#if 0
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12
|
||||
#define BLIS_DEFAULT_MC_D 152
|
||||
#define BLIS_DEFAULT_KC_D 160
|
||||
#define BLIS_DEFAULT_NC_D 4080
|
||||
#define BLIS_DEFAULT_MR_D 4
|
||||
#define BLIS_DEFAULT_NR_D 12
|
||||
|
||||
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8
|
||||
#define BLIS_DEFAULT_MC_D 72
|
||||
#define BLIS_DEFAULT_KC_D 256
|
||||
#define BLIS_DEFAULT_NC_D 4080
|
||||
#define BLIS_DEFAULT_MR_D 6
|
||||
#define BLIS_DEFAULT_NR_D 8
|
||||
|
||||
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6
|
||||
#define BLIS_DEFAULT_MC_D 72
|
||||
#define BLIS_DEFAULT_KC_D 256
|
||||
#define BLIS_DEFAULT_NC_D 4080
|
||||
#define BLIS_DEFAULT_MR_D 8
|
||||
#define BLIS_DEFAULT_NR_D 6
|
||||
#endif
|
||||
|
||||
// -- cgemm micro-kernel --
|
||||
|
||||
#if 1
|
||||
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8
|
||||
#define BLIS_DEFAULT_MC_C 144
|
||||
#define BLIS_DEFAULT_KC_C 256
|
||||
#define BLIS_DEFAULT_NC_C 4080
|
||||
#define BLIS_DEFAULT_MR_C 3
|
||||
#define BLIS_DEFAULT_NR_C 8
|
||||
|
||||
#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x3
|
||||
#define BLIS_DEFAULT_MC_C 144
|
||||
#define BLIS_DEFAULT_KC_C 256
|
||||
#define BLIS_DEFAULT_NC_C 4080
|
||||
#define BLIS_DEFAULT_MR_C 8
|
||||
#define BLIS_DEFAULT_NR_C 3
|
||||
#endif
|
||||
|
||||
// -- zgemm micro-kernel --
|
||||
|
||||
#if 1
|
||||
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4
|
||||
#define BLIS_DEFAULT_MC_Z 72
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 4080
|
||||
#define BLIS_DEFAULT_MR_Z 3
|
||||
#define BLIS_DEFAULT_NR_Z 4
|
||||
|
||||
#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x3
|
||||
#define BLIS_DEFAULT_MC_Z 72
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 4080
|
||||
#define BLIS_DEFAULT_MR_Z 4
|
||||
#define BLIS_DEFAULT_NR_Z 3
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
//#endif
|
||||
|
||||
98
config/old/haswellbb/make_defs.mk
Normal file
98
config/old/haswellbb/make_defs.mk
Normal file
@@ -0,0 +1,98 @@
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
# Declare the name of the current configuration and add it to the
|
||||
# running list of configurations included by common.mk.
|
||||
THIS_CONFIG := haswell
|
||||
#CONFIGS_INCL += $(THIS_CONFIG)
|
||||
|
||||
#
|
||||
# --- Determine the C compiler and related flags ---
|
||||
#
|
||||
|
||||
# NOTE: The build system will append these variables with various
|
||||
# general-purpose/configuration-agnostic flags in common.mk. You
|
||||
# may specify additional flags here as needed.
|
||||
CPPROCFLAGS :=
|
||||
CMISCFLAGS :=
|
||||
CPICFLAGS :=
|
||||
CWARNFLAGS :=
|
||||
|
||||
ifneq ($(DEBUG_TYPE),off)
|
||||
CDBGFLAGS := -g
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell
|
||||
ifeq ($(GCC_OT_4_9_0),yes)
|
||||
# If gcc is older than 4.9.0, we must use a different label for -march.
|
||||
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2
|
||||
endif
|
||||
else
|
||||
ifeq ($(CC_VENDOR),icc)
|
||||
CKVECFLAGS := -xCORE-AVX2
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell
|
||||
else
|
||||
$(error gcc, icc, or clang is required for this configuration.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
$(eval $(call store-make-defs,$(THIS_CONFIG)))
|
||||
|
||||
@@ -86,6 +86,7 @@ INSERT_GENTDEF( packm )
|
||||
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
|
||||
@@ -44,6 +44,7 @@
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len, \
|
||||
@@ -73,36 +74,51 @@ void PASTEMAC(ch,opname) \
|
||||
the outer (panel_dim_max - panel_dim) rows or columns of the
|
||||
micropanel. (Note that these rows/columns correspond to values
|
||||
beyond the edge of matrix A.) The kernel intrinsically knows its
|
||||
own panel_dim_max, since that corresponds to the kernel's register
|
||||
blocksize. However, we need to pass in panel_len_max because the
|
||||
bottom-right edge case of trsm_lu will need all elements above the
|
||||
extended diagonal and beyond (to the right of) the bottom-right
|
||||
element to be initialized to zero so the trsm portion of the
|
||||
computational kernel will operate with zeros for those iterations.
|
||||
own panel_dim_max, since that corresponds to the packm kernel's
|
||||
leading dimension. However, we *do* need to pass in panel_len_max
|
||||
because the bottom-right edge case of trsm_lu will need all
|
||||
elements above the extended diagonal and beyond (to the right of)
|
||||
the bottom-right element to be initialized to zero so the trsm
|
||||
portion of the computational kernel will operate with zeros for
|
||||
those iterations.
|
||||
|
||||
As an example, if trsm_lu is executed on a 6x6 matrix, and the
|
||||
gemmtrsm kernel uses MR = 6, the computation will begin with the
|
||||
edge case, which is the bottom 2x2 matrix marked with x's. Code
|
||||
in bli_packm_tri_cxk() will extend the diagonal as identity into
|
||||
the remaining portion of the micropanel. But before that happens,
|
||||
the packm kernel must have set the 0's shown below. (Unreferenced
|
||||
elements are marked with '.'.)
|
||||
For example, if trsm_lu is executed on a 10x10 triangular matrix,
|
||||
and the gemmtrsm kernel uses MR = 6, the computation will begin
|
||||
with the edge case, which is the bottom-right 4x4 upper triangular
|
||||
matrix. Code in bli_packm_tri_cxk() will extend the diagonal as
|
||||
identity into the remaining portion of the micropanel. But before
|
||||
that happens, the packm kernel must have set the 0's added in
|
||||
step (3) below.
|
||||
|
||||
x x 0 0 0 0
|
||||
. x 0 0 0 0
|
||||
. . 1 0 0 0
|
||||
. . . 1 0 0
|
||||
. . . . 1 0
|
||||
. . . . . 1
|
||||
packm kernel packm kernel packm kernel packm_tri_cxk
|
||||
step 1: step 2: step 3: step 4:
|
||||
|
||||
In this case, panel_dim will be 2 because two rows of data are
|
||||
copied from A, panel_len will be 2 because those two rows span
|
||||
two columns of A, and panel_len_max will be 6 because there are a
|
||||
total of 6 columns that can be written to, 4 of which lie beyond
|
||||
the values copied from A. */ \
|
||||
x x x x . . x x x x . . x x x x 0 0 x x x x 0 0
|
||||
? x x x . . ? x x x . . ? x x x 0 0 ? x x x 0 0
|
||||
? ? x x . . -> ? ? x x . . -> ? ? x x 0 0 -> ? ? x x 0 0
|
||||
? ? ? x . . ? ? ? x . . ? ? ? x 0 0 ? ? ? x 0 0
|
||||
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
|
||||
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
|
||||
|
||||
x Copied from A; valid element.
|
||||
? Copied from A, but value is unknown and unused.
|
||||
. Uninitialized.
|
||||
0 Initialized to zero.
|
||||
1 Initialized to one.
|
||||
|
||||
NOTE: In step 5 (not shown), bli_packm_tri_cxk() sets the ?'s
|
||||
to zero. This is not needed to support trsm, but rather to
|
||||
support trmm. (Both use the same packing format and code.)
|
||||
|
||||
In this case, panel_dim will be 4 because four rows of data are
|
||||
copied from A, panel_len will be 4 because those four rows span
|
||||
four columns of A, and panel_len_max will be 6 because there are a
|
||||
total of 6 columns that can be written to in the packed micropanel,
|
||||
2 of which lie beyond the values copied from A. */ \
|
||||
f \
|
||||
( \
|
||||
conja, \
|
||||
schema, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
panel_len_max, \
|
||||
|
||||
@@ -39,6 +39,7 @@
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len, \
|
||||
|
||||
@@ -261,7 +261,7 @@ siz_t bli_packm_init_pack
|
||||
bli_obj_toggle_uplo( p );
|
||||
}
|
||||
|
||||
// If we are packing micro-panels, mark P as dense. Otherwise, we are
|
||||
// If we are packing micropanels, mark P as dense. Otherwise, we are
|
||||
// probably being called in the context of a level-2 operation, in
|
||||
// which case we do not want to overwrite the uplo field of P (inherited
|
||||
// from A) with BLIS_DENSE because that information may be needed by
|
||||
@@ -368,28 +368,28 @@ siz_t bli_packm_init_pack
|
||||
// default (logical) blocksize multiple in the m dimension.
|
||||
m_panel = bmult_m_def;
|
||||
|
||||
// The "column stride" of a row panel packed object is interpreted as
|
||||
// the column stride WITHIN a panel. Thus, this is equal to the
|
||||
// packing (storage) blocksize multiple (which may be equal to the
|
||||
// default (logical) blocksize multiple.
|
||||
// The "column stride" of a row-micropanel packed object is interpreted
|
||||
// as the column stride WITHIN a micropanel. Thus, this is equal to the
|
||||
// packing (storage) blocksize multiple, which may be equal to the
|
||||
// default (logical) blocksize multiple.
|
||||
cs_p = bmult_m_pack;
|
||||
|
||||
// The "row stride" of a row panel packed object is interpreted
|
||||
// as the row stride WITHIN a panel. Thus, it is unit.
|
||||
// The "row stride" of a row-micropanel packed object is interpreted
|
||||
// as the row stride WITHIN a micropanel. Thus, it is unit.
|
||||
rs_p = 1;
|
||||
|
||||
// The "panel stride" of a panel packed object is interpreted as the
|
||||
// distance between the (0,0) element of panel k and the (0,0)
|
||||
// The "panel stride" of a micropanel packed object is interpreted as
|
||||
// the distance between the (0,0) element of panel k and the (0,0)
|
||||
// element of panel k+1. We use the padded width computed above to
|
||||
// allow for zero-padding (if necessary/desired) along the far end
|
||||
// of each panel (ie: the right edge of the matrix). Zero-padding
|
||||
// can also occur along the long edge of the last panel if the m
|
||||
// of each micropanel (ie: the right edge of the matrix). Zero-padding
|
||||
// can also occur along the long edge of the last micropanel if the m
|
||||
// dimension of the matrix is not a whole multiple of MR.
|
||||
ps_p = cs_p * n_p_pad;
|
||||
|
||||
// As a general rule, we don't want panel strides to be odd. This
|
||||
// As a general rule, we don't want micropanel strides to be odd. This
|
||||
// is primarily motivated by our desire to support interleaved 3m
|
||||
// micro-panels, in which case we have to scale the panel stride
|
||||
// micropanels, in which case we have to scale the panel stride
|
||||
// by 3/2. That division by 2 means the numerator (prior to being
|
||||
// scaled by 3) must be even.
|
||||
if ( bli_is_odd( ps_p ) ) ps_p += 1;
|
||||
@@ -399,7 +399,7 @@ siz_t bli_packm_init_pack
|
||||
|
||||
// Here, we adjust the panel stride, if necessary. Remember: ps_p is
|
||||
// always interpreted as being in units of the datatype of the object
|
||||
// which is not necessarily how the micro-panels will be stored. For
|
||||
// which is not necessarily how the micropanels will be stored. For
|
||||
// interleaved 3m, we will increase ps_p by 50%, and for ro/io/rpi,
|
||||
// we halve ps_p. Why? Because the macro-kernel indexes in units of
|
||||
// the complex datatype. So these changes "trick" it into indexing
|
||||
@@ -418,11 +418,11 @@ siz_t bli_packm_init_pack
|
||||
// If it is indeed odd, we nudge it higher.
|
||||
if ( bli_is_odd( ps_p ) ) ps_p += 1;
|
||||
|
||||
// Despite the fact that the packed micro-panels will contain
|
||||
// Despite the fact that the packed micropanels will contain
|
||||
// real elements, the panel stride that we store in the obj_t
|
||||
// (which is passed into the macro-kernel) needs to be in units
|
||||
// of complex elements, since the macro-kernel will index through
|
||||
// micro-panels via complex pointer arithmetic for trmm/trsm.
|
||||
// micropanels via complex pointer arithmetic for trmm/trsm.
|
||||
// Since the indexing "increment" will be twice as large as each
|
||||
// actual stored element, we divide the panel_stride by 2.
|
||||
ps_p = ps_p / 2;
|
||||
@@ -431,10 +431,10 @@ siz_t bli_packm_init_pack
|
||||
// Set the imaginary stride (in units of fundamental elements) for
|
||||
// 3m and 4m (separated or interleaved). We use ps_p_orig since
|
||||
// that variable tracks the number of real part elements contained
|
||||
// within each micro-panel of the source matrix. Therefore, this
|
||||
// within each micropanel of the source matrix. Therefore, this
|
||||
// is the number of real elements that must be traversed before
|
||||
// reaching the imaginary part (3mi/4mi) of the packed micro-panel,
|
||||
// or the real part of the next micro-panel (3ms).
|
||||
// reaching the imaginary part (3mi/4mi) of the packed micropanel,
|
||||
// or the real part of the next micropanel (3ms).
|
||||
if ( bli_is_3mi_packed( schema ) ) is_p = ps_p_orig;
|
||||
else if ( bli_is_4mi_packed( schema ) ) is_p = ps_p_orig;
|
||||
else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( m_p_pad / m_panel );
|
||||
@@ -461,28 +461,29 @@ siz_t bli_packm_init_pack
|
||||
// default (logical) blocksize multiple in the n dimension.
|
||||
n_panel = bmult_n_def;
|
||||
|
||||
// The "row stride" of a column panel packed object is interpreted as
|
||||
// the row stride WITHIN a panel. Thus, this is equal to the
|
||||
// The "row stride" of a column-micropanel packed object is interpreted
|
||||
// as the row stride WITHIN a micropanel. Thus, this is equal to the
|
||||
// packing (storage) blocksize multiple (which may be equal to the
|
||||
// default (logical) blocksize multiple).
|
||||
rs_p = bmult_n_pack;
|
||||
|
||||
// The "column stride" of a column panel packed object is interpreted
|
||||
// as the column stride WITHIN a panel. Thus, it is unit.
|
||||
// The "column stride" of a column-micropanel packed object is
|
||||
// interpreted as the column stride WITHIN a micropanel. Thus, it is
|
||||
// unit.
|
||||
cs_p = 1;
|
||||
|
||||
// The "panel stride" of a panel packed object is interpreted as the
|
||||
// distance between the (0,0) element of panel k and the (0,0)
|
||||
// The "panel stride" of a micropanel packed object is interpreted as
|
||||
// the distance between the (0,0) element of panel k and the (0,0)
|
||||
// element of panel k+1. We use the padded length computed above to
|
||||
// allow for zero-padding (if necessary/desired) along the far end
|
||||
// of each panel (ie: the bottom edge of the matrix). Zero-padding
|
||||
// can also occur along the long edge of the last panel if the n
|
||||
// of each micropanel (ie: the bottom edge of the matrix). Zero-padding
|
||||
// can also occur along the long edge of the last micropanel if the n
|
||||
// dimension of the matrix is not a whole multiple of NR.
|
||||
ps_p = m_p_pad * rs_p;
|
||||
|
||||
// As a general rule, we don't want panel strides to be odd. This
|
||||
// As a general rule, we don't want micropanel strides to be odd. This
|
||||
// is primarily motivated by our desire to support interleaved 3m
|
||||
// micro-panels, in which case we have to scale the panel stride
|
||||
// micropanels, in which case we have to scale the panel stride
|
||||
// by 3/2. That division by 2 means the numerator (prior to being
|
||||
// scaled by 3) must be even.
|
||||
if ( bli_is_odd( ps_p ) ) ps_p += 1;
|
||||
@@ -492,7 +493,7 @@ siz_t bli_packm_init_pack
|
||||
|
||||
// Here, we adjust the panel stride, if necessary. Remember: ps_p is
|
||||
// always interpreted as being in units of the datatype of the object
|
||||
// which is not necessarily how the micro-panels will be stored. For
|
||||
// which is not necessarily how the micropanels will be stored. For
|
||||
// interleaved 3m, we will increase ps_p by 50%, and for ro/io/rpi,
|
||||
// we halve ps_p. Why? Because the macro-kernel indexes in units of
|
||||
// the complex datatype. So these changes "trick" it into indexing
|
||||
@@ -511,11 +512,11 @@ siz_t bli_packm_init_pack
|
||||
// If it is indeed odd, we nudge it higher.
|
||||
if ( bli_is_odd( ps_p ) ) ps_p += 1;
|
||||
|
||||
// Despite the fact that the packed micro-panels will contain
|
||||
// Despite the fact that the packed micropanels will contain
|
||||
// real elements, the panel stride that we store in the obj_t
|
||||
// (which is passed into the macro-kernel) needs to be in units
|
||||
// of complex elements, since the macro-kernel will index through
|
||||
// micro-panels via complex pointer arithmetic for trmm/trsm.
|
||||
// micropanels via complex pointer arithmetic for trmm/trsm.
|
||||
// Since the indexing "increment" will be twice as large as each
|
||||
// actual stored element, we divide the panel_stride by 2.
|
||||
ps_p = ps_p / 2;
|
||||
@@ -524,10 +525,10 @@ siz_t bli_packm_init_pack
|
||||
// Set the imaginary stride (in units of fundamental elements) for
|
||||
// 3m and 4m (separated or interleaved). We use ps_p_orig since
|
||||
// that variable tracks the number of real part elements contained
|
||||
// within each micro-panel of the source matrix. Therefore, this
|
||||
// within each micropanel of the source matrix. Therefore, this
|
||||
// is the number of real elements that must be traversed before
|
||||
// reaching the imaginary part (3mi/4mi) of the packed micro-panel,
|
||||
// or the real part of the next micro-panel (3ms).
|
||||
// reaching the imaginary part (3mi/4mi) of the packed micropanel,
|
||||
// or the real part of the next micropanel (3ms).
|
||||
if ( bli_is_3mi_packed( schema ) ) is_p = ps_p_orig;
|
||||
else if ( bli_is_4mi_packed( schema ) ) is_p = ps_p_orig;
|
||||
else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( n_p_pad / n_panel );
|
||||
|
||||
@@ -100,6 +100,7 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
conjc, \
|
||||
schema, \
|
||||
panel_dim, \
|
||||
panel_dim_max, \
|
||||
panel_len, \
|
||||
@@ -338,6 +339,7 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
conjc, \
|
||||
schema, \
|
||||
panel_dim, \
|
||||
panel_dim_max, \
|
||||
panel_len, \
|
||||
@@ -436,6 +438,7 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
conjc10, \
|
||||
schema, \
|
||||
p10_dim, \
|
||||
panel_dim_max, \
|
||||
p10_len, \
|
||||
@@ -455,6 +458,7 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
conjc12, \
|
||||
schema, \
|
||||
p12_dim, \
|
||||
panel_dim_max, \
|
||||
p12_len, \
|
||||
@@ -561,6 +565,7 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
conjc, \
|
||||
schema, \
|
||||
panel_dim, \
|
||||
panel_dim_max, \
|
||||
panel_len, \
|
||||
|
||||
@@ -108,9 +108,9 @@ void bli_gemm_front
|
||||
// schemas, as are contexts for 1m, and if necessary bli_gemm_md() would
|
||||
// have made a copy and modified the schemas, so reading them from the
|
||||
// context should be a safe bet at this point.) This is a sort of hack for
|
||||
// communicating the desired pack schemas for to bli_gemm_cntl_create()
|
||||
// (via bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows
|
||||
// us to subsequently access the schemas from the control tree, which
|
||||
// communicating the desired pack schemas to bli_gemm_cntl_create() (via
|
||||
// bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows us
|
||||
// to subsequently access the schemas from the control tree, which
|
||||
// hopefully reduces some confusion, particularly in bli_packm_init().
|
||||
const pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
const pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
@@ -69,10 +69,44 @@ void bli_hemm_front
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
#ifdef BLIS_DISABLE_HEMM_RIGHT
|
||||
// NOTE: This case casts right-side hemm/symm in terms of left side. This
|
||||
// is necessary when the current subconfiguration uses a gemm microkernel
|
||||
// that assumes that the packing kernel will have already duplicated
|
||||
// (broadcast) elements of B in the packed copy of B. Supporting
|
||||
// duplication within the logic that packs micropanels from Hermitian/
|
||||
// symmetric matrices would be ugly, and so we simply don't support it.
|
||||
// As a consequence, those subconfigurations need a way to force the
|
||||
// Hermitian/symmetric matrix to be on the left (and thus the general
|
||||
// matrix to be on the right). So our solution is that in those cases,
|
||||
// the subconfigurations simply #define BLIS_DISABLE_HEMM_RIGHT.
|
||||
|
||||
// If A is being multiplied from the right, transpose all operands
|
||||
// so that we can perform the computation as if A were being multiplied
|
||||
// from the left.
|
||||
if ( bli_is_right( side ) )
|
||||
{
|
||||
bli_toggle_side( &side );
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_induce_trans( &b_local );
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
#else
|
||||
// NOTE: This case computes right-side hemm/symm natively by packing
|
||||
// elements of the Hermitian/symmetric matrix A to micropanels of the
|
||||
// right-hand packed matrix operand "B", and elements of the general
|
||||
// matrix B to micropanels of the left-hand packed matrix operand "A".
|
||||
// This code path always gives us the opportunity to transpose the
|
||||
// entire operation so that the effective storage format of the output
|
||||
// matrix matches the microkernel's output preference. Thus, from a
|
||||
// performance perspective, this case is preferred.
|
||||
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( !bli_obj_is_1x1( &c_local ) )
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_toggle_side( &side );
|
||||
@@ -81,12 +115,21 @@ void bli_hemm_front
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
// Swap A and B if multiplying A from the right so that "B" contains
|
||||
// the Hermitian matrix.
|
||||
// If the Hermitian/symmetric matrix A is being multiplied from the right,
|
||||
// swap A and B so that the Hermitian/symmetric matrix will actually be on
|
||||
// the right.
|
||||
if ( bli_is_right( side ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
}
|
||||
#endif
|
||||
|
||||
// Set each alias as the root object.
|
||||
// NOTE: We MUST wait until we are done potentially swapping the objects
|
||||
// before setting the root fields!
|
||||
bli_obj_set_as_root( &a_local );
|
||||
bli_obj_set_as_root( &b_local );
|
||||
bli_obj_set_as_root( &c_local );
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
|
||||
@@ -69,10 +69,44 @@ void bli_symm_front
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
#ifdef BLIS_DISABLE_HEMM_RIGHT
|
||||
// NOTE: This case casts right-side hemm/symm in terms of left side. This
|
||||
// is necessary when the current subconfiguration uses a gemm microkernel
|
||||
// that assumes that the packing kernel will have already duplicated
|
||||
// (broadcast) elements of B in the packed copy of B. Supporting
|
||||
// duplication within the logic that packs micropanels from Hermitian/
|
||||
// symmetric matrices would be ugly, and so we simply don't support it.
|
||||
// As a consequence, those subconfigurations need a way to force the
|
||||
// Hermitian/symmetric matrix to be on the left (and thus the general
|
||||
// matrix to be on the right). So our solution is that in those cases,
|
||||
// the subconfigurations simply #define BLIS_DISABLE_HEMM_RIGHT.
|
||||
|
||||
// If A is being multiplied from the right, transpose all operands
|
||||
// so that we can perform the computation as if A were being multiplied
|
||||
// from the left.
|
||||
if ( bli_is_right( side ) )
|
||||
{
|
||||
bli_toggle_side( &side );
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_induce_trans( &b_local );
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
#else
|
||||
// NOTE: This case computes right-side hemm/symm natively by packing
|
||||
// elements of the Hermitian/symmetric matrix A to micropanels of the
|
||||
// right-hand packed matrix operand "B", and elements of the general
|
||||
// matrix B to micropanels of the left-hand packed matrix operand "A".
|
||||
// This code path always gives us the opportunity to transpose the
|
||||
// entire operation so that the effective storage format of the output
|
||||
// matrix matches the microkernel's output preference. Thus, from a
|
||||
// performance perspective, this case is preferred.
|
||||
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( !bli_obj_is_1x1( &c_local ) )
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_toggle_side( &side );
|
||||
@@ -80,12 +114,21 @@ void bli_symm_front
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
// Swap A and B if multiplying A from the right so that "B" contains
|
||||
// the symmetric matrix.
|
||||
// If the Hermitian/symmetric matrix A is being multiplied from the right,
|
||||
// swap A and B so that the Hermitian/symmetric matrix will actually be on
|
||||
// the right.
|
||||
if ( bli_is_right( side ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
}
|
||||
#endif
|
||||
|
||||
// Set each alias as the root object.
|
||||
// NOTE: We MUST wait until we are done potentially swapping the objects
|
||||
// before setting the root fields!
|
||||
bli_obj_set_as_root( &a_local );
|
||||
bli_obj_set_as_root( &b_local );
|
||||
bli_obj_set_as_root( &c_local );
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
|
||||
@@ -389,6 +389,7 @@ pool_t* bli_apool_array_elem
|
||||
const siz_t num_blocks = 1;
|
||||
const siz_t block_ptrs_len = 25;
|
||||
const siz_t align_size = 16;
|
||||
const siz_t offset_size = 0;
|
||||
malloc_ft malloc_fp = BLIS_MALLOC_POOL;
|
||||
free_ft free_fp = BLIS_FREE_POOL;
|
||||
|
||||
@@ -425,6 +426,7 @@ pool_t* bli_apool_array_elem
|
||||
block_ptrs_len,
|
||||
block_size,
|
||||
align_size,
|
||||
offset_size,
|
||||
malloc_fp,
|
||||
free_fp,
|
||||
pool
|
||||
|
||||
@@ -654,6 +654,128 @@ void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... )
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... )
|
||||
{
|
||||
// This function can be called from the bli_cntx_init_*() function for
|
||||
// a particular architecture if the kernel developer wishes to use
|
||||
// non-default level-3 virtual microkernels. It should be called after
|
||||
// bli_cntx_init_defaults() so that the context begins with default
|
||||
// microkernels across all datatypes.
|
||||
|
||||
/* Example prototypes:
|
||||
|
||||
void bli_cntx_set_l3_vir_ukrs
|
||||
(
|
||||
dim_t n_ukrs,
|
||||
l3ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp,
|
||||
l3ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp,
|
||||
l3ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp,
|
||||
...
|
||||
cntx_t* cntx
|
||||
);
|
||||
*/
|
||||
|
||||
va_list args;
|
||||
dim_t i;
|
||||
|
||||
// Allocate temporary arrays with one entry per (id, datatype, fp) tuple.
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_cntx_set_l3_vir_ukrs(): " );
|
||||
#endif
|
||||
l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ) );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_cntx_set_l3_vir_ukrs(): " );
|
||||
#endif
|
||||
num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ) );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_cntx_set_l3_vir_ukrs(): " );
|
||||
#endif
|
||||
void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ) );
|
||||
|
||||
// -- Begin variable argument section --
|
||||
|
||||
// Initialize variable argument environment.
|
||||
va_start( args, n_ukrs );
|
||||
|
||||
// Process n_ukrs tuples.
|
||||
for ( i = 0; i < n_ukrs; ++i )
|
||||
{
|
||||
// Here, we query the variable argument list for:
|
||||
// - the l3ukr_t of the kernel we're about to process,
|
||||
// - the datatype of the kernel, and
|
||||
// - the kernel function pointer
|
||||
// that we need to store to the context.
|
||||
const l3ukr_t ukr_id = ( l3ukr_t )va_arg( args, l3ukr_t );
|
||||
const num_t ukr_dt = ( num_t )va_arg( args, num_t );
|
||||
void_fp ukr_fp = ( void_fp )va_arg( args, void_fp );
|
||||
|
||||
// Store the values in our temporary arrays.
|
||||
ukr_ids[ i ] = ukr_id;
|
||||
ukr_dts[ i ] = ukr_dt;
|
||||
ukr_fps[ i ] = ukr_fp;
|
||||
}
|
||||
|
||||
// The argument after the n_ukrs tuples should be the context pointer.
|
||||
cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );
|
||||
|
||||
// Shutdown variable argument environment and clean up stack.
|
||||
va_end( args );
|
||||
|
||||
// -- End variable argument section --
|
||||
|
||||
// Query the context for the addresses of:
|
||||
// - the l3 virtual ukernel func_t array
|
||||
func_t* cntx_l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx );
|
||||
|
||||
// Now that we have the context address, we want to copy the values
|
||||
// from the temporary buffers into the corresponding buffers in the
|
||||
// context.
|
||||
|
||||
// Process each ukernel tuple provided.
|
||||
for ( i = 0; i < n_ukrs; ++i )
|
||||
{
|
||||
// Read the current ukernel id, ukernel datatype, and ukernel
|
||||
// function pointer.
|
||||
const l3ukr_t ukr_id = ukr_ids[ i ];
|
||||
const num_t ukr_dt = ukr_dts[ i ];
|
||||
void_fp ukr_fp = ukr_fps[ i ];
|
||||
|
||||
// Index into the virtual ukernel func_t array for the current
|
||||
// kernel id being processed.
|
||||
func_t* vukrs = &cntx_l3_vir_ukrs[ ukr_id ];
|
||||
|
||||
// Store the ukernel function pointer into the virtual ukernel
|
||||
// func_t for the given datatype. Note that only the virtual
|
||||
// ukernel slot is written here: this function does not touch
|
||||
// any native ukernel slot, nor does it read or store a ukernel
|
||||
// preference. (NOTE(review): bli_func_set_dt() is assumed to
|
||||
// set the func_t entry that corresponds to ukr_dt; confirm
|
||||
// against its definition.)
|
||||
bli_func_set_dt( ukr_fp, ukr_dt, vukrs );
|
||||
}
|
||||
|
||||
// Free the temporary local arrays.
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_cntx_set_l3_vir_ukrs(): " );
|
||||
#endif
|
||||
bli_free_intl( ukr_ids );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_cntx_set_l3_vir_ukrs(): " );
|
||||
#endif
|
||||
bli_free_intl( ukr_dts );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_cntx_set_l3_vir_ukrs(): " );
|
||||
#endif
|
||||
bli_free_intl( ukr_fps );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... )
|
||||
{
|
||||
// This function can be called from the bli_cntx_init_*() function for
|
||||
|
||||
@@ -738,6 +738,7 @@ BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... );
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... );
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... );
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... );
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... );
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... );
|
||||
|
||||
@@ -43,25 +43,32 @@
|
||||
static char* bli_version_str = BLIS_VERSION_STRING;
|
||||
static char* bli_int_type_size_str = STRINGIFY_INT( BLIS_INT_TYPE_SIZE );
|
||||
|
||||
char* bli_info_get_version_str( void ) { return bli_version_str; }
|
||||
char* bli_info_get_int_type_size_str( void ) { return bli_int_type_size_str; }
|
||||
char* bli_info_get_version_str( void ) { return bli_version_str; }
|
||||
char* bli_info_get_int_type_size_str( void ) { return bli_int_type_size_str; }
|
||||
|
||||
|
||||
|
||||
// -- General configuration-related --------------------------------------------
|
||||
|
||||
gint_t bli_info_get_int_type_size( void ) { return BLIS_INT_TYPE_SIZE; }
|
||||
gint_t bli_info_get_num_fp_types( void ) { return BLIS_NUM_FP_TYPES; }
|
||||
gint_t bli_info_get_max_type_size( void ) { return BLIS_MAX_TYPE_SIZE; }
|
||||
gint_t bli_info_get_page_size( void ) { return BLIS_PAGE_SIZE; }
|
||||
gint_t bli_info_get_simd_num_registers( void ) { return BLIS_SIMD_NUM_REGISTERS; }
|
||||
gint_t bli_info_get_simd_size( void ) { return BLIS_SIMD_SIZE; }
|
||||
gint_t bli_info_get_simd_align_size( void ) { return BLIS_SIMD_ALIGN_SIZE; }
|
||||
gint_t bli_info_get_stack_buf_max_size( void ) { return BLIS_STACK_BUF_MAX_SIZE; }
|
||||
gint_t bli_info_get_stack_buf_align_size( void ) { return BLIS_STACK_BUF_ALIGN_SIZE; }
|
||||
gint_t bli_info_get_heap_addr_align_size( void ) { return BLIS_HEAP_ADDR_ALIGN_SIZE; }
|
||||
gint_t bli_info_get_heap_stride_align_size( void ) { return BLIS_HEAP_STRIDE_ALIGN_SIZE; }
|
||||
gint_t bli_info_get_pool_addr_align_size( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE; }
|
||||
gint_t bli_info_get_int_type_size( void ) { return BLIS_INT_TYPE_SIZE; }
|
||||
gint_t bli_info_get_num_fp_types( void ) { return BLIS_NUM_FP_TYPES; }
|
||||
gint_t bli_info_get_max_type_size( void ) { return BLIS_MAX_TYPE_SIZE; }
|
||||
gint_t bli_info_get_page_size( void ) { return BLIS_PAGE_SIZE; }
|
||||
gint_t bli_info_get_simd_num_registers( void ) { return BLIS_SIMD_NUM_REGISTERS; }
|
||||
gint_t bli_info_get_simd_size( void ) { return BLIS_SIMD_SIZE; }
|
||||
gint_t bli_info_get_simd_align_size( void ) { return BLIS_SIMD_ALIGN_SIZE; }
|
||||
gint_t bli_info_get_stack_buf_max_size( void ) { return BLIS_STACK_BUF_MAX_SIZE; }
|
||||
gint_t bli_info_get_stack_buf_align_size( void ) { return BLIS_STACK_BUF_ALIGN_SIZE; }
|
||||
gint_t bli_info_get_heap_addr_align_size( void ) { return BLIS_HEAP_ADDR_ALIGN_SIZE; }
|
||||
gint_t bli_info_get_heap_stride_align_size( void ) { return BLIS_HEAP_STRIDE_ALIGN_SIZE; }
|
||||
gint_t bli_info_get_pool_addr_align_size_a( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE_A; }
|
||||
gint_t bli_info_get_pool_addr_align_size_b( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE_B; }
|
||||
gint_t bli_info_get_pool_addr_align_size_c( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE_C; }
|
||||
gint_t bli_info_get_pool_addr_align_size_gen( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE_GEN; }
|
||||
gint_t bli_info_get_pool_addr_offset_size_a( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_A; }
|
||||
gint_t bli_info_get_pool_addr_offset_size_b( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_B; }
|
||||
gint_t bli_info_get_pool_addr_offset_size_c( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_C; }
|
||||
gint_t bli_info_get_pool_addr_offset_size_gen( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_GEN; }
|
||||
gint_t bli_info_get_enable_stay_auto_init( void )
|
||||
{
|
||||
#ifdef BLIS_ENABLE_STAY_AUTO_INITIALIZED
|
||||
|
||||
@@ -53,7 +53,14 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void );
|
||||
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void );
|
||||
|
||||
@@ -52,11 +52,12 @@ void bli_membrk_init
|
||||
{
|
||||
membrk_t* restrict membrk = bli_membrk_query();
|
||||
|
||||
const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;
|
||||
const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE_GEN;
|
||||
malloc_ft malloc_fp = BLIS_MALLOC_POOL;
|
||||
free_ft free_fp = BLIS_FREE_POOL;
|
||||
|
||||
// These fields are used for general-purpose allocation.
|
||||
// These fields are used for general-purpose allocation (ie: buf_type
|
||||
// equal to BLIS_BUFFER_FOR_GEN_USE) within bli_membrk_acquire_m().
|
||||
bli_membrk_set_align_size( align_size, membrk );
|
||||
bli_membrk_set_malloc_fp( malloc_fp, membrk );
|
||||
bli_membrk_set_free_fp( free_fp, membrk );
|
||||
@@ -348,8 +349,15 @@ void bli_membrk_init_pools
|
||||
const dim_t block_ptrs_len_b = 80;
|
||||
const dim_t block_ptrs_len_c = 0;
|
||||
|
||||
// Use the address alignment size designated (at configure-time) for pools.
|
||||
const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;
|
||||
// Use the address alignment sizes designated (at configure-time) for pools.
|
||||
const siz_t align_size_a = BLIS_POOL_ADDR_ALIGN_SIZE_A;
|
||||
const siz_t align_size_b = BLIS_POOL_ADDR_ALIGN_SIZE_B;
|
||||
const siz_t align_size_c = BLIS_POOL_ADDR_ALIGN_SIZE_C;
|
||||
|
||||
// Use the offsets from the above alignments.
|
||||
const siz_t offset_size_a = BLIS_POOL_ADDR_OFFSET_SIZE_A;
|
||||
const siz_t offset_size_b = BLIS_POOL_ADDR_OFFSET_SIZE_B;
|
||||
const siz_t offset_size_c = BLIS_POOL_ADDR_OFFSET_SIZE_C;
|
||||
|
||||
// Use the malloc() and free() designated (at configure-time) for pools.
|
||||
malloc_ft malloc_fp = BLIS_MALLOC_POOL;
|
||||
@@ -362,12 +370,12 @@ void bli_membrk_init_pools
|
||||
cntx );
|
||||
|
||||
// Initialize the memory pools for A, B, and C.
|
||||
bli_pool_init( num_blocks_a, block_ptrs_len_a, block_size_a, align_size,
|
||||
malloc_fp, free_fp, pool_a );
|
||||
bli_pool_init( num_blocks_b, block_ptrs_len_b, block_size_b, align_size,
|
||||
malloc_fp, free_fp, pool_b );
|
||||
bli_pool_init( num_blocks_c, block_ptrs_len_c, block_size_c, align_size,
|
||||
malloc_fp, free_fp, pool_c );
|
||||
bli_pool_init( num_blocks_a, block_ptrs_len_a, block_size_a, align_size_a,
|
||||
offset_size_a, malloc_fp, free_fp, pool_a );
|
||||
bli_pool_init( num_blocks_b, block_ptrs_len_b, block_size_b, align_size_b,
|
||||
offset_size_b, malloc_fp, free_fp, pool_b );
|
||||
bli_pool_init( num_blocks_c, block_ptrs_len_c, block_size_c, align_size_c,
|
||||
offset_size_c, malloc_fp, free_fp, pool_c );
|
||||
}
|
||||
|
||||
void bli_membrk_finalize_pools
|
||||
|
||||
@@ -43,6 +43,7 @@ void bli_pool_init
|
||||
siz_t block_ptrs_len,
|
||||
siz_t block_size,
|
||||
siz_t align_size,
|
||||
siz_t offset_size,
|
||||
malloc_ft malloc_fp,
|
||||
free_ft free_fp,
|
||||
pool_t* restrict pool
|
||||
@@ -67,8 +68,8 @@ void bli_pool_init
|
||||
for ( dim_t i = 0; i < num_blocks; ++i )
|
||||
{
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_pool_init(): allocating block %d of size %d (align %d).\n",
|
||||
( int )i, ( int )block_size, ( int )align_size );
|
||||
printf( "bli_pool_init(): allocating block %d of size %d (align %d, offset %d).\n",
|
||||
( int )i, ( int )block_size, ( int )align_size, ( int )offset_size );
|
||||
fflush( stdout );
|
||||
#endif
|
||||
|
||||
@@ -76,6 +77,7 @@ void bli_pool_init
|
||||
(
|
||||
block_size,
|
||||
align_size,
|
||||
offset_size,
|
||||
malloc_fp,
|
||||
&(block_ptrs[i])
|
||||
);
|
||||
@@ -99,6 +101,7 @@ void bli_pool_init
|
||||
bli_pool_set_num_blocks( num_blocks, pool );
|
||||
bli_pool_set_block_size( block_size, pool );
|
||||
bli_pool_set_align_size( align_size, pool );
|
||||
bli_pool_set_offset_size( offset_size, pool );
|
||||
bli_pool_set_malloc_fp( malloc_fp, pool );
|
||||
bli_pool_set_free_fp( free_fp, pool );
|
||||
}
|
||||
@@ -135,12 +138,16 @@ void bli_pool_finalize
|
||||
free_ft free_fp = bli_pool_free_fp( pool );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_pool_finalize(): freeing %d blocks of size %d (align %d).\n",
|
||||
printf( "bli_pool_finalize(): freeing %d blocks of size %d (align %d, offset %d).\n",
|
||||
( int )num_blocks, ( int )bli_pool_block_size( pool ),
|
||||
( int )bli_pool_align_size( pool ) );
|
||||
( int )bli_pool_align_size( pool ),
|
||||
( int )bli_pool_offset_size( pool ) );
|
||||
fflush( stdout );
|
||||
#endif
|
||||
|
||||
// Query the offset size of the pool.
|
||||
const siz_t offset_size = bli_pool_offset_size( pool );
|
||||
|
||||
// Free the individual blocks currently in the pool.
|
||||
for ( dim_t i = 0; i < num_blocks; ++i )
|
||||
{
|
||||
@@ -148,7 +155,7 @@ void bli_pool_finalize
|
||||
printf( "bli_pool_finalize(): block %d: ", ( int )i );
|
||||
#endif
|
||||
|
||||
bli_pool_free_block( free_fp, &(block_ptrs[i]) );
|
||||
bli_pool_free_block( offset_size, free_fp, &(block_ptrs[i]) );
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
@@ -169,6 +176,7 @@ void bli_pool_finalize
|
||||
bli_pool_set_top_index( 0, pool );
|
||||
bli_pool_set_block_size( 0, pool );
|
||||
bli_pool_set_align_size( 0, pool );
|
||||
bli_pool_set_offset_size( 0, pool );
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -178,6 +186,7 @@ void bli_pool_reinit
|
||||
siz_t block_ptrs_len_new,
|
||||
siz_t block_size_new,
|
||||
siz_t align_size_new,
|
||||
siz_t offset_size_new,
|
||||
pool_t* restrict pool
|
||||
)
|
||||
{
|
||||
@@ -202,6 +211,7 @@ void bli_pool_reinit
|
||||
block_ptrs_len_new,
|
||||
block_size_new,
|
||||
align_size_new,
|
||||
offset_size_new,
|
||||
malloc_fp,
|
||||
free_fp,
|
||||
pool
|
||||
@@ -223,6 +233,7 @@ void bli_pool_checkout_block
|
||||
const siz_t num_blocks_new = bli_pool_num_blocks( pool );
|
||||
const siz_t block_ptrs_len_new = bli_pool_block_ptrs_len( pool );
|
||||
const siz_t align_size_new = bli_pool_align_size( pool );
|
||||
const siz_t offset_size_new = bli_pool_offset_size( pool );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_pool_checkout_block(): old block size %d < req size %d; "
|
||||
@@ -237,6 +248,7 @@ void bli_pool_checkout_block
|
||||
block_ptrs_len_new,
|
||||
req_size,
|
||||
align_size_new,
|
||||
offset_size_new,
|
||||
pool
|
||||
);
|
||||
}
|
||||
@@ -293,10 +305,13 @@ void bli_pool_checkin_block
|
||||
// has since been reinitialized to a different (larger) block size.
|
||||
if ( bli_pblk_block_size( block ) != bli_pool_block_size( pool ) )
|
||||
{
|
||||
// Query the offset size of the pool.
|
||||
const siz_t offset_size = bli_pool_offset_size( pool );
|
||||
|
||||
// Query the free() function pointer for the pool.
|
||||
free_ft free_fp = bli_pool_free_fp( pool );
|
||||
|
||||
bli_pool_free_block( free_fp, block );
|
||||
bli_pool_free_block( offset_size, free_fp, block );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -308,9 +323,10 @@ void bli_pool_checkin_block
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_pool_checkin_block(): checking in block %d of size %d "
|
||||
"(align %d).\n",
|
||||
"(align %d, offset %d).\n",
|
||||
( int )top_index - 1, ( int )bli_pool_block_size( pool ),
|
||||
( int )bli_pool_align_size( pool ) );
|
||||
( int )bli_pool_align_size( pool ),
|
||||
( int )bli_pool_offset_size( pool ) );
|
||||
fflush( stdout );
|
||||
#endif
|
||||
|
||||
@@ -396,8 +412,9 @@ void bli_pool_grow
|
||||
pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool );
|
||||
|
||||
// Query the block size, alignment size, and offset size of the pool.
|
||||
const siz_t block_size = bli_pool_block_size( pool );
|
||||
const siz_t align_size = bli_pool_align_size( pool );
|
||||
const siz_t block_size = bli_pool_block_size( pool );
|
||||
const siz_t align_size = bli_pool_align_size( pool );
|
||||
const siz_t offset_size = bli_pool_offset_size( pool );
|
||||
|
||||
// Query the malloc() function pointer for the pool.
|
||||
malloc_ft malloc_fp = bli_pool_malloc_fp( pool );
|
||||
@@ -415,6 +432,7 @@ void bli_pool_grow
|
||||
(
|
||||
block_size,
|
||||
align_size,
|
||||
offset_size,
|
||||
malloc_fp,
|
||||
&(block_ptrs[i])
|
||||
);
|
||||
@@ -456,13 +474,16 @@ void bli_pool_shrink
|
||||
// Compute the new total number of blocks.
|
||||
const siz_t num_blocks_new = num_blocks - num_blocks_sub;
|
||||
|
||||
// Query the offset size of the pool.
|
||||
const siz_t offset_size = bli_pool_offset_size( pool );
|
||||
|
||||
// Query the free() function pointer for the pool.
|
||||
free_ft free_fp = bli_pool_free_fp( pool );
|
||||
|
||||
// Free the individual blocks.
|
||||
for ( dim_t i = num_blocks_new; i < num_blocks; ++i )
|
||||
{
|
||||
bli_pool_free_block( free_fp, &(block_ptrs[i]) );
|
||||
bli_pool_free_block( offset_size, free_fp, &(block_ptrs[i]) );
|
||||
}
|
||||
|
||||
// Update the pool_t struct.
|
||||
@@ -477,22 +498,25 @@ void bli_pool_alloc_block
|
||||
(
|
||||
siz_t block_size,
|
||||
siz_t align_size,
|
||||
siz_t offset_size,
|
||||
malloc_ft malloc_fp,
|
||||
pblk_t* restrict block
|
||||
)
|
||||
{
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_pool_alloc_block(): calling fmalloc_align(): size %d (align %d)\n",
|
||||
( int )block_size, ( int )align_size );
|
||||
printf( "bli_pool_alloc_block(): calling fmalloc_align(): size %d (align %d, offset %d)\n",
|
||||
( int )block_size, ( int )align_size, ( int )offset_size );
|
||||
fflush( stdout );
|
||||
#endif
|
||||
|
||||
// Allocate the block via the bli_fmalloc_align() wrapper, which performs
|
||||
// alignment logic and opaquely saves the original pointer so that it can
|
||||
// be recovered when it's time to free the block.
|
||||
// be recovered when it's time to free the block. Note that we have to
|
||||
// add offset_size to the number of bytes requested since we will skip
|
||||
// that many bytes at the beginning of the allocated memory.
|
||||
void* restrict buf
|
||||
=
|
||||
bli_fmalloc_align( malloc_fp, block_size, align_size );
|
||||
bli_fmalloc_align( malloc_fp, block_size + offset_size, align_size );
|
||||
|
||||
#if 0
|
||||
// NOTE: This code is disabled because it is not needed, since
|
||||
@@ -517,6 +541,9 @@ void bli_pool_alloc_block
|
||||
}
|
||||
#endif
|
||||
|
||||
// Advance the pointer by offset_size bytes.
|
||||
buf = ( void* )( ( char* )buf + offset_size );
|
||||
|
||||
// Save the results in the pblk_t structure.
|
||||
bli_pblk_set_buf( buf, block );
|
||||
bli_pblk_set_block_size( block_size, block );
|
||||
@@ -524,6 +551,7 @@ void bli_pool_alloc_block
|
||||
|
||||
void bli_pool_free_block
|
||||
(
|
||||
siz_t offset_size,
|
||||
free_ft free_fp,
|
||||
pblk_t* restrict block
|
||||
)
|
||||
@@ -538,6 +566,10 @@ void bli_pool_free_block
|
||||
// bli_fmalloc_align() when the block was allocated.
|
||||
void* restrict buf = bli_pblk_buf( block );
|
||||
|
||||
// Undo the pointer advancement by offset_size bytes performed previously
|
||||
// by bli_pool_alloc_block().
|
||||
buf = ( void* )( ( char* )buf - offset_size );
|
||||
|
||||
// Free the block via the bli_ffree_align() wrapper, which recovers the
|
||||
// original pointer that was returned by the pool's malloc() function when
|
||||
// the block was allocated.
|
||||
@@ -555,7 +587,7 @@ void bli_pool_print
|
||||
siz_t num_blocks = bli_pool_num_blocks( pool );
|
||||
siz_t block_size = bli_pool_block_size( pool );
|
||||
siz_t align_size = bli_pool_align_size( pool );
|
||||
dim_t i;
|
||||
siz_t offset_size = bli_pool_offset_size( pool );
|
||||
|
||||
printf( "pool struct ---------------\n" );
|
||||
printf( " block_ptrs: %p\n", block_ptrs );
|
||||
@@ -564,8 +596,10 @@ void bli_pool_print
|
||||
printf( " num_blocks: %d\n", ( int )num_blocks );
|
||||
printf( " block_size: %d\n", ( int )block_size );
|
||||
printf( " align_size: %d\n", ( int )align_size );
|
||||
printf( " offset_size: %d\n", ( int )offset_size );
|
||||
printf( " pblks sys align\n" );
|
||||
for ( i = 0; i < num_blocks; ++i )
|
||||
|
||||
for ( dim_t i = 0; i < num_blocks; ++i )
|
||||
{
|
||||
printf( " %d: %p\n", ( int )i, bli_pblk_buf( &block_ptrs[i] ) );
|
||||
}
|
||||
|
||||
@@ -126,6 +126,11 @@ static siz_t bli_pool_align_size( pool_t* pool )
|
||||
return pool->align_size;
|
||||
}
|
||||
|
||||
// Return the value currently stored in the offset_size field of the given
// pool.
static siz_t bli_pool_offset_size( pool_t* pool )
{
	const siz_t offset_size = pool->offset_size;

	return offset_size;
}
|
||||
|
||||
static malloc_ft bli_pool_malloc_fp( pool_t* pool )
|
||||
{
|
||||
return pool->malloc_fp;
|
||||
@@ -174,6 +179,11 @@ static void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \
|
||||
pool->align_size = align_size;
|
||||
}
|
||||
|
||||
// Store offset_size in the offset_size field of the given pool. No other
// field of the pool is modified.
static void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool )
{
	pool->offset_size = offset_size;
}
|
||||
|
||||
static void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \
|
||||
{
|
||||
pool->malloc_fp = malloc_fp;
|
||||
@@ -197,6 +207,7 @@ void bli_pool_init
|
||||
siz_t block_ptrs_len,
|
||||
siz_t block_size,
|
||||
siz_t align_size,
|
||||
siz_t offset_size,
|
||||
malloc_ft malloc_fp,
|
||||
free_ft free_fp,
|
||||
pool_t* restrict pool
|
||||
@@ -211,6 +222,7 @@ void bli_pool_reinit
|
||||
siz_t block_ptrs_len_new,
|
||||
siz_t block_size_new,
|
||||
siz_t align_size_new,
|
||||
siz_t offset_size_new,
|
||||
pool_t* restrict pool
|
||||
);
|
||||
|
||||
@@ -241,11 +253,13 @@ void bli_pool_alloc_block
|
||||
(
|
||||
siz_t block_size,
|
||||
siz_t align_size,
|
||||
siz_t offset_size,
|
||||
malloc_ft malloc_fp,
|
||||
pblk_t* restrict block
|
||||
);
|
||||
void bli_pool_free_block
|
||||
(
|
||||
siz_t offset_size,
|
||||
free_ft free_fp,
|
||||
pblk_t* restrict block
|
||||
);
|
||||
|
||||
@@ -225,6 +225,24 @@ GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varn
|
||||
|
||||
|
||||
|
||||
// -- Basic one-operand macro with real domain only --
|
||||
|
||||
// -- (no auxiliary arguments) --
|
||||
|
||||
// Instantiate GENTFUNCRO once for each real datatype (float/s, double/d),
// passing tfuncname through unchanged.
#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \
GENTFUNCRO( float,  s, tfuncname ) \
GENTFUNCRO( double, d, tfuncname )
|
||||
|
||||
// -- (one auxiliary argument) --
|
||||
|
||||
// Instantiate GENTFUNCRO once for each real datatype (float/s, double/d),
// forwarding tfuncname and the single auxiliary argument varname.
#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \
GENTFUNCRO( float,  s, tfuncname, varname ) \
GENTFUNCRO( double, d, tfuncname, varname )
|
||||
|
||||
|
||||
|
||||
// -- Basic one-operand macro with complex domain only and real projection --
|
||||
|
||||
// -- (no auxiliary arguments) --
|
||||
|
||||
@@ -165,19 +165,56 @@
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#ifndef BLIS_STACK_BUF_ALIGN_SIZE
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
#endif
|
||||
|
||||
// Alignment size used when allocating memory via BLIS_MALLOC_USER.
|
||||
// To disable heap alignment, set this to 1.
|
||||
#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE
|
||||
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
#endif
|
||||
|
||||
// Alignment size used when sizing leading dimensions of memory allocated
|
||||
// via BLIS_MALLOC_USER.
|
||||
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE
|
||||
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
#endif
|
||||
|
||||
// Alignment size used when allocating blocks to the internal memory
|
||||
// Alignment sizes used when allocating blocks to the internal memory
|
||||
// pool, via BLIS_MALLOC_POOL.
|
||||
#define BLIS_POOL_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A
|
||||
#define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE
|
||||
#endif
|
||||
|
||||
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B
|
||||
#define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE
|
||||
#endif
|
||||
|
||||
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C
|
||||
#define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE
|
||||
#endif
|
||||
|
||||
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN
|
||||
#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE
|
||||
#endif
|
||||
|
||||
// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*.
|
||||
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A
|
||||
#define BLIS_POOL_ADDR_OFFSET_SIZE_A 0
|
||||
#endif
|
||||
|
||||
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B
|
||||
#define BLIS_POOL_ADDR_OFFSET_SIZE_B 0
|
||||
#endif
|
||||
|
||||
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C
|
||||
#define BLIS_POOL_ADDR_OFFSET_SIZE_C 0
|
||||
#endif
|
||||
|
||||
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN
|
||||
#define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -195,9 +195,16 @@
|
||||
#include "bli_adds_mxn_uplo.h"
|
||||
#include "bli_set0s_mxn.h"
|
||||
#include "bli_copys_mxn.h"
|
||||
#include "bli_scal2s_mxn.h"
|
||||
#include "bli_xpbys_mxn.h"
|
||||
#include "bli_xpbys_mxn_uplo.h"
|
||||
|
||||
// -- "broadcast B" scalar macros --
|
||||
|
||||
#include "bli_bcastbbs_mxn.h"
|
||||
#include "bli_scal2bbs_mxn.h"
|
||||
#include "bli_set0bbs_mxn.h"
|
||||
|
||||
|
||||
// -- 3m-specific scalar macros --
|
||||
|
||||
|
||||
@@ -1049,6 +1049,7 @@ typedef struct
|
||||
|
||||
siz_t block_size;
|
||||
siz_t align_size;
|
||||
siz_t offset_size;
|
||||
|
||||
malloc_ft malloc_fp;
|
||||
free_ft free_fp;
|
||||
|
||||
74
frame/include/level0/bb/bli_bcastbbs_mxn.h
Normal file
74
frame/include/level0/bb/bli_bcastbbs_mxn.h
Normal file
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_BCASTBBS_MXN_H
|
||||
#define BLIS_BCASTBBS_MXN_H
|
||||
|
||||
// bcastbbs_mxn
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
\
/* Broadcast every element of the m x n matrix y into the slots that */ \
/* directly follow it in memory. Element (i,j) lives at y + i*incy + j*ldy */ \
/* and is copied into the next ldy-1 unit-stride locations. */ \
static void PASTEMAC(ch,opname) \
     ( \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict y, const inc_t incy, const inc_t ldy  \
     ) \
{ \
	/* The duplication factor is assumed to equal the column stride of y; */ \
	/* duplicates are laid out contiguously (stride of one element). */ \
	const dim_t n_dupl  = ldy; \
	const dim_t dupl_st = 1; \
\
	for ( dim_t row = 0; row < m; ++row ) \
	{ \
		for ( dim_t col = 0; col < n; ++col ) \
		{ \
			/* Source element whose value is being replicated. */ \
			ctype* restrict src = y + row*incy + col*ldy; \
\
			/* Slot 0 already holds the value; fill slots 1 .. n_dupl-1. */ \
			for ( dim_t slot = 1; slot < n_dupl; ++slot ) \
			{ \
				ctype* restrict dst = src + slot*dupl_st; \
\
				PASTEMAC(ch,copys)( *src, *dst ); \
			} \
		} \
	} \
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( bcastbbs_mxn )
|
||||
|
||||
#endif
|
||||
204
frame/include/level0/bb/bli_scal2bbs_mxn.h
Normal file
204
frame/include/level0/bb/bli_scal2bbs_mxn.h
Normal file
@@ -0,0 +1,204 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SCAL2BBS_MXN_H
|
||||
#define BLIS_SCAL2BBS_MXN_H
|
||||
|
||||
// scal2bbs_mxn
|
||||
|
||||
#undef GENTFUNCRO
|
||||
#define GENTFUNCRO( ctype, ch, opname ) \
\
/* Real-domain scal2 with duplication: for each element (i,j) of the m x n */ \
/* matrix x, compute alpha * x(i,j) (via scal2js when conjx requests */ \
/* conjugation, scal2s otherwise), store it at y + i*incy + j*ldy, and then */ \
/* replicate that result into the following incy-1 unit-stride slots. */ \
static void PASTEMAC(ch,opname) \
     ( \
       const conj_t       conjx, \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict alpha, \
       ctype*    restrict x, const inc_t incx, const inc_t ldx, \
       ctype*    restrict y, const inc_t incy, const inc_t ldy  \
     ) \
{ \
	/* The duplication factor is assumed to equal the row stride of y; */ \
	/* duplicates are laid out contiguously (stride of one element). */ \
	const dim_t n_dupl  = incy; \
	const dim_t dupl_st = 1; \
\
	if ( bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t col = 0; col < n; ++col ) \
		{ \
			for ( dim_t row = 0; row < m; ++row ) \
			{ \
				ctype* restrict chi = x + col*ldx + row*incx; \
				ctype* restrict psi = y + col*ldy + row*incy; \
\
				/* Scaled (conjugated) element goes into the first slot. */ \
				PASTEMAC(ch,scal2js)( *alpha, *chi, *psi ); \
\
				/* Replicate the first slot into the remaining ones. */ \
				for ( dim_t slot = 1; slot < n_dupl; ++slot ) \
				{ \
					ctype* restrict psi_dup = psi + slot*dupl_st; \
\
					PASTEMAC(ch,copys)( *psi, *psi_dup ); \
				} \
			} \
		} \
	} \
	else /* if ( bli_is_noconj( conjx ) ) */ \
	{ \
		for ( dim_t col = 0; col < n; ++col ) \
		{ \
			for ( dim_t row = 0; row < m; ++row ) \
			{ \
				ctype* restrict chi = x + col*ldx + row*incx; \
				ctype* restrict psi = y + col*ldy + row*incy; \
\
				/* Scaled element goes into the first slot. */ \
				PASTEMAC(ch,scal2s)( *alpha, *chi, *psi ); \
\
				/* Replicate the first slot into the remaining ones. */ \
				for ( dim_t slot = 1; slot < n_dupl; ++slot ) \
				{ \
					ctype* restrict psi_dup = psi + slot*dupl_st; \
\
					PASTEMAC(ch,copys)( *psi, *psi_dup ); \
				} \
			} \
		} \
	} \
}
|
||||
|
||||
INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn )
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
/* Complex-domain scal2 with duplication. For each element (i,j) of the */ \
/* m x n matrix x, compute alpha * x(i,j) (conjugating x(i,j) first when */ \
/* conjx requests it) and store the result in y with its real and imaginary */ \
/* parts separated: the real part is written d times starting at real */ \
/* offset 2*(i*incy + j*ldy), and the imaginary part is written d times */ \
/* starting d real slots after that, where d = incy. */ \
/* */ \
/* Parameters: */ \
/*   conjx       - whether to conjugate the elements of x. */ \
/*   m, n        - dimensions of x (and of the logical result). */ \
/*   alpha       - pointer to the complex scalar. */ \
/*   x/incx/ldx  - source matrix, row and column strides in complex elements. */ \
/*   y/incy/ldy  - destination, row and column strides in complex elements; */ \
/*                 incy doubles as the duplication factor. */ \
static void PASTEMAC(ch,opname) \
     ( \
       const conj_t       conjx, \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict alpha, \
       ctype*    restrict x, const inc_t incx, const inc_t ldx, \
       ctype*    restrict y, const inc_t incy, const inc_t ldy  \
     ) \
{ \
	/* Assume that the duplication factor is the row stride of y. */ \
	const dim_t       d          = incy; \
	const dim_t       ds_y       = 1; \
\
	/* x and y are addressed below as arrays of real values, so strides */ \
	/* given in complex elements are doubled. */ \
	const inc_t       incx2      = 2 * incx; \
	const inc_t       ldx2       = 2 * ldx; \
\
	const inc_t       incy2      = 2 * incy; \
	const inc_t       ldy2       = 2 * ldy; \
\
	ctype_r* restrict alpha_r    = ( ctype_r* )alpha; \
	ctype_r* restrict alpha_i    = ( ctype_r* )alpha + 1; \
	ctype_r* restrict chi_r      = ( ctype_r* )x; \
	ctype_r* restrict chi_i      = ( ctype_r* )x + 1; \
	ctype_r* restrict psi_r      = ( ctype_r* )y; \
	/* Imaginary parts begin after the d duplicated real parts. */ \
	ctype_r* restrict psi_i      = ( ctype_r* )y + 1*d; \
\
	if ( bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype_r* restrict chij_r = chi_r + j*ldx2; \
			ctype_r* restrict chij_i = chi_i + j*ldx2; \
			ctype_r* restrict psij_r = psi_r + j*ldy2; \
			ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype_r* restrict chiij_r = chij_r + i*incx2; \
				ctype_r* restrict chiij_i = chij_i + i*incx2; \
				ctype_r* restrict psiij_r = psij_r + i*incy2; \
				ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
				PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \
				                        *chiij_r, *chiij_i, \
				                        *psiij_r, *psiij_i ); \
\
				/* Replicate the result into slots 1 .. d-1. The loop must */ \
				/* be bounded by p (not by the row index i); otherwise it */ \
				/* either never runs or never terminates. */ \
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
					ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
					PASTEMAC(ch,copyris)( *psiij_r,  *psiij_i, \
					                      *psiijd_r, *psiijd_i ); \
				} \
			} \
		} \
	} \
	else /* if ( bli_is_noconj( conjx ) ) */ \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype_r* restrict chij_r = chi_r + j*ldx2; \
			ctype_r* restrict chij_i = chi_i + j*ldx2; \
			ctype_r* restrict psij_r = psi_r + j*ldy2; \
			ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype_r* restrict chiij_r = chij_r + i*incx2; \
				ctype_r* restrict chiij_i = chij_i + i*incx2; \
				ctype_r* restrict psiij_r = psij_r + i*incy2; \
				ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
				PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \
				                       *chiij_r, *chiij_i, \
				                       *psiij_r, *psiij_i ); \
\
				/* Replicate the result into slots 1 .. d-1 (bounded by p). */ \
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
					ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
					PASTEMAC(ch,copyris)( *psiij_r,  *psiij_i, \
					                      *psiijd_r, *psiijd_i ); \
				} \
			} \
		} \
	} \
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn )
|
||||
|
||||
#endif
|
||||
74
frame/include/level0/bb/bli_set0bbs_mxn.h
Normal file
74
frame/include/level0/bb/bli_set0bbs_mxn.h
Normal file
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SET0BBS_MXN_H
|
||||
#define BLIS_SET0BBS_MXN_H
|
||||
|
||||
// set0bbs_mxn
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
\
/* Zero an m x n matrix y stored with duplication: for each element (i,j), */ \
/* located at y + i*incy + j*ldy, clear that slot and the incy-1 unit-stride */ \
/* slots that follow it. */ \
static void PASTEMAC(ch,opname) \
     ( \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict y, const inc_t incy, const inc_t ldy  \
     ) \
{ \
	/* The duplication factor is assumed to equal the row stride of y; */ \
	/* duplicates are laid out contiguously (stride of one element). */ \
	const dim_t n_dupl  = incy; \
	const dim_t dupl_st = 1; \
\
	for ( dim_t col = 0; col < n; ++col ) \
	{ \
		for ( dim_t row = 0; row < m; ++row ) \
		{ \
			ctype* restrict psi = y + col*ldy + row*incy; \
\
			/* Unlike the copy-style kernels, slot 0 is cleared here too. */ \
			for ( dim_t slot = 0; slot < n_dupl; ++slot ) \
			{ \
				ctype* restrict psi_dup = psi + slot*dupl_st; \
\
				PASTEMAC(ch,set0s)( *psi_dup ); \
			} \
		} \
	} \
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( set0bbs_mxn )
|
||||
|
||||
#endif
|
||||
89
frame/include/level0/bli_scal2s_mxn.h
Normal file
89
frame/include/level0/bli_scal2s_mxn.h
Normal file
@@ -0,0 +1,89 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SCAL2S_MXN_H
|
||||
#define BLIS_SCAL2S_MXN_H
|
||||
|
||||
// scal2s_mxn
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
\
/* Element-wise scal2 over an m x n matrix: y(i,j) is set from alpha and */ \
/* x(i,j) using scal2js when conjx requests conjugation and scal2s */ \
/* otherwise. x is addressed with strides (rs_x, cs_x) and y with */ \
/* strides (rs_y, cs_y). */ \
static void PASTEMAC(ch,opname) \
     ( \
       const conj_t       conjx, \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict alpha, \
       ctype*    restrict x, const inc_t rs_x, const inc_t cs_x, \
       ctype*    restrict y, const inc_t rs_y, const inc_t cs_y  \
     ) \
{ \
	if ( bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t col = 0; col < n; ++col ) \
		{ \
			for ( dim_t row = 0; row < m; ++row ) \
			{ \
				ctype* restrict chi = x + col*cs_x + row*rs_x; \
				ctype* restrict psi = y + col*cs_y + row*rs_y; \
\
				PASTEMAC(ch,scal2js)( *alpha, *chi, *psi ); \
			} \
		} \
	} \
	else /* if ( bli_is_noconj( conjx ) ) */ \
	{ \
		for ( dim_t col = 0; col < n; ++col ) \
		{ \
			for ( dim_t row = 0; row < m; ++row ) \
			{ \
				ctype* restrict chi = x + col*cs_x + row*rs_x; \
				ctype* restrict psi = y + col*cs_y + row*rs_y; \
\
				PASTEMAC(ch,scal2s)( *alpha, *chi, *psi ); \
			} \
		} \
	} \
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( scal2s_mxn )
|
||||
|
||||
#endif
|
||||
318
ref_kernels/1m/bli_packm_cxk_bb_ref.c
Normal file
318
ref_kernels/1m/bli_packm_cxk_bb_ref.c
Normal file
@@ -0,0 +1,318 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
void* restrict kappa, \
|
||||
void* restrict a, inc_t inca, inc_t lda, \
|
||||
void* restrict p, inc_t ldp, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict alpha1 = a; \
|
||||
ctype* restrict pi1 = p; \
|
||||
\
|
||||
/* Handle the packing of B (column panel schemas) separately from packing
|
||||
of A (row panel schemas). */ \
|
||||
if ( bli_is_col_packed( schema ) ) \
|
||||
{ \
|
||||
if ( cdim == mnr ) \
|
||||
{ \
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 11) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_noconj( conja ) ) */ \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 11) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_noconj( conja ) ) */ \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( cdim < mnr ) */ \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2bbs_mxn) \
|
||||
( \
|
||||
conja, \
|
||||
cdim, \
|
||||
n, \
|
||||
kappa, \
|
||||
a, inca, lda, \
|
||||
p, 2, ldp \
|
||||
); \
|
||||
\
|
||||
/* if ( cdim < mnr ) */ \
|
||||
{ \
|
||||
const dim_t i = cdim; \
|
||||
const dim_t m_edge = mnr - cdim; \
|
||||
const dim_t n_edge = n_max; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict p_edge = p_cast + (i )*2; \
|
||||
\
|
||||
PASTEMAC(ch,set0bbs_mxn) \
|
||||
( \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
p_edge, 2, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if ( n < n_max ) \
|
||||
{ \
|
||||
const dim_t j = n; \
|
||||
const dim_t m_edge = mnr; \
|
||||
const dim_t n_edge = n_max - n; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict p_edge = p_cast + (j )*ldp; \
|
||||
\
|
||||
PASTEMAC(ch,set0bbs_mxn) \
|
||||
( \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
p_edge, 2, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_row_packed( schema ) ) */ \
|
||||
{ \
|
||||
if ( cdim == mnr ) \
|
||||
{ \
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_noconj( conja ) ) */ \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_noconj( conja ) ) */ \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( cdim < mnr ) */ \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s_mxn) \
|
||||
( \
|
||||
conja, \
|
||||
cdim, \
|
||||
n, \
|
||||
kappa, \
|
||||
a, inca, lda, \
|
||||
p, 1, ldp \
|
||||
); \
|
||||
\
|
||||
/* if ( cdim < mnr ) */ \
|
||||
{ \
|
||||
const dim_t i = cdim; \
|
||||
const dim_t m_edge = mnr - cdim; \
|
||||
const dim_t n_edge = n_max; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict p_edge = p_cast + (i )*1; \
|
||||
\
|
||||
PASTEMAC(ch,set0s_mxn) \
|
||||
( \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
p_edge, 1, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if ( n < n_max ) \
|
||||
{ \
|
||||
const dim_t j = n; \
|
||||
const dim_t m_edge = mnr; \
|
||||
const dim_t n_edge = n_max - n; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict p_edge = p_cast + (j )*ldp; \
|
||||
\
|
||||
PASTEMAC(ch,set0s_mxn) \
|
||||
( \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
p_edge, 1, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC3( packm_6xk_bb2, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -188,6 +189,7 @@ INSERT_GENTFUNC_BASIC3( packm_2xk, 2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -344,6 +346,7 @@ INSERT_GENTFUNC_BASIC3( packm_3xk, 3, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -498,6 +501,7 @@ INSERT_GENTFUNC_BASIC3( packm_4xk, 4, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -641,6 +645,7 @@ INSERT_GENTFUNC_BASIC3( packm_6xk, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -819,6 +824,7 @@ INSERT_GENTFUNC_BASIC3( packm_8xk, 8, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -978,6 +984,7 @@ INSERT_GENTFUNC_BASIC3( packm_10xk, 10, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -1145,6 +1152,7 @@ INSERT_GENTFUNC_BASIC3( packm_12xk, 12, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -1320,6 +1328,7 @@ INSERT_GENTFUNC_BASIC3( packm_14xk, 14, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -1503,6 +1512,7 @@ INSERT_GENTFUNC_BASIC3( packm_16xk, 16, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
|
||||
142
ref_kernels/3/bb/bli_gemmbb_ref.c
Normal file
142
ref_kernels/3/bb/bli_gemmbb_ref.c
Normal file
@@ -0,0 +1,142 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// An implementation that indexes through B with the assumption that all
|
||||
// elements were broadcast (duplicated) by a factor of NP/NR.
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
|
||||
const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t m = mr; \
|
||||
const dim_t n = nr; \
|
||||
\
|
||||
const inc_t cs_a = packmr; \
|
||||
\
|
||||
const inc_t rs_b = packnr; \
|
||||
\
|
||||
/* Assume that the degree of duplication is equal to packnr / nr. */ \
|
||||
const inc_t cs_b = packnr / nr; \
|
||||
\
|
||||
ctype ab[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ab = 1; \
|
||||
const inc_t cs_ab = mr; \
|
||||
\
|
||||
dim_t l, j, i; \
|
||||
\
|
||||
ctype ai; \
|
||||
ctype bj; \
|
||||
\
|
||||
\
|
||||
/* Initialize the accumulator elements in ab to zero. */ \
|
||||
for ( i = 0; i < m * n; ++i ) \
|
||||
{ \
|
||||
PASTEMAC(ch,set0s)( *(ab + i) ); \
|
||||
} \
|
||||
\
|
||||
/* Perform a series of k rank-1 updates into ab. */ \
|
||||
for ( l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict abij = ab; \
|
||||
\
|
||||
/* In an optimized implementation, these two loops over MR and NR
|
||||
are typically fully unrolled. */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
bj = *(b + j*cs_b); \
|
||||
\
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ai = *(a + i); \
|
||||
\
|
||||
PASTEMAC(ch,dots)( ai, bj, *abij ); \
|
||||
\
|
||||
abij += rs_ab; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a += cs_a; \
|
||||
b += rs_b; \
|
||||
} \
|
||||
\
|
||||
/* Scale the result in ab by alpha. */ \
|
||||
for ( i = 0; i < m * n; ++i ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \
|
||||
} \
|
||||
\
|
||||
/* If beta is zero, overwrite c with the scaled result in ab. Otherwise,
|
||||
scale by beta and then add the scaled redult in ab. */ \
|
||||
if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copys_mxn)( m, \
|
||||
n, \
|
||||
ab, rs_ab, cs_ab, \
|
||||
c, rs_c, cs_c ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m, \
|
||||
n, \
|
||||
ab, rs_ab, cs_ab, \
|
||||
beta, \
|
||||
c, rs_c, cs_c ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( gemmbb, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
138
ref_kernels/3/bb/bli_gemmtrsmbb_ref.c
Normal file
138
ref_kernels/3/bb/bli_gemmtrsmbb_ref.c
Normal file
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// An implementation that indexes through B with the assumption that all
|
||||
// elements were broadcast (duplicated) by a factor of NP/NR.
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a1x, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bx1, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const inc_t rs_b = packnr; \
|
||||
\
|
||||
/* Assume that the degree of duplication is equal to packnr / nr. */ \
|
||||
const inc_t cs_b = packnr / nr; \
|
||||
/*
|
||||
printf( "bli_gemmtrsmbb_ref(): cs_b = %d\n", (int)cs_b ); \
|
||||
printf( "bli_gemmtrsmbb_ref(): k nr = %d %d\n", (int)k, (int)nr ); \
|
||||
*/ \
|
||||
\
|
||||
ctype* minus_one = PASTEMAC(ch,m1); \
|
||||
\
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
PASTECH(ch,trsm_ukr_ft) \
|
||||
trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \
|
||||
(double*)bx1, rs_b, cs_b, "%5.2f", "" ); \
|
||||
PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \
|
||||
(double*)b11, rs_b, 1, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/* lower: b11 = alpha * b11 - a10 * b01; */ \
|
||||
/* upper: b11 = alpha * b11 - a12 * b21; */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one, \
|
||||
a1x, \
|
||||
bx1, \
|
||||
alpha, \
|
||||
b11, rs_b, cs_b, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
/*
|
||||
PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \
|
||||
(double*)b11, rs_b, 1, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/* b11 = inv(a11) * b11;
|
||||
c11 = b11; */ \
|
||||
trsm_ukr \
|
||||
( \
|
||||
a11, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
/*
|
||||
PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \
|
||||
(double*)b11, rs_b, 1, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/* Broadcast the elements of the updated b11 submatrix to their
|
||||
duplicated neighbors. */ \
|
||||
PASTEMAC(ch,bcastbbs_mxn) \
|
||||
( \
|
||||
mr, \
|
||||
nr, \
|
||||
b11, rs_b, cs_b \
|
||||
); \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_r after", k+3, 8, \
|
||||
( double* )b01, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \
|
||||
PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_i after", k+3, 8, \
|
||||
( double* )b01 + 1, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC3( gemmtrsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR )
|
||||
INSERT_GENTFUNC_BASIC3( gemmtrsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR )
|
||||
|
||||
206
ref_kernels/3/bb/bli_trsmbb_ref.c
Normal file
206
ref_kernels/3/bb/bli_trsmbb_ref.c
Normal file
@@ -0,0 +1,206 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// An implementation that indexes through B with the assumption that all
|
||||
// elements were broadcast (duplicated) by a factor of NP/NR.
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
|
||||
const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t m = mr; \
|
||||
const dim_t n = nr; \
|
||||
\
|
||||
const inc_t rs_a = 1; \
|
||||
const inc_t cs_a = packmr; \
|
||||
\
|
||||
const inc_t rs_b = packnr; \
|
||||
\
|
||||
/* Assume that the degree of duplication is equal to packnr / nr. */ \
|
||||
const inc_t cs_b = packnr / nr; \
|
||||
\
|
||||
dim_t iter, i, j, l; \
|
||||
dim_t n_behind; \
|
||||
\
|
||||
for ( iter = 0; iter < m; ++iter ) \
|
||||
{ \
|
||||
i = iter; \
|
||||
n_behind = i; \
|
||||
\
|
||||
ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \
|
||||
ctype* restrict a10t = a + (i )*rs_a + (0 )*cs_a; \
|
||||
ctype* restrict B0 = b + (0 )*rs_b + (0 )*cs_b; \
|
||||
ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \
|
||||
\
|
||||
/* b1 = b1 - a10t * B0; */ \
|
||||
/* b1 = b1 / alpha11; */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict b01 = B0 + (0 )*rs_b + (j )*cs_b; \
|
||||
ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \
|
||||
ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \
|
||||
ctype beta11c = *beta11; \
|
||||
ctype rho11; \
|
||||
\
|
||||
/* beta11 = beta11 - a10t * b01; */ \
|
||||
PASTEMAC(ch,set0s)( rho11 ); \
|
||||
for ( l = 0; l < n_behind; ++l ) \
|
||||
{ \
|
||||
ctype* restrict alpha10 = a10t + (l )*cs_a; \
|
||||
ctype* restrict beta01 = b01 + (l )*rs_b; \
|
||||
\
|
||||
PASTEMAC(ch,axpys)( *alpha10, *beta01, rho11 ); \
|
||||
} \
|
||||
PASTEMAC(ch,subs)( rho11, beta11c ); \
|
||||
\
|
||||
/* beta11 = beta11 / alpha11; */ \
|
||||
/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
|
||||
of alpha11, so we can multiply rather than divide. We store
|
||||
the inverse of alpha11 intentionally to avoid expensive
|
||||
division instructions within the micro-kernel. */ \
|
||||
PASTEMAC(ch,scals)( *alpha11, beta11c ); \
|
||||
\
|
||||
/* Output final result to matrix c. */ \
|
||||
PASTEMAC(ch,copys)( beta11c, *gamma11 ); \
|
||||
\
|
||||
/* Store the local value back to b11. */ \
|
||||
PASTEMAC(ch,copys)( beta11c, *beta11 ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( trsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
|
||||
const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t m = mr; \
|
||||
const dim_t n = nr; \
|
||||
\
|
||||
const inc_t rs_a = 1; \
|
||||
const inc_t cs_a = packmr; \
|
||||
\
|
||||
const inc_t rs_b = packnr; \
|
||||
\
|
||||
/* Assume that the degree of duplication is equal to packnr / nr. */ \
|
||||
const inc_t cs_b = packnr / nr; \
|
||||
\
|
||||
dim_t iter, i, j, l; \
|
||||
dim_t n_behind; \
|
||||
\
|
||||
for ( iter = 0; iter < m; ++iter ) \
|
||||
{ \
|
||||
i = m - iter - 1; \
|
||||
n_behind = iter; \
|
||||
\
|
||||
ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \
|
||||
ctype* restrict a12t = a + (i )*rs_a + (i+1)*cs_a; \
|
||||
ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \
|
||||
ctype* restrict B2 = b + (i+1)*rs_b + (0 )*cs_b; \
|
||||
\
|
||||
/* b1 = b1 - a12t * B2; */ \
|
||||
/* b1 = b1 / alpha11; */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \
|
||||
ctype* restrict b21 = B2 + (0 )*rs_b + (j )*cs_b; \
|
||||
ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \
|
||||
ctype beta11c = *beta11; \
|
||||
ctype rho11; \
|
||||
\
|
||||
/* beta11 = beta11 - a12t * b21; */ \
|
||||
PASTEMAC(ch,set0s)( rho11 ); \
|
||||
for ( l = 0; l < n_behind; ++l ) \
|
||||
{ \
|
||||
ctype* restrict alpha12 = a12t + (l )*cs_a; \
|
||||
ctype* restrict beta21 = b21 + (l )*rs_b; \
|
||||
\
|
||||
PASTEMAC(ch,axpys)( *alpha12, *beta21, rho11 ); \
|
||||
} \
|
||||
PASTEMAC(ch,subs)( rho11, beta11c ); \
|
||||
\
|
||||
/* beta11 = beta11 / alpha11; */ \
|
||||
/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
|
||||
of alpha11, so we can multiply rather than divide. We store
|
||||
the inverse of alpha11 intentionally to avoid expensive
|
||||
division instructions within the micro-kernel. */ \
|
||||
PASTEMAC(ch,scals)( *alpha11, beta11c ); \
|
||||
\
|
||||
/* Output final result to matrix c. */ \
|
||||
PASTEMAC(ch,copys)( beta11c, *gamma11 ); \
|
||||
\
|
||||
/* Store the local value back to b11. */ \
|
||||
PASTEMAC(ch,copys)( beta11c, *beta11 ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( trsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
2
|
||||
3
|
||||
0.0
|
||||
|
||||
@@ -403,17 +403,7 @@ void libblis_test_gemm_md
|
||||
|
||||
time = bli_clock();
|
||||
|
||||
#if 0
|
||||
bli_printm( "a", &a, "%5.2f", "" );
|
||||
bli_printm( "b", &b, "%5.2f", "" );
|
||||
bli_printm( "c", &c, "%5.2f", "" );
|
||||
bli_printm( "alpha", &alpha, "%5.2f", "" );
|
||||
bli_printm( "beta", &beta, "%5.2f", "" );
|
||||
#endif
|
||||
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
|
||||
#if 0
|
||||
bli_printm( "c after", &c, "%5.2f", "" );
|
||||
#endif
|
||||
|
||||
time_min = bli_clock_min_diff( time_min, time );
|
||||
}
|
||||
|
||||
@@ -869,7 +869,8 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
|
||||
libblis_test_fprintf_c( os, " stack address %d\n", ( int )bli_info_get_stack_buf_align_size() );
|
||||
libblis_test_fprintf_c( os, " obj_t address %d\n", ( int )bli_info_get_heap_addr_align_size() );
|
||||
libblis_test_fprintf_c( os, " obj_t stride %d\n", ( int )bli_info_get_heap_stride_align_size() );
|
||||
libblis_test_fprintf_c( os, " pool block addr %d\n", ( int )bli_info_get_pool_addr_align_size() );
|
||||
libblis_test_fprintf_c( os, " pool block addr A (+offset) %d (+%d)\n", ( int )bli_info_get_pool_addr_align_size_a(), ( int )bli_info_get_pool_addr_offset_size_a() );
|
||||
libblis_test_fprintf_c( os, " pool block addr B (+offset) %d (+%d)\n", ( int )bli_info_get_pool_addr_align_size_b(), ( int )bli_info_get_pool_addr_offset_size_b() );
|
||||
libblis_test_fprintf_c( os, "\n" );
|
||||
libblis_test_fprintf_c( os, "BLAS/CBLAS compatibility layers \n" );
|
||||
libblis_test_fprintf_c( os, " BLAS API enabled? %d\n", ( int )bli_info_get_enable_blas() );
|
||||
|
||||
Reference in New Issue
Block a user