mirror of
https://github.com/amd/blis.git
synced 2026-03-30 06:05:38 +00:00
Added template implementations and other tweaks.
Details: - Added a 'template' configuration, which contains stub implementations of the level 1, 1f, and 3 kernels with one datatype implemented in C for each, with lots of in-file comments and documentation. - Modified some variable/parameter names for some 1/1f operations. (e.g. renaming vector length parameter from m to n.) - Moved level-1f fusing factors from axpyf, dotxf, and dotxaxpyf header files to bli_kernel.h. - Modified test suite to print out fusing factors for axpyf, dotxf, and dotxaxpyf, as well as the default fusing factor (which are all equal in the reference and template implementations). - Cleaned up some sloppiness in the level-1f unb_var1.c files whereby these reference variants were implemented in terms of front-end routines rather than directly in terms of the kernels. (For example, axpy2v was implemented as two calls to axpyv rather than two calls to AXPYV_KERNEL.) - Changed the interface to dotxf so that it matches that of axpyf, in that A is assumed to be m x b_n in both cases, and for dotxf A is actually used as A^T. - Minor variable naming and comment changes to reference micro-kernels in frame/3/gemm/ukernels and frame/3/trsm/ukernels.
This commit is contained in:
@@ -97,6 +97,10 @@
|
||||
#define BLIS_CACHE_LINE_SIZE 64
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
// instructions.
|
||||
#define BLIS_SIMD_ALIGN_SIZE 32
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE 32
|
||||
|
||||
@@ -226,10 +226,25 @@
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -97,6 +97,10 @@
|
||||
#define BLIS_CACHE_LINE_SIZE 64
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
// instructions.
|
||||
#define BLIS_SIMD_ALIGN_SIZE 16
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE 16
|
||||
|
||||
@@ -216,10 +216,25 @@
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -97,6 +97,10 @@
|
||||
#define BLIS_CACHE_LINE_SIZE 64
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
// instructions.
|
||||
#define BLIS_SIMD_ALIGN_SIZE 16
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE 16
|
||||
|
||||
@@ -220,10 +220,25 @@
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -97,6 +97,10 @@
|
||||
#define BLIS_CACHE_LINE_SIZE 64
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
// instructions.
|
||||
#define BLIS_SIMD_ALIGN_SIZE 32
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
|
||||
|
||||
@@ -220,10 +220,25 @@
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -97,6 +97,10 @@
|
||||
#define BLIS_CACHE_LINE_SIZE 64
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
// instructions.
|
||||
#define BLIS_SIMD_ALIGN_SIZE 16
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
|
||||
|
||||
@@ -220,10 +220,25 @@
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -95,6 +95,10 @@
|
||||
#define BLIS_CACHE_LINE_SIZE 64
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
// instructions.
|
||||
#define BLIS_SIMD_ALIGN_SIZE 16
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE 16
|
||||
|
||||
@@ -220,10 +220,25 @@
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -97,14 +97,18 @@
|
||||
#define BLIS_CACHE_LINE_SIZE 64
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
// instructions.
|
||||
#define BLIS_SIMD_ALIGN_SIZE 16
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE 16
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
// Alignment size used when allocating memory dynamically from the operating
|
||||
// system (eg: posix_memalign()). To disable heap alignment and just use
|
||||
// malloc() instead, set this to 1.
|
||||
#define BLIS_HEAP_ADDR_ALIGN_SIZE 16
|
||||
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
// Alignment size used when sizing leading dimensions of dynamically
|
||||
// allocated memory.
|
||||
@@ -116,7 +120,7 @@
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 16
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -54,21 +54,21 @@
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 256
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 8192
|
||||
#define BLIS_DEFAULT_MC_S 64
|
||||
#define BLIS_DEFAULT_KC_S 128
|
||||
#define BLIS_DEFAULT_NC_S 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 128
|
||||
#define BLIS_DEFAULT_KC_D 256
|
||||
#define BLIS_DEFAULT_MC_D 64
|
||||
#define BLIS_DEFAULT_KC_D 128
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 128
|
||||
#define BLIS_DEFAULT_KC_C 256
|
||||
#define BLIS_DEFAULT_MC_C 64
|
||||
#define BLIS_DEFAULT_KC_C 128
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 64
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
@@ -220,10 +220,25 @@
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
|
||||
169
config/template/bli_config.h
Normal file
169
config/template/bli_config.h
Normal file
@@ -0,0 +1,169 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_CONFIG_H
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
|
||||
// -- OPERATING SYSTEM ---------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
// -- INTEGER PROPERTIES -------------------------------------------------------
|
||||
|
||||
// The bit size of the integer type used to track values such as dimensions,
|
||||
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
|
||||
// integers while 64 results in 64-bit integers. Any other value results in use
|
||||
// of the C99 type "long int". Note that this ONLY affects integers used
|
||||
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
|
||||
// interface.
|
||||
#define BLIS_INT_TYPE_SIZE 32
|
||||
|
||||
|
||||
|
||||
// -- FLOATING-POINT PROPERTIES ------------------------------------------------
|
||||
|
||||
// Define the number of floating-point types supported, and the size of the
|
||||
// largest type.
|
||||
#define BLIS_NUM_FP_TYPES 4
|
||||
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
|
||||
|
||||
// Enable use of built-in C99 "float complex" and "double complex" types and
|
||||
// associated overloaded operations and functions? Disabling results in
|
||||
// scomplex and dcomplex being defined in terms of simple structs.
|
||||
//#define BLIS_ENABLE_C99_COMPLEX
|
||||
|
||||
|
||||
|
||||
// -- MULTITHREADING -----------------------------------------------------------
|
||||
|
||||
// The maximum number of BLIS threads that will run concurrently.
|
||||
#define BLIS_MAX_NUM_THREADS 1
|
||||
|
||||
|
||||
|
||||
// -- MEMORY ALLOCATION --------------------------------------------------------
|
||||
|
||||
// -- Contiguous (static) memory allocator --
|
||||
|
||||
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
|
||||
// contiguous memory pools.
|
||||
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
|
||||
#define BLIS_NUM_KC_X_NC_BLOCKS 1
|
||||
#define BLIS_NUM_MC_X_NC_BLOCKS 0
|
||||
|
||||
// The maximum preload byte offset is used to pad the end of the contiguous
|
||||
// memory pools so that the micro-kernel, when computing with the end of the
|
||||
// last block, can exceed the bounds of the usable portion of the memory
|
||||
// region without causing a segmentation fault.
|
||||
#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
|
||||
|
||||
// -- Memory alignment --
|
||||
|
||||
// It is sometimes useful to define the various memory alignments in terms
|
||||
// of some other characteristics of the system, such as the cache line size
|
||||
// and the page size.
|
||||
#define BLIS_CACHE_LINE_SIZE 64
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
// instructions.
|
||||
#define BLIS_SIMD_ALIGN_SIZE 16
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
// Alignment size used when allocating memory dynamically from the operating
|
||||
// system (eg: posix_memalign()). To disable heap alignment and just use
|
||||
// malloc() instead, set this to 1.
|
||||
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
// Alignment size used when sizing leading dimensions of dynamically
|
||||
// allocated memory.
|
||||
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
|
||||
|
||||
// Alignment size used when allocating entire blocks of contiguous memory
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
// Basic (homogeneous) datatype support always enabled.
|
||||
|
||||
// Enable mixed domain operations?
|
||||
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
|
||||
// Enable extra mixed precision operations?
|
||||
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
|
||||
|
||||
|
||||
// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
|
||||
|
||||
// Stay initialized after auto-initialization, unless and until the user
|
||||
// explicitly calls bli_finalize().
|
||||
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
|
||||
|
||||
|
||||
|
||||
// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
|
||||
|
||||
// Enable the BLAS compatibility layer?
|
||||
#define BLIS_ENABLE_BLAS2BLIS
|
||||
|
||||
// The bit size of the integer type used to track values such as dimensions and
|
||||
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
|
||||
// A value of 32 results in the compatibility layer using 32-bit signed integers
|
||||
// while 64 results in 64-bit integers. Any other value results in use of the
|
||||
// C99 type "long int". Note that this ONLY affects integers used within the
|
||||
// BLAS compatibility layer.
|
||||
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
|
||||
|
||||
// Fortran-77 name-mangling macros.
|
||||
#define PASTEF770(name) name ## _
|
||||
#define PASTEF77(ch1,name) ch1 ## name ## _
|
||||
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
391
config/template/bli_kernel.h
Normal file
391
config/template/bli_kernel.h
Normal file
@@ -0,0 +1,391 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_KERNEL_H
|
||||
#define BLIS_KERNEL_H
|
||||
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
//
|
||||
// (1) MC must be a multiple of:
|
||||
// (a) MR (for zero-padding purposes)
|
||||
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (2) NC must be a multiple of
|
||||
// (a) NR (for zero-padding purposes)
|
||||
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (3) KC must be a multiple of
|
||||
// (a) MR and
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 64
|
||||
#define BLIS_DEFAULT_KC_S 128
|
||||
#define BLIS_DEFAULT_NC_S 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 64
|
||||
#define BLIS_DEFAULT_KC_D 128
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 64
|
||||
#define BLIS_DEFAULT_KC_C 128
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 64
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 8
|
||||
#define BLIS_DEFAULT_NR_D 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_C 8
|
||||
#define BLIS_DEFAULT_NR_C 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_Z 8
|
||||
#define BLIS_DEFAULT_NR_Z 4
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
// NOTE: These register blocksize "extensions" determine whether the
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
// of duplication needed when configuring level-3 macro-kernels that
|
||||
// copy and duplicate elements of B to a temporary duplication buffer
|
||||
// (so that element-wise vector multiplication and addition instructions
|
||||
// can be used).
|
||||
|
||||
#define BLIS_NUM_ELEM_PER_REG_S 4
|
||||
#define BLIS_NUM_ELEM_PER_REG_D 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_C 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_Z 1
|
||||
|
||||
// -- Default switch for duplication of B --
|
||||
|
||||
// NOTE: Setting these values to 1 disables duplication. Any value
|
||||
// d > 1 results in d-1 duplicates being created within a special macro-kernel
|
||||
// buffer of dimension k x NR*d.
|
||||
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
|
||||
#define BLIS_DEFAULT_NUM_DUPL_S 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_D 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_C 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_Z 1
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
// used by certain blocked variants. But when they *are* used, they MUST be
|
||||
// an integer multiple of NR!
|
||||
|
||||
#define BLIS_DEFAULT_NI_FAC 16
|
||||
#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
|
||||
#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
|
||||
#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
|
||||
#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
#include "bli_gemm_opt_mxn.h"
|
||||
#include "bli_trsm_l_opt_mxn.h"
|
||||
#include "bli_trsm_u_opt_mxn.h"
|
||||
#include "bli_gemmtrsm_l_opt_mxn.h"
|
||||
#include "bli_gemmtrsm_u_opt_mxn.h"
|
||||
|
||||
// -- dupl --
|
||||
|
||||
#define DUPL_KERNEL dupl_unb_var1
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_mxn
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_opt_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_opt_mxn
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
#include "bli_axpy2v_opt_var1.h"
|
||||
#include "bli_dotaxpyv_opt_var1.h"
|
||||
#include "bli_axpyf_opt_var1.h"
|
||||
#include "bli_dotxf_opt_var1.h"
|
||||
#include "bli_dotxaxpyf_opt_var1.h"
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_opt_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_opt_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_opt_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_opt_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_opt_var1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
308
config/template/kernels/1/bli_axpyv_opt_var1.c
Normal file
308
config/template/kernels/1/bli_axpyv_opt_var1.c
Normal file
@@ -0,0 +1,308 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
void bli_sssaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssaxpyv_unb_var1( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dddaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_dddaxpyv_unb_var1( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccaxpyv_unb_var1( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy )
|
||||
{
|
||||
/*
|
||||
Template axpyv kernel implementation
|
||||
|
||||
This function contains a template implementation for a double-precision
|
||||
complex kernel, coded in C, which can serve as the starting point for one
|
||||
to write an optimized kernel on an arbitrary architecture. (We show a
|
||||
template implementation for only double-precision complex because the
|
||||
templates for the other three floating-point types would be similar, with
|
||||
the real instantiations being noticeably simpler due to the disappearance
|
||||
of conjugation in the real domain.)
|
||||
|
||||
This kernel performs a vector scale and accumulate (axpy) operation:
|
||||
|
||||
y := y + alpha * conjx( x )
|
||||
|
||||
where x and y are vectors of length n and alpha is a scalar.
|
||||
|
||||
Parameters:
|
||||
|
||||
- conjx: Compute with conjugated values of x?
|
||||
- n: The number of elements in vectors x and y.
|
||||
- alpha: The address of a scalar.
|
||||
- x: The address of vector x.
|
||||
- incx: The vector increment of x. incx should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
- y: The address of vector y.
|
||||
- incy: The vector increment of y. incy should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
|
||||
This template code calls the reference implementation if any of the
|
||||
following conditions are true:
|
||||
|
||||
- Either of the strides incx or incy is non-unit.
|
||||
- Vectors x and y are unaligned with different offsets.
|
||||
|
||||
If the vectors are aligned, or unaligned by the same offset, then optimized
|
||||
code can be used for the bulk of the computation. This template shows how
|
||||
the front-edge case can be handled so that the remaining computation is
|
||||
aligned. (This template guarantees alignment to be BLIS_SIMD_ALIGN_SIZE,
|
||||
which is defined in bli_config.h.)
|
||||
|
||||
Additional things to consider:
|
||||
|
||||
- Because conjugation disappears in the real domain, real instances of
|
||||
this kernel can safely ignore the values of any conjugation parameters,
|
||||
thereby simplifying the implementation.
|
||||
|
||||
For more info, please refer to the BLIS website and/or contact the
|
||||
blis-devel mailing list.
|
||||
|
||||
-FGVZ
|
||||
*/
|
||||
const dim_t n_elem_per_reg = 1;
|
||||
const dim_t n_iter_unroll = 1;
|
||||
|
||||
const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
|
||||
const siz_t type_size = sizeof( *x );
|
||||
|
||||
dcomplex* xp;
|
||||
dcomplex* yp;
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
dim_t n_pre = 0;
|
||||
dim_t n_iter;
|
||||
dim_t n_left;
|
||||
|
||||
dim_t off_x, off_y;
|
||||
dim_t i;
|
||||
|
||||
|
||||
if ( bli_zero_dim1( n ) ) return;
|
||||
|
||||
if ( bli_zeq0( *alpha ) ) return;
|
||||
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( bli_has_nonunit_inc2( incx, incy ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
|
||||
bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
// If a, the second column of a, and y are unaligned by the same
|
||||
// offset, then we can still use an implementation that depends on
|
||||
// alignment for most of the operation.
|
||||
off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
|
||||
off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
|
||||
|
||||
if ( off_x == off_y )
|
||||
{
|
||||
use_ref = FALSE;
|
||||
n_pre = off_x / type_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzaxpyv_unb_var1( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Compute the number of unrolled and leftover (edge) iterations.
|
||||
n_iter = ( n - n_pre ) / n_elem_per_iter;
|
||||
n_left = ( n - n_pre ) % n_elem_per_iter;
|
||||
|
||||
|
||||
// Initialize pointers into x and y.
|
||||
xp = x;
|
||||
yp = y;
|
||||
|
||||
|
||||
// Iterate over elements of x and y to compute:
|
||||
// y += alpha * conjx( x );
|
||||
if ( bli_is_noconj( conjx ) )
|
||||
{
|
||||
// Compute front edge cases if x and y were unaligned.
|
||||
for ( i = 0; i < n_pre; ++i )
|
||||
{
|
||||
bli_zzzaxpys( *alpha, *xp, *yp );
|
||||
|
||||
xp += 1; yp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. The addresses xp and
|
||||
// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
bli_zzzaxpys( *alpha, *xp, *yp );
|
||||
|
||||
xp += n_elem_per_iter;
|
||||
yp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
bli_zzzaxpys( *alpha, *xp, *yp );
|
||||
|
||||
xp += 1; yp += 1;
|
||||
}
|
||||
}
|
||||
else // if ( bli_is_conj( conjx ) )
|
||||
{
|
||||
// Compute front edge cases if x and y were unaligned.
|
||||
for ( i = 0; i < n_pre; ++i )
|
||||
{
|
||||
bli_zzzaxpyjs( *alpha, *xp, *yp );
|
||||
|
||||
xp += 1; yp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. The addresses xp and
|
||||
// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
bli_zzzaxpyjs( *alpha, *xp, *yp );
|
||||
|
||||
xp += n_elem_per_iter;
|
||||
yp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
bli_zzzaxpyjs( *alpha, *xp, *yp );
|
||||
|
||||
xp += 1; yp += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
// Mixed-type (heterogeneous operand) axpyv kernel interfaces.
//
// Each function generated by this macro forwards its arguments, unchanged,
// to the variant named by varname (the reference implementation in the
// instantiations below).
//
#undef  GENTFUNC3
#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, opname, varname ) \
\
void PASTEMAC3(cha,chx,chy,opname) \
     ( \
       conj_t            conjx, \
       dim_t             n, \
       ctype_a* restrict alpha, \
       ctype_x* restrict x, inc_t incx, \
       ctype_y* restrict y, inc_t incy  \
     ) \
{ \
	PASTEMAC3(cha,chx,chy,varname)( conjx, n, alpha, x, incx, y, incy ); \
}

// Mixed-domain instances are generated only when that support is enabled.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3_MIX_D( axpyv_opt_var1, axpyv_unb_var1 )
#endif

// Mixed-precision instances are generated only when that support is enabled.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3_MIX_P( axpyv_opt_var1, axpyv_unb_var1 )
#endif
|
||||
|
||||
59
config/template/kernels/1/bli_axpyv_opt_var1.h
Normal file
59
config/template/kernels/1/bli_axpyv_opt_var1.h
Normal file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype axpyv kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_a* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( axpyv_opt_var1 )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( axpyv_opt_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( axpyv_opt_var1 )
|
||||
#endif
|
||||
|
||||
345
config/template/kernels/1/bli_dotv_opt_var1.c
Normal file
345
config/template/kernels/1/bli_dotv_opt_var1.c
Normal file
@@ -0,0 +1,345 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
void bli_sssdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict rho )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssdotv_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ddddotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict rho )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ddddotv_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict rho )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccdotv_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict rho )
|
||||
{
|
||||
/*
|
||||
Template dotv kernel implementation
|
||||
|
||||
This function contains a template implementation for a double-precision
|
||||
complex kernel, coded in C, which can serve as the starting point for one
|
||||
to write an optimized kernel on an arbitrary architecture. (We show a
|
||||
template implementation for only double-precision complex because the
|
||||
templates for the other three floating-point types would be similar, with
|
||||
the real instantiations being noticeably simpler due to the disappearance
|
||||
of conjugation in the real domain.)
|
||||
|
||||
This kernel performs an inner (dot) product operation:
|
||||
|
||||
rho := conjx( x^T ) * conjy( y )
|
||||
|
||||
where x and y are vectors of length n and rho is a scalar.
|
||||
|
||||
Parameters:
|
||||
|
||||
- conjx: Compute with conjugated values of x?
|
||||
- conjy: Compute with conjugated values of y?
|
||||
- n: The number of elements in vectors x and y.
|
||||
- x: The address of vector x.
|
||||
- incx: The vector increment of x. incx should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
- y: The address of vector y.
|
||||
- incy: The vector increment of y. incy should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
- rho: The address of the output scalar.
|
||||
|
||||
This template code calls the reference implementation if any of the
|
||||
following conditions are true:
|
||||
|
||||
- Either of the strides incx or incy is non-unit.
|
||||
- Vectors x and y are unaligned with different offsets.
|
||||
|
||||
If the vectors are aligned, or unaligned by the same offset, then optimized
|
||||
code can be used for the bulk of the computation. This template shows how
|
||||
the front-edge case can be handled so that the remaining computation is
|
||||
aligned. (This template guarantees alignment to be BLIS_SIMD_ALIGN_SIZE,
|
||||
which is defined in bli_config.h.)
|
||||
|
||||
Additional things to consider:
|
||||
|
||||
- While four combinations of possible values of conjx and conjy exist, we
|
||||
implement only conjugation on x explicitly; we induce the other two cases
|
||||
by toggling the effective conjugation on x and then conjugating the dot
|
||||
product result.
|
||||
- Because conjugation disappears in the real domain, real instances of
|
||||
this kernel can safely ignore the values of any conjugation parameters,
|
||||
thereby simplifying the implementation.
|
||||
|
||||
For more info, please refer to the BLIS website and/or contact the
|
||||
blis-devel mailing list.
|
||||
|
||||
-FGVZ
|
||||
*/
|
||||
const dim_t n_elem_per_reg = 1;
|
||||
const dim_t n_iter_unroll = 1;
|
||||
|
||||
const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
|
||||
const siz_t type_size = sizeof( *x );
|
||||
|
||||
dcomplex* xp;
|
||||
dcomplex* yp;
|
||||
dcomplex dotxy;
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
dim_t n_pre = 0;
|
||||
dim_t n_iter;
|
||||
dim_t n_left;
|
||||
|
||||
dim_t off_x, off_y;
|
||||
dim_t i;
|
||||
|
||||
conj_t conjx_use;
|
||||
|
||||
|
||||
// If the vector lengths are zero, set rho to zero and return.
|
||||
if ( bli_zero_dim1( n ) )
|
||||
{
|
||||
bli_zset0s( *rho );
|
||||
return;
|
||||
}
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( bli_has_nonunit_inc2( incx, incy ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
|
||||
bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
// If a, the second column of a, and y are unaligned by the same
|
||||
// offset, then we can still use an implementation that depends on
|
||||
// alignment for most of the operation.
|
||||
off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
|
||||
off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
|
||||
|
||||
if ( off_x == off_y )
|
||||
{
|
||||
use_ref = FALSE;
|
||||
n_pre = off_x / type_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzdotv_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Compute the number of unrolled and leftover (edge) iterations.
|
||||
n_iter = ( n - n_pre ) / n_elem_per_iter;
|
||||
n_left = ( n - n_pre ) % n_elem_per_iter;
|
||||
|
||||
|
||||
// Initialize pointers into x and y.
|
||||
xp = x;
|
||||
yp = y;
|
||||
|
||||
|
||||
// Initialize accumulator to zero.
|
||||
bli_zset0s( dotxy );
|
||||
|
||||
|
||||
conjx_use = conjx;
|
||||
|
||||
// If y must be conjugated, we compute the result indirectly by first
|
||||
// toggling the effective conjugation of x and then conjugating the
|
||||
// resulting dot product.
|
||||
if ( bli_is_conj( conjy ) )
|
||||
bli_toggle_conj( conjx_use );
|
||||
|
||||
|
||||
// Iterate over elements of x and y to compute:
|
||||
// rho = conjx( x^T ) * conjy( y );
|
||||
if ( bli_is_noconj( conjx_use ) )
|
||||
{
|
||||
// Compute front edge cases if x and y were unaligned.
|
||||
for ( i = 0; i < n_pre; ++i )
|
||||
{
|
||||
bli_zzzdots( *xp, *yp, dotxy );
|
||||
|
||||
xp += 1; yp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. The addresses xp and
|
||||
// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
bli_zzzdots( *xp, *yp, dotxy );
|
||||
|
||||
xp += n_elem_per_iter;
|
||||
yp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
bli_zzzdots( *xp, *yp, dotxy );
|
||||
|
||||
xp += 1; yp += 1;
|
||||
}
|
||||
}
|
||||
else // if ( bli_is_conj( conjx_use ) )
|
||||
{
|
||||
// Compute front edge cases if x and y were unaligned.
|
||||
for ( i = 0; i < n_pre; ++i )
|
||||
{
|
||||
bli_zzzdotjs( *xp, *yp, dotxy );
|
||||
|
||||
xp += 1; yp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. The addresses xp and
|
||||
// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
bli_zzzdotjs( *xp, *yp, dotxy );
|
||||
|
||||
xp += n_elem_per_iter;
|
||||
yp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
bli_zzzdotjs( *xp, *yp, dotxy );
|
||||
|
||||
xp += 1; yp += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// If conjugation on y was requested, we induce it by conjugating
|
||||
// the contents of dotxy.
|
||||
if ( bli_is_conj( conjy ) )
|
||||
bli_zconjs( dotxy );
|
||||
|
||||
bli_zzcopys( dotxy, *rho );
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
// Mixed-type (heterogeneous operand) dotv kernel interfaces.
//
// Each function generated by this macro forwards its arguments, unchanged,
// to the variant named by varname (the reference implementation in the
// instantiations below).
//
#undef  GENTFUNC3
#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, opname, varname ) \
\
void PASTEMAC3(chx,chy,chr,opname) \
     ( \
       conj_t            conjx, \
       conj_t            conjy, \
       dim_t             n, \
       ctype_x* restrict x, inc_t incx, \
       ctype_y* restrict y, inc_t incy, \
       ctype_r* restrict rho \
     ) \
{ \
	PASTEMAC3(chx,chy,chr,varname)( conjx, conjy, n, x, incx, y, incy, rho ); \
}

// Mixed-domain instances are generated only when that support is enabled.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3_MIX_D( dotv_opt_var1, dotv_unb_var1 )
#endif

// Mixed-precision instances are generated only when that support is enabled.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3_MIX_P( dotv_opt_var1, dotv_unb_var1 )
#endif
|
||||
59
config/template/kernels/1/bli_dotv_opt_var1.h
Normal file
59
config/template/kernels/1/bli_dotv_opt_var1.h
Normal file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype dotv kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_r* restrict rho \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( dotv_opt_var1 )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( dotv_opt_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( dotv_opt_var1 )
|
||||
#endif
|
||||
436
config/template/kernels/1f/bli_axpy2v_opt_var1.c
Normal file
436
config/template/kernels/1f/bli_axpy2v_opt_var1.c
Normal file
@@ -0,0 +1,436 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
void bli_sssaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict alpha1,
|
||||
float* restrict alpha2,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict z, inc_t incz
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssaxpy2v_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dddaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict alpha1,
|
||||
double* restrict alpha2,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict z, inc_t incz
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_dddaxpy2v_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha1,
|
||||
scomplex* restrict alpha2,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict z, inc_t incz
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccaxpy2v_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha1,
|
||||
dcomplex* restrict alpha2,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict z, inc_t incz
|
||||
)
|
||||
{
|
||||
/*
|
||||
Template axpy2v kernel implementation
|
||||
|
||||
This function contains a template implementation for a double-precision
|
||||
complex kernel, coded in C, which can serve as the starting point for one
|
||||
to write an optimized kernel on an arbitrary architecture. (We show a
|
||||
template implementation for only double-precision complex because the
|
||||
templates for the other three floating-point types would be similar, with
|
||||
the real instantiations being noticeably simpler due to the disappearance
|
||||
of conjugation in the real domain.)
|
||||
|
||||
This kernel fuses two axpyv operations:
|
||||
|
||||
z := z + alpha1 * conjx( x )
|
||||
z := z + alpha2 * conjy( y )
|
||||
|
||||
where x, y, and z are vectors of length n and alpha1 and alpha2 are scalars.
|
||||
|
||||
Parameters:
|
||||
|
||||
- conjx: Compute with conjugated values of x?
|
||||
- conjy: Compute with conjugated values of y?
|
||||
- n: The number of elements in vectors x, y, and z.
|
||||
- alpha1: The address of the scalar to be applied to x.
|
||||
- alpha2: The address of the scalar to be applied to y.
|
||||
- x: The address of vector x.
|
||||
- incx: The vector increment of x. incx should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
- y: The address of vector y.
|
||||
- incy: The vector increment of y. incy should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
- z: The address of vector z.
|
||||
- incz: The vector increment of z. incz should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
|
||||
This template code calls the reference implementation if any of the
|
||||
following conditions are true:
|
||||
|
||||
- Any of the strides incx, incy, or incz is non-unit.
|
||||
- Vectors x, y, and z are unaligned with different offsets.
|
||||
|
||||
If the vectors are aligned, or unaligned by the same offset, then optimized
|
||||
code can be used for the bulk of the computation. This template shows how
|
||||
the front-edge case can be handled so that the remaining computation is
|
||||
aligned. (This template guarantees alignment in the main loops to be
|
||||
BLIS_SIMD_ALIGN_SIZE, which is defined in bli_config.h.)
|
||||
|
||||
Here are a few additional things to consider:
|
||||
|
||||
- Because conjugation disappears in the real domain, real instances of
|
||||
this kernel can safely ignore the values of any conjugation parameters,
|
||||
thereby simplifying the implementation.
|
||||
|
||||
For more info, please refer to the BLIS website and/or contact the
|
||||
blis-devel mailing list.
|
||||
|
||||
-FGVZ
|
||||
*/
|
||||
const dim_t n_elem_per_reg = 1;
|
||||
const dim_t n_iter_unroll = 1;
|
||||
|
||||
const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
|
||||
const siz_t type_size = sizeof( *x );
|
||||
|
||||
dcomplex* xp;
|
||||
dcomplex* yp;
|
||||
dcomplex* zp;
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
dim_t n_pre = 0;
|
||||
dim_t n_iter;
|
||||
dim_t n_left;
|
||||
|
||||
dim_t off_x, off_y, off_z;
|
||||
dim_t i;
|
||||
|
||||
|
||||
// Return early if possible.
|
||||
if ( bli_zero_dim1( n ) ) return;
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( bli_has_nonunit_inc3( incx, incy, incz ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
|
||||
bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) ||
|
||||
bli_is_unaligned_to( z, BLIS_SIMD_ALIGN_SIZE ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
// If a, the second column of a, and y are unaligned by the same
|
||||
// offset, then we can still use an implementation that depends on
|
||||
// alignment for most of the operation.
|
||||
off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
|
||||
off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
|
||||
off_z = bli_offset_from_alignment( z, BLIS_SIMD_ALIGN_SIZE );
|
||||
|
||||
if ( off_x == off_y && off_x == off_z )
|
||||
{
|
||||
use_ref = FALSE;
|
||||
n_pre = off_x / type_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzaxpy2v_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Compute the number of unrolled and leftover (edge) iterations.
|
||||
n_iter = ( n - n_pre ) / n_elem_per_iter;
|
||||
n_left = ( n - n_pre ) % n_elem_per_iter;
|
||||
|
||||
|
||||
// Initialize pointers into x, y, and z.
|
||||
xp = x;
|
||||
yp = y;
|
||||
zp = z;
|
||||
|
||||
|
||||
// Iterate over rows of x, y, and z to compute:
|
||||
// z += alpha1 * conjx( x ) + alpha2 * conjy( y );
|
||||
if ( bli_is_noconj( conjx ) && bli_is_noconj( conjy ) )
|
||||
{
|
||||
// Compute front edge cases if x, y, and z were unaligned.
|
||||
for ( i = 0; i < n_pre; ++i )
|
||||
{
|
||||
bli_zzzaxpys( *alpha1, *xp, *zp );
|
||||
bli_zzzaxpys( *alpha2, *yp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// alpha1 and alpha2 should be loaded once prior to the n_iter
|
||||
// loop and the elements of z should be loaded and stored only once
|
||||
// each. The addresses xp, yp, and zp are guaranteed to be aligned
|
||||
// to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
bli_zzzaxpys( *alpha1, *xp, *zp );
|
||||
bli_zzzaxpys( *alpha2, *yp, *zp );
|
||||
|
||||
xp += n_elem_per_iter;
|
||||
yp += n_elem_per_iter;
|
||||
zp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
bli_zzzaxpys( *alpha1, *xp, *zp );
|
||||
bli_zzzaxpys( *alpha2, *yp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
}
|
||||
else if ( bli_is_noconj( conjx ) && bli_is_conj( conjy ) )
|
||||
{
|
||||
// Compute front edge cases if x, y, and z were unaligned.
|
||||
for ( i = 0; i < n_pre; ++i )
|
||||
{
|
||||
bli_zzzaxpys( *alpha1, *xp, *zp );
|
||||
bli_zzzaxpyjs( *alpha2, *yp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// alpha1 and alpha2 should be loaded once prior to the n_iter
|
||||
// loop and the elements of z should be loaded and stored only once
|
||||
// each. The addresses xp, yp, and zp are guaranteed to be aligned
|
||||
// to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
bli_zzzaxpys( *alpha1, *xp, *zp );
|
||||
bli_zzzaxpyjs( *alpha2, *yp, *zp );
|
||||
|
||||
xp += n_elem_per_iter;
|
||||
yp += n_elem_per_iter;
|
||||
zp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
bli_zzzaxpys( *alpha1, *xp, *zp );
|
||||
bli_zzzaxpyjs( *alpha2, *yp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
}
|
||||
else if ( bli_is_conj( conjx ) && bli_is_noconj( conjy ) )
|
||||
{
|
||||
// Compute front edge cases if x, y, and z were unaligned.
|
||||
for ( i = 0; i < n_pre; ++i )
|
||||
{
|
||||
bli_zzzaxpyjs( *alpha1, *xp, *zp );
|
||||
bli_zzzaxpys( *alpha2, *yp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// alpha1 and alpha2 should be loaded once prior to the n_iter
|
||||
// loop and the elements of z should be loaded and stored only once
|
||||
// each. The addresses xp, yp, and zp are guaranteed to be aligned
|
||||
// to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
bli_zzzaxpyjs( *alpha1, *xp, *zp );
|
||||
bli_zzzaxpys( *alpha2, *yp, *zp );
|
||||
|
||||
xp += n_elem_per_iter;
|
||||
yp += n_elem_per_iter;
|
||||
zp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
bli_zzzaxpyjs( *alpha1, *xp, *zp );
|
||||
bli_zzzaxpys( *alpha2, *yp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
}
|
||||
else // if ( bli_is_conj( conjx ) && bli_is_conj( conjy ) )
|
||||
{
|
||||
// Compute front edge cases if x, y, and z were unaligned.
|
||||
for ( i = 0; i < n_pre; ++i )
|
||||
{
|
||||
bli_zzzaxpyjs( *alpha1, *xp, *zp );
|
||||
bli_zzzaxpyjs( *alpha2, *yp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// alpha1 and alpha2 should be loaded once prior to the n_iter
|
||||
// loop and the elements of z should be loaded and stored only once
|
||||
// each. The addresses xp, yp, and zp are guaranteed to be aligned
|
||||
// to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
bli_zzzaxpyjs( *alpha1, *xp, *zp );
|
||||
bli_zzzaxpyjs( *alpha2, *yp, *zp );
|
||||
|
||||
xp += n_elem_per_iter;
|
||||
yp += n_elem_per_iter;
|
||||
zp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
bli_zzzaxpyjs( *alpha1, *xp, *zp );
|
||||
bli_zzzaxpyjs( *alpha2, *yp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
/*
   GENTFUNC3U12 defines one axpy2v kernel instance whose operand types
   may differ: ctype_x, ctype_y, and ctype_z type the vectors and
   ctype_xy types both scalars. The generated function has no code of
   its own; it passes its arguments through to the function named by
   kername for the same type characters.
*/
#undef  GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
\
void PASTEMAC3(chx,chy,chz,varname)( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t  n, \
       ctype_xy* restrict alpha1, \
       ctype_xy* restrict alpha2, \
       ctype_x*  restrict x, inc_t incx, \
       ctype_y*  restrict y, inc_t incy, \
       ctype_z*  restrict z, inc_t incz \
     ) \
{ \
	/* Forward everything to the reference implementation. */ \
	PASTEMAC3(chx,chy,chz,kername)( conjx, conjy, \
	                                n, \
	                                alpha1, alpha2, \
	                                x, incx, \
	                                y, incy, \
	                                z, incz ); \
}

/* Mixed-domain instances: only if BLIS_ENABLE_MIXED_DOMAIN_SUPPORT is defined. */
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( axpy2v_opt_var1, axpy2v_unb_var1 )
#endif

/* Mixed-precision instances: only if BLIS_ENABLE_MIXED_PRECISION_SUPPORT is defined. */
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( axpy2v_opt_var1, axpy2v_unb_var1 )
#endif
|
||||
|
||||
58
config/template/kernels/1f/bli_axpy2v_opt_var1.h
Normal file
58
config/template/kernels/1f/bli_axpy2v_opt_var1.h
Normal file
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chz,varname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t m, \
|
||||
ctype_xy* restrict alpha1, \
|
||||
ctype_xy* restrict alpha2, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_z* restrict z, inc_t incz \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( axpy2v_opt_var1 )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_D( axpy2v_opt_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_P( axpy2v_opt_var1 )
|
||||
#endif
|
||||
|
||||
416
config/template/kernels/1f/bli_axpyf_opt_var1.c
Normal file
416
config/template/kernels/1f/bli_axpyf_opt_var1.c
Normal file
@@ -0,0 +1,416 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
void bli_sssaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssaxpyf_unb_var1( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dddaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_dddaxpyf_unb_var1( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccaxpyf_unb_var1( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
void bli_zzzaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/*
|
||||
Template axpyf kernel implementation
|
||||
|
||||
This function contains a template implementation for a double-precision
|
||||
complex kernel, coded in C, which can serve as the starting point for one
|
||||
to write an optimized kernel on an arbitrary architecture. (We show a
|
||||
template implementation for only double-precision complex because the
|
||||
templates for the other three floating-point types would be similar, with
|
||||
the real instantiations being noticeably simpler due to the disappearance
|
||||
of conjugation in the real domain.)
|
||||
|
||||
This kernel performs the following gemv-like operation:
|
||||
|
||||
y := y + alpha * conja( A ) * conjx( x )
|
||||
|
||||
where A is an m x b_n matrix, x is a vector of length b_n, y is a vector
|
||||
of length m, and alpha is a scalar. The operation is performed as a series
|
||||
of fused axpyv operations, and therefore A should be column-stored.
|
||||
|
||||
Parameters:
|
||||
|
||||
- conja: Compute with conjugated values of A?
|
||||
- conjx: Compute with conjugated values of x?
|
||||
- m: The number of rows in matrix A.
|
||||
- b_n: The number of columns in matrix A. Must be equal to or less than
|
||||
the fusing factor.
|
||||
- alpha: The address of a scalar.
|
||||
- a: The address of matrix A.
|
||||
- inca: The row stride of A. inca should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
- lda: The column stride of A.
|
||||
- x: The address of vector x.
|
||||
- incx: The vector increment of x.
|
||||
- y: The address of vector y.
|
||||
- incy: The vector increment of y. incy should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
|
||||
This template code calls the reference implementation if any of the
|
||||
following conditions are true:
|
||||
|
||||
- Either of the strides inca or incy is non-unit.
|
||||
- The address of A, the second column of A, and y are unaligned with
|
||||
different offsets.
|
||||
|
||||
If the first/second columns of A and address of y are aligned, or unaligned
|
||||
by the same offset, then optimized code can be used for the bulk of the
|
||||
computation. This template shows how the front-edge case can be handled so
|
||||
that the remaining computation is aligned. (This template guarantees
|
||||
alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE, which is defined
|
||||
in bli_config.h.)
|
||||
|
||||
Additional things to consider:
|
||||
|
||||
- When optimizing, you should fully unroll the loops over b_n. This is the
|
||||
dimension across which we are fusing axpyv operations.
|
||||
- This template code chooses to call the reference implementation whenever
|
||||
b_n is less than the fusing factor, so as to avoid having to handle edge
|
||||
cases. One may choose to optimize this edge case, if desired.
|
||||
- Because conjugation disappears in the real domain, real instances of
|
||||
this kernel can safely ignore the values of any conjugation parameters,
|
||||
thereby simplifying the implementation.
|
||||
|
||||
For more info, please refer to the BLIS website and/or contact the
|
||||
blis-devel mailing list.
|
||||
|
||||
-FGVZ
|
||||
*/
|
||||
const dim_t n_elem_per_reg = 1;
|
||||
const dim_t n_iter_unroll = 1;
|
||||
|
||||
const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
|
||||
const siz_t type_size = sizeof( *a );
|
||||
|
||||
dcomplex* ap[ bli_zaxpyf_fusefac ];
|
||||
dcomplex* xp[ bli_zaxpyf_fusefac ];
|
||||
dcomplex* yp;
|
||||
|
||||
dcomplex alpha_x[ bli_zaxpyf_fusefac ];
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
dim_t m_pre = 0;
|
||||
dim_t m_iter;
|
||||
dim_t m_left;
|
||||
|
||||
dim_t off_a, off_a2, off_y;
|
||||
dim_t i, j;
|
||||
|
||||
|
||||
// Return early if possible.
|
||||
if ( bli_zero_dim2( m, b_n ) ) return;
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( b_n < bli_zaxpyf_fusefac )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_has_nonunit_inc3( inca, incx, incy ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( a, BLIS_SIMD_ALIGN_SIZE ) ||
|
||||
bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) ||
|
||||
bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
// If a, the second column of a, and y are unaligned by the same
|
||||
// offset, then we can still use an implementation that depends on
|
||||
// alignment for most of the operation.
|
||||
off_a = bli_offset_from_alignment( a, BLIS_SIMD_ALIGN_SIZE );
|
||||
off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE );
|
||||
off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
|
||||
|
||||
if ( off_a == off_y && off_a == off_a2 )
|
||||
{
|
||||
use_ref = FALSE;
|
||||
m_pre = off_a / type_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzaxpyf_unb_var1( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Compute the number of unrolled and leftover (edge) iterations.
|
||||
m_iter = ( m - m_pre ) / n_elem_per_iter;
|
||||
m_left = ( m - m_pre ) % n_elem_per_iter;
|
||||
|
||||
|
||||
// Initialize pointers into the columns of A and elements of x.
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
ap[ j ] = a + (j )*lda;
|
||||
xp[ j ] = x + (j )*incx;
|
||||
}
|
||||
yp = y;
|
||||
|
||||
|
||||
// Load elements of x or conj(x) into alpha_x and scale by alpha.
|
||||
if ( bli_is_noconj( conjx ) )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzcopys( *xp[ j ], alpha_x[ j ] );
|
||||
bli_zzscals( *alpha, alpha_x[ j ] );
|
||||
}
|
||||
}
|
||||
else // if ( bli_is_conj( conjx ) )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzcopyjs( *xp[ j ], alpha_x[ j ] );
|
||||
bli_zzscals( *alpha, alpha_x[ j ] );
|
||||
}
|
||||
}
|
||||
|
||||
// Iterate over rows of A and y to compute:
|
||||
// y += conja( A )*conjx( x );
|
||||
if ( bli_is_noconj( conja ) )
|
||||
{
|
||||
// Compute front edge cases if a and y were unaligned.
|
||||
for ( i = 0; i < m_pre; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzaxpys( alpha_x[ j ], *ap[ j ], *yp );
|
||||
|
||||
ap[ j ] += 1;
|
||||
}
|
||||
yp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// the elements of alpha_x should be loaded once prior to the m_iter
|
||||
// loop, and the b_n loop should be fully unrolled. The addresses in
|
||||
// ap[] and yp are guaranteed to be aligned to
|
||||
// BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < m_iter; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzaxpys( alpha_x[ j ], *ap[ j ], *yp );
|
||||
|
||||
ap[ j ] += n_elem_per_iter;
|
||||
}
|
||||
yp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < m_left; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzaxpys( alpha_x[ j ], *ap[ j ], *yp );
|
||||
|
||||
ap[ j ] += 1;
|
||||
}
|
||||
yp += 1;
|
||||
}
|
||||
}
|
||||
else // if ( bli_is_conj( conja ) )
|
||||
{
|
||||
// Compute front edge cases if a and y were unaligned.
|
||||
for ( i = 0; i < m_pre; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzaxpyjs( alpha_x[ j ], *ap[ j ], *yp );
|
||||
|
||||
ap[ j ] += 1;
|
||||
}
|
||||
yp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// the elements of alpha_x should be loaded once prior to the m_iter
|
||||
// loop, and the b_n loop should be fully unrolled. The addresses in
|
||||
// ap[] and yp are guaranteed to be aligned to
|
||||
// BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < m_iter; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzaxpyjs( alpha_x[ j ], *ap[ j ], *yp );
|
||||
|
||||
ap[ j ] += n_elem_per_iter;
|
||||
}
|
||||
yp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases.
|
||||
for ( i = 0; i < m_left; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzaxpyjs( alpha_x[ j ], *ap[ j ], *yp );
|
||||
|
||||
ap[ j ] += 1;
|
||||
}
|
||||
yp += 1;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
/*
   GENTFUNC3U12 defines one axpyf kernel instance whose operand types
   may differ: ctype_a, ctype_x, and ctype_y type A, x, and y, and
   ctype_ax types the scalar alpha. The generated function has no code
   of its own; it passes its arguments through to the function named by
   kername for the same type characters.
*/
#undef  GENTFUNC3U12
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
\
void PASTEMAC3(cha,chx,chy,varname)( \
       conj_t conja, \
       conj_t conjx, \
       dim_t  m, \
       dim_t  b_n, \
       ctype_ax* restrict alpha, \
       ctype_a*  restrict a, inc_t inca, inc_t lda, \
       ctype_x*  restrict x, inc_t incx, \
       ctype_y*  restrict y, inc_t incy \
     ) \
{ \
	/* Forward everything to the reference implementation. */ \
	PASTEMAC3(cha,chx,chy,kername)( conja, conjx, \
	                                m, b_n, \
	                                alpha, \
	                                a, inca, lda, \
	                                x, incx, \
	                                y, incy ); \
}

/* Mixed-domain instances: only if BLIS_ENABLE_MIXED_DOMAIN_SUPPORT is defined. */
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( axpyf_opt_var1, axpyf_unb_var1 )
#endif

/* Mixed-precision instances: only if BLIS_ENABLE_MIXED_PRECISION_SUPPORT is defined. */
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( axpyf_opt_var1, axpyf_unb_var1 )
#endif
|
||||
|
||||
62
config/template/kernels/1f/bli_axpyf_opt_var1.h
Normal file
62
config/template/kernels/1f/bli_axpyf_opt_var1.h
Normal file
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype axpyf kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conja, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t b_n, \
|
||||
ctype_ax* restrict alpha, \
|
||||
ctype_a* restrict a, inc_t inca, inc_t lda, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( axpyf_opt_var1 )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_D( axpyf_opt_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_P( axpyf_opt_var1 )
|
||||
#endif
|
||||
|
||||
470
config/template/kernels/1f/bli_dotaxpyv_opt_var1.c
Normal file
470
config/template/kernels/1f/bli_dotaxpyv_opt_var1.c
Normal file
@@ -0,0 +1,470 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
void bli_sssdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict rho,
|
||||
float* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssdotaxpyv_unb_var1( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ddddotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict rho,
|
||||
double* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ddddotaxpyv_unb_var1( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict rho,
|
||||
scomplex* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccdotaxpyv_unb_var1( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict rho,
|
||||
dcomplex* restrict z, inc_t incz )
|
||||
{
|
||||
/*
|
||||
Template dotaxpyv kernel implementation
|
||||
|
||||
This function contains a template implementation for a double-precision
|
||||
complex kernel, coded in C, which can serve as the starting point for one
|
||||
to write an optimized kernel on an arbitrary architecture. (We show a
|
||||
template implementation for only double-precision complex because the
|
||||
templates for the other three floating-point types would be similar, with
|
||||
the real instantiations being noticeably simpler due to the disappearance
|
||||
of conjugation in the real domain.)
|
||||
|
||||
This kernel fuses a dotv and axpyv operation:
|
||||
|
||||
rho := conjxt( x^T ) * conjy( y )
|
||||
z := z + alpha * conjx( x )
|
||||
|
||||
where x, y, and z are vectors of length n and alpha1 and alpha2 are scalars.
|
||||
|
||||
Parameters:
|
||||
|
||||
- conjxt: Compute with conjugated values of x^T?
|
||||
- conjx: Compute with conjugated values of x?
|
||||
- conjy: Compute with conjugated values of y?
|
||||
- n: The number of elements in vectors x, y, and z.
|
||||
- alpha: The address of the scalar to be applied to x.
|
||||
- x: The address of vector x.
|
||||
- incx: The vector increment of x. incx should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
- y: The address of vector y.
|
||||
- incy: The vector increment of y. incy should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
- rho: The address of the output scalar of the dotv subproblem.
|
||||
- z: The address of vector z.
|
||||
- incz: The vector increment of z. incz should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
|
||||
This template code calls the reference implementation if any of the
|
||||
following conditions are true:
|
||||
|
||||
- Any of the strides incx, incy, or incz is non-unit.
|
||||
- Vectors x, y, and z are unaligned with different offsets.
|
||||
|
||||
If the vectors are aligned, or unaligned by the same offset, then optimized
|
||||
code can be used for the bulk of the computation. This template shows how
|
||||
the front-edge case can be handled so that the remaining computation is
|
||||
aligned. (This template guarantees alignment in the main loops to be
|
||||
BLIS_SIMD_ALIGN_SIZE, which is defined in bli_config.h.)
|
||||
|
||||
Here are a few additional things to consider:
|
||||
|
||||
- While four combinations of possible values of conjx and conjy exist, we
|
||||
implement only conjugation on x explicitly; we induce the other two cases
|
||||
by toggling the effective conjugation on x and then conjugating the dot
|
||||
product result.
|
||||
- Because conjugation disappears in the real domain, real instances of
|
||||
this kernel can safely ignore the values of any conjugation parameters,
|
||||
thereby simplifying the implementation.
|
||||
|
||||
For more info, please refer to the BLIS website and/or contact the
|
||||
blis-devel mailing list.
|
||||
|
||||
-FGVZ
|
||||
*/
|
||||
const dim_t n_elem_per_reg = 1;
|
||||
const dim_t n_iter_unroll = 1;
|
||||
|
||||
const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
|
||||
const siz_t type_size = sizeof( *x );
|
||||
|
||||
dcomplex* xp;
|
||||
dcomplex* yp;
|
||||
dcomplex* zp;
|
||||
dcomplex dotxy;
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
dim_t n_pre = 0;
|
||||
dim_t n_iter;
|
||||
dim_t n_left;
|
||||
|
||||
dim_t off_x, off_y, off_z;
|
||||
dim_t i;
|
||||
|
||||
conj_t conjxt_use;
|
||||
|
||||
|
||||
// If the vector lengths are zero, set rho to zero and return.
|
||||
if ( bli_zero_dim1( n ) )
|
||||
{
|
||||
bli_zset0s( *rho );
|
||||
return;
|
||||
}
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( bli_has_nonunit_inc3( incx, incy, incz ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
|
||||
bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) ||
|
||||
bli_is_unaligned_to( z, BLIS_SIMD_ALIGN_SIZE ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
// If x, y, and z are unaligned by the same offset, then we can
|
||||
// still use an implementation that depends on alignment for most
|
||||
// of the operation.
|
||||
off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
|
||||
off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
|
||||
off_z = bli_offset_from_alignment( z, BLIS_SIMD_ALIGN_SIZE );
|
||||
|
||||
if ( off_x == off_y && off_x == off_z )
|
||||
{
|
||||
use_ref = FALSE;
|
||||
n_pre = off_x / type_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzdotaxpyv_unb_var1( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Compute the number of unrolled and leftover (edge) iterations.
|
||||
n_iter = ( n - n_pre ) / n_elem_per_iter;
|
||||
n_left = ( n - n_pre ) % n_elem_per_iter;
|
||||
|
||||
|
||||
// Initialize pointers into x, y, and z.
|
||||
xp = x;
|
||||
yp = y;
|
||||
zp = z;
|
||||
|
||||
|
||||
// Initialize accumulator to zero.
|
||||
bli_zset0s( dotxy );
|
||||
|
||||
|
||||
conjxt_use = conjxt;
|
||||
|
||||
// If y must be conjugated, we compute the result indirectly by first
|
||||
// toggling the effective conjugation of xt and then conjugating the
|
||||
// resulting dot product.
|
||||
if ( bli_is_conj( conjy ) )
|
||||
bli_toggle_conj( conjxt_use );
|
||||
|
||||
|
||||
// Iterate over elements of x, y, and z to compute:
|
||||
// r = conjxt( x^T ) * conjy( y );
|
||||
// z += alpha * conjx( x );
|
||||
if ( bli_is_noconj( conjx ) && bli_is_noconj( conjxt_use ) )
|
||||
{
|
||||
// Compute front edge cases if x, y, and z were unaligned.
|
||||
for ( i = 0; i < n_pre; ++i )
|
||||
{
|
||||
bli_zzzdots( *xp, *yp, dotxy );
|
||||
bli_zzzaxpys( *alpha, *xp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// alpha should be loaded once prior to the n_iter loop, dotxy
|
||||
// should be and kept in registers, and each element of x should be
|
||||
// loaded only once each. The addresses xp, yp, and zp are
|
||||
// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
bli_zzzdots( *xp, *yp, dotxy );
|
||||
bli_zzzaxpys( *alpha, *xp, *zp );
|
||||
|
||||
xp += n_elem_per_iter;
|
||||
yp += n_elem_per_iter;
|
||||
zp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
bli_zzzdots( *xp, *yp, dotxy );
|
||||
bli_zzzaxpys( *alpha, *xp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
}
|
||||
else if ( bli_is_noconj( conjx ) && bli_is_conj( conjxt_use ) )
|
||||
{
|
||||
// Compute front edge cases if x, y, and z were unaligned.
|
||||
for ( i = 0; i < n_pre; ++i )
|
||||
{
|
||||
bli_zzzdotjs( *xp, *yp, dotxy );
|
||||
bli_zzzaxpys( *alpha, *xp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// alpha should be loaded once prior to the n_iter loop, dotxy
|
||||
// should be and kept in registers, and each element of x should be
|
||||
// loaded only once each. The addresses xp, yp, and zp are
|
||||
// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
bli_zzzdotjs( *xp, *yp, dotxy );
|
||||
bli_zzzaxpys( *alpha, *xp, *zp );
|
||||
|
||||
xp += n_elem_per_iter;
|
||||
yp += n_elem_per_iter;
|
||||
zp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
bli_zzzdotjs( *xp, *yp, dotxy );
|
||||
bli_zzzaxpys( *alpha, *xp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
}
|
||||
else if ( bli_is_conj( conjx ) && bli_is_noconj( conjxt_use ) )
|
||||
{
|
||||
// Compute front edge cases if x, y, and z were unaligned.
|
||||
for ( i = 0; i < n_pre; ++i )
|
||||
{
|
||||
bli_zzzdots( *xp, *yp, dotxy );
|
||||
bli_zzzaxpyjs( *alpha, *xp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// alpha should be loaded once prior to the n_iter loop, dotxy
|
||||
// should be and kept in registers, and each element of x should be
|
||||
// loaded only once each. The addresses xp, yp, and zp are
|
||||
// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
bli_zzzdots( *xp, *yp, dotxy );
|
||||
bli_zzzaxpyjs( *alpha, *xp, *zp );
|
||||
|
||||
xp += n_elem_per_iter;
|
||||
yp += n_elem_per_iter;
|
||||
zp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
bli_zzzdots( *xp, *yp, dotxy );
|
||||
bli_zzzaxpyjs( *alpha, *xp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
}
|
||||
else // if ( bli_is_conj( conjx ) && bli_is_conj( conjxt_use ) )
|
||||
{
|
||||
// Compute front edge cases if x, y, and z were unaligned.
|
||||
for ( i = 0; i < n_pre; ++i )
|
||||
{
|
||||
bli_zzzdotjs( *xp, *yp, dotxy );
|
||||
bli_zzzaxpyjs( *alpha, *xp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// alpha should be loaded once prior to the n_iter loop, dotxy
|
||||
// should be and kept in registers, and each element of x should be
|
||||
// loaded only once each. The addresses xp, yp, and zp are
|
||||
// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
bli_zzzdotjs( *xp, *yp, dotxy );
|
||||
bli_zzzaxpyjs( *alpha, *xp, *zp );
|
||||
|
||||
xp += n_elem_per_iter;
|
||||
yp += n_elem_per_iter;
|
||||
zp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
bli_zzzdotjs( *xp, *yp, dotxy );
|
||||
bli_zzzaxpyjs( *alpha, *xp, *zp );
|
||||
|
||||
xp += 1; yp += 1; zp += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// If conjugation on y was requested, we induce it by conjugating
|
||||
// the contents of rho.
|
||||
if ( bli_is_conj( conjy ) )
|
||||
bli_zconjs( dotxy );
|
||||
|
||||
bli_zzcopys( dotxy, *rho );
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
// Function generator for the mixed-type dotaxpyv kernels. Each expansion
// defines PASTEMAC3(chx,chy,chz,varname) as a forwarder that passes all of
// its arguments, unchanged, to PASTEMAC3(chx,chy,chz,kername).
#undef  GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
\
void PASTEMAC3(chx,chy,chz,varname) \
     ( \
       conj_t conjxt, conj_t conjx, conj_t conjy, \
       dim_t  n, \
       ctype_x*  restrict alpha, \
       ctype_x*  restrict x, inc_t incx, \
       ctype_y*  restrict y, inc_t incy, \
       ctype_xy* restrict rho, \
       ctype_z*  restrict z, inc_t incz \
     ) \
{ \
    /* No optimized code here: defer to the reference implementation. */ \
    PASTEMAC3(chx,chy,chz,kername) \
    ( \
      conjxt, conjx, conjy, \
      n, \
      alpha, \
      x, incx, \
      y, incy, \
      rho, \
      z, incz \
    ); \
}

// Mixed-domain instances, only when that support is enabled.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotaxpyv_opt_var1, dotaxpyv_unb_var1 )
#endif

// Mixed-precision instances, only when that support is enabled.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotaxpyv_opt_var1, dotaxpyv_unb_var1 )
#endif
|
||||
|
||||
60
config/template/kernels/1f/bli_dotaxpyv_opt_var1.h
Normal file
60
config/template/kernels/1f/bli_dotaxpyv_opt_var1.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chz,varname)( \
|
||||
conj_t conjxt, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_xy* restrict rho, \
|
||||
ctype_z* restrict z, inc_t incz \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( dotaxpyv_opt_var1 )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_D( dotaxpyv_opt_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_P( dotaxpyv_opt_var1 )
|
||||
#endif
|
||||
|
||||
610
config/template/kernels/1f/bli_dotxaxpyf_opt_var1.c
Normal file
610
config/template/kernels/1f/bli_dotxaxpyf_opt_var1.c
Normal file
@@ -0,0 +1,610 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
void bli_sssdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict w, inc_t incw,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict beta,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssdotxaxpyf_unb_var1( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ddddotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict w, inc_t incw,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict beta,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ddddotxaxpyf_unb_var1( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict w, inc_t incw,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccdotxaxpyf_unb_var1( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict w, inc_t incw,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict z, inc_t incz )
|
||||
|
||||
{
|
||||
/*
|
||||
Template dotxaxpyf kernel implementation
|
||||
|
||||
This function contains a template implementation for a double-precision
|
||||
complex kernel, coded in C, which can serve as the starting point for one
|
||||
to write an optimized kernel on an arbitrary architecture. (We show a
|
||||
template implementation for only double-precision complex because the
|
||||
templates for the other three floating-point types would be similar, with
|
||||
the real instantiations being noticeably simpler due to the disappearance
|
||||
of conjugation in the real domain.)
|
||||
|
||||
This kernel performs the following two gemv-like operations:
|
||||
|
||||
y := beta * y + alpha * conjat( A^T ) * conjw( w )
|
||||
z := z + alpha * conja( A ) * conjx( x )
|
||||
|
||||
where A is an m x b_n matrix, x and y are vector of length b_n, w and z
|
||||
are vectors of length m, and alpha and beta are scalars. The operation
|
||||
fuses a dotxf and an axpyf operation, and therefore A should be column-
|
||||
stored.
|
||||
|
||||
Parameters:
|
||||
|
||||
- conjat: Compute with conjugated values of A^T?
|
||||
- conja: Compute with conjugated values of A?
|
||||
- conjw: Compute with conjugated values of w?
|
||||
- conjx: Compute with conjugated values of x?
|
||||
- m: The number of rows in matrix A.
|
||||
- b_n: The number of columns in matrix A. Must be equal to or less than
|
||||
the fusing factor.
|
||||
- alpha: The address of the scalar to be applied to A^T*w and A*x.
|
||||
- a: The address of matrix A.
|
||||
- inca: The row stride of A. inca should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
- lda: The column stride of A.
|
||||
- w: The address of vector w.
|
||||
- incw: The vector increment of w. incw should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
- x: The address of vector x.
|
||||
- incx: The vector increment of x.
|
||||
- beta: The address of the scalar to be applied to y.
|
||||
- y: The address of vector y.
|
||||
- incy: The vector increment of y.
|
||||
- z: The address of vector z.
|
||||
- incz: The vector increment of z. incz should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
|
||||
This template code calls the reference implementation if any of the
|
||||
following conditions are true:
|
||||
|
||||
- Any of the strides inca, incw, or incz is non-unit.
|
||||
- The address of A, the second column of A, w, and z are unaligned with
|
||||
different offsets.
|
||||
|
||||
If the first/second rows of A and addresses of w and z are aligned, or
|
||||
unaligned by the same offset, then optimized code can be used for the bulk
|
||||
of the computation. This template shows how the front-edge case can be
|
||||
handled so that the remaining computation is aligned. (This template
|
||||
guarantees alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE, which
|
||||
is defined in bli_config.h.)
|
||||
|
||||
Additional things to consider:
|
||||
|
||||
- When optimizing, you should fully unroll the loops over b_n. This is the
|
||||
dimension across which we are fusing dotxv operations.
|
||||
- This template code chooses to call the reference implementation whenever
|
||||
b_n is less than the fusing factor, so as to avoid having to handle edge
|
||||
cases. One may choose to optimize this edge case, if desired.
|
||||
- Because conjugation disappears in the real domain, real instances of
|
||||
this kernel can safely ignore the values of any conjugation parameters,
|
||||
thereby simplifying the implementation.
|
||||
|
||||
For more info, please refer to the BLIS website and/or contact the
|
||||
blis-devel mailing list.
|
||||
|
||||
-FGVZ
|
||||
*/
|
||||
const dim_t n_elem_per_reg = 1;
|
||||
const dim_t n_iter_unroll = 1;
|
||||
|
||||
const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
|
||||
const siz_t type_size = sizeof( *a );
|
||||
|
||||
dcomplex* ap[ bli_zdotxaxpyf_fusefac ];
|
||||
dcomplex* xp[ bli_zdotxaxpyf_fusefac ];
|
||||
dcomplex* yp[ bli_zdotxaxpyf_fusefac ];
|
||||
dcomplex* wp;
|
||||
dcomplex* zp;
|
||||
|
||||
dcomplex At_w[ bli_zdotxaxpyf_fusefac ];
|
||||
dcomplex alpha_x[ bli_zdotxaxpyf_fusefac ];
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
dim_t m_pre = 0;
|
||||
dim_t m_iter;
|
||||
dim_t m_left;
|
||||
|
||||
dim_t off_a, off_a2, off_w, off_z;
|
||||
dim_t i, j;
|
||||
|
||||
conj_t conjat_use;
|
||||
|
||||
|
||||
// Return early if possible.
|
||||
if ( bli_zero_dim2( m, b_n ) ) return;
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( b_n < bli_zdotxaxpyf_fusefac )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_has_nonunit_inc3( inca, incw, incz ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( a, BLIS_SIMD_ALIGN_SIZE ) ||
|
||||
bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) ||
|
||||
bli_is_unaligned_to( w, BLIS_SIMD_ALIGN_SIZE ) ||
|
||||
bli_is_unaligned_to( z, BLIS_SIMD_ALIGN_SIZE ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
// If a, the second column of a, w, and z are unaligned by the same
|
||||
// offset, then we can still use an implementation that depends on
|
||||
// alignment for most of the operation.
|
||||
off_a = bli_offset_from_alignment( a, BLIS_SIMD_ALIGN_SIZE );
|
||||
off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE );
|
||||
off_w = bli_offset_from_alignment( w, BLIS_SIMD_ALIGN_SIZE );
|
||||
off_z = bli_offset_from_alignment( z, BLIS_SIMD_ALIGN_SIZE );
|
||||
|
||||
if ( off_a == off_a2 && off_a == off_w && off_a == off_z )
|
||||
{
|
||||
use_ref = FALSE;
|
||||
m_pre = off_a / type_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzdotxaxpyf_unb_var1( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Compute the number of unrolled and leftover (edge) iterations.
|
||||
m_iter = ( m - m_pre ) / n_elem_per_iter;
|
||||
m_left = ( m - m_pre ) % n_elem_per_iter;
|
||||
|
||||
|
||||
// Initialize pointers into the columns of A and elements of x.
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
ap[ j ] = a + (j )*lda;
|
||||
xp[ j ] = x + (j )*incx;
|
||||
yp[ j ] = y + (j )*incy;
|
||||
}
|
||||
wp = w;
|
||||
zp = z;
|
||||
|
||||
// Load elements of x or conj(x) into alpha_x and scale by alpha.
|
||||
if ( bli_is_noconj( conjx ) )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzcopys( *xp[ j ], alpha_x[ j ] );
|
||||
bli_zzscals( *alpha, alpha_x[ j ] );
|
||||
}
|
||||
}
|
||||
else // if ( bli_is_conj( conjx ) )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzcopyjs( *xp[ j ], alpha_x[ j ] );
|
||||
bli_zzscals( *alpha, alpha_x[ j ] );
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize our accumulators to zero.
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zset0s( At_w[ j ] );
|
||||
}
|
||||
|
||||
|
||||
conjat_use = conjat;
|
||||
|
||||
// If w must be conjugated, we compute the result indirectly by first
|
||||
// toggling the effective conjugation of At and then conjugating the
|
||||
// resulting dot products.
|
||||
if ( bli_is_conj( conjw ) )
|
||||
bli_toggle_conj( conjat_use );
|
||||
|
||||
|
||||
// Iterate over the columns of A and elements of w and z to compute:
|
||||
// y = beta * y + alpha * conjat( A^T ) * conjw( w );
|
||||
// z = z + alpha * conja( A ) * conjx( x );
|
||||
// where A is m x b_n.
|
||||
if ( bli_is_noconj( conja ) && bli_is_noconj( conjat_use ) )
|
||||
{
|
||||
// Compute front edge cases if A, w, and z were unaligned.
|
||||
for ( i = 0; i < m_pre; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
|
||||
bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );
|
||||
|
||||
ap[ j ] += 1;
|
||||
}
|
||||
wp += 1; zp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// the elements of alpha_x should be loaded once prior to the m_iter
|
||||
// loop, At_w should be kept in registers, and the b_n loop should
|
||||
// be fully unrolled. The addresses in ap[], wp, and zp are
|
||||
// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < m_iter; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
|
||||
bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );
|
||||
|
||||
ap[ j ] += n_elem_per_iter;
|
||||
}
|
||||
wp += n_elem_per_iter; zp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < m_left; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
|
||||
bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );
|
||||
|
||||
ap[ j ] += 1;
|
||||
}
|
||||
wp += 1; zp += 1;
|
||||
}
|
||||
}
|
||||
else if ( bli_is_noconj( conja ) && bli_is_conj( conjat_use ) )
|
||||
{
|
||||
// Compute front edge cases if A, w, and z were unaligned.
|
||||
for ( i = 0; i < m_pre; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
|
||||
bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );
|
||||
|
||||
ap[ j ] += 1;
|
||||
}
|
||||
wp += 1; zp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// the elements of alpha_x should be loaded once prior to the m_iter
|
||||
// loop, At_w should be kept in registers, and the b_n loop should
|
||||
// be fully unrolled. The addresses in ap[], wp, and zp are
|
||||
// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < m_iter; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
|
||||
bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );
|
||||
|
||||
ap[ j ] += n_elem_per_iter;
|
||||
}
|
||||
wp += n_elem_per_iter; zp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < m_left; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
|
||||
bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );
|
||||
|
||||
ap[ j ] += 1;
|
||||
}
|
||||
wp += 1; zp += 1;
|
||||
}
|
||||
}
|
||||
else if ( bli_is_conj( conja ) && bli_is_noconj( conjat_use ) )
|
||||
{
|
||||
// Compute front edge cases if A, w, and z were unaligned.
|
||||
for ( i = 0; i < m_pre; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
|
||||
bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );
|
||||
|
||||
ap[ j ] += 1;
|
||||
}
|
||||
wp += 1; zp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// the elements of alpha_x should be loaded once prior to the m_iter
|
||||
// loop, At_w should be kept in registers, and the b_n loop should
|
||||
// be fully unrolled. The addresses in ap[], wp, and zp are
|
||||
// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < m_iter; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
|
||||
bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );
|
||||
|
||||
ap[ j ] += n_elem_per_iter;
|
||||
}
|
||||
wp += n_elem_per_iter; zp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < m_left; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
|
||||
bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );
|
||||
|
||||
ap[ j ] += 1;
|
||||
}
|
||||
wp += 1; zp += 1;
|
||||
}
|
||||
}
|
||||
else if ( bli_is_conj( conja ) && bli_is_conj( conjat_use ) )
|
||||
{
|
||||
// Compute front edge cases if A, w, and z were unaligned.
|
||||
for ( i = 0; i < m_pre; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
|
||||
bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );
|
||||
|
||||
ap[ j ] += 1;
|
||||
}
|
||||
wp += 1; zp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// the elements of alpha_x should be loaded once prior to the m_iter
|
||||
// loop, At_w should be kept in registers, and the b_n loop should
|
||||
// be fully unrolled. The addresses in ap[], wp, and zp are
|
||||
// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( i = 0; i < m_iter; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
|
||||
bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );
|
||||
|
||||
ap[ j ] += n_elem_per_iter;
|
||||
}
|
||||
wp += n_elem_per_iter; zp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( i = 0; i < m_left; ++i )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
|
||||
bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );
|
||||
|
||||
ap[ j ] += 1;
|
||||
}
|
||||
wp += 1; zp += 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// If conjugation on w was requested, we induce it by conjugating
|
||||
// the contents of At_w.
|
||||
if ( bli_is_conj( conjw ) )
|
||||
{
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zconjs( At_w[ j ] );
|
||||
}
|
||||
}
|
||||
|
||||
// Scale the At_w product by alpha and accumulate into y after
|
||||
// scaling by beta.
|
||||
for ( j = 0; j < b_n; ++j )
|
||||
{
|
||||
bli_zzscals( *beta, *yp[ j ] );
|
||||
bli_zzzaxpys( *alpha, At_w[ j ], *yp[ j ] );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTFUNC3U12 generates one dotxaxpyf kernel entry point, named by pasting
// the type characters (cha,chb,chc) onto 'varname'. The generated function
// forwards every argument, unchanged, to the function named by pasting the
// same three type characters onto 'kername'.
//
//   ctype_a,  cha:  type / type character of matrix a.
//   ctype_b,  chb:  type / type character of vectors w and x.
//   ctype_c,  chc:  type / type character of beta and of vectors y and z.
//   ctype_ab, chab: type / type character of alpha (chab is not used in the
//                   body).
//
#undef  GENTFUNC3U12
#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname, kername ) \
\
void PASTEMAC3(cha,chb,chc,varname)( \
                                     conj_t             conjat, \
                                     conj_t             conja, \
                                     conj_t             conjw, \
                                     conj_t             conjx, \
                                     dim_t              m, \
                                     dim_t              b_n, \
                                     ctype_ab* restrict alpha, \
                                     ctype_a*  restrict a, inc_t inca, inc_t lda, \
                                     ctype_b*  restrict w, inc_t incw, \
                                     ctype_b*  restrict x, inc_t incx, \
                                     ctype_c*  restrict beta, \
                                     ctype_c*  restrict y, inc_t incy, \
                                     ctype_c*  restrict z, inc_t incz \
                                   ) \
{ \
    /* Just call the reference implementation. The callee name must be   */ \
    /* built from this macro's own type characters (cha,chb,chc); any    */ \
    /* other identifier would not be substituted by the preprocessor.    */ \
    PASTEMAC3(cha,chb,chc,kername)( conjat, \
                                    conja, \
                                    conjw, \
                                    conjx, \
                                    m, \
                                    b_n, \
                                    alpha, \
                                    a, inca, lda, \
                                    w, incw, \
                                    x, incx, \
                                    beta, \
                                    y, incy, \
                                    z, incz ); \
}

// Instantiate the wrappers only when the corresponding mixed-type support is
// compiled in. (The INSERT_GENTFUNC3U12_MIX_* macros are defined elsewhere;
// presumably each invokes GENTFUNC3U12 once per type combination -- confirm.)
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 )
#endif

#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 )
#endif
|
||||
|
||||
64
config/template/kernels/1f/bli_dotxaxpyf_opt_var1.h
Normal file
64
config/template/kernels/1f/bli_dotxaxpyf_opt_var1.h
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chb,chc,varname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conja, \
|
||||
conj_t conjw, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t b_n, \
|
||||
ctype_ab* restrict alpha, \
|
||||
ctype_a* restrict a, inc_t inca, inc_t lda, \
|
||||
ctype_b* restrict w, inc_t incw, \
|
||||
ctype_b* restrict x, inc_t incx, \
|
||||
ctype_c* restrict beta, \
|
||||
ctype_c* restrict y, inc_t incy, \
|
||||
ctype_c* restrict z, inc_t incz \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( dotxaxpyf_opt_var1 )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_D( dotxaxpyf_opt_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_P( dotxaxpyf_opt_var1 )
|
||||
#endif
|
||||
|
||||
456
config/template/kernels/1f/bli_dotxf_opt_var1.c
Normal file
456
config/template/kernels/1f/bli_dotxf_opt_var1.c
Normal file
@@ -0,0 +1,456 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
void bli_sssdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict beta,
|
||||
float* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssdotxf_unb_var1( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ddddotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict beta,
|
||||
double* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ddddotxf_unb_var1( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccdotxf_unb_var1( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/*
|
||||
Template dotxf kernel implementation
|
||||
|
||||
This function contains a template implementation for a double-precision
|
||||
complex kernel, coded in C, which can serve as the starting point for one
|
||||
to write an optimized kernel on an arbitrary architecture. (We show a
|
||||
template implementation for only double-precision complex because the
|
||||
templates for the other three floating-point types would be similar, with
|
||||
the real instantiations being noticeably simpler due to the disappearance
|
||||
of conjugation in the real domain.)
|
||||
|
||||
This kernel performs the following gemv-like operation:
|
||||
|
||||
y := beta * y + alpha * conjat( A^T ) * conjx( x )
|
||||
|
||||
where A is an m x b_n matrix, x is a vector of length m, y is a vector
|
||||
of length b_n, and alpha and beta are scalars. The operation is performed
|
||||
as a series of fused dotxv operations, and therefore A should be column-
|
||||
stored.
|
||||
|
||||
Parameters:
|
||||
|
||||
- conjat: Compute with conjugated values of A^T?
|
||||
- conjx: Compute with conjugated values of x?
|
||||
- m: The number of rows in matrix A.
|
||||
- b_n: The number of columns in matrix A. Must be equal to or less than
|
||||
the fusing factor.
|
||||
- alpha: The address of the scalar to be applied to A*x.
|
||||
- a: The address of matrix A.
|
||||
- inca: The row stride of A. inca should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
- lda: The column stride of A.
|
||||
- x: The address of vector x.
|
||||
- incx: The vector increment of x. incx should be unit unless the
|
||||
implementation makes special accomodation for non-unit values.
|
||||
- beta: The address of the scalar to be applied to y.
|
||||
- y: The address of vector y.
|
||||
- incy: The vector increment of y.
|
||||
|
||||
This template code calls the reference implementation if any of the
|
||||
following conditions are true:
|
||||
|
||||
- Either of the strides inca or incx is non-unit.
|
||||
- The address of A, the second column of A, and x are unaligned with
|
||||
different offsets.
|
||||
|
||||
If the first/second columns of A and address of x are aligned, or unaligned
|
||||
by the same offset, then optimized code can be used for the bulk of the
|
||||
computation. This template shows how the front-edge case can be handled so
|
||||
that the remaining computation is aligned. (This template guarantees
|
||||
alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE, which is defined
|
||||
in bli_config.h.)
|
||||
|
||||
Additional things to consider:
|
||||
|
||||
- When optimizing, you should fully unroll the loops over b_n. This is the
|
||||
dimension across which we are fusing dotxv operations.
|
||||
- This template code chooses to call the reference implementation whenever
|
||||
b_n is less than the fusing factor, so as to avoid having to handle edge
|
||||
cases. One may choose to optimize this edge case, if desired.
|
||||
- Because conjugation disappears in the real domain, real instances of
|
||||
this kernel can safely ignore the values of any conjugation parameters,
|
||||
thereby simplifying the implementation.
|
||||
|
||||
For more info, please refer to the BLIS website and/or contact the
|
||||
blis-devel mailing list.
|
||||
|
||||
-FGVZ
|
||||
*/
|
||||
const dim_t n_elem_per_reg = 1;
|
||||
const dim_t n_iter_unroll = 1;
|
||||
|
||||
const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
|
||||
const siz_t type_size = sizeof( *x );
|
||||
|
||||
dcomplex* ap[ bli_zdotxf_fusefac ];
|
||||
dcomplex* xp;
|
||||
dcomplex* yp[ bli_zdotxf_fusefac ];
|
||||
|
||||
dcomplex Atx[ bli_zdotxf_fusefac ];
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
dim_t m_pre = 0;
|
||||
dim_t m_iter;
|
||||
dim_t m_left;
|
||||
|
||||
dim_t off_a, off_a2, off_x;
|
||||
dim_t i, j;
|
||||
|
||||
conj_t conjat_use;
|
||||
|
||||
|
||||
// Return early if possible.
|
||||
if ( bli_zero_dim1( b_n ) ) return;
|
||||
|
||||
// If the vector lengths are zero, scale r by beta and return.
|
||||
if ( bli_zero_dim1( m ) )
|
||||
{
|
||||
bli_zzscalv( BLIS_NO_CONJUGATE,
|
||||
b_n,
|
||||
beta,
|
||||
y, incy );
|
||||
return;
|
||||
}
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( b_n < bli_zdotxf_fusefac )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_has_nonunit_inc2( inca, incx ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( a, BLIS_SIMD_ALIGN_SIZE ) ||
|
||||
bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) ||
|
||||
bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
// If a, the second column of a, and x are unaligned by the same
|
||||
// offset, then we can still use an implementation that depends on
|
||||
// alignment for most of the operation.
|
||||
off_a = bli_offset_from_alignment( a, BLIS_SIMD_ALIGN_SIZE );
|
||||
off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE );
|
||||
off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
|
||||
|
||||
if ( off_a == off_a2 && off_a == off_x )
|
||||
{
|
||||
use_ref = FALSE;
|
||||
m_pre = off_x / type_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzdotxf_unb_var1( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Compute the number of unrolled and leftover (edge) iterations.
|
||||
m_iter = ( m - m_pre ) / n_elem_per_iter;
|
||||
m_left = ( m - m_pre ) % n_elem_per_iter;
|
||||
|
||||
|
||||
// Initialize pointers into the rows of A and elements of y.
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
ap[ i ] = a + (i )*lda;
|
||||
yp[ i ] = y + (i )*incy;
|
||||
}
|
||||
xp = x;
|
||||
|
||||
|
||||
// Initialize our accumulators to zero.
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
bli_zset0s( Atx[ i ] );
|
||||
}
|
||||
|
||||
|
||||
conjat_use = conjat;
|
||||
|
||||
// If x must be conjugated, we compute the result indirectly by first
|
||||
// toggling the effective conjugation of A and then conjugating the
|
||||
// resulting product A^T*x.
|
||||
if ( bli_is_conj( conjx ) )
|
||||
bli_toggle_conj( conjat_use );
|
||||
|
||||
|
||||
// Iterate over columns of A and rows of x to compute:
|
||||
// Atx = conjat_use( A^T ) * x;
|
||||
if ( bli_is_noconj( conjat_use ) )
|
||||
{
|
||||
// Compute front edge cases if A and y were unaligned.
|
||||
for ( j = 0; j < m_pre; ++j )
|
||||
{
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
bli_zzzdots( *ap[ i ], *xp, Atx[ i ] );
|
||||
|
||||
ap[ i ] += 1;
|
||||
}
|
||||
xp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// the elements of Atx should be kept in registers, and the b_n loop
|
||||
// should be fully unrolled. The addresses in ap[] and xp are
|
||||
// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( j = 0; j < m_iter; ++j )
|
||||
{
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
bli_zzzdots( *ap[ i ], *xp, Atx[ i ] );
|
||||
|
||||
ap[ i ] += n_elem_per_iter;
|
||||
}
|
||||
xp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( j = 0; j < m_left; ++j )
|
||||
{
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
bli_zzzdots( *ap[ i ], *xp, Atx[ i ] );
|
||||
|
||||
ap[ i ] += 1;
|
||||
}
|
||||
xp += 1;
|
||||
}
|
||||
}
|
||||
else // if ( bli_is_conj( conjat_use ) )
|
||||
{
|
||||
// Compute front edge cases if A and y were unaligned.
|
||||
for ( j = 0; j < m_pre; ++j )
|
||||
{
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] );
|
||||
|
||||
ap[ i ] += 1;
|
||||
}
|
||||
xp += 1;
|
||||
}
|
||||
|
||||
// The bulk of the operation is executed here. For best performance,
|
||||
// the elements of Atx should be kept in registers, and the b_n loop
|
||||
// should be fully unrolled. The addresses in ap[] and xp are
|
||||
// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
|
||||
for ( j = 0; j < m_iter; ++j )
|
||||
{
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] );
|
||||
|
||||
ap[ i ] += n_elem_per_iter;
|
||||
}
|
||||
xp += n_elem_per_iter;
|
||||
}
|
||||
|
||||
// Compute tail edge cases, if applicable.
|
||||
for ( j = 0; j < m_left; ++j )
|
||||
{
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] );
|
||||
|
||||
ap[ i ] += 1;
|
||||
}
|
||||
xp += 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// If conjugation on y was requested, we induce it by conjugating
|
||||
// the contents of Atx.
|
||||
if ( bli_is_conj( conjx ) )
|
||||
{
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
bli_zconjs( Atx[ i ] );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Scale the Atx product by alpha and accumulate into y after
|
||||
// scaling by beta.
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
bli_zzscals( *beta, *yp[ i ] );
|
||||
bli_zzzaxpys( *alpha, Atx[ i ], *yp[ i ] );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTFUNC3U12 generates one dotxf kernel entry point, named by pasting the
// type characters (cha,chx,chy) onto 'varname'. The generated function
// forwards every argument, unchanged, to the function named by pasting the
// same three type characters onto 'kername'.
//
//   ctype_a,  cha:  type / type character of matrix a.
//   ctype_x,  chx:  type / type character of vector x.
//   ctype_y,  chy:  type / type character of beta and of vector y.
//   ctype_ax, chax: type / type character of alpha (chax is not used in the
//                   body).
//
// The parameter names mirror those of the GENTPROT3U12 prototype macro for
// dotxf so that the declaration and the definition read the same way.
//
#undef  GENTFUNC3U12
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
\
void PASTEMAC3(cha,chx,chy,varname)( \
                                     conj_t             conjat, \
                                     conj_t             conjx, \
                                     dim_t              m, \
                                     dim_t              b_n, \
                                     ctype_ax* restrict alpha, \
                                     ctype_a*  restrict a, inc_t inca, inc_t lda, \
                                     ctype_x*  restrict x, inc_t incx, \
                                     ctype_y*  restrict beta, \
                                     ctype_y*  restrict y, inc_t incy \
                                   ) \
{ \
    /* Just call the reference implementation. The callee name is built  */ \
    /* from the same (cha,chx,chy) triple as the function name above, so */ \
    /* every identifier here is a real macro parameter.                  */ \
    PASTEMAC3(cha,chx,chy,kername)( conjat, \
                                    conjx, \
                                    m, \
                                    b_n, \
                                    alpha, \
                                    a, inca, lda, \
                                    x, incx, \
                                    beta, \
                                    y, incy ); \
}

// Instantiate the wrappers only when the corresponding mixed-type support is
// compiled in. (The INSERT_GENTFUNC3U12_MIX_* macros are defined elsewhere;
// presumably each invokes GENTFUNC3U12 once per type combination -- confirm.)
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotxf_opt_var1, dotxf_unb_var1 )
#endif

#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotxf_opt_var1, dotxf_unb_var1 )
#endif
|
||||
|
||||
63
config/template/kernels/1f/bli_dotxf_opt_var1.h
Normal file
63
config/template/kernels/1f/bli_dotxf_opt_var1.h
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype dotxf kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t b_n, \
|
||||
ctype_ax* restrict alpha, \
|
||||
ctype_a* restrict a, inc_t inca, inc_t lda, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict beta, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( dotxf_opt_var1 )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_D( dotxf_opt_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_P( dotxf_opt_var1 )
|
||||
#endif
|
||||
|
||||
290
config/template/kernels/3/bli_gemm_opt_mxn.c
Normal file
290
config/template/kernels/3/bli_gemm_opt_mxn.c
Normal file
@@ -0,0 +1,290 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
void bli_sgemm_opt_mxn(
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a_next,
|
||||
float* restrict b_next
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sgemm_ref_mxn( k,
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c, rs_c, cs_c,
|
||||
a_next,
|
||||
b_next );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dgemm_opt_mxn(
|
||||
dim_t k,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a_next,
|
||||
double* restrict b_next
|
||||
)
|
||||
{
|
||||
/*
|
||||
Template gemm micro-kernel implementation
|
||||
|
||||
This function contains a template implementation for a double-precision
|
||||
real micro-kernel, coded in C, which can serve as the starting point for
|
||||
one to write an optimized micro-kernel on an arbitrary architecture. (We
|
||||
show a template implementation for only double-precision real because
|
||||
the templates for the other three floating-point types would be nearly
|
||||
identical.)
|
||||
|
||||
This micro-kernel performs a matrix-matrix multiplication of the form:
|
||||
|
||||
C := beta * C + alpha * A * B
|
||||
|
||||
where A is MR x k, B is k x NR, C is MR x NR, and alpha and beta are
|
||||
scalars.
|
||||
|
||||
Parameters:
|
||||
|
||||
- k: The number of columns of A and rows of B.
|
||||
- alpha: The address of a scalar to the A*B product.
|
||||
- a: The address of a micro-panel of matrix A of dimension MR x k,
|
||||
stored by columns.
|
||||
- b: The address of a micro-panel of matrix B of dimension k x NR,
|
||||
stored by rows.
|
||||
- beta: The address of a scalar to the input value of matrix C.
|
||||
- c: The address of a block of matrix C of dimension MR x NR,
|
||||
stored according to rs_c and cs_c.
|
||||
- rs_c: The row stride of matrix C (ie: the distance to the next row,
|
||||
in units of matrix elements).
|
||||
- cs_c: The column stride of matrix C (ie: the distance to the next
|
||||
column, in units of matrix elements).
|
||||
- a_next: The address of the micro-panel of A that will be used the next
|
||||
time the gemm micro-kernel will be called.
|
||||
- b_next: The address of the micro-panel of B that will be used the next
|
||||
time the gemm micro-kernel will be called.
|
||||
|
||||
The diagram below shows the packed micro-panel operands and how elements
|
||||
of each would be stored when MR == NR == 4. (The hex digits indicate the
|
||||
order of the elements in memory.) Note that the storage of C is not shown
|
||||
since it is determined by the row and column strides of C.
|
||||
|
||||
c: a: b:
|
||||
_______ ______________________ _______
|
||||
| | |0 4 8 C | |0 1 2 3|
|
||||
MR | | |1 5 9 D . . . | |4 5 6 7|
|
||||
| | += |2 6 A E | |8 9 A B|
|
||||
|_______| |3_7_B_F_______________| |C D E F|
|
||||
| . |
|
||||
NR k | . |
|
||||
| . |
|
||||
| |
|
||||
| |
|
||||
|_______|
|
||||
|
||||
NR
|
||||
Here are a few things to consider:
|
||||
|
||||
- bli_?mr and bli_?nr give the MR and NR register blocksizes for the
|
||||
datatype corresponding to the '?' character.
|
||||
- bli_?packmr and bli_?packnr are usually equal to bli_?mr and bli_?nr,
|
||||
respectively. (They are only not equal if the register blocksize
|
||||
extensions are non-zero. See bli_config.h for more details.)
|
||||
- You may assume that the addresses a and b are aligned according to
|
||||
the alignment value BLIS_CONTIG_STRIDE_ALIGN_SIZE, as defined in
|
||||
bli_config.h.
|
||||
- Here, we use a local array, ab, as temporary accumulator elements as
|
||||
we compute the a*b product. In an optimized micro-kernel, ab is held
|
||||
in registers rather than memory.
|
||||
- In column-major storage (or column storage), the "leading dimension"
|
||||
of a matrix is equivalent to its column stride, and the row stride is
|
||||
unit. In row-major storage (row storage), the "leading dimension" is
|
||||
equivalent to the row stride and the column stride is unit.
|
||||
- While all three loops are exposed in this template micro-kernel, the
|
||||
loops over MR and NR typically disappear in an optimized code because
|
||||
they are fully unrolled, leaving only the loop over k.
|
||||
- Some optimized micro-kernels will need the loop over k to be unrolled
|
||||
a few times (4x seems to be a common unrolling factor).
|
||||
- a_next and b_next can be used to perform prefetching, if prefetching
|
||||
is supported by the architecture. They may be safely ignored by the
|
||||
micro-kernel implementation, though.
|
||||
- If beta == 0.0 (or 0.0 + 0.0i for complex), then the micro-kernel
|
||||
should NOT use it explicitly, as C may contain uninitialized memory
|
||||
(including NaNs). This case should be detected and handled separately,
|
||||
preferably by simply overwriting C with the alpha*A*B product. An
|
||||
example of how to perform this "beta is zero" handling is included in
|
||||
this template implementation.
|
||||
|
||||
For more info, please refer to the BLIS website and/or contact the
|
||||
blis-devel mailing list.
|
||||
|
||||
-FGVZ
|
||||
*/
|
||||
const dim_t mr = bli_dmr;
|
||||
const dim_t nr = bli_dnr;
|
||||
|
||||
const inc_t cs_a = bli_dpackmr;
|
||||
|
||||
const inc_t rs_b = bli_dpacknr;
|
||||
|
||||
const inc_t rs_ab = 1;
|
||||
const inc_t cs_ab = bli_dmr;
|
||||
|
||||
dim_t l, j, i;
|
||||
|
||||
double ab[ bli_dmr *
|
||||
bli_dnr ];
|
||||
double* abij;
|
||||
double ai, bj;
|
||||
|
||||
|
||||
/* Initialize the accumulator elements in ab to zero. */
|
||||
for ( i = 0; i < mr * nr; ++i )
|
||||
{
|
||||
bli_dset0s( *(ab + i) );
|
||||
}
|
||||
|
||||
/* Perform a series of k rank-1 updates into ab. */
|
||||
for ( l = 0; l < k; ++l )
|
||||
{
|
||||
abij = ab;
|
||||
|
||||
/* In an optimized implementation, these two loops over MR and NR
|
||||
are typically fully unrolled. */
|
||||
for ( j = 0; j < nr; ++j )
|
||||
{
|
||||
bj = *(b + j);
|
||||
|
||||
for ( i = 0; i < mr; ++i )
|
||||
{
|
||||
ai = *(a + i);
|
||||
|
||||
bli_ddots( ai, bj, *abij );
|
||||
|
||||
abij += rs_ab;
|
||||
}
|
||||
}
|
||||
|
||||
a += cs_a;
|
||||
b += rs_b;
|
||||
}
|
||||
|
||||
/* Scale each element of ab by alpha. */
|
||||
for ( i = 0; i < mr * nr; ++i )
|
||||
{
|
||||
bli_dscals( *alpha, *(ab + i) );
|
||||
}
|
||||
|
||||
/* If beta is zero, overwrite c with the scaled result in ab. Otherwise,
|
||||
scale c by beta and then add the scaled result in ab. */
|
||||
if ( bli_deq0( *beta ) )
|
||||
{
|
||||
/* c := ab */
|
||||
bli_dcopys_mxn( mr,
|
||||
nr,
|
||||
ab, rs_ab, cs_ab,
|
||||
c, rs_c, cs_c );
|
||||
}
|
||||
else
|
||||
{
|
||||
/* c := beta * c + ab */
|
||||
bli_dxpbys_mxn( mr,
|
||||
nr,
|
||||
ab, rs_ab, cs_ab,
|
||||
beta,
|
||||
c, rs_c, cs_c );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cgemm_opt_mxn(
|
||||
dim_t k,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a_next,
|
||||
scomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cgemm_ref_mxn( k,
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c, rs_c, cs_c,
|
||||
a_next,
|
||||
b_next );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zgemm_opt_mxn(
|
||||
dim_t k,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a_next,
|
||||
dcomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_zgemm_ref_mxn( k,
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c, rs_c, cs_c,
|
||||
a_next,
|
||||
b_next );
|
||||
}
|
||||
|
||||
54
config/template/kernels/3/bli_gemm_opt_mxn.h
Normal file
54
config/template/kernels/3/bli_gemm_opt_mxn.h
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype micro-kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_opt_mxn )
|
||||
|
||||
303
config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c
Normal file
303
config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c
Normal file
@@ -0,0 +1,303 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
void bli_sgemmtrsm_l_opt_mxn(
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a10,
|
||||
float* restrict a11,
|
||||
float* restrict bd01,
|
||||
float* restrict bd11,
|
||||
float* restrict b11,
|
||||
float* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a_next,
|
||||
float* restrict b_next
|
||||
)
|
||||
{
|
||||
const inc_t rs_b = bli_spacknr;
|
||||
const inc_t cs_b = 1;
|
||||
|
||||
float* restrict minus_one = bli_sm1;
|
||||
|
||||
|
||||
bli_sgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a10,
|
||||
bd01,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
b_next );
|
||||
|
||||
bli_strsm_l_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dgemmtrsm_l_opt_mxn(
|
||||
dim_t k,
|
||||
double* restrict alpha,
|
||||
double* restrict a10,
|
||||
double* restrict a11,
|
||||
double* restrict bd01,
|
||||
double* restrict bd11,
|
||||
double* restrict b11,
|
||||
double* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a_next,
|
||||
double* restrict b_next
|
||||
)
|
||||
{
|
||||
/*
|
||||
Template gemmtrsm_l micro-kernel implementation
|
||||
|
||||
This function contains a template implementation for a double-precision
|
||||
real micro-kernel that fuses a gemm with a trsm_l subproblem.
|
||||
|
||||
This micro-kernel implements the following sequence of operations:
|
||||
|
||||
B11 := alpha * B11 - A10 * B01 (gemm)
|
||||
B11 := inv(A11) * B11 (trsm)
|
||||
|
||||
where B11 is MR x NR, A10 is MR x k, B01 is k x NR, A11 is MR x MR and
|
||||
lower triangular, and alpha is a scalar. Here, inv() denotes matrix
|
||||
inverse.
|
||||
|
||||
NOTE: Here, this gemmtrsm micro-kernel supports element "duplication", a
|
||||
feature that is enabled or disabled in bli_kernel.h. Duplication factors
|
||||
are also defined in the aforementioned header. Duplication is NOT
|
||||
commonly used and most developers may assume it is disabled.
|
||||
|
||||
Parameters:
|
||||
|
||||
- k: The number of columns of A10 and rows of B01.
|
||||
- alpha: The address of a scalar to be applied to B11.
|
||||
- a10: The address of A10, which is the MR x k subpartition of the
|
||||
packed (column-stored) micro-panel of A that is situated to the
|
||||
left of the MR x MR lower triangular block.
|
||||
- a11: The address of A11, which is the MR x MR lower triangular block
|
||||
within the packed micro-panel of A that is situated to the
|
||||
right of A10. By the time this gemmtrsm kernel is called, the
|
||||
diagonal of A11 has already been inverted and the strictly upper
|
||||
triangle contains zeros.
|
||||
- bd01: The address of B01, which is the k x NR subpartition situated
|
||||
above the current MR x NR block B11. bd01 is row-stored. If
|
||||
duplication is enabled, then each element occurs d times,
|
||||
effectively increasing the dimension to k x d*NR. If duplication
|
||||
is disabled, then bd01 is simply the address of the top part of
|
||||
the current packed (row-stored) micro-panel of B (labeled b01
|
||||
in the diagram below).
|
||||
- bd11: The address of B11, which is the MR x NR subpartition situated
|
||||
below B01. If duplication is enabled, then each element occurs
|
||||
d times, effectively increasing the dimension to MR x d*NR. If
|
||||
duplication is disabled, then bd11 is simply the address of the
|
||||
current MR x NR block witin the packed (row-stored) micro-panel
|
||||
of B.
|
||||
- b11: The address of the current MR x NR block within the packed
|
||||
micro-panel of B. It exists in duplicated form as bd11. If
|
||||
duplication is disabled, then b11 and bd11 refer to the same
|
||||
MR x NR block within the packed (row-stored) micro-panel of B.
|
||||
- c11: The address of C11, which is the MR x NR block of the output
|
||||
matrix (ie: the matrix provided by the user to the highest-level
|
||||
trsm API call). C11 corresponds to the elements that exist in
|
||||
packed form in B11, and is stored according to rs_c and cs_c.
|
||||
- rs_c: The row stride of C11 (ie: the distance to the next row of C11,
|
||||
in units of matrix elements).
|
||||
- cs_c: The column stride of C11 (ie: the distance to the next column of
|
||||
C11, in units of matrix elements).
|
||||
- a_next: The address of the packed micro-panel of A that will be used the
|
||||
next time the gemmtrsm micro-kernel will be called.
|
||||
- b_next: The address of the packed micro-panel of B that will be used the
|
||||
next time the gemmtrsm micro-kernel will be called.
|
||||
|
||||
The diagram below shows the packed micro-panel operands and how elements
|
||||
of each would be stored when MR == NR == 4. (The hex digits indicate the
|
||||
order of the elements in memory.) We also show a B duplication buffer (bd)
|
||||
that contains a copy of the packed micro-panel of B with a duplication
|
||||
factor of 2. If duplication is disabled (as is commonly the case), then
|
||||
bd01 == b01 and bd11 == b11.
|
||||
|
||||
NR 2*NR
|
||||
NOTE: If duplication is disabled _______ _______________
|
||||
then bd01 and bd11 simply refer b01:|0 1 2 3| bd01:|0 0 1 1 2 2 3 3|
|
||||
to b01 and b11, respectively. |4 5 6 7| |4 4 5 5 6 6 7 7|
|
||||
|8 9 A B| |8 8 9 9 A A B B|
|
||||
|C D E F| |C C D D E E F F|
|
||||
k | . | | . |
|
||||
| . | | . |
|
||||
a10: a11: | . | | . |
|
||||
___________________ _______ |_______| |_______________|
|
||||
|0 4 8 C |`. | b11:| | bd11:| |
|
||||
MR |1 5 9 D . . . | `. | | | | |
|
||||
|2 6 A E | `. | MR | | | |
|
||||
|3_7_B_F____________|______`.| |_______| |_______________|
|
||||
|
||||
k MR
|
||||
|
||||
Thus, with duplication enabled, the operation takes the form of:
|
||||
|
||||
b11 = alpha * b11 - a10 * bd01;
|
||||
b11 = inv(a11) * b11;
|
||||
bd11 = b11; (skipped if duplication is disabled)
|
||||
c11 = b11;
|
||||
|
||||
And if duplication is disabled, the operation reduces to:
|
||||
|
||||
b11 = alpha * b11 - a10 * b01; (Note: Here, b01 == bd01.)
|
||||
b11 = inv(a11) * b11;
|
||||
c11 = b11;
|
||||
|
||||
A note on optimization:
|
||||
- This implementation simply calls the gemm micro-kernel and then the
|
||||
trsm micro-kernel. Let's assume that the gemm micro-kernel has already
|
||||
been optimized. You have two options with regards to optimizing the
|
||||
fused gemmtrsm kernel.
|
||||
(1) Optimize only the trsm kernel and continue to call the gemm and
|
||||
trsm micro-kernels in sequence, as is done in this template
|
||||
implementation.
|
||||
(2) Fuse the implementation of the gemm micro-kernel with that of the
|
||||
trsm micro-kernel by inlining both into this gemmtrsm function.
|
||||
The latter option is more labor-intensive, but also more likely to
|
||||
yield higher performance because it allows you to eliminate redundant
|
||||
memory operations on the packed MR x NR block B11.
|
||||
|
||||
For more info, please refer to the BLIS website and/or contact the
|
||||
blis-devel mailing list.
|
||||
|
||||
-FGVZ
|
||||
*/
|
||||
const inc_t rs_b = bli_dpacknr;
|
||||
const inc_t cs_b = 1;
|
||||
|
||||
double* restrict minus_one = bli_dm1;
|
||||
|
||||
/* Reminder: if duplication is disabled, then bd01 == b01, bd11 == b11. */
|
||||
|
||||
/* b11 = alpha * b11 - a10 * bd01; */
|
||||
bli_dgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a10,
|
||||
bd01,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
b_next );
|
||||
|
||||
/* b11 = inv(a11) * b11;
|
||||
bd11 = b11; (skipped if duplication is disabled)
|
||||
c11 = b11; */
|
||||
bli_dtrsm_l_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cgemmtrsm_l_opt_mxn(
|
||||
dim_t k,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a10,
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict bd01,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a_next,
|
||||
scomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
const inc_t rs_b = bli_cpacknr;
|
||||
const inc_t cs_b = 1;
|
||||
|
||||
scomplex* restrict minus_one = bli_cm1;
|
||||
|
||||
|
||||
bli_cgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a10,
|
||||
bd01,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
b_next );
|
||||
|
||||
bli_ctrsm_l_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zgemmtrsm_l_opt_mxn(
|
||||
dim_t k,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a10,
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict bd01,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a_next,
|
||||
dcomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
const inc_t rs_b = bli_zpacknr;
|
||||
const inc_t cs_b = 1;
|
||||
|
||||
dcomplex* restrict minus_one = bli_zm1;
|
||||
|
||||
|
||||
bli_zgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a10,
|
||||
bd01,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
b_next );
|
||||
|
||||
bli_ztrsm_l_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
56
config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.h
Normal file
56
config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.h
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype micro-kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a10, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd01, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_mxn )
|
||||
|
||||
302
config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c
Normal file
302
config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c
Normal file
@@ -0,0 +1,302 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
void bli_sgemmtrsm_u_opt_mxn(
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a12,
|
||||
float* restrict a11,
|
||||
float* restrict bd21,
|
||||
float* restrict bd11,
|
||||
float* restrict b11,
|
||||
float* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a_next,
|
||||
float* restrict b_next
|
||||
)
|
||||
{
|
||||
const inc_t rs_b = bli_spacknr;
|
||||
const inc_t cs_b = 1;
|
||||
|
||||
float* restrict minus_one = bli_sm1;
|
||||
|
||||
|
||||
bli_sgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a12,
|
||||
bd21,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
b_next );
|
||||
|
||||
bli_strsm_u_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dgemmtrsm_u_opt_mxn(
|
||||
dim_t k,
|
||||
double* restrict alpha,
|
||||
double* restrict a12,
|
||||
double* restrict a11,
|
||||
double* restrict bd21,
|
||||
double* restrict bd11,
|
||||
double* restrict b11,
|
||||
double* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a_next,
|
||||
double* restrict b_next
|
||||
)
|
||||
{
|
||||
/*
|
||||
Template gemmtrsm_u micro-kernel implementation
|
||||
|
||||
This function contains a template implementation for a double-precision
|
||||
real micro-kernel that fuses a gemm with a trsm_u subproblem.
|
||||
|
||||
This micro-kernel implements the following sequence of operations:
|
||||
|
||||
B11 := alpha * B11 - A12 * B21 (gemm)
|
||||
B11 := inv(A11) * B11 (trsm)
|
||||
|
||||
where B11 is MR x NR, A12 is MR x k, B21 is k x NR, A11 is MR x MR and
|
||||
upper triangular, and alpha is a scalar. Here, inv() denotes matrix
|
||||
inverse.
|
||||
|
||||
NOTE: Here, this gemmtrsm micro-kernel supports element "duplication", a
|
||||
feature that is enabled or disabled in bli_kernel.h. Duplication factors
|
||||
are also defined in the aforementioned header. Duplication is NOT
|
||||
commonly used and most developers may assume it is disabled.
|
||||
|
||||
Parameters:
|
||||
|
||||
- k: The number of columns of A12 and rows of B21.
|
||||
- alpha: The address of a scalar to be applied to B11.
|
||||
- a12: The address of A12, which is the MR x k subpartition of the
|
||||
packed (column-stored) micro-panel of A that is situated to the
|
||||
right of the MR x MR upper triangular block.
|
||||
- a11: The address of A11, which is the MR x MR upper triangular block
|
||||
within the packed micro-panel of A that is situated to the
|
||||
left of A12. By the time this gemmtrsm kernel is called, the
|
||||
diagonal of A11 has already been inverted and the strictly lower
|
||||
triangle contains zeros.
|
||||
- bd21: The address of B21, which is the k x NR subpartition situated
|
||||
above the current MR x NR block B11. bd21 is row-stored. If
|
||||
duplication is enabled, then each element occurs d times,
|
||||
effectively increasing the dimension to k x d*NR. If duplication
|
||||
is disabled, then bd21 is simply the address of the top part of
|
||||
the current packed (row-stored) micro-panel of B (labeled b21
|
||||
in the diagram below).
|
||||
- bd11: The address of B11, which is the MR x NR subpartition situated
|
||||
above B21. If duplication is enabled, then each element occurs
|
||||
d times, effectively increasing the dimension to MR x d*NR. If
|
||||
duplication is disabled, then bd11 is simply the address of the
|
||||
current MR x NR block witin the packed (row-stored) micro-panel
|
||||
of B.
|
||||
- b11: The address of the current MR x NR block within the packed
|
||||
micro-panel of B. It exists in duplicated form as bd11. If
|
||||
duplication is disabled, then b11 and bd11 refer to the same
|
||||
MR x NR block within the packed (row-stored) micro-panel of B.
|
||||
- c11: The address of C11, which is the MR x NR block of the output
|
||||
matrix (ie: the matrix provided by the user to the highest-level
|
||||
trsm API call). C11 corresponds to the elements that exist in
|
||||
packed form in B11, and is stored according to rs_c and cs_c.
|
||||
- rs_c: The row stride of C11 (ie: the distance to the next row of C11,
|
||||
in units of matrix elements).
|
||||
- cs_c: The column stride of C11 (ie: the distance to the next column of
|
||||
C11, in units of matrix elements).
|
||||
- a_next: The address of the packed micro-panel of A that will be used the
|
||||
next time the gemmtrsm micro-kernel will be called.
|
||||
- b_next: The address of the packed micro-panel of B that will be used the
|
||||
next time the gemmtrsm micro-kernel will be called.
|
||||
|
||||
The diagram below shows the packed micro-panel operands and how elements
|
||||
of each would be stored when MR == NR == 4. (The hex digits indicate the
|
||||
order of the elements in memory.) We also show a B duplication buffer (bd)
|
||||
that contains a copy of the packed micro-panel of B with a duplication
|
||||
factor of 2. If duplication is disabled (as is commonly the case), then
|
||||
bd01 == b01 and bd11 == b11.
|
||||
|
||||
a11: a12: NR 2*NR
|
||||
________ ___________________ _______ _______________
|
||||
|`. |0 4 8 | b11:|0 1 2 3| bd11:|0 0 1 1 2 2 3 3|
|
||||
MR | `. |1 5 9 . . . | |4 5 6 7| |4 4 5 5 6 6 7 7|
|
||||
| `. |2 6 A | MR |8 9 A B| |8 8 9 9 A A B B|
|
||||
|______`.|3_7_B______________| |___.___| |_______._______|
|
||||
b21:| . | bd21:| . |
|
||||
MR k | . | | . |
|
||||
| | | |
|
||||
NOTE: If duplication is disabled | | | |
|
||||
then bd21 and bd11 simply refer k | | | |
|
||||
to b21 and b11, respectively. | | | |
|
||||
ALSO: Storage digits are shown | | | |
|
||||
starting with a12 to avoid |_______| |_______________|
|
||||
obscuring triangular structure of
|
||||
a11.
|
||||
|
||||
Thus, with duplication enabled, the operation takes the form of:
|
||||
|
||||
b11 = alpha * b11 - a12 * bd21;
|
||||
b11 = inv(a11) * b11;
|
||||
bd11 = b11; (skipped if duplication is disabled)
|
||||
c11 = b11;
|
||||
|
||||
And if duplication is disabled, the operation reduces to:
|
||||
|
||||
b11 = alpha * b11 - a12 * b21; (Note: Here, b21 == bd21.)
|
||||
b11 = inv(a11) * b11;
|
||||
c11 = b11;
|
||||
|
||||
A note on optimization:
|
||||
- This implementation simply calls the gemm micro-kernel and then the
|
||||
trsm micro-kernel. Let's assume that the gemm micro-kernel has already
|
||||
been optimized. You have two options with regards to optimizing the
|
||||
fused gemmtrsm kernel.
|
||||
(1) Optimize only the trsm kernel and continue to call the gemm and
|
||||
trsm micro-kernels in sequence, as is done in this template
|
||||
implementation.
|
||||
(2) Fuse the implementation of the gemm micro-kernel with that of the
|
||||
trsm micro-kernel by inlining both into this gemmtrsm function.
|
||||
The latter option is more labor-intensive, but also more likely to
|
||||
yield higher performance because it allows you to eliminate redundant
|
||||
memory operations on the packed MR x NR block B11.
|
||||
|
||||
For more info, please refer to the BLIS website and/or contact the
|
||||
blis-devel mailing list.
|
||||
|
||||
*/
|
||||
const inc_t rs_b = bli_dpacknr;
|
||||
const inc_t cs_b = 1;
|
||||
|
||||
double* restrict minus_one = bli_dm1;
|
||||
|
||||
/* Reminder: if duplication is disabled, then bd21 == b21, bd11 == b11. */
|
||||
|
||||
/* b11 = alpha * b11 - a12 * bd21; */
|
||||
bli_dgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a12,
|
||||
bd21,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
b_next );
|
||||
|
||||
/* b11 = inv(a11) * b11;
|
||||
bd11 = b11; (skipped if duplication is disabled)
|
||||
c11 = b11; */
|
||||
bli_dtrsm_u_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cgemmtrsm_u_opt_mxn(
|
||||
dim_t k,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a12,
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict bd21,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a_next,
|
||||
scomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
const inc_t rs_b = bli_cpacknr;
|
||||
const inc_t cs_b = 1;
|
||||
|
||||
scomplex* restrict minus_one = bli_cm1;
|
||||
|
||||
|
||||
bli_cgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a12,
|
||||
bd21,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
b_next );
|
||||
|
||||
bli_ctrsm_u_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zgemmtrsm_u_opt_mxn(
|
||||
dim_t k,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a12,
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict bd21,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a_next,
|
||||
dcomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
const inc_t rs_b = bli_zpacknr;
|
||||
const inc_t cs_b = 1;
|
||||
|
||||
dcomplex* restrict minus_one = bli_zm1;
|
||||
|
||||
|
||||
bli_zgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a12,
|
||||
bd21,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
b_next );
|
||||
|
||||
bli_ztrsm_u_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
56
config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.h
Normal file
56
config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.h
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype micro-kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a12, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd21, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_u_opt_mxn )
|
||||
|
||||
218
config/template/kernels/3/bli_trsm_l_opt_mxn.c
Normal file
218
config/template/kernels/3/bli_trsm_l_opt_mxn.c
Normal file
@@ -0,0 +1,218 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
/*
  Single-precision real trsm_l micro-kernel.

  No template implementation is provided for this datatype; all operands
  are forwarded, unchanged and in the same order, to the reference
  micro-kernel bli_strsm_l_ref_mxn().
*/
void bli_strsm_l_opt_mxn(
                          float* restrict a,
                          float* restrict b,
                          float* restrict bd,
                          float* restrict c, inc_t rs_c, inc_t cs_c
                        )
{
	bli_strsm_l_ref_mxn( a, b, bd, c, rs_c, cs_c );
}
|
||||
|
||||
|
||||
|
||||
void bli_dtrsm_l_opt_mxn(
                          double* restrict a,
                          double* restrict b,
                          double* restrict bd,
                          double* restrict c, inc_t rs_c, inc_t cs_c
                        )
{
/*
  Template trsm_l micro-kernel implementation

  This function contains a template implementation for a double-precision
  real trsm micro-kernel, coded in C, which can serve as the starting point
  for one to write an optimized micro-kernel on an arbitrary architecture.
  (We show a template implementation for only double-precision real because
  the templates for the other three floating-point types would be nearly
  identical.)

  This micro-kernel performs a triangular solve with NR right-hand sides:

    C := inv(A) * B

  where A is MR x MR and lower triangular, B is MR x NR, and C is MR x NR.
  The solution overwrites B in place and is also copied to C.

  NOTE: The trsm micro-kernel interface accommodates element "duplication",
  a feature that is enabled or disabled in bli_kernel.h. Duplication factors
  are also defined in the aforementioned header. Duplication is NOT
  commonly used and most developers may assume it is disabled. This template
  implementation assumes it is disabled and therefore never accesses bd.

  Parameters:

  - a:      The address of A, which is the MR x MR lower triangular block
            within the packed (column-stored) micro-panel of A. By the time
            this trsm micro-kernel is called, the diagonal of A has already
            been inverted and the strictly upper triangle contains zeros.
  - b:      The address of B, which is the MR x NR subpartition of the
            current packed (row-stored) micro-panel of B.
  - bd:     The address of the duplicated copy of B. If duplication is
            disabled, then bd == b.
  - c:      The address of C, which is the MR x NR block of the output
            matrix (i.e., the matrix provided by the user to the
            highest-level trsm API call). C corresponds to the elements that
            exist in packed form in B, and is stored according to rs_c and
            cs_c.
  - rs_c:   The row stride of C (i.e., the distance to the next row of C11,
            in units of matrix elements).
  - cs_c:   The column stride of C (i.e., the distance to the next column of
            C11, in units of matrix elements).

  Please see the comments in bli_gemmtrsm_l_opt_mxn.c for a diagram of the
  trsm operation and where it fits in with the preceding gemm subproblem.

  Here are a few things to consider:
  - While all three loops are exposed in this template micro-kernel, all
    three loops typically disappear in an optimized code because they are
    fully unrolled.
  - Note that the diagonal of the triangular matrix A contains the INVERSE
    of those elements. This is done during packing so that we can avoid
    expensive division instructions within this micro-kernel.
  - This micro-kernel assumes duplication is NOT enabled. If it IS enabled,
    then the result must be written to three places: the sub-block within the
    duplicated copy of B, the sub-block of the original packed micro-panel of
    B, and the sub-block of the output matrix C. When duplication is not
    used, the micro-kernel should update only the latter two locations.
  - The local working pointers below are intentionally NOT declared with
    restrict. Row i of B is written through chi11 and then read back through
    chi01 on later iterations, so qualifying both with restrict would break
    the aliasing guarantee that restrict makes to the compiler. The
    non-aliasing of A, B, and C is still conveyed by the restrict-qualified
    parameters.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
	const dim_t        m     = bli_dmr;
	const dim_t        n     = bli_dnr;

	/* A is column-stored within its packed micro-panel. */
	const inc_t        rs_a  = 1;
	const inc_t        cs_a  = bli_dpackmr;

	/* B is row-stored within its packed micro-panel. */
	const inc_t        rs_b  = bli_dpacknr;
	const inc_t        cs_b  = 1;

	dim_t              iter, i, j, l;
	dim_t              n_behind;

	double*            alpha11;
	double*            a10t;
	double*            alpha10;
	double*            X0;
	double*            x1;
	double*            x01;
	double*            chi01;
	double*            chi11;
	double*            gamma11;
	double             rho11;

	/* Forward substitution: rows of B are solved from top to bottom. */
	for ( iter = 0; iter < m; ++iter )
	{
		i        = iter;
		n_behind = i;
		alpha11  = a + (i  )*rs_a + (i  )*cs_a;
		a10t     = a + (i  )*rs_a + (0  )*cs_a;
		X0       = b + (0  )*rs_b + (0  )*cs_b;
		x1       = b + (i  )*rs_b + (0  )*cs_b;

		/* x1 = x1 - a10t * X0; */
		/* x1 = x1 / alpha11; */
		for ( j = 0; j < n; ++j )
		{
			x01     = X0 + (0  )*rs_b + (j  )*cs_b;
			chi11   = x1 + (0  )*rs_b + (j  )*cs_b;
			gamma11 = c  + (i  )*rs_c + (j  )*cs_c;

			/* chi11 = chi11 - a10t * x01; */
			bli_dset0s( rho11 );
			for ( l = 0; l < n_behind; ++l )
			{
				alpha10 = a10t + (l  )*cs_a;
				chi01   = x01  + (l  )*rs_b;

				bli_daxpys( *alpha10, *chi01, rho11 );
			}
			bli_dsubs( rho11, *chi11 );

			/* chi11 = chi11 / alpha11; */
			/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
			   of alpha11, so we can multiply rather than divide. We store
			   the inverse of alpha11 intentionally to avoid expensive
			   division instructions within the micro-kernel. */
			bli_dscals( *alpha11, *chi11 );

			/* Output final result to matrix C. */
			bli_dcopys( *chi11, *gamma11 );
		}
	}
}
|
||||
|
||||
|
||||
|
||||
void bli_ctrsm_l_opt_mxn(
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict bd,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ctrsm_l_ref_mxn( a,
|
||||
b,
|
||||
bd,
|
||||
c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ztrsm_l_opt_mxn(
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict bd,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ztrsm_l_ref_mxn( a,
|
||||
b,
|
||||
bd,
|
||||
c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
50
config/template/kernels/3/bli_trsm_l_opt_mxn.h
Normal file
50
config/template/kernels/3/bli_trsm_l_opt_mxn.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype micro-kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_l_opt_mxn )
|
||||
|
||||
218
config/template/kernels/3/bli_trsm_u_opt_mxn.c
Normal file
218
config/template/kernels/3/bli_trsm_u_opt_mxn.c
Normal file
@@ -0,0 +1,218 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
/*
  Single-precision real trsm_u micro-kernel.

  No template implementation is provided for this datatype; all operands
  are forwarded, unchanged and in the same order, to the reference
  micro-kernel bli_strsm_u_ref_mxn().
*/
void bli_strsm_u_opt_mxn(
                          float* restrict a,
                          float* restrict b,
                          float* restrict bd,
                          float* restrict c, inc_t rs_c, inc_t cs_c
                        )
{
	bli_strsm_u_ref_mxn( a, b, bd, c, rs_c, cs_c );
}
|
||||
|
||||
|
||||
|
||||
void bli_dtrsm_u_opt_mxn(
                          double* restrict a,
                          double* restrict b,
                          double* restrict bd,
                          double* restrict c, inc_t rs_c, inc_t cs_c
                        )
{
/*
  Template trsm_u micro-kernel implementation

  This function contains a template implementation for a double-precision
  real trsm micro-kernel, coded in C, which can serve as the starting point
  for one to write an optimized micro-kernel on an arbitrary architecture.
  (We show a template implementation for only double-precision real because
  the templates for the other three floating-point types would be nearly
  identical.)

  This micro-kernel performs a triangular solve with NR right-hand sides:

    C := inv(A) * B

  where A is MR x MR and upper triangular, B is MR x NR, and C is MR x NR.
  The solution overwrites B in place and is also copied to C.

  NOTE: The trsm micro-kernel interface accommodates element "duplication",
  a feature that is enabled or disabled in bli_kernel.h. Duplication factors
  are also defined in the aforementioned header. Duplication is NOT
  commonly used and most developers may assume it is disabled. This template
  implementation assumes it is disabled and therefore never accesses bd.

  Parameters:

  - a:      The address of A, which is the MR x MR upper triangular block
            within the packed (column-stored) micro-panel of A. By the time
            this trsm micro-kernel is called, the diagonal of A has already
            been inverted and the strictly lower triangle contains zeros.
  - b:      The address of B, which is the MR x NR subpartition of the
            current packed (row-stored) micro-panel of B.
  - bd:     The address of the duplicated copy of B. If duplication is
            disabled, then bd == b.
  - c:      The address of C, which is the MR x NR block of the output
            matrix (i.e., the matrix provided by the user to the
            highest-level trsm API call). C corresponds to the elements that
            exist in packed form in B, and is stored according to rs_c and
            cs_c.
  - rs_c:   The row stride of C (i.e., the distance to the next row of C11,
            in units of matrix elements).
  - cs_c:   The column stride of C (i.e., the distance to the next column of
            C11, in units of matrix elements).

  Please see the comments in bli_gemmtrsm_u_opt_mxn.c for a diagram of the
  trsm operation and where it fits in with the preceding gemm subproblem.

  Here are a few things to consider:
  - While all three loops are exposed in this template micro-kernel, all
    three loops typically disappear in an optimized code because they are
    fully unrolled.
  - Note that the diagonal of the triangular matrix A contains the INVERSE
    of those elements. This is done during packing so that we can avoid
    expensive division instructions within this micro-kernel.
  - This micro-kernel assumes duplication is NOT enabled. If it IS enabled,
    then the result must be written to three places: the sub-block within the
    duplicated copy of B, the sub-block of the original packed micro-panel of
    B, and the sub-block of the output matrix C. When duplication is not
    used, the micro-kernel should update only the latter two locations.
  - The local working pointers below are intentionally NOT declared with
    restrict. Row i of B is written through chi11 and then read back through
    chi21 on later iterations, so qualifying both with restrict would break
    the aliasing guarantee that restrict makes to the compiler. The
    non-aliasing of A, B, and C is still conveyed by the restrict-qualified
    parameters.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
	const dim_t        m     = bli_dmr;
	const dim_t        n     = bli_dnr;

	/* A is column-stored within its packed micro-panel. */
	const inc_t        rs_a  = 1;
	const inc_t        cs_a  = bli_dpackmr;

	/* B is row-stored within its packed micro-panel. */
	const inc_t        rs_b  = bli_dpacknr;
	const inc_t        cs_b  = 1;

	dim_t              iter, i, j, l;
	dim_t              n_behind;

	double*            alpha11;
	double*            a12t;
	double*            alpha12;
	double*            X2;
	double*            x1;
	double*            x21;
	double*            chi21;
	double*            chi11;
	double*            gamma11;
	double             rho11;

	/* Backward substitution: rows of B are solved from bottom to top. */
	for ( iter = 0; iter < m; ++iter )
	{
		i        = m - iter - 1;
		n_behind = iter;
		alpha11  = a + (i  )*rs_a + (i  )*cs_a;
		/* On the first iteration (i == m-1) a12t and X2 refer to row/column
		   index m; they are never dereferenced then because n_behind == 0. */
		a12t     = a + (i  )*rs_a + (i+1)*cs_a;
		x1       = b + (i  )*rs_b + (0  )*cs_b;
		X2       = b + (i+1)*rs_b + (0  )*cs_b;

		/* x1 = x1 - a12t * X2; */
		/* x1 = x1 / alpha11; */
		for ( j = 0; j < n; ++j )
		{
			chi11   = x1 + (0  )*rs_b + (j  )*cs_b;
			x21     = X2 + (0  )*rs_b + (j  )*cs_b;
			gamma11 = c  + (i  )*rs_c + (j  )*cs_c;

			/* chi11 = chi11 - a12t * x21; */
			bli_dset0s( rho11 );
			for ( l = 0; l < n_behind; ++l )
			{
				alpha12 = a12t + (l  )*cs_a;
				chi21   = x21  + (l  )*rs_b;

				bli_daxpys( *alpha12, *chi21, rho11 );
			}
			bli_dsubs( rho11, *chi11 );

			/* chi11 = chi11 / alpha11; */
			/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
			   of alpha11, so we can multiply rather than divide. We store
			   the inverse of alpha11 intentionally to avoid expensive
			   division instructions within the micro-kernel. */
			bli_dscals( *alpha11, *chi11 );

			/* Output final result to matrix C. */
			bli_dcopys( *chi11, *gamma11 );
		}
	}
}
|
||||
|
||||
|
||||
|
||||
void bli_ctrsm_u_opt_mxn(
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict bd,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ctrsm_u_ref_mxn( a,
|
||||
b,
|
||||
bd,
|
||||
c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ztrsm_u_opt_mxn(
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict bd,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ztrsm_u_ref_mxn( a,
|
||||
b,
|
||||
bd,
|
||||
c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
50
config/template/kernels/3/bli_trsm_u_opt_mxn.h
Normal file
50
config/template/kernels/3/bli_trsm_u_opt_mxn.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype micro-kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_u_opt_mxn )
|
||||
|
||||
107
config/template/make_defs.mk
Normal file
107
config/template/make_defs.mk
Normal file
@@ -0,0 +1,107 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2013, The University of Texas
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name of The University of Texas nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
# Include guard: everything below is processed only on the first inclusion.
ifndef MAKE_DEFS_MK_INCLUDED
MAKE_DEFS_MK_INCLUDED := yes



#
# --- Build definitions --------------------------------------------------------
#

# Settings that mirror configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD        := yes
BLIS_ENABLE_DYNAMIC_BUILD       := no



#
# --- Utility program definitions ----------------------------------------------
#

SH      := /bin/sh
MV      := mv
MKDIR   := mkdir -p
RM_F    := rm -f
RM_RF   := rm -rf
SYMLINK := ln -sf
FIND    := find
XARGS   := xargs
RANLIB  := ranlib
INSTALL := install -c

# Used to refresh CHANGELOG.
GIT     := git
GIT_LOG := $(GIT) log --decorate



#
# --- Development tools definitions --------------------------------------------
#

# --- C compiler and its flag components ---
CC          := gcc
# Request the POSIX.1-2001 (IEEE Std 1003.1-2001) feature set.
# NOTE: This is needed so that posix_memalign() is declared.
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS  := -std=c99 # -fopenmp -pg
CDBGFLAGS   := -g
CWARNFLAGS  := -Wall
COPTFLAGS   := -O2
# Kernel sources use the same optimization level as the rest of the library.
CKOPTFLAGS  := $(COPTFLAGS)
# No vector flags by default; candidate flags are left commented out.
CVECFLAGS   := #-msse3 -march=native # -mfpmath=sse

# Combine the components into the flag sets used by the build: one for
# standard compilation, one for kernels (CKOPTFLAGS in place of COPTFLAGS),
# and one that omits the optimization and vector flags.
CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)

# --- Archiver and its flags ---
AR      := ar
ARFLAGS := cru

# --- Linker and its flags ---
LINKER  := $(CC)
LDFLAGS :=



# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
endif
|
||||
@@ -34,75 +34,9 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T axpy2v_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,axpy2v_unb_var1);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,axpy2v_unb_var1);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,axpy2v_unb_var1);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_axpy2v_unb_var1( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
conj_t conjy = bli_obj_conj_status( *y );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
num_t dt_alpha;
|
||||
void* buf_alpha;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// If alpha is a scalar constant, use dt_x to extract the address of the
|
||||
// corresponding constant value; otherwise, use the datatype encoded
|
||||
// within the alpha object and extract the buffer at the alpha offset.
|
||||
bli_set_scalar_dt_buffer( alpha, dt_x, dt_alpha, buf_alpha );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_alpha][dt_x][dt_y];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
conjy,
|
||||
n,
|
||||
buf_alpha,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, opname, varname ) \
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chz,varname)( \
|
||||
conj_t conjx, \
|
||||
@@ -121,27 +55,27 @@ void PASTEMAC3(chx,chy,chz,varname)( \
|
||||
ctype_y* y_cast = y; \
|
||||
ctype_z* z_cast = z; \
|
||||
\
|
||||
PASTEMAC3(chxy,chx,chz,axpyv)( conjx, \
|
||||
n, \
|
||||
alpha1_cast, \
|
||||
x_cast, incx, \
|
||||
z_cast, incz ); \
|
||||
PASTEMAC3(chxy,chy,chz,axpyv)( conjy, \
|
||||
n, \
|
||||
alpha2_cast, \
|
||||
y_cast, incy, \
|
||||
z_cast, incz ); \
|
||||
PASTEMAC3(chxy,chx,chz,kername)( conjx, \
|
||||
n, \
|
||||
alpha1_cast, \
|
||||
x_cast, incx, \
|
||||
z_cast, incz ); \
|
||||
PASTEMAC3(chxy,chy,chz,kername)( conjy, \
|
||||
n, \
|
||||
alpha2_cast, \
|
||||
y_cast, incy, \
|
||||
z_cast, incz ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3U12_BASIC( axpy2v, axpy2v_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_BASIC( axpy2v_unb_var1, AXPYV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( axpy2v, axpy2v_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_D( axpy2v_unb_var1, AXPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( axpy2v, axpy2v_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_P( axpy2v_unb_var1, AXPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -32,12 +32,6 @@
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_axpy2v_unb_var1( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_x, ctype_y, ctype_z, chx, chy, chz, varname ) \
|
||||
|
||||
@@ -45,7 +45,7 @@ void PASTEMAC(ch,opname)( \
|
||||
conj_t conja, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t b_n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t inca, inc_t lda, \
|
||||
ctype* x, inc_t incx, \
|
||||
@@ -55,7 +55,7 @@ void PASTEMAC(ch,opname)( \
|
||||
PASTEMAC3(ch,ch,ch,varname)( conja, \
|
||||
conjx, \
|
||||
m, \
|
||||
n, \
|
||||
b_n, \
|
||||
alpha, \
|
||||
a, inca, lda, \
|
||||
x, incx, \
|
||||
@@ -75,7 +75,7 @@ void PASTEMAC3(cha,chx,chy,opname)( \
|
||||
conj_t conja, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t b_n, \
|
||||
ctype_ax* alpha, \
|
||||
ctype_a* a, inc_t inca, inc_t lda, \
|
||||
ctype_x* x, inc_t incx, \
|
||||
@@ -85,7 +85,7 @@ void PASTEMAC3(cha,chx,chy,opname)( \
|
||||
PASTEMAC3(cha,chx,chy,varname)( conja, \
|
||||
conjx, \
|
||||
m, \
|
||||
n, \
|
||||
b_n, \
|
||||
alpha, \
|
||||
a, inca, lda, \
|
||||
x, incx, \
|
||||
|
||||
@@ -35,24 +35,6 @@
|
||||
#include "bli_axpyf_unb_var1.h"
|
||||
|
||||
|
||||
//
|
||||
// Define fusing factors (if they are not already defined by the user
|
||||
// in bli_kernel.h).
|
||||
//
|
||||
#ifndef bli_saxpyf_fuse_fac
|
||||
#define bli_saxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_S
|
||||
#endif
|
||||
#ifndef bli_daxpyf_fuse_fac
|
||||
#define bli_daxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_D
|
||||
#endif
|
||||
#ifndef bli_caxpyf_fuse_fac
|
||||
#define bli_caxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_C
|
||||
#endif
|
||||
#ifndef bli_zaxpyf_fuse_fac
|
||||
#define bli_zaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_Z
|
||||
#endif
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
@@ -63,7 +45,7 @@ void PASTEMAC(ch,opname)( \
|
||||
conj_t conja, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t b_n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t inca, inc_t lda, \
|
||||
ctype* x, inc_t incx, \
|
||||
@@ -83,7 +65,7 @@ void PASTEMAC3(cha,chx,chy,opname)( \
|
||||
conj_t conja, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t b_n, \
|
||||
ctype_ax* alpha, \
|
||||
ctype_a* a, inc_t inca, inc_t lda, \
|
||||
ctype_x* x, inc_t incx, \
|
||||
|
||||
@@ -34,71 +34,9 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T axpyf_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyf_unb_var1);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyf_unb_var1);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyf_unb_var1);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_axpyf_unb_var1( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
num_t dt_alpha;
|
||||
void* buf_alpha;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// If alpha is a scalar constant, use dt_x to extract the address of the
|
||||
// corresponding constant value; otherwise, use the datatype encoded
|
||||
// within the alpha object and extract the buffer at the alpha offset.
|
||||
bli_set_scalar_dt_buffer( alpha, dt_x, dt_alpha, buf_alpha );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_alpha][dt_x][dt_y];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
n,
|
||||
buf_alpha,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
*/
|
||||
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, opname, varname ) \
|
||||
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conja, \
|
||||
@@ -130,23 +68,23 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
PASTEMAC2(chx,chax,copycjs)( conjx, *chi1, alpha_chi1 ); \
|
||||
PASTEMAC2(chax,chax,scals)( *alpha_cast, alpha_chi1 ); \
|
||||
\
|
||||
PASTEMAC3(chax,cha,chy,axpyv)( conja, \
|
||||
m, \
|
||||
&alpha_chi1, \
|
||||
a1, inca, \
|
||||
y1, incy ); \
|
||||
PASTEMAC3(chax,cha,chy,kername)( conja, \
|
||||
m, \
|
||||
&alpha_chi1, \
|
||||
a1, inca, \
|
||||
y1, incy ); \
|
||||
} \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3U12_BASIC( axpyf, axpyf_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_BASIC( axpyf_unb_var1, AXPYV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( axpyf, axpyf_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_D( axpyf_unb_var1, AXPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( axpyf, axpyf_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_P( axpyf_unb_var1, AXPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -32,12 +32,6 @@
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_axpyf_unb_var1( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
|
||||
@@ -46,7 +40,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conja, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t b_n, \
|
||||
void* alpha, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* x, inc_t incx, \
|
||||
|
||||
@@ -45,7 +45,7 @@ void PASTEMAC(ch,opname)( \
|
||||
conj_t conjxt, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
dim_t m, \
|
||||
ctype* alpha, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype* y, inc_t incy, \
|
||||
@@ -56,7 +56,7 @@ void PASTEMAC(ch,opname)( \
|
||||
PASTEMAC3(ch,ch,ch,varname)( conjxt, \
|
||||
conjx, \
|
||||
conjy, \
|
||||
n, \
|
||||
m, \
|
||||
alpha, \
|
||||
x, incx, \
|
||||
y, incy, \
|
||||
@@ -77,8 +77,8 @@ void PASTEMAC3(chx,chy,chz,opname)( \
|
||||
conj_t conjxt, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_xy* alpha, \
|
||||
dim_t m, \
|
||||
ctype_x* alpha, \
|
||||
ctype_x* x, inc_t incx, \
|
||||
ctype_y* y, inc_t incy, \
|
||||
ctype_xy* rho, \
|
||||
@@ -88,7 +88,7 @@ void PASTEMAC3(chx,chy,chz,opname)( \
|
||||
PASTEMAC3(chx,chy,chz,varname)( conjxt, \
|
||||
conjx, \
|
||||
conjy, \
|
||||
n, \
|
||||
m, \
|
||||
alpha, \
|
||||
x, incx, \
|
||||
y, incy, \
|
||||
|
||||
@@ -45,7 +45,7 @@ void PASTEMAC(ch,opname)( \
|
||||
conj_t conjxt, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
dim_t m, \
|
||||
ctype* alpha, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype* y, inc_t incy, \
|
||||
@@ -66,8 +66,8 @@ void PASTEMAC3(chx,chy,chz,opname)( \
|
||||
conj_t conjxt, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_xy* alpha, \
|
||||
dim_t m, \
|
||||
ctype_x* alpha, \
|
||||
ctype_x* x, inc_t incx, \
|
||||
ctype_y* y, inc_t incy, \
|
||||
ctype_xy* rho, \
|
||||
|
||||
@@ -36,13 +36,13 @@
|
||||
|
||||
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, opname, varname ) \
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, dotxvker, axpyvker ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chz,varname)( \
|
||||
conj_t conjxt, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
dim_t m, \
|
||||
void* alpha, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy, \
|
||||
@@ -52,36 +52,36 @@ void PASTEMAC3(chx,chy,chz,varname)( \
|
||||
{ \
|
||||
ctype_xy* one = PASTEMAC(chxy,1); \
|
||||
ctype_xy* zero = PASTEMAC(chxy,0); \
|
||||
ctype_xy* alpha_cast = alpha; \
|
||||
ctype_x* alpha_cast = alpha; \
|
||||
ctype_x* x_cast = x; \
|
||||
ctype_y* y_cast = y; \
|
||||
ctype_xy* rho_cast = rho; \
|
||||
ctype_z* z_cast = z; \
|
||||
\
|
||||
PASTEMAC3(chx,chy,chxy,dotxv)( conjxt, \
|
||||
conjy, \
|
||||
n, \
|
||||
one, \
|
||||
x_cast, incx, \
|
||||
y_cast, incy, \
|
||||
zero, \
|
||||
rho_cast ); \
|
||||
PASTEMAC3(chxy,chx,chz,axpyv)( conjx, \
|
||||
n, \
|
||||
alpha_cast, \
|
||||
x_cast, incx, \
|
||||
z_cast, incz ); \
|
||||
PASTEMAC3(chx,chy,chxy,dotxvker)( conjxt, \
|
||||
conjy, \
|
||||
m, \
|
||||
one, \
|
||||
x_cast, incx, \
|
||||
y_cast, incy, \
|
||||
zero, \
|
||||
rho_cast ); \
|
||||
PASTEMAC3(chx,chx,chz,axpyvker)( conjx, \
|
||||
m, \
|
||||
alpha_cast, \
|
||||
x_cast, incx, \
|
||||
z_cast, incz ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3U12_BASIC( dotaxpyv, dotaxpyv_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_BASIC2( dotaxpyv_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotaxpyv, dotaxpyv_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_D2( dotaxpyv_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotaxpyv, dotaxpyv_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_P2( dotaxpyv_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -40,7 +40,7 @@ void PASTEMAC3(chx,chy,chz,varname)( \
|
||||
conj_t conjxt, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
dim_t m, \
|
||||
void* alpha, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy, \
|
||||
|
||||
@@ -35,24 +35,6 @@
|
||||
#include "bli_dotxaxpyf_unb_var1.h"
|
||||
|
||||
|
||||
//
|
||||
// Define fusing factors (if they are not already defined by the user
|
||||
// in bli_kernel.h).
|
||||
//
|
||||
#ifndef bli_sdotxaxpyf_fuse_fac
|
||||
#define bli_sdotxaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_S
|
||||
#endif
|
||||
#ifndef bli_ddotxaxpyf_fuse_fac
|
||||
#define bli_ddotxaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_D
|
||||
#endif
|
||||
#ifndef bli_cdotxaxpyf_fuse_fac
|
||||
#define bli_cdotxaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_C
|
||||
#endif
|
||||
#ifndef bli_zdotxaxpyf_fuse_fac
|
||||
#define bli_zdotxaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_Z
|
||||
#endif
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
|
||||
@@ -36,7 +36,7 @@
|
||||
|
||||
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, opname, varname ) \
|
||||
#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname, dotxvker, axpyvker ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chb,chc,varname)( \
|
||||
conj_t conjat, \
|
||||
@@ -107,13 +107,13 @@ void PASTEMAC3(cha,chb,chc,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3U12_BASIC( dotxaxpyf, dotxaxpyf_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_BASIC2( dotxaxpyf_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotxaxpyf, dotxaxpyf_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_D2( dotxaxpyf_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotxaxpyf, dotxaxpyf_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_P2( dotxaxpyf_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -42,26 +42,26 @@
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t b_n, \
|
||||
ctype* alpha, \
|
||||
ctype* x, inc_t incx, inc_t ldx, \
|
||||
ctype* y, inc_t incy, \
|
||||
ctype* a, inc_t inca, inc_t lda, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype* beta, \
|
||||
ctype* r, inc_t incr \
|
||||
ctype* y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,varname)( conjx, \
|
||||
conjy, \
|
||||
PASTEMAC3(ch,ch,ch,varname)( conjat, \
|
||||
conjx, \
|
||||
m, \
|
||||
n, \
|
||||
b_n, \
|
||||
alpha, \
|
||||
x, incx, ldx, \
|
||||
y, incy, \
|
||||
a, inca, lda, \
|
||||
x, incx, \
|
||||
beta, \
|
||||
r, incr ); \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( dotxf, DOTXF_KERNEL )
|
||||
@@ -71,29 +71,29 @@ INSERT_GENTFUNC_BASIC( dotxf, DOTXF_KERNEL )
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname, varname ) \
|
||||
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,opname)( \
|
||||
void PASTEMAC3(cha,chx,chy,opname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype_xy* alpha, \
|
||||
ctype_x* x, inc_t incx, inc_t ldx, \
|
||||
ctype_y* y, inc_t incy, \
|
||||
ctype_r* beta, \
|
||||
ctype_r* r, inc_t incr \
|
||||
dim_t b_n, \
|
||||
ctype_ax* alpha, \
|
||||
ctype_a* a, inc_t inca, inc_t lda, \
|
||||
ctype_x* x, inc_t incx, \
|
||||
ctype_y* beta, \
|
||||
ctype_y* y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC3(chx,chy,chr,varname)( conjx, \
|
||||
conjy, \
|
||||
PASTEMAC3(cha,chx,chy,varname)( conjat, \
|
||||
conjx, \
|
||||
m, \
|
||||
n, \
|
||||
b_n, \
|
||||
alpha, \
|
||||
x, incx, ldx, \
|
||||
y, incy, \
|
||||
a, inca, lda, \
|
||||
x, incx, \
|
||||
beta, \
|
||||
r, incr ); \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
|
||||
@@ -35,24 +35,6 @@
|
||||
#include "bli_dotxf_unb_var1.h"
|
||||
|
||||
|
||||
//
|
||||
// Define fusing factors (if they are not already defined by the user
|
||||
// in bli_kernel.h).
|
||||
//
|
||||
#ifndef bli_sdotxf_fuse_fac
|
||||
#define bli_sdotxf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_S
|
||||
#endif
|
||||
#ifndef bli_ddotxf_fuse_fac
|
||||
#define bli_ddotxf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_D
|
||||
#endif
|
||||
#ifndef bli_cdotxf_fuse_fac
|
||||
#define bli_cdotxf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_C
|
||||
#endif
|
||||
#ifndef bli_zdotxf_fuse_fac
|
||||
#define bli_zdotxf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_Z
|
||||
#endif
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
@@ -60,15 +42,15 @@
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t b_n, \
|
||||
ctype* alpha, \
|
||||
ctype* x, inc_t incx, inc_t ldx, \
|
||||
ctype* y, inc_t incy, \
|
||||
ctype* a, inc_t inca, inc_t lda, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype* beta, \
|
||||
ctype* r, inc_t incr \
|
||||
ctype* y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( dotxf )
|
||||
@@ -78,18 +60,18 @@ INSERT_GENTPROT_BASIC( dotxf )
|
||||
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname ) \
|
||||
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, opname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,opname)( \
|
||||
void PASTEMAC3(cha,chx,chy,opname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype_xy* alpha, \
|
||||
ctype_x* x, inc_t incx, inc_t ldx, \
|
||||
ctype_y* y, inc_t incy, \
|
||||
ctype_r* beta, \
|
||||
ctype_r* r, inc_t incr \
|
||||
dim_t b_n, \
|
||||
ctype_ax* alpha, \
|
||||
ctype_a* a, inc_t inca, inc_t lda, \
|
||||
ctype_x* x, inc_t incx, \
|
||||
ctype_y* beta, \
|
||||
ctype_y* y, inc_t incy \
|
||||
);
|
||||
|
||||
|
||||
|
||||
@@ -34,139 +34,58 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T dotxf_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy,
|
||||
void* beta,
|
||||
void* rho
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxf_unb_var1);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxf_unb_var1);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxf_unb_var1);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_dotxf_unb_var1( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
obj_t* rho )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
num_t dt_rho = bli_obj_datatype( *rho );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
conj_t conjy = bli_obj_conj_status( *y );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
void* buf_rho = bli_obj_buffer_at_off( *rho );
|
||||
|
||||
num_t dt_alpha;
|
||||
void* buf_alpha;
|
||||
|
||||
num_t dt_beta;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// The datatype of alpha MUST be the type union of x and y. This is to
|
||||
// prevent any unnecessary loss of information during computation.
|
||||
dt_alpha = bli_datatype_union( dt_x, dt_y );
|
||||
buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
|
||||
|
||||
// The datatype of beta MUST be the same as the datatype of rho.
|
||||
dt_beta = dt_rho;
|
||||
buf_beta = bli_obj_scalar_buffer( dt_beta, *beta );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_x][dt_y][dt_rho];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
conjy,
|
||||
n,
|
||||
buf_alpha,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y,
|
||||
buf_beta,
|
||||
buf_rho );
|
||||
}
|
||||
*/
|
||||
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname, varname ) \
|
||||
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t b_m, \
|
||||
dim_t n, \
|
||||
dim_t m, \
|
||||
dim_t b_n, \
|
||||
void* alpha, \
|
||||
void* x, inc_t incx, inc_t ldx, \
|
||||
void* y, inc_t incy, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* x, inc_t incx, \
|
||||
void* beta, \
|
||||
void* r, inc_t incr \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
ctype_xy* alpha_cast = alpha; \
|
||||
ctype_ax* alpha_cast = alpha; \
|
||||
ctype_a* a_cast = a; \
|
||||
ctype_x* x_cast = x; \
|
||||
ctype_y* beta_cast = beta; \
|
||||
ctype_y* y_cast = y; \
|
||||
ctype_r* beta_cast = beta; \
|
||||
ctype_r* r_cast = r; \
|
||||
ctype_a* a1; \
|
||||
ctype_x* x1; \
|
||||
ctype_y* y1; \
|
||||
ctype_r* rho1; \
|
||||
ctype_y* psi1; \
|
||||
dim_t i; \
|
||||
\
|
||||
for ( i = 0; i < b_m; ++i ) \
|
||||
for ( i = 0; i < b_n; ++i ) \
|
||||
{ \
|
||||
x1 = x_cast + (0 )*incx + (i )*ldx; \
|
||||
y1 = y_cast + (0 )*incy; \
|
||||
rho1 = r_cast + (i )*incr; \
|
||||
a1 = a_cast + (0 )*inca + (i )*lda; \
|
||||
x1 = x_cast + (0 )*incx; \
|
||||
psi1 = y_cast + (i )*incy; \
|
||||
\
|
||||
PASTEMAC3(chx,chy,chr,dotxv)( conjx, \
|
||||
conjy, \
|
||||
n, \
|
||||
alpha_cast, \
|
||||
x1, incx, \
|
||||
y1, incy, \
|
||||
beta_cast, \
|
||||
rho1 ); \
|
||||
PASTEMAC3(cha,chx,chy,kername)( conjat, \
|
||||
conjx, \
|
||||
m, \
|
||||
alpha_cast, \
|
||||
a1, inca, \
|
||||
x1, incx, \
|
||||
beta_cast, \
|
||||
psi1 ); \
|
||||
} \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3U12_BASIC( dotxf, dotxf_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_BASIC( dotxf_unb_var1, DOTXV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotxf, dotxf_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotxf_unb_var1, DOTXV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotxf, dotxf_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotxf_unb_var1, DOTXV_KERNEL )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -32,26 +32,20 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_dotxf_unb_var1( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
obj_t* rho );
|
||||
|
||||
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \
|
||||
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t b_n, \
|
||||
void* alpha, \
|
||||
void* x, inc_t incx, inc_t ldx, \
|
||||
void* y, inc_t incy, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* x, inc_t incx, \
|
||||
void* beta, \
|
||||
void* r, inc_t incr \
|
||||
void* y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( dotxf_unb_var1 )
|
||||
|
||||
@@ -159,8 +159,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
\
|
||||
conja = bli_extract_conj( transa ); \
|
||||
\
|
||||
/* Query the fusing factor from the dotxf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,dotxf_fuse_fac); \
|
||||
/* Query the fusing factor for the dotxf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,dotxf_fusefac); \
|
||||
\
|
||||
for ( i = 0; i < n_iter; i += f ) \
|
||||
{ \
|
||||
@@ -173,8 +173,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */ \
|
||||
PASTEMAC3(cha,chx,chy,kername)( conja, \
|
||||
conjx, \
|
||||
f, \
|
||||
n_elem, \
|
||||
f, \
|
||||
alpha_cast, \
|
||||
A1, cs_at, rs_at, \
|
||||
x1, incx, \
|
||||
|
||||
@@ -177,8 +177,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
y_cast, incy ); \
|
||||
} \
|
||||
\
|
||||
/* Query the fusing factor from the axpyf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,axpyf_fuse_fac); \
|
||||
/* Query the fusing factor for the axpyf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,axpyf_fusefac); \
|
||||
\
|
||||
for ( i = 0; i < n_iter; i += f ) \
|
||||
{ \
|
||||
|
||||
@@ -210,8 +210,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
y_cast, incy ); \
|
||||
} \
|
||||
\
|
||||
/* Query the fusing factor from the dotxaxpyf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,dotxaxpyf_fuse_fac); \
|
||||
/* Query the fusing factor for the dotxaxpyf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,dotxaxpyf_fusefac); \
|
||||
\
|
||||
for ( i = 0; i < m; i += f ) \
|
||||
{ \
|
||||
|
||||
@@ -228,8 +228,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
y_cast, incy ); \
|
||||
} \
|
||||
\
|
||||
/* Query the fusing factor from the dotxaxpyf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,dotxaxpyf_fuse_fac); \
|
||||
/* Query the fusing factor for the dotxaxpyf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,dotxaxpyf_fusefac); \
|
||||
\
|
||||
for ( i = 0; i < m; i += f ) \
|
||||
{ \
|
||||
|
||||
@@ -160,8 +160,8 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
\
|
||||
conja = bli_extract_conj( trans ); \
|
||||
\
|
||||
/* Query the fusing factor from the dotxf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,dotxf_fuse_fac); \
|
||||
/* Query the fusing factor for the dotxf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,dotxf_fusefac); \
|
||||
\
|
||||
/* We reduce all of the possible cases down to just lower/upper. */ \
|
||||
if ( bli_is_upper( uplo_trans ) ) \
|
||||
@@ -208,15 +208,15 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
} \
|
||||
\
|
||||
/* x1 = x1 + alpha * A12 * x2; */ \
|
||||
PASTEMAC3(cha,chx,chx,dotxf)( conja, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
f, \
|
||||
n_ahead, \
|
||||
alpha_cast, \
|
||||
A12, cs_at, rs_at, \
|
||||
x2, incx, \
|
||||
one, \
|
||||
x1, incx ); \
|
||||
PASTEMAC3(cha,chx,chx,kername)( conja, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n_ahead, \
|
||||
f, \
|
||||
alpha_cast, \
|
||||
A12, cs_at, rs_at, \
|
||||
x2, incx, \
|
||||
one, \
|
||||
x1, incx ); \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_lower( uplo_trans ) ) */ \
|
||||
@@ -265,8 +265,8 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
/* x1 = x1 + alpha * A10 * x0; */ \
|
||||
PASTEMAC3(cha,chx,chx,kername)( conja, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
f, \
|
||||
n_ahead, \
|
||||
f, \
|
||||
alpha_cast, \
|
||||
A10, cs_at, rs_at, \
|
||||
x0, incx, \
|
||||
|
||||
@@ -159,8 +159,8 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
\
|
||||
conja = bli_extract_conj( trans ); \
|
||||
\
|
||||
/* Query the fusing factor from the axpyf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,axpyf_fuse_fac); \
|
||||
/* Query the fusing factor for the axpyf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,axpyf_fusefac); \
|
||||
\
|
||||
/* We reduce all of the possible cases down to just lower/upper. */ \
|
||||
if ( bli_is_upper( uplo_trans ) ) \
|
||||
@@ -176,14 +176,14 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
x0 = x_cast + (0 )*incx; \
|
||||
\
|
||||
/* x0 = x0 + alpha * A01 * x1; */ \
|
||||
PASTEMAC3(cha,chx,chx,axpyf)( conja, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n_behind, \
|
||||
f, \
|
||||
alpha_cast, \
|
||||
A01, rs_at, cs_at, \
|
||||
x1, incx, \
|
||||
x0, incx ); \
|
||||
PASTEMAC3(cha,chx,chx,kername)( conja, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n_behind, \
|
||||
f, \
|
||||
alpha_cast, \
|
||||
A01, rs_at, cs_at, \
|
||||
x1, incx, \
|
||||
x0, incx ); \
|
||||
\
|
||||
/* x1 = alpha * A11 * x1; */ \
|
||||
for ( k = 0; k < f; ++k ) \
|
||||
|
||||
@@ -161,8 +161,8 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
\
|
||||
conja = bli_extract_conj( trans ); \
|
||||
\
|
||||
/* Query the fusing factor from the dotxf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,dotxf_fuse_fac); \
|
||||
/* Query the fusing factor for the dotxf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,dotxf_fusefac); \
|
||||
\
|
||||
/* x = alpha * x; */ \
|
||||
PASTEMAC2(chax,chx,scalv)( BLIS_NO_CONJUGATE, \
|
||||
@@ -186,8 +186,8 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
/* x1 = x1 - A12 * x2; */ \
|
||||
PASTEMAC3(cha,chx,chx,kername)( conja, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
f, \
|
||||
n_behind, \
|
||||
f, \
|
||||
minus_one, \
|
||||
A12, cs_at, rs_at, \
|
||||
x2, incx, \
|
||||
@@ -242,8 +242,8 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
/* x1 = x1 - A10 * x0; */ \
|
||||
PASTEMAC3(cha,chx,chx,kername)( conja, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
f, \
|
||||
n_behind, \
|
||||
f, \
|
||||
minus_one, \
|
||||
A10, cs_at, rs_at, \
|
||||
x0, incx, \
|
||||
|
||||
@@ -160,8 +160,8 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
\
|
||||
conja = bli_extract_conj( trans ); \
|
||||
\
|
||||
/* Query the fusing factor from the axpyf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,axpyf_fuse_fac); \
|
||||
/* Query the fusing factor for the axpyf implementation. */ \
|
||||
b_fuse = PASTEMAC(chax,axpyf_fusefac); \
|
||||
\
|
||||
/* x = alpha * x; */ \
|
||||
PASTEMAC2(chax,chx,scalv)( BLIS_NO_CONJUGATE, \
|
||||
|
||||
@@ -59,38 +59,39 @@ void PASTEMAC(ch,varname)( \
|
||||
const inc_t rs_ab = 1; \
|
||||
const inc_t cs_ab = PASTEMAC(ch,mr); \
|
||||
\
|
||||
dim_t k0, j0, i0; \
|
||||
dim_t l, j, i; \
|
||||
\
|
||||
ctype ab[ PASTEMAC(ch,mr) * \
|
||||
PASTEMAC(ch,nr) ]; \
|
||||
ctype* restrict ab00; \
|
||||
ctype a0; \
|
||||
ctype b0; \
|
||||
ctype* restrict abij; \
|
||||
ctype ai; \
|
||||
ctype bj; \
|
||||
\
|
||||
\
|
||||
/* Initialize the accumulator elements in ab to zero. */ \
|
||||
for ( i0 = 0; i0 < m * n; ++i0 ) \
|
||||
for ( i = 0; i < m * n; ++i ) \
|
||||
{ \
|
||||
PASTEMAC(ch,set0s)( *(ab + i0) ); \
|
||||
PASTEMAC(ch,set0s)( *(ab + i) ); \
|
||||
} \
|
||||
\
|
||||
/* Perform a series of k rank-1 updates into ab. */ \
|
||||
for ( k0 = 0; k0 < k; ++k0 ) \
|
||||
for ( l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ab00 = ab; \
|
||||
abij = ab; \
|
||||
\
|
||||
for ( j0 = 0; j0 < n; ++j0 ) \
|
||||
/* In an optimized implementation, these two loops over MR and NR
|
||||
are typically fully unrolled. */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
b0 = *(b + j0); \
|
||||
bj = *(b + j); \
|
||||
\
|
||||
for ( i0 = 0; i0 < m; ++i0 ) \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
a0 = *(a + i0); \
|
||||
ai = *(a + i); \
|
||||
\
|
||||
PASTEMAC(ch,dots)( a0, \
|
||||
b0, \
|
||||
*ab00 ); \
|
||||
ab00 += rs_ab; \
|
||||
PASTEMAC(ch,dots)( ai, bj, *abij ); \
|
||||
\
|
||||
abij += rs_ab; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
@@ -99,9 +100,9 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
\
|
||||
/* Scale the result in ab by alpha. */ \
|
||||
for ( i0 = 0; i0 < m * n; ++i0 ) \
|
||||
for ( i = 0; i < m * n; ++i ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scals)( *alpha, *(ab + i0) ); \
|
||||
PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \
|
||||
} \
|
||||
\
|
||||
/* If beta is zero, overwrite c with the scaled result in ab. Otherwise,
|
||||
|
||||
@@ -41,12 +41,12 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict aL, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict bdT, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a10, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd01, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
) \
|
||||
@@ -56,23 +56,23 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
\
|
||||
/* b = alpha * b - aL * bdT; */ \
|
||||
/* b11 = alpha * b11 - a10 * bd01; */ \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
aL, \
|
||||
bdT, \
|
||||
a10, \
|
||||
bd01, \
|
||||
alpha, \
|
||||
b, rs_b, cs_b, \
|
||||
b11, rs_b, cs_b, \
|
||||
a_next, \
|
||||
b_next ); \
|
||||
\
|
||||
/* b = inv(a) * b;
|
||||
bd = b; (if gemm ukernel needs duplicated B)
|
||||
c = b; */ \
|
||||
PASTEMAC(ch,trsmukr)( a, \
|
||||
b, \
|
||||
bd, \
|
||||
c, rs_c, cs_c ); \
|
||||
/* b11 = inv(a11) * b11;
|
||||
bd11 = b11; (skipped if duplication is disabled)
|
||||
c11 = b11; */ \
|
||||
PASTEMAC(ch,trsmukr)( a11, \
|
||||
b11, \
|
||||
bd11, \
|
||||
c11, rs_c, cs_c ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( gemmtrsm_l_ref_mxn, GEMM_UKERNEL, TRSM_L_UKERNEL )
|
||||
|
||||
@@ -42,12 +42,12 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict aL, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict bdT, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a10, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd01, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
@@ -41,12 +41,12 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict aR, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict bdB, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a12, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd21, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
) \
|
||||
@@ -56,23 +56,23 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
\
|
||||
/* b = alpha * b - aR * bdB; */ \
|
||||
/* b11 = alpha * b11 - a12 * bd21; */ \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
aR, \
|
||||
bdB, \
|
||||
a12, \
|
||||
bd21, \
|
||||
alpha, \
|
||||
b, rs_b, cs_b, \
|
||||
b11, rs_b, cs_b, \
|
||||
a_next, \
|
||||
b_next ); \
|
||||
\
|
||||
/* b = inv(a) * b;
|
||||
bd = b; (if gemm ukernel needs duplicated B)
|
||||
c = b; */ \
|
||||
PASTEMAC(ch,trsmukr)( a, \
|
||||
b, \
|
||||
bd, \
|
||||
c, rs_c, cs_c ); \
|
||||
/* b11 = inv(a11) * b11;
|
||||
bd11 = b11; (skipped if duplication is disabled)
|
||||
c11 = b11; */ \
|
||||
PASTEMAC(ch,trsmukr)( a11, \
|
||||
b11, \
|
||||
bd11, \
|
||||
c11, rs_c, cs_c ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( gemmtrsm_u_ref_mxn, GEMM_UKERNEL, TRSM_U_UKERNEL )
|
||||
|
||||
@@ -42,12 +42,12 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict aR, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict bdB, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a12, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd21, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
@@ -54,7 +54,7 @@ void PASTEMAC(ch,varname)( \
|
||||
const inc_t rs_b = PASTEMAC(ch,packnr); \
|
||||
const inc_t cs_b = 1; \
|
||||
\
|
||||
dim_t iter, i, j, k; \
|
||||
dim_t iter, i, j, l; \
|
||||
dim_t n_behind; \
|
||||
\
|
||||
ctype* restrict alpha11; \
|
||||
@@ -87,18 +87,20 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* chi11 = chi11 - a10t * x01; */ \
|
||||
PASTEMAC(ch,set0s)( rho11 ); \
|
||||
for ( k = 0; k < n_behind; ++k ) \
|
||||
for ( l = 0; l < n_behind; ++l ) \
|
||||
{ \
|
||||
alpha10 = a10t + (k )*cs_a; \
|
||||
chi01 = x01 + (k )*rs_b; \
|
||||
alpha10 = a10t + (l )*cs_a; \
|
||||
chi01 = x01 + (l )*rs_b; \
|
||||
\
|
||||
PASTEMAC(ch,axpys)( *alpha10, *chi01, rho11 ); \
|
||||
} \
|
||||
PASTEMAC(ch,subs)( rho11, *chi11 ); \
|
||||
\
|
||||
/* chi11 = chi11 / alpha11; */ \
|
||||
/* NOTE: 1.0/alpha11 is stored instead of alpha11, so we
|
||||
need to multiply rather than divide. */ \
|
||||
/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
|
||||
of alpha11, so we can multiply rather than divide. We store
|
||||
the inverse of alpha11 intentionally to avoid expensive
|
||||
division instructions within the micro-kernel. */ \
|
||||
PASTEMAC(ch,scals)( *alpha11, *chi11 ); \
|
||||
\
|
||||
/* Output final result to matrix C. */ \
|
||||
|
||||
@@ -54,7 +54,7 @@ void PASTEMAC(ch,varname)( \
|
||||
const inc_t rs_b = PASTEMAC(ch,packnr); \
|
||||
const inc_t cs_b = 1; \
|
||||
\
|
||||
dim_t iter, i, j, k; \
|
||||
dim_t iter, i, j, l; \
|
||||
dim_t n_behind; \
|
||||
\
|
||||
ctype* restrict alpha11; \
|
||||
@@ -87,18 +87,20 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* chi11 = chi11 - a12t * x21; */ \
|
||||
PASTEMAC(ch,set0s)( rho11 ); \
|
||||
for ( k = 0; k < n_behind; ++k ) \
|
||||
for ( l = 0; l < n_behind; ++l ) \
|
||||
{ \
|
||||
alpha12 = a12t + (k )*cs_a; \
|
||||
chi21 = x21 + (k )*rs_b; \
|
||||
alpha12 = a12t + (l )*cs_a; \
|
||||
chi21 = x21 + (l )*rs_b; \
|
||||
\
|
||||
PASTEMAC(ch,axpys)( *alpha12, *chi21, rho11 ); \
|
||||
} \
|
||||
PASTEMAC(ch,subs)( rho11, *chi11 ); \
|
||||
\
|
||||
/* chi11 = chi11 / alpha11; */ \
|
||||
/* NOTE: 1.0/alpha11 is stored instead of alpha11, so we
|
||||
need to multiply rather than divide. */ \
|
||||
/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
|
||||
of alpha11, so we can multiply rather than divide. We store
|
||||
the inverse of alpha11 intentionally to avoid expensive
|
||||
division instructions within the micro-kernel. */ \
|
||||
PASTEMAC(ch,scals)( *alpha11, *chi11 ); \
|
||||
\
|
||||
/* Output final result to matrix C. */ \
|
||||
|
||||
@@ -248,5 +248,21 @@
|
||||
#define bli_cnifac BLIS_DEFAULT_NI_FAC
|
||||
#define bli_znifac BLIS_DEFAULT_NI_FAC
|
||||
|
||||
// Default Level-1f fusing factors
|
||||
|
||||
#define bli_sdotxf_fusefac BLIS_DOTXF_FUSE_FAC_S
|
||||
#define bli_ddotxf_fusefac BLIS_DOTXF_FUSE_FAC_D
|
||||
#define bli_cdotxf_fusefac BLIS_DOTXF_FUSE_FAC_C
|
||||
#define bli_zdotxf_fusefac BLIS_DOTXF_FUSE_FAC_Z
|
||||
|
||||
#define bli_saxpyf_fusefac BLIS_AXPYF_FUSE_FAC_S
|
||||
#define bli_daxpyf_fusefac BLIS_AXPYF_FUSE_FAC_D
|
||||
#define bli_caxpyf_fusefac BLIS_AXPYF_FUSE_FAC_C
|
||||
#define bli_zaxpyf_fusefac BLIS_AXPYF_FUSE_FAC_Z
|
||||
|
||||
#define bli_sdotxaxpyf_fusefac BLIS_DOTXAXPYF_FUSE_FAC_S
|
||||
#define bli_ddotxaxpyf_fusefac BLIS_DOTXAXPYF_FUSE_FAC_D
|
||||
#define bli_cdotxaxpyf_fusefac BLIS_DOTXAXPYF_FUSE_FAC_C
|
||||
#define bli_zdotxaxpyf_fusefac BLIS_DOTXAXPYF_FUSE_FAC_Z
|
||||
|
||||
#endif
|
||||
|
||||
@@ -370,6 +370,18 @@
|
||||
\
|
||||
( rs < cs )
|
||||
|
||||
#define bli_has_nonunit_inc1( inc1 ) \
|
||||
\
|
||||
( inc1 != 1 )
|
||||
|
||||
#define bli_has_nonunit_inc2( inc1, inc2 ) \
|
||||
\
|
||||
( inc1 != 1 || inc2 != 1 )
|
||||
|
||||
#define bli_has_nonunit_inc3( inc1, inc2, inc3 ) \
|
||||
\
|
||||
( inc1 != 1 || inc2 != 1 || inc3 != 1 )
|
||||
|
||||
|
||||
// diag offset-related
|
||||
|
||||
|
||||
@@ -34,127 +34,46 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T dotxf_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy,
|
||||
void* beta,
|
||||
void* rho
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxf_opt_var1);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxf_opt_var1);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxf_opt_var1);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_dotxf_opt_var1( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
obj_t* rho )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
num_t dt_rho = bli_obj_datatype( *rho );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
conj_t conjy = bli_obj_conj_status( *y );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
void* buf_rho = bli_obj_buffer_at_off( *rho );
|
||||
|
||||
num_t dt_alpha;
|
||||
void* buf_alpha;
|
||||
|
||||
num_t dt_beta;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// The datatype of alpha MUST be the type union of x and y. This is to
|
||||
// prevent any unnecessary loss of information during computation.
|
||||
dt_alpha = bli_datatype_union( dt_x, dt_y );
|
||||
buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
|
||||
|
||||
// The datatype of beta MUST be the same as the datatype of rho.
|
||||
dt_beta = dt_rho;
|
||||
buf_beta = bli_obj_scalar_buffer( dt_beta, *beta );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_x][dt_y][dt_rho];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
conjy,
|
||||
n,
|
||||
buf_alpha,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y,
|
||||
buf_beta,
|
||||
buf_rho );
|
||||
}
|
||||
*/
|
||||
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname, varname ) \
|
||||
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t b_m, \
|
||||
dim_t n, \
|
||||
dim_t m, \
|
||||
dim_t b_n, \
|
||||
void* alpha, \
|
||||
void* x, inc_t incx, inc_t ldx, \
|
||||
void* y, inc_t incy, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* x, inc_t incx, \
|
||||
void* beta, \
|
||||
void* r, inc_t incr \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
ctype_xy* alpha_cast = alpha; \
|
||||
ctype_x* x_cast = x; \
|
||||
ctype_y* y_cast = y; \
|
||||
ctype_x* a_cast = a; \
|
||||
ctype_y* x_cast = x; \
|
||||
ctype_r* beta_cast = beta; \
|
||||
ctype_r* r_cast = r; \
|
||||
ctype_x* x1; \
|
||||
ctype_y* y1; \
|
||||
ctype_r* rho1; \
|
||||
ctype_r* y_cast = y; \
|
||||
ctype_x* a1; \
|
||||
ctype_y* x1; \
|
||||
ctype_r* psi1; \
|
||||
dim_t i; \
|
||||
\
|
||||
for ( i = 0; i < b_m; ++i ) \
|
||||
for ( i = 0; i < b_n; ++i ) \
|
||||
{ \
|
||||
x1 = x_cast + (0 )*incx + (i )*ldx; \
|
||||
y1 = y_cast + (0 )*incy; \
|
||||
rho1 = r_cast + (i )*incr; \
|
||||
a1 = a_cast + (0 )*inca + (i )*lda; \
|
||||
x1 = x_cast + (0 )*incx; \
|
||||
psi1 = y_cast + (i )*incy; \
|
||||
\
|
||||
PASTEMAC3(chx,chy,chr,dotxv)( conjx, \
|
||||
conjy, \
|
||||
n, \
|
||||
PASTEMAC3(cha,chx,chy,dotxv)( conjat, \
|
||||
conjx, \
|
||||
m, \
|
||||
alpha_cast, \
|
||||
a1, inca, \
|
||||
x1, incx, \
|
||||
y1, incy, \
|
||||
beta_cast, \
|
||||
rho1 ); \
|
||||
psi1 ); \
|
||||
} \
|
||||
}
|
||||
|
||||
@@ -184,30 +103,30 @@ typedef union
|
||||
|
||||
|
||||
void bli_ddddotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t b_m,
|
||||
dim_t n,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
void* alpha,
|
||||
void* x, inc_t incx, inc_t ldx,
|
||||
void* y, inc_t incy,
|
||||
void* a, inc_t inca, inc_t lda,
|
||||
void* x, inc_t incx,
|
||||
void* beta,
|
||||
void* r, inc_t incr
|
||||
void* y, inc_t incy
|
||||
)
|
||||
{
|
||||
double* restrict alpha_cast = alpha;
|
||||
double* restrict beta_cast = beta;
|
||||
double* restrict a_cast = a;
|
||||
double* restrict x_cast = x;
|
||||
double* restrict y_cast = y;
|
||||
double* restrict r_cast = r;
|
||||
dim_t i;
|
||||
|
||||
const dim_t n_elem_per_reg = 2;
|
||||
const dim_t n_iter_unroll = 4;
|
||||
|
||||
dim_t n_pre;
|
||||
dim_t n_run;
|
||||
dim_t n_left;
|
||||
dim_t m_pre;
|
||||
dim_t m_run;
|
||||
dim_t m_left;
|
||||
|
||||
double* restrict x0;
|
||||
double* restrict x1;
|
||||
@@ -223,76 +142,76 @@ void bli_ddddotxf_opt_var1(
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
|
||||
if ( bli_zero_dim1( b_m ) ) return;
|
||||
if ( bli_zero_dim1( b_n ) ) return;
|
||||
|
||||
// If the vector lengths are zero, scale y by beta and return.
|
||||
if ( bli_zero_dim1( n ) )
|
||||
if ( bli_zero_dim1( m ) )
|
||||
{
|
||||
PASTEMAC2(d,d,scalv)( BLIS_NO_CONJUGATE,
|
||||
b_m,
|
||||
b_n,
|
||||
beta_cast,
|
||||
r_cast, incr );
|
||||
y_cast, incy );
|
||||
return;
|
||||
}
|
||||
|
||||
n_pre = 0;
|
||||
m_pre = 0;
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( b_m < PASTEMAC(d,dotxf_fuse_fac) )
|
||||
if ( b_n < PASTEMAC(d,dotxf_fuse_fac) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( incx != 1 || incy != 1 || incr != 1 )
|
||||
else if ( inca != 1 || incx != 1 || incy != 1 )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( x, 16 ) ||
|
||||
bli_is_unaligned_to( y, 16 ) ||
|
||||
bli_is_unaligned_to( r, 16 ) )
|
||||
else if ( bli_is_unaligned_to( a, 16 ) ||
|
||||
bli_is_unaligned_to( x, 16 ) ||
|
||||
bli_is_unaligned_to( y, 16 ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
if ( bli_is_unaligned_to( x, 16 ) &&
|
||||
bli_is_unaligned_to( y, 16 ) &&
|
||||
bli_is_aligned_to( r, 16 ) ) // Note: r is not affected by x and y being unaligned.
|
||||
if ( bli_is_unaligned_to( a, 16 ) &&
|
||||
bli_is_unaligned_to( x, 16 ) &&
|
||||
bli_is_aligned_to( y, 16 ) ) // Note: y is not affected by a and x being unaligned.
|
||||
{
|
||||
use_ref = FALSE;
|
||||
n_pre = 1;
|
||||
m_pre = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
PASTEMAC3(d,d,d,dotxf_unb_var1)( conjx,
|
||||
conjy,
|
||||
b_m,
|
||||
n,
|
||||
PASTEMAC3(d,d,d,dotxf_unb_var1)( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha_cast,
|
||||
x_cast, incx, ldx,
|
||||
y_cast, incy,
|
||||
a_cast, inca, lda,
|
||||
x_cast, incx,
|
||||
beta_cast,
|
||||
r_cast, incr );
|
||||
y_cast, incy );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
n_run = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
m_run = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
x0 = x_cast;
|
||||
x1 = x_cast + ldx;
|
||||
x2 = x_cast + 2*ldx;
|
||||
x3 = x_cast + 3*ldx;
|
||||
y0 = y_cast;
|
||||
x0 = a_cast;
|
||||
x1 = a_cast + lda;
|
||||
x2 = a_cast + 2*lda;
|
||||
x3 = a_cast + 3*lda;
|
||||
y0 = x_cast;
|
||||
|
||||
PASTEMAC(d,set0s)( rho0 );
|
||||
PASTEMAC(d,set0s)( rho1 );
|
||||
PASTEMAC(d,set0s)( rho2 );
|
||||
PASTEMAC(d,set0s)( rho3 );
|
||||
|
||||
if ( n_pre == 1 )
|
||||
if ( m_pre == 1 )
|
||||
{
|
||||
x0c = *x0;
|
||||
x1c = *x1;
|
||||
@@ -305,11 +224,11 @@ void bli_ddddotxf_opt_var1(
|
||||
rho2 += x2c * y0c;
|
||||
rho3 += x3c * y0c;
|
||||
|
||||
x0 += incx;
|
||||
x1 += incx;
|
||||
x2 += incx;
|
||||
x3 += incx;
|
||||
y0 += incy;
|
||||
x0 += inca;
|
||||
x1 += inca;
|
||||
x2 += inca;
|
||||
x3 += inca;
|
||||
y0 += incx;
|
||||
}
|
||||
|
||||
rho0v.v = _mm_setzero_pd();
|
||||
@@ -317,7 +236,7 @@ void bli_ddddotxf_opt_var1(
|
||||
rho2v.v = _mm_setzero_pd();
|
||||
rho3v.v = _mm_setzero_pd();
|
||||
|
||||
for ( i = 0; i < n_run; ++i )
|
||||
for ( i = 0; i < m_run; ++i )
|
||||
{
|
||||
x0v.v = _mm_load_pd( ( double* )(x0 + 0*n_elem_per_reg) );
|
||||
x1v.v = _mm_load_pd( ( double* )(x1 + 0*n_elem_per_reg) );
|
||||
@@ -376,9 +295,9 @@ void bli_ddddotxf_opt_var1(
|
||||
rho2 += rho2v.d[0] + rho2v.d[1];
|
||||
rho3 += rho3v.d[0] + rho3v.d[1];
|
||||
|
||||
if ( n_left > 0 )
|
||||
if ( m_left > 0 )
|
||||
{
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
for ( i = 0; i < m_left; ++i )
|
||||
{
|
||||
x0c = *x0;
|
||||
x1c = *x1;
|
||||
@@ -391,23 +310,23 @@ void bli_ddddotxf_opt_var1(
|
||||
rho2 += x2c * y0c;
|
||||
rho3 += x3c * y0c;
|
||||
|
||||
x0 += incx;
|
||||
x1 += incx;
|
||||
x2 += incx;
|
||||
x3 += incx;
|
||||
y0 += incy;
|
||||
x0 += inca;
|
||||
x1 += inca;
|
||||
x2 += inca;
|
||||
x3 += inca;
|
||||
y0 += incx;
|
||||
}
|
||||
}
|
||||
/*
|
||||
PASTEMAC2(d,d,scals)( *beta_cast, *(r_cast ) ); \
|
||||
PASTEMAC2(d,d,scals)( *beta_cast, *(r_cast+1) ); \
|
||||
PASTEMAC2(d,d,scals)( *beta_cast, *(r_cast+2) ); \
|
||||
PASTEMAC2(d,d,scals)( *beta_cast, *(r_cast+3) ); \
|
||||
PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast ) ); \
|
||||
PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+1) ); \
|
||||
PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+2) ); \
|
||||
PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+3) ); \
|
||||
|
||||
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho1, *(r_cast ) ); \
|
||||
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho2, *(r_cast+1) ); \
|
||||
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho3, *(r_cast+2) ); \
|
||||
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho4, *(r_cast+3) ); \
|
||||
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho1, *(y_cast ) ); \
|
||||
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho2, *(y_cast+1) ); \
|
||||
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho3, *(y_cast+2) ); \
|
||||
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho4, *(y_cast+3) ); \
|
||||
*/
|
||||
|
||||
rho1v.d[0] = rho0;
|
||||
@@ -418,8 +337,8 @@ void bli_ddddotxf_opt_var1(
|
||||
betav.v = _mm_loaddup_pd( ( double* ) beta_cast );
|
||||
alphav.v = _mm_loaddup_pd( ( double* ) alpha_cast );
|
||||
|
||||
rho0v.v = _mm_load_pd( ( double* )(r_cast + 0*n_elem_per_reg) );
|
||||
rho2v.v = _mm_load_pd( ( double* )(r_cast + 1*n_elem_per_reg) );
|
||||
rho0v.v = _mm_load_pd( ( double* )(y_cast + 0*n_elem_per_reg) );
|
||||
rho2v.v = _mm_load_pd( ( double* )(y_cast + 1*n_elem_per_reg) );
|
||||
|
||||
rho0v.v *= betav.v;
|
||||
rho2v.v *= betav.v;
|
||||
@@ -427,7 +346,7 @@ void bli_ddddotxf_opt_var1(
|
||||
rho0v.v += alphav.v * rho1v.v;
|
||||
rho2v.v += alphav.v * rho3v.v;
|
||||
|
||||
_mm_store_pd( ( double* )(r_cast + 0*n_elem_per_reg), rho0v.v );
|
||||
_mm_store_pd( ( double* )(r_cast + 1*n_elem_per_reg), rho2v.v );
|
||||
_mm_store_pd( ( double* )(y_cast + 0*n_elem_per_reg), rho0v.v );
|
||||
_mm_store_pd( ( double* )(y_cast + 1*n_elem_per_reg), rho2v.v );
|
||||
|
||||
}
|
||||
|
||||
@@ -32,12 +32,6 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_dotxf_opt_var1( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
obj_t* rho );
|
||||
|
||||
|
||||
//
|
||||
// Define fusing factors for dotxf operation.
|
||||
@@ -49,18 +43,18 @@ void bli_dotxf_opt_var1( obj_t* alpha,
|
||||
|
||||
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \
|
||||
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t b_n, \
|
||||
void* alpha, \
|
||||
void* x, inc_t incx, inc_t ldx, \
|
||||
void* y, inc_t incy, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* x, inc_t incx, \
|
||||
void* beta, \
|
||||
void* r, inc_t incr \
|
||||
void* y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( dotxf_opt_var1 )
|
||||
|
||||
@@ -3,9 +3,9 @@ c #rg # Matrix storage scheme(s) to test ('c' = col-major; 'r' = row-major;
|
||||
c #rji # Vector storage scheme(s) to test ('c' = colvec/unit; 'r' = rowvec/unit; 'j' = colvec/non-unit; 'i' = rowvec/non-unit)
|
||||
0 # Test all combinations of storage schemes?
|
||||
32 # General stride spacing (for cases when testing general stride)
|
||||
d #sdcz # Datatype(s) to test
|
||||
sdcz #sdcz # Datatype(s) to test
|
||||
100 # Problem size: first to test
|
||||
500 # Problem size: maximum to test
|
||||
300 # Problem size: maximum to test
|
||||
100 # Problem size: increment between experiments
|
||||
1 # Error-checking level (0 = disable error checking; 1 = full error checking)
|
||||
i # Reaction to test failure ('i' = ignore; 's' = sleep() and continue; 'a' = abort)
|
||||
|
||||
@@ -1,60 +1,60 @@
|
||||
# --- Utility ------------------------------------------------------------------
|
||||
|
||||
0 randv (0 = disable all; 1 = specify)
|
||||
1 randv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
|
||||
0 randm (0 = disable all; 1 = specify)
|
||||
1 randm (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -1 dimensions: m n (-1 = bind to problem size)
|
||||
|
||||
|
||||
# --- Level-1v -----------------------------------------------------------------
|
||||
|
||||
0 addv (0 = disable all; 1 = specify)
|
||||
1 addv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
? parameters: conjx (? = test all values)
|
||||
|
||||
0 axpyv (0 = disable all; 1 = specify)
|
||||
1 axpyv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
? parameters: conjx (? = test all values)
|
||||
|
||||
0 copyv (0 = disable all; 1 = specify)
|
||||
1 copyv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
? parameters: conjx (? = test all values)
|
||||
|
||||
0 dotv (0 = disable all; 1 = specify)
|
||||
1 dotv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
?? parameters: conjx conjy (? = test all values)
|
||||
|
||||
0 dotxv (0 = disable all; 1 = specify)
|
||||
1 dotxv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
?? parameters: conjx conjy (? = test all values)
|
||||
|
||||
0 fnormv (0 = disable all; 1 = specify)
|
||||
1 fnormv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
|
||||
0 scalv (0 = disable all; 1 = specify)
|
||||
1 scalv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
? parameters: conjbeta (? = test all values)
|
||||
|
||||
0 scal2v (0 = disable all; 1 = specify)
|
||||
1 scal2v (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
? parameters: conjx (? = test all values)
|
||||
|
||||
0 setv (0 = disable all; 1 = specify)
|
||||
1 setv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
|
||||
0 subv (0 = disable all; 1 = specify)
|
||||
1 subv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
? parameters: conjx (? = test all values)
|
||||
@@ -62,40 +62,40 @@
|
||||
|
||||
# --- Level-1m -----------------------------------------------------------------
|
||||
|
||||
0 addm (0 = disable all; 1 = specify)
|
||||
1 addm (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -2 dimensions: m n (-1 = bind to problem size)
|
||||
? parameters: transa (? = test all values)
|
||||
|
||||
0 axpym (0 = disable all; 1 = specify)
|
||||
1 axpym (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -1 dimensions: m n (-1 = bind to problem size)
|
||||
? parameters: transa (? = test all values)
|
||||
|
||||
0 copym (0 = disable all; 1 = specify)
|
||||
1 copym (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -2 dimensions: m n (-1 = bind to problem size)
|
||||
? parameters: transa (? = test all values)
|
||||
|
||||
0 fnormm (0 = disable all; 1 = specify)
|
||||
1 fnormm (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -2 dimensions: m n (-1 = bind to problem size)
|
||||
|
||||
0 scalm (0 = disable all; 1 = specify)
|
||||
1 scalm (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -2 dimensions: m n (-1 = bind to problem size)
|
||||
? parameters: conjbeta (? = test all values)
|
||||
|
||||
0 scal2m (0 = disable all; 1 = specify)
|
||||
1 scal2m (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -2 dimensions: m n (-1 = bind to problem size)
|
||||
? parameters: transa (? = test all values)
|
||||
|
||||
0 setm (0 = disable all; 1 = specify)
|
||||
1 setm (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -2 dimensions: m n (-1 = bind to problem size)
|
||||
|
||||
0 subm (0 = disable all; 1 = specify)
|
||||
1 subm (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -2 dimensions: m n (-1 = bind to problem size)
|
||||
? parameters: transa (? = test all values)
|
||||
@@ -103,52 +103,52 @@
|
||||
|
||||
# --- Level-2 ------------------------------------------------------------------
|
||||
|
||||
0 gemv (0 = disable all; 1 = specify)
|
||||
1 gemv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -2 dimensions: m n (-1 = bind to problem size)
|
||||
?? parameters: transa conjx (? = test all values)
|
||||
|
||||
0 ger (0 = disable all; 1 = specify)
|
||||
1 ger (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -2 dimensions: m n (-1 = bind to problem size)
|
||||
?? parameters: conjx conjy (? = test all values)
|
||||
|
||||
0 hemv (0 = disable all; 1 = specify)
|
||||
1 hemv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
??? parameters: uploa conja conjx (? = test all values)
|
||||
|
||||
0 her (0 = disable all; 1 = specify)
|
||||
1 her (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
?? parameters: uploc conjx (? = test all values)
|
||||
|
||||
0 her2 (0 = disable all; 1 = specify)
|
||||
1 her2 (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
??? parameters: uploc conjx conjy (? = test all values)
|
||||
|
||||
0 symv (0 = disable all; 1 = specify)
|
||||
1 symv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
??? parameters: uploa conja conjx (? = test all values)
|
||||
|
||||
0 syr (0 = disable all; 1 = specify)
|
||||
1 syr (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
?? parameters: uploc conjx (? = test all values)
|
||||
|
||||
0 syr2 (0 = disable all; 1 = specify)
|
||||
1 syr2 (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
??? parameters: uploc conjx conjy (? = test all values)
|
||||
|
||||
0 trmv (0 = disable all; 1 = specify)
|
||||
1 trmv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
??? parameters: uploa transa diaga (? = test all values)
|
||||
|
||||
0 trsv (0 = disable all; 1 = specify)
|
||||
1 trsv (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 dimensions: m (-1 = bind to problem size)
|
||||
??? parameters: uploa transa diaga (? = test all values)
|
||||
@@ -158,37 +158,37 @@
|
||||
|
||||
1 gemm (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -1 -1 dimensions: m n k (-1 = bind to problem size)
|
||||
-1 -1 -2 dimensions: m n k (-1 = bind to problem size)
|
||||
?? parameters: transa transb (? = test all values)
|
||||
|
||||
1 hemm (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -1 dimensions: m n (-1 = bind to problem size)
|
||||
-1 -2 dimensions: m n (-1 = bind to problem size)
|
||||
???? parameters: side uploa conja transb (? = test all values)
|
||||
|
||||
1 herk (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -1 dimensions: m k (-1 = bind to problem size)
|
||||
-1 -2 dimensions: m k (-1 = bind to problem size)
|
||||
?? parameters: uploc transa (? = test all values)
|
||||
|
||||
1 her2k (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -1 dimensions: m k (-1 = bind to problem size)
|
||||
-1 -2 dimensions: m k (-1 = bind to problem size)
|
||||
??? parameters: uploc transa transb (? = test all values)
|
||||
|
||||
1 symm (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -1 dimensions: m n (-1 = bind to problem size)
|
||||
-1 -2 dimensions: m n (-1 = bind to problem size)
|
||||
???? parameters: side uploa conja transb (? = test all values)
|
||||
|
||||
1 syrk (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -1 dimensions: m k (-1 = bind to problem size)
|
||||
-1 -2 dimensions: m k (-1 = bind to problem size)
|
||||
?? parameters: uploc transa (? = test all values)
|
||||
|
||||
1 syr2k (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -1 dimensions: m k (-1 = bind to problem size)
|
||||
-1 -2 dimensions: m k (-1 = bind to problem size)
|
||||
??? parameters: uploc transa transb (? = test all values)
|
||||
|
||||
1 trmm (0 = disable all; 1 = specify)
|
||||
|
||||
@@ -159,7 +159,7 @@
|
||||
1 gemm (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -1 -2 dimensions: m n k (-1 = bind to problem size)
|
||||
nn parameters: transa transb (? = test all values)
|
||||
?? parameters: transa transb (? = test all values)
|
||||
|
||||
1 hemm (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
@@ -169,7 +169,7 @@ nn parameters: transa transb (? = test all values)
|
||||
1 herk (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
-1 -2 dimensions: m k (-1 = bind to problem size)
|
||||
ln parameters: uploc transa (? = test all values)
|
||||
?? parameters: uploc transa (? = test all values)
|
||||
|
||||
1 her2k (0 = disable all; 1 = specify)
|
||||
1 test sequential front-end (0 = disable; 1 = enable)
|
||||
|
||||
@@ -533,7 +533,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
|
||||
BLIS_EXTEND_NC_C,
|
||||
BLIS_EXTEND_NC_Z );
|
||||
libblis_test_fprintf_c( os, "\n" );
|
||||
libblis_test_fprintf_c( os, "level-3 register blocksizes \n" );
|
||||
libblis_test_fprintf_c( os, "level-3 register blocksizes s d c z \n" );
|
||||
libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n",
|
||||
BLIS_DEFAULT_MR_S,
|
||||
BLIS_DEFAULT_MR_D,
|
||||
@@ -566,7 +566,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
|
||||
BLIS_DEFAULT_NI_Z );
|
||||
*/
|
||||
libblis_test_fprintf_c( os, "\n" );
|
||||
libblis_test_fprintf_c( os, "level-3 packing duplication \n" );
|
||||
libblis_test_fprintf_c( os, "level-3 packing duplication s d c z \n" );
|
||||
libblis_test_fprintf_c( os, " dupl. factors for B %5u %5u %5u %5u\n",
|
||||
BLIS_DEFAULT_NUM_DUPL_S,
|
||||
BLIS_DEFAULT_NUM_DUPL_D,
|
||||
@@ -578,7 +578,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
|
||||
BLIS_NUM_ELEM_PER_REG_C,
|
||||
BLIS_NUM_ELEM_PER_REG_Z );
|
||||
libblis_test_fprintf_c( os, "\n" );
|
||||
libblis_test_fprintf_c( os, "level-2 cache blocksizes \n" );
|
||||
libblis_test_fprintf_c( os, "level-2 cache blocksizes s d c z \n" );
|
||||
libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n",
|
||||
BLIS_DEFAULT_L2_MC_S,
|
||||
BLIS_DEFAULT_L2_MC_D,
|
||||
@@ -590,11 +590,27 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
|
||||
BLIS_DEFAULT_L2_NC_C,
|
||||
BLIS_DEFAULT_L2_NC_Z );
|
||||
libblis_test_fprintf_c( os, "\n" );
|
||||
libblis_test_fprintf_c( os, "level-1f fusing factors %5u %5u %5u %5u\n",
|
||||
BLIS_DEFAULT_FUSING_FACTOR_S,
|
||||
BLIS_DEFAULT_FUSING_FACTOR_D,
|
||||
BLIS_DEFAULT_FUSING_FACTOR_C,
|
||||
BLIS_DEFAULT_FUSING_FACTOR_Z );
|
||||
libblis_test_fprintf_c( os, "level-1f fusing factors s d c z \n" );
|
||||
libblis_test_fprintf_c( os, " default %5u %5u %5u %5u\n",
|
||||
BLIS_DEFAULT_FUSE_FAC_S,
|
||||
BLIS_DEFAULT_FUSE_FAC_D,
|
||||
BLIS_DEFAULT_FUSE_FAC_C,
|
||||
BLIS_DEFAULT_FUSE_FAC_Z );
|
||||
libblis_test_fprintf_c( os, " axpyf %5u %5u %5u %5u\n",
|
||||
BLIS_AXPYF_FUSE_FAC_S,
|
||||
BLIS_AXPYF_FUSE_FAC_D,
|
||||
BLIS_AXPYF_FUSE_FAC_C,
|
||||
BLIS_AXPYF_FUSE_FAC_Z );
|
||||
libblis_test_fprintf_c( os, " dotxf %5u %5u %5u %5u\n",
|
||||
BLIS_DOTXF_FUSE_FAC_S,
|
||||
BLIS_DOTXF_FUSE_FAC_D,
|
||||
BLIS_DOTXF_FUSE_FAC_C,
|
||||
BLIS_DOTXF_FUSE_FAC_Z );
|
||||
libblis_test_fprintf_c( os, " dotxaxpyf %5u %5u %5u %5u\n",
|
||||
BLIS_DOTXAXPYF_FUSE_FAC_S,
|
||||
BLIS_DOTXAXPYF_FUSE_FAC_D,
|
||||
BLIS_DOTXAXPYF_FUSE_FAC_C,
|
||||
BLIS_DOTXAXPYF_FUSE_FAC_Z );
|
||||
libblis_test_fprintf_c( os, "\n" );
|
||||
libblis_test_fprintf( os, "\n" );
|
||||
|
||||
|
||||
Reference in New Issue
Block a user