Added template implementations and other tweaks.

Details:
- Added a 'template' configuration, which contains stub implementations of the
  level 1, 1f, and 3 kernels with one datatype implemented in C for each, with
  lots of in-file comments and documentation.
- Modified some variable/parameter names for some 1/1f operations. (e.g.
  renaming vector length parameter from m to n.)
- Moved level-1f fusing factors from axpyf, dotxf, and dotxaxpyf header files
  to bli_kernel.h.
- Modified test suite to print out fusing factors for axpyf, dotxf, and
  dotxaxpyf, as well as the default fusing factor (which are all equal
  in the reference and template implementations).
- Cleaned up some sloppiness in the level-1f unb_var1.c files whereby these
  reference variants were implemented in terms of front-end routines rather
  than directly in terms of the kernels. (For example, axpy2v was implemented
  as two calls to axpyv rather than two calls to AXPYV_KERNEL.)
- Changed the interface to dotxf so that it matches that of axpyf, in that
  A is assumed to be m x b_n in both cases, and for dotxf A is actually used
  as A^T.
- Minor variable naming and comment changes to reference micro-kernels in
  frame/3/gemm/ukernels and frame/3/trsm/ukernels.
This commit is contained in:
Field G. Van Zee
2013-09-30 12:58:18 -05:00
parent 97aaf220a8
commit 5e54f46ccb
80 changed files with 6343 additions and 799 deletions

View File

@@ -97,6 +97,10 @@
#define BLIS_CACHE_LINE_SIZE 64
#define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 32
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE 32

View File

@@ -226,10 +226,25 @@
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z

View File

@@ -97,6 +97,10 @@
#define BLIS_CACHE_LINE_SIZE 64
#define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 16
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE 16

View File

@@ -216,10 +216,25 @@
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z

View File

@@ -97,6 +97,10 @@
#define BLIS_CACHE_LINE_SIZE 64
#define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 16
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE 16

View File

@@ -220,10 +220,25 @@
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z

View File

@@ -97,6 +97,10 @@
#define BLIS_CACHE_LINE_SIZE 64
#define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 32
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_CACHE_LINE_SIZE

View File

@@ -220,10 +220,25 @@
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z

View File

@@ -97,6 +97,10 @@
#define BLIS_CACHE_LINE_SIZE 64
#define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 16
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_CACHE_LINE_SIZE

View File

@@ -220,10 +220,25 @@
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z

View File

@@ -95,6 +95,10 @@
#define BLIS_CACHE_LINE_SIZE 64
#define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 16
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE 16

View File

@@ -220,10 +220,25 @@
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z

View File

@@ -97,14 +97,18 @@
#define BLIS_CACHE_LINE_SIZE 64
#define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 16
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE 16
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when allocating memory dynamically from the operating
// system (eg: posix_memalign()). To disable heap alignment and just use
// malloc() instead, set this to 1.
#define BLIS_HEAP_ADDR_ALIGN_SIZE 16
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when sizing leading dimensions of dynamically
// allocated memory.
@@ -116,7 +120,7 @@
// Alignment size used when sizing strides (eg: of packed micro-panels)
// within a block of contiguous memory.
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 16
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE

View File

@@ -54,21 +54,21 @@
// (b) NR (for triangular operations such as trmm and trsm).
//
#define BLIS_DEFAULT_MC_S 256
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 8192
#define BLIS_DEFAULT_MC_S 64
#define BLIS_DEFAULT_KC_S 128
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MC_D 128
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_MC_D 64
#define BLIS_DEFAULT_KC_D 128
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MC_C 128
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_MC_C 64
#define BLIS_DEFAULT_KC_C 128
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 2048
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
// -- Cache blocksize extensions (for optimizing edge cases) --
@@ -220,10 +220,25 @@
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z

View File

@@ -0,0 +1,169 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H
// -- OPERATING SYSTEM ---------------------------------------------------------
// -- INTEGER PROPERTIES -------------------------------------------------------
// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
#define BLIS_INT_TYPE_SIZE 32
// -- FLOATING-POINT PROPERTIES ------------------------------------------------
// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES 4
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
//#define BLIS_ENABLE_C99_COMPLEX
// -- MULTITHREADING -----------------------------------------------------------
// The maximum number of BLIS threads that will run concurrently.
#define BLIS_MAX_NUM_THREADS 1
// -- MEMORY ALLOCATION --------------------------------------------------------
// -- Contiguous (static) memory allocator --
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
// contiguous memory pools.
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_KC_X_NC_BLOCKS 1
#define BLIS_NUM_MC_X_NC_BLOCKS 0
// The maximum preload byte offset is used to pad the end of the contiguous
// memory pools so that the micro-kernel, when computing with the end of the
// last block, can exceed the bounds of the usable portion of the memory
// region without causing a segmentation fault.
#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
// -- Memory alignment --
// It is sometimes useful to define the various memory alignments in terms
// of some other characteristics of the system, such as the cache line size
// and the page size.
#define BLIS_CACHE_LINE_SIZE 64
#define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 16
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when allocating memory dynamically from the operating
// system (eg: posix_memalign()). To disable heap alignment and just use
// malloc() instead, set this to 1.
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when sizing leading dimensions of dynamically
// allocated memory.
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
// Alignment size used when allocating entire blocks of contiguous memory
// from the contiguous memory allocator.
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
// Alignment size used when sizing strides (eg: of packed micro-panels)
// within a block of contiguous memory.
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
// Basic (homogeneous) datatype support always enabled.
// Enable mixed domain operations?
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
// Enable extra mixed precision operations?
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
// Stay initialized after auto-initialization, unless and until the user
// explicitly calls bli_finalize().
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
// Enable the BLAS compatibility layer?
#define BLIS_ENABLE_BLAS2BLIS
// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
// Fortran-77 name-mangling macros.
// Each macro token-pastes its arguments together, in order, and appends a
// trailing underscore so that the result is a single identifier:
//   PASTEF770(name)         -> name_
//   PASTEF77(ch1,name)      -> ch1name_
//   PASTEF772(ch1,ch2,name) -> ch1ch2name_
// Because every parameter is an operand of ##, the arguments are pasted as
// written; they are not macro-expanded first.
#define PASTEF770(name) name ## _
#define PASTEF77(ch1,name) ch1 ## name ## _
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
#endif

View File

@@ -0,0 +1,391 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_KERNEL_H
#define BLIS_KERNEL_H
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
//
// Constraints:
//
// (1) MC must be a multiple of:
// (a) MR (for zero-padding purposes)
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
// (2) NC must be a multiple of
// (a) NR (for zero-padding purposes)
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
// (3) KC must be a multiple of
// (a) MR and
// (b) NR (for triangular operations such as trmm and trsm).
//
#define BLIS_DEFAULT_MC_S 64
#define BLIS_DEFAULT_KC_S 128
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MC_D 64
#define BLIS_DEFAULT_KC_D 128
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MC_C 64
#define BLIS_DEFAULT_KC_C 128
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 4
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
#define BLIS_DEFAULT_MR_Z 8
#define BLIS_DEFAULT_NR_Z 4
// NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
// -- Register blocksize extensions (for packed micro-panels) --
// NOTE: These register blocksize "extensions" determine whether the
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount
// of duplication needed when configuring level-3 macro-kernels that
// copy and duplicate elements of B to a temporary duplication buffer
// (so that element-wise vector multiplication and addition instructions
// can be used).
#define BLIS_NUM_ELEM_PER_REG_S 4
#define BLIS_NUM_ELEM_PER_REG_D 2
#define BLIS_NUM_ELEM_PER_REG_C 2
#define BLIS_NUM_ELEM_PER_REG_Z 1
// -- Default switch for duplication of B --
// NOTE: Setting these values to 1 disables duplication. Any value
// d > 1 results in d-1 duplicates being created within a special
// macro-kernel buffer of dimension k x NR*d.
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
#define BLIS_DEFAULT_NUM_DUPL_S 1
#define BLIS_DEFAULT_NUM_DUPL_D 1
#define BLIS_DEFAULT_NUM_DUPL_C 1
#define BLIS_DEFAULT_NUM_DUPL_Z 1
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
// used by certain blocked variants. But when they *are* used, they MUST
// be an integer multiple of NR!
#define BLIS_DEFAULT_NI_FAC 16
#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
#include "bli_gemm_opt_mxn.h"
#include "bli_trsm_l_opt_mxn.h"
#include "bli_trsm_u_opt_mxn.h"
#include "bli_gemmtrsm_l_opt_mxn.h"
#include "bli_gemmtrsm_u_opt_mxn.h"
// -- dupl --
#define DUPL_KERNEL dupl_unb_var1
// -- gemm --
#define GEMM_UKERNEL gemm_opt_mxn
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_mxn
#define TRSM_L_UKERNEL trsm_l_opt_mxn
#define TRSM_U_UKERNEL trsm_u_opt_mxn
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
#include "bli_axpy2v_opt_var1.h"
#include "bli_dotaxpyv_opt_var1.h"
#include "bli_axpyf_opt_var1.h"
#include "bli_dotxf_opt_var1.h"
#include "bli_dotxaxpyf_opt_var1.h"
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_opt_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_opt_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_opt_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_opt_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_opt_var1
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

View File

@@ -0,0 +1,308 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   axpyv kernel entry point for float operands.

   No optimized code is supplied for this datatype in the template; every
   argument is handed, unmodified, to the reference implementation.
*/
void bli_sssaxpyv_opt_var1
     (
       conj_t          conjx,
       dim_t           n,
       float* restrict alpha,
       float* restrict x, inc_t incx,
       float* restrict y, inc_t incy
     )
{
    // Delegate to the reference kernel.
    bli_sssaxpyv_unb_var1( conjx, n, alpha, x, incx, y, incy );
}
/*
   axpyv kernel entry point for double operands.

   No optimized code is supplied for this datatype in the template; every
   argument is handed, unmodified, to the reference implementation.
*/
void bli_dddaxpyv_opt_var1
     (
       conj_t           conjx,
       dim_t            n,
       double* restrict alpha,
       double* restrict x, inc_t incx,
       double* restrict y, inc_t incy
     )
{
    // Delegate to the reference kernel.
    bli_dddaxpyv_unb_var1( conjx, n, alpha, x, incx, y, incy );
}
/*
   axpyv kernel entry point for scomplex operands.

   No optimized code is supplied for this datatype in the template; every
   argument is handed, unmodified, to the reference implementation.
*/
void bli_cccaxpyv_opt_var1
     (
       conj_t             conjx,
       dim_t              n,
       scomplex* restrict alpha,
       scomplex* restrict x, inc_t incx,
       scomplex* restrict y, inc_t incy
     )
{
    // Delegate to the reference kernel.
    bli_cccaxpyv_unb_var1( conjx, n, alpha, x, incx, y, incy );
}
void bli_zzzaxpyv_opt_var1( conj_t             conjx,
                            dim_t              n,
                            dcomplex* restrict alpha,
                            dcomplex* restrict x, inc_t incx,
                            dcomplex* restrict y, inc_t incy )
{
/*
  Template axpyv kernel implementation

  This function contains a template implementation for a double-precision
  complex kernel, coded in C, which can serve as the starting point for one
  to write an optimized kernel on an arbitrary architecture. (We show a
  template implementation for only double-precision complex because the
  templates for the other three floating-point types would be similar, with
  the real instantiations being noticeably simpler due to the disappearance
  of conjugation in the real domain.)

  This kernel performs a vector scale and accumulate (axpy) operation:

    y := y + alpha * conjx( x )

  where x and y are vectors of length n and alpha is a scalar.

  Parameters:

  - conjx:  Compute with conjugated values of x?
  - n:      The number of elements in vectors x and y.
  - alpha:  The address of a scalar.
  - x:      The address of vector x.
  - incx:   The vector increment of x. incx should be unit unless the
            implementation makes special accommodation for non-unit values.
  - y:      The address of vector y.
  - incy:   The vector increment of y. incy should be unit unless the
            implementation makes special accommodation for non-unit values.

  The function returns immediately, leaving y untouched, if n is zero or if
  alpha is zero.

  This template code calls the reference implementation if any of the
  following conditions are true:

  - Either of the strides incx or incy is non-unit.
  - Vectors x and y are unaligned with different offsets.
  - Vectors x and y share an offset that is not a whole number of elements
    (stepping by whole elements could then never reach an aligned address).

  If the vectors are aligned, or unaligned by the same offset, then optimized
  code can be used for the bulk of the computation. This template shows how
  the front-edge case can be handled so that the remaining computation is
  aligned. (This template guarantees alignment to be BLIS_SIMD_ALIGN_SIZE,
  which is defined in bli_config.h.)

  Additional things to consider:

  - Because conjugation disappears in the real domain, real instances of
    this kernel can safely ignore the values of any conjugation parameters,
    thereby simplifying the implementation.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
    const dim_t n_elem_per_reg  = 1;
    const dim_t n_iter_unroll   = 1;

    const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;

    const siz_t type_size       = sizeof( *x );

    dcomplex*   xp;
    dcomplex*   yp;

    bool_t      use_ref = FALSE;

    dim_t       n_pre = 0;
    dim_t       n_iter;
    dim_t       n_left;

    dim_t       off_x, off_y;
    dim_t       i;

    // Return early if there is nothing to compute.
    if ( bli_zero_dim1( n ) ) return;
    if ( bli_zeq0( *alpha ) ) return;

    // If there is anything that would interfere with our use of aligned
    // vector loads/stores, call the reference implementation.
    if ( bli_has_nonunit_inc2( incx, incy ) )
    {
        use_ref = TRUE;
    }
    else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) )
    {
        use_ref = TRUE;

        // If x and y are unaligned by the same offset, then we can still
        // use an implementation that depends on alignment for most of the
        // operation.
        // NOTE(review): this assumes bli_offset_from_alignment( p, a )
        // yields the number of bytes by which p lies past the previous
        // a-byte boundary -- confirm against its definition.
        off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
        off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );

        // The shared offset must also be a whole number of elements;
        // otherwise no element of x or y ever starts on a boundary.
        if ( off_x == off_y && off_x % type_size == 0 )
        {
            use_ref = FALSE;

            // Number of leading elements that precede the NEXT aligned
            // address (not the number that follow the previous one).
            n_pre = ( BLIS_SIMD_ALIGN_SIZE - off_x ) / type_size;

            // Never process more front-edge elements than the vectors hold.
            if ( n_pre > n ) n_pre = n;
        }
    }

    // Call the reference implementation if needed.
    if ( use_ref == TRUE )
    {
        bli_zzzaxpyv_unb_var1( conjx,
                               n,
                               alpha,
                               x, incx,
                               y, incy );
        return;
    }

    // Compute the number of unrolled and leftover (edge) iterations.
    n_iter = ( n - n_pre ) / n_elem_per_iter;
    n_left = ( n - n_pre ) % n_elem_per_iter;

    // Initialize pointers into x and y.
    xp = x;
    yp = y;

    // Iterate over elements of x and y to compute:
    //   y += alpha * conjx( x );
    if ( bli_is_noconj( conjx ) )
    {
        // Compute front edge cases if x and y were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zzzaxpys( *alpha, *xp, *yp );

            xp += 1; yp += 1;
        }

        // The bulk of the operation is executed here. The addresses xp and
        // yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zzzaxpys( *alpha, *xp, *yp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zzzaxpys( *alpha, *xp, *yp );

            xp += 1; yp += 1;
        }
    }
    else // if ( bli_is_conj( conjx ) )
    {
        // Compute front edge cases if x and y were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zzzaxpyjs( *alpha, *xp, *yp );

            xp += 1; yp += 1;
        }

        // The bulk of the operation is executed here. The addresses xp and
        // yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zzzaxpyjs( *alpha, *xp, *yp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zzzaxpyjs( *alpha, *xp, *yp );

            xp += 1; yp += 1;
        }
    }
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTFUNC3 is redefined here as a function template: each expansion emits
// a function whose name is built by PASTEMAC3 from the three type characters
// and opname, and whose body forwards all arguments to the function named
// by PASTEMAC3 from the same type characters and varname. As instantiated
// below, opname is axpyv_opt_var1 and varname is axpyv_unb_var1, so every
// mixed-type axpyv kernel simply calls its reference counterpart.
//
#undef GENTFUNC3
#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, opname, varname ) \
\
void PASTEMAC3(cha,chx,chy,opname)( \
conj_t conjx, \
dim_t n, \
ctype_a* restrict alpha, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
) \
{ \
/* Just call the reference implementation. */ \
PASTEMAC3(cha,chx,chy,varname)( conjx, \
n, \
alpha, \
x, incx, \
y, incy ); \
}
// Mixed-domain instances are generated only when the configuration enables
// them. (INSERT_GENTFUNC3_MIX_D is defined elsewhere; presumably it expands
// GENTFUNC3 once per mixed-domain type combination -- confirm.)
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3_MIX_D( axpyv_opt_var1, axpyv_unb_var1 )
#endif
// Likewise for mixed-precision instances.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3_MIX_P( axpyv_opt_var1, axpyv_unb_var1 )
#endif

View File

@@ -0,0 +1,59 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype axpyv kernel interfaces.
//
// GENTPROT3 is redefined here as a prototype template: each expansion
// declares one function, named by PASTEMAC3 from the three type characters
// and varname, taking the conjugation flag for x, the vector length n, the
// address of alpha, and the addresses and increments of x and y.
//
#undef GENTPROT3
#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname ) \
\
void PASTEMAC3(cha,chx,chy,varname)( \
conj_t conjx, \
dim_t n, \
ctype_a* restrict alpha, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
);
// Prototypes that are always emitted. (INSERT_GENTPROT3_BASIC is defined
// elsewhere; presumably it covers the homogeneous-type instances -- confirm.)
INSERT_GENTPROT3_BASIC( axpyv_opt_var1 )
// Mixed-domain prototypes, only when the configuration enables them.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3_MIX_D( axpyv_opt_var1 )
#endif
// Mixed-precision prototypes, only when the configuration enables them.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3_MIX_P( axpyv_opt_var1 )
#endif

View File

@@ -0,0 +1,345 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   dotv kernel entry point for float operands.

   No optimized code is supplied for this datatype in the template; every
   argument is handed, unmodified, to the reference implementation, which
   is responsible for writing the result to rho.
*/
void bli_sssdotv_opt_var1
     (
       conj_t          conjx,
       conj_t          conjy,
       dim_t           n,
       float* restrict x, inc_t incx,
       float* restrict y, inc_t incy,
       float* restrict rho
     )
{
    // Delegate to the reference kernel.
    bli_sssdotv_unb_var1( conjx, conjy, n, x, incx, y, incy, rho );
}
/*
   dotv kernel entry point for double operands.

   No optimized code is supplied for this datatype in the template; every
   argument is handed, unmodified, to the reference implementation, which
   is responsible for writing the result to rho.
*/
void bli_ddddotv_opt_var1
     (
       conj_t           conjx,
       conj_t           conjy,
       dim_t            n,
       double* restrict x, inc_t incx,
       double* restrict y, inc_t incy,
       double* restrict rho
     )
{
    // Delegate to the reference kernel.
    bli_ddddotv_unb_var1( conjx, conjy, n, x, incx, y, incy, rho );
}
/*
   dotv kernel entry point for scomplex operands.

   No optimized code is supplied for this datatype in the template; every
   argument is handed, unmodified, to the reference implementation, which
   is responsible for writing the result to rho.
*/
void bli_cccdotv_opt_var1
     (
       conj_t             conjx,
       conj_t             conjy,
       dim_t              n,
       scomplex* restrict x, inc_t incx,
       scomplex* restrict y, inc_t incy,
       scomplex* restrict rho
     )
{
    // Delegate to the reference kernel.
    bli_cccdotv_unb_var1( conjx, conjy, n, x, incx, y, incy, rho );
}
void bli_zzzdotv_opt_var1( conj_t             conjx,
                           conj_t             conjy,
                           dim_t              n,
                           dcomplex* restrict x, inc_t incx,
                           dcomplex* restrict y, inc_t incy,
                           dcomplex* restrict rho )
{
/*
  Template dotv kernel implementation

  This function contains a template implementation for a double-precision
  complex kernel, coded in C, which can serve as the starting point for one
  to write an optimized kernel on an arbitrary architecture. (We show a
  template implementation for only double-precision complex because the
  templates for the other three floating-point types would be similar, with
  the real instantiations being noticeably simpler due to the disappearance
  of conjugation in the real domain.)

  This kernel performs an inner (dot) product operation:

    rho := conjx( x^T ) * conjy( y )

  where x and y are vectors of length n and rho is a scalar.

  Parameters:

  - conjx:  Compute with conjugated values of x?
  - conjy:  Compute with conjugated values of y?
  - n:      The number of elements in vectors x and y.
  - x:      The address of vector x.
  - incx:   The vector increment of x. incx should be unit unless the
            implementation makes special accommodation for non-unit values.
  - y:      The address of vector y.
  - incy:   The vector increment of y. incy should be unit unless the
            implementation makes special accommodation for non-unit values.
  - rho:    The address of the output scalar. It is overwritten, never
            accumulated into; if n is zero it is set to zero.

  This template code calls the reference implementation if any of the
  following conditions are true:

  - Either of the strides incx or incy is non-unit.
  - Vectors x and y are unaligned with different offsets.
  - Vectors x and y share an offset that is not a whole number of elements
    (stepping by whole elements could then never reach an aligned address).

  If the vectors are aligned, or unaligned by the same offset, then optimized
  code can be used for the bulk of the computation. This template shows how
  the front-edge case can be handled so that the remaining computation is
  aligned. (This template guarantees alignment to be BLIS_SIMD_ALIGN_SIZE,
  which is defined in bli_config.h.)

  Additional things to consider:

  - While four combinations of possible values of conjx and conjy exist, we
    implement only conjugation on x explicitly; we induce the other two cases
    by toggling the effective conjugation on x and then conjugating the dot
    product result.
  - Because conjugation disappears in the real domain, real instances of
    this kernel can safely ignore the values of any conjugation parameters,
    thereby simplifying the implementation.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
    const dim_t n_elem_per_reg  = 1;
    const dim_t n_iter_unroll   = 1;

    const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;

    const siz_t type_size       = sizeof( *x );

    dcomplex*   xp;
    dcomplex*   yp;
    dcomplex    dotxy;

    bool_t      use_ref = FALSE;

    dim_t       n_pre = 0;
    dim_t       n_iter;
    dim_t       n_left;

    dim_t       off_x, off_y;
    dim_t       i;

    conj_t      conjx_use;

    // If the vector lengths are zero, set rho to zero and return.
    if ( bli_zero_dim1( n ) )
    {
        bli_zset0s( *rho );
        return;
    }

    // If there is anything that would interfere with our use of aligned
    // vector loads/stores, call the reference implementation.
    if ( bli_has_nonunit_inc2( incx, incy ) )
    {
        use_ref = TRUE;
    }
    else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) )
    {
        use_ref = TRUE;

        // If x and y are unaligned by the same offset, then we can still
        // use an implementation that depends on alignment for most of the
        // operation.
        // NOTE(review): this assumes bli_offset_from_alignment( p, a )
        // yields the number of bytes by which p lies past the previous
        // a-byte boundary -- confirm against its definition.
        off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
        off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );

        // The shared offset must also be a whole number of elements;
        // otherwise no element of x or y ever starts on a boundary.
        if ( off_x == off_y && off_x % type_size == 0 )
        {
            use_ref = FALSE;

            // Number of leading elements that precede the NEXT aligned
            // address (not the number that follow the previous one).
            n_pre = ( BLIS_SIMD_ALIGN_SIZE - off_x ) / type_size;

            // Never process more front-edge elements than the vectors hold.
            if ( n_pre > n ) n_pre = n;
        }
    }

    // Call the reference implementation if needed.
    if ( use_ref == TRUE )
    {
        bli_zzzdotv_unb_var1( conjx,
                              conjy,
                              n,
                              x, incx,
                              y, incy,
                              rho );
        return;
    }

    // Compute the number of unrolled and leftover (edge) iterations.
    n_iter = ( n - n_pre ) / n_elem_per_iter;
    n_left = ( n - n_pre ) % n_elem_per_iter;

    // Initialize pointers into x and y.
    xp = x;
    yp = y;

    // Initialize accumulator to zero.
    bli_zset0s( dotxy );

    conjx_use = conjx;

    // If y must be conjugated, we compute the result indirectly by first
    // toggling the effective conjugation of x and then conjugating the
    // resulting dot product.
    if ( bli_is_conj( conjy ) )
        bli_toggle_conj( conjx_use );

    // Iterate over elements of x and y to compute:
    //   rho = conjx( x^T ) * conjy( y );
    if ( bli_is_noconj( conjx_use ) )
    {
        // Compute front edge cases if x and y were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zzzdots( *xp, *yp, dotxy );

            xp += 1; yp += 1;
        }

        // The bulk of the operation is executed here. The addresses xp and
        // yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zzzdots( *xp, *yp, dotxy );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zzzdots( *xp, *yp, dotxy );

            xp += 1; yp += 1;
        }
    }
    else // if ( bli_is_conj( conjx_use ) )
    {
        // Compute front edge cases if x and y were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zzzdotjs( *xp, *yp, dotxy );

            xp += 1; yp += 1;
        }

        // The bulk of the operation is executed here. The addresses xp and
        // yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zzzdotjs( *xp, *yp, dotxy );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zzzdotjs( *xp, *yp, dotxy );

            xp += 1; yp += 1;
        }
    }

    // If conjugation on y was requested, we induce it by conjugating
    // the contents of dotxy.
    if ( bli_is_conj( conjy ) )
        bli_zconjs( dotxy );

    // Store the accumulated result to the output scalar.
    bli_zzcopys( dotxy, *rho );
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTFUNC3 is redefined here as a function template: each expansion emits
// a function whose name is built by PASTEMAC3 from the three type characters
// and opname, and whose body forwards all arguments to the function named
// by PASTEMAC3 from the same type characters and varname. As instantiated
// below, opname is dotv_opt_var1 and varname is dotv_unb_var1, so every
// mixed-type dotv kernel simply calls its reference counterpart.
//
#undef GENTFUNC3
#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, opname, varname ) \
\
void PASTEMAC3(chx,chy,chr,opname)( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy, \
ctype_r* restrict rho \
) \
{ \
/* Just call the reference implementation. */ \
PASTEMAC3(chx,chy,chr,varname)( conjx, \
conjy, \
n, \
x, incx, \
y, incy, \
rho ); \
}
// Mixed-domain instances are generated only when the configuration enables
// them. (INSERT_GENTFUNC3_MIX_D is defined elsewhere; presumably it expands
// GENTFUNC3 once per mixed-domain type combination -- confirm.)
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3_MIX_D( dotv_opt_var1, dotv_unb_var1 )
#endif
// Likewise for mixed-precision instances.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3_MIX_P( dotv_opt_var1, dotv_unb_var1 )
#endif

View File

@@ -0,0 +1,59 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype dotv kernel interfaces.
//
// GENTPROT3 is redefined here as a prototype template: each expansion
// declares one function, named by PASTEMAC3 from the three type characters
// and varname, taking the conjugation flags for x and y, the vector length
// n, the addresses and increments of x and y, and the address of the output
// scalar rho.
//
#undef GENTPROT3
#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \
\
void PASTEMAC3(chx,chy,chr,varname)( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy, \
ctype_r* restrict rho \
);
// Prototypes that are always emitted. (INSERT_GENTPROT3_BASIC is defined
// elsewhere; presumably it covers the homogeneous-type instances -- confirm.)
INSERT_GENTPROT3_BASIC( dotv_opt_var1 )
// Mixed-domain prototypes, only when the configuration enables them.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3_MIX_D( dotv_opt_var1 )
#endif
// Mixed-precision prototypes, only when the configuration enables them.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3_MIX_P( dotv_opt_var1 )
#endif

View File

@@ -0,0 +1,436 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   axpy2v kernel entry point for float operands.

   No optimized code is supplied for this datatype in the template; every
   argument is handed, unmodified, to the reference implementation.
*/
void bli_sssaxpy2v_opt_var1
     (
       conj_t          conjx,
       conj_t          conjy,
       dim_t           n,
       float* restrict alpha1,
       float* restrict alpha2,
       float* restrict x, inc_t incx,
       float* restrict y, inc_t incy,
       float* restrict z, inc_t incz
     )
{
    // Delegate to the reference kernel.
    bli_sssaxpy2v_unb_var1( conjx, conjy, n,
                            alpha1, alpha2,
                            x, incx, y, incy, z, incz );
}
/*
   axpy2v kernel entry point for double operands.

   No optimized code is supplied for this datatype in the template; every
   argument is handed, unmodified, to the reference implementation.
*/
void bli_dddaxpy2v_opt_var1
     (
       conj_t           conjx,
       conj_t           conjy,
       dim_t            n,
       double* restrict alpha1,
       double* restrict alpha2,
       double* restrict x, inc_t incx,
       double* restrict y, inc_t incy,
       double* restrict z, inc_t incz
     )
{
    // Delegate to the reference kernel.
    bli_dddaxpy2v_unb_var1( conjx, conjy, n,
                            alpha1, alpha2,
                            x, incx, y, incy, z, incz );
}
/*
   axpy2v kernel entry point for scomplex operands.

   No optimized code is supplied for this datatype in the template; every
   argument is handed, unmodified, to the reference implementation.
*/
void bli_cccaxpy2v_opt_var1
     (
       conj_t             conjx,
       conj_t             conjy,
       dim_t              n,
       scomplex* restrict alpha1,
       scomplex* restrict alpha2,
       scomplex* restrict x, inc_t incx,
       scomplex* restrict y, inc_t incy,
       scomplex* restrict z, inc_t incz
     )
{
    // Delegate to the reference kernel.
    bli_cccaxpy2v_unb_var1( conjx, conjy, n,
                            alpha1, alpha2,
                            x, incx, y, incy, z, incz );
}
void bli_zzzaxpy2v_opt_var1(
                             conj_t             conjx,
                             conj_t             conjy,
                             dim_t              n,
                             dcomplex* restrict alpha1,
                             dcomplex* restrict alpha2,
                             dcomplex* restrict x, inc_t incx,
                             dcomplex* restrict y, inc_t incy,
                             dcomplex* restrict z, inc_t incz
                           )
{
/*
  Template axpy2v kernel implementation

  This function contains a template implementation for a double-precision
  complex kernel, coded in C, which can serve as the starting point for one
  to write an optimized kernel on an arbitrary architecture. (We show a
  template implementation for only double-precision complex because the
  templates for the other three floating-point types would be similar, with
  the real instantiations being noticeably simpler due to the disappearance
  of conjugation in the real domain.)

  This kernel fuses two axpyv operations:

    z := z + alpha1 * conjx( x )
    z := z + alpha2 * conjy( y )

  where x, y, and z are vectors of length n and alpha1 and alpha2 are scalars.

  Parameters:

  - conjx:  Compute with conjugated values of x?
  - conjy:  Compute with conjugated values of y?
  - n:      The number of elements in vectors x, y, and z.
  - alpha1: The address of the scalar to be applied to x.
  - alpha2: The address of the scalar to be applied to y.
  - x:      The address of vector x.
  - incx:   The vector increment of x. incx should be unit unless the
            implementation makes special accommodation for non-unit values.
  - y:      The address of vector y.
  - incy:   The vector increment of y. incy should be unit unless the
            implementation makes special accommodation for non-unit values.
  - z:      The address of vector z.
  - incz:   The vector increment of z. incz should be unit unless the
            implementation makes special accommodation for non-unit values.

  This template code calls the reference implementation if any of the
  following conditions are true:

  - Any of the strides incx, incy, or incz is non-unit.
  - Vectors x, y, and z are unaligned with different offsets.
  - Vectors x, y, and z share an offset that is not a whole number of
    elements (stepping by whole elements could then never reach an aligned
    address).

  If the vectors are aligned, or unaligned by the same offset, then optimized
  code can be used for the bulk of the computation. This template shows how
  the front-edge case can be handled so that the remaining computation is
  aligned. (This template guarantees alignment in the main loops to be
  BLIS_SIMD_ALIGN_SIZE, which is defined in bli_config.h.)

  Here are a few additional things to consider:

  - Because conjugation disappears in the real domain, real instances of
    this kernel can safely ignore the values of any conjugation parameters,
    thereby simplifying the implementation.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
    const dim_t n_elem_per_reg  = 1;
    const dim_t n_iter_unroll   = 1;

    const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;

    const siz_t type_size       = sizeof( *x );

    dcomplex*   xp;
    dcomplex*   yp;
    dcomplex*   zp;

    bool_t      use_ref = FALSE;

    dim_t       n_pre = 0;
    dim_t       n_iter;
    dim_t       n_left;

    dim_t       off_x, off_y, off_z;
    dim_t       i;

    // Return early if possible.
    if ( bli_zero_dim1( n ) ) return;

    // If there is anything that would interfere with our use of aligned
    // vector loads/stores, call the reference implementation.
    if ( bli_has_nonunit_inc3( incx, incy, incz ) )
    {
        use_ref = TRUE;
    }
    else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( z, BLIS_SIMD_ALIGN_SIZE ) )
    {
        use_ref = TRUE;

        // If x, y, and z are unaligned by the same offset, then we can
        // still use an implementation that depends on alignment for most
        // of the operation.
        // NOTE(review): this assumes bli_offset_from_alignment( p, a )
        // yields the number of bytes by which p lies past the previous
        // a-byte boundary -- confirm against its definition.
        off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
        off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
        off_z = bli_offset_from_alignment( z, BLIS_SIMD_ALIGN_SIZE );

        // The shared offset must also be a whole number of elements;
        // otherwise no element of x, y, or z ever starts on a boundary.
        if ( off_x == off_y && off_x == off_z && off_x % type_size == 0 )
        {
            use_ref = FALSE;

            // Number of leading elements that precede the NEXT aligned
            // address (not the number that follow the previous one).
            n_pre = ( BLIS_SIMD_ALIGN_SIZE - off_x ) / type_size;

            // Never process more front-edge elements than the vectors hold.
            if ( n_pre > n ) n_pre = n;
        }
    }

    // Call the reference implementation if needed.
    if ( use_ref == TRUE )
    {
        bli_zzzaxpy2v_unb_var1( conjx,
                                conjy,
                                n,
                                alpha1,
                                alpha2,
                                x, incx,
                                y, incy,
                                z, incz );
        return;
    }

    // Compute the number of unrolled and leftover (edge) iterations.
    n_iter = ( n - n_pre ) / n_elem_per_iter;
    n_left = ( n - n_pre ) % n_elem_per_iter;

    // Initialize pointers into x, y, and z.
    xp = x;
    yp = y;
    zp = z;

    // Iterate over elements of x, y, and z to compute:
    //   z += alpha1 * conjx( x ) + alpha2 * conjy( y );
    if ( bli_is_noconj( conjx ) && bli_is_noconj( conjy ) )
    {
        // Compute front edge cases if x, y, and z were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zzzaxpys( *alpha1, *xp, *zp );
            bli_zzzaxpys( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // alpha1 and alpha2 should be loaded once prior to the n_iter
        // loop and the elements of z should be loaded and stored only once
        // each. The addresses xp, yp, and zp are guaranteed to be aligned
        // to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zzzaxpys( *alpha1, *xp, *zp );
            bli_zzzaxpys( *alpha2, *yp, *zp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
            zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zzzaxpys( *alpha1, *xp, *zp );
            bli_zzzaxpys( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }
    }
    else if ( bli_is_noconj( conjx ) && bli_is_conj( conjy ) )
    {
        // Compute front edge cases if x, y, and z were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zzzaxpys( *alpha1, *xp, *zp );
            bli_zzzaxpyjs( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }

        // The bulk of the operation is executed here; the same performance
        // and alignment notes as in the first case apply.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zzzaxpys( *alpha1, *xp, *zp );
            bli_zzzaxpyjs( *alpha2, *yp, *zp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
            zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zzzaxpys( *alpha1, *xp, *zp );
            bli_zzzaxpyjs( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }
    }
    else if ( bli_is_conj( conjx ) && bli_is_noconj( conjy ) )
    {
        // Compute front edge cases if x, y, and z were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zzzaxpyjs( *alpha1, *xp, *zp );
            bli_zzzaxpys( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }

        // The bulk of the operation is executed here; the same performance
        // and alignment notes as in the first case apply.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zzzaxpyjs( *alpha1, *xp, *zp );
            bli_zzzaxpys( *alpha2, *yp, *zp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
            zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zzzaxpyjs( *alpha1, *xp, *zp );
            bli_zzzaxpys( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }
    }
    else // if ( bli_is_conj( conjx ) && bli_is_conj( conjy ) )
    {
        // Compute front edge cases if x, y, and z were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zzzaxpyjs( *alpha1, *xp, *zp );
            bli_zzzaxpyjs( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }

        // The bulk of the operation is executed here; the same performance
        // and alignment notes as in the first case apply.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zzzaxpyjs( *alpha1, *xp, *zp );
            bli_zzzaxpyjs( *alpha2, *yp, *zp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
            zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zzzaxpyjs( *alpha1, *xp, *zp );
            bli_zzzaxpyjs( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }
    }
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTFUNC3U12 is redefined here as a function template: each expansion
// emits a function whose name is built by PASTEMAC3 from the three type
// characters and varname, and whose body forwards all arguments to the
// function named by PASTEMAC3 from the same type characters and kername.
// The two scalars alpha1 and alpha2 use the fourth type, ctype_xy. As
// instantiated below, varname is axpy2v_opt_var1 and kername is
// axpy2v_unb_var1, so every mixed-type axpy2v kernel simply calls its
// reference counterpart.
//
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
\
void PASTEMAC3(chx,chy,chz,varname)( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype_xy* restrict alpha1, \
ctype_xy* restrict alpha2, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy, \
ctype_z* restrict z, inc_t incz \
) \
{ \
/* Just call the reference implementation. */ \
PASTEMAC3(chx,chy,chz,kername)( conjx, \
conjy, \
n, \
alpha1, \
alpha2, \
x, incx, \
y, incy, \
z, incz ); \
}
// Mixed-domain instances are generated only when the configuration enables
// them. (INSERT_GENTFUNC3U12_MIX_D is defined elsewhere; presumably it
// expands GENTFUNC3U12 once per mixed-domain type combination -- confirm.)
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( axpy2v_opt_var1, axpy2v_unb_var1 )
#endif
// Likewise for mixed-precision instances.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( axpy2v_opt_var1, axpy2v_unb_var1 )
#endif

View File

@@ -0,0 +1,58 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype axpy2v kernel interfaces.
//
// GENTPROT3U12 is redefined here as a prototype template: each expansion
// declares one function, named by PASTEMAC3 from the three type characters
// and varname. The vector-length parameter is named n, matching the kernel
// definitions (prototype parameter names do not affect callers).
//
#undef GENTPROT3U12
#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname ) \
\
void PASTEMAC3(chx,chy,chz,varname)( \
                                     conj_t             conjx, \
                                     conj_t             conjy, \
                                     dim_t              n, \
                                     ctype_xy* restrict alpha1, \
                                     ctype_xy* restrict alpha2, \
                                     ctype_x*  restrict x, inc_t incx, \
                                     ctype_y*  restrict y, inc_t incy, \
                                     ctype_z*  restrict z, inc_t incz  \
                                   );

// Prototypes that are always emitted.
INSERT_GENTPROT3U12_BASIC( axpy2v_opt_var1 )

// Mixed-domain prototypes, only when the configuration enables them.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3U12_MIX_D( axpy2v_opt_var1 )
#endif

// Mixed-precision prototypes, only when the configuration enables them.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3U12_MIX_P( axpy2v_opt_var1 )
#endif

View File

@@ -0,0 +1,416 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Single-precision real axpyf kernel.
//
// The template configuration supplies no optimized code for this datatype,
// so every argument is handed, unmodified, to the reference variant
// bli_sssaxpyf_unb_var1().
void bli_sssaxpyf_opt_var1
     (
       conj_t          conja,
       conj_t          conjx,
       dim_t           m,
       dim_t           b_n,
       float* restrict alpha,
       float* restrict a, inc_t inca, inc_t lda,
       float* restrict x, inc_t incx,
       float* restrict y, inc_t incy
     )
{
    bli_sssaxpyf_unb_var1( conja, conjx,
                           m, b_n,
                           alpha,
                           a, inca, lda,
                           x, incx,
                           y, incy );
}
// Double-precision real axpyf kernel.
//
// The template configuration supplies no optimized code for this datatype,
// so every argument is handed, unmodified, to the reference variant
// bli_dddaxpyf_unb_var1().
void bli_dddaxpyf_opt_var1
     (
       conj_t           conja,
       conj_t           conjx,
       dim_t            m,
       dim_t            b_n,
       double* restrict alpha,
       double* restrict a, inc_t inca, inc_t lda,
       double* restrict x, inc_t incx,
       double* restrict y, inc_t incy
     )
{
    bli_dddaxpyf_unb_var1( conja, conjx,
                           m, b_n,
                           alpha,
                           a, inca, lda,
                           x, incx,
                           y, incy );
}
// Single-precision complex axpyf kernel.
//
// The template configuration supplies no optimized code for this datatype,
// so every argument is handed, unmodified, to the reference variant
// bli_cccaxpyf_unb_var1().
void bli_cccaxpyf_opt_var1
     (
       conj_t             conja,
       conj_t             conjx,
       dim_t              m,
       dim_t              b_n,
       scomplex* restrict alpha,
       scomplex* restrict a, inc_t inca, inc_t lda,
       scomplex* restrict x, inc_t incx,
       scomplex* restrict y, inc_t incy
     )
{
    bli_cccaxpyf_unb_var1( conja, conjx,
                           m, b_n,
                           alpha,
                           a, inca, lda,
                           x, incx,
                           y, incy );
}
void bli_zzzaxpyf_opt_var1(
                            conj_t             conja,
                            conj_t             conjx,
                            dim_t              m,
                            dim_t              b_n,
                            dcomplex* restrict alpha,
                            dcomplex* restrict a, inc_t inca, inc_t lda,
                            dcomplex* restrict x, inc_t incx,
                            dcomplex* restrict y, inc_t incy
                          )
{
/*
  Template axpyf kernel implementation

  This function contains a template implementation for a double-precision
  complex kernel, coded in C, which can serve as the starting point for one
  to write an optimized kernel on an arbitrary architecture. (We show a
  template implementation for only double-precision complex because the
  templates for the other three floating-point types would be similar, with
  the real instantiations being noticeably simpler due to the disappearance
  of conjugation in the real domain.)

  This kernel performs the following gemv-like operation:

    y := y + alpha * conja( A ) * conjx( x )

  where A is an m x b_n matrix, x is a vector of length b_n, y is a vector
  of length m, and alpha is a scalar. The operation is performed as a series
  of fused axpyv operations, and therefore A should be column-stored.

  Parameters:

  - conja:  Compute with conjugated values of A?
  - conjx:  Compute with conjugated values of x?
  - m:      The number of rows in matrix A.
  - b_n:    The number of columns in matrix A. Must be equal to or less than
            the fusing factor. (Any value other than the fusing factor
            itself is forwarded to the reference implementation; see below.)
  - alpha:  The address of a scalar.
  - a:      The address of matrix A.
  - inca:   The row stride of A. inca should be unit unless the
            implementation makes special accommodation for non-unit values.
  - lda:    The column stride of A.
  - x:      The address of vector x.
  - incx:   The vector increment of x.
  - y:      The address of vector y.
  - incy:   The vector increment of y. incy should be unit unless the
            implementation makes special accommodation for non-unit values.

  This template code calls the reference implementation if any of the
  following conditions are true:

  - b_n is not equal to the fusing factor.
  - Any of the strides inca, incx, or incy is non-unit. (The loops below
    index x as x + j*incx, so only inca and incy strictly need to be unit;
    the additional test on incx is conservative.)
  - The address of A, the second column of A, and y are unaligned with
    different offsets.

  If the first/second columns of A and address of y are aligned, or unaligned
  by the same offset, then optimized code can be used for the bulk of the
  computation. This template shows how the front-edge case can be handled so
  that the remaining computation is aligned. (This template guarantees
  alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE, which is defined
  in bli_config.h.)

  Additional things to consider:

  - When optimizing, you should fully unroll the loops over b_n. This is the
    dimension across which we are fusing axpyv operations.
  - This template code chooses to call the reference implementation whenever
    b_n differs from the fusing factor, so as to avoid having to handle edge
    cases. One may choose to optimize the b_n < fusing factor edge case, if
    desired. A b_n greater than the fusing factor violates the kernel's
    contract; it is also routed to the reference implementation because the
    local arrays below hold only bli_zaxpyf_fusefac entries.
  - Because conjugation disappears in the real domain, real instances of
    this kernel can safely ignore the values of any conjugation parameters,
    thereby simplifying the implementation.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
    const dim_t        n_elem_per_reg  = 1;
    const dim_t        n_iter_unroll   = 1;

    const dim_t        n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
    const siz_t        type_size       = sizeof( *a );

    // Per-column state. Each array holds exactly bli_zaxpyf_fusefac entries,
    // which is why b_n must not exceed the fusing factor past the guard
    // below.
    dcomplex*          ap[ bli_zaxpyf_fusefac ];
    dcomplex*          xp[ bli_zaxpyf_fusefac ];
    dcomplex*          yp;

    dcomplex           alpha_x[ bli_zaxpyf_fusefac ];

    bool_t             use_ref = FALSE;

    dim_t              m_pre = 0;
    dim_t              m_iter;
    dim_t              m_left;

    dim_t              off_a, off_a2, off_y;

    dim_t              i, j;

    // Return early if possible.
    if ( bli_zero_dim2( m, b_n ) ) return;

    // If there is anything that would interfere with our use of aligned
    // vector loads/stores, call the reference implementation.
    if ( b_n != bli_zaxpyf_fusefac )
    {
        // b_n < fusefac is an unoptimized edge case; b_n > fusefac would
        // index past the end of ap[], xp[], and alpha_x[].
        use_ref = TRUE;
    }
    else if ( bli_has_nonunit_inc3( inca, incx, incy ) )
    {
        use_ref = TRUE;
    }
    else if ( bli_is_unaligned_to( a,     BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( y,     BLIS_SIMD_ALIGN_SIZE ) )
    {
        use_ref = TRUE;

        // If a, the second column of a, and y are unaligned by the same
        // offset, then we can still use an implementation that depends on
        // alignment for most of the operation.
        off_a  = bli_offset_from_alignment( a,     BLIS_SIMD_ALIGN_SIZE );
        off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE );
        off_y  = bli_offset_from_alignment( y,     BLIS_SIMD_ALIGN_SIZE );

        if ( off_a == off_y && off_a == off_a2 )
        {
            use_ref = FALSE;

            // NOTE(review): this converts the value reported by
            // bli_offset_from_alignment() directly into an element count.
            // Confirm against that macro's definition that the value is the
            // distance up to the next aligned address (rather than past the
            // previous one) before relying on it for other datatypes.
            m_pre   = off_a / type_size;
        }
    }

    // Call the reference implementation if needed.
    if ( use_ref == TRUE )
    {
        bli_zzzaxpyf_unb_var1( conja,
                               conjx,
                               m,
                               b_n,
                               alpha,
                               a, inca, lda,
                               x, incx,
                               y, incy );
        return;
    }

    // Compute the number of unrolled and leftover (edge) iterations.
    m_iter = ( m - m_pre ) / n_elem_per_iter;
    m_left = ( m - m_pre ) % n_elem_per_iter;

    // Initialize pointers into the columns of A and elements of x.
    for ( j = 0; j < b_n; ++j )
    {
        ap[ j ] = a + (j  )*lda;
        xp[ j ] = x + (j  )*incx;
    }
    yp = y;

    // Load elements of x or conj(x) into alpha_x and scale by alpha.
    if ( bli_is_noconj( conjx ) )
    {
        for ( j = 0; j < b_n; ++j )
        {
            bli_zzcopys( *xp[ j ], alpha_x[ j ] );
            bli_zzscals( *alpha, alpha_x[ j ] );
        }
    }
    else // if ( bli_is_conj( conjx ) )
    {
        for ( j = 0; j < b_n; ++j )
        {
            bli_zzcopyjs( *xp[ j ], alpha_x[ j ] );
            bli_zzscals( *alpha, alpha_x[ j ] );
        }
    }

    // Iterate over rows of A and y to compute:
    //   y += alpha * conja( A ) * conjx( x );
    // (alpha and conjx have already been folded into alpha_x above.)
    if ( bli_is_noconj( conja ) )
    {
        // Compute front edge cases if a and y were unaligned.
        for ( i = 0; i < m_pre; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zzzaxpys( alpha_x[ j ], *ap[ j ], *yp );

                ap[ j ] += 1;
            }

            yp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // the elements of alpha_x should be loaded once prior to the m_iter
        // loop, and the b_n loop should be fully unrolled. The addresses in
        // ap[] and yp are guaranteed to be aligned to
        // BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < m_iter; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zzzaxpys( alpha_x[ j ], *ap[ j ], *yp );

                ap[ j ] += n_elem_per_iter;
            }

            yp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < m_left; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zzzaxpys( alpha_x[ j ], *ap[ j ], *yp );

                ap[ j ] += 1;
            }

            yp += 1;
        }
    }
    else // if ( bli_is_conj( conja ) )
    {
        // Compute front edge cases if a and y were unaligned.
        for ( i = 0; i < m_pre; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zzzaxpyjs( alpha_x[ j ], *ap[ j ], *yp );

                ap[ j ] += 1;
            }

            yp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // the elements of alpha_x should be loaded once prior to the m_iter
        // loop, and the b_n loop should be fully unrolled. The addresses in
        // ap[] and yp are guaranteed to be aligned to
        // BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < m_iter; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zzzaxpyjs( alpha_x[ j ], *ap[ j ], *yp );

                ap[ j ] += n_elem_per_iter;
            }

            yp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < m_left; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zzzaxpyjs( alpha_x[ j ], *ap[ j ], *yp );

                ap[ j ] += 1;
            }

            yp += 1;
        }
    }
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// The macro below generates one axpyf_opt_var1 function per mixed-type
// combination. No optimized mixed-type code is provided by the template,
// so each generated function passes its arguments straight through to the
// kernel named by kername.
//
#undef  GENTFUNC3U12
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
\
void PASTEMAC3(cha,chx,chy,varname) \
     ( \
       conj_t             conja, \
       conj_t             conjx, \
       dim_t              m, \
       dim_t              b_n, \
       ctype_ax* restrict alpha, \
       ctype_a*  restrict a, inc_t inca, inc_t lda, \
       ctype_x*  restrict x, inc_t incx, \
       ctype_y*  restrict y, inc_t incy \
     ) \
{ \
    /* Defer to the reference implementation. */ \
    PASTEMAC3(cha,chx,chy,kername) \
    ( \
      conja, conjx, \
      m, b_n, \
      alpha, \
      a, inca, lda, \
      x, incx, \
      y, incy \
    ); \
}

#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( axpyf_opt_var1, axpyf_unb_var1 )
#endif

#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( axpyf_opt_var1, axpyf_unb_var1 )
#endif

View File

@@ -0,0 +1,62 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype axpyf kernel interfaces.
//
// The macro below declares one axpyf_opt_var1 function per type
// combination. Parameters of each generated prototype:
// - conja, conjx: conjugation flags for A and x.
// - m, b_n:       the two dimensions passed to the kernel.
// - alpha:        address of the scalar (type ctype_ax).
// - a, inca, lda: address of A followed by its two strides.
// - x, incx:      address of x followed by its increment.
// - y, incy:      address of y followed by its increment.
//
#undef GENTPROT3U12
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
\
void PASTEMAC3(cha,chx,chy,varname)( \
conj_t conja, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype_ax* restrict alpha, \
ctype_a* restrict a, inc_t inca, inc_t lda, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
);
// Homogeneous-type prototypes are always declared; the mixed-domain and
// mixed-precision ones only when the corresponding macro is defined.
INSERT_GENTPROT3U12_BASIC( axpyf_opt_var1 )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3U12_MIX_D( axpyf_opt_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3U12_MIX_P( axpyf_opt_var1 )
#endif

View File

@@ -0,0 +1,470 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Single-precision real dotaxpyv kernel.
//
// The template configuration supplies no optimized code for this datatype,
// so every argument is handed, unmodified, to the reference variant
// bli_sssdotaxpyv_unb_var1().
void bli_sssdotaxpyv_opt_var1
     (
       conj_t          conjxt,
       conj_t          conjx,
       conj_t          conjy,
       dim_t           n,
       float* restrict alpha,
       float* restrict x, inc_t incx,
       float* restrict y, inc_t incy,
       float* restrict rho,
       float* restrict z, inc_t incz
     )
{
    bli_sssdotaxpyv_unb_var1( conjxt, conjx, conjy,
                              n,
                              alpha,
                              x, incx,
                              y, incy,
                              rho,
                              z, incz );
}
// Double-precision real dotaxpyv kernel.
//
// The template configuration supplies no optimized code for this datatype,
// so every argument is handed, unmodified, to the reference variant
// bli_ddddotaxpyv_unb_var1().
void bli_ddddotaxpyv_opt_var1
     (
       conj_t           conjxt,
       conj_t           conjx,
       conj_t           conjy,
       dim_t            n,
       double* restrict alpha,
       double* restrict x, inc_t incx,
       double* restrict y, inc_t incy,
       double* restrict rho,
       double* restrict z, inc_t incz
     )
{
    bli_ddddotaxpyv_unb_var1( conjxt, conjx, conjy,
                              n,
                              alpha,
                              x, incx,
                              y, incy,
                              rho,
                              z, incz );
}
// Single-precision complex dotaxpyv kernel.
//
// The template configuration supplies no optimized code for this datatype,
// so every argument is handed, unmodified, to the reference variant
// bli_cccdotaxpyv_unb_var1().
void bli_cccdotaxpyv_opt_var1
     (
       conj_t             conjxt,
       conj_t             conjx,
       conj_t             conjy,
       dim_t              n,
       scomplex* restrict alpha,
       scomplex* restrict x, inc_t incx,
       scomplex* restrict y, inc_t incy,
       scomplex* restrict rho,
       scomplex* restrict z, inc_t incz
     )
{
    bli_cccdotaxpyv_unb_var1( conjxt, conjx, conjy,
                              n,
                              alpha,
                              x, incx,
                              y, incy,
                              rho,
                              z, incz );
}
void bli_zzzdotaxpyv_opt_var1
     (
       conj_t             conjxt,
       conj_t             conjx,
       conj_t             conjy,
       dim_t              n,
       dcomplex* restrict alpha,
       dcomplex* restrict x, inc_t incx,
       dcomplex* restrict y, inc_t incy,
       dcomplex* restrict rho,
       dcomplex* restrict z, inc_t incz
     )
{
/*
  Template dotaxpyv kernel implementation

  This is a C-coded template for the double-precision complex dotaxpyv
  kernel, meant as a starting point for writing an optimized kernel on an
  arbitrary architecture. Only the double-precision complex case is shown;
  templates for the other three floating-point types would look similar,
  and the real ones would be noticeably simpler because conjugation
  disappears in the real domain.

  The kernel fuses a dotv and an axpyv operation:

    rho := conjxt( x^T ) * conjy( y )
    z   := z + alpha * conjx( x )

  where x, y, and z are vectors of length n and alpha is a scalar.

  Parameters:

  - conjxt: Compute with conjugated values of x^T?
  - conjx:  Compute with conjugated values of x?
  - conjy:  Compute with conjugated values of y?
  - n:      The number of elements in vectors x, y, and z.
  - alpha:  The address of the scalar to be applied to x.
  - x:      The address of vector x.
  - incx:   The vector increment of x. incx should be unit unless the
            implementation makes special accommodation for non-unit values.
  - y:      The address of vector y.
  - incy:   The vector increment of y. incy should be unit unless the
            implementation makes special accommodation for non-unit values.
  - rho:    The address of the output scalar of the dotv subproblem.
  - z:      The address of vector z.
  - incz:   The vector increment of z. incz should be unit unless the
            implementation makes special accommodation for non-unit values.

  The reference implementation is called instead if either of the
  following holds:

  - Any of the strides incx, incy, or incz is non-unit.
  - Vectors x, y, and z are unaligned with different offsets.

  If the vectors are aligned, or unaligned by the same offset, the bulk of
  the computation can use optimized code. The template shows how a
  front-edge loop can consume the leading elements so that the remaining
  computation is aligned. (This template guarantees alignment in the main
  loops to be BLIS_SIMD_ALIGN_SIZE, which is defined in bli_config.h.)

  Additional things to consider:

  - Four combinations of conjxt and conjy exist, but only conjugation of
    x^T is implemented explicitly. When conjy requests conjugation, the
    effective conjugation of x^T is toggled and the finished dot product
    is conjugated, which yields the same result.
  - Because conjugation disappears in the real domain, real instances of
    this kernel can safely ignore the values of any conjugation parameters,
    thereby simplifying the implementation.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
    const dim_t        n_elem_per_reg  = 1;
    const dim_t        n_iter_unroll   = 1;
    const dim_t        n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
    const siz_t        type_size       = sizeof( *x );

    dcomplex*          x_cur;
    dcomplex*          y_cur;
    dcomplex*          z_cur;
    dcomplex           rho_acc;

    bool_t             use_ref = FALSE;

    dim_t              n_pre   = 0;
    dim_t              n_iter;
    dim_t              n_left;
    dim_t              off_x, off_y, off_z;
    dim_t              i;

    conj_t             conjxt_use;

    // Nothing to accumulate for empty vectors: rho becomes zero.
    if ( bli_zero_dim1( n ) )
    {
        bli_zset0s( *rho );
        return;
    }

    // Decide whether the aligned code path can be used at all.
    if ( bli_has_nonunit_inc3( incx, incy, incz ) )
    {
        use_ref = TRUE;
    }
    else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( z, BLIS_SIMD_ALIGN_SIZE ) )
    {
        off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
        off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
        off_z = bli_offset_from_alignment( z, BLIS_SIMD_ALIGN_SIZE );

        if ( off_x == off_y && off_x == off_z )
        {
            // A shared offset still permits the alignment-dependent code
            // for most of the operation, once a front edge is peeled off.
            n_pre = off_x / type_size;
        }
        else
        {
            // Differing offsets: the vectors cannot be aligned together.
            use_ref = TRUE;
        }
    }

    // Fall back to the reference implementation when required.
    if ( use_ref == TRUE )
    {
        bli_zzzdotaxpyv_unb_var1( conjxt, conjx, conjy,
                                  n,
                                  alpha,
                                  x, incx,
                                  y, incy,
                                  rho,
                                  z, incz );
        return;
    }

    // Split what follows the front edge into unrolled and leftover
    // iterations.
    n_iter = ( n - n_pre ) / n_elem_per_iter;
    n_left = ( n - n_pre ) % n_elem_per_iter;

    // Running pointers into x, y, and z.
    x_cur = x;
    y_cur = y;
    z_cur = z;

    // The dot product is accumulated locally, starting from zero.
    bli_zset0s( rho_acc );

    // Conjugation of y is induced indirectly: toggle the effective
    // conjugation of x^T here and conjugate the dot product at the end.
    conjxt_use = conjxt;
    if ( bli_is_conj( conjy ) )
    {
        bli_toggle_conj( conjxt_use );
    }

    // Walk x, y, and z computing:
    //   rho = conjxt( x^T ) * conjy( y );
    //   z  += alpha * conjx( x );
    //
    // Every case below has the same three phases: a front-edge loop (run
    // only if x, y, and z were unaligned), the main loop, and a tail-edge
    // loop. For best performance in the main loop, alpha should be loaded
    // once before the loop, the accumulator should be kept in registers,
    // and each element of x should be loaded only once. In the main loop
    // the addresses x_cur, y_cur, and z_cur are guaranteed to be aligned
    // to BLIS_SIMD_ALIGN_SIZE.
    if ( bli_is_noconj( conjx ) && bli_is_noconj( conjxt_use ) )
    {
        // Front edge.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zzzdots( *x_cur, *y_cur, rho_acc );
            bli_zzzaxpys( *alpha, *x_cur, *z_cur );

            ++x_cur; ++y_cur; ++z_cur;
        }

        // Main loop.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zzzdots( *x_cur, *y_cur, rho_acc );
            bli_zzzaxpys( *alpha, *x_cur, *z_cur );

            x_cur += n_elem_per_iter;
            y_cur += n_elem_per_iter;
            z_cur += n_elem_per_iter;
        }

        // Tail edge.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zzzdots( *x_cur, *y_cur, rho_acc );
            bli_zzzaxpys( *alpha, *x_cur, *z_cur );

            ++x_cur; ++y_cur; ++z_cur;
        }
    }
    else if ( bli_is_noconj( conjx ) && bli_is_conj( conjxt_use ) )
    {
        // Front edge.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zzzdotjs( *x_cur, *y_cur, rho_acc );
            bli_zzzaxpys( *alpha, *x_cur, *z_cur );

            ++x_cur; ++y_cur; ++z_cur;
        }

        // Main loop.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zzzdotjs( *x_cur, *y_cur, rho_acc );
            bli_zzzaxpys( *alpha, *x_cur, *z_cur );

            x_cur += n_elem_per_iter;
            y_cur += n_elem_per_iter;
            z_cur += n_elem_per_iter;
        }

        // Tail edge.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zzzdotjs( *x_cur, *y_cur, rho_acc );
            bli_zzzaxpys( *alpha, *x_cur, *z_cur );

            ++x_cur; ++y_cur; ++z_cur;
        }
    }
    else if ( bli_is_conj( conjx ) && bli_is_noconj( conjxt_use ) )
    {
        // Front edge.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zzzdots( *x_cur, *y_cur, rho_acc );
            bli_zzzaxpyjs( *alpha, *x_cur, *z_cur );

            ++x_cur; ++y_cur; ++z_cur;
        }

        // Main loop.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zzzdots( *x_cur, *y_cur, rho_acc );
            bli_zzzaxpyjs( *alpha, *x_cur, *z_cur );

            x_cur += n_elem_per_iter;
            y_cur += n_elem_per_iter;
            z_cur += n_elem_per_iter;
        }

        // Tail edge.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zzzdots( *x_cur, *y_cur, rho_acc );
            bli_zzzaxpyjs( *alpha, *x_cur, *z_cur );

            ++x_cur; ++y_cur; ++z_cur;
        }
    }
    else // if ( bli_is_conj( conjx ) && bli_is_conj( conjxt_use ) )
    {
        // Front edge.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zzzdotjs( *x_cur, *y_cur, rho_acc );
            bli_zzzaxpyjs( *alpha, *x_cur, *z_cur );

            ++x_cur; ++y_cur; ++z_cur;
        }

        // Main loop.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zzzdotjs( *x_cur, *y_cur, rho_acc );
            bli_zzzaxpyjs( *alpha, *x_cur, *z_cur );

            x_cur += n_elem_per_iter;
            y_cur += n_elem_per_iter;
            z_cur += n_elem_per_iter;
        }

        // Tail edge.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zzzdotjs( *x_cur, *y_cur, rho_acc );
            bli_zzzaxpyjs( *alpha, *x_cur, *z_cur );

            ++x_cur; ++y_cur; ++z_cur;
        }
    }

    // Complete the induced conjugation of y by conjugating the accumulated
    // dot product, then store the result to rho.
    if ( bli_is_conj( conjy ) )
    {
        bli_zconjs( rho_acc );
    }

    bli_zzcopys( rho_acc, *rho );
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// The macro below generates one dotaxpyv_opt_var1 function per mixed-type
// combination. No optimized mixed-type code is provided by the template,
// so each generated function passes its arguments straight through to the
// kernel named by kername.
//
#undef  GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
\
void PASTEMAC3(chx,chy,chz,varname) \
     ( \
       conj_t             conjxt, \
       conj_t             conjx, \
       conj_t             conjy, \
       dim_t              n, \
       ctype_x*  restrict alpha, \
       ctype_x*  restrict x, inc_t incx, \
       ctype_y*  restrict y, inc_t incy, \
       ctype_xy* restrict rho, \
       ctype_z*  restrict z, inc_t incz \
     ) \
{ \
    /* Defer to the reference implementation. */ \
    PASTEMAC3(chx,chy,chz,kername) \
    ( \
      conjxt, conjx, conjy, \
      n, \
      alpha, \
      x, incx, \
      y, incy, \
      rho, \
      z, incz \
    ); \
}

#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotaxpyv_opt_var1, dotaxpyv_unb_var1 )
#endif

#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotaxpyv_opt_var1, dotaxpyv_unb_var1 )
#endif

View File

@@ -0,0 +1,60 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype dotaxpyv kernel interfaces.
//
// The macro below declares one dotaxpyv_opt_var1 function per type
// combination. Parameters of each generated prototype:
// - conjxt, conjx, conjy: conjugation flags.
// - n:                    the vector length.
// - alpha:                address of the scalar (declared with type ctype_x).
// - x, y, z:              vector addresses, each followed by its increment.
// - rho:                  address of the output scalar (type ctype_xy).
//
#undef GENTPROT3U12
#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname ) \
\
void PASTEMAC3(chx,chy,chz,varname)( \
conj_t conjxt, \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype_x* restrict alpha, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy, \
ctype_xy* restrict rho, \
ctype_z* restrict z, inc_t incz \
);
// Homogeneous-type prototypes are always declared; the mixed-domain and
// mixed-precision ones only when the corresponding macro is defined.
INSERT_GENTPROT3U12_BASIC( dotaxpyv_opt_var1 )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3U12_MIX_D( dotaxpyv_opt_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3U12_MIX_P( dotaxpyv_opt_var1 )
#endif

View File

@@ -0,0 +1,610 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Single-precision real dotxaxpyf kernel.
//
// The template configuration supplies no optimized code for this datatype,
// so every argument is handed, unmodified, to the reference variant
// bli_sssdotxaxpyf_unb_var1().
void bli_sssdotxaxpyf_opt_var1
     (
       conj_t          conjat,
       conj_t          conja,
       conj_t          conjw,
       conj_t          conjx,
       dim_t           m,
       dim_t           b_n,
       float* restrict alpha,
       float* restrict a, inc_t inca, inc_t lda,
       float* restrict w, inc_t incw,
       float* restrict x, inc_t incx,
       float* restrict beta,
       float* restrict y, inc_t incy,
       float* restrict z, inc_t incz
     )
{
    bli_sssdotxaxpyf_unb_var1( conjat, conja, conjw, conjx,
                               m, b_n,
                               alpha,
                               a, inca, lda,
                               w, incw,
                               x, incx,
                               beta,
                               y, incy,
                               z, incz );
}
// Double-precision real dotxaxpyf kernel.
//
// The template configuration supplies no optimized code for this datatype,
// so every argument is handed, unmodified, to the reference variant
// bli_ddddotxaxpyf_unb_var1().
void bli_ddddotxaxpyf_opt_var1
     (
       conj_t           conjat,
       conj_t           conja,
       conj_t           conjw,
       conj_t           conjx,
       dim_t            m,
       dim_t            b_n,
       double* restrict alpha,
       double* restrict a, inc_t inca, inc_t lda,
       double* restrict w, inc_t incw,
       double* restrict x, inc_t incx,
       double* restrict beta,
       double* restrict y, inc_t incy,
       double* restrict z, inc_t incz
     )
{
    bli_ddddotxaxpyf_unb_var1( conjat, conja, conjw, conjx,
                               m, b_n,
                               alpha,
                               a, inca, lda,
                               w, incw,
                               x, incx,
                               beta,
                               y, incy,
                               z, incz );
}
// Single-precision complex dotxaxpyf kernel.
//
// The template configuration supplies no optimized code for this datatype,
// so every argument is handed, unmodified, to the reference variant
// bli_cccdotxaxpyf_unb_var1().
void bli_cccdotxaxpyf_opt_var1
     (
       conj_t             conjat,
       conj_t             conja,
       conj_t             conjw,
       conj_t             conjx,
       dim_t              m,
       dim_t              b_n,
       scomplex* restrict alpha,
       scomplex* restrict a, inc_t inca, inc_t lda,
       scomplex* restrict w, inc_t incw,
       scomplex* restrict x, inc_t incx,
       scomplex* restrict beta,
       scomplex* restrict y, inc_t incy,
       scomplex* restrict z, inc_t incz
     )
{
    bli_cccdotxaxpyf_unb_var1( conjat, conja, conjw, conjx,
                               m, b_n,
                               alpha,
                               a, inca, lda,
                               w, incw,
                               x, incx,
                               beta,
                               y, incy,
                               z, incz );
}
void bli_zzzdotxaxpyf_opt_var1( conj_t             conjat,
                                conj_t             conja,
                                conj_t             conjw,
                                conj_t             conjx,
                                dim_t              m,
                                dim_t              b_n,
                                dcomplex* restrict alpha,
                                dcomplex* restrict a, inc_t inca, inc_t lda,
                                dcomplex* restrict w, inc_t incw,
                                dcomplex* restrict x, inc_t incx,
                                dcomplex* restrict beta,
                                dcomplex* restrict y, inc_t incy,
                                dcomplex* restrict z, inc_t incz )
{
/*
  Template dotxaxpyf kernel implementation

  This function contains a template implementation for a double-precision
  complex kernel, coded in C, which can serve as the starting point for one
  to write an optimized kernel on an arbitrary architecture. (We show a
  template implementation for only double-precision complex because the
  templates for the other three floating-point types would be similar, with
  the real instantiations being noticeably simpler due to the disappearance
  of conjugation in the real domain.)

  This kernel performs the following two gemv-like operations:

    y := beta * y + alpha * conjat( A^T ) * conjw( w )
    z :=        z + alpha * conja( A )    * conjx( x )

  where A is an m x b_n matrix, x and y are vectors of length b_n, w and z
  are vectors of length m, and alpha and beta are scalars. The operation
  fuses a dotxf and an axpyf operation, and therefore A should be column-
  stored.

  Parameters:

  - conjat: Compute with conjugated values of A^T?
  - conja:  Compute with conjugated values of A?
  - conjw:  Compute with conjugated values of w?
  - conjx:  Compute with conjugated values of x?
  - m:      The number of rows in matrix A.
  - b_n:    The number of columns in matrix A. Must be equal to or less than
            the fusing factor (the local arrays below are sized by it).
  - alpha:  The address of the scalar to be applied to A^T*w and A*x.
  - a:      The address of matrix A.
  - inca:   The row stride of A. inca should be unit unless the
            implementation makes special accommodation for non-unit values.
  - lda:    The column stride of A.
  - w:      The address of vector w.
  - incw:   The vector increment of w. incw should be unit unless the
            implementation makes special accommodation for non-unit values.
  - x:      The address of vector x.
  - incx:   The vector increment of x.
  - beta:   The address of the scalar to be applied to y.
  - y:      The address of vector y.
  - incy:   The vector increment of y.
  - z:      The address of vector z.
  - incz:   The vector increment of z. incz should be unit unless the
            implementation makes special accommodation for non-unit values.

  This template code calls the reference implementation if any of the
  following conditions are true:

  - b_n is less than the fusing factor.
  - Any of the strides inca, incw, or incz is non-unit.
  - The addresses of A, the second column of A, w, and z are unaligned with
    different offsets, or with a common offset that is not a whole number
    of elements (in which case no number of front-edge iterations could
    ever reach an aligned address).

  If the first/second columns of A and the addresses of w and z are aligned,
  or unaligned by the same offset, then optimized code can be used for the
  bulk of the computation. This template shows how the front-edge case can
  be handled so that the remaining computation is aligned. (This template
  guarantees alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE, which
  is defined in bli_config.h.)

  Additional things to consider:

  - When optimizing, you should fully unroll the loops over b_n. This is the
    dimension across which we are fusing dotxv operations.
  - This template code chooses to call the reference implementation whenever
    b_n is less than the fusing factor, so as to avoid having to handle edge
    cases. One may choose to optimize this edge case, if desired.
  - Because conjugation disappears in the real domain, real instances of
    this kernel can safely ignore the values of any conjugation parameters,
    thereby simplifying the implementation.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
	const dim_t        n_elem_per_reg  = 1;
	const dim_t        n_iter_unroll   = 1;

	const dim_t        n_elem_per_iter = n_elem_per_reg * n_iter_unroll;

	const siz_t        type_size       = sizeof( *a );

	dcomplex*          ap[ bli_zdotxaxpyf_fusefac ];
	dcomplex*          xp[ bli_zdotxaxpyf_fusefac ];
	dcomplex*          yp[ bli_zdotxaxpyf_fusefac ];
	dcomplex*          wp;
	dcomplex*          zp;

	dcomplex           At_w[ bli_zdotxaxpyf_fusefac ];
	dcomplex           alpha_x[ bli_zdotxaxpyf_fusefac ];

	bool_t             use_ref = FALSE;

	dim_t              m_pre = 0;
	dim_t              m_iter;
	dim_t              m_left;

	dim_t              off_a, off_a2, off_w, off_z;

	dim_t              i, j;

	conj_t             conjat_use;

	// Return early if possible.
	if ( bli_zero_dim2( m, b_n ) ) return;

	// If there is anything that would interfere with our use of aligned
	// vector loads/stores, call the reference implementation.
	if ( b_n < bli_zdotxaxpyf_fusefac )
	{
		use_ref = TRUE;
	}
	else if ( bli_has_nonunit_inc3( inca, incw, incz ) )
	{
		use_ref = TRUE;
	}
	else if ( bli_is_unaligned_to( a,     BLIS_SIMD_ALIGN_SIZE ) ||
	          bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) ||
	          bli_is_unaligned_to( w,     BLIS_SIMD_ALIGN_SIZE ) ||
	          bli_is_unaligned_to( z,     BLIS_SIMD_ALIGN_SIZE ) )
	{
		use_ref = TRUE;

		// If a, the second column of a, w, and z are unaligned by the same
		// offset, then we can still use an implementation that depends on
		// alignment for most of the operation.
		off_a  = bli_offset_from_alignment( a,     BLIS_SIMD_ALIGN_SIZE );
		off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE );
		off_w  = bli_offset_from_alignment( w,     BLIS_SIMD_ALIGN_SIZE );
		off_z  = bli_offset_from_alignment( z,     BLIS_SIMD_ALIGN_SIZE );

		// The offsets must also be a whole number of elements; otherwise
		// stepping element by element can never land on an aligned address.
		if ( off_a == off_a2 && off_a == off_w && off_a == off_z &&
		     off_a % type_size == 0 )
		{
			use_ref = FALSE;

			// NOTE: This assumes bli_offset_from_alignment() yields the
			// number of bytes PAST the previous aligned address (i.e. the
			// address modulo the alignment). The front edge is therefore
			// the distance to the NEXT aligned address, in elements.
			m_pre = ( BLIS_SIMD_ALIGN_SIZE - off_a ) / type_size;

			// Never run more front-edge iterations than there are rows.
			if ( m_pre > m ) m_pre = m;
		}
	}

	// Call the reference implementation if needed.
	if ( use_ref == TRUE )
	{
		bli_zzzdotxaxpyf_unb_var1( conjat,
		                           conja,
		                           conjw,
		                           conjx,
		                           m,
		                           b_n,
		                           alpha,
		                           a, inca, lda,
		                           w, incw,
		                           x, incx,
		                           beta,
		                           y, incy,
		                           z, incz );
		return;
	}

	// Compute the number of unrolled and leftover (edge) iterations.
	m_iter = ( m - m_pre ) / n_elem_per_iter;
	m_left = ( m - m_pre ) % n_elem_per_iter;

	// Initialize pointers into the columns of A and the elements of x and y.
	for ( j = 0; j < b_n; ++j )
	{
		ap[ j ] = a + (j  )*lda;
		xp[ j ] = x + (j  )*incx;
		yp[ j ] = y + (j  )*incy;
	}
	wp = w;
	zp = z;

	// Load elements of x or conj(x) into alpha_x and scale by alpha.
	if ( bli_is_noconj( conjx ) )
	{
		for ( j = 0; j < b_n; ++j )
		{
			bli_zzcopys( *xp[ j ], alpha_x[ j ] );
			bli_zzscals( *alpha, alpha_x[ j ] );
		}
	}
	else // if ( bli_is_conj( conjx ) )
	{
		for ( j = 0; j < b_n; ++j )
		{
			bli_zzcopyjs( *xp[ j ], alpha_x[ j ] );
			bli_zzscals( *alpha, alpha_x[ j ] );
		}
	}

	// Initialize our accumulators to zero.
	for ( j = 0; j < b_n; ++j )
	{
		bli_zset0s( At_w[ j ] );
	}

	conjat_use = conjat;

	// If w must be conjugated, we compute the result indirectly by first
	// toggling the effective conjugation of At and then conjugating the
	// resulting dot products.
	if ( bli_is_conj( conjw ) )
		bli_toggle_conj( conjat_use );

	// Iterate over the columns of A and elements of w and z to compute:
	//   y = beta * y + alpha * conjat( A^T ) * conjw( w );
	//   z =        z + alpha * conja( A )    * conjx( x );
	// where A is m x b_n.
	//
	// Each of the four conjugation cases below has the same three-phase
	// shape: a front edge (m_pre iterations, run until the operands reach
	// an aligned address), the main loop (m_iter iterations), and a tail
	// edge (m_left iterations). For best performance, the elements of
	// alpha_x should be loaded once prior to the m_iter loop, At_w should
	// be kept in registers, and the b_n loop should be fully unrolled.
	// Within the main loop the addresses in ap[], wp, and zp are aligned
	// to BLIS_SIMD_ALIGN_SIZE.
	if ( bli_is_noconj( conja ) && bli_is_noconj( conjat_use ) )
	{
		// Compute front edge cases if A, w, and z were unaligned.
		for ( i = 0; i < m_pre; ++i )
		{
			for ( j = 0; j < b_n; ++j )
			{
				bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
				bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );

				ap[ j ] += 1;
			}

			wp += 1; zp += 1;
		}

		// The bulk of the operation is executed here.
		for ( i = 0; i < m_iter; ++i )
		{
			for ( j = 0; j < b_n; ++j )
			{
				bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
				bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );

				ap[ j ] += n_elem_per_iter;
			}

			wp += n_elem_per_iter; zp += n_elem_per_iter;
		}

		// Compute tail edge cases, if applicable.
		for ( i = 0; i < m_left; ++i )
		{
			for ( j = 0; j < b_n; ++j )
			{
				bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
				bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );

				ap[ j ] += 1;
			}

			wp += 1; zp += 1;
		}
	}
	else if ( bli_is_noconj( conja ) && bli_is_conj( conjat_use ) )
	{
		// Compute front edge cases if A, w, and z were unaligned.
		for ( i = 0; i < m_pre; ++i )
		{
			for ( j = 0; j < b_n; ++j )
			{
				bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
				bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );

				ap[ j ] += 1;
			}

			wp += 1; zp += 1;
		}

		// The bulk of the operation is executed here.
		for ( i = 0; i < m_iter; ++i )
		{
			for ( j = 0; j < b_n; ++j )
			{
				bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
				bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );

				ap[ j ] += n_elem_per_iter;
			}

			wp += n_elem_per_iter; zp += n_elem_per_iter;
		}

		// Compute tail edge cases, if applicable.
		for ( i = 0; i < m_left; ++i )
		{
			for ( j = 0; j < b_n; ++j )
			{
				bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
				bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );

				ap[ j ] += 1;
			}

			wp += 1; zp += 1;
		}
	}
	else if ( bli_is_conj( conja ) && bli_is_noconj( conjat_use ) )
	{
		// Compute front edge cases if A, w, and z were unaligned.
		for ( i = 0; i < m_pre; ++i )
		{
			for ( j = 0; j < b_n; ++j )
			{
				bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
				bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );

				ap[ j ] += 1;
			}

			wp += 1; zp += 1;
		}

		// The bulk of the operation is executed here.
		for ( i = 0; i < m_iter; ++i )
		{
			for ( j = 0; j < b_n; ++j )
			{
				bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
				bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );

				ap[ j ] += n_elem_per_iter;
			}

			wp += n_elem_per_iter; zp += n_elem_per_iter;
		}

		// Compute tail edge cases, if applicable.
		for ( i = 0; i < m_left; ++i )
		{
			for ( j = 0; j < b_n; ++j )
			{
				bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
				bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );

				ap[ j ] += 1;
			}

			wp += 1; zp += 1;
		}
	}
	else if ( bli_is_conj( conja ) && bli_is_conj( conjat_use ) )
	{
		// Compute front edge cases if A, w, and z were unaligned.
		for ( i = 0; i < m_pre; ++i )
		{
			for ( j = 0; j < b_n; ++j )
			{
				bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
				bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );

				ap[ j ] += 1;
			}

			wp += 1; zp += 1;
		}

		// The bulk of the operation is executed here.
		for ( i = 0; i < m_iter; ++i )
		{
			for ( j = 0; j < b_n; ++j )
			{
				bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
				bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );

				ap[ j ] += n_elem_per_iter;
			}

			wp += n_elem_per_iter; zp += n_elem_per_iter;
		}

		// Compute tail edge cases, if applicable.
		for ( i = 0; i < m_left; ++i )
		{
			for ( j = 0; j < b_n; ++j )
			{
				bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
				bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );

				ap[ j ] += 1;
			}

			wp += 1; zp += 1;
		}
	}

	// If conjugation on w was requested, we induce it by conjugating
	// the contents of At_w.
	if ( bli_is_conj( conjw ) )
	{
		for ( j = 0; j < b_n; ++j )
		{
			bli_zconjs( At_w[ j ] );
		}
	}

	// Scale the At_w product by alpha and accumulate into y after
	// scaling by beta.
	for ( j = 0; j < b_n; ++j )
	{
		bli_zzscals( *beta, *yp[ j ] );
		bli_zzzaxpys( *alpha, At_w[ j ], *yp[ j ] );
	}
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// Each instantiation of GENTFUNC3U12 defines the function named by
// PASTEMAC3(cha,chb,chc,varname), which forwards all of its arguments to
// the function named by PASTEMAC3(cha,chb,chc,kername). The same three
// type characters must be used for both names so that a mixed-type wrapper
// dispatches to the reference kernel with the matching type combination.
//
#undef  GENTFUNC3U12
#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname, kername ) \
\
void PASTEMAC3(cha,chb,chc,varname)( \
                                     conj_t             conjat, \
                                     conj_t             conja, \
                                     conj_t             conjw, \
                                     conj_t             conjx, \
                                     dim_t              m, \
                                     dim_t              b_n, \
                                     ctype_ab* restrict alpha, \
                                     ctype_a*  restrict a, inc_t inca, inc_t lda, \
                                     ctype_b*  restrict w, inc_t incw, \
                                     ctype_b*  restrict x, inc_t incx, \
                                     ctype_c*  restrict beta, \
                                     ctype_c*  restrict y, inc_t incy, \
                                     ctype_c*  restrict z, inc_t incz \
                                   ) \
{ \
	/* Just call the reference implementation. */ \
	PASTEMAC3(cha,chb,chc,kername)( conjat, \
	                                conja, \
	                                conjw, \
	                                conjx, \
	                                m, \
	                                b_n, \
	                                alpha, \
	                                a, inca, lda, \
	                                w, incw, \
	                                x, incx, \
	                                beta, \
	                                y, incy, \
	                                z, incz ); \
}

// The homogeneous (sss/ddd/ccc/zzz) kernels are defined explicitly above,
// so only the mixed-type combinations are generated from the macro.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 )
#endif

View File

@@ -0,0 +1,64 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype dotxaxpyf kernel interfaces.
//
// Each instantiation of GENTPROT3U12 declares the function named by
// PASTEMAC3(cha,chb,chc,varname): A has type ctype_a, w and x have type
// ctype_b, beta, y, and z have type ctype_c, and alpha has type ctype_ab.
//
#undef  GENTPROT3U12
#define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname ) \
\
void PASTEMAC3(cha,chb,chc,varname)( \
                                     conj_t             conjat, \
                                     conj_t             conja, \
                                     conj_t             conjw, \
                                     conj_t             conjx, \
                                     dim_t              m, \
                                     dim_t              b_n, \
                                     ctype_ab* restrict alpha, \
                                     ctype_a*  restrict a, inc_t inca, inc_t lda, \
                                     ctype_b*  restrict w, inc_t incw, \
                                     ctype_b*  restrict x, inc_t incx, \
                                     ctype_c*  restrict beta, \
                                     ctype_c*  restrict y, inc_t incy, \
                                     ctype_c*  restrict z, inc_t incz \
                                   );

// Homogeneous-type prototypes are always emitted.
INSERT_GENTPROT3U12_BASIC( dotxaxpyf_opt_var1 )

// Mixed-domain and mixed-precision prototypes are emitted only when the
// corresponding support is enabled.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3U12_MIX_D( dotxaxpyf_opt_var1 )
#endif

#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3U12_MIX_P( dotxaxpyf_opt_var1 )
#endif

View File

@@ -0,0 +1,456 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Single-precision real dotxf kernel.

   No optimized code is provided here for this datatype. Every argument is
   forwarded, unchanged and in the same order, to the reference variant
   bli_sssdotxf_unb_var1().
*/
void bli_sssdotxf_opt_var1
     (
       conj_t          conjat,
       conj_t          conjx,
       dim_t           m,
       dim_t           b_n,
       float* restrict alpha,
       float* restrict a, inc_t inca, inc_t lda,
       float* restrict x, inc_t incx,
       float* restrict beta,
       float* restrict y, inc_t incy
     )
{
	bli_sssdotxf_unb_var1( conjat, conjx,
	                       m, b_n,
	                       alpha,
	                       a, inca, lda,
	                       x, incx,
	                       beta,
	                       y, incy );
}
/*
   Double-precision real dotxf kernel.

   No optimized code is provided here for this datatype. Every argument is
   forwarded, unchanged and in the same order, to the reference variant
   bli_ddddotxf_unb_var1().
*/
void bli_ddddotxf_opt_var1
     (
       conj_t           conjat,
       conj_t           conjx,
       dim_t            m,
       dim_t            b_n,
       double* restrict alpha,
       double* restrict a, inc_t inca, inc_t lda,
       double* restrict x, inc_t incx,
       double* restrict beta,
       double* restrict y, inc_t incy
     )
{
	bli_ddddotxf_unb_var1( conjat, conjx,
	                       m, b_n,
	                       alpha,
	                       a, inca, lda,
	                       x, incx,
	                       beta,
	                       y, incy );
}
/*
   Single-precision complex dotxf kernel.

   No optimized code is provided here for this datatype. Every argument is
   forwarded, unchanged and in the same order, to the reference variant
   bli_cccdotxf_unb_var1().
*/
void bli_cccdotxf_opt_var1
     (
       conj_t             conjat,
       conj_t             conjx,
       dim_t              m,
       dim_t              b_n,
       scomplex* restrict alpha,
       scomplex* restrict a, inc_t inca, inc_t lda,
       scomplex* restrict x, inc_t incx,
       scomplex* restrict beta,
       scomplex* restrict y, inc_t incy
     )
{
	bli_cccdotxf_unb_var1( conjat, conjx,
	                       m, b_n,
	                       alpha,
	                       a, inca, lda,
	                       x, incx,
	                       beta,
	                       y, incy );
}
void bli_zzzdotxf_opt_var1(
                            conj_t             conjat,
                            conj_t             conjx,
                            dim_t              m,
                            dim_t              b_n,
                            dcomplex* restrict alpha,
                            dcomplex* restrict a, inc_t inca, inc_t lda,
                            dcomplex* restrict x, inc_t incx,
                            dcomplex* restrict beta,
                            dcomplex* restrict y, inc_t incy
                          )
{
/*
  Template dotxf kernel implementation

  This function contains a template implementation for a double-precision
  complex kernel, coded in C, which can serve as the starting point for one
  to write an optimized kernel on an arbitrary architecture. (We show a
  template implementation for only double-precision complex because the
  templates for the other three floating-point types would be similar, with
  the real instantiations being noticeably simpler due to the disappearance
  of conjugation in the real domain.)

  This kernel performs the following gemv-like operation:

    y := beta * y + alpha * conjat( A^T ) * conjx( x )

  where A is an m x b_n matrix, x is a vector of length m, y is a vector
  of length b_n, and alpha and beta are scalars. The operation is performed
  as a series of fused dotxv operations, and therefore A should be column-
  stored.

  Parameters:

  - conjat: Compute with conjugated values of A^T?
  - conjx:  Compute with conjugated values of x?
  - m:      The number of rows in matrix A.
  - b_n:    The number of columns in matrix A. Must be equal to or less than
            the fusing factor (the local arrays below are sized by it).
  - alpha:  The address of the scalar to be applied to A^T*x.
  - a:      The address of matrix A.
  - inca:   The row stride of A. inca should be unit unless the
            implementation makes special accommodation for non-unit values.
  - lda:    The column stride of A.
  - x:      The address of vector x.
  - incx:   The vector increment of x. incx should be unit unless the
            implementation makes special accommodation for non-unit values.
  - beta:   The address of the scalar to be applied to y.
  - y:      The address of vector y.
  - incy:   The vector increment of y.

  This template code calls the reference implementation if any of the
  following conditions are true:

  - b_n is less than the fusing factor.
  - Either of the strides inca or incx is non-unit.
  - The addresses of A, the second column of A, and x are unaligned with
    different offsets, or with a common offset that is not a whole number
    of elements (in which case no number of front-edge iterations could
    ever reach an aligned address).

  If the first/second columns of A and address of x are aligned, or unaligned
  by the same offset, then optimized code can be used for the bulk of the
  computation. This template shows how the front-edge case can be handled so
  that the remaining computation is aligned. (This template guarantees
  alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE, which is defined
  in bli_config.h.)

  Additional things to consider:

  - When optimizing, you should fully unroll the loops over b_n. This is the
    dimension across which we are fusing dotxv operations.
  - This template code chooses to call the reference implementation whenever
    b_n is less than the fusing factor, so as to avoid having to handle edge
    cases. One may choose to optimize this edge case, if desired.
  - Because conjugation disappears in the real domain, real instances of
    this kernel can safely ignore the values of any conjugation parameters,
    thereby simplifying the implementation.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
	const dim_t        n_elem_per_reg  = 1;
	const dim_t        n_iter_unroll   = 1;

	const dim_t        n_elem_per_iter = n_elem_per_reg * n_iter_unroll;

	const siz_t        type_size       = sizeof( *x );

	dcomplex*          ap[ bli_zdotxf_fusefac ];
	dcomplex*          xp;
	dcomplex*          yp[ bli_zdotxf_fusefac ];

	dcomplex           Atx[ bli_zdotxf_fusefac ];

	bool_t             use_ref = FALSE;

	dim_t              m_pre = 0;
	dim_t              m_iter;
	dim_t              m_left;

	dim_t              off_a, off_a2, off_x;

	dim_t              i, j;

	conj_t             conjat_use;

	// Return early if possible.
	if ( bli_zero_dim1( b_n ) ) return;

	// If the vector lengths are zero, scale y by beta and return.
	if ( bli_zero_dim1( m ) )
	{
		bli_zzscalv( BLIS_NO_CONJUGATE,
		             b_n,
		             beta,
		             y, incy );
		return;
	}

	// If there is anything that would interfere with our use of aligned
	// vector loads/stores, call the reference implementation.
	if ( b_n < bli_zdotxf_fusefac )
	{
		use_ref = TRUE;
	}
	else if ( bli_has_nonunit_inc2( inca, incx ) )
	{
		use_ref = TRUE;
	}
	else if ( bli_is_unaligned_to( a,     BLIS_SIMD_ALIGN_SIZE ) ||
	          bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) ||
	          bli_is_unaligned_to( x,     BLIS_SIMD_ALIGN_SIZE ) )
	{
		use_ref = TRUE;

		// If a, the second column of a, and x are unaligned by the same
		// offset, then we can still use an implementation that depends on
		// alignment for most of the operation.
		off_a  = bli_offset_from_alignment( a,     BLIS_SIMD_ALIGN_SIZE );
		off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE );
		off_x  = bli_offset_from_alignment( x,     BLIS_SIMD_ALIGN_SIZE );

		// The offsets must also be a whole number of elements; otherwise
		// stepping element by element can never land on an aligned address.
		if ( off_a == off_a2 && off_a == off_x &&
		     off_x % type_size == 0 )
		{
			use_ref = FALSE;

			// NOTE: This assumes bli_offset_from_alignment() yields the
			// number of bytes PAST the previous aligned address (i.e. the
			// address modulo the alignment). The front edge is therefore
			// the distance to the NEXT aligned address, in elements.
			m_pre = ( BLIS_SIMD_ALIGN_SIZE - off_x ) / type_size;

			// Never run more front-edge iterations than there are rows.
			if ( m_pre > m ) m_pre = m;
		}
	}

	// Call the reference implementation if needed.
	if ( use_ref == TRUE )
	{
		bli_zzzdotxf_unb_var1( conjat,
		                       conjx,
		                       m,
		                       b_n,
		                       alpha,
		                       a, inca, lda,
		                       x, incx,
		                       beta,
		                       y, incy );
		return;
	}

	// Compute the number of unrolled and leftover (edge) iterations.
	m_iter = ( m - m_pre ) / n_elem_per_iter;
	m_left = ( m - m_pre ) % n_elem_per_iter;

	// Initialize pointers into the columns of A and elements of y.
	for ( i = 0; i < b_n; ++i )
	{
		ap[ i ] = a + (i  )*lda;
		yp[ i ] = y + (i  )*incy;
	}
	xp = x;

	// Initialize our accumulators to zero.
	for ( i = 0; i < b_n; ++i )
	{
		bli_zset0s( Atx[ i ] );
	}

	conjat_use = conjat;

	// If x must be conjugated, we compute the result indirectly by first
	// toggling the effective conjugation of A and then conjugating the
	// resulting product A^T*x.
	if ( bli_is_conj( conjx ) )
		bli_toggle_conj( conjat_use );

	// Iterate over the columns of A and the elements of x to compute:
	//   Atx = conjat_use( A^T ) * x;
	//
	// Both conjugation cases below have the same three-phase shape: a front
	// edge (m_pre iterations, run until the operands reach an aligned
	// address), the main loop (m_iter iterations), and a tail edge (m_left
	// iterations). For best performance, the elements of Atx should be kept
	// in registers, and the b_n loop should be fully unrolled. Within the
	// main loop the addresses in ap[] and xp are aligned to
	// BLIS_SIMD_ALIGN_SIZE.
	if ( bli_is_noconj( conjat_use ) )
	{
		// Compute front edge cases if A and x were unaligned.
		for ( j = 0; j < m_pre; ++j )
		{
			for ( i = 0; i < b_n; ++i )
			{
				bli_zzzdots( *ap[ i ], *xp, Atx[ i ] );

				ap[ i ] += 1;
			}

			xp += 1;
		}

		// The bulk of the operation is executed here.
		for ( j = 0; j < m_iter; ++j )
		{
			for ( i = 0; i < b_n; ++i )
			{
				bli_zzzdots( *ap[ i ], *xp, Atx[ i ] );

				ap[ i ] += n_elem_per_iter;
			}

			xp += n_elem_per_iter;
		}

		// Compute tail edge cases, if applicable.
		for ( j = 0; j < m_left; ++j )
		{
			for ( i = 0; i < b_n; ++i )
			{
				bli_zzzdots( *ap[ i ], *xp, Atx[ i ] );

				ap[ i ] += 1;
			}

			xp += 1;
		}
	}
	else // if ( bli_is_conj( conjat_use ) )
	{
		// Compute front edge cases if A and x were unaligned.
		for ( j = 0; j < m_pre; ++j )
		{
			for ( i = 0; i < b_n; ++i )
			{
				bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] );

				ap[ i ] += 1;
			}

			xp += 1;
		}

		// The bulk of the operation is executed here.
		for ( j = 0; j < m_iter; ++j )
		{
			for ( i = 0; i < b_n; ++i )
			{
				bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] );

				ap[ i ] += n_elem_per_iter;
			}

			xp += n_elem_per_iter;
		}

		// Compute tail edge cases, if applicable.
		for ( j = 0; j < m_left; ++j )
		{
			for ( i = 0; i < b_n; ++i )
			{
				bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] );

				ap[ i ] += 1;
			}

			xp += 1;
		}
	}

	// If conjugation on x was requested, we induce it by conjugating
	// the contents of Atx.
	if ( bli_is_conj( conjx ) )
	{
		for ( i = 0; i < b_n; ++i )
		{
			bli_zconjs( Atx[ i ] );
		}
	}

	// Scale the Atx product by alpha and accumulate into y after
	// scaling by beta.
	for ( i = 0; i < b_n; ++i )
	{
		bli_zzscals( *beta, *yp[ i ] );
		bli_zzzaxpys( *alpha, Atx[ i ], *yp[ i ] );
	}
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// Each instantiation of GENTFUNC3U12 defines the function named by
// PASTEMAC3(cha,chx,chy,varname), which forwards all of its arguments to
// the function named by PASTEMAC3(cha,chx,chy,kername). The macro
// parameters follow the same a/x/y naming as the dotxf prototype macro:
// A has type ctype_a, x has type ctype_x, beta and y have type ctype_y,
// and alpha has type ctype_ax.
//
#undef  GENTFUNC3U12
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
\
void PASTEMAC3(cha,chx,chy,varname)( \
                                     conj_t             conjat, \
                                     conj_t             conjx, \
                                     dim_t              m, \
                                     dim_t              b_n, \
                                     ctype_ax* restrict alpha, \
                                     ctype_a*  restrict a, inc_t inca, inc_t lda, \
                                     ctype_x*  restrict x, inc_t incx, \
                                     ctype_y*  restrict beta, \
                                     ctype_y*  restrict y, inc_t incy \
                                   ) \
{ \
	/* Just call the reference implementation. */ \
	PASTEMAC3(cha,chx,chy,kername)( conjat, \
	                                conjx, \
	                                m, \
	                                b_n, \
	                                alpha, \
	                                a, inca, lda, \
	                                x, incx, \
	                                beta, \
	                                y, incy ); \
}

// The homogeneous (sss/ddd/ccc/zzz) kernels are defined explicitly above,
// so only the mixed-type combinations are generated from the macro.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotxf_opt_var1, dotxf_unb_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotxf_opt_var1, dotxf_unb_var1 )
#endif

View File

@@ -0,0 +1,63 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype dotxf kernel interfaces.
//
// Each instantiation of GENTPROT3U12 declares the function named by
// PASTEMAC3(cha,chx,chy,varname): A has type ctype_a, x has type ctype_x,
// beta and y have type ctype_y, and alpha has type ctype_ax.
//
#undef  GENTPROT3U12
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
\
void PASTEMAC3(cha,chx,chy,varname)( \
                                     conj_t             conjat, \
                                     conj_t             conjx, \
                                     dim_t              m, \
                                     dim_t              b_n, \
                                     ctype_ax* restrict alpha, \
                                     ctype_a*  restrict a, inc_t inca, inc_t lda, \
                                     ctype_x*  restrict x, inc_t incx, \
                                     ctype_y*  restrict beta, \
                                     ctype_y*  restrict y, inc_t incy \
                                   );

// Homogeneous-type prototypes are always emitted.
INSERT_GENTPROT3U12_BASIC( dotxf_opt_var1 )

// Mixed-domain and mixed-precision prototypes are emitted only when the
// corresponding support is enabled.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3U12_MIX_D( dotxf_opt_var1 )
#endif

#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3U12_MIX_P( dotxf_opt_var1 )
#endif

View File

@@ -0,0 +1,290 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Single-precision real gemm micro-kernel.

   No optimized code is provided here for this datatype. Every argument is
   forwarded, unchanged and in the same order, to the reference micro-kernel
   bli_sgemm_ref_mxn().
*/
void bli_sgemm_opt_mxn
     (
       dim_t           k,
       float* restrict alpha,
       float* restrict a,
       float* restrict b,
       float* restrict beta,
       float* restrict c, inc_t rs_c, inc_t cs_c,
       float* restrict a_next,
       float* restrict b_next
     )
{
	bli_sgemm_ref_mxn( k,
	                   alpha,
	                   a, b,
	                   beta,
	                   c, rs_c, cs_c,
	                   a_next, b_next );
}
void bli_dgemm_opt_mxn
     (
       dim_t            k,
       double* restrict alpha,
       double* restrict a,
       double* restrict b,
       double* restrict beta,
       double* restrict c, inc_t rs_c, inc_t cs_c,
       double* restrict a_next,
       double* restrict b_next
     )
{
/*
  Template gemm micro-kernel implementation

  A plain-C, double-precision real micro-kernel intended as a starting
  point for an architecture-specific implementation. (Only the double-
  precision real case is written out because the templates for the other
  three floating-point types would be nearly identical.)

  The micro-kernel computes

    C := beta * C + alpha * A * B

  where A is MR x k, B is k x NR, C is MR x NR, and alpha and beta are
  scalars.

  Parameters:

  - k:      The number of columns of A and rows of B.
  - alpha:  The address of the scalar applied to the A*B product.
  - a:      The address of a micro-panel of A (MR x k), stored by columns.
  - b:      The address of a micro-panel of B (k x NR), stored by rows.
  - beta:   The address of the scalar applied to the input value of C.
  - c:      The address of an MR x NR block of C, stored according to
            rs_c and cs_c.
  - rs_c:   The row stride of C (distance to the next row, in elements).
  - cs_c:   The column stride of C (distance to the next column, in
            elements).
  - a_next: The address of the micro-panel of A that will be used the next
            time the gemm micro-kernel is called. Unused by this template.
  - b_next: The address of the micro-panel of B that will be used the next
            time the gemm micro-kernel is called. Unused by this template.

  Layout of the packed micro-panels when MR == NR == 4 (hex digits give the
  order of the elements in memory; C is not shown because its layout is
  determined by rs_c and cs_c):

         a:                            b:
          ______________________        _______
         |0 4 8 C               |      |0 1 2 3|
     MR  |1 5 9 D . . .         |      |4 5 6 7|
         |2 6 A E               |   k  |8 9 A B|
         |3_7_B_F_______________|      |C D E F|
                    k                  |   .   |
                                       |___.___|
                                          NR

  Notes for implementors:

  - bli_?mr and bli_?nr give the MR and NR register blocksizes for the
    datatype corresponding to the '?' character.
  - bli_?packmr and bli_?packnr are usually equal to bli_?mr and bli_?nr,
    respectively. (They differ only if the register blocksize extensions
    are non-zero. See bli_config.h for more details.)
  - You may assume that the addresses a and b are aligned according to
    BLIS_CONTIG_STRIDE_ALIGN_SIZE, as defined in bli_config.h.
  - The local array ab holds the accumulated a*b product, stored by columns
    (unit row stride, column stride MR). An optimized micro-kernel would
    hold these accumulators in registers rather than memory.
  - All three loops are exposed here. In optimized code the loops over MR
    and NR typically disappear because they are fully unrolled, leaving
    only the loop over k, which is itself often unrolled a few times (4x
    seems to be a common unrolling factor).
  - a_next and b_next can be used to perform prefetching, if prefetching
    is supported by the architecture. They may be safely ignored.
  - If beta == 0.0 (or 0.0 + 0.0i for complex), the micro-kernel should NOT
    multiply C by it, as C may contain uninitialized memory (including
    NaNs). That case is detected below and handled by overwriting C with
    the alpha*A*B product.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
	const dim_t mr     = bli_dmr;
	const dim_t nr     = bli_dnr;

	/* Distance between consecutive columns of the packed A micro-panel and
	   between consecutive rows of the packed B micro-panel. */
	const inc_t cs_a   = bli_dpackmr;
	const inc_t rs_b   = bli_dpacknr;

	/* The accumulator block is stored by columns. */
	const inc_t rs_ab  = 1;
	const inc_t cs_ab  = bli_dmr;

	double      ab[ bli_dmr * bli_dnr ];
	double      a_il, b_lj;
	dim_t       i, j, l;

	/* Clear the accumulators. */
	for ( i = 0; i < mr * nr; ++i )
	{
		bli_dset0s( ab[ i ] );
	}

	/* Accumulate k rank-1 updates, advancing one column of A and one row
	   of B per iteration. In an optimized implementation the two inner
	   loops over NR and MR are typically fully unrolled. */
	for ( l = 0; l < k; ++l, a += cs_a, b += rs_b )
	{
		for ( j = 0; j < nr; ++j )
		{
			b_lj = b[ j ];

			for ( i = 0; i < mr; ++i )
			{
				a_il = a[ i ];

				bli_ddots( a_il, b_lj, ab[ i*rs_ab + j*cs_ab ] );
			}
		}
	}

	/* ab := alpha * ab */
	for ( i = 0; i < mr * nr; ++i )
	{
		bli_dscals( *alpha, ab[ i ] );
	}

	/* Write the result out to C. When beta is zero C is overwritten rather
	   than scaled, so that its prior contents are never read into the
	   result. */
	if ( bli_deq0( *beta ) )
	{
		/* c := ab */
		bli_dcopys_mxn( mr,
		                nr,
		                ab, rs_ab, cs_ab,
		                c,  rs_c,  cs_c );
	}
	else
	{
		/* c := beta * c + ab */
		bli_dxpbys_mxn( mr,
		                nr,
		                ab, rs_ab, cs_ab,
		                beta,
		                c,  rs_c,  cs_c );
	}
}
/*
   Single-precision complex gemm micro-kernel for the template configuration.

   No scomplex-specific code is provided here: every argument is handed,
   unchanged and in the same order, to the reference micro-kernel
   bli_cgemm_ref_mxn().
*/
void bli_cgemm_opt_mxn(
                        dim_t              k,
                        scomplex* restrict alpha,
                        scomplex* restrict a,
                        scomplex* restrict b,
                        scomplex* restrict beta,
                        scomplex* restrict c, inc_t rs_c, inc_t cs_c,
                        scomplex* restrict a_next,
                        scomplex* restrict b_next
                      )
{
    /* Delegate the whole computation to the reference kernel. */
    bli_cgemm_ref_mxn( k, alpha, a, b, beta, c, rs_c, cs_c, a_next, b_next );
}
/*
   Double-precision complex gemm micro-kernel for the template configuration.

   No dcomplex-specific code is provided here: every argument is handed,
   unchanged and in the same order, to the reference micro-kernel
   bli_zgemm_ref_mxn().
*/
void bli_zgemm_opt_mxn(
                        dim_t              k,
                        dcomplex* restrict alpha,
                        dcomplex* restrict a,
                        dcomplex* restrict b,
                        dcomplex* restrict beta,
                        dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
                        dcomplex* restrict a_next,
                        dcomplex* restrict b_next
                      )
{
    /* Delegate the whole computation to the reference kernel. */
    bli_zgemm_ref_mxn( k, alpha, a, b, beta, c, rs_c, cs_c, a_next, b_next );
}

View File

@@ -0,0 +1,54 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype micro-kernel interfaces.
//
// GENTPROT is (re)defined below as a template for one gemm micro-kernel
// prototype; the #undef discards any definition left over from an earlier
// header. 'ctype' is the element type of every operand pointer, and
// PASTEMAC(ch,varname) supplies the function name (presumably
// bli_<ch><varname>, e.g. bli_dgemm_opt_mxn -- confirm against the
// definition of PASTEMAC). INSERT_GENTPROT_BASIC then instantiates the
// template for 'gemm_opt_mxn' (presumably once for each of the s, d, c and
// z datatypes -- its definition is not part of this file).
//
// Arguments of each generated prototype:
//   k             - number of rank-1 updates accumulated into the a*b
//                   product.
//   alpha         - address of the scalar applied to the a*b product.
//   a, b          - addresses of the packed micro-panels of A and B.
//   beta          - address of the scalar applied to c.
//   c, rs_c, cs_c - address of the output block and its row and column
//                   strides.
//   a_next,
//   b_next        - addresses that an implementation may use for
//                   prefetching and may otherwise ignore.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemm_opt_mxn )

View File

@@ -0,0 +1,303 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Single-precision real fused gemmtrsm_l micro-kernel (template).

   Carries out the same two steps as bli_dgemmtrsm_l_opt_mxn(), whose
   comments describe the operands in detail, using the float kernels:

     1. bli_sgemm_opt_mxn():    b11 := alpha * b11 - a10 * bd01
     2. bli_strsm_l_opt_mxn():  lower-triangular solve with a11 on b11,
                                also given bd11 and c11 (rs_c, cs_c)
*/
void bli_sgemmtrsm_l_opt_mxn(
                              dim_t           k,
                              float* restrict alpha,
                              float* restrict a10,
                              float* restrict a11,
                              float* restrict bd01,
                              float* restrict bd11,
                              float* restrict b11,
                              float* restrict c11, inc_t rs_c, inc_t cs_c,
                              float* restrict a_next,
                              float* restrict b_next
                            )
{
    /* Scalar -1, passed to gemm as the multiplier of the a10 * bd01 product. */
    float* restrict neg_one = bli_sm1;

    /* b11 is addressed as a row-stored block: unit column stride and a row
       stride of bli_spacknr. */
    const inc_t     b11_cs  = 1;
    const inc_t     b11_rs  = bli_spacknr;

    /* gemm: the caller's alpha takes the role of beta here, so it scales
       b11. */
    bli_sgemm_opt_mxn( k, neg_one, a10, bd01, alpha,
                       b11, b11_rs, b11_cs,
                       a_next, b_next );

    /* trsm: solve against the triangular block a11. */
    bli_strsm_l_opt_mxn( a11, b11, bd11, c11, rs_c, cs_c );
}
void bli_dgemmtrsm_l_opt_mxn(
                              dim_t            k,
                              double* restrict alpha,
                              double* restrict a10,
                              double* restrict a11,
                              double* restrict bd01,
                              double* restrict bd11,
                              double* restrict b11,
                              double* restrict c11, inc_t rs_c, inc_t cs_c,
                              double* restrict a_next,
                              double* restrict b_next
                            )
{
    /*
      Template gemmtrsm_l micro-kernel implementation

      This is the double-precision real template for the micro-kernel that
      fuses a gemm update with a trsm_l subproblem. It carries out, in this
      order:

        B11 := alpha * B11 - A10 * B01    (gemm)
        B11 := inv(A11) * B11             (trsm)

      where B11 is MR x NR, A10 is MR x k, B01 is k x NR, A11 is MR x MR and
      lower triangular, alpha is a scalar, and inv() denotes the matrix
      inverse.

      NOTE: This gemmtrsm micro-kernel supports element "duplication", a
      feature that is enabled or disabled in bli_kernel.h, which also defines
      the duplication factors. Duplication is NOT commonly used, and most
      developers may assume that it is disabled.

      Parameters:

      - k:      The number of columns of A10 and rows of B01.
      - alpha:  The address of a scalar to be applied to B11.
      - a10:    The address of A10, the MR x k subpartition of the packed
                (column-stored) micro-panel of A that lies to the left of the
                MR x MR lower triangular block.
      - a11:    The address of A11, the MR x MR lower triangular block within
                the packed micro-panel of A that lies to the right of A10. By
                the time this kernel is called, the diagonal of A11 has
                already been inverted and the strictly upper triangle
                contains zeros.
      - bd01:   The address of B01, the k x NR subpartition situated above
                the current MR x NR block B11. bd01 is row-stored. With
                duplication enabled, each element occurs d times, which
                effectively increases the dimension to k x d*NR. With
                duplication disabled, bd01 is simply the address of the top
                part of the current packed (row-stored) micro-panel of B
                (labeled b01 in the diagram below).
      - bd11:   The address of B11, the MR x NR subpartition situated below
                B01. With duplication enabled, each element occurs d times,
                which effectively increases the dimension to MR x d*NR. With
                duplication disabled, bd11 is simply the address of the
                current MR x NR block within the packed (row-stored)
                micro-panel of B.
      - b11:    The address of the current MR x NR block within the packed
                micro-panel of B. It exists in duplicated form as bd11. With
                duplication disabled, b11 and bd11 refer to the same MR x NR
                block within the packed (row-stored) micro-panel of B.
      - c11:    The address of C11, the MR x NR block of the output matrix
                (ie: the matrix provided by the user to the highest-level
                trsm API call). C11 corresponds to the elements that exist in
                packed form in B11, and is stored according to rs_c and cs_c.
      - rs_c:   The row stride of C11 (ie: the distance to the next row of
                C11, in units of matrix elements).
      - cs_c:   The column stride of C11 (ie: the distance to the next column
                of C11, in units of matrix elements).
      - a_next: The address of the packed micro-panel of A that will be used
                the next time the gemmtrsm micro-kernel is called.
      - b_next: The address of the packed micro-panel of B that will be used
                the next time the gemmtrsm micro-kernel is called.

      The diagram below shows the packed micro-panel operands and how the
      elements of each would be stored when MR == NR == 4. (The hex digits
      give the order of the elements in memory.) It also shows a B
      duplication buffer (bd) holding a copy of the packed micro-panel of B
      with a duplication factor of 2.

                                                 NR                 2*NR
                                               _______         _______________
                                          b01:|0 1 2 3|  bd01:|0 0 1 1 2 2 3 3|
                                              |4 5 6 7|       |4 4 5 5 6 6 7 7|
                                              |8 9 A B|       |8 8 9 9 A A B B|
                                              |C D E F|       |C C D D E E F F|
                                           k  |   .   |       |       .       |
                                              |   .   |       |       .       |
           a10:                a11:           |   .   |       |       .       |
           ___________________ ________       |_______|       |_______________|
          |0 4 8 C            |`.      |  b11:|       |  bd11:|               |
      MR  |1 5 9 D . . .      |  `.    |      |       |       |               |
          |2 6 A E            |    `.  |   MR |       |       |               |
          |3_7_B_F____________|______`.|      |_______|       |_______________|

                    k             MR

      NOTE: If duplication is disabled (as is commonly the case), then bd01
      and bd11 simply refer to b01 and b11, respectively; that is,
      bd01 == b01 and bd11 == b11.

      Thus, with duplication enabled, the operation takes the form of:

        b11  = alpha * b11 - a10 * bd01;
        b11  = inv(a11) * b11;
        bd11 = b11;  (skipped if duplication is disabled)
        c11  = b11;

      And if duplication is disabled, the operation reduces to:

        b11  = alpha * b11 - a10 * b01;  (Note: Here, b01 == bd01.)
        b11  = inv(a11) * b11;
        c11  = b11;

      A note on optimization:
      - This implementation simply calls the gemm micro-kernel and then the
        trsm micro-kernel. Assuming the gemm micro-kernel has already been
        optimized, there are two ways to optimize the fused gemmtrsm kernel:
        (1) Optimize only the trsm kernel and keep calling the gemm and trsm
            micro-kernels in sequence, as this template does.
        (2) Fuse the gemm micro-kernel with the trsm micro-kernel by inlining
            both into this gemmtrsm function.
        The latter option is more labor-intensive, but also more likely to
        yield higher performance, because it allows redundant memory
        operations on the packed MR x NR block B11 to be eliminated.

      For more info, please refer to the BLIS website and/or contact the
      blis-devel mailing list.

      -FGVZ
    */

    /* Scalar -1, passed to gemm as the multiplier of the a10 * bd01 product. */
    double* restrict neg_one = bli_dm1;

    /* b11 is addressed as a row-stored block: unit column stride and a row
       stride of bli_dpacknr. */
    const inc_t      b11_cs  = 1;
    const inc_t      b11_rs  = bli_dpacknr;

    /* Reminder: with duplication disabled, bd01 == b01 and bd11 == b11. */

    /* gemm step:  b11 = alpha * b11 - a10 * bd01;
       The caller's alpha takes the role of beta in the gemm call. */
    bli_dgemm_opt_mxn( k, neg_one, a10, bd01, alpha,
                       b11, b11_rs, b11_cs,
                       a_next, b_next );

    /* trsm step:  b11  = inv(a11) * b11;
                   bd11 = b11;  (skipped if duplication is disabled)
                   c11  = b11; */
    bli_dtrsm_l_opt_mxn( a11, b11, bd11, c11, rs_c, cs_c );
}
/*
   Single-precision complex fused gemmtrsm_l micro-kernel (template).

   Carries out the same two steps as bli_dgemmtrsm_l_opt_mxn(), whose
   comments describe the operands in detail, using the scomplex kernels:

     1. bli_cgemm_opt_mxn():    b11 := alpha * b11 - a10 * bd01
     2. bli_ctrsm_l_opt_mxn():  lower-triangular solve with a11 on b11,
                                also given bd11 and c11 (rs_c, cs_c)
*/
void bli_cgemmtrsm_l_opt_mxn(
                              dim_t              k,
                              scomplex* restrict alpha,
                              scomplex* restrict a10,
                              scomplex* restrict a11,
                              scomplex* restrict bd01,
                              scomplex* restrict bd11,
                              scomplex* restrict b11,
                              scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                              scomplex* restrict a_next,
                              scomplex* restrict b_next
                            )
{
    /* Scalar -1, passed to gemm as the multiplier of the a10 * bd01 product. */
    scomplex* restrict neg_one = bli_cm1;

    /* b11 is addressed as a row-stored block: unit column stride and a row
       stride of bli_cpacknr. */
    const inc_t        b11_cs  = 1;
    const inc_t        b11_rs  = bli_cpacknr;

    /* gemm: the caller's alpha takes the role of beta here, so it scales
       b11. */
    bli_cgemm_opt_mxn( k, neg_one, a10, bd01, alpha,
                       b11, b11_rs, b11_cs,
                       a_next, b_next );

    /* trsm: solve against the triangular block a11. */
    bli_ctrsm_l_opt_mxn( a11, b11, bd11, c11, rs_c, cs_c );
}
/*
   Double-precision complex fused gemmtrsm_l micro-kernel (template).

   Carries out the same two steps as bli_dgemmtrsm_l_opt_mxn(), whose
   comments describe the operands in detail, using the dcomplex kernels:

     1. bli_zgemm_opt_mxn():    b11 := alpha * b11 - a10 * bd01
     2. bli_ztrsm_l_opt_mxn():  lower-triangular solve with a11 on b11,
                                also given bd11 and c11 (rs_c, cs_c)
*/
void bli_zgemmtrsm_l_opt_mxn(
                              dim_t              k,
                              dcomplex* restrict alpha,
                              dcomplex* restrict a10,
                              dcomplex* restrict a11,
                              dcomplex* restrict bd01,
                              dcomplex* restrict bd11,
                              dcomplex* restrict b11,
                              dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                              dcomplex* restrict a_next,
                              dcomplex* restrict b_next
                            )
{
    /* Scalar -1, passed to gemm as the multiplier of the a10 * bd01 product. */
    dcomplex* restrict neg_one = bli_zm1;

    /* b11 is addressed as a row-stored block: unit column stride and a row
       stride of bli_zpacknr. */
    const inc_t        b11_cs  = 1;
    const inc_t        b11_rs  = bli_zpacknr;

    /* gemm: the caller's alpha takes the role of beta here, so it scales
       b11. */
    bli_zgemm_opt_mxn( k, neg_one, a10, bd01, alpha,
                       b11, b11_rs, b11_cs,
                       a_next, b_next );

    /* trsm: solve against the triangular block a11. */
    bli_ztrsm_l_opt_mxn( a11, b11, bd11, c11, rs_c, cs_c );
}

View File

@@ -0,0 +1,56 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype micro-kernel interfaces.
//
// GENTPROT is (re)defined below as a template for one gemmtrsm_l
// micro-kernel prototype; the #undef discards any definition left over from
// an earlier header. 'ctype' is the element type of every operand pointer,
// and PASTEMAC(ch,varname) supplies the function name (presumably
// bli_<ch><varname>, e.g. bli_dgemmtrsm_l_opt_mxn -- confirm against the
// definition of PASTEMAC). INSERT_GENTPROT_BASIC then instantiates the
// template for 'gemmtrsm_l_opt_mxn' (presumably once for each of the s, d,
// c and z datatypes -- its definition is not part of this file).
//
// Arguments of each generated prototype:
//   k                 - number of columns of A10 and rows of B01.
//   alpha             - address of the scalar applied to B11.
//   a10, a11          - addresses of the MR x k block A10 and of the
//                       MR x MR lower triangular block A11.
//   bd01, bd11        - addresses of B01 and B11 in (possibly duplicated)
//                       form.
//   b11               - address of the current MR x NR block of the packed
//                       micro-panel of B.
//   c11, rs_c, cs_c   - address of the output block C11 and its row and
//                       column strides.
//   a_next, b_next    - addresses of the micro-panels of A and B used by
//                       the next micro-kernel call.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a10, \
ctype* restrict a11, \
ctype* restrict bd01, \
ctype* restrict bd11, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_mxn )

View File

@@ -0,0 +1,302 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Single-precision real fused gemmtrsm_u micro-kernel (template).

   Carries out the same two steps as bli_dgemmtrsm_u_opt_mxn(), whose
   comments describe the operands in detail, using the float kernels:

     1. bli_sgemm_opt_mxn():    b11 := alpha * b11 - a12 * bd21
     2. bli_strsm_u_opt_mxn():  upper-triangular solve with a11 on b11,
                                also given bd11 and c11 (rs_c, cs_c)
*/
void bli_sgemmtrsm_u_opt_mxn(
                              dim_t           k,
                              float* restrict alpha,
                              float* restrict a12,
                              float* restrict a11,
                              float* restrict bd21,
                              float* restrict bd11,
                              float* restrict b11,
                              float* restrict c11, inc_t rs_c, inc_t cs_c,
                              float* restrict a_next,
                              float* restrict b_next
                            )
{
    /* Scalar -1, passed to gemm as the multiplier of the a12 * bd21 product. */
    float* restrict neg_one = bli_sm1;

    /* b11 is addressed as a row-stored block: unit column stride and a row
       stride of bli_spacknr. */
    const inc_t     b11_cs  = 1;
    const inc_t     b11_rs  = bli_spacknr;

    /* gemm: the caller's alpha takes the role of beta here, so it scales
       b11. */
    bli_sgemm_opt_mxn( k, neg_one, a12, bd21, alpha,
                       b11, b11_rs, b11_cs,
                       a_next, b_next );

    /* trsm: solve against the triangular block a11. */
    bli_strsm_u_opt_mxn( a11, b11, bd11, c11, rs_c, cs_c );
}
void bli_dgemmtrsm_u_opt_mxn(
                              dim_t            k,
                              double* restrict alpha,
                              double* restrict a12,
                              double* restrict a11,
                              double* restrict bd21,
                              double* restrict bd11,
                              double* restrict b11,
                              double* restrict c11, inc_t rs_c, inc_t cs_c,
                              double* restrict a_next,
                              double* restrict b_next
                            )
{
    /*
      Template gemmtrsm_u micro-kernel implementation

      This is the double-precision real template for the micro-kernel that
      fuses a gemm update with a trsm_u subproblem. It carries out, in this
      order:

        B11 := alpha * B11 - A12 * B21    (gemm)
        B11 := inv(A11) * B11             (trsm)

      where B11 is MR x NR, A12 is MR x k, B21 is k x NR, A11 is MR x MR and
      upper triangular, alpha is a scalar, and inv() denotes the matrix
      inverse.

      NOTE: This gemmtrsm micro-kernel supports element "duplication", a
      feature that is enabled or disabled in bli_kernel.h, which also defines
      the duplication factors. Duplication is NOT commonly used, and most
      developers may assume that it is disabled.

      Parameters:

      - k:      The number of columns of A12 and rows of B21.
      - alpha:  The address of a scalar to be applied to B11.
      - a12:    The address of A12, the MR x k subpartition of the packed
                (column-stored) micro-panel of A that lies to the right of
                the MR x MR upper triangular block.
      - a11:    The address of A11, the MR x MR upper triangular block within
                the packed micro-panel of A that lies to the left of A12. By
                the time this kernel is called, the diagonal of A11 has
                already been inverted and the strictly lower triangle
                contains zeros.
      - bd21:   The address of B21, the k x NR subpartition situated below
                the current MR x NR block B11. bd21 is row-stored. With
                duplication enabled, each element occurs d times, which
                effectively increases the dimension to k x d*NR. With
                duplication disabled, bd21 is simply the address of the
                bottom part of the current packed (row-stored) micro-panel of
                B (labeled b21 in the diagram below).
      - bd11:   The address of B11, the MR x NR subpartition situated above
                B21. With duplication enabled, each element occurs d times,
                which effectively increases the dimension to MR x d*NR. With
                duplication disabled, bd11 is simply the address of the
                current MR x NR block within the packed (row-stored)
                micro-panel of B.
      - b11:    The address of the current MR x NR block within the packed
                micro-panel of B. It exists in duplicated form as bd11. With
                duplication disabled, b11 and bd11 refer to the same MR x NR
                block within the packed (row-stored) micro-panel of B.
      - c11:    The address of C11, the MR x NR block of the output matrix
                (ie: the matrix provided by the user to the highest-level
                trsm API call). C11 corresponds to the elements that exist in
                packed form in B11, and is stored according to rs_c and cs_c.
      - rs_c:   The row stride of C11 (ie: the distance to the next row of
                C11, in units of matrix elements).
      - cs_c:   The column stride of C11 (ie: the distance to the next column
                of C11, in units of matrix elements).
      - a_next: The address of the packed micro-panel of A that will be used
                the next time the gemmtrsm micro-kernel is called.
      - b_next: The address of the packed micro-panel of B that will be used
                the next time the gemmtrsm micro-kernel is called.

      The diagram below shows the packed micro-panel operands and how the
      elements of each would be stored when MR == NR == 4. (The hex digits
      give the order of the elements in memory.) It also shows a B
      duplication buffer (bd) holding a copy of the packed micro-panel of B
      with a duplication factor of 2.

           a11:     a12:                         NR                 2*NR
           ________ ___________________        _______         _______________
          |`.      |0 4 8              |  b11:|0 1 2 3|  bd11:|0 0 1 1 2 2 3 3|
      MR  |  `.    |1 5 9 . . .        |      |4 5 6 7|       |4 4 5 5 6 6 7 7|
          |    `.  |2 6 A              |   MR |8 9 A B|       |8 8 9 9 A A B B|
          |______`.|3_7_B______________|      |___.___|       |_______._______|
                                          b21:|   .   |  bd21:|       .       |
              MR             k                |   .   |       |       .       |
                                           k  |       |       |               |
                                              |       |       |               |
                                              |       |       |               |
                                              |_______|       |_______________|

      NOTE: If duplication is disabled (as is commonly the case), then bd21
      and bd11 simply refer to b21 and b11, respectively; that is,
      bd21 == b21 and bd11 == b11.
      ALSO: Storage digits are shown starting with a12 to avoid obscuring the
      triangular structure of a11.

      Thus, with duplication enabled, the operation takes the form of:

        b11  = alpha * b11 - a12 * bd21;
        b11  = inv(a11) * b11;
        bd11 = b11;  (skipped if duplication is disabled)
        c11  = b11;

      And if duplication is disabled, the operation reduces to:

        b11  = alpha * b11 - a12 * b21;  (Note: Here, b21 == bd21.)
        b11  = inv(a11) * b11;
        c11  = b11;

      A note on optimization:
      - This implementation simply calls the gemm micro-kernel and then the
        trsm micro-kernel. Assuming the gemm micro-kernel has already been
        optimized, there are two ways to optimize the fused gemmtrsm kernel:
        (1) Optimize only the trsm kernel and keep calling the gemm and trsm
            micro-kernels in sequence, as this template does.
        (2) Fuse the gemm micro-kernel with the trsm micro-kernel by inlining
            both into this gemmtrsm function.
        The latter option is more labor-intensive, but also more likely to
        yield higher performance, because it allows redundant memory
        operations on the packed MR x NR block B11 to be eliminated.

      For more info, please refer to the BLIS website and/or contact the
      blis-devel mailing list.
    */

    /* Scalar -1, passed to gemm as the multiplier of the a12 * bd21 product. */
    double* restrict neg_one = bli_dm1;

    /* b11 is addressed as a row-stored block: unit column stride and a row
       stride of bli_dpacknr. */
    const inc_t      b11_cs  = 1;
    const inc_t      b11_rs  = bli_dpacknr;

    /* Reminder: with duplication disabled, bd21 == b21 and bd11 == b11. */

    /* gemm step:  b11 = alpha * b11 - a12 * bd21;
       The caller's alpha takes the role of beta in the gemm call. */
    bli_dgemm_opt_mxn( k, neg_one, a12, bd21, alpha,
                       b11, b11_rs, b11_cs,
                       a_next, b_next );

    /* trsm step:  b11  = inv(a11) * b11;
                   bd11 = b11;  (skipped if duplication is disabled)
                   c11  = b11; */
    bli_dtrsm_u_opt_mxn( a11, b11, bd11, c11, rs_c, cs_c );
}
/*
   Single-precision complex fused gemmtrsm_u micro-kernel (template).

   Carries out the same two steps as bli_dgemmtrsm_u_opt_mxn(), whose
   comments describe the operands in detail, using the scomplex kernels:

     1. bli_cgemm_opt_mxn():    b11 := alpha * b11 - a12 * bd21
     2. bli_ctrsm_u_opt_mxn():  upper-triangular solve with a11 on b11,
                                also given bd11 and c11 (rs_c, cs_c)
*/
void bli_cgemmtrsm_u_opt_mxn(
                              dim_t              k,
                              scomplex* restrict alpha,
                              scomplex* restrict a12,
                              scomplex* restrict a11,
                              scomplex* restrict bd21,
                              scomplex* restrict bd11,
                              scomplex* restrict b11,
                              scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                              scomplex* restrict a_next,
                              scomplex* restrict b_next
                            )
{
    /* Scalar -1, passed to gemm as the multiplier of the a12 * bd21 product. */
    scomplex* restrict neg_one = bli_cm1;

    /* b11 is addressed as a row-stored block: unit column stride and a row
       stride of bli_cpacknr. */
    const inc_t        b11_cs  = 1;
    const inc_t        b11_rs  = bli_cpacknr;

    /* gemm: the caller's alpha takes the role of beta here, so it scales
       b11. */
    bli_cgemm_opt_mxn( k, neg_one, a12, bd21, alpha,
                       b11, b11_rs, b11_cs,
                       a_next, b_next );

    /* trsm: solve against the triangular block a11. */
    bli_ctrsm_u_opt_mxn( a11, b11, bd11, c11, rs_c, cs_c );
}
/*
   Double-precision complex fused gemmtrsm_u micro-kernel (template).

   Carries out the same two steps as bli_dgemmtrsm_u_opt_mxn(), whose
   comments describe the operands in detail, using the dcomplex kernels:

     1. bli_zgemm_opt_mxn():    b11 := alpha * b11 - a12 * bd21
     2. bli_ztrsm_u_opt_mxn():  upper-triangular solve with a11 on b11,
                                also given bd11 and c11 (rs_c, cs_c)
*/
void bli_zgemmtrsm_u_opt_mxn(
                              dim_t              k,
                              dcomplex* restrict alpha,
                              dcomplex* restrict a12,
                              dcomplex* restrict a11,
                              dcomplex* restrict bd21,
                              dcomplex* restrict bd11,
                              dcomplex* restrict b11,
                              dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                              dcomplex* restrict a_next,
                              dcomplex* restrict b_next
                            )
{
    /* Scalar -1, passed to gemm as the multiplier of the a12 * bd21 product. */
    dcomplex* restrict neg_one = bli_zm1;

    /* b11 is addressed as a row-stored block: unit column stride and a row
       stride of bli_zpacknr. */
    const inc_t        b11_cs  = 1;
    const inc_t        b11_rs  = bli_zpacknr;

    /* gemm: the caller's alpha takes the role of beta here, so it scales
       b11. */
    bli_zgemm_opt_mxn( k, neg_one, a12, bd21, alpha,
                       b11, b11_rs, b11_cs,
                       a_next, b_next );

    /* trsm: solve against the triangular block a11. */
    bli_ztrsm_u_opt_mxn( a11, b11, bd11, c11, rs_c, cs_c );
}

View File

@@ -0,0 +1,56 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype micro-kernel interfaces.
//
// GENTPROT is (re)defined below as a template for one gemmtrsm_u
// micro-kernel prototype; the #undef discards any definition left over from
// an earlier header. 'ctype' is the element type of every operand pointer,
// and PASTEMAC(ch,varname) supplies the function name (presumably
// bli_<ch><varname>, e.g. bli_dgemmtrsm_u_opt_mxn -- confirm against the
// definition of PASTEMAC). INSERT_GENTPROT_BASIC then instantiates the
// template for 'gemmtrsm_u_opt_mxn' (presumably once for each of the s, d,
// c and z datatypes -- its definition is not part of this file).
//
// Arguments of each generated prototype:
//   k                 - number of columns of A12 and rows of B21.
//   alpha             - address of the scalar applied to B11.
//   a12, a11          - addresses of the MR x k block A12 and of the
//                       MR x MR upper triangular block A11.
//   bd21, bd11        - addresses of B21 and B11 in (possibly duplicated)
//                       form.
//   b11               - address of the current MR x NR block of the packed
//                       micro-panel of B.
//   c11, rs_c, cs_c   - address of the output block C11 and its row and
//                       column strides.
//   a_next, b_next    - addresses of the micro-panels of A and B used by
//                       the next micro-kernel call.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a12, \
ctype* restrict a11, \
ctype* restrict bd21, \
ctype* restrict bd11, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemmtrsm_u_opt_mxn )

View File

@@ -0,0 +1,218 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Single-precision real trsm_l micro-kernel for the template configuration.

   No float-specific code is provided here: a, b, bd and c (with rs_c and
   cs_c) are handed, unchanged and in the same order, to the reference
   micro-kernel bli_strsm_l_ref_mxn().
*/
void bli_strsm_l_opt_mxn(
                          float* restrict a,
                          float* restrict b,
                          float* restrict bd,
                          float* restrict c, inc_t rs_c, inc_t cs_c
                        )
{
    /* Delegate the whole computation to the reference kernel. */
    bli_strsm_l_ref_mxn( a, b, bd, c, rs_c, cs_c );
}
void bli_dtrsm_l_opt_mxn(
                          double* restrict a,
                          double* restrict b,
                          double* restrict bd,
                          double* restrict c, inc_t rs_c, inc_t cs_c
                        )
{
    /*
      Template trsm_l micro-kernel implementation

      This function contains a template implementation for a double-precision
      real trsm micro-kernel, coded in C, which can serve as the starting
      point for one to write an optimized micro-kernel on an arbitrary
      architecture. (We show a template implementation for only
      double-precision real because the templates for the other three
      floating-point types would be nearly identical.)

      This micro-kernel performs a triangular solve with NR right-hand sides:

        C := inv(A) * B

      where A is MR x MR and lower triangular, B is MR x NR, and C is MR x NR.
      The solution overwrites B in place and is also copied to C.

      NOTE: Here, this trsm micro-kernel supports element "duplication", a
      feature that is enabled or disabled in bli_kernel.h. Duplication factors
      are also defined in the aforementioned header. Duplication is NOT
      commonly used and most developers may assume it is disabled.

      Parameters:

      - a:      The address of A, which is the MR x MR lower triangular block
                within the packed (column-stored) micro-panel of A. By the
                time this trsm micro-kernel is called, the diagonal of A has
                already been inverted and the strictly upper triangle
                contains zeros.
      - b:      The address of B, which is the MR x NR subpartition of the
                current packed (row-stored) micro-panel of B.
      - bd:     The address of the duplicated copy of B. If duplication is
                disabled, then bd == b. This template never reads or writes
                bd (see the note on duplication below).
      - c:      The address of C, which is the MR x NR block of the output
                matrix (ie: the matrix provided by the user to the
                highest-level trsm API call). C corresponds to the elements
                that exist in packed form in B, and is stored according to
                rs_c and cs_c.
      - rs_c:   The row stride of C (ie: the distance to the next row of C,
                in units of matrix elements).
      - cs_c:   The column stride of C (ie: the distance to the next column
                of C, in units of matrix elements).

      Please see the comments in bli_gemmtrsm_l_opt_mxn.c for a diagram of the
      trsm operation and where it fits in with the preceding gemm subproblem.

      Here are a few things to consider:
      - While all three loops are exposed in this template micro-kernel, all
        three loops typically disappear in an optimized code because they are
        fully unrolled.
      - Note that the diagonal of the triangular matrix A contains the INVERSE
        of those elements. This is done during packing so that we can avoid
        expensive division instructions within this micro-kernel.
      - This micro-kernel assumes duplication is NOT enabled. If it IS
        enabled, then the result must be written to three places: the
        sub-block within the duplicated copy of B, the sub-block of the
        original packed micro-panel of B, and the sub-block of the output
        matrix C. When duplication is not used, the micro-kernel should
        update only the latter two locations.
      - Every restrict-qualified local pointer below is declared in the loop
        body that initializes it, never at function scope. C leaves the
        behavior undefined when one restrict pointer is assigned from another
        that belongs to the same block, and row i of B, which is written
        through chi11 in iteration i, is read through chi01 in later
        iterations. Giving each pointer the lifetime of a single loop
        iteration keeps those accesses within the rules for restrict.

      For more info, please refer to the BLIS website and/or contact the
      blis-devel mailing list.

      -FGVZ
    */
    const dim_t m    = bli_dmr;
    const dim_t n    = bli_dnr;

    const inc_t rs_a = 1;
    const inc_t cs_a = bli_dpackmr;

    const inc_t rs_b = bli_dpacknr;
    const inc_t cs_b = 1;

    dim_t       i, j, l;

    /* Forward substitution: row i of the solution depends only on the rows
       above it, which have already been computed and stored in b. */
    for ( i = 0; i < m; ++i )
    {
        /* Number of already-solved rows that contribute to row i. */
        const dim_t      n_behind = i;

        /* alpha11: diagonal element (i,i) of a (stored inverted).
           a10t:    first element of row i of a, left of the diagonal.
           X0:      the rows of b above row i (already solved).
           x1:      row i of b (solved in this iteration). */
        double* restrict alpha11  = a + (i  )*rs_a + (i  )*cs_a;
        double* restrict a10t     = a + (i  )*rs_a + (0  )*cs_a;
        double* restrict X0       = b + (0  )*rs_b + (0  )*cs_b;
        double* restrict x1       = b + (i  )*rs_b + (0  )*cs_b;

        /* x1 = x1 - a10t * X0; */
        /* x1 = x1 / alpha11; */
        for ( j = 0; j < n; ++j )
        {
            /* x01:     top of column j of X0.
               chi11:   element (i,j) of b.
               gamma11: element (i,j) of c. */
            double* restrict x01     = X0 + (0  )*rs_b + (j  )*cs_b;
            double* restrict chi11   = x1 + (0  )*rs_b + (j  )*cs_b;
            double* restrict gamma11 = c  + (i  )*rs_c + (j  )*cs_c;
            double           rho11;

            /* chi11 = chi11 - a10t * x01; */
            bli_dset0s( rho11 );
            for ( l = 0; l < n_behind; ++l )
            {
                double* restrict alpha10 = a10t + (l  )*cs_a;
                double* restrict chi01   = x01  + (l  )*rs_b;

                bli_daxpys( *alpha10, *chi01, rho11 );
            }
            bli_dsubs( rho11, *chi11 );

            /* chi11 = chi11 / alpha11; */
            /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
               of alpha11, so we can multiply rather than divide. We store
               the inverse of alpha11 intentionally to avoid expensive
               division instructions within the micro-kernel. */
            bli_dscals( *alpha11, *chi11 );

            /* Output final result to matrix C. */
            bli_dcopys( *chi11, *gamma11 );
        }
    }
}
void bli_ctrsm_l_opt_mxn( scomplex* restrict a,
                          scomplex* restrict b,
                          scomplex* restrict bd,
                          scomplex* restrict c, inc_t rs_c, inc_t cs_c )
{
	/* This template ships no hand-written single-precision complex
	   kernel: every argument is forwarded, unchanged and in the same
	   order, to the reference trsm_l micro-kernel. */
	bli_ctrsm_l_ref_mxn( a, b, bd, c, rs_c, cs_c );
}
void bli_ztrsm_l_opt_mxn( dcomplex* restrict a,
                          dcomplex* restrict b,
                          dcomplex* restrict bd,
                          dcomplex* restrict c, inc_t rs_c, inc_t cs_c )
{
	/* This template ships no hand-written double-precision complex
	   kernel: every argument is forwarded, unchanged and in the same
	   order, to the reference trsm_l micro-kernel. */
	bli_ztrsm_l_ref_mxn( a, b, bd, c, rs_c, cs_c );
}

View File

@@ -0,0 +1,50 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype micro-kernel interfaces.
//
// GENTPROT is redefined here as a prototype generator: given an element
// type (ctype), a datatype character (ch), and a kernel base name
// (varname), it declares a void function whose name is formed by
// PASTEMAC(ch,varname) and whose parameters are:
//   a, b, bd   - restrict-qualified pointers to ctype elements
//   c          - restrict-qualified pointer to ctype elements, followed by
//                its row stride (rs_c) and column stride (cs_c)
// This matches the parameter list of the trsm_l_opt_mxn definitions.
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere; presumably it
// invokes GENTPROT once per basic datatype (s, d, c, z) -- confirm.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( trsm_l_opt_mxn )

View File

@@ -0,0 +1,218 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_strsm_u_opt_mxn( float* restrict a,
                          float* restrict b,
                          float* restrict bd,
                          float* restrict c, inc_t rs_c, inc_t cs_c )
{
	/* This template ships no hand-written single-precision real kernel:
	   every argument is forwarded, unchanged and in the same order, to
	   the reference trsm_u micro-kernel. */
	bli_strsm_u_ref_mxn( a, b, bd, c, rs_c, cs_c );
}
void bli_dtrsm_u_opt_mxn(
                          double* restrict a,
                          double* restrict b,
                          double* restrict bd,
                          double* restrict c, inc_t rs_c, inc_t cs_c
                        )
{
/*
  Template trsm_u micro-kernel (double-precision real)

  A portable C rendition of the upper-triangular trsm micro-kernel, meant
  to be used as the starting point when writing an optimized kernel for a
  particular architecture. Only the double-precision real case is spelled
  out; templates for the remaining floating-point types would look nearly
  the same.

  Operation: a triangular solve with NR right-hand sides,

    C := inv(A) * B

  with A being MR x MR and upper triangular, and B and C both MR x NR.

  Arguments:

  - a:    Address of the MR x MR upper triangular block of A inside the
          packed, column-stored micro-panel of A. When this kernel runs,
          the diagonal entries have already been replaced by their
          inverses and the strictly lower triangle holds zeros.
  - b:    Address of the MR x NR subpartition of the current packed,
          row-stored micro-panel of B. The solution is written back here.
  - bd:   Address of the duplicated copy of B. With duplication disabled,
          bd == b. This template does not reference bd.
  - c:    Address of the MR x NR block of the output matrix, i.e. the
          matrix the user handed to the top-level trsm API. It holds the
          same elements that appear in packed form in B and is addressed
          through rs_c and cs_c.
  - rs_c: Row stride of C: distance, in matrix elements, between
          consecutive rows.
  - cs_c: Column stride of C: distance, in matrix elements, between
          consecutive columns.

  A picture of this operation, and of how it follows the preceding gemm
  subproblem, is given in the comments of bli_gemmtrsm_u_opt_mxn.c.

  Notes for implementors:

  - Element "duplication" is a feature switched on or off in bli_kernel.h
    (which also defines the duplication factors). It is rarely used and
    most developers may assume it is off. This template assumes it is OFF
    and therefore stores each result in two places: the packed micro-panel
    of B and the output matrix C. A kernel supporting duplication must
    additionally store the result into the duplicated copy of B.
  - The three loops below are written out for clarity. Optimized kernels
    typically unroll all of them completely.
  - Because the packing step stores 1/alpha11 on the diagonal of A, the
    kernel multiplies where the mathematics calls for a division, avoiding
    expensive division instructions.

  For more information, consult the BLIS website and/or the blis-devel
  mailing list.

  -FGVZ
*/
	const dim_t mr   = bli_dmr;
	const dim_t nr   = bli_dnr;

	/* Packed A is traversed with unit row stride; packed B with unit
	   column stride. */
	const inc_t rs_a = 1;
	const inc_t cs_a = bli_dpackmr;
	const inc_t rs_b = bli_dpacknr;
	const inc_t cs_b = 1;

	dim_t       row, col, k;

	/* Back substitution: process the rows of A from the bottom (mr-1) up
	   to the top (0). The "row-- > 0" form terminates correctly whether
	   dim_t is a signed or an unsigned type. */
	for ( row = mr; row-- > 0; )
	{
		/* Number of rows below the current one, all already solved. */
		const dim_t n_solved = mr - row - 1;

		/* Inverted diagonal element of the current row of A, and the
		   part of that row lying to the right of the diagonal. */
		double* restrict inv_alpha11 = a + (row  )*rs_a + (row  )*cs_a;
		double* restrict a12t        = a + (row  )*rs_a + (row+1)*cs_a;

		for ( col = 0; col < nr; ++col )
		{
			/* chi11:   element of B being solved for.
			   x21:     already-solved elements below it, same column.
			   gamma11: matching element of the output matrix C. */
			double* restrict chi11   = b + (row  )*rs_b + (col  )*cs_b;
			double* restrict x21     = b + (row+1)*rs_b + (col  )*cs_b;
			double* restrict gamma11 = c + (row  )*rs_c + (col  )*cs_c;
			double           dot;

			/* dot = a12t * x21; */
			bli_dset0s( dot );

			for ( k = 0; k < n_solved; ++k )
			{
				double* restrict alpha12 = a12t + (k  )*cs_a;
				double* restrict chi21   = x21  + (k  )*rs_b;

				bli_daxpys( *alpha12, *chi21, dot );
			}

			/* chi11 = chi11 - a12t * x21; */
			bli_dsubs( dot, *chi11 );

			/* chi11 = chi11 / alpha11, carried out as a multiplication
			   because the stored diagonal value is 1.0/alpha11. */
			bli_dscals( *inv_alpha11, *chi11 );

			/* Copy the finished element out to C. */
			bli_dcopys( *chi11, *gamma11 );
		}
	}
}
void bli_ctrsm_u_opt_mxn( scomplex* restrict a,
                          scomplex* restrict b,
                          scomplex* restrict bd,
                          scomplex* restrict c, inc_t rs_c, inc_t cs_c )
{
	/* This template ships no hand-written single-precision complex
	   kernel: every argument is forwarded, unchanged and in the same
	   order, to the reference trsm_u micro-kernel. */
	bli_ctrsm_u_ref_mxn( a, b, bd, c, rs_c, cs_c );
}
void bli_ztrsm_u_opt_mxn( dcomplex* restrict a,
                          dcomplex* restrict b,
                          dcomplex* restrict bd,
                          dcomplex* restrict c, inc_t rs_c, inc_t cs_c )
{
	/* This template ships no hand-written double-precision complex
	   kernel: every argument is forwarded, unchanged and in the same
	   order, to the reference trsm_u micro-kernel. */
	bli_ztrsm_u_ref_mxn( a, b, bd, c, rs_c, cs_c );
}

View File

@@ -0,0 +1,50 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype micro-kernel interfaces.
//
// GENTPROT is redefined here as a prototype generator: given an element
// type (ctype), a datatype character (ch), and a kernel base name
// (varname), it declares a void function whose name is formed by
// PASTEMAC(ch,varname) and whose parameters are:
//   a, b, bd   - restrict-qualified pointers to ctype elements
//   c          - restrict-qualified pointer to ctype elements, followed by
//                its row stride (rs_c) and column stride (cs_c)
// This matches the parameter list of the trsm_u_opt_mxn definitions.
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere; presumably it
// invokes GENTPROT once per basic datatype (s, d, c, z) -- confirm.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( trsm_u_opt_mxn )

View File

@@ -0,0 +1,107 @@
#!/bin/bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2013, The University of Texas
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name of The University of Texas nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Only include this block of code once.
# (The guard variable is set immediately below, so a second inclusion of
# this file skips everything up to the closing endif.)
ifndef MAKE_DEFS_MK_INCLUDED
MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
# NOTE(review): these yes/no switches are not consumed in this file;
# presumably the including Makefile tests them -- confirm.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
CC := gcc
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
# Language standard; the text after '#' lists optional flags that are
# currently disabled.
CMISCFLAGS := -std=c99 # -fopenmp -pg
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -O2
# Optimization flags used only in CFLAGS_KERNELS; here they are simply the
# same as COPTFLAGS.
CKOPTFLAGS := $(COPTFLAGS)
# No vectorization flags are enabled; the text after '#' shows examples.
CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
#   CFLAGS         - uses COPTFLAGS.
#   CFLAGS_KERNELS - identical, except CKOPTFLAGS replaces COPTFLAGS.
#   CFLAGS_NOOPT   - omits both the optimization and vectorization flags.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
# --- Determine the linker and related flags ---
# Linking is done through the C compiler driver, with no extra flags.
LINKER := $(CC)
LDFLAGS :=
# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
endif

View File

@@ -34,75 +34,9 @@
#include "blis.h"
/*
#define FUNCPTR_T axpy2v_fp
typedef void (*FUNCPTR_T)(
conj_t conjx,
conj_t conjy,
dim_t n,
void* alpha,
void* x, inc_t incx,
void* y, inc_t incy
);
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY3_ALL(ftypes,axpy2v_unb_var1);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY3_EXT(ftypes,axpy2v_unb_var1);
#else
static FUNCPTR_T GENARRAY3_MIN(ftypes,axpy2v_unb_var1);
#endif
#endif
void bli_axpy2v_unb_var1( obj_t* alpha,
obj_t* x,
obj_t* y )
{
num_t dt_x = bli_obj_datatype( *x );
num_t dt_y = bli_obj_datatype( *y );
conj_t conjx = bli_obj_conj_status( *x );
conj_t conjy = bli_obj_conj_status( *y );
dim_t n = bli_obj_vector_dim( *x );
inc_t inc_x = bli_obj_vector_inc( *x );
void* buf_x = bli_obj_buffer_at_off( *x );
inc_t inc_y = bli_obj_vector_inc( *y );
void* buf_y = bli_obj_buffer_at_off( *y );
num_t dt_alpha;
void* buf_alpha;
FUNCPTR_T f;
// If alpha is a scalar constant, use dt_x to extract the address of the
// corresponding constant value; otherwise, use the datatype encoded
// within the alpha object and extract the buffer at the alpha offset.
bli_set_scalar_dt_buffer( alpha, dt_x, dt_alpha, buf_alpha );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_alpha][dt_x][dt_y];
// Invoke the function.
f( conjx,
conjy,
n,
buf_alpha,
buf_x, inc_x,
buf_y, inc_y );
}
*/
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, opname, varname ) \
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
\
void PASTEMAC3(chx,chy,chz,varname)( \
conj_t conjx, \
@@ -121,27 +55,27 @@ void PASTEMAC3(chx,chy,chz,varname)( \
ctype_y* y_cast = y; \
ctype_z* z_cast = z; \
\
PASTEMAC3(chxy,chx,chz,axpyv)( conjx, \
n, \
alpha1_cast, \
x_cast, incx, \
z_cast, incz ); \
PASTEMAC3(chxy,chy,chz,axpyv)( conjy, \
n, \
alpha2_cast, \
y_cast, incy, \
z_cast, incz ); \
PASTEMAC3(chxy,chx,chz,kername)( conjx, \
n, \
alpha1_cast, \
x_cast, incx, \
z_cast, incz ); \
PASTEMAC3(chxy,chy,chz,kername)( conjy, \
n, \
alpha2_cast, \
y_cast, incy, \
z_cast, incz ); \
}
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
INSERT_GENTFUNC3U12_BASIC( axpy2v, axpy2v_unb_var1 )
INSERT_GENTFUNC3U12_BASIC( axpy2v_unb_var1, AXPYV_KERNEL )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( axpy2v, axpy2v_unb_var1 )
INSERT_GENTFUNC3U12_MIX_D( axpy2v_unb_var1, AXPYV_KERNEL )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( axpy2v, axpy2v_unb_var1 )
INSERT_GENTFUNC3U12_MIX_P( axpy2v_unb_var1, AXPYV_KERNEL )
#endif

View File

@@ -32,12 +32,6 @@
*/
/*
void bli_axpy2v_unb_var1( obj_t* alpha,
obj_t* x,
obj_t* y );
*/
#undef GENTPROT3
#define GENTPROT3( ctype_x, ctype_y, ctype_z, chx, chy, chz, varname ) \

View File

@@ -45,7 +45,7 @@ void PASTEMAC(ch,opname)( \
conj_t conja, \
conj_t conjx, \
dim_t m, \
dim_t n, \
dim_t b_n, \
ctype* alpha, \
ctype* a, inc_t inca, inc_t lda, \
ctype* x, inc_t incx, \
@@ -55,7 +55,7 @@ void PASTEMAC(ch,opname)( \
PASTEMAC3(ch,ch,ch,varname)( conja, \
conjx, \
m, \
n, \
b_n, \
alpha, \
a, inca, lda, \
x, incx, \
@@ -75,7 +75,7 @@ void PASTEMAC3(cha,chx,chy,opname)( \
conj_t conja, \
conj_t conjx, \
dim_t m, \
dim_t n, \
dim_t b_n, \
ctype_ax* alpha, \
ctype_a* a, inc_t inca, inc_t lda, \
ctype_x* x, inc_t incx, \
@@ -85,7 +85,7 @@ void PASTEMAC3(cha,chx,chy,opname)( \
PASTEMAC3(cha,chx,chy,varname)( conja, \
conjx, \
m, \
n, \
b_n, \
alpha, \
a, inca, lda, \
x, incx, \

View File

@@ -35,24 +35,6 @@
#include "bli_axpyf_unb_var1.h"
//
// Define fusing factors (if they are not already defined by the user
// in bli_kernel.h).
//
#ifndef bli_saxpyf_fuse_fac
#define bli_saxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_S
#endif
#ifndef bli_daxpyf_fuse_fac
#define bli_daxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_D
#endif
#ifndef bli_caxpyf_fuse_fac
#define bli_caxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_C
#endif
#ifndef bli_zaxpyf_fuse_fac
#define bli_zaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_Z
#endif
//
// Prototype BLAS-like interfaces with homogeneous-typed operands.
//
@@ -63,7 +45,7 @@ void PASTEMAC(ch,opname)( \
conj_t conja, \
conj_t conjx, \
dim_t m, \
dim_t n, \
dim_t b_n, \
ctype* alpha, \
ctype* a, inc_t inca, inc_t lda, \
ctype* x, inc_t incx, \
@@ -83,7 +65,7 @@ void PASTEMAC3(cha,chx,chy,opname)( \
conj_t conja, \
conj_t conjx, \
dim_t m, \
dim_t n, \
dim_t b_n, \
ctype_ax* alpha, \
ctype_a* a, inc_t inca, inc_t lda, \
ctype_x* x, inc_t incx, \

View File

@@ -34,71 +34,9 @@
#include "blis.h"
/*
#define FUNCPTR_T axpyf_fp
typedef void (*FUNCPTR_T)(
conj_t conjx,
dim_t n,
void* alpha,
void* x, inc_t incx,
void* y, inc_t incy
);
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyf_unb_var1);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyf_unb_var1);
#else
static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyf_unb_var1);
#endif
#endif
void bli_axpyf_unb_var1( obj_t* alpha,
obj_t* x,
obj_t* y )
{
num_t dt_x = bli_obj_datatype( *x );
num_t dt_y = bli_obj_datatype( *y );
conj_t conjx = bli_obj_conj_status( *x );
dim_t n = bli_obj_vector_dim( *x );
inc_t inc_x = bli_obj_vector_inc( *x );
void* buf_x = bli_obj_buffer_at_off( *x );
inc_t inc_y = bli_obj_vector_inc( *y );
void* buf_y = bli_obj_buffer_at_off( *y );
num_t dt_alpha;
void* buf_alpha;
FUNCPTR_T f;
// If alpha is a scalar constant, use dt_x to extract the address of the
// corresponding constant value; otherwise, use the datatype encoded
// within the alpha object and extract the buffer at the alpha offset.
bli_set_scalar_dt_buffer( alpha, dt_x, dt_alpha, buf_alpha );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_alpha][dt_x][dt_y];
// Invoke the function.
f( conjx,
n,
buf_alpha,
buf_x, inc_x,
buf_y, inc_y );
}
*/
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, opname, varname ) \
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
\
void PASTEMAC3(cha,chx,chy,varname)( \
conj_t conja, \
@@ -130,23 +68,23 @@ void PASTEMAC3(cha,chx,chy,varname)( \
PASTEMAC2(chx,chax,copycjs)( conjx, *chi1, alpha_chi1 ); \
PASTEMAC2(chax,chax,scals)( *alpha_cast, alpha_chi1 ); \
\
PASTEMAC3(chax,cha,chy,axpyv)( conja, \
m, \
&alpha_chi1, \
a1, inca, \
y1, incy ); \
PASTEMAC3(chax,cha,chy,kername)( conja, \
m, \
&alpha_chi1, \
a1, inca, \
y1, incy ); \
} \
}
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
INSERT_GENTFUNC3U12_BASIC( axpyf, axpyf_unb_var1 )
INSERT_GENTFUNC3U12_BASIC( axpyf_unb_var1, AXPYV_KERNEL )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( axpyf, axpyf_unb_var1 )
INSERT_GENTFUNC3U12_MIX_D( axpyf_unb_var1, AXPYV_KERNEL )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( axpyf, axpyf_unb_var1 )
INSERT_GENTFUNC3U12_MIX_P( axpyf_unb_var1, AXPYV_KERNEL )
#endif

View File

@@ -32,12 +32,6 @@
*/
/*
void bli_axpyf_unb_var1( obj_t* alpha,
obj_t* x,
obj_t* y );
*/
#undef GENTPROT3U12
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
@@ -46,7 +40,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \
conj_t conja, \
conj_t conjx, \
dim_t m, \
dim_t n, \
dim_t b_n, \
void* alpha, \
void* a, inc_t inca, inc_t lda, \
void* x, inc_t incx, \

View File

@@ -45,7 +45,7 @@ void PASTEMAC(ch,opname)( \
conj_t conjxt, \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
dim_t m, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
@@ -56,7 +56,7 @@ void PASTEMAC(ch,opname)( \
PASTEMAC3(ch,ch,ch,varname)( conjxt, \
conjx, \
conjy, \
n, \
m, \
alpha, \
x, incx, \
y, incy, \
@@ -77,8 +77,8 @@ void PASTEMAC3(chx,chy,chz,opname)( \
conj_t conjxt, \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype_xy* alpha, \
dim_t m, \
ctype_x* alpha, \
ctype_x* x, inc_t incx, \
ctype_y* y, inc_t incy, \
ctype_xy* rho, \
@@ -88,7 +88,7 @@ void PASTEMAC3(chx,chy,chz,opname)( \
PASTEMAC3(chx,chy,chz,varname)( conjxt, \
conjx, \
conjy, \
n, \
m, \
alpha, \
x, incx, \
y, incy, \

View File

@@ -45,7 +45,7 @@ void PASTEMAC(ch,opname)( \
conj_t conjxt, \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
dim_t m, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
@@ -66,8 +66,8 @@ void PASTEMAC3(chx,chy,chz,opname)( \
conj_t conjxt, \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype_xy* alpha, \
dim_t m, \
ctype_x* alpha, \
ctype_x* x, inc_t incx, \
ctype_y* y, inc_t incy, \
ctype_xy* rho, \

View File

@@ -36,13 +36,13 @@
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, opname, varname ) \
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, dotxvker, axpyvker ) \
\
void PASTEMAC3(chx,chy,chz,varname)( \
conj_t conjxt, \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
dim_t m, \
void* alpha, \
void* x, inc_t incx, \
void* y, inc_t incy, \
@@ -52,36 +52,36 @@ void PASTEMAC3(chx,chy,chz,varname)( \
{ \
ctype_xy* one = PASTEMAC(chxy,1); \
ctype_xy* zero = PASTEMAC(chxy,0); \
ctype_xy* alpha_cast = alpha; \
ctype_x* alpha_cast = alpha; \
ctype_x* x_cast = x; \
ctype_y* y_cast = y; \
ctype_xy* rho_cast = rho; \
ctype_z* z_cast = z; \
\
PASTEMAC3(chx,chy,chxy,dotxv)( conjxt, \
conjy, \
n, \
one, \
x_cast, incx, \
y_cast, incy, \
zero, \
rho_cast ); \
PASTEMAC3(chxy,chx,chz,axpyv)( conjx, \
n, \
alpha_cast, \
x_cast, incx, \
z_cast, incz ); \
PASTEMAC3(chx,chy,chxy,dotxvker)( conjxt, \
conjy, \
m, \
one, \
x_cast, incx, \
y_cast, incy, \
zero, \
rho_cast ); \
PASTEMAC3(chx,chx,chz,axpyvker)( conjx, \
m, \
alpha_cast, \
x_cast, incx, \
z_cast, incz ); \
}
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
INSERT_GENTFUNC3U12_BASIC( dotaxpyv, dotaxpyv_unb_var1 )
INSERT_GENTFUNC3U12_BASIC2( dotaxpyv_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotaxpyv, dotaxpyv_unb_var1 )
INSERT_GENTFUNC3U12_MIX_D2( dotaxpyv_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotaxpyv, dotaxpyv_unb_var1 )
INSERT_GENTFUNC3U12_MIX_P2( dotaxpyv_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
#endif

View File

@@ -40,7 +40,7 @@ void PASTEMAC3(chx,chy,chz,varname)( \
conj_t conjxt, \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
dim_t m, \
void* alpha, \
void* x, inc_t incx, \
void* y, inc_t incy, \

View File

@@ -35,24 +35,6 @@
#include "bli_dotxaxpyf_unb_var1.h"
//
// Define fusing factors (if they are not already defined by the user
// in bli_kernel.h).
//
#ifndef bli_sdotxaxpyf_fuse_fac
#define bli_sdotxaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_S
#endif
#ifndef bli_ddotxaxpyf_fuse_fac
#define bli_ddotxaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_D
#endif
#ifndef bli_cdotxaxpyf_fuse_fac
#define bli_cdotxaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_C
#endif
#ifndef bli_zdotxaxpyf_fuse_fac
#define bli_zdotxaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_Z
#endif
//
// Prototype BLAS-like interfaces with homogeneous-typed operands.
//

View File

@@ -36,7 +36,7 @@
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, opname, varname ) \
#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname, dotxvker, axpyvker ) \
\
void PASTEMAC3(cha,chb,chc,varname)( \
conj_t conjat, \
@@ -107,13 +107,13 @@ void PASTEMAC3(cha,chb,chc,varname)( \
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
INSERT_GENTFUNC3U12_BASIC( dotxaxpyf, dotxaxpyf_unb_var1 )
INSERT_GENTFUNC3U12_BASIC2( dotxaxpyf_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotxaxpyf, dotxaxpyf_unb_var1 )
INSERT_GENTFUNC3U12_MIX_D2( dotxaxpyf_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotxaxpyf, dotxaxpyf_unb_var1 )
INSERT_GENTFUNC3U12_MIX_P2( dotxaxpyf_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
#endif

View File

@@ -42,26 +42,26 @@
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,opname)( \
conj_t conjat, \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
dim_t n, \
dim_t b_n, \
ctype* alpha, \
ctype* x, inc_t incx, inc_t ldx, \
ctype* y, inc_t incy, \
ctype* a, inc_t inca, inc_t lda, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* r, inc_t incr \
ctype* y, inc_t incy \
) \
{ \
PASTEMAC3(ch,ch,ch,varname)( conjx, \
conjy, \
PASTEMAC3(ch,ch,ch,varname)( conjat, \
conjx, \
m, \
n, \
b_n, \
alpha, \
x, incx, ldx, \
y, incy, \
a, inca, lda, \
x, incx, \
beta, \
r, incr ); \
y, incy ); \
}
INSERT_GENTFUNC_BASIC( dotxf, DOTXF_KERNEL )
@@ -71,29 +71,29 @@ INSERT_GENTFUNC_BASIC( dotxf, DOTXF_KERNEL )
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname, varname ) \
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, opname, varname ) \
\
void PASTEMAC3(chx,chy,chr,opname)( \
void PASTEMAC3(cha,chx,chy,opname)( \
conj_t conjat, \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
dim_t n, \
ctype_xy* alpha, \
ctype_x* x, inc_t incx, inc_t ldx, \
ctype_y* y, inc_t incy, \
ctype_r* beta, \
ctype_r* r, inc_t incr \
dim_t b_n, \
ctype_ax* alpha, \
ctype_a* a, inc_t inca, inc_t lda, \
ctype_x* x, inc_t incx, \
ctype_y* beta, \
ctype_y* y, inc_t incy \
) \
{ \
PASTEMAC3(chx,chy,chr,varname)( conjx, \
conjy, \
PASTEMAC3(cha,chx,chy,varname)( conjat, \
conjx, \
m, \
n, \
b_n, \
alpha, \
x, incx, ldx, \
y, incy, \
a, inca, lda, \
x, incx, \
beta, \
r, incr ); \
y, incy ); \
}
// Define the basic set of functions unconditionally, and then also some

View File

@@ -35,24 +35,6 @@
#include "bli_dotxf_unb_var1.h"
//
// Define fusing factors (if they are not already defined by the user
// in bli_kernel.h).
//
#ifndef bli_sdotxf_fuse_fac
#define bli_sdotxf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_S
#endif
#ifndef bli_ddotxf_fuse_fac
#define bli_ddotxf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_D
#endif
#ifndef bli_cdotxf_fuse_fac
#define bli_cdotxf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_C
#endif
#ifndef bli_zdotxf_fuse_fac
#define bli_zdotxf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_Z
#endif
//
// Prototype BLAS-like interfaces with homogeneous-typed operands.
//
@@ -60,15 +42,15 @@
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname)( \
conj_t conjat, \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
dim_t n, \
dim_t b_n, \
ctype* alpha, \
ctype* x, inc_t incx, inc_t ldx, \
ctype* y, inc_t incy, \
ctype* a, inc_t inca, inc_t lda, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* r, inc_t incr \
ctype* y, inc_t incy \
);
INSERT_GENTPROT_BASIC( dotxf )
@@ -78,18 +60,18 @@ INSERT_GENTPROT_BASIC( dotxf )
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//
#undef GENTPROT3U12
#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname ) \
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, opname ) \
\
void PASTEMAC3(chx,chy,chr,opname)( \
void PASTEMAC3(cha,chx,chy,opname)( \
conj_t conjat, \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
dim_t n, \
ctype_xy* alpha, \
ctype_x* x, inc_t incx, inc_t ldx, \
ctype_y* y, inc_t incy, \
ctype_r* beta, \
ctype_r* r, inc_t incr \
dim_t b_n, \
ctype_ax* alpha, \
ctype_a* a, inc_t inca, inc_t lda, \
ctype_x* x, inc_t incx, \
ctype_y* beta, \
ctype_y* y, inc_t incy \
);

View File

@@ -34,139 +34,58 @@
#include "blis.h"
/*
#define FUNCPTR_T dotxf_fp
typedef void (*FUNCPTR_T)(
conj_t conjx,
conj_t conjy,
dim_t n,
void* alpha,
void* x, inc_t incx,
void* y, inc_t incy,
void* beta,
void* rho
);
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxf_unb_var1);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxf_unb_var1);
#else
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxf_unb_var1);
#endif
#endif
void bli_dotxf_unb_var1( obj_t* alpha,
obj_t* x,
obj_t* y,
obj_t* beta,
obj_t* rho )
{
num_t dt_x = bli_obj_datatype( *x );
num_t dt_y = bli_obj_datatype( *y );
num_t dt_rho = bli_obj_datatype( *rho );
conj_t conjx = bli_obj_conj_status( *x );
conj_t conjy = bli_obj_conj_status( *y );
dim_t n = bli_obj_vector_dim( *x );
inc_t inc_x = bli_obj_vector_inc( *x );
void* buf_x = bli_obj_buffer_at_off( *x );
inc_t inc_y = bli_obj_vector_inc( *y );
void* buf_y = bli_obj_buffer_at_off( *y );
void* buf_rho = bli_obj_buffer_at_off( *rho );
num_t dt_alpha;
void* buf_alpha;
num_t dt_beta;
void* buf_beta;
FUNCPTR_T f;
// The datatype of alpha MUST be the type union of x and y. This is to
// prevent any unnecessary loss of information during computation.
dt_alpha = bli_datatype_union( dt_x, dt_y );
buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
// The datatype of beta MUST be the same as the datatype of rho.
dt_beta = dt_rho;
buf_beta = bli_obj_scalar_buffer( dt_beta, *beta );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_x][dt_y][dt_rho];
// Invoke the function.
f( conjx,
conjy,
n,
buf_alpha,
buf_x, inc_x,
buf_y, inc_y,
buf_beta,
buf_rho );
}
*/
// NOTE(review): the lines below appear to be a rendered diff of the
// GENTFUNC3U12 template for dotxf_unb_var1. An older variant (operands
// x/y/r, dimensions b_m/n, a direct dotxv call) is interleaved line by line
// with a newer variant (operands a/x/y, dimensions m/b_n, a call through
// kername). Confirm against the real file before compiling this text.
//
// Newer variant, as visible here: the void* arguments are cast to typed
// pointers; then, for each i in [0, b_n), a1 is set to a_cast + i*lda and
// psi1 to y_cast + i*incy, and the kernel named by kername is invoked with
// ( conjat, conjx, m, alpha_cast, a1, inca, x1, incx, beta_cast, psi1 ).
// x1 always points at the first element of x. The exact update applied to
// *psi1 is defined by that kernel, which is not shown here.
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname, varname ) \
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
\
void PASTEMAC3(chx,chy,chr,varname)( \
void PASTEMAC3(cha,chx,chy,varname)( \
conj_t conjat, \
conj_t conjx, \
conj_t conjy, \
dim_t b_m, \
dim_t n, \
dim_t m, \
dim_t b_n, \
void* alpha, \
void* x, inc_t incx, inc_t ldx, \
void* y, inc_t incy, \
void* a, inc_t inca, inc_t lda, \
void* x, inc_t incx, \
void* beta, \
void* r, inc_t incr \
void* y, inc_t incy \
) \
{ \
ctype_xy* alpha_cast = alpha; \
ctype_ax* alpha_cast = alpha; \
ctype_a* a_cast = a; \
ctype_x* x_cast = x; \
ctype_y* beta_cast = beta; \
ctype_y* y_cast = y; \
ctype_r* beta_cast = beta; \
ctype_r* r_cast = r; \
ctype_a* a1; \
ctype_x* x1; \
ctype_y* y1; \
ctype_r* rho1; \
ctype_y* psi1; \
dim_t i; \
\
for ( i = 0; i < b_m; ++i ) \
for ( i = 0; i < b_n; ++i ) \
{ \
x1 = x_cast + (0 )*incx + (i )*ldx; \
y1 = y_cast + (0 )*incy; \
rho1 = r_cast + (i )*incr; \
a1 = a_cast + (0 )*inca + (i )*lda; \
x1 = x_cast + (0 )*incx; \
psi1 = y_cast + (i )*incy; \
\
PASTEMAC3(chx,chy,chr,dotxv)( conjx, \
conjy, \
n, \
alpha_cast, \
x1, incx, \
y1, incy, \
beta_cast, \
rho1 ); \
PASTEMAC3(cha,chx,chy,kername)( conjat, \
conjx, \
m, \
alpha_cast, \
a1, inca, \
x1, incx, \
beta_cast, \
psi1 ); \
} \
}
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
// NOTE(review): each INSERT_* macro appears twice below, once with the older
// argument pair ( dotxf, dotxf_unb_var1 ) and once with the newer pair
// ( dotxf_unb_var1, DOTXV_KERNEL ); presumably only the newer one belongs in
// the final file.
INSERT_GENTFUNC3U12_BASIC( dotxf, dotxf_unb_var1 )
INSERT_GENTFUNC3U12_BASIC( dotxf_unb_var1, DOTXV_KERNEL )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotxf, dotxf_unb_var1 )
INSERT_GENTFUNC3U12_MIX_D( dotxf_unb_var1, DOTXV_KERNEL )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotxf, dotxf_unb_var1 )
INSERT_GENTFUNC3U12_MIX_P( dotxf_unb_var1, DOTXV_KERNEL )
#endif

View File

@@ -32,26 +32,20 @@
*/
void bli_dotxf_unb_var1( obj_t* alpha,
obj_t* x,
obj_t* y,
obj_t* beta,
obj_t* rho );
// Prototype template for the typed dotxf_unb_var1 functions.
// NOTE(review): this appears to be a rendered diff. The older signature
// ( conjx, conjy, m, n, alpha, x/incx/ldx, y/incy, beta, r/incr ) is
// interleaved with the newer one
// ( conjat, conjx, m, b_n, alpha, a/inca/lda, x/incx, beta, y/incy ).
// The newer name line pastes cha,chx,chy, matching the newer parameter list.
#undef GENTPROT3U12
#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
\
void PASTEMAC3(chx,chy,chr,varname)( \
void PASTEMAC3(cha,chx,chy,varname)( \
conj_t conjat, \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
dim_t n, \
dim_t b_n, \
void* alpha, \
void* x, inc_t incx, inc_t ldx, \
void* y, inc_t incy, \
void* a, inc_t inca, inc_t lda, \
void* x, inc_t incx, \
void* beta, \
void* r, inc_t incr \
void* y, inc_t incy \
);
INSERT_GENTPROT3U12_BASIC( dotxf_unb_var1 )

View File

@@ -159,8 +159,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
\
conja = bli_extract_conj( transa ); \
\
/* Query the fusing factor from the dotxf implementation. */ \
b_fuse = PASTEMAC(chax,dotxf_fuse_fac); \
/* Query the fusing factor for the dotxf implementation. */ \
b_fuse = PASTEMAC(chax,dotxf_fusefac); \
\
for ( i = 0; i < n_iter; i += f ) \
{ \
@@ -173,8 +173,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
/* y1 = beta * y1 + alpha * A1 * x; */ \
PASTEMAC3(cha,chx,chy,kername)( conja, \
conjx, \
f, \
n_elem, \
f, \
alpha_cast, \
A1, cs_at, rs_at, \
x1, incx, \

View File

@@ -177,8 +177,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
y_cast, incy ); \
} \
\
/* Query the fusing factor from the axpyf implementation. */ \
b_fuse = PASTEMAC(chax,axpyf_fuse_fac); \
/* Query the fusing factor for the axpyf implementation. */ \
b_fuse = PASTEMAC(chax,axpyf_fusefac); \
\
for ( i = 0; i < n_iter; i += f ) \
{ \

View File

@@ -210,8 +210,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
y_cast, incy ); \
} \
\
/* Query the fusing factor from the dotxaxpyf implementation. */ \
b_fuse = PASTEMAC(chax,dotxaxpyf_fuse_fac); \
/* Query the fusing factor for the dotxaxpyf implementation. */ \
b_fuse = PASTEMAC(chax,dotxaxpyf_fusefac); \
\
for ( i = 0; i < m; i += f ) \
{ \

View File

@@ -228,8 +228,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
y_cast, incy ); \
} \
\
/* Query the fusing factor from the dotxaxpyf implementation. */ \
b_fuse = PASTEMAC(chax,dotxaxpyf_fuse_fac); \
/* Query the fusing factor for the dotxaxpyf implementation. */ \
b_fuse = PASTEMAC(chax,dotxaxpyf_fusefac); \
\
for ( i = 0; i < m; i += f ) \
{ \

View File

@@ -160,8 +160,8 @@ void PASTEMAC2(cha,chx,varname)( \
\
conja = bli_extract_conj( trans ); \
\
/* Query the fusing factor from the dotxf implementation. */ \
b_fuse = PASTEMAC(chax,dotxf_fuse_fac); \
/* Query the fusing factor for the dotxf implementation. */ \
b_fuse = PASTEMAC(chax,dotxf_fusefac); \
\
/* We reduce all of the possible cases down to just lower/upper. */ \
if ( bli_is_upper( uplo_trans ) ) \
@@ -208,15 +208,15 @@ void PASTEMAC2(cha,chx,varname)( \
} \
\
/* x1 = x1 + alpha * A12 * x2; */ \
PASTEMAC3(cha,chx,chx,dotxf)( conja, \
BLIS_NO_CONJUGATE, \
f, \
n_ahead, \
alpha_cast, \
A12, cs_at, rs_at, \
x2, incx, \
one, \
x1, incx ); \
PASTEMAC3(cha,chx,chx,kername)( conja, \
BLIS_NO_CONJUGATE, \
n_ahead, \
f, \
alpha_cast, \
A12, cs_at, rs_at, \
x2, incx, \
one, \
x1, incx ); \
} \
} \
else /* if ( bli_is_lower( uplo_trans ) ) */ \
@@ -265,8 +265,8 @@ void PASTEMAC2(cha,chx,varname)( \
/* x1 = x1 + alpha * A10 * x0; */ \
PASTEMAC3(cha,chx,chx,kername)( conja, \
BLIS_NO_CONJUGATE, \
f, \
n_ahead, \
f, \
alpha_cast, \
A10, cs_at, rs_at, \
x0, incx, \

View File

@@ -159,8 +159,8 @@ void PASTEMAC2(cha,chx,varname)( \
\
conja = bli_extract_conj( trans ); \
\
/* Query the fusing factor from the axpyf implementation. */ \
b_fuse = PASTEMAC(chax,axpyf_fuse_fac); \
/* Query the fusing factor for the axpyf implementation. */ \
b_fuse = PASTEMAC(chax,axpyf_fusefac); \
\
/* We reduce all of the possible cases down to just lower/upper. */ \
if ( bli_is_upper( uplo_trans ) ) \
@@ -176,14 +176,14 @@ void PASTEMAC2(cha,chx,varname)( \
x0 = x_cast + (0 )*incx; \
\
/* x0 = x0 + alpha * A01 * x1; */ \
PASTEMAC3(cha,chx,chx,axpyf)( conja, \
BLIS_NO_CONJUGATE, \
n_behind, \
f, \
alpha_cast, \
A01, rs_at, cs_at, \
x1, incx, \
x0, incx ); \
PASTEMAC3(cha,chx,chx,kername)( conja, \
BLIS_NO_CONJUGATE, \
n_behind, \
f, \
alpha_cast, \
A01, rs_at, cs_at, \
x1, incx, \
x0, incx ); \
\
/* x1 = alpha * A11 * x1; */ \
for ( k = 0; k < f; ++k ) \

View File

@@ -161,8 +161,8 @@ void PASTEMAC2(cha,chx,varname)( \
\
conja = bli_extract_conj( trans ); \
\
/* Query the fusing factor from the dotxf implementation. */ \
b_fuse = PASTEMAC(chax,dotxf_fuse_fac); \
/* Query the fusing factor for the dotxf implementation. */ \
b_fuse = PASTEMAC(chax,dotxf_fusefac); \
\
/* x = alpha * x; */ \
PASTEMAC2(chax,chx,scalv)( BLIS_NO_CONJUGATE, \
@@ -186,8 +186,8 @@ void PASTEMAC2(cha,chx,varname)( \
/* x1 = x1 - A12 * x2; */ \
PASTEMAC3(cha,chx,chx,kername)( conja, \
BLIS_NO_CONJUGATE, \
f, \
n_behind, \
f, \
minus_one, \
A12, cs_at, rs_at, \
x2, incx, \
@@ -242,8 +242,8 @@ void PASTEMAC2(cha,chx,varname)( \
/* x1 = x1 - A10 * x0; */ \
PASTEMAC3(cha,chx,chx,kername)( conja, \
BLIS_NO_CONJUGATE, \
f, \
n_behind, \
f, \
minus_one, \
A10, cs_at, rs_at, \
x0, incx, \

View File

@@ -160,8 +160,8 @@ void PASTEMAC2(cha,chx,varname)( \
\
conja = bli_extract_conj( trans ); \
\
/* Query the fusing factor from the axpyf implementation. */ \
b_fuse = PASTEMAC(chax,axpyf_fuse_fac); \
/* Query the fusing factor for the axpyf implementation. */ \
b_fuse = PASTEMAC(chax,axpyf_fusefac); \
\
/* x = alpha * x; */ \
PASTEMAC2(chax,chx,scalv)( BLIS_NO_CONJUGATE, \

View File

@@ -59,38 +59,39 @@ void PASTEMAC(ch,varname)( \
const inc_t rs_ab = 1; \
const inc_t cs_ab = PASTEMAC(ch,mr); \
\
dim_t k0, j0, i0; \
dim_t l, j, i; \
\
ctype ab[ PASTEMAC(ch,mr) * \
PASTEMAC(ch,nr) ]; \
ctype* restrict ab00; \
ctype a0; \
ctype b0; \
ctype* restrict abij; \
ctype ai; \
ctype bj; \
\
\
/* Initialize the accumulator elements in ab to zero. */ \
for ( i0 = 0; i0 < m * n; ++i0 ) \
for ( i = 0; i < m * n; ++i ) \
{ \
PASTEMAC(ch,set0s)( *(ab + i0) ); \
PASTEMAC(ch,set0s)( *(ab + i) ); \
} \
\
/* Perform a series of k rank-1 updates into ab. */ \
for ( k0 = 0; k0 < k; ++k0 ) \
for ( l = 0; l < k; ++l ) \
{ \
ab00 = ab; \
abij = ab; \
\
for ( j0 = 0; j0 < n; ++j0 ) \
/* In an optimized implementation, these two loops over MR and NR
are typically fully unrolled. */ \
for ( j = 0; j < n; ++j ) \
{ \
b0 = *(b + j0); \
bj = *(b + j); \
\
for ( i0 = 0; i0 < m; ++i0 ) \
for ( i = 0; i < m; ++i ) \
{ \
a0 = *(a + i0); \
ai = *(a + i); \
\
PASTEMAC(ch,dots)( a0, \
b0, \
*ab00 ); \
ab00 += rs_ab; \
PASTEMAC(ch,dots)( ai, bj, *abij ); \
\
abij += rs_ab; \
} \
} \
\
@@ -99,9 +100,9 @@ void PASTEMAC(ch,varname)( \
} \
\
/* Scale the result in ab by alpha. */ \
for ( i0 = 0; i0 < m * n; ++i0 ) \
for ( i = 0; i < m * n; ++i ) \
{ \
PASTEMAC(ch,scals)( *alpha, *(ab + i0) ); \
PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \
} \
\
/* If beta is zero, overwrite c with the scaled result in ab. Otherwise,

View File

@@ -41,12 +41,12 @@
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict aL, \
ctype* restrict a, \
ctype* restrict bdT, \
ctype* restrict bd, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a10, \
ctype* restrict a11, \
ctype* restrict bd01, \
ctype* restrict bd11, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
) \
@@ -56,23 +56,23 @@ void PASTEMAC(ch,varname)( \
\
ctype* restrict minus_one = PASTEMAC(ch,m1); \
\
/* b = alpha * b - aL * bdT; */ \
/* b11 = alpha * b11 - a10 * bd01; */ \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
aL, \
bdT, \
a10, \
bd01, \
alpha, \
b, rs_b, cs_b, \
b11, rs_b, cs_b, \
a_next, \
b_next ); \
\
/* b = inv(a) * b;
bd = b; (if gemm ukernel needs duplicated B)
c = b; */ \
PASTEMAC(ch,trsmukr)( a, \
b, \
bd, \
c, rs_c, cs_c ); \
/* b11 = inv(a11) * b11;
bd11 = b11; (skipped if duplication is disabled)
c11 = b11; */ \
PASTEMAC(ch,trsmukr)( a11, \
b11, \
bd11, \
c11, rs_c, cs_c ); \
}
INSERT_GENTFUNC_BASIC2( gemmtrsm_l_ref_mxn, GEMM_UKERNEL, TRSM_L_UKERNEL )

View File

@@ -42,12 +42,12 @@
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict aL, \
ctype* restrict a, \
ctype* restrict bdT, \
ctype* restrict bd, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a10, \
ctype* restrict a11, \
ctype* restrict bd01, \
ctype* restrict bd11, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);

View File

@@ -41,12 +41,12 @@
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict aR, \
ctype* restrict a, \
ctype* restrict bdB, \
ctype* restrict bd, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a12, \
ctype* restrict a11, \
ctype* restrict bd21, \
ctype* restrict bd11, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
) \
@@ -56,23 +56,23 @@ void PASTEMAC(ch,varname)( \
\
ctype* restrict minus_one = PASTEMAC(ch,m1); \
\
/* b = alpha * b - aR * bdB; */ \
/* b11 = alpha * b11 - a12 * bd21; */ \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
aR, \
bdB, \
a12, \
bd21, \
alpha, \
b, rs_b, cs_b, \
b11, rs_b, cs_b, \
a_next, \
b_next ); \
\
/* b = inv(a) * b;
bd = b; (if gemm ukernel needs duplicated B)
c = b; */ \
PASTEMAC(ch,trsmukr)( a, \
b, \
bd, \
c, rs_c, cs_c ); \
/* b11 = inv(a11) * b11;
bd11 = b11; (skipped if duplication is disabled)
c11 = b11; */ \
PASTEMAC(ch,trsmukr)( a11, \
b11, \
bd11, \
c11, rs_c, cs_c ); \
}
INSERT_GENTFUNC_BASIC2( gemmtrsm_u_ref_mxn, GEMM_UKERNEL, TRSM_U_UKERNEL )

View File

@@ -42,12 +42,12 @@
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict aR, \
ctype* restrict a, \
ctype* restrict bdB, \
ctype* restrict bd, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a12, \
ctype* restrict a11, \
ctype* restrict bd21, \
ctype* restrict bd11, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);

View File

@@ -54,7 +54,7 @@ void PASTEMAC(ch,varname)( \
const inc_t rs_b = PASTEMAC(ch,packnr); \
const inc_t cs_b = 1; \
\
dim_t iter, i, j, k; \
dim_t iter, i, j, l; \
dim_t n_behind; \
\
ctype* restrict alpha11; \
@@ -87,18 +87,20 @@ void PASTEMAC(ch,varname)( \
\
/* chi11 = chi11 - a10t * x01; */ \
PASTEMAC(ch,set0s)( rho11 ); \
for ( k = 0; k < n_behind; ++k ) \
for ( l = 0; l < n_behind; ++l ) \
{ \
alpha10 = a10t + (k )*cs_a; \
chi01 = x01 + (k )*rs_b; \
alpha10 = a10t + (l )*cs_a; \
chi01 = x01 + (l )*rs_b; \
\
PASTEMAC(ch,axpys)( *alpha10, *chi01, rho11 ); \
} \
PASTEMAC(ch,subs)( rho11, *chi11 ); \
\
/* chi11 = chi11 / alpha11; */ \
/* NOTE: 1.0/alpha11 is stored instead of alpha11, so we
need to multiply rather than divide. */ \
/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
of alpha11, so we can multiply rather than divide. We store
the inverse of alpha11 intentionally to avoid expensive
division instructions within the micro-kernel. */ \
PASTEMAC(ch,scals)( *alpha11, *chi11 ); \
\
/* Output final result to matrix C. */ \

View File

@@ -54,7 +54,7 @@ void PASTEMAC(ch,varname)( \
const inc_t rs_b = PASTEMAC(ch,packnr); \
const inc_t cs_b = 1; \
\
dim_t iter, i, j, k; \
dim_t iter, i, j, l; \
dim_t n_behind; \
\
ctype* restrict alpha11; \
@@ -87,18 +87,20 @@ void PASTEMAC(ch,varname)( \
\
/* chi11 = chi11 - a12t * x21; */ \
PASTEMAC(ch,set0s)( rho11 ); \
for ( k = 0; k < n_behind; ++k ) \
for ( l = 0; l < n_behind; ++l ) \
{ \
alpha12 = a12t + (k )*cs_a; \
chi21 = x21 + (k )*rs_b; \
alpha12 = a12t + (l )*cs_a; \
chi21 = x21 + (l )*rs_b; \
\
PASTEMAC(ch,axpys)( *alpha12, *chi21, rho11 ); \
} \
PASTEMAC(ch,subs)( rho11, *chi11 ); \
\
/* chi11 = chi11 / alpha11; */ \
/* NOTE: 1.0/alpha11 is stored instead of alpha11, so we
need to multiply rather than divide. */ \
/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
of alpha11, so we can multiply rather than divide. We store
the inverse of alpha11 intentionally to avoid expensive
division instructions within the micro-kernel. */ \
PASTEMAC(ch,scals)( *alpha11, *chi11 ); \
\
/* Output final result to matrix C. */ \

View File

@@ -248,5 +248,21 @@
#define bli_cnifac BLIS_DEFAULT_NI_FAC
#define bli_znifac BLIS_DEFAULT_NI_FAC
// Default Level-1f fusing factors.
// Each type-prefixed name below (s, d, c, z) is a plain alias for the
// matching BLIS_*_FUSE_FAC_* constant. Those constants are not defined here
// and must be visible wherever one of these names is expanded.

// -- dotxf --
#define bli_sdotxf_fusefac      BLIS_DOTXF_FUSE_FAC_S
#define bli_ddotxf_fusefac      BLIS_DOTXF_FUSE_FAC_D
#define bli_cdotxf_fusefac      BLIS_DOTXF_FUSE_FAC_C
#define bli_zdotxf_fusefac      BLIS_DOTXF_FUSE_FAC_Z

// -- axpyf --
#define bli_saxpyf_fusefac      BLIS_AXPYF_FUSE_FAC_S
#define bli_daxpyf_fusefac      BLIS_AXPYF_FUSE_FAC_D
#define bli_caxpyf_fusefac      BLIS_AXPYF_FUSE_FAC_C
#define bli_zaxpyf_fusefac      BLIS_AXPYF_FUSE_FAC_Z

// -- dotxaxpyf --
#define bli_sdotxaxpyf_fusefac  BLIS_DOTXAXPYF_FUSE_FAC_S
#define bli_ddotxaxpyf_fusefac  BLIS_DOTXAXPYF_FUSE_FAC_D
#define bli_cdotxaxpyf_fusefac  BLIS_DOTXAXPYF_FUSE_FAC_C
#define bli_zdotxaxpyf_fusefac  BLIS_DOTXAXPYF_FUSE_FAC_Z
#endif

View File

@@ -370,6 +370,18 @@
\
( rs < cs )
// Increment (stride) predicates.
// bli_has_nonunit_incN( ... ) evaluates to nonzero if at least one of its N
// increment arguments differs from 1, and to zero if all of them equal 1.
// Every argument is parenthesized so that a compound expression passed by
// the caller (e.g. a conditional or bitwise expression) is compared as a
// whole rather than being split by the precedence of != and ||.
#define bli_has_nonunit_inc1( inc1 ) \
\
	( (inc1) != 1 )
#define bli_has_nonunit_inc2( inc1, inc2 ) \
\
	( (inc1) != 1 || (inc2) != 1 )
#define bli_has_nonunit_inc3( inc1, inc2, inc3 ) \
\
	( (inc1) != 1 || (inc2) != 1 || (inc3) != 1 )
// diag offset-related

View File

@@ -34,127 +34,46 @@
#include "blis.h"
/*
#define FUNCPTR_T dotxf_fp
typedef void (*FUNCPTR_T)(
conj_t conjx,
conj_t conjy,
dim_t n,
void* alpha,
void* x, inc_t incx,
void* y, inc_t incy,
void* beta,
void* rho
);
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxf_opt_var1);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxf_opt_var1);
#else
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxf_opt_var1);
#endif
#endif
void bli_dotxf_opt_var1( obj_t* alpha,
obj_t* x,
obj_t* y,
obj_t* beta,
obj_t* rho )
{
num_t dt_x = bli_obj_datatype( *x );
num_t dt_y = bli_obj_datatype( *y );
num_t dt_rho = bli_obj_datatype( *rho );
conj_t conjx = bli_obj_conj_status( *x );
conj_t conjy = bli_obj_conj_status( *y );
dim_t n = bli_obj_vector_dim( *x );
inc_t inc_x = bli_obj_vector_inc( *x );
void* buf_x = bli_obj_buffer_at_off( *x );
inc_t inc_y = bli_obj_vector_inc( *y );
void* buf_y = bli_obj_buffer_at_off( *y );
void* buf_rho = bli_obj_buffer_at_off( *rho );
num_t dt_alpha;
void* buf_alpha;
num_t dt_beta;
void* buf_beta;
FUNCPTR_T f;
// The datatype of alpha MUST be the type union of x and y. This is to
// prevent any unnecessary loss of information during computation.
dt_alpha = bli_datatype_union( dt_x, dt_y );
buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
// The datatype of beta MUST be the same as the datatype of rho.
dt_beta = dt_rho;
buf_beta = bli_obj_scalar_buffer( dt_beta, *beta );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_x][dt_y][dt_rho];
// Invoke the function.
f( conjx,
conjy,
n,
buf_alpha,
buf_x, inc_x,
buf_y, inc_y,
buf_beta,
buf_rho );
}
*/
// Generic (non-vectorized) template for dotxf_opt_var1.
// NOTE(review): this appears to be a rendered diff in which the older
// variant (operands x/y/r, dimensions b_m/n) is interleaved with the newer
// variant (operands a/x/y, dimensions m/b_n).
//
// NOTE(review): the newer variant looks internally inconsistent. Its
// parameter list is ( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy,
// chax, opname, varname ), yet lines shared by both variants still use
// names that exist only in the older list:
//   - the function name pastes chx,chy,chr (chr is no longer a parameter);
//   - alpha_cast is declared as ctype_xy* (newer list has ctype_ax);
//   - beta_cast, y_cast and psi1 are declared as ctype_r*;
//   - a_cast and a1 are declared as ctype_x*, x_cast and x1 as ctype_y*.
// The dotxf_unb_var1 counterpart uses ctype_ax/ctype_a/ctype_x/ctype_y and
// pastes cha,chx,chy. Presumably this template is not instantiated in this
// file; confirm, and align the types and name before instantiating it.
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname, varname ) \
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, opname, varname ) \
\
void PASTEMAC3(chx,chy,chr,varname)( \
conj_t conjat, \
conj_t conjx, \
conj_t conjy, \
dim_t b_m, \
dim_t n, \
dim_t m, \
dim_t b_n, \
void* alpha, \
void* x, inc_t incx, inc_t ldx, \
void* y, inc_t incy, \
void* a, inc_t inca, inc_t lda, \
void* x, inc_t incx, \
void* beta, \
void* r, inc_t incr \
void* y, inc_t incy \
) \
{ \
ctype_xy* alpha_cast = alpha; \
ctype_x* x_cast = x; \
ctype_y* y_cast = y; \
ctype_x* a_cast = a; \
ctype_y* x_cast = x; \
ctype_r* beta_cast = beta; \
ctype_r* r_cast = r; \
ctype_x* x1; \
ctype_y* y1; \
ctype_r* rho1; \
ctype_r* y_cast = y; \
ctype_x* a1; \
ctype_y* x1; \
ctype_r* psi1; \
dim_t i; \
\
for ( i = 0; i < b_m; ++i ) \
for ( i = 0; i < b_n; ++i ) \
{ \
x1 = x_cast + (0 )*incx + (i )*ldx; \
y1 = y_cast + (0 )*incy; \
rho1 = r_cast + (i )*incr; \
a1 = a_cast + (0 )*inca + (i )*lda; \
x1 = x_cast + (0 )*incx; \
psi1 = y_cast + (i )*incy; \
\
PASTEMAC3(chx,chy,chr,dotxv)( conjx, \
conjy, \
n, \
PASTEMAC3(cha,chx,chy,dotxv)( conjat, \
conjx, \
m, \
alpha_cast, \
a1, inca, \
x1, incx, \
y1, incy, \
beta_cast, \
rho1 ); \
psi1 ); \
} \
}
@@ -184,30 +103,30 @@ typedef union
void bli_ddddotxf_opt_var1(
conj_t conjat,
conj_t conjx,
conj_t conjy,
dim_t b_m,
dim_t n,
dim_t m,
dim_t b_n,
void* alpha,
void* x, inc_t incx, inc_t ldx,
void* y, inc_t incy,
void* a, inc_t inca, inc_t lda,
void* x, inc_t incx,
void* beta,
void* r, inc_t incr
void* y, inc_t incy
)
{
double* restrict alpha_cast = alpha;
double* restrict beta_cast = beta;
double* restrict a_cast = a;
double* restrict x_cast = x;
double* restrict y_cast = y;
double* restrict r_cast = r;
dim_t i;
const dim_t n_elem_per_reg = 2;
const dim_t n_iter_unroll = 4;
dim_t n_pre;
dim_t n_run;
dim_t n_left;
dim_t m_pre;
dim_t m_run;
dim_t m_left;
double* restrict x0;
double* restrict x1;
@@ -223,76 +142,76 @@ void bli_ddddotxf_opt_var1(
bool_t use_ref = FALSE;
if ( bli_zero_dim1( b_m ) ) return;
if ( bli_zero_dim1( b_n ) ) return;
// If the vector lengths are zero, scale y by beta and return.
if ( bli_zero_dim1( n ) )
if ( bli_zero_dim1( m ) )
{
PASTEMAC2(d,d,scalv)( BLIS_NO_CONJUGATE,
b_m,
b_n,
beta_cast,
r_cast, incr );
y_cast, incy );
return;
}
n_pre = 0;
m_pre = 0;
// If there is anything that would interfere with our use of aligned
// vector loads/stores, call the reference implementation.
if ( b_m < PASTEMAC(d,dotxf_fuse_fac) )
if ( b_n < PASTEMAC(d,dotxf_fuse_fac) )
{
use_ref = TRUE;
}
else if ( incx != 1 || incy != 1 || incr != 1 )
else if ( inca != 1 || incx != 1 || incy != 1 )
{
use_ref = TRUE;
}
else if ( bli_is_unaligned_to( x, 16 ) ||
bli_is_unaligned_to( y, 16 ) ||
bli_is_unaligned_to( r, 16 ) )
else if ( bli_is_unaligned_to( a, 16 ) ||
bli_is_unaligned_to( x, 16 ) ||
bli_is_unaligned_to( y, 16 ) )
{
use_ref = TRUE;
if ( bli_is_unaligned_to( x, 16 ) &&
bli_is_unaligned_to( y, 16 ) &&
bli_is_aligned_to( r, 16 ) ) // Note: r is not affected by x and y being unaligned.
if ( bli_is_unaligned_to( a, 16 ) &&
bli_is_unaligned_to( x, 16 ) &&
bli_is_aligned_to( y, 16 ) ) // Note: y is not affected by a and x being unaligned.
{
use_ref = FALSE;
n_pre = 1;
m_pre = 1;
}
}
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
PASTEMAC3(d,d,d,dotxf_unb_var1)( conjx,
conjy,
b_m,
n,
PASTEMAC3(d,d,d,dotxf_unb_var1)( conjat,
conjx,
m,
b_n,
alpha_cast,
x_cast, incx, ldx,
y_cast, incy,
a_cast, inca, lda,
x_cast, incx,
beta_cast,
r_cast, incr );
y_cast, incy );
return;
}
n_run = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
n_left = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );
m_run = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );
x0 = x_cast;
x1 = x_cast + ldx;
x2 = x_cast + 2*ldx;
x3 = x_cast + 3*ldx;
y0 = y_cast;
x0 = a_cast;
x1 = a_cast + lda;
x2 = a_cast + 2*lda;
x3 = a_cast + 3*lda;
y0 = x_cast;
PASTEMAC(d,set0s)( rho0 );
PASTEMAC(d,set0s)( rho1 );
PASTEMAC(d,set0s)( rho2 );
PASTEMAC(d,set0s)( rho3 );
if ( n_pre == 1 )
if ( m_pre == 1 )
{
x0c = *x0;
x1c = *x1;
@@ -305,11 +224,11 @@ void bli_ddddotxf_opt_var1(
rho2 += x2c * y0c;
rho3 += x3c * y0c;
x0 += incx;
x1 += incx;
x2 += incx;
x3 += incx;
y0 += incy;
x0 += inca;
x1 += inca;
x2 += inca;
x3 += inca;
y0 += incx;
}
rho0v.v = _mm_setzero_pd();
@@ -317,7 +236,7 @@ void bli_ddddotxf_opt_var1(
rho2v.v = _mm_setzero_pd();
rho3v.v = _mm_setzero_pd();
for ( i = 0; i < n_run; ++i )
for ( i = 0; i < m_run; ++i )
{
x0v.v = _mm_load_pd( ( double* )(x0 + 0*n_elem_per_reg) );
x1v.v = _mm_load_pd( ( double* )(x1 + 0*n_elem_per_reg) );
@@ -376,9 +295,9 @@ void bli_ddddotxf_opt_var1(
rho2 += rho2v.d[0] + rho2v.d[1];
rho3 += rho3v.d[0] + rho3v.d[1];
if ( n_left > 0 )
if ( m_left > 0 )
{
for ( i = 0; i < n_left; ++i )
for ( i = 0; i < m_left; ++i )
{
x0c = *x0;
x1c = *x1;
@@ -391,23 +310,23 @@ void bli_ddddotxf_opt_var1(
rho2 += x2c * y0c;
rho3 += x3c * y0c;
x0 += incx;
x1 += incx;
x2 += incx;
x3 += incx;
y0 += incy;
x0 += inca;
x1 += inca;
x2 += inca;
x3 += inca;
y0 += incx;
}
}
/*
PASTEMAC2(d,d,scals)( *beta_cast, *(r_cast ) ); \
PASTEMAC2(d,d,scals)( *beta_cast, *(r_cast+1) ); \
PASTEMAC2(d,d,scals)( *beta_cast, *(r_cast+2) ); \
PASTEMAC2(d,d,scals)( *beta_cast, *(r_cast+3) ); \
PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast ) ); \
PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+1) ); \
PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+2) ); \
PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+3) ); \
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho1, *(r_cast ) ); \
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho2, *(r_cast+1) ); \
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho3, *(r_cast+2) ); \
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho4, *(r_cast+3) ); \
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho0, *(y_cast ) ); \
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho1, *(y_cast+1) ); \
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho2, *(y_cast+2) ); \
PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho3, *(y_cast+3) ); \
*/
rho1v.d[0] = rho0;
@@ -418,8 +337,8 @@ void bli_ddddotxf_opt_var1(
betav.v = _mm_loaddup_pd( ( double* ) beta_cast );
alphav.v = _mm_loaddup_pd( ( double* ) alpha_cast );
rho0v.v = _mm_load_pd( ( double* )(r_cast + 0*n_elem_per_reg) );
rho2v.v = _mm_load_pd( ( double* )(r_cast + 1*n_elem_per_reg) );
rho0v.v = _mm_load_pd( ( double* )(y_cast + 0*n_elem_per_reg) );
rho2v.v = _mm_load_pd( ( double* )(y_cast + 1*n_elem_per_reg) );
rho0v.v *= betav.v;
rho2v.v *= betav.v;
@@ -427,7 +346,7 @@ void bli_ddddotxf_opt_var1(
rho0v.v += alphav.v * rho1v.v;
rho2v.v += alphav.v * rho3v.v;
_mm_store_pd( ( double* )(r_cast + 0*n_elem_per_reg), rho0v.v );
_mm_store_pd( ( double* )(r_cast + 1*n_elem_per_reg), rho2v.v );
_mm_store_pd( ( double* )(y_cast + 0*n_elem_per_reg), rho0v.v );
_mm_store_pd( ( double* )(y_cast + 1*n_elem_per_reg), rho2v.v );
}

View File

@@ -32,12 +32,6 @@
*/
void bli_dotxf_opt_var1( obj_t* alpha,
obj_t* x,
obj_t* y,
obj_t* beta,
obj_t* rho );
//
// Define fusing factors for dotxf operation.
@@ -49,18 +43,18 @@ void bli_dotxf_opt_var1( obj_t* alpha,
// Prototype template for the typed dotxf_opt_var1 functions.
// NOTE(review): this appears to be a rendered diff. The older signature
// ( conjx, conjy, m, n, alpha, x/incx/ldx, y/incy, beta, r/incr ) is
// interleaved with the newer one
// ( conjat, conjx, m, b_n, alpha, a/inca/lda, x/incx, beta, y/incy ).
//
// NOTE(review): the name line below still pastes chx,chy,chr, but chr is not
// a parameter of the newer GENTPROT3U12 signature (cha was added instead).
// The dotxf_unb_var1 prototype template was switched to
// PASTEMAC3(cha,chx,chy,varname). Unless PASTEMAC3 is defined unusually, the
// prototypes generated here would not be named after the a/x/y type
// characters; confirm and change to cha,chx,chy.
#undef GENTPROT3U12
#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
\
void PASTEMAC3(chx,chy,chr,varname)( \
conj_t conjat, \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
dim_t n, \
dim_t b_n, \
void* alpha, \
void* x, inc_t incx, inc_t ldx, \
void* y, inc_t incy, \
void* a, inc_t inca, inc_t lda, \
void* x, inc_t incx, \
void* beta, \
void* r, inc_t incr \
void* y, inc_t incy \
);
INSERT_GENTPROT3U12_BASIC( dotxf_opt_var1 )

View File

@@ -3,9 +3,9 @@ c #rg # Matrix storage scheme(s) to test ('c' = col-major; 'r' = row-major;
c #rji # Vector storage scheme(s) to test ('c' = colvec/unit; 'r' = rowvec/unit; 'j' = colvec/non-unit; 'i' = rowvec/non-unit)
0 # Test all combinations of storage schemes?
32 # General stride spacing (for cases when testing general stride)
d #sdcz # Datatype(s) to test
sdcz #sdcz # Datatype(s) to test
100 # Problem size: first to test
500 # Problem size: maximum to test
300 # Problem size: maximum to test
100 # Problem size: increment between experiments
1 # Error-checking level (0 = disable error checking; 1 = full error checking)
i # Reaction to test failure ('i' = ignore; 's' = sleep() and continue; 'a' = abort)

View File

@@ -1,60 +1,60 @@
# --- Utility ------------------------------------------------------------------
0 randv (0 = disable all; 1 = specify)
1 randv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
0 randm (0 = disable all; 1 = specify)
1 randm (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -1 dimensions: m n (-1 = bind to problem size)
# --- Level-1v -----------------------------------------------------------------
0 addv (0 = disable all; 1 = specify)
1 addv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
? parameters: conjx (? = test all values)
0 axpyv (0 = disable all; 1 = specify)
1 axpyv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
? parameters: conjx (? = test all values)
0 copyv (0 = disable all; 1 = specify)
1 copyv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
? parameters: conjx (? = test all values)
0 dotv (0 = disable all; 1 = specify)
1 dotv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
?? parameters: conjx conjy (? = test all values)
0 dotxv (0 = disable all; 1 = specify)
1 dotxv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
?? parameters: conjx conjy (? = test all values)
0 fnormv (0 = disable all; 1 = specify)
1 fnormv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
0 scalv (0 = disable all; 1 = specify)
1 scalv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
? parameters: conjbeta (? = test all values)
0 scal2v (0 = disable all; 1 = specify)
1 scal2v (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
? parameters: conjx (? = test all values)
0 setv (0 = disable all; 1 = specify)
1 setv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
0 subv (0 = disable all; 1 = specify)
1 subv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
? parameters: conjx (? = test all values)
@@ -62,40 +62,40 @@
# --- Level-1m -----------------------------------------------------------------
0 addm (0 = disable all; 1 = specify)
1 addm (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -2 dimensions: m n (-1 = bind to problem size)
? parameters: transa (? = test all values)
0 axpym (0 = disable all; 1 = specify)
1 axpym (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -1 dimensions: m n (-1 = bind to problem size)
? parameters: transa (? = test all values)
0 copym (0 = disable all; 1 = specify)
1 copym (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -2 dimensions: m n (-1 = bind to problem size)
? parameters: transa (? = test all values)
0 fnormm (0 = disable all; 1 = specify)
1 fnormm (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -2 dimensions: m n (-1 = bind to problem size)
0 scalm (0 = disable all; 1 = specify)
1 scalm (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -2 dimensions: m n (-1 = bind to problem size)
? parameters: conjbeta (? = test all values)
0 scal2m (0 = disable all; 1 = specify)
1 scal2m (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -2 dimensions: m n (-1 = bind to problem size)
? parameters: transa (? = test all values)
0 setm (0 = disable all; 1 = specify)
1 setm (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -2 dimensions: m n (-1 = bind to problem size)
0 subm (0 = disable all; 1 = specify)
1 subm (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -2 dimensions: m n (-1 = bind to problem size)
? parameters: transa (? = test all values)
@@ -103,52 +103,52 @@
# --- Level-2 ------------------------------------------------------------------
0 gemv (0 = disable all; 1 = specify)
1 gemv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -2 dimensions: m n (-1 = bind to problem size)
?? parameters: transa conjx (? = test all values)
0 ger (0 = disable all; 1 = specify)
1 ger (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -2 dimensions: m n (-1 = bind to problem size)
?? parameters: conjx conjy (? = test all values)
0 hemv (0 = disable all; 1 = specify)
1 hemv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
??? parameters: uploa conja conjx (? = test all values)
0 her (0 = disable all; 1 = specify)
1 her (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
?? parameters: uploc conjx (? = test all values)
0 her2 (0 = disable all; 1 = specify)
1 her2 (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
??? parameters: uploc conjx conjy (? = test all values)
0 symv (0 = disable all; 1 = specify)
1 symv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
??? parameters: uploa conja conjx (? = test all values)
0 syr (0 = disable all; 1 = specify)
1 syr (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
?? parameters: uploc conjx (? = test all values)
0 syr2 (0 = disable all; 1 = specify)
1 syr2 (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
??? parameters: uploc conjx conjy (? = test all values)
0 trmv (0 = disable all; 1 = specify)
1 trmv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
??? parameters: uploa transa diaga (? = test all values)
0 trsv (0 = disable all; 1 = specify)
1 trsv (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 dimensions: m (-1 = bind to problem size)
??? parameters: uploa transa diaga (? = test all values)
@@ -158,37 +158,37 @@
1 gemm (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -1 -1 dimensions: m n k (-1 = bind to problem size)
-1 -1 -2 dimensions: m n k (-1 = bind to problem size)
?? parameters: transa transb (? = test all values)
1 hemm (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -1 dimensions: m n (-1 = bind to problem size)
-1 -2 dimensions: m n (-1 = bind to problem size)
???? parameters: side uploa conja transb (? = test all values)
1 herk (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -1 dimensions: m k (-1 = bind to problem size)
-1 -2 dimensions: m k (-1 = bind to problem size)
?? parameters: uploc transa (? = test all values)
1 her2k (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -1 dimensions: m k (-1 = bind to problem size)
-1 -2 dimensions: m k (-1 = bind to problem size)
??? parameters: uploc transa transb (? = test all values)
1 symm (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -1 dimensions: m n (-1 = bind to problem size)
-1 -2 dimensions: m n (-1 = bind to problem size)
???? parameters: side uploa conja transb (? = test all values)
1 syrk (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -1 dimensions: m k (-1 = bind to problem size)
-1 -2 dimensions: m k (-1 = bind to problem size)
?? parameters: uploc transa (? = test all values)
1 syr2k (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -1 dimensions: m k (-1 = bind to problem size)
-1 -2 dimensions: m k (-1 = bind to problem size)
??? parameters: uploc transa transb (? = test all values)
1 trmm (0 = disable all; 1 = specify)

View File

@@ -159,7 +159,7 @@
1 gemm (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -1 -2 dimensions: m n k (-1 = bind to problem size)
nn parameters: transa transb (? = test all values)
?? parameters: transa transb (? = test all values)
1 hemm (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
@@ -169,7 +169,7 @@ nn parameters: transa transb (? = test all values)
1 herk (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)
-1 -2 dimensions: m k (-1 = bind to problem size)
ln parameters: uploc transa (? = test all values)
?? parameters: uploc transa (? = test all values)
1 her2k (0 = disable all; 1 = specify)
1 test sequential front-end (0 = disable; 1 = enable)

View File

@@ -533,7 +533,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
BLIS_EXTEND_NC_C,
BLIS_EXTEND_NC_Z );
libblis_test_fprintf_c( os, "\n" );
libblis_test_fprintf_c( os, "level-3 register blocksizes \n" );
libblis_test_fprintf_c( os, "level-3 register blocksizes s d c z \n" );
libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n",
BLIS_DEFAULT_MR_S,
BLIS_DEFAULT_MR_D,
@@ -566,7 +566,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
BLIS_DEFAULT_NI_Z );
*/
libblis_test_fprintf_c( os, "\n" );
libblis_test_fprintf_c( os, "level-3 packing duplication \n" );
libblis_test_fprintf_c( os, "level-3 packing duplication s d c z \n" );
libblis_test_fprintf_c( os, " dupl. factors for B %5u %5u %5u %5u\n",
BLIS_DEFAULT_NUM_DUPL_S,
BLIS_DEFAULT_NUM_DUPL_D,
@@ -578,7 +578,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
BLIS_NUM_ELEM_PER_REG_C,
BLIS_NUM_ELEM_PER_REG_Z );
libblis_test_fprintf_c( os, "\n" );
libblis_test_fprintf_c( os, "level-2 cache blocksizes \n" );
libblis_test_fprintf_c( os, "level-2 cache blocksizes s d c z \n" );
libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n",
BLIS_DEFAULT_L2_MC_S,
BLIS_DEFAULT_L2_MC_D,
@@ -590,11 +590,27 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
BLIS_DEFAULT_L2_NC_C,
BLIS_DEFAULT_L2_NC_Z );
libblis_test_fprintf_c( os, "\n" );
libblis_test_fprintf_c( os, "level-1f fusing factors %5u %5u %5u %5u\n",
BLIS_DEFAULT_FUSING_FACTOR_S,
BLIS_DEFAULT_FUSING_FACTOR_D,
BLIS_DEFAULT_FUSING_FACTOR_C,
BLIS_DEFAULT_FUSING_FACTOR_Z );
libblis_test_fprintf_c( os, "level-1f fusing factors s d c z \n" );
libblis_test_fprintf_c( os, " default %5u %5u %5u %5u\n",
BLIS_DEFAULT_FUSE_FAC_S,
BLIS_DEFAULT_FUSE_FAC_D,
BLIS_DEFAULT_FUSE_FAC_C,
BLIS_DEFAULT_FUSE_FAC_Z );
libblis_test_fprintf_c( os, " axpyf %5u %5u %5u %5u\n",
BLIS_AXPYF_FUSE_FAC_S,
BLIS_AXPYF_FUSE_FAC_D,
BLIS_AXPYF_FUSE_FAC_C,
BLIS_AXPYF_FUSE_FAC_Z );
libblis_test_fprintf_c( os, " dotxf %5u %5u %5u %5u\n",
BLIS_DOTXF_FUSE_FAC_S,
BLIS_DOTXF_FUSE_FAC_D,
BLIS_DOTXF_FUSE_FAC_C,
BLIS_DOTXF_FUSE_FAC_Z );
libblis_test_fprintf_c( os, " dotxaxpyf %5u %5u %5u %5u\n",
BLIS_DOTXAXPYF_FUSE_FAC_S,
BLIS_DOTXAXPYF_FUSE_FAC_D,
BLIS_DOTXAXPYF_FUSE_FAC_C,
BLIS_DOTXAXPYF_FUSE_FAC_Z );
libblis_test_fprintf_c( os, "\n" );
libblis_test_fprintf( os, "\n" );