From 7118f87e18b4941423472afc00215c1d1f2a1fcd Mon Sep 17 00:00:00 2001 From: Tony Kelman Date: Sat, 14 Jun 2014 06:53:20 -0700 Subject: [PATCH 01/19] Use _aligned_malloc instead of posix_memalign on Windows --- frame/base/bli_malloc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/frame/base/bli_malloc.c b/frame/base/bli_malloc.c index 3f3e8e804..1b9104b9d 100644 --- a/frame/base/bli_malloc.c +++ b/frame/base/bli_malloc.c @@ -43,6 +43,9 @@ void* bli_malloc( siz_t size ) #if BLIS_HEAP_ADDR_ALIGN_SIZE == 1 p = malloc( ( size_t )size ); +#elif defined(_WIN32) + p = _aligned_malloc( ( size_t )size, + ( size_t )BLIS_HEAP_ADDR_ALIGN_SIZE ); #else r_val = posix_memalign( &p, ( size_t )BLIS_HEAP_ADDR_ALIGN_SIZE, @@ -58,6 +61,10 @@ void* bli_malloc( siz_t size ) void bli_free( void* p ) { +#if BLIS_HEAP_ADDR_ALIGN_SIZE == 1 || !defined(_WIN32) free( p ); +#else + _aligned_free( p ); +#endif } From f064711a5e6fb3852c17c7520909b09dc27665f2 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Sun, 15 Jun 2014 06:27:37 -0400 Subject: [PATCH 02/19] SGEMM and DGEMM kernels for PNaCl --- .gitignore | 2 + config/pnacl/bli_config.h | 165 ++++++++++++++++ config/pnacl/bli_kernel.h | 232 ++++++++++++++++++++++ config/pnacl/kernels | 1 + config/pnacl/make_defs.mk | 117 +++++++++++ kernels/nacl/pnacl/3/bli_gemm_opt.c | 294 ++++++++++++++++++++++++++++ testsuite/Makefile | 79 +++++++- 7 files changed, 889 insertions(+), 1 deletion(-) create mode 100644 config/pnacl/bli_config.h create mode 100644 config/pnacl/bli_kernel.h create mode 120000 config/pnacl/kernels create mode 100644 config/pnacl/make_defs.mk create mode 100644 kernels/nacl/pnacl/3/bli_gemm_opt.c diff --git a/.gitignore b/.gitignore index 94936ab08..8accdfa50 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,8 @@ *.a # test executables *.x +*.pexe +*.nexe # -- build system files -- diff --git a/config/pnacl/bli_config.h b/config/pnacl/bli_config.h new file mode 100644 index 000000000..f6be2e573 --- /dev/null +++ b/config/pnacl/bli_config.h @@ -0,0 +1,165 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_CONFIG_H +#define BLIS_CONFIG_H + + +// -- OPERATING SYSTEM --------------------------------------------------------- + + + +// -- INTEGER PROPERTIES ------------------------------------------------------- + +// The bit size of the integer type used to track values such as dimensions, +// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed +// integers while 64 results in 64-bit integers. Any other value results in use +// of the C99 type "long int". Note that this ONLY affects integers used +// internally within BLIS as well as those exposed in the native BLAS-like BLIS +// interface. +#define BLIS_INT_TYPE_SIZE 32 + + + +// -- FLOATING-POINT PROPERTIES ------------------------------------------------ + +// Define the number of floating-point types supported, and the size of the +// largest type. +#define BLIS_NUM_FP_TYPES 4 +#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) + +// Enable use of built-in C99 "float complex" and "double complex" types and +// associated overloaded operations and functions? Disabling results in +// scomplex and dcomplex being defined in terms of simple structs. +//#define BLIS_ENABLE_C99_COMPLEX + + + +// -- MULTITHREADING ----------------------------------------------------------- + +// The maximum number of BLIS threads that will run concurrently. +#define BLIS_MAX_NUM_THREADS 1 + + + +// -- MEMORY ALLOCATION -------------------------------------------------------- + +// -- Contiguous (static) memory allocator -- + +// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the +// contiguous memory pools. +#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS +#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS +#define BLIS_NUM_MC_X_NC_BLOCKS 0 + +// The maximum preload byte offset is used to pad the end of the contiguous +// memory pools so that the micro-kernel, when computing with the end of the +// last block, can exceed the bounds of the usable portion of the memory +// region without causing a segmentation fault. +#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 + +// -- Memory alignment -- + +// It is sometimes useful to define the various memory alignments in terms +// of some other characteristics of the system, such as the cache line size +// and the page size. +#define BLIS_CACHE_LINE_SIZE 64 +#define BLIS_PAGE_SIZE 4096 + +// Alignment size needed by the instruction set for aligned SIMD/vector +// instructions. +#define BLIS_SIMD_ALIGN_SIZE 16 + +// Alignment size used to align local stack buffers within macro-kernel +// functions. +#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE + +// Alignment size used when allocating memory dynamically from the operating +// system (eg: posix_memalign()). To disable heap alignment and just use +// malloc() instead, set this to 1. +#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE + +// Alignment size used when sizing leading dimensions of dynamically +// allocated memory. +#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE + +// Alignment size used when allocating entire blocks of contiguous memory +// from the contiguous memory allocator. +#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE + + + +// -- MIXED DATATYPE SUPPORT --------------------------------------------------- + +// Basic (homogeneous) datatype support always enabled. + +// Enable mixed domain operations? +//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT + +// Enable extra mixed precision operations? +//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT + + + +// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- + +// Stay initialized after auto-initialization, unless and until the user +// explicitly calls bli_finalize(). +#define BLIS_ENABLE_STAY_AUTO_INITIALIZED + + + +// -- BLAS-to-BLIS COMPATIBILITY LAYER ----------------------------------------- + +// Enable the BLAS compatibility layer? +#define BLIS_ENABLE_BLAS2BLIS + +// The bit size of the integer type used to track values such as dimensions and +// leading dimensions (ie: column strides) within the BLAS compatibility layer. +// A value of 32 results in the compatibility layer using 32-bit signed integers +// while 64 results in 64-bit integers. Any other value results in use of the +// C99 type "long int". Note that this ONLY affects integers used within the +// BLAS compatibility layer. +#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 + +// Fortran-77 name-mangling macros. +#define PASTEF770(name) name ## _ +#define PASTEF77(ch1,name) ch1 ## name ## _ +#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ + + + + +#endif + diff --git a/config/pnacl/bli_kernel.h b/config/pnacl/bli_kernel.h new file mode 100644 index 000000000..d1d29185c --- /dev/null +++ b/config/pnacl/bli_kernel.h @@ -0,0 +1,232 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_KERNEL_H +#define BLIS_KERNEL_H + +/* + * SIMD-enabled (SP only) PNaCl shipped in Chrome 36 and it is not backward-compatible. + * Therefore, if compilation targets an older Chrome release, we use scalar kernels. + * The target Chrome version is indicated by PPAPI_MACRO defined in the header below. + */ +#include + +// -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- + +// -- Cache blocksizes -- + +// +// Constraints: +// +// (1) MC must be a multiple of: +// (a) MR (for zero-padding purposes) +// (b) NR (for zero-padding purposes when MR and NR are "swapped") +// (2) NC must be a multiple of +// (a) NR (for zero-padding purposes) +// (b) MR (for zero-padding purposes when MR and NR are "swapped") +// (3) KC must be a multiple of +// (a) MR and +// (b) NR (for triangular operations such as trmm and trsm). +// + +#if PPAPI_RELEASE >= 36 +#define BLIS_DEFAULT_MC_S 256 +#define BLIS_DEFAULT_KC_S 256 +#define BLIS_DEFAULT_NC_S 8192 +#else +#define BLIS_DEFAULT_MC_S 252 +#define BLIS_DEFAULT_KC_S 264 +#define BLIS_DEFAULT_NC_S 8196 +#endif + +#define BLIS_DEFAULT_MC_D 1080 +#define BLIS_DEFAULT_KC_D 120 +#define BLIS_DEFAULT_NC_D 8400 + +#define BLIS_DEFAULT_MC_C 128 +#define BLIS_DEFAULT_KC_C 256 +#define BLIS_DEFAULT_NC_C 4096 + +#define BLIS_DEFAULT_MC_Z 64 +#define BLIS_DEFAULT_KC_Z 256 +#define BLIS_DEFAULT_NC_Z 2048 + +// -- Register blocksizes -- + +#if PPAPI_RELEASE >= 36 +#define BLIS_DEFAULT_MR_S 8 +#define BLIS_DEFAULT_NR_S 4 +#else +#define BLIS_DEFAULT_MR_S 4 +#define BLIS_DEFAULT_NR_S 3 +#endif + +/* So far, PNaCl does not support double-precision SIMD */ +#define BLIS_DEFAULT_MR_D 4 +#define BLIS_DEFAULT_NR_D 3 + +#define BLIS_DEFAULT_MR_C 4 +#define BLIS_DEFAULT_NR_C 4 + +#define BLIS_DEFAULT_MR_Z 4 +#define BLIS_DEFAULT_NR_Z 4 + +// NOTE: If the micro-kernel, which is typically unrolled to a factor +// of f, handles leftover edge cases (ie: when k % f > 0) then these +// register blocksizes in the k dimension can be defined to 1. + +//#define BLIS_DEFAULT_KR_S 1 +//#define BLIS_DEFAULT_KR_D 1 +//#define BLIS_DEFAULT_KR_C 1 +//#define BLIS_DEFAULT_KR_Z 1 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. + +//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) + +// -- Register blocksize extensions (for packed micro-panels) -- + +// NOTE: These register blocksize "extensions" determine whether the +// leading dimensions used within the packed micro-panels are equal to +// or greater than their corresponding register blocksizes above. + +//#define BLIS_EXTEND_MR_S 0 +//#define BLIS_EXTEND_NR_S 0 + +//#define BLIS_EXTEND_MR_D 0 +//#define BLIS_EXTEND_NR_D 0 + +//#define BLIS_EXTEND_MR_C 0 +//#define BLIS_EXTEND_NR_C 0 + +//#define BLIS_EXTEND_MR_Z 0 +//#define BLIS_EXTEND_NR_Z 0 + + + +// -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- + + + + +// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ + + + + +// -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- + +// -- gemm -- + +#if PPAPI_RELEASE >= 36 +#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x4 +#endif + +// -- trsm-related -- + + + + +// -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- + +// -- packm -- + +// -- unpackm -- + + + + +// -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- + +// -- axpy2v -- + +// -- dotaxpyv -- + +// -- axpyf -- + +// -- dotxf -- + +// -- dotxaxpyf -- + + + + +// -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- + +// -- addv -- + +// -- axpyv -- + +// -- copyv -- + +// -- dotv -- + +// -- dotxv -- + +// -- invertv -- + +// -- scal2v -- + +// -- scalv -- + +// -- setv -- + +// -- subv -- + +// -- swapv -- + + + +#endif + diff --git a/config/pnacl/kernels b/config/pnacl/kernels new file mode 120000 index 000000000..4fa74dbb3 --- /dev/null +++ b/config/pnacl/kernels @@ -0,0 +1 @@ +../../kernels/nacl/pnacl \ No newline at end of file diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk new file mode 100644 index 000000000..540531203 --- /dev/null +++ b/config/pnacl/make_defs.mk @@ -0,0 +1,117 @@ +#!/bin/bash +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# Only include this block of code once. +ifndef MAKE_DEFS_MK_INCLUDED +MAKE_DEFS_MK_INCLUDED := yes + + + +# +# --- Build definitions -------------------------------------------------------- +# + +# Variables corresponding to other configure-time options. +BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no +BLIS_ENABLE_STATIC_BUILD := yes +BLIS_ENABLE_DYNAMIC_BUILD := no + + + +# +# --- Utility program definitions ---------------------------------------------- +# + +SH := /bin/sh +MV := mv +MKDIR := mkdir -p +RM_F := rm -f +RM_RF := rm -rf +SYMLINK := ln -sf +FIND := find +GREP := grep +XARGS := xargs +RANLIB := pnacl-ranlib +INSTALL := install -c + +# Used to refresh CHANGELOG. +GIT := git +GIT_LOG := $(GIT) log --decorate + + + +# +# --- Development tools definitions -------------------------------------------- +# + +# --- Determine the C compiler and related flags --- +CC := pnacl-clang +# Enable IEEE Standard 1003.1-2004 (POSIX.1d). +# NOTE: This is needed to enable posix_memalign(). +CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L +CMISCFLAGS := -std=c99 -I$(NACL_SDK_ROOT)/include +CDBGFLAGS := -g +CWARNFLAGS := -Wall +COPTFLAGS := -O3 +CKOPTFLAGS := $(COPTFLAGS) -ffast-math +CVECFLAGS := + +# Aggregate all of the flags into multiple groups: one for standard +# compilation, and one for each of the supported "special" compilation +# modes. +CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) +CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) +CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) + +# --- Determine the archiver and related flags --- +AR := ar +ARFLAGS := cru + +# --- Determine the linker and related flags --- +LINKER := $(CC) +LDFLAGS := -lm + +# --- Determine the finalizer and related flags --- +FINALIZER := pnacl-finalize +FINFLAGS := + +# --- Determine the translator and related flags --- +TRANSLATOR := pnacl-translate +TRNSFLAGS := -O3 +TRNSAMD64FLAGS := -arch x86-64 +TRNSX86FLAGS := -arch i686 +TRNSARMFLAGS := -arch armv7 + +# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block +endif diff --git a/kernels/nacl/pnacl/3/bli_gemm_opt.c b/kernels/nacl/pnacl/3/bli_gemm_opt.c new file mode 100644 index 000000000..8261cf0dd --- /dev/null +++ b/kernels/nacl/pnacl/3/bli_gemm_opt.c @@ -0,0 +1,294 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#if PPAPI_RELEASE >= 36 +typedef float v4sf __attribute__ ((vector_size(16))); + +inline v4sf v4sf_splat(float x) { + return (v4sf) { x, x, x, x }; +} + +inline v4sf v4sf_load(const float* a) { + return *((const v4sf*)a); +} + +inline void v4sf_store(float* a, v4sf x) { + *((v4sf*)a) = x; +} + +inline v4sf v4sf_zero() { + return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f }; +} +#endif + +#if PPAPI_RELEASE >= 36 /* 8x4 blocks, SIMD enabled */ +void bli_sgemm_opt_8x4( + dim_t k, + float *restrict alpha, + float *restrict a, + float *restrict b, + float *restrict beta, + float *restrict c, + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) +{ + // Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0) + v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero(); + v4sf abv0b = v4sf_zero(), abv1b = v4sf_zero(), abv2b = v4sf_zero(), abv3b = v4sf_zero(); + for (dim_t i = 0; i < k; i += 1) { + const v4sf avt = v4sf_load(a); + const v4sf avb = v4sf_load(a+4); + + const v4sf bv_xxxx = v4sf_splat(b[0]); + abv0t += avt * bv_xxxx; + abv0b += avb * bv_xxxx; + + const v4sf bv_yyyy = v4sf_splat(b[1]); + abv1t += avt * bv_yyyy; + abv1b += avb * bv_yyyy; + + const v4sf bv_zzzz = v4sf_splat(b[2]); + abv2t += avt * bv_zzzz; + abv2b += avb * bv_zzzz; + + const v4sf bv_wwww = v4sf_splat(b[3]); + abv3t += avt * bv_wwww; + abv3b += avb * bv_wwww; + + a += 8; + b += 4; + } + + const v4sf alphav = v4sf_splat(*alpha); + abv0t *= alphav; + abv0b *= alphav; + abv1t *= alphav; + abv1b *= alphav; + abv2t *= alphav; + abv2b *= alphav; + abv3t *= alphav; + abv3b *= alphav; + + if (rs_c == 1) { + v4sf cv0t = v4sf_load(&c[0*rs_c + 0*cs_c]); + v4sf cv1t = v4sf_load(&c[0*rs_c + 1*cs_c]); + v4sf cv2t = v4sf_load(&c[0*rs_c + 2*cs_c]); + v4sf cv3t = v4sf_load(&c[0*rs_c + 3*cs_c]); + v4sf cv0b = v4sf_load(&c[4*rs_c + 0*cs_c]); + v4sf cv1b = v4sf_load(&c[4*rs_c + 1*cs_c]); + v4sf cv2b = v4sf_load(&c[4*rs_c + 2*cs_c]); + v4sf cv3b = v4sf_load(&c[4*rs_c + 3*cs_c]); + + const v4sf betav = v4sf_splat(*beta); + cv0t = cv0t * betav + abv0t; + cv1t = cv1t * betav + abv1t; + cv2t = cv2t * betav + abv2t; + cv3t = cv3t * betav + abv3t; + cv0b = cv0b * betav + abv0b; + cv1b = cv1b * betav + abv1b; + cv2b = cv2b * betav + abv2b; + cv3b = cv3b * betav + abv3b; + + v4sf_store(&c[0*rs_c + 0*cs_c], cv0t); + v4sf_store(&c[0*rs_c + 1*cs_c], cv1t); + v4sf_store(&c[0*rs_c + 2*cs_c], cv2t); + v4sf_store(&c[0*rs_c + 3*cs_c], cv3t); + v4sf_store(&c[4*rs_c + 0*cs_c], cv0b); + v4sf_store(&c[4*rs_c + 1*cs_c], cv1b); + v4sf_store(&c[4*rs_c + 2*cs_c], cv2b); + v4sf_store(&c[4*rs_c + 3*cs_c], cv3b); + } else { + // Load columns 0, 1, 2, 3 (top part) + v4sf cv0t = (v4sf){ c[0*rs_c + 0*cs_c], c[1*rs_c + 0*cs_c], c[2*rs_c + 0*cs_c], c[3*rs_c + 0*cs_c] }; + v4sf cv1t = (v4sf){ c[0*rs_c + 1*cs_c], c[1*rs_c + 1*cs_c], c[2*rs_c + 1*cs_c], c[3*rs_c + 1*cs_c] }; + v4sf cv2t = (v4sf){ c[0*rs_c + 2*cs_c], c[1*rs_c + 2*cs_c], c[2*rs_c + 2*cs_c], c[3*rs_c + 2*cs_c] }; + v4sf cv3t = (v4sf){ c[0*rs_c + 3*cs_c], c[1*rs_c + 3*cs_c], c[2*rs_c + 3*cs_c], c[3*rs_c + 3*cs_c] }; + // Load columns 0, 1, 2, 3 (bottom part) + v4sf cv0b = (v4sf){ c[4*rs_c + 0*cs_c], c[5*rs_c + 0*cs_c], c[6*rs_c + 0*cs_c], c[7*rs_c + 0*cs_c] }; + v4sf cv1b = (v4sf){ c[4*rs_c + 1*cs_c], c[5*rs_c + 1*cs_c], c[6*rs_c + 1*cs_c], c[7*rs_c + 1*cs_c] }; + v4sf cv2b = (v4sf){ c[4*rs_c + 2*cs_c], c[5*rs_c + 2*cs_c], c[6*rs_c + 2*cs_c], c[7*rs_c + 2*cs_c] }; + v4sf cv3b = (v4sf){ c[4*rs_c + 3*cs_c], c[5*rs_c + 3*cs_c], c[6*rs_c + 3*cs_c], c[7*rs_c + 3*cs_c] }; + + const v4sf betav = v4sf_splat(*beta); + cv0t = cv0t * betav + abv0t; + cv1t = cv1t * betav + abv1t; + cv2t = cv2t * betav + abv2t; + cv3t = cv3t * betav + abv3t; + cv0b = cv0b * betav + abv0b; + cv1b = cv1b * betav + abv1b; + cv2b = cv2b * betav + abv2b; + cv3b = cv3b * betav + abv3b; + + // Store column 0 + c[0*rs_c + 0*cs_c] = cv0t[0]; + c[1*rs_c + 0*cs_c] = cv0t[1]; + c[2*rs_c + 0*cs_c] = cv0t[2]; + c[3*rs_c + 0*cs_c] = cv0t[3]; + c[4*rs_c + 0*cs_c] = cv0b[0]; + c[5*rs_c + 0*cs_c] = cv0b[1]; + c[6*rs_c + 0*cs_c] = cv0b[2]; + c[7*rs_c + 0*cs_c] = cv0b[3]; + + // Store column 1 + c[0*rs_c + 1*cs_c] = cv1t[0]; + c[1*rs_c + 1*cs_c] = cv1t[1]; + c[2*rs_c + 1*cs_c] = cv1t[2]; + c[3*rs_c + 1*cs_c] = cv1t[3]; + c[4*rs_c + 1*cs_c] = cv1b[0]; + c[5*rs_c + 1*cs_c] = cv1b[1]; + c[6*rs_c + 1*cs_c] = cv1b[2]; + c[7*rs_c + 1*cs_c] = cv1b[3]; + + // Store column 2 + c[0*rs_c + 2*cs_c] = cv2t[0]; + c[1*rs_c + 2*cs_c] = cv2t[1]; + c[2*rs_c + 2*cs_c] = cv2t[2]; + c[3*rs_c + 2*cs_c] = cv2t[3]; + c[4*rs_c + 2*cs_c] = cv2b[0]; + c[5*rs_c + 2*cs_c] = cv2b[1]; + c[6*rs_c + 2*cs_c] = cv2b[2]; + c[7*rs_c + 2*cs_c] = cv2b[3]; + + // Store column 3 + c[0*rs_c + 3*cs_c] = cv3t[0]; + c[1*rs_c + 3*cs_c] = cv3t[1]; + c[2*rs_c + 3*cs_c] = cv3t[2]; + c[3*rs_c + 3*cs_c] = cv3t[3]; + c[4*rs_c + 3*cs_c] = cv3b[0]; + c[5*rs_c + 3*cs_c] = cv3b[1]; + c[6*rs_c + 3*cs_c] = cv3b[2]; + c[7*rs_c + 3*cs_c] = cv3b[3]; + } +} +#else /* PPAPI_RELEASE < 36 case: 4x4 blocks, no SIMD */ +void bli_sgemm_opt_4x4( + dim_t k, + float *restrict alpha, + float *restrict a, + float *restrict b, + float *restrict beta, + float *restrict c, + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) +{ + /* Just call the reference implementation. */ + BLIS_SGEMM_UKERNEL_REF( + k, + alpha, + a, + b, + beta, + c, + rs_c, + cs_c, + data); +} +#endif + +void bli_dgemm_opt_4x4( + dim_t k, + double *restrict alpha, + double *restrict a, + double *restrict b, + double *restrict beta, + double *restrict c, + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) +{ + /* Just call the reference implementation. */ + BLIS_DGEMM_UKERNEL_REF( + k, + alpha, + a, + b, + beta, + c, + rs_c, + cs_c, + data); +} + +void bli_cgemm_opt_4x4( + dim_t k, + scomplex *restrict alpha, + scomplex *restrict a, + scomplex *restrict b, + scomplex *restrict beta, + scomplex *restrict c, + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) +{ + /* Just call the reference implementation. */ + BLIS_CGEMM_UKERNEL_REF( + k, + alpha, + a, + b, + beta, + c, + rs_c, + cs_c, + data); +} + +void bli_zgemm_opt_4x4( + dim_t k, + dcomplex *restrict alpha, + dcomplex *restrict a, + dcomplex *restrict b, + dcomplex *restrict beta, + dcomplex *restrict c, + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) +{ + /* Just call the reference implementation. */ + BLIS_ZGEMM_UKERNEL_REF( + k, + alpha, + a, + b, + beta, + c, + rs_c, + cs_c, + data); +} + diff --git a/testsuite/Makefile b/testsuite/Makefile index abd4c8354..850543bb6 100644 --- a/testsuite/Makefile +++ b/testsuite/Makefile @@ -45,7 +45,8 @@ # .PHONY: all bin clean \ - check-env check-env-mk check-env-fragments check-env-make-defs + check-env check-env-mk check-env-fragments check-env-make-defs \ + run run-amd64 run-x86 run-arm @@ -241,8 +242,21 @@ TEST_OBJS := $(patsubst $(TEST_SRC_PATH)/%.c, \ $(TEST_OBJ_PATH)/%.o, \ $(wildcard $(TEST_SRC_PATH)/*.c)) +ifeq ($(CONFIG_NAME),pnacl) +# Linked executable +TEST_BIN := test_libblis.unstable.pexe +# Finalized executable +TEST_BIN_PNACL := test_libblis.pexe +# Translated executable for x86-64 +TEST_BIN_AMD64 := test_libblis.x86-64.nexe +# Translated executable for x86 +TEST_BIN_X86 := test_libblis.x86.nexe +# Translated executable for ARM +TEST_BIN_ARM := test_libblis.arm.nexe +else # Binary executable name. TEST_BIN := test_libblis.x +endif # Add installed and local header paths to CFLAGS CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH) @@ -257,7 +271,11 @@ CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH) all: check-env bin +ifeq ($(CONFIG_NAME),pnacl) +bin: check-env $(TEST_BIN) $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM) +else bin: check-env $(TEST_BIN) +endif # --- Environment check rules --- @@ -301,9 +319,68 @@ else @$(LINKER) $(TEST_OBJS) $(BLIS_LIB) $(LDFLAGS) -o $@ endif +ifeq ($(CONFIG_NAME),pnacl) + +# Finalize PNaCl executable (i.e. convert from LLVM bitcode to PNaCl bitcode) +$(TEST_BIN_PNACL): $(TEST_BIN) +ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes) + $(FINALIZER) $(FINFLAGS) -o $@ $(TEST_BIN) +else + @echo "Finalizing $@" + @$(FINALIZER) $(FINFLAGS) -o $@ $(TEST_BIN) +endif + +# Translate PNaCl executable to x86-64 NaCl executable +$(TEST_BIN_AMD64): $(TEST_BIN_PNACL) +ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes) + $(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@ +else + @echo "Translating $< -> $@" + @$(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@ +endif + + +# Translate PNaCl executable to x86 NaCl executable +$(TEST_BIN_X86): $(TEST_BIN_PNACL) +ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes) + $(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@ +else + @echo "Translating $< -> $@" + @$(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@ +endif + +# Translate PNaCl executable to ARMv7 NaCl executable +$(TEST_BIN_ARM): $(TEST_BIN_PNACL) +ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes) + $(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@ +else + @echo "Translating $< -> $@" + @$(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@ +endif + +endif + +# -- Test run rules -- + +ifeq ($(CONFIG_NAME),pnacl) +run-amd64: $(TEST_BIN_AMD64) + $(NACL_SDK_ROOT)/tools/sel_ldr_x86_64 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_64.nexe -- $(TEST_BIN_AMD64) +run-x86: $(TEST_BIN_X86) + $(NACL_SDK_ROOT)/tools/sel_ldr_x86_32 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_32.nexe -- $(TEST_BIN_X86) +run-arm: $(TEST_BIN_ARM) + $(NACL_SDK_ROOT)/tools/sel_ldr_arm -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_arm.nexe -- $(TEST_BIN_ARM) +else +run: $(TEST_BIN) + ./$(TEST_BIN) +endif # -- Clean rules -- +ifeq ($(CONFIG_NAME),pnacl) +clean: + - $(RM_F) $(TEST_OBJS) $(TEST_BIN) $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM) +else clean: - $(RM_F) $(TEST_OBJS) $(TEST_BIN) +endif From 6de2d472d98baa215264a776f3d5291780a6a085 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Sun, 15 Jun 2014 08:44:31 -0400 Subject: [PATCH 03/19] CGEMM and ZGEMM kernels for PNaCl --- config/pnacl/bli_kernel.h | 23 +++- kernels/nacl/pnacl/3/bli_gemm_opt.c | 200 +++++++++++++++++++++++++++- 2 files changed, 214 insertions(+), 9 deletions(-) diff --git a/config/pnacl/bli_kernel.h b/config/pnacl/bli_kernel.h index d1d29185c..678fddd33 100644 --- a/config/pnacl/bli_kernel.h +++ b/config/pnacl/bli_kernel.h @@ -74,13 +74,19 @@ #define BLIS_DEFAULT_KC_D 120 #define BLIS_DEFAULT_NC_D 8400 +#if PPAPI_RELEASE >= 36 #define BLIS_DEFAULT_MC_C 128 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 +#else +#define BLIS_DEFAULT_MC_C 120 +#define BLIS_DEFAULT_KC_C 264 +#define BLIS_DEFAULT_NC_C 4092 +#endif -#define BLIS_DEFAULT_MC_Z 64 -#define BLIS_DEFAULT_KC_Z 256 -#define BLIS_DEFAULT_NC_Z 2048 +#define BLIS_DEFAULT_MC_Z 60 +#define BLIS_DEFAULT_KC_Z 264 +#define BLIS_DEFAULT_NC_Z 2040 // -- Register blocksizes -- @@ -92,15 +98,19 @@ #define BLIS_DEFAULT_NR_S 3 #endif -/* So far, PNaCl does not support double-precision SIMD */ #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 3 +#if PPAPI_RELEASE >= 36 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 4 +#else +#define BLIS_DEFAULT_MR_C 2 +#define BLIS_DEFAULT_NR_C 3 +#endif -#define BLIS_DEFAULT_MR_Z 4 -#define BLIS_DEFAULT_NR_Z 4 +#define BLIS_DEFAULT_MR_Z 2 +#define BLIS_DEFAULT_NR_Z 3 // NOTE: If the micro-kernel, which is typically unrolled to a factor // of f, handles leftover edge cases (ie: when k % f > 0) then these @@ -171,6 +181,7 @@ #if PPAPI_RELEASE >= 36 #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x4 +#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_4x4 #endif // -- trsm-related -- diff --git a/kernels/nacl/pnacl/3/bli_gemm_opt.c b/kernels/nacl/pnacl/3/bli_gemm_opt.c index 8261cf0dd..36f719a14 100644 --- a/kernels/nacl/pnacl/3/bli_gemm_opt.c +++ b/kernels/nacl/pnacl/3/bli_gemm_opt.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2012, The University of Texas + Copyright (C) 2014, The University of Texas Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -45,16 +45,24 @@ inline v4sf v4sf_load(const float* a) { return *((const v4sf*)a); } +inline v4sf v4sf_cload(const scomplex* a) { + return *((const v4sf*)a); +} + inline void v4sf_store(float* a, v4sf x) { *((v4sf*)a) = x; } +inline void v4sf_cstore(scomplex* a, v4sf x) { + *((v4sf*)a) = x; +} + inline v4sf v4sf_zero() { return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f }; } #endif -#if PPAPI_RELEASE >= 36 /* 8x4 blocks, SIMD enabled */ +#if PPAPI_RELEASE >= 36 void bli_sgemm_opt_8x4( dim_t k, float *restrict alpha, @@ -194,7 +202,7 @@ void bli_sgemm_opt_8x4( c[7*rs_c + 3*cs_c] = cv3b[3]; } } -#else /* PPAPI_RELEASE < 36 case: 4x4 blocks, no SIMD */ +#else void bli_sgemm_opt_4x4( dim_t k, float *restrict alpha, @@ -244,6 +252,191 @@ void bli_dgemm_opt_4x4( data); } + +#if PPAPI_RELEASE >= 36 +void bli_cgemm_opt_4x4( + dim_t k, + scomplex *restrict alpha, + scomplex *restrict a, + scomplex *restrict b, + scomplex *restrict beta, + scomplex *restrict c, + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) +{ + // Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0) + v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero(); + v4sf abv0i = v4sf_zero(), abv1i = v4sf_zero(), abv2i = v4sf_zero(), abv3i = v4sf_zero(); + for (dim_t i = 0; i < k; i += 1) { + const v4sf avt = v4sf_cload(a); + const v4sf avb = v4sf_cload(a+2); + const v4sf avr = __builtin_shufflevector(avt, avb, 0, 2, 4, 6); + const v4sf avi = __builtin_shufflevector(avt, avb, 1, 3, 5, 7); + + const v4sf bv0r = v4sf_splat(b[0].real); + const v4sf bv0i = v4sf_splat(b[0].imag); + abv0r += avr * bv0r - avi * bv0i; + abv0i += avr * bv0i + avi * bv0r; + + + const v4sf bv1r = v4sf_splat(b[1].real); + const v4sf bv1i = v4sf_splat(b[1].imag); + abv1r += avr * bv1r - avi * bv1i; + abv1i += avr * bv1i + avi * bv1r; + + const v4sf bv2r = v4sf_splat(b[2].real); + const v4sf bv2i = v4sf_splat(b[2].imag); + abv2r += avr * bv2r - avi * bv2i; + abv2i += avr * bv2i + avi * bv2r; + + const v4sf bv3r = v4sf_splat(b[3].real); + const v4sf bv3i = v4sf_splat(b[3].imag); + abv3r += avr * bv3r - avi * bv3i; + abv3i += avr * bv3i + avi * bv3r; + + a += 4; + b += 4; + } + + const v4sf alphavr = v4sf_splat(alpha->real); + const v4sf alphavi = v4sf_splat(alpha->imag); + v4sf temp; + + temp = abv0r * alphavr - abv0i * alphavi; + abv0i = abv0r * alphavi + abv0i * alphavr; + abv0r = temp; + + temp = abv1r * alphavr - abv1i * alphavi; + abv1i = abv1r * alphavi + abv1i * alphavr; + abv1r = temp; + + temp = abv2r * alphavr - abv2i * alphavi; + abv2i = abv2r * alphavi + abv2i * alphavr; + abv2r = temp; + + temp = abv3r * alphavr - abv3i * alphavi; + abv3i = abv3r * alphavi + abv3i * alphavr; + abv3r = temp; + + if (rs_c == 1) { + const v4sf cv0t = v4sf_cload(&c[0*rs_c + 0*cs_c]); + const v4sf cv1t = v4sf_cload(&c[0*rs_c + 1*cs_c]); + const v4sf cv2t = v4sf_cload(&c[0*rs_c + 2*cs_c]); + const v4sf cv3t = v4sf_cload(&c[0*rs_c + 3*cs_c]); + const v4sf cv0b = v4sf_cload(&c[2*rs_c + 0*cs_c]); + const v4sf cv1b = v4sf_cload(&c[2*rs_c + 1*cs_c]); + const v4sf cv2b = v4sf_cload(&c[2*rs_c + 2*cs_c]); + const v4sf cv3b = v4sf_cload(&c[2*rs_c + 3*cs_c]); + + v4sf cv0r = __builtin_shufflevector(cv0t, cv0b, 0, 2, 4, 6); + v4sf cv0i = __builtin_shufflevector(cv0t, cv0b, 1, 3, 5, 7); + v4sf cv1r = __builtin_shufflevector(cv1t, cv1b, 0, 2, 4, 6); + v4sf cv1i = __builtin_shufflevector(cv1t, cv1b, 1, 3, 5, 7); + v4sf cv2r = __builtin_shufflevector(cv2t, cv2b, 0, 2, 4, 6); + v4sf cv2i = __builtin_shufflevector(cv2t, cv2b, 1, 3, 5, 7); + v4sf cv3r = __builtin_shufflevector(cv3t, cv3b, 0, 2, 4, 6); + v4sf cv3i = __builtin_shufflevector(cv3t, cv3b, 1, 3, 5, 7); + + const v4sf betavr = v4sf_splat(beta->real); + const v4sf betavi = v4sf_splat(beta->imag); + + temp = abv0r + cv0r * betavr - cv0i * betavi; + cv0i = abv0i + cv0r * betavi + cv0i * betavr; + cv0r = temp; + + temp = abv1r + cv1r * betavr - cv1i * betavi; + cv1i = abv1i + cv1r * betavi + cv1i * betavr; + cv1r = temp; + + temp = abv2r + cv2r * betavr - cv2i * betavi; + cv2i = abv2i + cv2r * betavi + cv2i * betavr; + cv2r = temp; + + temp = abv3r + cv3r * betavr - cv3i * betavi; + cv3i = abv3i + cv3r * betavi + cv3i * betavr; + cv3r = temp; + + v4sf_cstore(&c[0*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 0, 4, 1, 5)); + v4sf_cstore(&c[2*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 2, 6, 3, 7)); + v4sf_cstore(&c[0*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 0, 4, 1, 5)); + v4sf_cstore(&c[2*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 2, 6, 3, 7)); + v4sf_cstore(&c[0*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 0, 4, 1, 5)); + v4sf_cstore(&c[2*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 2, 6, 3, 7)); + v4sf_cstore(&c[0*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 0, 4, 1, 5)); + v4sf_cstore(&c[2*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 2, 6, 3, 7)); + } else { + // Load columns 0, 1, 2, 3 (real part) + v4sf cv0r = (v4sf){ c[0*rs_c + 0*cs_c].real, c[1*rs_c + 0*cs_c].real, c[2*rs_c + 0*cs_c].real, c[3*rs_c + 0*cs_c].real }; + v4sf cv1r = (v4sf){ c[0*rs_c + 1*cs_c].real, c[1*rs_c + 1*cs_c].real, c[2*rs_c + 1*cs_c].real, c[3*rs_c + 1*cs_c].real }; + v4sf cv2r = (v4sf){ c[0*rs_c + 2*cs_c].real, c[1*rs_c + 2*cs_c].real, c[2*rs_c + 2*cs_c].real, c[3*rs_c + 2*cs_c].real }; + v4sf cv3r = (v4sf){ c[0*rs_c + 3*cs_c].real, c[1*rs_c + 3*cs_c].real, c[2*rs_c + 3*cs_c].real, c[3*rs_c + 3*cs_c].real }; + // Load columns 0, 1, 2, 3 (imaginary part) + v4sf cv0i = (v4sf){ c[0*rs_c + 0*cs_c].imag, c[1*rs_c + 0*cs_c].imag, c[2*rs_c + 0*cs_c].imag, c[3*rs_c + 0*cs_c].imag }; + v4sf cv1i = (v4sf){ c[0*rs_c + 1*cs_c].imag, c[1*rs_c + 1*cs_c].imag, c[2*rs_c + 1*cs_c].imag, c[3*rs_c + 1*cs_c].imag }; + v4sf cv2i = (v4sf){ c[0*rs_c + 2*cs_c].imag, c[1*rs_c + 2*cs_c].imag, c[2*rs_c + 2*cs_c].imag, c[3*rs_c + 2*cs_c].imag }; + v4sf cv3i = (v4sf){ c[0*rs_c + 3*cs_c].imag, c[1*rs_c + 3*cs_c].imag, c[2*rs_c + 3*cs_c].imag, c[3*rs_c + 3*cs_c].imag }; + + const v4sf betavr = v4sf_splat(beta->real); + const v4sf betavi = v4sf_splat(beta->imag); + temp = abv0r + cv0r * betavr - cv0i * betavi; + cv0i = abv0i + cv0r * betavi + cv0i * betavr; + cv0r = temp; + + temp = abv1r + cv1r * betavr - cv1i * betavi; + cv1i = abv1i + cv1r * betavi + cv1i * betavr; + cv1r = temp; + + temp = abv2r + cv2r * betavr - cv2i * betavi; + cv2i = abv2i + cv2r * betavi + cv2i * betavr; + cv2r = temp; + + temp = abv3r + cv3r * betavr - cv3i * betavi; + cv3i = abv3i + cv3r * betavi + cv3i * betavr; + cv3r = temp; + + // Store column 0 + c[0*rs_c + 0*cs_c].real = cv0r[0]; + c[0*rs_c + 0*cs_c].imag = cv0i[0]; + c[1*rs_c + 0*cs_c].real = cv0r[1]; + c[1*rs_c + 0*cs_c].imag = cv0i[1]; + c[2*rs_c + 0*cs_c].real = cv0r[2]; + c[2*rs_c + 0*cs_c].imag = cv0i[2]; + c[3*rs_c + 0*cs_c].real = cv0r[3]; + c[3*rs_c + 0*cs_c].imag = cv0i[3]; + + // Store column 1 + c[0*rs_c + 1*cs_c].real = cv1r[0]; + c[0*rs_c + 1*cs_c].imag = cv1i[0]; + c[1*rs_c + 1*cs_c].real = cv1r[1]; + c[1*rs_c + 1*cs_c].imag = cv1i[1]; + c[2*rs_c + 1*cs_c].real = cv1r[2]; + c[2*rs_c + 1*cs_c].imag = cv1i[2]; + c[3*rs_c + 1*cs_c].real = cv1r[3]; + c[3*rs_c + 1*cs_c].imag = cv1i[3]; + + // Store column 2 + c[0*rs_c + 2*cs_c].real = cv2r[0]; + c[0*rs_c + 2*cs_c].imag = cv2i[0]; + c[1*rs_c + 2*cs_c].real = cv2r[1]; + c[1*rs_c + 2*cs_c].imag = cv2i[1]; + c[2*rs_c + 2*cs_c].real = cv2r[2]; + c[2*rs_c + 2*cs_c].imag = cv2i[2]; + c[3*rs_c + 2*cs_c].real = cv2r[3]; + c[3*rs_c + 2*cs_c].imag = cv2i[3]; + + // Store column 3 + c[0*rs_c + 3*cs_c].real = cv3r[0]; + c[0*rs_c + 3*cs_c].imag = cv3i[0]; + c[1*rs_c + 3*cs_c].real = cv3r[1]; + c[1*rs_c + 3*cs_c].imag = cv3i[1]; + c[2*rs_c + 3*cs_c].real = cv3r[2]; + c[2*rs_c + 3*cs_c].imag = cv3i[2]; + c[3*rs_c + 3*cs_c].real = cv3r[3]; + c[3*rs_c + 3*cs_c].imag = cv3i[3]; + } +} +#else void bli_cgemm_opt_4x4( dim_t k, scomplex *restrict alpha, @@ -267,6 +460,7 @@ void bli_cgemm_opt_4x4( cs_c, data); } +#endif void bli_zgemm_opt_4x4( dim_t k, From b2ffb4de8b6872cb23537ad282e557d11dcd9c8b Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Sun, 15 Jun 2014 18:41:30 -0400 Subject: [PATCH 04/19] Reformatted PNaCl GEMM kernels --- kernels/nacl/pnacl/3/bli_gemm_opt.c | 110 ++++++++++++++-------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/kernels/nacl/pnacl/3/bli_gemm_opt.c b/kernels/nacl/pnacl/3/bli_gemm_opt.c index 36f719a14..2dca6ae45 100644 --- a/kernels/nacl/pnacl/3/bli_gemm_opt.c +++ b/kernels/nacl/pnacl/3/bli_gemm_opt.c @@ -64,15 +64,15 @@ inline v4sf v4sf_zero() { #if PPAPI_RELEASE >= 36 void bli_sgemm_opt_8x4( - dim_t k, - float *restrict alpha, - float *restrict a, - float *restrict b, - float *restrict beta, - float *restrict c, - inc_t rs_c, - inc_t cs_c, - auxinfo_t* data) + dim_t k, + float alpha[restrict static 1], + float a[restrict static 8*k], + float b[restrict static k*4], + float beta[restrict static 1], + float c[restrict static 8*4], + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) { // Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0) v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero(); @@ -204,15 +204,15 @@ void bli_sgemm_opt_8x4( } #else void bli_sgemm_opt_4x4( - dim_t k, - float *restrict alpha, - float *restrict a, - float *restrict b, - float *restrict beta, - float *restrict c, - inc_t rs_c, - inc_t cs_c, - auxinfo_t* data) + dim_t k, + float alpha[restrict static 1], + float a[restrict static 4*k], + float b[restrict static k*4], + float beta[restrict static 1], + float c[restrict static 4*4], + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) { /* Just call the reference implementation. */ BLIS_SGEMM_UKERNEL_REF( @@ -229,15 +229,15 @@ void bli_sgemm_opt_4x4( #endif void bli_dgemm_opt_4x4( - dim_t k, - double *restrict alpha, - double *restrict a, - double *restrict b, - double *restrict beta, - double *restrict c, - inc_t rs_c, - inc_t cs_c, - auxinfo_t* data) + dim_t k, + double alpha[restrict static 1], + double a[restrict static 4*k], + double b[restrict static k*4], + double beta[restrict static 1], + double c[restrict static 4*4], + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) { /* Just call the reference implementation. */ BLIS_DGEMM_UKERNEL_REF( @@ -255,15 +255,15 @@ void bli_dgemm_opt_4x4( #if PPAPI_RELEASE >= 36 void bli_cgemm_opt_4x4( - dim_t k, - scomplex *restrict alpha, - scomplex *restrict a, - scomplex *restrict b, - scomplex *restrict beta, - scomplex *restrict c, - inc_t rs_c, - inc_t cs_c, - auxinfo_t* data) + dim_t k, + scomplex alpha[restrict static 1], + scomplex a[restrict static 4*k], + scomplex b[restrict static k*4], + scomplex beta[restrict static 1], + scomplex c[restrict static 4*4], + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) { // Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0) v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero(); @@ -279,7 +279,6 @@ void bli_cgemm_opt_4x4( abv0r += avr * bv0r - avi * bv0i; abv0i += avr * bv0i + avi * bv0r; - const v4sf bv1r = v4sf_splat(b[1].real); const v4sf bv1i = v4sf_splat(b[1].imag); abv1r += avr * bv1r - avi * bv1i; @@ -379,6 +378,7 @@ void bli_cgemm_opt_4x4( const v4sf betavr = v4sf_splat(beta->real); const v4sf betavi = v4sf_splat(beta->imag); + temp = abv0r + cv0r * betavr - cv0i * betavi; cv0i = abv0i + cv0r * betavi + cv0i * betavr; cv0r = temp; @@ -438,15 +438,15 @@ void bli_cgemm_opt_4x4( } #else void bli_cgemm_opt_4x4( - dim_t k, - scomplex *restrict alpha, - scomplex *restrict a, - scomplex *restrict b, - scomplex *restrict beta, - scomplex *restrict c, - inc_t rs_c, - inc_t cs_c, - auxinfo_t* data) + dim_t k, + scomplex alpha[restrict static 1], + scomplex a[restrict static 4*k], + scomplex b[restrict static k*4], + scomplex beta[restrict static 1], + scomplex c[restrict static 4*4], + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) { /* Just call the reference implementation. */ BLIS_CGEMM_UKERNEL_REF( @@ -463,15 +463,15 @@ void bli_cgemm_opt_4x4( #endif void bli_zgemm_opt_4x4( - dim_t k, - dcomplex *restrict alpha, - dcomplex *restrict a, - dcomplex *restrict b, - dcomplex *restrict beta, - dcomplex *restrict c, - inc_t rs_c, - inc_t cs_c, - auxinfo_t* data) + dim_t k, + dcomplex alpha[restrict static 1], + dcomplex a[restrict static 4*k], + dcomplex b[restrict static k*4], + dcomplex beta[restrict static 1], + dcomplex c[restrict static 4*4], + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) { /* Just call the reference implementation. */ BLIS_ZGEMM_UKERNEL_REF( From 6f8462eb0ec278b89731e73ef583386a3371d095 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Wed, 18 Jun 2014 03:08:46 -0700 Subject: [PATCH 05/19] Fix inconsistent VERBOSE macro in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c6636932a..ca1d50fd5 100644 --- a/Makefile +++ b/Makefile @@ -469,7 +469,7 @@ endif blis-lib: check-env $(MK_LIBS) $(MK_ALL_BLIS_LIB): $(MK_ALL_BLIS_OBJS) -ifeq ($(FLA_ENABLE_VERBOSE_MAKE_OUTPUT),yes) +ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes) $(AR) $(ARFLAGS) $@ $? $(RANLIB) $@ else From 68a02976e3c3638f0a9821342e269a1743e3ace3 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Wed, 18 Jun 2014 03:10:25 -0700 Subject: [PATCH 06/19] Compile pnacl configuration in GNU11 mode to avoid warning about non-standard features --- config/pnacl/make_defs.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk index 540531203..b709441dc 100644 --- a/config/pnacl/make_defs.mk +++ b/config/pnacl/make_defs.mk @@ -80,7 +80,7 @@ CC := pnacl-clang # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 -I$(NACL_SDK_ROOT)/include +CMISCFLAGS := -std=gnu11 -I$(NACL_SDK_ROOT)/include CDBGFLAGS := -g CWARNFLAGS := -Wall COPTFLAGS := -O3 From 031deb2a5c718d569bde842590a791b812f4cf1d Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Wed, 18 Jun 2014 03:11:34 -0700 Subject: [PATCH 07/19] PNaCl configuration: use pnacl-ar instead or ar (fixes build issue on Mac) --- config/pnacl/make_defs.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk index b709441dc..1a2dd8a18 100644 --- a/config/pnacl/make_defs.mk +++ b/config/pnacl/make_defs.mk @@ -95,7 +95,7 @@ CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCF CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) # --- Determine the archiver and related flags --- -AR := ar +AR := pnacl-ar ARFLAGS := cru # --- Determine the linker and related flags --- From 4b8e71aab80182873a2e138eb07902b8d8fd5480 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Thu, 19 Jun 2014 00:43:25 -0700 Subject: [PATCH 08/19] Use AR rcs flags for PNaCl target to avoid warning --- config/pnacl/make_defs.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk index 1a2dd8a18..1c4566012 100644 --- a/config/pnacl/make_defs.mk +++ b/config/pnacl/make_defs.mk @@ -96,7 +96,7 @@ CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCF # --- Determine the archiver and related flags --- AR := pnacl-ar -ARFLAGS := cru +ARFLAGS := rcs # --- Determine the linker and related flags --- LINKER := $(CC) From 491be4f91ed725522f5cc7184053857c6c376ada Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Thu, 19 Jun 2014 00:45:44 -0700 Subject: [PATCH 09/19] Optimized dot product kernels for PNaCl --- config/pnacl/bli_kernel.h | 3 + kernels/nacl/pnacl/1/bli_dotv_opt.c | 617 ++++++++++++++++++++++++++++ 2 files changed, 620 insertions(+) create mode 100644 kernels/nacl/pnacl/1/bli_dotv_opt.c diff --git a/config/pnacl/bli_kernel.h b/config/pnacl/bli_kernel.h index 678fddd33..728f08f12 100644 --- a/config/pnacl/bli_kernel.h +++ b/config/pnacl/bli_kernel.h @@ -222,6 +222,9 @@ // -- copyv -- // -- dotv -- +#if PPAPI_RELEASE >= 36 +#define BLIS_SDOTV_KERNEL bli_sdotv_opt +#endif // -- dotxv -- diff --git a/kernels/nacl/pnacl/1/bli_dotv_opt.c b/kernels/nacl/pnacl/1/bli_dotv_opt.c new file mode 100644 index 000000000..14d203bd6 --- /dev/null +++ b/kernels/nacl/pnacl/1/bli_dotv_opt.c @@ -0,0 +1,617 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#if PPAPI_RELEASE >= 36 +typedef float v4sf __attribute__ ((vector_size(16))); + +inline v4sf v4sf_splat(float x) { + return (v4sf) { x, x, x, x }; +} + +inline v4sf v4sf_load(const float* a) { + return *((const v4sf*)a); +} + +inline v4sf v4sf_cload(const scomplex* a) { + return *((const v4sf*)a); +} + +inline void v4sf_store(float* a, v4sf x) { + *((v4sf*)a) = x; +} + +inline void v4sf_cstore(scomplex* a, v4sf x) { + *((v4sf*)a) = x; +} + +inline v4sf v4sf_zero() { + return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f }; +} +#endif + +void bli_sdotv_opt( + conj_t conjx, + conj_t conjy, + dim_t n, + float x[restrict static n], + inc_t incx, + float y[restrict static n], + inc_t incy, + float rho[restrict static 1]) +{ +#if PPAPI_RELEASE >= 36 + // If the vector lengths are zero, set rho to zero and return. + if (bli_zero_dim1(n)) { + *rho = 0.0f; + return; + } + + // If there is anything that would interfere with our use of aligned + // vector loads/stores, call the reference implementation. + if (bli_has_nonunit_inc2(incx, incy)) { + float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f, sum5 = 0.0f; + while (n >= 6) { + sum0 += (*x) * (*y); + x += incx; + y += incy; + + sum1 += (*x) * (*y); + x += incx; + y += incy; + + sum2 += (*x) * (*y); + x += incx; + y += incy; + + sum3 += (*x) * (*y); + x += incx; + y += incy; + + sum4 += (*x) * (*y); + x += incx; + y += incy; + + sum5 += (*x) * (*y); + x += incx; + y += incy; + + n -= 6; + } + float sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5); + while (n--) { + sum += (*x) * (*y); + x += incx; + y += incy; + } + *rho = sum; + } else { + v4sf vsum0 = v4sf_zero(), vsum1 = v4sf_zero(), vsum2 = v4sf_zero(); + v4sf vsum3 = v4sf_zero(), vsum4 = v4sf_zero(), vsum5 = v4sf_zero(); + while (n >= 24) { + vsum0 += v4sf_load(x) * v4sf_load(y); + vsum1 += v4sf_load(x+4) * v4sf_load(y+4); + vsum2 += v4sf_load(x+8) * v4sf_load(y+8); + vsum3 += v4sf_load(x+12) * v4sf_load(y+12); + vsum4 += v4sf_load(x+16) * v4sf_load(y+16); + vsum5 += v4sf_load(x+20) * v4sf_load(y+20); + + x += 24; + y += 24; + n -= 24; + } + v4sf vsum = (vsum0 + vsum1 + vsum2) + (vsum3 + vsum4 + vsum5); + while (n >= 4) { + vsum += v4sf_load(x) * v4sf_load(y); + + x += 4; + y += 4; + n -= 4; + } + float sum = (vsum[0] + vsum[1]) + (vsum[2] + vsum[3]); + while (n--) { + sum += (*x++) * (*y++); + } + *rho = sum; + } +#else + float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f, sum5 = 0.0f; + while (n >= 6) { + sum0 += (*x) * (*y); + x += incx; + y += incy; + + sum1 += (*x) * (*y); + x += incx; + y += incy; + + sum2 += (*x) * (*y); + x += incx; + y += incy; + + sum3 += (*x) * (*y); + x += incx; + y += incy; + + sum4 += (*x) * (*y); + x += incx; + y += incy; + + sum5 += (*x) * (*y); + x += incx; + y += incy; + + n -= 6; + } + float sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5); + while (n--) { + sum += (*x) * (*y); + x += incx; + y += incy; + } + *rho = sum; +#endif +} + +void bli_ddotv_opt_var1( + conj_t conjx, + conj_t conjy, + dim_t n, + double x[restrict static n], + inc_t incx, + double y[restrict static n], + inc_t incy, + double rho[restrict static 1]) +{ + double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0; + while (n >= 6) { + sum0 += (*x) * (*y); + x += incx; + y += incy; + + sum1 += (*x) * (*y); + x += incx; + y += incy; + + sum2 += (*x) * (*y); + x += incx; + y += incy; + + sum3 += (*x) * (*y); + x += incx; + y += incy; + + sum4 += (*x) * (*y); + x += incx; + y += incy; + + sum5 += (*x) * (*y); + x += incx; + y += incy; + + n -= 6; + } + double sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5); + while (n--) { + sum += (*x) * (*y); + x += incx; + y += incy; + } + *rho = sum; +} + +void bli_cdotv_opt_var1( + conj_t conjx, + conj_t conjy, + dim_t n, + scomplex x[restrict static n], + inc_t incx, + scomplex y[restrict static n], + inc_t incy, + scomplex rho[restrict static 1]) +{ + if (bli_is_conj(conjy)) { + bli_toggle_conj(conjx); + } + + if (bli_zero_dim1(n)) { + rho->real = 0.0f; + rho->imag = 0.0f; + return; + } + + float sumr; + float sumi; +#if PPAPI_RELEASE >= 36 + if (bli_is_noconj(conjx)) { + if (bli_has_nonunit_inc2(incx, incy)) { + float sum0r = 0.0f, sum1r = 0.0f; + float sum0i = 0.0f, sum1i = 0.0f; + while (n >= 2) { + const float x0r = x->real; + const float x0i = x->imag; + const float y0r = y->real; + const float y0i = y->imag; + + sum0r += x0r * y0r - x0i * y0i; + sum0i += x0r * y0i + x0i * y0r; + + x += incx; + y += incy; + + const float x1r = x->real; + const float x1i = x->imag; + const float y1r = y->real; + const float y1i = y->imag; + + sum1r += x1r * y1r - x1i * y1i; + sum1i += x1r * y1i + x1i * y1r; + + x += incx; + y += incy; + + n -= 2; + } + sumr = sum0r + sum1r; + sumi = sum0i + sum1i; + } else { + v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero(); + v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero(); + while (n >= 8) { + const v4sf xv0t = v4sf_cload(x); + const v4sf xv0b = v4sf_cload(x+2); + const v4sf yv0t = v4sf_cload(y); + const v4sf yv0b = v4sf_cload(y+2); + + const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6); + const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7); + const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6); + const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7); + + sumv0r += xv0r * yv0r - xv0i * yv0i; + sumv0i += xv0r * yv0i + xv0i * yv0r; + + const v4sf xv1t = v4sf_cload(x+4); + const v4sf xv1b = v4sf_cload(x+6); + const v4sf yv1t = v4sf_cload(y+4); + const v4sf yv1b = v4sf_cload(y+6); + + const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6); + const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7); + const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6); + const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7); + + sumv1r += xv1r * yv1r - xv1i * yv1i; + sumv1i += xv1r * yv1i + xv1i * yv1r; + + x += 8; + y += 8; + + n -= 8; + } + const v4sf sumvr = sumv0r + sumv1r; + const v4sf sumvi = sumv0i + sumv1i; + sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]); + sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]); + } + while (n--) { + const float xr = x->real; + const float xi = x->imag; + const float yr = y->real; + const float yi = y->imag; + + sumr += xr * yr - xi * yi; + sumi += xr * yi + xi * yr; + + x += incx; + y += incy; + } + } else { + if (bli_has_nonunit_inc2(incx, incy)) { + float sum0r = 0.0f, sum1r = 0.0f; + float sum0i = 0.0f, sum1i = 0.0f; + while (n >= 2) { + const float x0r = x->real; + const float x0i = x->imag; + const float y0r = y->real; + const float y0i = y->imag; + + sum0r += x0r * y0r + x0i * y0i; + sum0i += x0r * y0i - x0i * y0r; + + x += incx; + y += incy; + + const float x1r = x->real; + const float x1i = x->imag; + const float y1r = y->real; + const float y1i = y->imag; + + sum1r += x1r * y1r + x1i * y1i; + sum1i += x1r * y1i - x1i * y1r; + + x += incx; + y += incy; + + n -= 2; + } + sumr = sum0r + sum1r; + sumi = sum0i + sum1i; + } else { + v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero(); + v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero(); + while (n >= 8) { + const v4sf xv0t = v4sf_cload(x); + const v4sf xv0b = v4sf_cload(x+2); + const v4sf yv0t = v4sf_cload(y); + const v4sf yv0b = v4sf_cload(y+2); + + const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6); + const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7); + const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6); + const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7); + + sumv0r += xv0r * yv0r + xv0i * yv0i; + sumv0i += xv0r * yv0i - xv0i * yv0r; + + const v4sf xv1t = v4sf_cload(x+4); + const v4sf xv1b = v4sf_cload(x+6); + const v4sf yv1t = v4sf_cload(y+4); + const v4sf yv1b = v4sf_cload(y+6); + + const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6); + const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7); + const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6); + const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7); + + sumv1r += xv1r * yv1r + xv1i * yv1i; + sumv1i += xv1r * yv1i - xv1i * yv1r; + + x += 8; + y += 8; + + n -= 8; + } + const v4sf sumvr = sumv0r + sumv1r; + const v4sf sumvi = sumv0i + sumv1i; + sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]); + sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]); + } + while (n--) { + const float xr = x->real; + const float xi = x->imag; + const float yr = y->real; + const float yi = y->imag; + + sumr += xr * yr + xi * yi; + sumi += xr * yi - xi * yr; + + x += incx; + y += incy; + } + } +#else + if (bli_is_noconj(conjx)) { + float sum0r = 0.0f, sum1r = 0.0f; + float sum0i = 0.0f, sum1i = 0.0f; + while (n >= 2) { + const float x0r = x->real; + const float x0i = x->imag; + const float y0r = y->real; + const float y0i = y->imag; + + sum0r += x0r * y0r - x0i * y0i; + sum0i += x0r * y0i + x0i * y0r; + + x += incx; + y += incy; + + const float x1r = x->real; + const float x1i = x->imag; + const float y1r = y->real; + const float y1i = y->imag; + + sum1r += x1r * y1r - x1i * y1i; + sum1i += x1r * y1i + x1i * y1r; + + x += incx; + y += incy; + + n -= 2; + } + sumr = sum0r + sum1r; + sumi = sum0i + sum1i; + if (n != 0) { + const float xr = x->real; + const float xi = x->imag; + const float yr = y->real; + const float yi = y->imag; + + sumr += xr * yr - xi * yi; + sumi += xr * yi + xi * yr; + } + } else { + float sum0r = 0.0f, sum1r = 0.0f; + float sum0i = 0.0f, sum1i = 0.0f; + while (n >= 2) { + const float x0r = x->real; + const float x0i = x->imag; + const float y0r = y->real; + const float y0i = y->imag; + + sum0r += x0r * y0r + x0i * y0i; + sum0i += x0r * y0i - x0i * y0r; + + x += incx; + y += incy; + + const float x1r = x->real; + const float x1i = x->imag; + const float y1r = y->real; + const float y1i = y->imag; + + sum1r += x1r * y1r + x1i * y1i; + sum1i += x1r * y1i - x1i * y1r; + + x += incx; + y += incy; + + n -= 2; + } + sumr = sum0r + sum1r; + sumi = sum0i + sum1i; + if (n != 0) { + const float xr = x->real; + const float xi = x->imag; + const float yr = y->real; + const float yi = y->imag; + + sumr += xr * yr + xi * yi; + sumi += xr * yi - xi * yr; + } + } +#endif + + rho->real = sumr; + rho->imag = bli_is_conj(conjy) ? -sumi : sumi; +} + + + +void bli_zdotv_opt_var1( + conj_t conjx, + conj_t conjy, + dim_t n, + dcomplex x[restrict static n], + inc_t incx, + dcomplex y[restrict static n], + inc_t incy, + dcomplex rho[restrict static 1]) +{ + if (bli_is_conj(conjy)) { + bli_toggle_conj(conjx); + } + + if (bli_zero_dim1(n)) { + rho->real = 0.0; + rho->imag = 0.0; + return; + } + + double sumr; + double sumi; + if (bli_is_noconj(conjx)) { + double sum0r = 0.0, sum1r = 0.0; + double sum0i = 0.0, sum1i = 0.0; + while (n >= 2) { + const double x0r = x->real; + const double x0i = x->imag; + const double y0r = y->real; + const double y0i = y->imag; + + sum0r += x0r * y0r - x0i * y0i; + sum0i += x0r * y0i + x0i * y0r; + + x += incx; + y += incy; + + const double x1r = x->real; + const double x1i = x->imag; + const double y1r = y->real; + const double y1i = y->imag; + + sum1r += x1r * y1r - x1i * y1i; + sum1i += x1r * y1i + x1i * y1r; + + x += incx; + y += incy; + + n -= 2; + } + sumr = sum0r + sum1r; + sumi = sum0i + sum1i; + if (n != 0) { + const double xr = x->real; + const double xi = x->imag; + const double yr = y->real; + const double yi = y->imag; + + sumr += xr * yr - xi * yi; + sumi += xr * yi + xi * yr; + } + } else { + double sum0r = 0.0, sum1r = 0.0; + double sum0i = 0.0, sum1i = 0.0; + while (n >= 2) { + const double x0r = x->real; + const double x0i = x->imag; + const double y0r = y->real; + const double y0i = y->imag; + + sum0r += x0r * y0r + x0i * y0i; + sum0i += x0r * y0i - x0i * y0r; + + x += incx; + y += incy; + + const double x1r = x->real; + const double x1i = x->imag; + const double y1r = y->real; + const double y1i = y->imag; + + sum1r += x1r * y1r + x1i * y1i; + sum1i += x1r * y1i - x1i * y1r; + + x += incx; + y += incy; + + n -= 2; + } + sumr = sum0r + sum1r; + sumi = sum0i + sum1i; + if (n != 0) { + const double xr = x->real; + const double xi = x->imag; + const double yr = y->real; + const double yi = y->imag; + + sumr += xr * yr + xi * yi; + sumi += xr * yi - xi * yr; + } + } + + rho->real = sumr; + rho->imag = bli_is_conj(conjy) ? -sumi : sumi; +} From 020a831bc5f61744cb8354886aa679b99b1285f6 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Thu, 19 Jun 2014 00:58:26 -0700 Subject: [PATCH 10/19] Code clean-up in PNaCl port --- config/pnacl/bli_kernel.h | 9 +- kernels/nacl/pnacl/1/bli_dotv_opt.c | 7 +- kernels/nacl/pnacl/3/bli_gemm_opt.c | 786 ++++++++++++---------------- 3 files changed, 351 insertions(+), 451 deletions(-) diff --git a/config/pnacl/bli_kernel.h b/config/pnacl/bli_kernel.h index 728f08f12..d3ad1e721 100644 --- a/config/pnacl/bli_kernel.h +++ b/config/pnacl/bli_kernel.h @@ -180,8 +180,8 @@ // -- gemm -- #if PPAPI_RELEASE >= 36 -#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x4 -#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_4x4 +#define BLIS_SGEMM_UKERNEL bli_sgemm_opt +#define BLIS_CGEMM_UKERNEL bli_cgemm_opt #endif // -- trsm-related -- @@ -222,9 +222,10 @@ // -- copyv -- // -- dotv -- -#if PPAPI_RELEASE >= 36 #define BLIS_SDOTV_KERNEL bli_sdotv_opt -#endif +#define BLIS_DDOTV_KERNEL bli_ddotv_opt +#define BLIS_CDOTV_KERNEL bli_cdotv_opt +#define BLIS_ZDOTV_KERNEL bli_zdotv_opt // -- dotxv -- diff --git a/kernels/nacl/pnacl/1/bli_dotv_opt.c b/kernels/nacl/pnacl/1/bli_dotv_opt.c index 14d203bd6..9d6408245 100644 --- a/kernels/nacl/pnacl/1/bli_dotv_opt.c +++ b/kernels/nacl/pnacl/1/bli_dotv_opt.c @@ -185,7 +185,7 @@ void bli_sdotv_opt( #endif } -void bli_ddotv_opt_var1( +void bli_ddotv_opt( conj_t conjx, conj_t conjy, dim_t n, @@ -232,7 +232,7 @@ void bli_ddotv_opt_var1( *rho = sum; } -void bli_cdotv_opt_var1( +void bli_cdotv_opt( conj_t conjx, conj_t conjy, dim_t n, @@ -510,7 +510,7 @@ void bli_cdotv_opt_var1( -void bli_zdotv_opt_var1( +void bli_zdotv_opt( conj_t conjx, conj_t conjy, dim_t n, @@ -615,3 +615,4 @@ void bli_zdotv_opt_var1( rho->real = sumr; rho->imag = bli_is_conj(conjy) ? -sumi : sumi; } + diff --git a/kernels/nacl/pnacl/3/bli_gemm_opt.c b/kernels/nacl/pnacl/3/bli_gemm_opt.c index 2dca6ae45..2c37397c7 100644 --- a/kernels/nacl/pnacl/3/bli_gemm_opt.c +++ b/kernels/nacl/pnacl/3/bli_gemm_opt.c @@ -35,454 +35,352 @@ #include "blis.h" #if PPAPI_RELEASE >= 36 -typedef float v4sf __attribute__ ((vector_size(16))); + typedef float v4sf __attribute__ ((vector_size(16))); -inline v4sf v4sf_splat(float x) { - return (v4sf) { x, x, x, x }; -} + inline v4sf v4sf_splat(float x) { + return (v4sf) { x, x, x, x }; + } -inline v4sf v4sf_load(const float* a) { - return *((const v4sf*)a); -} + inline v4sf v4sf_load(const float* a) { + return *((const v4sf*)a); + } -inline v4sf v4sf_cload(const scomplex* a) { - return *((const v4sf*)a); -} + inline v4sf v4sf_cload(const scomplex* a) { + return *((const v4sf*)a); + } -inline void v4sf_store(float* a, v4sf x) { - *((v4sf*)a) = x; -} + inline void v4sf_store(float* a, v4sf x) { + *((v4sf*)a) = x; + } -inline void v4sf_cstore(scomplex* a, v4sf x) { - *((v4sf*)a) = x; -} + inline void v4sf_cstore(scomplex* a, v4sf x) { + *((v4sf*)a) = x; + } -inline v4sf v4sf_zero() { - return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f }; -} + inline v4sf v4sf_zero() { + return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f }; + } + + void bli_sgemm_opt( + dim_t k, + float alpha[restrict static 1], + float a[restrict static 8*k], + float b[restrict static k*4], + float beta[restrict static 1], + float c[restrict static 8*4], + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) + { + // Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0) + v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero(); + v4sf abv0b = v4sf_zero(), abv1b = v4sf_zero(), abv2b = v4sf_zero(), abv3b = v4sf_zero(); + for (dim_t i = 0; i < k; i += 1) { + const v4sf avt = v4sf_load(a); + const v4sf avb = v4sf_load(a+4); + + const v4sf bv_xxxx = v4sf_splat(b[0]); + abv0t += avt * bv_xxxx; + abv0b += avb * bv_xxxx; + + const v4sf bv_yyyy = v4sf_splat(b[1]); + abv1t += avt * bv_yyyy; + abv1b += avb * bv_yyyy; + + const v4sf bv_zzzz = v4sf_splat(b[2]); + abv2t += avt * bv_zzzz; + abv2b += avb * bv_zzzz; + + const v4sf bv_wwww = v4sf_splat(b[3]); + abv3t += avt * bv_wwww; + abv3b += avb * bv_wwww; + + a += 8; + b += 4; + } + + const v4sf alphav = v4sf_splat(*alpha); + abv0t *= alphav; + abv0b *= alphav; + abv1t *= alphav; + abv1b *= alphav; + abv2t *= alphav; + abv2b *= alphav; + abv3t *= alphav; + abv3b *= alphav; + + if (rs_c == 1) { + v4sf cv0t = v4sf_load(&c[0*rs_c + 0*cs_c]); + v4sf cv1t = v4sf_load(&c[0*rs_c + 1*cs_c]); + v4sf cv2t = v4sf_load(&c[0*rs_c + 2*cs_c]); + v4sf cv3t = v4sf_load(&c[0*rs_c + 3*cs_c]); + v4sf cv0b = v4sf_load(&c[4*rs_c + 0*cs_c]); + v4sf cv1b = v4sf_load(&c[4*rs_c + 1*cs_c]); + v4sf cv2b = v4sf_load(&c[4*rs_c + 2*cs_c]); + v4sf cv3b = v4sf_load(&c[4*rs_c + 3*cs_c]); + + const v4sf betav = v4sf_splat(*beta); + cv0t = cv0t * betav + abv0t; + cv1t = cv1t * betav + abv1t; + cv2t = cv2t * betav + abv2t; + cv3t = cv3t * betav + abv3t; + cv0b = cv0b * betav + abv0b; + cv1b = cv1b * betav + abv1b; + cv2b = cv2b * betav + abv2b; + cv3b = cv3b * betav + abv3b; + + v4sf_store(&c[0*rs_c + 0*cs_c], cv0t); + v4sf_store(&c[0*rs_c + 1*cs_c], cv1t); + v4sf_store(&c[0*rs_c + 2*cs_c], cv2t); + v4sf_store(&c[0*rs_c + 3*cs_c], cv3t); + v4sf_store(&c[4*rs_c + 0*cs_c], cv0b); + v4sf_store(&c[4*rs_c + 1*cs_c], cv1b); + v4sf_store(&c[4*rs_c + 2*cs_c], cv2b); + v4sf_store(&c[4*rs_c + 3*cs_c], cv3b); + } else { + // Load columns 0, 1, 2, 3 (top part) + v4sf cv0t = (v4sf){ c[0*rs_c + 0*cs_c], c[1*rs_c + 0*cs_c], c[2*rs_c + 0*cs_c], c[3*rs_c + 0*cs_c] }; + v4sf cv1t = (v4sf){ c[0*rs_c + 1*cs_c], c[1*rs_c + 1*cs_c], c[2*rs_c + 1*cs_c], c[3*rs_c + 1*cs_c] }; + v4sf cv2t = (v4sf){ c[0*rs_c + 2*cs_c], c[1*rs_c + 2*cs_c], c[2*rs_c + 2*cs_c], c[3*rs_c + 2*cs_c] }; + v4sf cv3t = (v4sf){ c[0*rs_c + 3*cs_c], c[1*rs_c + 3*cs_c], c[2*rs_c + 3*cs_c], c[3*rs_c + 3*cs_c] }; + // Load columns 0, 1, 2, 3 (bottom part) + v4sf cv0b = (v4sf){ c[4*rs_c + 0*cs_c], c[5*rs_c + 0*cs_c], c[6*rs_c + 0*cs_c], c[7*rs_c + 0*cs_c] }; + v4sf cv1b = (v4sf){ c[4*rs_c + 1*cs_c], c[5*rs_c + 1*cs_c], c[6*rs_c + 1*cs_c], c[7*rs_c + 1*cs_c] }; + v4sf cv2b = (v4sf){ c[4*rs_c + 2*cs_c], c[5*rs_c + 2*cs_c], c[6*rs_c + 2*cs_c], c[7*rs_c + 2*cs_c] }; + v4sf cv3b = (v4sf){ c[4*rs_c + 3*cs_c], c[5*rs_c + 3*cs_c], c[6*rs_c + 3*cs_c], c[7*rs_c + 3*cs_c] }; + + const v4sf betav = v4sf_splat(*beta); + cv0t = cv0t * betav + abv0t; + cv1t = cv1t * betav + abv1t; + cv2t = cv2t * betav + abv2t; + cv3t = cv3t * betav + abv3t; + cv0b = cv0b * betav + abv0b; + cv1b = cv1b * betav + abv1b; + cv2b = cv2b * betav + abv2b; + cv3b = cv3b * betav + abv3b; + + // Store column 0 + c[0*rs_c + 0*cs_c] = cv0t[0]; + c[1*rs_c + 0*cs_c] = cv0t[1]; + c[2*rs_c + 0*cs_c] = cv0t[2]; + c[3*rs_c + 0*cs_c] = cv0t[3]; + c[4*rs_c + 0*cs_c] = cv0b[0]; + c[5*rs_c + 0*cs_c] = cv0b[1]; + c[6*rs_c + 0*cs_c] = cv0b[2]; + c[7*rs_c + 0*cs_c] = cv0b[3]; + + // Store column 1 + c[0*rs_c + 1*cs_c] = cv1t[0]; + c[1*rs_c + 1*cs_c] = cv1t[1]; + c[2*rs_c + 1*cs_c] = cv1t[2]; + c[3*rs_c + 1*cs_c] = cv1t[3]; + c[4*rs_c + 1*cs_c] = cv1b[0]; + c[5*rs_c + 1*cs_c] = cv1b[1]; + c[6*rs_c + 1*cs_c] = cv1b[2]; + c[7*rs_c + 1*cs_c] = cv1b[3]; + + // Store column 2 + c[0*rs_c + 2*cs_c] = cv2t[0]; + c[1*rs_c + 2*cs_c] = cv2t[1]; + c[2*rs_c + 2*cs_c] = cv2t[2]; + c[3*rs_c + 2*cs_c] = cv2t[3]; + c[4*rs_c + 2*cs_c] = cv2b[0]; + c[5*rs_c + 2*cs_c] = cv2b[1]; + c[6*rs_c + 2*cs_c] = cv2b[2]; + c[7*rs_c + 2*cs_c] = cv2b[3]; + + // Store column 3 + c[0*rs_c + 3*cs_c] = cv3t[0]; + c[1*rs_c + 3*cs_c] = cv3t[1]; + c[2*rs_c + 3*cs_c] = cv3t[2]; + c[3*rs_c + 3*cs_c] = cv3t[3]; + c[4*rs_c + 3*cs_c] = cv3b[0]; + c[5*rs_c + 3*cs_c] = cv3b[1]; + c[6*rs_c + 3*cs_c] = cv3b[2]; + c[7*rs_c + 3*cs_c] = cv3b[3]; + } + } + + void bli_cgemm_opt( + dim_t k, + scomplex alpha[restrict static 1], + scomplex a[restrict static 4*k], + scomplex b[restrict static k*4], + scomplex beta[restrict static 1], + scomplex c[restrict static 4*4], + inc_t rs_c, + inc_t cs_c, + auxinfo_t* data) + { + // Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0) + v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero(); + v4sf abv0i = v4sf_zero(), abv1i = v4sf_zero(), abv2i = v4sf_zero(), abv3i = v4sf_zero(); + for (dim_t i = 0; i < k; i += 1) { + const v4sf avt = v4sf_cload(a); + const v4sf avb = v4sf_cload(a+2); + const v4sf avr = __builtin_shufflevector(avt, avb, 0, 2, 4, 6); + const v4sf avi = __builtin_shufflevector(avt, avb, 1, 3, 5, 7); + + const v4sf bv0r = v4sf_splat(b[0].real); + const v4sf bv0i = v4sf_splat(b[0].imag); + abv0r += avr * bv0r - avi * bv0i; + abv0i += avr * bv0i + avi * bv0r; + + const v4sf bv1r = v4sf_splat(b[1].real); + const v4sf bv1i = v4sf_splat(b[1].imag); + abv1r += avr * bv1r - avi * bv1i; + abv1i += avr * bv1i + avi * bv1r; + + const v4sf bv2r = v4sf_splat(b[2].real); + const v4sf bv2i = v4sf_splat(b[2].imag); + abv2r += avr * bv2r - avi * bv2i; + abv2i += avr * bv2i + avi * bv2r; + + const v4sf bv3r = v4sf_splat(b[3].real); + const v4sf bv3i = v4sf_splat(b[3].imag); + abv3r += avr * bv3r - avi * bv3i; + abv3i += avr * bv3i + avi * bv3r; + + a += 4; + b += 4; + } + + const v4sf alphavr = v4sf_splat(alpha->real); + const v4sf alphavi = v4sf_splat(alpha->imag); + v4sf temp; + + temp = abv0r * alphavr - abv0i * alphavi; + abv0i = abv0r * alphavi + abv0i * alphavr; + abv0r = temp; + + temp = abv1r * alphavr - abv1i * alphavi; + abv1i = abv1r * alphavi + abv1i * alphavr; + abv1r = temp; + + temp = abv2r * alphavr - abv2i * alphavi; + abv2i = abv2r * alphavi + abv2i * alphavr; + abv2r = temp; + + temp = abv3r * alphavr - abv3i * alphavi; + abv3i = abv3r * alphavi + abv3i * alphavr; + abv3r = temp; + + if (rs_c == 1) { + const v4sf cv0t = v4sf_cload(&c[0*rs_c + 0*cs_c]); + const v4sf cv1t = v4sf_cload(&c[0*rs_c + 1*cs_c]); + const v4sf cv2t = v4sf_cload(&c[0*rs_c + 2*cs_c]); + const v4sf cv3t = v4sf_cload(&c[0*rs_c + 3*cs_c]); + const v4sf cv0b = v4sf_cload(&c[2*rs_c + 0*cs_c]); + const v4sf cv1b = v4sf_cload(&c[2*rs_c + 1*cs_c]); + const v4sf cv2b = v4sf_cload(&c[2*rs_c + 2*cs_c]); + const v4sf cv3b = v4sf_cload(&c[2*rs_c + 3*cs_c]); + + v4sf cv0r = __builtin_shufflevector(cv0t, cv0b, 0, 2, 4, 6); + v4sf cv0i = __builtin_shufflevector(cv0t, cv0b, 1, 3, 5, 7); + v4sf cv1r = __builtin_shufflevector(cv1t, cv1b, 0, 2, 4, 6); + v4sf cv1i = __builtin_shufflevector(cv1t, cv1b, 1, 3, 5, 7); + v4sf cv2r = __builtin_shufflevector(cv2t, cv2b, 0, 2, 4, 6); + v4sf cv2i = __builtin_shufflevector(cv2t, cv2b, 1, 3, 5, 7); + v4sf cv3r = __builtin_shufflevector(cv3t, cv3b, 0, 2, 4, 6); + v4sf cv3i = __builtin_shufflevector(cv3t, cv3b, 1, 3, 5, 7); + + const v4sf betavr = v4sf_splat(beta->real); + const v4sf betavi = v4sf_splat(beta->imag); + + temp = abv0r + cv0r * betavr - cv0i * betavi; + cv0i = abv0i + cv0r * betavi + cv0i * betavr; + cv0r = temp; + + temp = abv1r + cv1r * betavr - cv1i * betavi; + cv1i = abv1i + cv1r * betavi + cv1i * betavr; + cv1r = temp; + + temp = abv2r + cv2r * betavr - cv2i * betavi; + cv2i = abv2i + cv2r * betavi + cv2i * betavr; + cv2r = temp; + + temp = abv3r + cv3r * betavr - cv3i * betavi; + cv3i = abv3i + cv3r * betavi + cv3i * betavr; + cv3r = temp; + + v4sf_cstore(&c[0*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 0, 4, 1, 5)); + v4sf_cstore(&c[2*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 2, 6, 3, 7)); + v4sf_cstore(&c[0*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 0, 4, 1, 5)); + v4sf_cstore(&c[2*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 2, 6, 3, 7)); + v4sf_cstore(&c[0*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 0, 4, 1, 5)); + v4sf_cstore(&c[2*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 2, 6, 3, 7)); + v4sf_cstore(&c[0*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 0, 4, 1, 5)); + v4sf_cstore(&c[2*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 2, 6, 3, 7)); + } else { + // Load columns 0, 1, 2, 3 (real part) + v4sf cv0r = (v4sf){ c[0*rs_c + 0*cs_c].real, c[1*rs_c + 0*cs_c].real, c[2*rs_c + 0*cs_c].real, c[3*rs_c + 0*cs_c].real }; + v4sf cv1r = (v4sf){ c[0*rs_c + 1*cs_c].real, c[1*rs_c + 1*cs_c].real, c[2*rs_c + 1*cs_c].real, c[3*rs_c + 1*cs_c].real }; + v4sf cv2r = (v4sf){ c[0*rs_c + 2*cs_c].real, c[1*rs_c + 2*cs_c].real, c[2*rs_c + 2*cs_c].real, c[3*rs_c + 2*cs_c].real }; + v4sf cv3r = (v4sf){ c[0*rs_c + 3*cs_c].real, c[1*rs_c + 3*cs_c].real, c[2*rs_c + 3*cs_c].real, c[3*rs_c + 3*cs_c].real }; + // Load columns 0, 1, 2, 3 (imaginary part) + v4sf cv0i = (v4sf){ c[0*rs_c + 0*cs_c].imag, c[1*rs_c + 0*cs_c].imag, c[2*rs_c + 0*cs_c].imag, c[3*rs_c + 0*cs_c].imag }; + v4sf cv1i = (v4sf){ c[0*rs_c + 1*cs_c].imag, c[1*rs_c + 1*cs_c].imag, c[2*rs_c + 1*cs_c].imag, c[3*rs_c + 1*cs_c].imag }; + v4sf cv2i = (v4sf){ c[0*rs_c + 2*cs_c].imag, c[1*rs_c + 2*cs_c].imag, c[2*rs_c + 2*cs_c].imag, c[3*rs_c + 2*cs_c].imag }; + v4sf cv3i = (v4sf){ c[0*rs_c + 3*cs_c].imag, c[1*rs_c + 3*cs_c].imag, c[2*rs_c + 3*cs_c].imag, c[3*rs_c + 3*cs_c].imag }; + + const v4sf betavr = v4sf_splat(beta->real); + const v4sf betavi = v4sf_splat(beta->imag); + + temp = abv0r + cv0r * betavr - cv0i * betavi; + cv0i = abv0i + cv0r * betavi + cv0i * betavr; + cv0r = temp; + + temp = abv1r + cv1r * betavr - cv1i * betavi; + cv1i = abv1i + cv1r * betavi + cv1i * betavr; + cv1r = temp; + + temp = abv2r + cv2r * betavr - cv2i * betavi; + cv2i = abv2i + cv2r * betavi + cv2i * betavr; + cv2r = temp; + + temp = abv3r + cv3r * betavr - cv3i * betavi; + cv3i = abv3i + cv3r * betavi + cv3i * betavr; + cv3r = temp; + + // Store column 0 + c[0*rs_c + 0*cs_c].real = cv0r[0]; + c[0*rs_c + 0*cs_c].imag = cv0i[0]; + c[1*rs_c + 0*cs_c].real = cv0r[1]; + c[1*rs_c + 0*cs_c].imag = cv0i[1]; + c[2*rs_c + 0*cs_c].real = cv0r[2]; + c[2*rs_c + 0*cs_c].imag = cv0i[2]; + c[3*rs_c + 0*cs_c].real = cv0r[3]; + c[3*rs_c + 0*cs_c].imag = cv0i[3]; + + // Store column 1 + c[0*rs_c + 1*cs_c].real = cv1r[0]; + c[0*rs_c + 1*cs_c].imag = cv1i[0]; + c[1*rs_c + 1*cs_c].real = cv1r[1]; + c[1*rs_c + 1*cs_c].imag = cv1i[1]; + c[2*rs_c + 1*cs_c].real = cv1r[2]; + c[2*rs_c + 1*cs_c].imag = cv1i[2]; + c[3*rs_c + 1*cs_c].real = cv1r[3]; + c[3*rs_c + 1*cs_c].imag = cv1i[3]; + + // Store column 2 + c[0*rs_c + 2*cs_c].real = cv2r[0]; + c[0*rs_c + 2*cs_c].imag = cv2i[0]; + c[1*rs_c + 2*cs_c].real = cv2r[1]; + c[1*rs_c + 2*cs_c].imag = cv2i[1]; + c[2*rs_c + 2*cs_c].real = cv2r[2]; + c[2*rs_c + 2*cs_c].imag = cv2i[2]; + c[3*rs_c + 2*cs_c].real = cv2r[3]; + c[3*rs_c + 2*cs_c].imag = cv2i[3]; + + // Store column 3 + c[0*rs_c + 3*cs_c].real = cv3r[0]; + c[0*rs_c + 3*cs_c].imag = cv3i[0]; + c[1*rs_c + 3*cs_c].real = cv3r[1]; + c[1*rs_c + 3*cs_c].imag = cv3i[1]; + c[2*rs_c + 3*cs_c].real = cv3r[2]; + c[2*rs_c + 3*cs_c].imag = cv3i[2]; + c[3*rs_c + 3*cs_c].real = cv3r[3]; + c[3*rs_c + 3*cs_c].imag = cv3i[3]; + } + } #endif - -#if PPAPI_RELEASE >= 36 -void bli_sgemm_opt_8x4( - dim_t k, - float alpha[restrict static 1], - float a[restrict static 8*k], - float b[restrict static k*4], - float beta[restrict static 1], - float c[restrict static 8*4], - inc_t rs_c, - inc_t cs_c, - auxinfo_t* data) -{ - // Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0) - v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero(); - v4sf abv0b = v4sf_zero(), abv1b = v4sf_zero(), abv2b = v4sf_zero(), abv3b = v4sf_zero(); - for (dim_t i = 0; i < k; i += 1) { - const v4sf avt = v4sf_load(a); - const v4sf avb = v4sf_load(a+4); - - const v4sf bv_xxxx = v4sf_splat(b[0]); - abv0t += avt * bv_xxxx; - abv0b += avb * bv_xxxx; - - const v4sf bv_yyyy = v4sf_splat(b[1]); - abv1t += avt * bv_yyyy; - abv1b += avb * bv_yyyy; - - const v4sf bv_zzzz = v4sf_splat(b[2]); - abv2t += avt * bv_zzzz; - abv2b += avb * bv_zzzz; - - const v4sf bv_wwww = v4sf_splat(b[3]); - abv3t += avt * bv_wwww; - abv3b += avb * bv_wwww; - - a += 8; - b += 4; - } - - const v4sf alphav = v4sf_splat(*alpha); - abv0t *= alphav; - abv0b *= alphav; - abv1t *= alphav; - abv1b *= alphav; - abv2t *= alphav; - abv2b *= alphav; - abv3t *= alphav; - abv3b *= alphav; - - if (rs_c == 1) { - v4sf cv0t = v4sf_load(&c[0*rs_c + 0*cs_c]); - v4sf cv1t = v4sf_load(&c[0*rs_c + 1*cs_c]); - v4sf cv2t = v4sf_load(&c[0*rs_c + 2*cs_c]); - v4sf cv3t = v4sf_load(&c[0*rs_c + 3*cs_c]); - v4sf cv0b = v4sf_load(&c[4*rs_c + 0*cs_c]); - v4sf cv1b = v4sf_load(&c[4*rs_c + 1*cs_c]); - v4sf cv2b = v4sf_load(&c[4*rs_c + 2*cs_c]); - v4sf cv3b = v4sf_load(&c[4*rs_c + 3*cs_c]); - - const v4sf betav = v4sf_splat(*beta); - cv0t = cv0t * betav + abv0t; - cv1t = cv1t * betav + abv1t; - cv2t = cv2t * betav + abv2t; - cv3t = cv3t * betav + abv3t; - cv0b = cv0b * betav + abv0b; - cv1b = cv1b * betav + abv1b; - cv2b = cv2b * betav + abv2b; - cv3b = cv3b * betav + abv3b; - - v4sf_store(&c[0*rs_c + 0*cs_c], cv0t); - v4sf_store(&c[0*rs_c + 1*cs_c], cv1t); - v4sf_store(&c[0*rs_c + 2*cs_c], cv2t); - v4sf_store(&c[0*rs_c + 3*cs_c], cv3t); - v4sf_store(&c[4*rs_c + 0*cs_c], cv0b); - v4sf_store(&c[4*rs_c + 1*cs_c], cv1b); - v4sf_store(&c[4*rs_c + 2*cs_c], cv2b); - v4sf_store(&c[4*rs_c + 3*cs_c], cv3b); - } else { - // Load columns 0, 1, 2, 3 (top part) - v4sf cv0t = (v4sf){ c[0*rs_c + 0*cs_c], c[1*rs_c + 0*cs_c], c[2*rs_c + 0*cs_c], c[3*rs_c + 0*cs_c] }; - v4sf cv1t = (v4sf){ c[0*rs_c + 1*cs_c], c[1*rs_c + 1*cs_c], c[2*rs_c + 1*cs_c], c[3*rs_c + 1*cs_c] }; - v4sf cv2t = (v4sf){ c[0*rs_c + 2*cs_c], c[1*rs_c + 2*cs_c], c[2*rs_c + 2*cs_c], c[3*rs_c + 2*cs_c] }; - v4sf cv3t = (v4sf){ c[0*rs_c + 3*cs_c], c[1*rs_c + 3*cs_c], c[2*rs_c + 3*cs_c], c[3*rs_c + 3*cs_c] }; - // Load columns 0, 1, 2, 3 (bottom part) - v4sf cv0b = (v4sf){ c[4*rs_c + 0*cs_c], c[5*rs_c + 0*cs_c], c[6*rs_c + 0*cs_c], c[7*rs_c + 0*cs_c] }; - v4sf cv1b = (v4sf){ c[4*rs_c + 1*cs_c], c[5*rs_c + 1*cs_c], c[6*rs_c + 1*cs_c], c[7*rs_c + 1*cs_c] }; - v4sf cv2b = (v4sf){ c[4*rs_c + 2*cs_c], c[5*rs_c + 2*cs_c], c[6*rs_c + 2*cs_c], c[7*rs_c + 2*cs_c] }; - v4sf cv3b = (v4sf){ c[4*rs_c + 3*cs_c], c[5*rs_c + 3*cs_c], c[6*rs_c + 3*cs_c], c[7*rs_c + 3*cs_c] }; - - const v4sf betav = v4sf_splat(*beta); - cv0t = cv0t * betav + abv0t; - cv1t = cv1t * betav + abv1t; - cv2t = cv2t * betav + abv2t; - cv3t = cv3t * betav + abv3t; - cv0b = cv0b * betav + abv0b; - cv1b = cv1b * betav + abv1b; - cv2b = cv2b * betav + abv2b; - cv3b = cv3b * betav + abv3b; - - // Store column 0 - c[0*rs_c + 0*cs_c] = cv0t[0]; - c[1*rs_c + 0*cs_c] = cv0t[1]; - c[2*rs_c + 0*cs_c] = cv0t[2]; - c[3*rs_c + 0*cs_c] = cv0t[3]; - c[4*rs_c + 0*cs_c] = cv0b[0]; - c[5*rs_c + 0*cs_c] = cv0b[1]; - c[6*rs_c + 0*cs_c] = cv0b[2]; - c[7*rs_c + 0*cs_c] = cv0b[3]; - - // Store column 1 - c[0*rs_c + 1*cs_c] = cv1t[0]; - c[1*rs_c + 1*cs_c] = cv1t[1]; - c[2*rs_c + 1*cs_c] = cv1t[2]; - c[3*rs_c + 1*cs_c] = cv1t[3]; - c[4*rs_c + 1*cs_c] = cv1b[0]; - c[5*rs_c + 1*cs_c] = cv1b[1]; - c[6*rs_c + 1*cs_c] = cv1b[2]; - c[7*rs_c + 1*cs_c] = cv1b[3]; - - // Store column 2 - c[0*rs_c + 2*cs_c] = cv2t[0]; - c[1*rs_c + 2*cs_c] = cv2t[1]; - c[2*rs_c + 2*cs_c] = cv2t[2]; - c[3*rs_c + 2*cs_c] = cv2t[3]; - c[4*rs_c + 2*cs_c] = cv2b[0]; - c[5*rs_c + 2*cs_c] = cv2b[1]; - c[6*rs_c + 2*cs_c] = cv2b[2]; - c[7*rs_c + 2*cs_c] = cv2b[3]; - - // Store column 3 - c[0*rs_c + 3*cs_c] = cv3t[0]; - c[1*rs_c + 3*cs_c] = cv3t[1]; - c[2*rs_c + 3*cs_c] = cv3t[2]; - c[3*rs_c + 3*cs_c] = cv3t[3]; - c[4*rs_c + 3*cs_c] = cv3b[0]; - c[5*rs_c + 3*cs_c] = cv3b[1]; - c[6*rs_c + 3*cs_c] = cv3b[2]; - c[7*rs_c + 3*cs_c] = cv3b[3]; - } -} -#else -void bli_sgemm_opt_4x4( - dim_t k, - float alpha[restrict static 1], - float a[restrict static 4*k], - float b[restrict static k*4], - float beta[restrict static 1], - float c[restrict static 4*4], - inc_t rs_c, - inc_t cs_c, - auxinfo_t* data) -{ - /* Just call the reference implementation. */ - BLIS_SGEMM_UKERNEL_REF( - k, - alpha, - a, - b, - beta, - c, - rs_c, - cs_c, - data); -} -#endif - -void bli_dgemm_opt_4x4( - dim_t k, - double alpha[restrict static 1], - double a[restrict static 4*k], - double b[restrict static k*4], - double beta[restrict static 1], - double c[restrict static 4*4], - inc_t rs_c, - inc_t cs_c, - auxinfo_t* data) -{ - /* Just call the reference implementation. */ - BLIS_DGEMM_UKERNEL_REF( - k, - alpha, - a, - b, - beta, - c, - rs_c, - cs_c, - data); -} - - -#if PPAPI_RELEASE >= 36 -void bli_cgemm_opt_4x4( - dim_t k, - scomplex alpha[restrict static 1], - scomplex a[restrict static 4*k], - scomplex b[restrict static k*4], - scomplex beta[restrict static 1], - scomplex c[restrict static 4*4], - inc_t rs_c, - inc_t cs_c, - auxinfo_t* data) -{ - // Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0) - v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero(); - v4sf abv0i = v4sf_zero(), abv1i = v4sf_zero(), abv2i = v4sf_zero(), abv3i = v4sf_zero(); - for (dim_t i = 0; i < k; i += 1) { - const v4sf avt = v4sf_cload(a); - const v4sf avb = v4sf_cload(a+2); - const v4sf avr = __builtin_shufflevector(avt, avb, 0, 2, 4, 6); - const v4sf avi = __builtin_shufflevector(avt, avb, 1, 3, 5, 7); - - const v4sf bv0r = v4sf_splat(b[0].real); - const v4sf bv0i = v4sf_splat(b[0].imag); - abv0r += avr * bv0r - avi * bv0i; - abv0i += avr * bv0i + avi * bv0r; - - const v4sf bv1r = v4sf_splat(b[1].real); - const v4sf bv1i = v4sf_splat(b[1].imag); - abv1r += avr * bv1r - avi * bv1i; - abv1i += avr * bv1i + avi * bv1r; - - const v4sf bv2r = v4sf_splat(b[2].real); - const v4sf bv2i = v4sf_splat(b[2].imag); - abv2r += avr * bv2r - avi * bv2i; - abv2i += avr * bv2i + avi * bv2r; - - const v4sf bv3r = v4sf_splat(b[3].real); - const v4sf bv3i = v4sf_splat(b[3].imag); - abv3r += avr * bv3r - avi * bv3i; - abv3i += avr * bv3i + avi * bv3r; - - a += 4; - b += 4; - } - - const v4sf alphavr = v4sf_splat(alpha->real); - const v4sf alphavi = v4sf_splat(alpha->imag); - v4sf temp; - - temp = abv0r * alphavr - abv0i * alphavi; - abv0i = abv0r * alphavi + abv0i * alphavr; - abv0r = temp; - - temp = abv1r * alphavr - abv1i * alphavi; - abv1i = abv1r * alphavi + abv1i * alphavr; - abv1r = temp; - - temp = abv2r * alphavr - abv2i * alphavi; - abv2i = abv2r * alphavi + abv2i * alphavr; - abv2r = temp; - - temp = abv3r * alphavr - abv3i * alphavi; - abv3i = abv3r * alphavi + abv3i * alphavr; - abv3r = temp; - - if (rs_c == 1) { - const v4sf cv0t = v4sf_cload(&c[0*rs_c + 0*cs_c]); - const v4sf cv1t = v4sf_cload(&c[0*rs_c + 1*cs_c]); - const v4sf cv2t = v4sf_cload(&c[0*rs_c + 2*cs_c]); - const v4sf cv3t = v4sf_cload(&c[0*rs_c + 3*cs_c]); - const v4sf cv0b = v4sf_cload(&c[2*rs_c + 0*cs_c]); - const v4sf cv1b = v4sf_cload(&c[2*rs_c + 1*cs_c]); - const v4sf cv2b = v4sf_cload(&c[2*rs_c + 2*cs_c]); - const v4sf cv3b = v4sf_cload(&c[2*rs_c + 3*cs_c]); - - v4sf cv0r = __builtin_shufflevector(cv0t, cv0b, 0, 2, 4, 6); - v4sf cv0i = __builtin_shufflevector(cv0t, cv0b, 1, 3, 5, 7); - v4sf cv1r = __builtin_shufflevector(cv1t, cv1b, 0, 2, 4, 6); - v4sf cv1i = __builtin_shufflevector(cv1t, cv1b, 1, 3, 5, 7); - v4sf cv2r = __builtin_shufflevector(cv2t, cv2b, 0, 2, 4, 6); - v4sf cv2i = __builtin_shufflevector(cv2t, cv2b, 1, 3, 5, 7); - v4sf cv3r = __builtin_shufflevector(cv3t, cv3b, 0, 2, 4, 6); - v4sf cv3i = __builtin_shufflevector(cv3t, cv3b, 1, 3, 5, 7); - - const v4sf betavr = v4sf_splat(beta->real); - const v4sf betavi = v4sf_splat(beta->imag); - - temp = abv0r + cv0r * betavr - cv0i * betavi; - cv0i = abv0i + cv0r * betavi + cv0i * betavr; - cv0r = temp; - - temp = abv1r + cv1r * betavr - cv1i * betavi; - cv1i = abv1i + cv1r * betavi + cv1i * betavr; - cv1r = temp; - - temp = abv2r + cv2r * betavr - cv2i * betavi; - cv2i = abv2i + cv2r * betavi + cv2i * betavr; - cv2r = temp; - - temp = abv3r + cv3r * betavr - cv3i * betavi; - cv3i = abv3i + cv3r * betavi + cv3i * betavr; - cv3r = temp; - - v4sf_cstore(&c[0*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 0, 4, 1, 5)); - v4sf_cstore(&c[2*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 2, 6, 3, 7)); - v4sf_cstore(&c[0*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 0, 4, 1, 5)); - v4sf_cstore(&c[2*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 2, 6, 3, 7)); - v4sf_cstore(&c[0*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 0, 4, 1, 5)); - v4sf_cstore(&c[2*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 2, 6, 3, 7)); - v4sf_cstore(&c[0*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 0, 4, 1, 5)); - v4sf_cstore(&c[2*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 2, 6, 3, 7)); - } else { - // Load columns 0, 1, 2, 3 (real part) - v4sf cv0r = (v4sf){ c[0*rs_c + 0*cs_c].real, c[1*rs_c + 0*cs_c].real, c[2*rs_c + 0*cs_c].real, c[3*rs_c + 0*cs_c].real }; - v4sf cv1r = (v4sf){ c[0*rs_c + 1*cs_c].real, c[1*rs_c + 1*cs_c].real, c[2*rs_c + 1*cs_c].real, c[3*rs_c + 1*cs_c].real }; - v4sf cv2r = (v4sf){ c[0*rs_c + 2*cs_c].real, c[1*rs_c + 2*cs_c].real, c[2*rs_c + 2*cs_c].real, c[3*rs_c + 2*cs_c].real }; - v4sf cv3r = (v4sf){ c[0*rs_c + 3*cs_c].real, c[1*rs_c + 3*cs_c].real, c[2*rs_c + 3*cs_c].real, c[3*rs_c + 3*cs_c].real }; - // Load columns 0, 1, 2, 3 (imaginary part) - v4sf cv0i = (v4sf){ c[0*rs_c + 0*cs_c].imag, c[1*rs_c + 0*cs_c].imag, c[2*rs_c + 0*cs_c].imag, c[3*rs_c + 0*cs_c].imag }; - v4sf cv1i = (v4sf){ c[0*rs_c + 1*cs_c].imag, c[1*rs_c + 1*cs_c].imag, c[2*rs_c + 1*cs_c].imag, c[3*rs_c + 1*cs_c].imag }; - v4sf cv2i = (v4sf){ c[0*rs_c + 2*cs_c].imag, c[1*rs_c + 2*cs_c].imag, c[2*rs_c + 2*cs_c].imag, c[3*rs_c + 2*cs_c].imag }; - v4sf cv3i = (v4sf){ c[0*rs_c + 3*cs_c].imag, c[1*rs_c + 3*cs_c].imag, c[2*rs_c + 3*cs_c].imag, c[3*rs_c + 3*cs_c].imag }; - - const v4sf betavr = v4sf_splat(beta->real); - const v4sf betavi = v4sf_splat(beta->imag); - - temp = abv0r + cv0r * betavr - cv0i * betavi; - cv0i = abv0i + cv0r * betavi + cv0i * betavr; - cv0r = temp; - - temp = abv1r + cv1r * betavr - cv1i * betavi; - cv1i = abv1i + cv1r * betavi + cv1i * betavr; - cv1r = temp; - - temp = abv2r + cv2r * betavr - cv2i * betavi; - cv2i = abv2i + cv2r * betavi + cv2i * betavr; - cv2r = temp; - - temp = abv3r + cv3r * betavr - cv3i * betavi; - cv3i = abv3i + cv3r * betavi + cv3i * betavr; - cv3r = temp; - - // Store column 0 - c[0*rs_c + 0*cs_c].real = cv0r[0]; - c[0*rs_c + 0*cs_c].imag = cv0i[0]; - c[1*rs_c + 0*cs_c].real = cv0r[1]; - c[1*rs_c + 0*cs_c].imag = cv0i[1]; - c[2*rs_c + 0*cs_c].real = cv0r[2]; - c[2*rs_c + 0*cs_c].imag = cv0i[2]; - c[3*rs_c + 0*cs_c].real = cv0r[3]; - c[3*rs_c + 0*cs_c].imag = cv0i[3]; - - // Store column 1 - c[0*rs_c + 1*cs_c].real = cv1r[0]; - c[0*rs_c + 1*cs_c].imag = cv1i[0]; - c[1*rs_c + 1*cs_c].real = cv1r[1]; - c[1*rs_c + 1*cs_c].imag = cv1i[1]; - c[2*rs_c + 1*cs_c].real = cv1r[2]; - c[2*rs_c + 1*cs_c].imag = cv1i[2]; - c[3*rs_c + 1*cs_c].real = cv1r[3]; - c[3*rs_c + 1*cs_c].imag = cv1i[3]; - - // Store column 2 - c[0*rs_c + 2*cs_c].real = cv2r[0]; - c[0*rs_c + 2*cs_c].imag = cv2i[0]; - c[1*rs_c + 2*cs_c].real = cv2r[1]; - c[1*rs_c + 2*cs_c].imag = cv2i[1]; - c[2*rs_c + 2*cs_c].real = cv2r[2]; - c[2*rs_c + 2*cs_c].imag = cv2i[2]; - c[3*rs_c + 2*cs_c].real = cv2r[3]; - c[3*rs_c + 2*cs_c].imag = cv2i[3]; - - // Store column 3 - c[0*rs_c + 3*cs_c].real = cv3r[0]; - c[0*rs_c + 3*cs_c].imag = cv3i[0]; - c[1*rs_c + 3*cs_c].real = cv3r[1]; - c[1*rs_c + 3*cs_c].imag = cv3i[1]; - c[2*rs_c + 3*cs_c].real = cv3r[2]; - c[2*rs_c + 3*cs_c].imag = cv3i[2]; - c[3*rs_c + 3*cs_c].real = cv3r[3]; - c[3*rs_c + 3*cs_c].imag = cv3i[3]; - } -} -#else -void bli_cgemm_opt_4x4( - dim_t k, - scomplex alpha[restrict static 1], - scomplex a[restrict static 4*k], - scomplex b[restrict static k*4], - scomplex beta[restrict static 1], - scomplex c[restrict static 4*4], - inc_t rs_c, - inc_t cs_c, - auxinfo_t* data) -{ - /* Just call the reference implementation. */ - BLIS_CGEMM_UKERNEL_REF( - k, - alpha, - a, - b, - beta, - c, - rs_c, - cs_c, - data); -} -#endif - -void bli_zgemm_opt_4x4( - dim_t k, - dcomplex alpha[restrict static 1], - dcomplex a[restrict static 4*k], - dcomplex b[restrict static k*4], - dcomplex beta[restrict static 1], - dcomplex c[restrict static 4*4], - inc_t rs_c, - inc_t cs_c, - auxinfo_t* data) -{ - /* Just call the reference implementation. */ - BLIS_ZGEMM_UKERNEL_REF( - k, - alpha, - a, - b, - beta, - c, - rs_c, - cs_c, - data); -} - From b693b0cddcfb41450e3c09a3ab97acb44c1ccdec Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Sun, 22 Jun 2014 13:44:25 -0700 Subject: [PATCH 11/19] [SC]AXPY kernels for PNaCl --- config/pnacl/bli_kernel.h | 4 + kernels/nacl/pnacl/1/bli_axpyv_opt.c | 203 +++++++++++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 kernels/nacl/pnacl/1/bli_axpyv_opt.c diff --git a/config/pnacl/bli_kernel.h b/config/pnacl/bli_kernel.h index d3ad1e721..439d9d715 100644 --- a/config/pnacl/bli_kernel.h +++ b/config/pnacl/bli_kernel.h @@ -218,6 +218,10 @@ // -- addv -- // -- axpyv -- +#if PPAPI_RELEASE >= 36 +#define BLIS_SAXPYV_KERNEL bli_saxpyv_opt +#define BLIS_CAXPYV_KERNEL bli_caxpyv_opt +#endif // -- copyv -- diff --git a/kernels/nacl/pnacl/1/bli_axpyv_opt.c b/kernels/nacl/pnacl/1/bli_axpyv_opt.c new file mode 100644 index 000000000..c26a34b98 --- /dev/null +++ b/kernels/nacl/pnacl/1/bli_axpyv_opt.c @@ -0,0 +1,203 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#if PPAPI_RELEASE >= 36 +typedef float v4sf __attribute__ ((vector_size(16))); + +inline v4sf v4sf_splat(float x) { + return (v4sf) { x, x, x, x }; +} + +inline v4sf v4sf_load(const float* a) { + return *((const v4sf*)a); +} + +inline v4sf v4sf_cload(const scomplex* a) { + return *((const v4sf*)a); +} + +inline void v4sf_store(float* a, v4sf x) { + *((v4sf*)a) = x; +} + +inline void v4sf_cstore(scomplex* a, v4sf x) { + *((v4sf*)a) = x; +} + +inline v4sf v4sf_zero() { + return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f }; +} +#endif + + +void bli_saxpyv_opt( + conj_t conjx, + dim_t n, + float alpha[restrict static 1], + float x[restrict static n], + inc_t incx, + float y[restrict static n], + inc_t incy) +{ + if (bli_zero_dim1(n)) { + return; + } + + if (bli_seq0(*alpha)) { + return; + } + +#if PPAPI_RELEASE >= 36 + if (!bli_has_nonunit_inc2(incx, incy)) { + const v4sf alphav = v4sf_splat(*alpha); + while (n >= 4) { + const v4sf xv = v4sf_load(x); + v4sf yv = v4sf_load(y); + yv += xv * alphav; + v4sf_store(y, yv); + + x += 4; + y += 4; + n -= 4; + } + const float alphac = *alpha; + while (n--) { + (*y++) += (*x++) * alphac; + } + } +#endif + /* Just call the reference implementation. */ + BLIS_SAXPYV_KERNEL_REF( + conjx, + n, + alpha, + x, + incx, + y, + incy); +} + + +void bli_caxpyv_opt( + conj_t conjx, + dim_t n, + scomplex alpha[restrict static 1], + scomplex x[restrict static n], + inc_t incx, + scomplex y[restrict static n], + inc_t incy) +{ + if (bli_zero_dim1(n)) { + return; + } + + if (bli_ceq0(*alpha)) { + return; + } + +#if PPAPI_RELEASE >= 36 + if (!bli_has_nonunit_inc2(incx, incy)) { + if (bli_is_noconj(conjx)) { + const v4sf alphav0 = v4sf_splat(alpha->real); + const v4sf alphav1 = (v4sf) { -alpha->imag, alpha->imag, -alpha->imag, alpha->imag }; + while (n >= 2) { + const v4sf xv0 = v4sf_cload(x); + v4sf yv = v4sf_cload(y); + const v4sf xv1 = __builtin_shufflevector(xv0, xv0, 1, 0, 3, 2); + yv += xv0 * alphav0 + xv1 * alphav1; + v4sf_cstore(y, yv); + + x += 2; + y += 2; + n -= 2; + } + const float alphar = alpha->real; + const float alphai = alpha->imag; + while (n--) { + const float xr = x->real; + const float xi = x->imag; + const float yr = y->real; + const float yi = y->imag; + + y->real = yr + xr * alphar - xi * alphai; + y->imag = yi + xr * alphai + xi * alphar; + + x += 1; + y += 1; + } + } else { + const v4sf alphav0 = (v4sf) { alpha->real, -alpha->real, alpha->real, -alpha->real }; + const v4sf alphav1 = v4sf_splat(alpha->imag); + while (n >= 2) { + const v4sf xv0 = v4sf_cload(x); + v4sf yv = v4sf_cload(y); + const v4sf xv1 = __builtin_shufflevector(xv0, xv0, 1, 0, 3, 2); + yv += xv0 * alphav0 + xv1 * alphav1; + v4sf_cstore(y, yv); + + x += 2; + y += 2; + n -= 2; + } + const float alphar = alpha->real; + const float alphai = alpha->imag; + while (n--) { + const float xr = x->real; + const float xi = x->imag; + const float yr = y->real; + const float yi = y->imag; + + y->real = yr + xr * alphar + xi * alphai; + y->imag = yi + xr * alphai - xi * alphar; + + x += 1; + y += 1; + } + } + } +#endif + + + /* Just call the reference implementation. */ + BLIS_CAXPYV_KERNEL_REF( + conjx, + n, + alpha, + x, + incx, + y, + incy); +} From 2cb9a5501a3cbeb6692cf68e896087ba73b6af69 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 23 Jun 2014 10:42:29 -0500 Subject: [PATCH 12/19] Removed 'version' from .gitignore file. --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8accdfa50..69e5f00be 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,6 @@ # -- build system files -- config.mk -version # -- makefile fragments -- From ebb33965981dcb2b0bdee5fc7fdf6c959420f311 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 23 Jun 2014 11:22:50 -0500 Subject: [PATCH 13/19] Added 'version' file. --- version | 1 + 1 file changed, 1 insertion(+) create mode 100644 version diff --git a/version b/version new file mode 100644 index 000000000..b1e80bb24 --- /dev/null +++ b/version @@ -0,0 +1 @@ +0.1.3 From 09d9a3bf6763932d9f571085b2cfd1b8631eccba Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 23 Jun 2014 13:43:26 -0500 Subject: [PATCH 14/19] Reverting version file to test new version script. Details: - Changed version file contents to 0.1.2 so that I can test out a new version file bumping script. --- version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version b/version index b1e80bb24..d917d3e26 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.1.3 +0.1.2 From 036cc634918463b1caa0fd89c9a211f2f5639af7 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 23 Jun 2014 13:48:17 -0500 Subject: [PATCH 15/19] Version file update (0.1.3) --- version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version b/version index d917d3e26..b1e80bb24 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.1.2 +0.1.3 From 9ef1f1e21d083697fc730e48d7d9169c201f3da2 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 23 Jun 2014 13:48:17 -0500 Subject: [PATCH 16/19] CHANGELOG update (0.1.3) --- CHANGELOG | 135 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 134 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index eb33b31ef..9c18f47ae 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,137 @@ -commit 00f232f8ed1f7c41619b12ebf779ebe2c3b2d3cd (HEAD, tag: 0.1.2, origin/master, master) +commit 036cc634918463b1caa0fd89c9a211f2f5639af7 (HEAD, tag: 0.1.3, master) +Author: Field G. Van Zee +Date: Mon Jun 23 13:48:17 2014 -0500 + + Version file update (0.1.3) + +commit 09d9a3bf6763932d9f571085b2cfd1b8631eccba +Author: Field G. Van Zee +Date: Mon Jun 23 13:43:26 2014 -0500 + + Reverting version file to test new version script. + + Details: + - Changed version file contents to 0.1.2 so that I can test out a new + version file bumping script. + +commit ebb33965981dcb2b0bdee5fc7fdf6c959420f311 +Author: Field G. Van Zee +Date: Mon Jun 23 11:22:50 2014 -0500 + + Added 'version' file. + +commit 2cb9a5501a3cbeb6692cf68e896087ba73b6af69 +Author: Field G. Van Zee +Date: Mon Jun 23 10:42:29 2014 -0500 + + Removed 'version' from .gitignore file. + +commit b40dcefc5ee31f67aa3990e2e9d2ef8ed1386a25 (origin/master) +Merge: 7101a8e b693b0c +Author: Field G. Van Zee +Date: Mon Jun 23 10:39:05 2014 -0500 + + Merge pull request #11 from Maratyszcza/stable + + [sc]axpy kernels for PNaCl + +commit b693b0cddcfb41450e3c09a3ab97acb44c1ccdec +Author: Marat Dukhan +Date: Sun Jun 22 13:44:25 2014 -0700 + + [SC]AXPY kernels for PNaCl + +commit 7101a8eec0327d6c3a7eb36eb4b0fd45c1c6d162 +Merge: ad48dca 020a831 +Author: Field G. Van Zee +Date: Thu Jun 19 21:46:50 2014 -0500 + + Merge pull request #10 from Maratyszcza/stable + + Portable Native Client port + +commit 020a831bc5f61744cb8354886aa679b99b1285f6 +Author: Marat Dukhan +Date: Thu Jun 19 00:58:26 2014 -0700 + + Code clean-up in PNaCl port + +commit 491be4f91ed725522f5cc7184053857c6c376ada +Author: Marat Dukhan +Date: Thu Jun 19 00:45:44 2014 -0700 + + Optimized dot product kernels for PNaCl + +commit 4b8e71aab80182873a2e138eb07902b8d8fd5480 +Author: Marat Dukhan +Date: Thu Jun 19 00:43:25 2014 -0700 + + Use AR rcs flags for PNaCl target to avoid warning + +commit 031deb2a5c718d569bde842590a791b812f4cf1d +Author: Marat Dukhan +Date: Wed Jun 18 03:11:34 2014 -0700 + + PNaCl configuration: use pnacl-ar instead or ar (fixes build issue on Mac) + +commit 68a02976e3c3638f0a9821342e269a1743e3ace3 +Author: Marat Dukhan +Date: Wed Jun 18 03:10:25 2014 -0700 + + Compile pnacl configuration in GNU11 mode to avoid warning about non-standard features + +commit 6f8462eb0ec278b89731e73ef583386a3371d095 +Author: Marat Dukhan +Date: Wed Jun 18 03:08:46 2014 -0700 + + Fix inconsistent VERBOSE macro in Makefile + +commit b2ffb4de8b6872cb23537ad282e557d11dcd9c8b +Author: Marat Dukhan +Date: Sun Jun 15 18:41:30 2014 -0400 + + Reformatted PNaCl GEMM kernels + +commit 6de2d472d98baa215264a776f3d5291780a6a085 +Author: Marat Dukhan +Date: Sun Jun 15 08:44:31 2014 -0400 + + CGEMM and ZGEMM kernels for PNaCl + +commit f064711a5e6fb3852c17c7520909b09dc27665f2 +Author: Marat Dukhan +Date: Sun Jun 15 06:27:37 2014 -0400 + + SGEMM and DGEMM kernels for PNaCl + +commit ad48dca22913a363899f0bef45553898718eebb1 +Merge: ee2b679 7118f87 +Author: Field G. Van Zee +Date: Sat Jun 14 15:10:13 2014 -0500 + + Merge pull request #9 from tkelman/memalign_windows + + Use _aligned_malloc instead of posix_memalign on Windows + +commit 7118f87e18b4941423472afc00215c1d1f2a1fcd +Author: Tony Kelman +Date: Sat Jun 14 06:53:20 2014 -0700 + + Use _aligned_malloc instead of posix_memalign on Windows + +commit ee2b679281ca45fb40b2198e293bc3bc3d446632 +Author: Tyler Smith +Date: Fri Jun 6 12:41:55 2014 -0500 + + Only include omp.h if BLIS_ENABLE_OPENMP is set + +commit 19c05dfaac43c627f86e897c8c00f1f9440754aa +Author: Field G. Van Zee +Date: Thu Jun 5 10:54:16 2014 -0500 + + CHANGELOG update (for 0.1.2). + +commit 00f232f8ed1f7c41619b12ebf779ebe2c3b2d3cd (tag: 0.1.2) Author: Tyler Smith Date: Mon Jun 2 13:40:57 2014 -0500 From b80df0f2cffb015da02e70a82b8512da9891ab67 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 23 Jun 2014 13:52:39 -0500 Subject: [PATCH 17/19] Added bump-version.sh script to 'build' directory. Details: - Added a bash script, bump-version.sh, to aid in incrementing the BLIS version string. --- build/bump-version.sh | 201 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100755 build/bump-version.sh diff --git a/build/bump-version.sh b/build/bump-version.sh new file mode 100755 index 000000000..4b0fece2f --- /dev/null +++ b/build/bump-version.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# +# bump-version.sh +# +# Field G. Van Zee +# + + +print_usage() +{ + #local script_name + + # Get the script name + #script_name=${0##*/} + + # Echo usage info + echo " " + echo " "$script_name + echo " " + echo " Field G. Van Zee" + echo " " + echo " Performs a series of actions needed when incrementing (bumping) the" + echo " BLIS version number." + echo " " + echo " Usage:" + echo " ${script_name} [options] new_vers" + echo " " + echo " Arguments:" + echo " " + echo " new_vers The new version string." + echo " " + echo " Options:" + echo " " + echo " -d dry-run" + echo " Go through all the motions, but don't actually make any" + echo " changes to files or perform any git commits." + echo " -f VERSFILE version file name" + echo " Update VERSFILE with new version string instead of default" + echo " 'version' file." + + # Exit with non-zero exit status + exit 1 +} + + +main() +{ + # -- BEGIN GLOBAL VARIABLE DECLARATIONS -- + + # The name of the script, stripped of any preceeding path. + script_name=${0##*/} + + # The name of the CHANGELOG file. + changelog_file='CHANGELOG' + + # The name of the default version file. + version_file_def='version' + + # The name of the specified version file. + version_file='' + + # Strings used during version query. + git_commit_str='' + new_version_str='' + + # The script name to use instead of the $0 when outputting messages. + output_name='' + + # The git directory. + gitdir='.git' + + # Whether we are performing a dry run or not. + dry_run_flag="" + + # -- END GLOBAL VARIABLE DECLARATIONS -- + + + # Process our command line options. + while getopts ":dhf:" opt; do + case $opt in + d ) dry_run_flag="1" ;; + f ) version_file=$OPTARG ;; + h ) print_usage ;; + \? ) print_usage + esac + done + shift $(($OPTIND - 1)) + + + # If a version file name was not given, set version_file to the default + # value. + if [ -n "${version_file}" ]; then + + echo "${script_name}: version file specified: '${version_file}'." + else + + echo "${script_name}: no version file specified; defaulting to '${version_file_def}'." + version_file="${version_file_def}" + fi + + + # Check the number of arguments after command line option processing. + if [ $# = "1" ]; then + + new_version_str=$1 + echo "${script_name}: preparing to bump to version '${new_version_str}'." + + else + print_usage + fi + + + # Check if the .git dir exists; if it does not, we do nothing. + if [ -d "${gitdir}" ]; then + + echo "${script_name}: found '${gitdir}' directory; assuming git clone." + + #echo "${script_name}: executing: git describe --always." + git_commit_str=$(git describe --always) + echo "${script_name}: starting commit: ${git_commit_str}." + + echo "${script_name}: updating version file '${version_file}'." + if [ -z "$dry_run_flag" ]; then + echo "${new_version_str}" > ${version_file} + fi + + echo "${script_name}: executing: git commit -m \"Version file update (${new_version_str})\" ${version_file}." + if [ -z "$dry_run_flag" ]; then + git commit -m "Version file update (${new_version_str})" ${version_file} + fi + + #echo "${script_name}: executing: git describe --always." + git_commit_str=$(git describe --always) + echo "${script_name}: commit to be tagged: ${git_commit_str}." + + echo "${script_name}: executing: git tag ${new_version_str} ${git_commit_str}." + if [ -z "$dry_run_flag" ]; then + git tag ${new_version_str} ${git_commit_str} + fi + + echo "${script_name}: updating ${changelog_file}." + if [ -z "$dry_run_flag" ]; then + make changelog + fi + + echo "${script_name}: executing: git commit -m \"CHANGELOG update (${new_version_str})\" ${changelog_file}." + if [ -z "$dry_run_flag" ]; then + git commit -m "CHANGELOG update (${new_version_str})" ${changelog_file} + fi + + #echo "${script_name}: executing: git describe --always." + git_commit_str=$(git describe --always) + echo "${script_name}: latest commit: ${git_commit_str}." + + else + + echo "${script_name}: could not find '${gitdir}' directory; bailing out." + + fi + + + # Exit peacefully. + return 0 +} + + +# The script's main entry point, passing all parameters given. +main "$@" From e8ef69692831db07ddbe9485a5e504ac3f03e496 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 2 Jul 2014 14:59:27 -0500 Subject: [PATCH 18/19] Added shared library support to build system. Details: - Modified top-level Makefile to support building shared (dynamic) libraries. - Updated most configurations' make_defs.mk files to include necessary compiler/linker flags needed by top-level Makefile. - Note that by default, all configurations presently do NOT build shared libraries. To enable, one must change the value of BLIS_ENABLE_DYNAMIC_BUILD to 'yes'. --- Makefile | 72 ++++++++++++++++++++++++++++----- config/armv7a/make_defs.mk | 8 ++-- config/bgq/make_defs.mk | 7 ++-- config/cortex-a15/make_defs.mk | 8 ++-- config/cortex-a9/make_defs.mk | 8 ++-- config/dunnington/make_defs.mk | 8 ++-- config/loongson3a/make_defs.mk | 8 ++-- config/mic/make_defs.mk | 8 ++-- config/piledriver/make_defs.mk | 8 ++-- config/pnacl/make_defs.mk | 8 ++-- config/power7/make_defs.mk | 11 ++--- config/reference/make_defs.mk | 10 +++-- config/sandybridge/make_defs.mk | 8 ++-- config/template/make_defs.mk | 8 ++-- 14 files changed, 129 insertions(+), 51 deletions(-) diff --git a/Makefile b/Makefile index ca1d50fd5..8c49a2d6c 100644 --- a/Makefile +++ b/Makefile @@ -174,7 +174,7 @@ VERS_CONF := $(VERSION)-$(CONFIG_NAME) # Note: These names will be modified later to include the configuration and # version strings. BLIS_LIB_NAME := $(BLIS_LIB_BASE_NAME).a -#BLIS_DLL_NAME := $(BLIS_LIB_BASE_NAME).so +BLIS_DLL_NAME := $(BLIS_LIB_BASE_NAME).so # --- BLIS framework source and object variable names --- @@ -196,8 +196,9 @@ MK_CONFIG_OBJS := MK_CONFIG_NOOPT_OBJS := MK_CONFIG_KERNELS_OBJS := -# Append the base library path to the library name. +# Append the base library path to the library names. MK_ALL_BLIS_LIB := $(BASE_LIB_PATH)/$(BLIS_LIB_NAME) +MK_ALL_BLIS_DLL := $(BASE_LIB_PATH)/$(BLIS_DLL_NAME) # --- Define install target names for static libraries --- @@ -209,6 +210,16 @@ MK_BLIS_LIB_INST_W_VERS_CONF := $(patsubst $(BASE_LIB_PATH)/%.a, \ $(INSTALL_PREFIX)/lib/%-$(VERS_CONF).a, \ $(MK_BLIS_LIB)) +# --- Define install target names for shared libraries --- + +MK_BLIS_DLL := $(MK_ALL_BLIS_DLL) +MK_BLIS_DLL_INST := $(patsubst $(BASE_LIB_PATH)/%.so, \ + $(INSTALL_PREFIX)/lib/%.so, \ + $(MK_BLIS_DLL)) +MK_BLIS_DLL_INST_W_VERS_CONF := $(patsubst $(BASE_LIB_PATH)/%.so, \ + $(INSTALL_PREFIX)/lib/%-$(VERS_CONF).so, \ + $(MK_BLIS_DLL)) + # --- Determine which libraries to build --- MK_LIBS := @@ -221,6 +232,12 @@ MK_LIBS_INST += $(MK_BLIS_LIB_INST) MK_LIBS_INST_W_VERS_CONF += $(MK_BLIS_LIB_INST_W_VERS_CONF) endif +ifeq ($(BLIS_ENABLE_DYNAMIC_BUILD),yes) +MK_LIBS += $(MK_BLIS_DLL) +MK_LIBS_INST += $(MK_BLIS_DLL_INST) +MK_LIBS_INST_W_VERS_CONF += $(MK_BLIS_DLL_INST_W_VERS_CONF) +endif + # Strip leading, internal, and trailing whitespace. MK_LIBS_INST := $(strip $(MK_LIBS_INST)) MK_LIBS_INST_W_VERS_CONF := $(strip $(MK_LIBS_INST_W_VERS_CONF)) @@ -385,12 +402,12 @@ TESTSUITE_BIN := $(TESTSUITE_NAME).x # --- Uninstall definitions ---------------------------------------------------- # -# This shell command grabs all files named "libblis-*.a" in the installation -# directory and then filters out the name of the library archive for the -# current version/configuration. We consider this remaining set of libraries -# to be "old" and eligible for removal upon running of the uninstall-old -# target. -UNINSTALL_LIBS := $(shell $(FIND) $(INSTALL_PREFIX)/lib/ -name "$(BLIS_LIB_BASE_NAME)-*.a" 2> /dev/null | $(GREP) -v "$(BLIS_LIB_BASE_NAME)-$(VERS_CONF).a" | $(GREP) -v $(BLIS_LIB_NAME)) +# This shell command grabs all files named "libblis-*.a" or "libblis-*.so" in +# the installation directory and then filters out the name of the library +# archive for the current version/configuration. We consider this remaining set +# of libraries to be "old" and eligible for removal upon running of the +# uninstall-old target. +UNINSTALL_LIBS := $(shell $(FIND) $(INSTALL_PREFIX)/lib/ -name "$(BLIS_LIB_BASE_NAME)-*.[a|so]" 2> /dev/null | $(GREP) -v "$(BLIS_LIB_BASE_NAME)-$(VERS_CONF).[a|so]" | $(GREP) -v $(BLIS_LIB_NAME)) @@ -464,10 +481,13 @@ ifeq ($(MAKE_DEFS_MK_PRESENT),no) endif -# --- Static library archiver rules --- +# --- All-purpose library rule (static and shared) --- blis-lib: check-env $(MK_LIBS) + +# --- Static library archiver rules --- + $(MK_ALL_BLIS_LIB): $(MK_ALL_BLIS_OBJS) ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes) $(AR) $(ARFLAGS) $@ $? @@ -479,6 +499,17 @@ else endif +# --- Dynamic library linker rules --- + +$(MK_ALL_BLIS_DLL): $(MK_ALL_BLIS_OBJS) +ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes) + $(LINKER) $(SOFLAGS) $(LDFLAGS) -o $@ $? +else + @echo "Dynamically linking $@" + @$(LINKER) $(SOFLAGS) $(LDFLAGS) -o $@ $? +endif + + # --- Test suite rules --- testsuite: testsuite-run @@ -540,6 +571,16 @@ else @$(INSTALL) -m 0644 $< $@ endif +$(INSTALL_PREFIX)/lib/%-$(VERS_CONF).so: $(BASE_LIB_PATH)/%.so $(CONFIG_MK_FILE) +ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes) + $(INSTALL) -m 0755 -d $(@D) + $(INSTALL) -m 0644 $< $@ +else + @echo "Installing $(@F) into $(INSTALL_PREFIX)/lib/" + @$(INSTALL) -m 0755 -d $(@D) + @$(INSTALL) -m 0644 $< $@ +endif + # --- Install-symlinks rules --- @@ -555,6 +596,16 @@ else @$(MV) $(@F) $(INSTALL_PREFIX)/lib/ endif +$(INSTALL_PREFIX)/lib/%.so: $(INSTALL_PREFIX)/lib/%-$(VERS_CONF).so +ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes) + $(SYMLINK) $( Date: Thu, 3 Jul 2014 11:48:23 -0500 Subject: [PATCH 19/19] Defined _ukernel_void() wrappers to micro-kernels. Details: - Added wrappers for micro-kernels so that users may invoke the micro-kernels without knowing what the function names actually are. This is useful when an application wishes to call the micro-kernel from a shared library instance of BLIS, where the application may not necessarily have the luxury of grabbing the micro-kernel name(s) from C preprocessor macros at compile-time. Also, since the wrappers use void* pointers, one's environment does not need to be aware of some BLIS types such as scomplex and dcomplex. These wrappers now join the level-1 and level-1f kernel wrappers, which pre-dated this commit. - Removed the wrapper definitions and prototypes from the micro-kernel test suite modules, and replaced calls to them with calls to the new wrappers mentioned above. --- frame/3/gemm/bli_gemm.h | 2 + frame/3/gemm/bli_gemm_ukernel.c | 126 +++++++++++++++++++++++++ frame/3/gemm/bli_gemm_ukernel.h | 60 ++++++++++++ frame/3/trsm/bli_gemmtrsm_ukernel.c | 141 ++++++++++++++++++++++++++++ frame/3/trsm/bli_gemmtrsm_ukernel.h | 63 +++++++++++++ frame/3/trsm/bli_trsm.h | 3 + frame/3/trsm/bli_trsm_ukernel.c | 111 ++++++++++++++++++++++ frame/3/trsm/bli_trsm_ukernel.h | 56 +++++++++++ testsuite/src/test_gemm_ukr.c | 98 +------------------ testsuite/src/test_gemm_ukr.h | 25 ----- testsuite/src/test_gemmtrsm_ukr.c | 114 +--------------------- testsuite/src/test_gemmtrsm_ukr.h | 27 ------ testsuite/src/test_trsm_ukr.c | 83 +--------------- testsuite/src/test_trsm_ukr.h | 20 ---- 14 files changed, 565 insertions(+), 364 deletions(-) create mode 100644 frame/3/gemm/bli_gemm_ukernel.c create mode 100644 frame/3/gemm/bli_gemm_ukernel.h create mode 100644 frame/3/trsm/bli_gemmtrsm_ukernel.c create mode 100644 frame/3/trsm/bli_gemmtrsm_ukernel.h create mode 100644 frame/3/trsm/bli_trsm_ukernel.c create mode 100644 frame/3/trsm/bli_trsm_ukernel.h diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h index c47985a3e..704402f58 100644 --- a/frame/3/gemm/bli_gemm.h +++ b/frame/3/gemm/bli_gemm.h @@ -38,6 +38,8 @@ #include "bli_gemm_int.h" #include "bli_gemm_target.h" +#include "bli_gemm_ukernel.h" + #include "bli_gemm_blk_var1f.h" #include "bli_gemm_blk_var2f.h" #include "bli_gemm_blk_var3f.h" diff --git a/frame/3/gemm/bli_gemm_ukernel.c b/frame/3/gemm/bli_gemm_ukernel.c new file mode 100644 index 000000000..5418d40a4 --- /dev/null +++ b/frame/3/gemm/bli_gemm_ukernel.c @@ -0,0 +1,126 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_ukr_fp + +typedef void (*FUNCPTR_T)( + dim_t k, + void* alpha, + void* a, + void* b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ); + +static FUNCPTR_T GENARRAY(ftypes,gemm_ukernel_void); + + +void bli_gemm_ukernel( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + num_t dt = bli_obj_datatype( *c ); + + dim_t k = bli_obj_width( *a ); + + void* buf_a = bli_obj_buffer_at_off( *a ); + + void* buf_b = bli_obj_buffer_at_off( *b ); + + void* buf_c = bli_obj_buffer_at_off( *c ); + inc_t rs_c = bli_obj_row_stride( *c ); + inc_t cs_c = bli_obj_col_stride( *c ); + + void* buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha ); + + void* buf_beta = bli_obj_buffer_for_1x1( dt, *beta ); + + inc_t ps_a = bli_obj_panel_stride( *a ); + inc_t ps_b = bli_obj_panel_stride( *b ); + + FUNCPTR_T f; + + auxinfo_t data; + + + // Fill the auxinfo_t struct in case the micro-kernel uses it. + bli_auxinfo_set_next_a( buf_a, data ); + bli_auxinfo_set_next_b( buf_b, data ); + bli_auxinfo_set_ps_a( ps_a, data ); + bli_auxinfo_set_ps_b( ps_b, data ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt]; + + // Invoke the function. + f( k, + buf_alpha, + buf_a, + buf_b, + buf_beta, + buf_c, rs_c, cs_c, + &data ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname, ukrname ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + void* alpha, \ + void* a, \ + void* b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + PASTEMAC(ch,ukrname)( k, \ + alpha, \ + a, \ + b, \ + beta, \ + c, rs_c, cs_c, \ + data ); \ +} + +INSERT_GENTFUNC_BASIC( gemm_ukernel_void, GEMM_UKERNEL ) + diff --git a/frame/3/gemm/bli_gemm_ukernel.h b/frame/3/gemm/bli_gemm_ukernel.h new file mode 100644 index 000000000..a4c4e0576 --- /dev/null +++ b/frame/3/gemm/bli_gemm_ukernel.h @@ -0,0 +1,60 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_gemm_ukernel( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype the void pointer kernel wrappers. +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + void* alpha, \ + void* a, \ + void* b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROT_BASIC( gemm_ukernel_void ) + diff --git a/frame/3/trsm/bli_gemmtrsm_ukernel.c b/frame/3/trsm/bli_gemmtrsm_ukernel.c new file mode 100644 index 000000000..65ef744d3 --- /dev/null +++ b/frame/3/trsm/bli_gemmtrsm_ukernel.c @@ -0,0 +1,141 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef FUNCPTR_T +#define FUNCPTR_T gemmtrsm_ukr_fp + +typedef void (*FUNCPTR_T)( + dim_t k, + void* alpha, + void* a1x, + void* a11, + void* bx1, + void* b11, + void* c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ); + +static FUNCPTR_T GENARRAY(ftypes_l,gemmtrsm_l_ukernel_void); +static FUNCPTR_T GENARRAY(ftypes_u,gemmtrsm_u_ukernel_void); + + +void bli_gemmtrsm_ukernel( obj_t* alpha, + obj_t* a1x, + obj_t* a11, + obj_t* bx1, + obj_t* b11, + obj_t* c11 ) +{ + dim_t k = bli_obj_width( *a1x ); + + num_t dt = bli_obj_datatype( *c11 ); + + void* buf_a1x = bli_obj_buffer_at_off( *a1x ); + + void* buf_a11 = bli_obj_buffer_at_off( *a11 ); + + void* buf_bx1 = bli_obj_buffer_at_off( *bx1 ); + + void* buf_b11 = bli_obj_buffer_at_off( *b11 ); + + void* buf_c11 = bli_obj_buffer_at_off( *c11 ); + inc_t rs_c = bli_obj_row_stride( *c11 ); + inc_t cs_c = bli_obj_col_stride( *c11 ); + + void* buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha ); + + inc_t ps_a = bli_obj_panel_stride( *a1x ); + inc_t ps_b = bli_obj_panel_stride( *bx1 ); + + FUNCPTR_T f; + + auxinfo_t data; + + + // Fill the auxinfo_t struct in case the micro-kernel uses it. + if ( bli_obj_is_lower( *a11 ) ) + { bli_auxinfo_set_next_a( buf_a1x, data ); } + else + { bli_auxinfo_set_next_a( buf_a11, data ); } + bli_auxinfo_set_next_b( buf_bx1, data ); + + bli_auxinfo_set_ps_a( ps_a, data ); + bli_auxinfo_set_ps_b( ps_b, data ); + + // Index into the type combination array to extract the correct + // function pointer. + if ( bli_obj_is_lower( *a11 ) ) f = ftypes_l[dt]; + else f = ftypes_u[dt]; + + // Invoke the function. + f( k, + buf_alpha, + buf_a1x, + buf_a11, + buf_bx1, + buf_b11, + buf_c11, rs_c, cs_c, + &data ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname, ukrname ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + void* alpha, \ + void* a1x, \ + void* a11, \ + void* bx1, \ + void* b11, \ + void* c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + PASTEMAC(ch,ukrname)( k, \ + alpha, \ + a1x, \ + a11, \ + bx1, \ + b11, \ + c11, rs_c, cs_c, \ + data ); \ +} + +INSERT_GENTFUNC_BASIC( gemmtrsm_l_ukernel_void, GEMMTRSM_L_UKERNEL ) +INSERT_GENTFUNC_BASIC( gemmtrsm_u_ukernel_void, GEMMTRSM_U_UKERNEL ) + diff --git a/frame/3/trsm/bli_gemmtrsm_ukernel.h b/frame/3/trsm/bli_gemmtrsm_ukernel.h new file mode 100644 index 000000000..2cfb47bbd --- /dev/null +++ b/frame/3/trsm/bli_gemmtrsm_ukernel.h @@ -0,0 +1,63 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_gemmtrsm_ukernel( obj_t* alpha, + obj_t* a1x, + obj_t* a11, + obj_t* bx1, + obj_t* b11, + obj_t* c11 ); + + +// +// Prototype the void pointer kernel wrappers. +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + void* alpha, \ + void* a1x, \ + void* a11, \ + void* bx1, \ + void* b11, \ + void* c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROT_BASIC( gemmtrsm_l_ukernel_void ) +INSERT_GENTPROT_BASIC( gemmtrsm_u_ukernel_void ) + diff --git a/frame/3/trsm/bli_trsm.h b/frame/3/trsm/bli_trsm.h index 5fa21cccc..d358dd66c 100644 --- a/frame/3/trsm/bli_trsm.h +++ b/frame/3/trsm/bli_trsm.h @@ -37,6 +37,9 @@ #include "bli_trsm_front.h" #include "bli_trsm_int.h" +#include "bli_gemmtrsm_ukernel.h" +#include "bli_trsm_ukernel.h" + #include "bli_trsm_blk_var1f.h" #include "bli_trsm_blk_var1b.h" diff --git a/frame/3/trsm/bli_trsm_ukernel.c b/frame/3/trsm/bli_trsm_ukernel.c new file mode 100644 index 000000000..fb19e0abb --- /dev/null +++ b/frame/3/trsm/bli_trsm_ukernel.c @@ -0,0 +1,111 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef FUNCPTR_T +#define FUNCPTR_T trsm_ukr_fp + +typedef void (*FUNCPTR_T)( + void* a, + void* b, + void* c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ); + +static FUNCPTR_T GENARRAY(ftypes_l,trsm_l_ukernel_void); +static FUNCPTR_T GENARRAY(ftypes_u,trsm_u_ukernel_void); + + +void bli_trsm_ukernel( obj_t* a, + obj_t* b, + obj_t* c ) +{ + num_t dt = bli_obj_datatype( *c ); + + void* buf_a = bli_obj_buffer_at_off( *a ); + + void* buf_b = bli_obj_buffer_at_off( *b ); + + void* buf_c = bli_obj_buffer_at_off( *c ); + inc_t rs_c = bli_obj_row_stride( *c ); + inc_t cs_c = bli_obj_col_stride( *c ); + + inc_t ps_a = bli_obj_panel_stride( *a ); + inc_t ps_b = bli_obj_panel_stride( *b ); + + FUNCPTR_T f; + + auxinfo_t data; + + + // Fill the auxinfo_t struct in case the micro-kernel uses it. + bli_auxinfo_set_next_a( buf_a, data ); + bli_auxinfo_set_next_b( buf_b, data ); + + bli_auxinfo_set_ps_a( ps_a, data ); + bli_auxinfo_set_ps_b( ps_b, data ); + + // Index into the type combination array to extract the correct + // function pointer. + if ( bli_obj_is_lower( *a ) ) f = ftypes_l[dt]; + else f = ftypes_u[dt]; + + // Invoke the function. + f( buf_a, + buf_b, + buf_c, rs_c, cs_c, + &data ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname, ukrname ) \ +\ +void PASTEMAC(ch,varname)( \ + void* a, \ + void* b, \ + void* c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + PASTEMAC(ch,ukrname)( a, \ + b, \ + c, rs_c, cs_c, \ + data ); \ +} + +INSERT_GENTFUNC_BASIC( trsm_l_ukernel_void, TRSM_L_UKERNEL ) +INSERT_GENTFUNC_BASIC( trsm_u_ukernel_void, TRSM_U_UKERNEL ) + diff --git a/frame/3/trsm/bli_trsm_ukernel.h b/frame/3/trsm/bli_trsm_ukernel.h new file mode 100644 index 000000000..eaf18d1d4 --- /dev/null +++ b/frame/3/trsm/bli_trsm_ukernel.h @@ -0,0 +1,56 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_trsm_ukernel( obj_t* a, + obj_t* b, + obj_t* c ); + + +// +// Prototype the void pointer kernel wrappers. +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + void* a, \ + void* b, \ + void* c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROT_BASIC( trsm_l_ukernel_void ) +INSERT_GENTPROT_BASIC( trsm_u_ukernel_void ) + diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 3ce1b688f..c7464ce21 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -270,7 +270,7 @@ void libblis_test_gemm_ukr_impl( iface_t iface, switch ( iface ) { case BLIS_TEST_SEQ_UKERNEL: - bli_gemm_ukr( alpha, a, b, beta, c ); + bli_gemm_ukernel( alpha, a, b, beta, c ); break; default: @@ -354,99 +354,3 @@ void libblis_test_gemm_ukr_check( obj_t* alpha, bli_obj_free( &z ); } - - -// -// Define object-wrapper to GEMM_UKERNEL micro-kernels. -// - -#define FUNCPTR_T gemm_ukr_fp - -typedef void (*FUNCPTR_T)( - dim_t k, - void* alpha, - void* a, - void* b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ); - -static FUNCPTR_T GENARRAY(ftypes,gemm_ukr); - - -void bli_gemm_ukr( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c ) -{ - num_t dt = bli_obj_datatype( *c ); - - dim_t k = bli_obj_width( *a ); - - void* buf_a = bli_obj_buffer_at_off( *a ); - - void* buf_b = bli_obj_buffer_at_off( *b ); - - void* buf_c = bli_obj_buffer_at_off( *c ); - inc_t rs_c = bli_obj_row_stride( *c ); - inc_t cs_c = bli_obj_col_stride( *c ); - - void* buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha ); - - void* buf_beta = bli_obj_buffer_for_1x1( dt, *beta ); - - inc_t ps_a = bli_obj_panel_stride( *a ); - inc_t ps_b = bli_obj_panel_stride( *b ); - - FUNCPTR_T f; - - auxinfo_t data; - - - // Fill the auxinfo_t struct in case the micro-kernel uses it. - bli_auxinfo_set_next_a( buf_a, data ); - bli_auxinfo_set_next_b( buf_b, data ); - bli_auxinfo_set_ps_a( ps_a, data ); - bli_auxinfo_set_ps_b( ps_b, data ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt]; - - // Invoke the function. - f( k, - buf_alpha, - buf_a, - buf_b, - buf_beta, - buf_c, rs_c, cs_c, - &data ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, ukrname ) \ -\ -void PASTEMAC(ch,varname)( \ - dim_t k, \ - void* alpha, \ - void* a, \ - void* b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ) \ -{ \ - PASTEMAC(ch,ukrname)( k, \ - alpha, \ - a, \ - b, \ - beta, \ - c, rs_c, cs_c, \ - data ); \ -} - -INSERT_GENTFUNC_BASIC( gemm_ukr, GEMM_UKERNEL ) - diff --git a/testsuite/src/test_gemm_ukr.h b/testsuite/src/test_gemm_ukr.h index 002e033b6..72a563412 100644 --- a/testsuite/src/test_gemm_ukr.h +++ b/testsuite/src/test_gemm_ukr.h @@ -34,28 +34,3 @@ void libblis_test_gemm_ukr( test_params_t* params, test_op_t* op ); - -// -// Prototype wrapper interfaces to micro-kernel. -// -void bli_gemm_ukr( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - dim_t k, \ - void* alpha, \ - void* a, \ - void* b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemm_ukr ) - diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index e2701d313..ff86640ae 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -314,7 +314,7 @@ void libblis_test_gemmtrsm_ukr_impl( iface_t iface, switch ( iface ) { case BLIS_TEST_SEQ_UKERNEL: - bli_gemmtrsm_ukr( alpha, a1x, a11, bx1, b11, c11 ); + bli_gemmtrsm_ukernel( alpha, a1x, a11, bx1, b11, c11 ); break; default: @@ -474,115 +474,3 @@ void bli_gemmtrsm_ukr_make_subparts( dim_t k, bli_obj_set_diag_offset( 0, *a11 ); } - - -// -// Define object-wrapper to GEMMTRSM_L_UKERNEL, GEMMTRSM_U_UKERNEL -// micro-kernels. -// - -#undef FUNCPTR_T -#define FUNCPTR_T gemmtrsm_ukr_fp - -typedef void (*FUNCPTR_T)( - dim_t k, - void* alpha, - void* a1x, - void* a11, - void* bx1, - void* b11, - void* c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ); - -static FUNCPTR_T GENARRAY(ftypes_l,gemmtrsm_l_ukr); -static FUNCPTR_T GENARRAY(ftypes_u,gemmtrsm_u_ukr); - - -void bli_gemmtrsm_ukr( obj_t* alpha, - obj_t* a1x, - obj_t* a11, - obj_t* bx1, - obj_t* b11, - obj_t* c11 ) -{ - dim_t k = bli_obj_width( *a1x ); - - num_t dt = bli_obj_datatype( *c11 ); - - void* buf_a1x = bli_obj_buffer_at_off( *a1x ); - - void* buf_a11 = bli_obj_buffer_at_off( *a11 ); - - void* buf_bx1 = bli_obj_buffer_at_off( *bx1 ); - - void* buf_b11 = bli_obj_buffer_at_off( *b11 ); - - void* buf_c11 = bli_obj_buffer_at_off( *c11 ); - inc_t rs_c = bli_obj_row_stride( *c11 ); - inc_t cs_c = bli_obj_col_stride( *c11 ); - - void* buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha ); - - inc_t ps_a = bli_obj_panel_stride( *a1x ); - inc_t ps_b = bli_obj_panel_stride( *bx1 ); - - FUNCPTR_T f; - - auxinfo_t data; - - - // Fill the auxinfo_t struct in case the micro-kernel uses it. - if ( bli_obj_is_lower( *a11 ) ) - { bli_auxinfo_set_next_a( buf_a1x, data ); } - else - { bli_auxinfo_set_next_a( buf_a11, data ); } - bli_auxinfo_set_next_b( buf_bx1, data ); - - bli_auxinfo_set_ps_a( ps_a, data ); - bli_auxinfo_set_ps_b( ps_b, data ); - - // Index into the type combination array to extract the correct - // function pointer. - if ( bli_obj_is_lower( *a11 ) ) f = ftypes_l[dt]; - else f = ftypes_u[dt]; - - // Invoke the function. - f( k, - buf_alpha, - buf_a1x, - buf_a11, - buf_bx1, - buf_b11, - buf_c11, rs_c, cs_c, - &data ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, ukrname ) \ -\ -void PASTEMAC(ch,varname)( \ - dim_t k, \ - void* alpha, \ - void* a1x, \ - void* a11, \ - void* bx1, \ - void* b11, \ - void* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ) \ -{ \ - PASTEMAC(ch,ukrname)( k, \ - alpha, \ - a1x, \ - a11, \ - bx1, \ - b11, \ - c11, rs_c, cs_c, \ - data ); \ -} - -INSERT_GENTFUNC_BASIC( gemmtrsm_l_ukr, GEMMTRSM_L_UKERNEL ) -INSERT_GENTFUNC_BASIC( gemmtrsm_u_ukr, GEMMTRSM_U_UKERNEL ) - diff --git a/testsuite/src/test_gemmtrsm_ukr.h b/testsuite/src/test_gemmtrsm_ukr.h index e2c2c311e..fd0d4cb9e 100644 --- a/testsuite/src/test_gemmtrsm_ukr.h +++ b/testsuite/src/test_gemmtrsm_ukr.h @@ -34,30 +34,3 @@ void libblis_test_gemmtrsm_ukr( test_params_t* params, test_op_t* op ); -// -// Prototype wrapper interfaces to micro-kernel. -// -void bli_gemmtrsm_ukr( obj_t* alpha, - obj_t* a1x, - obj_t* a11, - obj_t* bx1, - obj_t* b11, - obj_t* c11 ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - dim_t k, \ - void* alpha, \ - void* a1x, \ - void* a11, \ - void* bx1, \ - void* b11, \ - void* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemmtrsm_l_ukr ) -INSERT_GENTPROT_BASIC( gemmtrsm_u_ukr ) - diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 234bec68d..675520321 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -267,7 +267,7 @@ void libblis_test_trsm_ukr_impl( iface_t iface, switch ( iface ) { case BLIS_TEST_SEQ_UKERNEL: - bli_trsm_ukr( a, b, c ); + bli_trsm_ukernel( a, b, c ); break; default: @@ -367,84 +367,3 @@ void libblis_test_trsm_ukr_check( side_t side, bli_obj_free( &z ); } - - -// -// Define object-wrapper to TRSM_L_UKERNEL, TRSM_U_UKERNEL micro-kernels. -// - -#undef FUNCPTR_T -#define FUNCPTR_T trsm_ukr_fp - -typedef void (*FUNCPTR_T)( - void* a, - void* b, - void* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ); - -static FUNCPTR_T GENARRAY(ftypes_l,trsm_l_ukr); -static FUNCPTR_T GENARRAY(ftypes_u,trsm_u_ukr); - - -void bli_trsm_ukr( obj_t* a, - obj_t* b, - obj_t* c ) -{ - num_t dt = bli_obj_datatype( *c ); - - void* buf_a = bli_obj_buffer_at_off( *a ); - - void* buf_b = bli_obj_buffer_at_off( *b ); - - void* buf_c = bli_obj_buffer_at_off( *c ); - inc_t rs_c = bli_obj_row_stride( *c ); - inc_t cs_c = bli_obj_col_stride( *c ); - - inc_t ps_a = bli_obj_panel_stride( *a ); - inc_t ps_b = bli_obj_panel_stride( *b ); - - FUNCPTR_T f; - - auxinfo_t data; - - - // Fill the auxinfo_t struct in case the micro-kernel uses it. - bli_auxinfo_set_next_a( buf_a, data ); - bli_auxinfo_set_next_b( buf_b, data ); - - bli_auxinfo_set_ps_a( ps_a, data ); - bli_auxinfo_set_ps_b( ps_b, data ); - - // Index into the type combination array to extract the correct - // function pointer. - if ( bli_obj_is_lower( *a ) ) f = ftypes_l[dt]; - else f = ftypes_u[dt]; - - // Invoke the function. - f( buf_a, - buf_b, - buf_c, rs_c, cs_c, - &data ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, ukrname ) \ -\ -void PASTEMAC(ch,varname)( \ - void* a, \ - void* b, \ - void* c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ) \ -{ \ - PASTEMAC(ch,ukrname)( a, \ - b, \ - c, rs_c, cs_c, \ - data ); \ -} - -INSERT_GENTFUNC_BASIC( trsm_l_ukr, TRSM_L_UKERNEL ) -INSERT_GENTFUNC_BASIC( trsm_u_ukr, TRSM_U_UKERNEL ) - diff --git a/testsuite/src/test_trsm_ukr.h b/testsuite/src/test_trsm_ukr.h index 83129d7a3..1057f2853 100644 --- a/testsuite/src/test_trsm_ukr.h +++ b/testsuite/src/test_trsm_ukr.h @@ -34,23 +34,3 @@ void libblis_test_trsm_ukr( test_params_t* params, test_op_t* op ); -// -// Prototype wrapper interfaces to micro-kernel. -// -void bli_trsm_ukr( obj_t* a, - obj_t* b, - obj_t* c ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - void* a, \ - void* b, \ - void* c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( trsm_l_ukr ) -INSERT_GENTPROT_BASIC( trsm_u_ukr ) -