Merge pull request #10 from Maratyszcza/stable

Portable Native Client port
This commit is contained in:
Field G. Van Zee
2014-06-19 21:46:50 -05:00
9 changed files with 1615 additions and 2 deletions

2
.gitignore vendored
View File

@@ -19,6 +19,8 @@
*.a
# test executables
*.x
*.pexe
*.nexe
# -- build system files --

View File

@@ -469,7 +469,7 @@ endif
blis-lib: check-env $(MK_LIBS)
$(MK_ALL_BLIS_LIB): $(MK_ALL_BLIS_OBJS)
ifeq ($(FLA_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
$(AR) $(ARFLAGS) $@ $?
$(RANLIB) $@
else

165
config/pnacl/bli_config.h Normal file
View File

@@ -0,0 +1,165 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H
// Build-time configuration macros for this BLIS configuration. Everything
// below is a plain preprocessor definition; nothing here is evaluated at
// run time.
// -- OPERATING SYSTEM ---------------------------------------------------------
// (No operating-system options are set in this configuration.)
// -- INTEGER PROPERTIES -------------------------------------------------------
// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
#define BLIS_INT_TYPE_SIZE 32
// -- FLOATING-POINT PROPERTIES ------------------------------------------------
// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES 4
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
// (Left disabled here: the definition below is commented out.)
//#define BLIS_ENABLE_C99_COMPLEX
// -- MULTITHREADING -----------------------------------------------------------
// The maximum number of BLIS threads that will run concurrently.
// A value of 1 makes this a single-threaded configuration.
#define BLIS_MAX_NUM_THREADS 1
// -- MEMORY ALLOCATION --------------------------------------------------------
// -- Contiguous (static) memory allocator --
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
// contiguous memory pools.
// The first two counts track BLIS_MAX_NUM_THREADS (1 above), so one block of
// each kind is reserved; no MC x NC blocks are reserved.
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_MC_X_NC_BLOCKS 0
// The maximum preload byte offset is used to pad the end of the contiguous
// memory pools so that the micro-kernel, when computing with the end of the
// last block, can exceed the bounds of the usable portion of the memory
// region without causing a segmentation fault.
#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
// -- Memory alignment --
// It is sometimes useful to define the various memory alignments in terms
// of some other characteristics of the system, such as the cache line size
// and the page size.
// Both values are in bytes.
#define BLIS_CACHE_LINE_SIZE 64
#define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 16
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when allocating memory dynamically from the operating
// system (eg: posix_memalign()). To disable heap alignment and just use
// malloc() instead, set this to 1.
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when sizing leading dimensions of dynamically
// allocated memory.
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
// Alignment size used when allocating entire blocks of contiguous memory
// from the contiguous memory allocator.
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
// Basic (homogeneous) datatype support always enabled.
// Both mixed-datatype options below are left disabled (commented out).
// Enable mixed domain operations?
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
// Enable extra mixed precision operations?
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
// Stay initialized after auto-initialization, unless and until the user
// explicitly calls bli_finalize().
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
// Enable the BLAS compatibility layer?
#define BLIS_ENABLE_BLAS2BLIS
// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
// Fortran-77 name-mangling macros.
// Each macro token-pastes its arguments together and appends a single
// trailing underscore: PASTEF770 takes a bare name, PASTEF77 one prefix plus
// the name, and PASTEF772 two prefixes plus the name.
#define PASTEF770(name) name ## _
#define PASTEF77(ch1,name) ch1 ## name ## _
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
#endif

247
config/pnacl/bli_kernel.h Normal file
View File

@@ -0,0 +1,247 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_KERNEL_H
#define BLIS_KERNEL_H
/*
* SIMD-enabled (SP only) PNaCl shipped in Chrome 36 and it is not backward-compatible.
* Therefore, if compilation targets an older Chrome release, we use scalar kernels.
* The target Chrome version is indicated by the PPAPI_RELEASE macro defined in the header below.
*/
#include <ppapi/c/pp_macros.h>
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Cache blocksizes --
//
// Constraints:
//
// (1) MC must be a multiple of:
// (a) MR (for zero-padding purposes)
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
// (2) NC must be a multiple of
// (a) NR (for zero-padding purposes)
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
// (3) KC must be a multiple of
// (a) MR and
// (b) NR (for triangular operations such as trmm and trsm).
//
// Single-precision real (S): the values depend on PPAPI_RELEASE because the
// register blocksizes (MR_S x NR_S, defined further down) differ between the
// two cases: 8x4 when PPAPI_RELEASE >= 36, 4x3 otherwise. Each cache
// blocksize is a multiple of both the matching MR and NR.
#if PPAPI_RELEASE >= 36
#define BLIS_DEFAULT_MC_S 256
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 8192
#else
#define BLIS_DEFAULT_MC_S 252
#define BLIS_DEFAULT_KC_S 264
#define BLIS_DEFAULT_NC_S 8196
#endif
// Double-precision real (D): the same values are used regardless of
// PPAPI_RELEASE; all are multiples of MR_D = 4 and NR_D = 3.
#define BLIS_DEFAULT_MC_D 1080
#define BLIS_DEFAULT_KC_D 120
#define BLIS_DEFAULT_NC_D 8400
// Single-precision complex (C): 4x4 register block when PPAPI_RELEASE >= 36,
// 2x3 otherwise; cache blocksizes are multiples of the matching MR and NR.
#if PPAPI_RELEASE >= 36
#define BLIS_DEFAULT_MC_C 128
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4096
#else
#define BLIS_DEFAULT_MC_C 120
#define BLIS_DEFAULT_KC_C 264
#define BLIS_DEFAULT_NC_C 4092
#endif
// Double-precision complex (Z): the same values are used regardless of
// PPAPI_RELEASE; all are multiples of MR_Z = 2 and NR_Z = 3.
#define BLIS_DEFAULT_MC_Z 60
#define BLIS_DEFAULT_KC_Z 264
#define BLIS_DEFAULT_NC_Z 2040
// -- Register blocksizes --
// MR x NR is the shape of the micro-tile of C computed by one micro-kernel
// call. Only the S and C values change with PPAPI_RELEASE, matching the two
// datatypes for which an optimized gemm micro-kernel is selected later in
// this header.
#if PPAPI_RELEASE >= 36
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
#else
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 3
#endif
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 3
#if PPAPI_RELEASE >= 36
#define BLIS_DEFAULT_MR_C 4
#define BLIS_DEFAULT_NR_C 4
#else
#define BLIS_DEFAULT_MR_C 2
#define BLIS_DEFAULT_NR_C 3
#endif
#define BLIS_DEFAULT_MR_Z 2
#define BLIS_DEFAULT_NR_Z 3
// NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Register blocksize extensions (for packed micro-panels) --
// NOTE: These register blocksize "extensions" determine whether the
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
//#define BLIS_EXTEND_MR_S 0
//#define BLIS_EXTEND_NR_S 0
//#define BLIS_EXTEND_MR_D 0
//#define BLIS_EXTEND_NR_D 0
//#define BLIS_EXTEND_MR_C 0
//#define BLIS_EXTEND_NR_C 0
//#define BLIS_EXTEND_MR_Z 0
//#define BLIS_EXTEND_NR_Z 0
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- gemm --
// Select the optimized gemm micro-kernels for single-precision real and
// single-precision complex only when PPAPI_RELEASE >= 36. No double-precision
// (D/Z) gemm micro-kernel is named here, and nothing is defined for older
// releases.
// NOTE(review): presumably the framework falls back to its default
// micro-kernels when these macros are left undefined -- confirm.
#if PPAPI_RELEASE >= 36
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt
#define BLIS_CGEMM_UKERNEL bli_cgemm_opt
#endif
// -- trsm-related --
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm --
// -- unpackm --
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
// -- axpy2v --
// -- dotaxpyv --
// -- axpyf --
// -- dotxf --
// -- dotxaxpyf --
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
// -- addv --
// -- axpyv --
// -- copyv --
// -- dotv --
// The dotv kernels are selected for all four datatypes and, unlike the gemm
// micro-kernels above, are not conditional on PPAPI_RELEASE.
#define BLIS_SDOTV_KERNEL bli_sdotv_opt
#define BLIS_DDOTV_KERNEL bli_ddotv_opt
#define BLIS_CDOTV_KERNEL bli_cdotv_opt
#define BLIS_ZDOTV_KERNEL bli_zdotv_opt
// -- dotxv --
// -- invertv --
// -- scal2v --
// -- scalv --
// -- setv --
// -- subv --
// -- swapv --
#endif

1
config/pnacl/kernels Symbolic link
View File

@@ -0,0 +1 @@
../../kernels/nacl/pnacl

117
config/pnacl/make_defs.mk Normal file
View File

@@ -0,0 +1,117 @@
#!/bin/bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name of The University of Texas nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Only include this block of code once.
ifndef MAKE_DEFS_MK_INCLUDED
MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
# As set here: quiet make output, static library only (no dynamic build).
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
# Generic host utilities. RANLIB is the exception: it names the PNaCl
# toolchain's ranlib.
# NOTE(review): the pnacl-* tools are referenced by bare name, so they are
# presumably expected to be on PATH -- confirm.
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := pnacl-ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
CC := pnacl-clang
# Request the POSIX.1-2001 feature set (_POSIX_C_SOURCE=200112L).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
# NOTE(review): NACL_SDK_ROOT is not assigned in this file; it is presumably
# taken from the environment. If it is empty the include path below becomes
# "/include" -- confirm it is always set by the caller.
CMISCFLAGS := -std=gnu11 -I$(NACL_SDK_ROOT)/include
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -O3
# Kernel sources get the general optimization flags plus -ffast-math.
CKOPTFLAGS := $(COPTFLAGS) -ffast-math
# No extra vectorization flags are passed.
CVECFLAGS :=
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
# CFLAGS_KERNELS swaps COPTFLAGS for CKOPTFLAGS; CFLAGS_NOOPT drops both the
# optimization and the vector flags.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := pnacl-ar
ARFLAGS := rcs
# --- Determine the linker and related flags ---
# Linking is driven through the compiler; only the math library is added.
LINKER := $(CC)
LDFLAGS := -lm
# --- Determine the finalizer and related flags ---
# No finalizer flags are set.
FINALIZER := pnacl-finalize
FINFLAGS :=
# --- Determine the translator and related flags ---
# TRNSFLAGS holds the common translator options; the TRNS*FLAGS variables
# below each carry the -arch selection for one target architecture.
TRANSLATOR := pnacl-translate
TRNSFLAGS := -O3
TRNSAMD64FLAGS := -arch x86-64
TRNSX86FLAGS := -arch i686
TRNSARMFLAGS := -arch armv7
# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
endif

View File

@@ -0,0 +1,618 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#if PPAPI_RELEASE >= 36
/* Four packed single-precision floats (16 bytes), declared with the GCC/Clang
 * vector_size extension so that +, - and * operate lane-wise. */
typedef float v4sf __attribute__ ((vector_size(16)));

/*
 * The helpers below are `static inline` rather than plain `inline`. Under
 * C99/C11 inline semantics a plain `inline` definition does not provide an
 * external definition of the function, so any call the compiler decides not
 * to inline (for example in an unoptimized build) would be left unresolved at
 * link time. Internal linkage also keeps this file's copies from clashing
 * with identically named helpers in other translation units.
 */

/* Returns a vector with all four lanes set to x. */
static inline v4sf v4sf_splat(float x) {
    return (v4sf) { x, x, x, x };
}

/* Loads four consecutive floats starting at a.
 * NOTE(review): the access goes through a v4sf pointer; whether `a` must be
 * 16-byte aligned depends on the target -- confirm for PNaCl. */
static inline v4sf v4sf_load(const float* a) {
    return *((const v4sf*)a);
}

/* Loads two consecutive scomplex values (four floats) starting at a; the
 * bytes are reinterpreted as a v4sf. */
static inline v4sf v4sf_cload(const scomplex* a) {
    return *((const v4sf*)a);
}

/* Stores the four lanes of x to four consecutive floats starting at a. */
static inline void v4sf_store(float* a, v4sf x) {
    *((v4sf*)a) = x;
}

/* Stores the four lanes of x over two consecutive scomplex values at a. */
static inline void v4sf_cstore(scomplex* a, v4sf x) {
    *((v4sf*)a) = x;
}

/* Returns a vector with all four lanes set to 0.0f. Declared with (void) so
 * the definition is a proper prototype. */
static inline v4sf v4sf_zero(void) {
    return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
}
#endif
/*
 * Scalar dot product of two single-precision real vectors with arbitrary
 * element strides.
 *
 * Six independent partial sums are accumulated over groups of six elements
 * and combined as (s0 + s1 + s2) + (s3 + s4 + s5); any remaining elements are
 * then added one at a time. The evaluation order is kept exactly as in the
 * two copies of this loop that used to live inside bli_sdotv_opt.
 *
 *   n     number of elements to process
 *   x, y  first element of each vector
 *   incx  stride, in elements, between consecutive entries of x
 *   incy  stride, in elements, between consecutive entries of y
 *
 * Returns the accumulated sum (0.0f when n is zero).
 */
static float bli_sdotv_opt_strided(
    dim_t n,
    const float* restrict x,
    inc_t incx,
    const float* restrict y,
    inc_t incy)
{
    float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f, sum5 = 0.0f;
    while (n >= 6) {
        sum0 += (*x) * (*y);
        x += incx;
        y += incy;
        sum1 += (*x) * (*y);
        x += incx;
        y += incy;
        sum2 += (*x) * (*y);
        x += incx;
        y += incy;
        sum3 += (*x) * (*y);
        x += incx;
        y += incy;
        sum4 += (*x) * (*y);
        x += incx;
        y += incy;
        sum5 += (*x) * (*y);
        x += incx;
        y += incy;
        n -= 6;
    }
    float sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5);
    while (n--) {
        sum += (*x) * (*y);
        x += incx;
        y += incy;
    }
    return sum;
}

/*
 * Single-precision real dot product: *rho = sum over i of x[i*incx] * y[i*incy].
 *
 *   conjx, conjy  accepted for interface uniformity; not read by this
 *                 implementation
 *   n             number of elements
 *   x, incx       first vector and its element stride
 *   y, incy       second vector and its element stride
 *   rho           receives the result
 *
 * When PPAPI_RELEASE >= 36 and both strides are 1, the sum is computed with
 * 4-lane vector arithmetic; in every other case the strided scalar helper is
 * used.
 */
void bli_sdotv_opt(
    conj_t conjx,
    conj_t conjy,
    dim_t n,
    float x[restrict static n],
    inc_t incx,
    float y[restrict static n],
    inc_t incy,
    float rho[restrict static 1])
{
#if PPAPI_RELEASE >= 36
    // If the vector lengths are zero, set rho to zero and return.
    if (bli_zero_dim1(n)) {
        *rho = 0.0f;
        return;
    }
    // Contiguous vector loads require unit stride in both operands; anything
    // else takes the scalar path.
    if (bli_has_nonunit_inc2(incx, incy)) {
        *rho = bli_sdotv_opt_strided(n, x, incx, y, incy);
        return;
    }
    // NOTE(review): only the strides are checked here; the addresses of x and
    // y are not tested for 16-byte alignment before v4sf_load is used --
    // confirm that the target tolerates element-aligned vector loads.
    v4sf vsum0 = v4sf_zero(), vsum1 = v4sf_zero(), vsum2 = v4sf_zero();
    v4sf vsum3 = v4sf_zero(), vsum4 = v4sf_zero(), vsum5 = v4sf_zero();
    // Main loop: 24 elements per iteration, six independent vector sums.
    while (n >= 24) {
        vsum0 += v4sf_load(x) * v4sf_load(y);
        vsum1 += v4sf_load(x+4) * v4sf_load(y+4);
        vsum2 += v4sf_load(x+8) * v4sf_load(y+8);
        vsum3 += v4sf_load(x+12) * v4sf_load(y+12);
        vsum4 += v4sf_load(x+16) * v4sf_load(y+16);
        vsum5 += v4sf_load(x+20) * v4sf_load(y+20);
        x += 24;
        y += 24;
        n -= 24;
    }
    v4sf vsum = (vsum0 + vsum1 + vsum2) + (vsum3 + vsum4 + vsum5);
    // Remaining whole vectors, four elements at a time.
    while (n >= 4) {
        vsum += v4sf_load(x) * v4sf_load(y);
        x += 4;
        y += 4;
        n -= 4;
    }
    // Horizontal reduction of the four lanes, then the last 0..3 elements.
    float sum = (vsum[0] + vsum[1]) + (vsum[2] + vsum[3]);
    while (n--) {
        sum += (*x++) * (*y++);
    }
    *rho = sum;
#else
    // No SIMD available for this target release: scalar path for all strides
    // (n == 0 yields 0.0f).
    *rho = bli_sdotv_opt_strided(n, x, incx, y, incy);
#endif
}
/*
 * Double-precision real dot product: *rho = sum over i of x[i*incx] * y[i*incy].
 *
 * conjx and conjy are accepted but not read. Elements are consumed six at a
 * time into six independent accumulators, which are then combined as
 * (a0 + a1 + a2) + (a3 + a4 + a5); leftover elements are added one by one.
 */
void bli_ddotv_opt(
    conj_t conjx,
    conj_t conjy,
    dim_t n,
    double x[restrict static n],
    inc_t incx,
    double y[restrict static n],
    inc_t incy,
    double rho[restrict static 1])
{
    double acc0 = 0.0, acc1 = 0.0, acc2 = 0.0;
    double acc3 = 0.0, acc4 = 0.0, acc5 = 0.0;

    /* Unrolled body: one product per accumulator, addressed by stride. */
    for (; n >= 6; n -= 6) {
        acc0 += x[0 * incx] * y[0 * incy];
        acc1 += x[1 * incx] * y[1 * incy];
        acc2 += x[2 * incx] * y[2 * incy];
        acc3 += x[3 * incx] * y[3 * incy];
        acc4 += x[4 * incx] * y[4 * incy];
        acc5 += x[5 * incx] * y[5 * incy];
        x += 6 * incx;
        y += 6 * incy;
    }

    double total = (acc0 + acc1 + acc2) + (acc3 + acc4 + acc5);

    /* Remainder: fewer than six elements left. */
    for (; n != 0; n--) {
        total += x[0] * y[0];
        x += incx;
        y += incy;
    }

    *rho = total;
}
// Single-precision complex dot product.
//
// Computes rho = sum over i of cx(x[i*incx]) * cy(y[i*incy]), where cx/cy
// conjugate their argument when conjx/conjy request it (this reading assumes
// bli_is_conj/bli_is_noconj test the flag and bli_toggle_conj flips conjx in
// place).
//
// Only conjugation of x is implemented in the loops below. A conjugated y is
// handled by flipping conjx up front and conjugating the final result, using
// conj(conj'(x) * y) == conj(conj'(x)) * conj(y).
//
//   conjx, conjy  conjugation flags for x and y
//   n             number of elements
//   x, incx       first vector and its element stride
//   y, incy       second vector and its element stride
//   rho           receives the result
void bli_cdotv_opt(
conj_t conjx,
conj_t conjy,
dim_t n,
scomplex x[restrict static n],
inc_t incx,
scomplex y[restrict static n],
inc_t incy,
scomplex rho[restrict static 1])
{
// Fold conjugation of y into x; undone on the result at the very end.
if (bli_is_conj(conjy)) {
bli_toggle_conj(conjx);
}
// Empty vectors: the result is zero.
if (bli_zero_dim1(n)) {
rho->real = 0.0f;
rho->imag = 0.0f;
return;
}
// Running real and imaginary parts; every branch below assigns both before
// they are read.
float sumr;
float sumi;
#if PPAPI_RELEASE >= 36
if (bli_is_noconj(conjx)) {
// ---- x not conjugated: accumulate x * y ----
if (bli_has_nonunit_inc2(incx, incy)) {
// Strided scalar path, two elements per iteration with two independent
// accumulator pairs.
float sum0r = 0.0f, sum1r = 0.0f;
float sum0i = 0.0f, sum1i = 0.0f;
while (n >= 2) {
const float x0r = x->real;
const float x0i = x->imag;
const float y0r = y->real;
const float y0i = y->imag;
sum0r += x0r * y0r - x0i * y0i;
sum0i += x0r * y0i + x0i * y0r;
x += incx;
y += incy;
const float x1r = x->real;
const float x1i = x->imag;
const float y1r = y->real;
const float y1i = y->imag;
sum1r += x1r * y1r - x1i * y1i;
sum1i += x1r * y1i + x1i * y1r;
x += incx;
y += incy;
n -= 2;
}
sumr = sum0r + sum1r;
sumi = sum0i + sum1i;
} else {
// Unit-stride vector path, eight complex elements per iteration.
v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero();
v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero();
while (n >= 8) {
// Each v4sf_cload reads two complex values as (re, im, re, im).
const v4sf xv0t = v4sf_cload(x);
const v4sf xv0b = v4sf_cload(x+2);
const v4sf yv0t = v4sf_cload(y);
const v4sf yv0b = v4sf_cload(y+2);
// De-interleave: even lanes of the pair give the four real parts, odd
// lanes the four imaginary parts.
const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6);
const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7);
const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6);
const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7);
sumv0r += xv0r * yv0r - xv0i * yv0i;
sumv0i += xv0r * yv0i + xv0i * yv0r;
// Second group of four complex elements (indices 4..7).
const v4sf xv1t = v4sf_cload(x+4);
const v4sf xv1b = v4sf_cload(x+6);
const v4sf yv1t = v4sf_cload(y+4);
const v4sf yv1b = v4sf_cload(y+6);
const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6);
const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7);
const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6);
const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7);
sumv1r += xv1r * yv1r - xv1i * yv1i;
sumv1i += xv1r * yv1i + xv1i * yv1r;
x += 8;
y += 8;
n -= 8;
}
// Combine the two vector accumulators, then reduce the four lanes.
const v4sf sumvr = sumv0r + sumv1r;
const v4sf sumvi = sumv0i + sumv1i;
sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]);
sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]);
}
// Scalar tail shared by both paths above (incx == incy == 1 on the vector
// path, so stepping by the strides is correct there too).
while (n--) {
const float xr = x->real;
const float xi = x->imag;
const float yr = y->real;
const float yi = y->imag;
sumr += xr * yr - xi * yi;
sumi += xr * yi + xi * yr;
x += incx;
y += incy;
}
} else {
// ---- x conjugated: accumulate conj(x) * y ----
// Same structure as above; only the signs of the x-imaginary terms differ.
if (bli_has_nonunit_inc2(incx, incy)) {
float sum0r = 0.0f, sum1r = 0.0f;
float sum0i = 0.0f, sum1i = 0.0f;
while (n >= 2) {
const float x0r = x->real;
const float x0i = x->imag;
const float y0r = y->real;
const float y0i = y->imag;
sum0r += x0r * y0r + x0i * y0i;
sum0i += x0r * y0i - x0i * y0r;
x += incx;
y += incy;
const float x1r = x->real;
const float x1i = x->imag;
const float y1r = y->real;
const float y1i = y->imag;
sum1r += x1r * y1r + x1i * y1i;
sum1i += x1r * y1i - x1i * y1r;
x += incx;
y += incy;
n -= 2;
}
sumr = sum0r + sum1r;
sumi = sum0i + sum1i;
} else {
v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero();
v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero();
while (n >= 8) {
const v4sf xv0t = v4sf_cload(x);
const v4sf xv0b = v4sf_cload(x+2);
const v4sf yv0t = v4sf_cload(y);
const v4sf yv0b = v4sf_cload(y+2);
const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6);
const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7);
const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6);
const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7);
sumv0r += xv0r * yv0r + xv0i * yv0i;
sumv0i += xv0r * yv0i - xv0i * yv0r;
const v4sf xv1t = v4sf_cload(x+4);
const v4sf xv1b = v4sf_cload(x+6);
const v4sf yv1t = v4sf_cload(y+4);
const v4sf yv1b = v4sf_cload(y+6);
const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6);
const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7);
const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6);
const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7);
sumv1r += xv1r * yv1r + xv1i * yv1i;
sumv1i += xv1r * yv1i - xv1i * yv1r;
x += 8;
y += 8;
n -= 8;
}
const v4sf sumvr = sumv0r + sumv1r;
const v4sf sumvi = sumv0i + sumv1i;
sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]);
sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]);
}
while (n--) {
const float xr = x->real;
const float xi = x->imag;
const float yr = y->real;
const float yi = y->imag;
sumr += xr * yr + xi * yi;
sumi += xr * yi - xi * yr;
x += incx;
y += incy;
}
}
#else
// Scalar-only build: two elements per iteration for any stride, followed by
// at most one leftover element.
if (bli_is_noconj(conjx)) {
float sum0r = 0.0f, sum1r = 0.0f;
float sum0i = 0.0f, sum1i = 0.0f;
while (n >= 2) {
const float x0r = x->real;
const float x0i = x->imag;
const float y0r = y->real;
const float y0i = y->imag;
sum0r += x0r * y0r - x0i * y0i;
sum0i += x0r * y0i + x0i * y0r;
x += incx;
y += incy;
const float x1r = x->real;
const float x1i = x->imag;
const float y1r = y->real;
const float y1i = y->imag;
sum1r += x1r * y1r - x1i * y1i;
sum1i += x1r * y1i + x1i * y1r;
x += incx;
y += incy;
n -= 2;
}
sumr = sum0r + sum1r;
sumi = sum0i + sum1i;
// After the pairwise loop at most one element remains.
if (n != 0) {
const float xr = x->real;
const float xi = x->imag;
const float yr = y->real;
const float yi = y->imag;
sumr += xr * yr - xi * yi;
sumi += xr * yi + xi * yr;
}
} else {
float sum0r = 0.0f, sum1r = 0.0f;
float sum0i = 0.0f, sum1i = 0.0f;
while (n >= 2) {
const float x0r = x->real;
const float x0i = x->imag;
const float y0r = y->real;
const float y0i = y->imag;
sum0r += x0r * y0r + x0i * y0i;
sum0i += x0r * y0i - x0i * y0r;
x += incx;
y += incy;
const float x1r = x->real;
const float x1i = x->imag;
const float y1r = y->real;
const float y1i = y->imag;
sum1r += x1r * y1r + x1i * y1i;
sum1i += x1r * y1i - x1i * y1r;
x += incx;
y += incy;
n -= 2;
}
sumr = sum0r + sum1r;
sumi = sum0i + sum1i;
if (n != 0) {
const float xr = x->real;
const float xi = x->imag;
const float yr = y->real;
const float yi = y->imag;
sumr += xr * yr + xi * yi;
sumi += xr * yi - xi * yr;
}
}
#endif
// Undo the conjugation trick from the top: when y was to be conjugated, the
// accumulated value is the conjugate of the requested result.
rho->real = sumr;
rho->imag = bli_is_conj(conjy) ? -sumi : sumi;
}
/*
 * Double-precision complex dot product.
 *
 * Accumulates x * y (or conj(x) * y when x is to be conjugated) over n
 * elements with strides incx / incy and writes the result to rho. A
 * conjugated y is handled by flipping conjx before the loop and negating the
 * imaginary part of the result afterwards.
 *
 * Elements are consumed in pairs into two independent accumulator sets
 * ("a" for the first of each pair, "b" for the second); a single leftover
 * element, if any, is added after the two sets have been combined.
 */
void bli_zdotv_opt(
    conj_t conjx,
    conj_t conjy,
    dim_t n,
    dcomplex x[restrict static n],
    inc_t incx,
    dcomplex y[restrict static n],
    inc_t incy,
    dcomplex rho[restrict static 1])
{
    if (bli_is_conj(conjy)) {
        bli_toggle_conj(conjx);
    }

    /* Nothing to sum: the result is zero. */
    if (bli_zero_dim1(n)) {
        rho->real = 0.0;
        rho->imag = 0.0;
        return;
    }

    double re_a = 0.0, re_b = 0.0;
    double im_a = 0.0, im_b = 0.0;
    double re_total;
    double im_total;

    if (bli_is_noconj(conjx)) {
        /* x * y */
        for (; n >= 2; n -= 2) {
            const dcomplex* xa = x;
            const dcomplex* ya = y;
            const dcomplex* xb = x + incx;
            const dcomplex* yb = y + incy;

            re_a += xa->real * ya->real - xa->imag * ya->imag;
            im_a += xa->real * ya->imag + xa->imag * ya->real;

            re_b += xb->real * yb->real - xb->imag * yb->imag;
            im_b += xb->real * yb->imag + xb->imag * yb->real;

            x += 2 * incx;
            y += 2 * incy;
        }
        re_total = re_a + re_b;
        im_total = im_a + im_b;

        /* Odd element left over from the pairwise loop. */
        if (n != 0) {
            re_total += x->real * y->real - x->imag * y->imag;
            im_total += x->real * y->imag + x->imag * y->real;
        }
    } else {
        /* conj(x) * y */
        for (; n >= 2; n -= 2) {
            const dcomplex* xa = x;
            const dcomplex* ya = y;
            const dcomplex* xb = x + incx;
            const dcomplex* yb = y + incy;

            re_a += xa->real * ya->real + xa->imag * ya->imag;
            im_a += xa->real * ya->imag - xa->imag * ya->real;

            re_b += xb->real * yb->real + xb->imag * yb->imag;
            im_b += xb->real * yb->imag - xb->imag * yb->real;

            x += 2 * incx;
            y += 2 * incy;
        }
        re_total = re_a + re_b;
        im_total = im_a + im_b;

        if (n != 0) {
            re_total += x->real * y->real + x->imag * y->imag;
            im_total += x->real * y->imag - x->imag * y->real;
        }
    }

    /* Conjugate the result when y was to be conjugated. */
    if (bli_is_conj(conjy)) {
        im_total = -im_total;
    }
    rho->real = re_total;
    rho->imag = im_total;
}

View File

@@ -0,0 +1,386 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#if PPAPI_RELEASE >= 36
/* Four packed single-precision floats (16 bytes), declared with the GCC/Clang
 * vector_size extension so that +, - and * operate lane-wise. */
typedef float v4sf __attribute__ ((vector_size(16)));

/*
 * The helpers below are `static inline` rather than plain `inline`. Under
 * C99/C11 inline semantics a plain `inline` definition does not provide an
 * external definition of the function, so any call the compiler decides not
 * to inline (for example in an unoptimized build) would be left unresolved at
 * link time. Internal linkage also keeps this file's copies from clashing
 * with identically named helpers in other translation units.
 */

/* Returns a vector with all four lanes set to x. */
static inline v4sf v4sf_splat(float x) {
    return (v4sf) { x, x, x, x };
}

/* Loads four consecutive floats starting at a.
 * NOTE(review): the access goes through a v4sf pointer; whether `a` must be
 * 16-byte aligned depends on the target -- confirm for PNaCl. */
static inline v4sf v4sf_load(const float* a) {
    return *((const v4sf*)a);
}

/* Loads two consecutive scomplex values (four floats) starting at a; the
 * bytes are reinterpreted as a v4sf. */
static inline v4sf v4sf_cload(const scomplex* a) {
    return *((const v4sf*)a);
}

/* Stores the four lanes of x to four consecutive floats starting at a. */
static inline void v4sf_store(float* a, v4sf x) {
    *((v4sf*)a) = x;
}

/* Stores the four lanes of x over two consecutive scomplex values at a. */
static inline void v4sf_cstore(scomplex* a, v4sf x) {
    *((v4sf*)a) = x;
}

/* Returns a vector with all four lanes set to 0.0f. Declared with (void) so
 * the definition is a proper prototype. */
static inline v4sf v4sf_zero(void) {
    return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
}
// Single-precision real GEMM micro-kernel for an 8x4 micro-tile of C:
//
//     C := beta * C + alpha * A * B
//
// k          number of rank-1 updates to accumulate.
// alpha      pointer to the scalar applied to the product A * B.
// a          packed panel of A; each iteration reads 8 consecutive floats
//            (one value per row of the tile).
// b          packed panel of B; each iteration reads 4 consecutive floats
//            (one value per column of the tile).
// beta       pointer to the scalar applied to C.
// c          the micro-tile; element (i, j) lives at c[i*rs_c + j*cs_c].
// rs_c, cs_c row and column strides of C, in elements.
// data       auxiliary kernel information; not used by this kernel.
//
// When beta is zero, C is overwritten without being read, so NaN/Inf values
// in an uninitialized C cannot leak into the result through 0 * C.
void bli_sgemm_opt(
    dim_t k,
    float alpha[restrict static 1],
    float a[restrict static 8*k],
    float b[restrict static k*4],
    float beta[restrict static 1],
    float c[restrict static 8*4],
    inc_t rs_c,
    inc_t cs_c,
    auxinfo_t* data)
{
    // Vectors for accumulating columns 0, 1, 2, 3 (initialized to 0.0).
    // The "t" vectors hold rows 0-3 (top), the "b" vectors rows 4-7 (bottom).
    v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero();
    v4sf abv0b = v4sf_zero(), abv1b = v4sf_zero(), abv2b = v4sf_zero(), abv3b = v4sf_zero();

    for (dim_t i = 0; i < k; i += 1) {
        const v4sf avt = v4sf_load(a);
        const v4sf avb = v4sf_load(a+4);

        const v4sf bv_xxxx = v4sf_splat(b[0]);
        abv0t += avt * bv_xxxx;
        abv0b += avb * bv_xxxx;

        const v4sf bv_yyyy = v4sf_splat(b[1]);
        abv1t += avt * bv_yyyy;
        abv1b += avb * bv_yyyy;

        const v4sf bv_zzzz = v4sf_splat(b[2]);
        abv2t += avt * bv_zzzz;
        abv2b += avb * bv_zzzz;

        const v4sf bv_wwww = v4sf_splat(b[3]);
        abv3t += avt * bv_wwww;
        abv3b += avb * bv_wwww;

        a += 8;
        b += 4;
    }

    // Scale the accumulated product by alpha and gather the columns into
    // arrays so that the update of C can be written once per column.
    const v4sf alphav = v4sf_splat(*alpha);
    const v4sf abt[4] = { abv0t * alphav, abv1t * alphav, abv2t * alphav, abv3t * alphav };
    const v4sf abb[4] = { abv0b * alphav, abv1b * alphav, abv2b * alphav, abv3b * alphav };

    const int beta_is_zero = (*beta == 0.0f);
    const v4sf betav = v4sf_splat(*beta);

    for (int j = 0; j < 4; j += 1) {
        float* const cj = c + j*cs_c;
        v4sf top = abt[j];
        v4sf bot = abb[j];

        if (rs_c == 1) {
            // Unit row stride: each half-column is contiguous in memory.
            if (!beta_is_zero) {
                top = v4sf_load(cj) * betav + top;
                bot = v4sf_load(cj + 4) * betav + bot;
            }
            v4sf_store(cj, top);
            v4sf_store(cj + 4, bot);
        } else {
            // General stride: gather and scatter element by element.
            if (!beta_is_zero) {
                const v4sf ct = (v4sf){ cj[0*rs_c], cj[1*rs_c], cj[2*rs_c], cj[3*rs_c] };
                const v4sf cb = (v4sf){ cj[4*rs_c], cj[5*rs_c], cj[6*rs_c], cj[7*rs_c] };
                top = ct * betav + top;
                bot = cb * betav + bot;
            }
            cj[0*rs_c] = top[0];
            cj[1*rs_c] = top[1];
            cj[2*rs_c] = top[2];
            cj[3*rs_c] = top[3];
            cj[4*rs_c] = bot[0];
            cj[5*rs_c] = bot[1];
            cj[6*rs_c] = bot[2];
            cj[7*rs_c] = bot[3];
        }
    }
}
// Single-precision complex GEMM micro-kernel for a 4x4 micro-tile of C:
//
//     C := beta * C + alpha * A * B
//
// k          number of rank-1 updates to accumulate.
// alpha      pointer to the complex scalar applied to the product A * B.
// a          packed panel of A; each iteration reads 4 consecutive scomplex
//            values (one per row of the tile).
// b          packed panel of B; each iteration reads 4 consecutive scomplex
//            values (one per column of the tile).
// beta       pointer to the complex scalar applied to C.
// c          the micro-tile; element (i, j) lives at c[i*rs_c + j*cs_c].
// rs_c, cs_c row and column strides of C, in elements.
// data       auxiliary kernel information; not used by this kernel.
//
// Real and imaginary parts are kept in separate vectors throughout. The
// vector loads/stores treat even lanes as real parts and odd lanes as
// imaginary parts of consecutive scomplex values.
//
// When beta is zero (both parts), C is overwritten without being read, so
// NaN/Inf values in an uninitialized C cannot leak into the result.
void bli_cgemm_opt(
    dim_t k,
    scomplex alpha[restrict static 1],
    scomplex a[restrict static 4*k],
    scomplex b[restrict static k*4],
    scomplex beta[restrict static 1],
    scomplex c[restrict static 4*4],
    inc_t rs_c,
    inc_t cs_c,
    auxinfo_t* data)
{
    // Vectors for accumulating columns 0, 1, 2, 3 (initialized to 0.0).
    // The "r" vectors hold real parts, the "i" vectors imaginary parts.
    v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero();
    v4sf abv0i = v4sf_zero(), abv1i = v4sf_zero(), abv2i = v4sf_zero(), abv3i = v4sf_zero();

    for (dim_t i = 0; i < k; i += 1) {
        // Load four complex values of A and de-interleave them into one
        // vector of real parts and one vector of imaginary parts.
        const v4sf avt = v4sf_cload(a);
        const v4sf avb = v4sf_cload(a+2);
        const v4sf avr = __builtin_shufflevector(avt, avb, 0, 2, 4, 6);
        const v4sf avi = __builtin_shufflevector(avt, avb, 1, 3, 5, 7);

        const v4sf bv0r = v4sf_splat(b[0].real);
        const v4sf bv0i = v4sf_splat(b[0].imag);
        abv0r += avr * bv0r - avi * bv0i;
        abv0i += avr * bv0i + avi * bv0r;

        const v4sf bv1r = v4sf_splat(b[1].real);
        const v4sf bv1i = v4sf_splat(b[1].imag);
        abv1r += avr * bv1r - avi * bv1i;
        abv1i += avr * bv1i + avi * bv1r;

        const v4sf bv2r = v4sf_splat(b[2].real);
        const v4sf bv2i = v4sf_splat(b[2].imag);
        abv2r += avr * bv2r - avi * bv2i;
        abv2i += avr * bv2i + avi * bv2r;

        const v4sf bv3r = v4sf_splat(b[3].real);
        const v4sf bv3i = v4sf_splat(b[3].imag);
        abv3r += avr * bv3r - avi * bv3i;
        abv3i += avr * bv3i + avi * bv3r;

        a += 4;
        b += 4;
    }

    // Complex-scale the accumulated product by alpha and gather the columns
    // into arrays so that the update of C can be written once per column.
    const v4sf alphavr = v4sf_splat(alpha->real);
    const v4sf alphavi = v4sf_splat(alpha->imag);
    const v4sf abr[4] = {
        abv0r * alphavr - abv0i * alphavi,
        abv1r * alphavr - abv1i * alphavi,
        abv2r * alphavr - abv2i * alphavi,
        abv3r * alphavr - abv3i * alphavi
    };
    const v4sf abi[4] = {
        abv0r * alphavi + abv0i * alphavr,
        abv1r * alphavi + abv1i * alphavr,
        abv2r * alphavi + abv2i * alphavr,
        abv3r * alphavi + abv3i * alphavr
    };

    const int beta_is_zero = (beta->real == 0.0f) && (beta->imag == 0.0f);
    const v4sf betavr = v4sf_splat(beta->real);
    const v4sf betavi = v4sf_splat(beta->imag);

    for (int j = 0; j < 4; j += 1) {
        scomplex* const cj = c + j*cs_c;
        v4sf re = abr[j];
        v4sf im = abi[j];

        if (rs_c == 1) {
            // Unit row stride: the column is contiguous in memory.
            if (!beta_is_zero) {
                const v4sf ct = v4sf_cload(cj);
                const v4sf cb = v4sf_cload(cj + 2);
                const v4sf cr = __builtin_shufflevector(ct, cb, 0, 2, 4, 6);
                const v4sf ci = __builtin_shufflevector(ct, cb, 1, 3, 5, 7);
                re = re + cr * betavr - ci * betavi;
                im = im + cr * betavi + ci * betavr;
            }
            // Re-interleave real/imaginary lanes before storing.
            v4sf_cstore(cj, __builtin_shufflevector(re, im, 0, 4, 1, 5));
            v4sf_cstore(cj + 2, __builtin_shufflevector(re, im, 2, 6, 3, 7));
        } else {
            // General stride: gather and scatter element by element.
            if (!beta_is_zero) {
                const v4sf cr = (v4sf){ cj[0*rs_c].real, cj[1*rs_c].real, cj[2*rs_c].real, cj[3*rs_c].real };
                const v4sf ci = (v4sf){ cj[0*rs_c].imag, cj[1*rs_c].imag, cj[2*rs_c].imag, cj[3*rs_c].imag };
                re = re + cr * betavr - ci * betavi;
                im = im + cr * betavi + ci * betavr;
            }
            cj[0*rs_c].real = re[0];
            cj[0*rs_c].imag = im[0];
            cj[1*rs_c].real = re[1];
            cj[1*rs_c].imag = im[1];
            cj[2*rs_c].real = re[2];
            cj[2*rs_c].imag = im[2];
            cj[3*rs_c].real = re[3];
            cj[3*rs_c].imag = im[3];
        }
    }
}
#endif

View File

@@ -45,7 +45,8 @@
#
# Targets that name actions rather than files. The previous prerequisite
# line was duplicated without a trailing continuation, which ended the
# declaration early and left the remaining names dangling.
.PHONY: all bin clean \
        check-env check-env-mk check-env-fragments check-env-make-defs \
        run run-amd64 run-x86 run-arm
@@ -241,8 +242,21 @@ TEST_OBJS := $(patsubst $(TEST_SRC_PATH)/%.c, \
$(TEST_OBJ_PATH)/%.o, \
$(wildcard $(TEST_SRC_PATH)/*.c))
ifneq ($(CONFIG_NAME),pnacl)
# Any configuration other than pnacl: a single test executable.
TEST_BIN       := test_libblis.x
else
# pnacl configuration: the link step produces an unfinalized pexe, which is
# then finalized and translated into one nexe per architecture.
# Output of the link step.
TEST_BIN       := test_libblis.unstable.pexe
# Output of the finalize step.
TEST_BIN_PNACL := test_libblis.pexe
# Translated executables for x86-64, x86 and ARM respectively.
TEST_BIN_AMD64 := test_libblis.x86-64.nexe
TEST_BIN_X86   := test_libblis.x86.nexe
TEST_BIN_ARM   := test_libblis.arm.nexe
endif
# Extend the include search path with the installed BLIS headers ...
CFLAGS += -I$(BLIS_INC_PATH)
# ... and with the test suite's own source directory.
CFLAGS += -I$(TEST_SRC_PATH)
@@ -257,7 +271,11 @@ CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)
# all runs the environment check and then builds the bin target.
all: check-env bin

# bin always depends on the linked test executable. For the pnacl
# configuration it additionally depends on the finalized pexe and on the
# translated nexe for each architecture.
bin: check-env $(TEST_BIN)
ifeq ($(CONFIG_NAME),pnacl)
bin: $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM)
endif
# --- Environment check rules ---
@@ -301,9 +319,68 @@ else
@$(LINKER) $(TEST_OBJS) $(BLIS_LIB) $(LDFLAGS) -o $@
endif
ifeq ($(CONFIG_NAME),pnacl)
# Finalize the linked executable, i.e. convert it from LLVM bitcode to
# PNaCl bitcode. The single prerequisite ($<) is the linked executable.
$(TEST_BIN_PNACL): $(TEST_BIN)
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
	$(FINALIZER) $(FINFLAGS) -o $@ $<
else
	@echo "Finalizing $@"
	@$(FINALIZER) $(FINFLAGS) -o $@ $<
endif
# Translate the finalized PNaCl executable into an x86-64 NaCl executable.
# Unless verbose output is enabled, print a one-line summary and hide the
# translator command itself.
$(TEST_BIN_AMD64): $(TEST_BIN_PNACL)
ifneq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
	@echo "Translating $< -> $@"
	@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@
else
	$(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@
endif
# Translate the finalized PNaCl executable into an x86 NaCl executable.
# Unless verbose output is enabled, print a one-line summary and hide the
# translator command itself.
$(TEST_BIN_X86): $(TEST_BIN_PNACL)
ifneq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
	@echo "Translating $< -> $@"
	@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@
else
	$(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@
endif
# Translate the finalized PNaCl executable into an ARMv7 NaCl executable.
# Unless verbose output is enabled, print a one-line summary and hide the
# translator command itself.
$(TEST_BIN_ARM): $(TEST_BIN_PNACL)
ifneq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
	@echo "Translating $< -> $@"
	@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@
else
	$(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@
endif
endif
# -- Test run rules --
ifeq ($(CONFIG_NAME),pnacl)
# Each run-<arch> target launches the translated executable for that
# architecture (its only prerequisite, $<) through the matching sel_ldr
# binary from the NaCl SDK, passing the matching irt_core nexe via -B.
run-amd64: $(TEST_BIN_AMD64)
	$(NACL_SDK_ROOT)/tools/sel_ldr_x86_64 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_64.nexe -- $<

run-x86: $(TEST_BIN_X86)
	$(NACL_SDK_ROOT)/tools/sel_ldr_x86_32 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_32.nexe -- $<

run-arm: $(TEST_BIN_ARM)
	$(NACL_SDK_ROOT)/tools/sel_ldr_arm -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_arm.nexe -- $<
else
# Other configurations: execute the test binary directly from the current
# directory.
run: $(TEST_BIN)
	./$<
endif
# -- Clean rules --
# Remove the test objects and every executable this makefile produces. The
# pnacl configuration also removes the finalized and translated binaries.
# The leading '-' lets make carry on if the removal command fails.
clean:
ifeq ($(CONFIG_NAME),pnacl)
	- $(RM_F) $(TEST_OBJS) $(TEST_BIN) $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM)
else
	- $(RM_F) $(TEST_OBJS) $(TEST_BIN)
endif