mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Merge pull request #10 from Maratyszcza/stable
Portable Native Client port
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -19,6 +19,8 @@
|
||||
*.a
|
||||
# test executables
|
||||
*.x
|
||||
*.pexe
|
||||
*.nexe
|
||||
|
||||
# -- build system files --
|
||||
|
||||
|
||||
2
Makefile
2
Makefile
@@ -469,7 +469,7 @@ endif
|
||||
blis-lib: check-env $(MK_LIBS)
|
||||
|
||||
$(MK_ALL_BLIS_LIB): $(MK_ALL_BLIS_OBJS)
|
||||
ifeq ($(FLA_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
$(AR) $(ARFLAGS) $@ $?
|
||||
$(RANLIB) $@
|
||||
else
|
||||
|
||||
165
config/pnacl/bli_config.h
Normal file
165
config/pnacl/bli_config.h
Normal file
@@ -0,0 +1,165 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_CONFIG_H
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
|
||||
// -- OPERATING SYSTEM ---------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
// -- INTEGER PROPERTIES -------------------------------------------------------
|
||||
|
||||
// The bit size of the integer type used to track values such as dimensions,
|
||||
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
|
||||
// integers while 64 results in 64-bit integers. Any other value results in use
|
||||
// of the C99 type "long int". Note that this ONLY affects integers used
|
||||
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
|
||||
// interface.
|
||||
#define BLIS_INT_TYPE_SIZE 32
|
||||
|
||||
|
||||
|
||||
// -- FLOATING-POINT PROPERTIES ------------------------------------------------
|
||||
|
||||
// Define the number of floating-point types supported, and the size of the
|
||||
// largest type.
|
||||
#define BLIS_NUM_FP_TYPES 4
|
||||
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
|
||||
|
||||
// Enable use of built-in C99 "float complex" and "double complex" types and
|
||||
// associated overloaded operations and functions? Disabling results in
|
||||
// scomplex and dcomplex being defined in terms of simple structs.
|
||||
//#define BLIS_ENABLE_C99_COMPLEX
|
||||
|
||||
|
||||
|
||||
// -- MULTITHREADING -----------------------------------------------------------
|
||||
|
||||
// The maximum number of BLIS threads that will run concurrently.
|
||||
#define BLIS_MAX_NUM_THREADS 1
|
||||
|
||||
|
||||
|
||||
// -- MEMORY ALLOCATION --------------------------------------------------------
|
||||
|
||||
// -- Contiguous (static) memory allocator --
|
||||
|
||||
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
|
||||
// contiguous memory pools.
|
||||
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
|
||||
#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS
|
||||
#define BLIS_NUM_MC_X_NC_BLOCKS 0
|
||||
|
||||
// The maximum preload byte offset is used to pad the end of the contiguous
|
||||
// memory pools so that the micro-kernel, when computing with the end of the
|
||||
// last block, can exceed the bounds of the usable portion of the memory
|
||||
// region without causing a segmentation fault.
|
||||
#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
|
||||
|
||||
// -- Memory alignment --
|
||||
|
||||
// It is sometimes useful to define the various memory alignments in terms
|
||||
// of some other characteristics of the system, such as the cache line size
|
||||
// and the page size.
|
||||
#define BLIS_CACHE_LINE_SIZE 64
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
// instructions.
|
||||
#define BLIS_SIMD_ALIGN_SIZE 16
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
// Alignment size used when allocating memory dynamically from the operating
|
||||
// system (eg: posix_memalign()). To disable heap alignment and just use
|
||||
// malloc() instead, set this to 1.
|
||||
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
// Alignment size used when sizing leading dimensions of dynamically
|
||||
// allocated memory.
|
||||
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
|
||||
|
||||
// Alignment size used when allocating entire blocks of contiguous memory
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
// Basic (homogeneous) datatype support always enabled.
|
||||
|
||||
// Enable mixed domain operations?
|
||||
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
|
||||
// Enable extra mixed precision operations?
|
||||
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
|
||||
|
||||
|
||||
// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
|
||||
|
||||
// Stay initialized after auto-initialization, unless and until the user
|
||||
// explicitly calls bli_finalize().
|
||||
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
|
||||
|
||||
|
||||
|
||||
// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
|
||||
|
||||
// Enable the BLAS compatibility layer?
|
||||
#define BLIS_ENABLE_BLAS2BLIS
|
||||
|
||||
// The bit size of the integer type used to track values such as dimensions and
|
||||
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
|
||||
// A value of 32 results in the compatibility layer using 32-bit signed integers
|
||||
// while 64 results in 64-bit integers. Any other value results in use of the
|
||||
// C99 type "long int". Note that this ONLY affects integers used within the
|
||||
// BLAS compatibility layer.
|
||||
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
|
||||
|
||||
// Fortran-77 name-mangling macros.
|
||||
#define PASTEF770(name) name ## _
|
||||
#define PASTEF77(ch1,name) ch1 ## name ## _
|
||||
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
247
config/pnacl/bli_kernel.h
Normal file
247
config/pnacl/bli_kernel.h
Normal file
@@ -0,0 +1,247 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_KERNEL_H
|
||||
#define BLIS_KERNEL_H
|
||||
|
||||
/*
|
||||
* SIMD-enabled (SP only) PNaCl shipped in Chrome 36 and it is not backward-compatible.
|
||||
* Therefore, if compilation targets an older Chrome release, we use scalar kernels.
|
||||
 * The target Chrome version is indicated by the PPAPI_RELEASE macro defined in the header below.
|
||||
*/
|
||||
#include <ppapi/c/pp_macros.h>
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
//
|
||||
// (1) MC must be a multiple of:
|
||||
// (a) MR (for zero-padding purposes)
|
||||
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (2) NC must be a multiple of
|
||||
// (a) NR (for zero-padding purposes)
|
||||
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (3) KC must be a multiple of
|
||||
// (a) MR and
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
#define BLIS_DEFAULT_MC_S 256
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 8192
|
||||
#else
|
||||
#define BLIS_DEFAULT_MC_S 252
|
||||
#define BLIS_DEFAULT_KC_S 264
|
||||
#define BLIS_DEFAULT_NC_S 8196
|
||||
#endif
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 1080
|
||||
#define BLIS_DEFAULT_KC_D 120
|
||||
#define BLIS_DEFAULT_NC_D 8400
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
#define BLIS_DEFAULT_MC_C 128
|
||||
#define BLIS_DEFAULT_KC_C 256
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
#else
|
||||
#define BLIS_DEFAULT_MC_C 120
|
||||
#define BLIS_DEFAULT_KC_C 264
|
||||
#define BLIS_DEFAULT_NC_C 4092
|
||||
#endif
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 60
|
||||
#define BLIS_DEFAULT_KC_Z 264
|
||||
#define BLIS_DEFAULT_NC_Z 2040
|
||||
|
||||
// -- Register blocksizes --
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
#else
|
||||
#define BLIS_DEFAULT_MR_S 4
|
||||
#define BLIS_DEFAULT_NR_S 3
|
||||
#endif
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 4
|
||||
#define BLIS_DEFAULT_NR_D 3
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
#define BLIS_DEFAULT_MR_C 4
|
||||
#define BLIS_DEFAULT_NR_C 4
|
||||
#else
|
||||
#define BLIS_DEFAULT_MR_C 2
|
||||
#define BLIS_DEFAULT_NR_C 3
|
||||
#endif
|
||||
|
||||
#define BLIS_DEFAULT_MR_Z 2
|
||||
#define BLIS_DEFAULT_NR_Z 3
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
// NOTE: These register blocksize "extensions" determine whether the
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt
|
||||
#define BLIS_CGEMM_UKERNEL bli_cgemm_opt
|
||||
#endif
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- packm --
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- addv --
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
// -- copyv --
|
||||
|
||||
// -- dotv --
|
||||
#define BLIS_SDOTV_KERNEL bli_sdotv_opt
|
||||
#define BLIS_DDOTV_KERNEL bli_ddotv_opt
|
||||
#define BLIS_CDOTV_KERNEL bli_cdotv_opt
|
||||
#define BLIS_ZDOTV_KERNEL bli_zdotv_opt
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
// -- invertv --
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
// -- scalv --
|
||||
|
||||
// -- setv --
|
||||
|
||||
// -- subv --
|
||||
|
||||
// -- swapv --
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
1
config/pnacl/kernels
Symbolic link
1
config/pnacl/kernels
Symbolic link
@@ -0,0 +1 @@
|
||||
../../kernels/nacl/pnacl
|
||||
117
config/pnacl/make_defs.mk
Normal file
117
config/pnacl/make_defs.mk
Normal file
@@ -0,0 +1,117 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name of The University of Texas nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
# Only include this block of code once.
|
||||
ifndef MAKE_DEFS_MK_INCLUDED
|
||||
MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := pnacl-ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
|
||||
# --- Determine the C compiler and related flags ---
|
||||
CC := pnacl-clang
|
||||
# Enable IEEE Standard 1003.1-2001 (POSIX.1-2001) feature declarations.
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=gnu11 -I$(NACL_SDK_ROOT)/include
|
||||
CDBGFLAGS := -g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O3
|
||||
CKOPTFLAGS := $(COPTFLAGS) -ffast-math
|
||||
CVECFLAGS :=
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := pnacl-ar
|
||||
ARFLAGS := rcs
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
LDFLAGS := -lm
|
||||
|
||||
# --- Determine the finalizer and related flags ---
|
||||
FINALIZER := pnacl-finalize
|
||||
FINFLAGS :=
|
||||
|
||||
# --- Determine the translator and related flags ---
|
||||
TRANSLATOR := pnacl-translate
|
||||
TRNSFLAGS := -O3
|
||||
TRNSAMD64FLAGS := -arch x86-64
|
||||
TRNSX86FLAGS := -arch i686
|
||||
TRNSARMFLAGS := -arch armv7
|
||||
|
||||
# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
|
||||
endif
|
||||
618
kernels/nacl/pnacl/1/bli_dotv_opt.c
Normal file
618
kernels/nacl/pnacl/1/bli_dotv_opt.c
Normal file
@@ -0,0 +1,618 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#if PPAPI_RELEASE >= 36
/*
 * Small helpers around a 4 x float vector type built on the GCC/Clang
 * vector extension. They are only compiled when PPAPI_RELEASE >= 36.
 *
 * The helpers are declared "static inline" rather than plain "inline".
 * With C99/C11 inline semantics (this configuration compiles with
 * -std=gnu11) a plain "inline" definition provides no external definition,
 * so any call the compiler decides not to inline (for example in a build
 * without optimization) would be left as an undefined reference at link
 * time. "static inline" gives every translation unit its own definition.
 *
 * NOTE(review): the load/store helpers reinterpret the incoming pointer as a
 * pointer to v4sf. Whether that requires 16-byte alignment of the address
 * depends on the target toolchain -- confirm before reusing these helpers
 * with pointers that are only element-aligned.
 */
typedef float v4sf __attribute__ ((vector_size(16)));

/* Returns a vector with all four lanes set to x. */
static inline v4sf v4sf_splat(float x) {
    return (v4sf) { x, x, x, x };
}

/* Loads four consecutive floats starting at a. */
static inline v4sf v4sf_load(const float* a) {
    return *((const v4sf*)a);
}

/* Loads 16 bytes starting at a (an scomplex pointer) as four float lanes. */
static inline v4sf v4sf_cload(const scomplex* a) {
    return *((const v4sf*)a);
}

/* Stores the four lanes of x to consecutive floats starting at a. */
static inline void v4sf_store(float* a, v4sf x) {
    *((v4sf*)a) = x;
}

/* Stores the four lanes of x to the 16 bytes starting at a (an scomplex pointer). */
static inline void v4sf_cstore(scomplex* a, v4sf x) {
    *((v4sf*)a) = x;
}

/* Returns a vector with all four lanes set to zero. */
static inline v4sf v4sf_zero(void) {
    return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
}
#endif
|
||||
|
||||
void bli_sdotv_opt(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float x[restrict static n],
|
||||
inc_t incx,
|
||||
float y[restrict static n],
|
||||
inc_t incy,
|
||||
float rho[restrict static 1])
|
||||
{
|
||||
#if PPAPI_RELEASE >= 36
|
||||
// If the vector lengths are zero, set rho to zero and return.
|
||||
if (bli_zero_dim1(n)) {
|
||||
*rho = 0.0f;
|
||||
return;
|
||||
}
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if (bli_has_nonunit_inc2(incx, incy)) {
|
||||
float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f, sum5 = 0.0f;
|
||||
while (n >= 6) {
|
||||
sum0 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum1 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum2 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum3 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum4 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum5 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 6;
|
||||
}
|
||||
float sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5);
|
||||
while (n--) {
|
||||
sum += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
*rho = sum;
|
||||
} else {
|
||||
v4sf vsum0 = v4sf_zero(), vsum1 = v4sf_zero(), vsum2 = v4sf_zero();
|
||||
v4sf vsum3 = v4sf_zero(), vsum4 = v4sf_zero(), vsum5 = v4sf_zero();
|
||||
while (n >= 24) {
|
||||
vsum0 += v4sf_load(x) * v4sf_load(y);
|
||||
vsum1 += v4sf_load(x+4) * v4sf_load(y+4);
|
||||
vsum2 += v4sf_load(x+8) * v4sf_load(y+8);
|
||||
vsum3 += v4sf_load(x+12) * v4sf_load(y+12);
|
||||
vsum4 += v4sf_load(x+16) * v4sf_load(y+16);
|
||||
vsum5 += v4sf_load(x+20) * v4sf_load(y+20);
|
||||
|
||||
x += 24;
|
||||
y += 24;
|
||||
n -= 24;
|
||||
}
|
||||
v4sf vsum = (vsum0 + vsum1 + vsum2) + (vsum3 + vsum4 + vsum5);
|
||||
while (n >= 4) {
|
||||
vsum += v4sf_load(x) * v4sf_load(y);
|
||||
|
||||
x += 4;
|
||||
y += 4;
|
||||
n -= 4;
|
||||
}
|
||||
float sum = (vsum[0] + vsum[1]) + (vsum[2] + vsum[3]);
|
||||
while (n--) {
|
||||
sum += (*x++) * (*y++);
|
||||
}
|
||||
*rho = sum;
|
||||
}
|
||||
#else
|
||||
float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f, sum5 = 0.0f;
|
||||
while (n >= 6) {
|
||||
sum0 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum1 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum2 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum3 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum4 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum5 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 6;
|
||||
}
|
||||
float sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5);
|
||||
while (n--) {
|
||||
sum += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
*rho = sum;
|
||||
#endif
|
||||
}
|
||||
|
||||
void bli_ddotv_opt(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double x[restrict static n],
|
||||
inc_t incx,
|
||||
double y[restrict static n],
|
||||
inc_t incy,
|
||||
double rho[restrict static 1])
|
||||
{
|
||||
double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0;
|
||||
while (n >= 6) {
|
||||
sum0 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum1 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum2 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum3 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum4 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum5 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 6;
|
||||
}
|
||||
double sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5);
|
||||
while (n--) {
|
||||
sum += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
*rho = sum;
|
||||
}
|
||||
|
||||
void bli_cdotv_opt(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex x[restrict static n],
|
||||
inc_t incx,
|
||||
scomplex y[restrict static n],
|
||||
inc_t incy,
|
||||
scomplex rho[restrict static 1])
|
||||
{
|
||||
if (bli_is_conj(conjy)) {
|
||||
bli_toggle_conj(conjx);
|
||||
}
|
||||
|
||||
if (bli_zero_dim1(n)) {
|
||||
rho->real = 0.0f;
|
||||
rho->imag = 0.0f;
|
||||
return;
|
||||
}
|
||||
|
||||
float sumr;
|
||||
float sumi;
|
||||
#if PPAPI_RELEASE >= 36
|
||||
if (bli_is_noconj(conjx)) {
|
||||
if (bli_has_nonunit_inc2(incx, incy)) {
|
||||
float sum0r = 0.0f, sum1r = 0.0f;
|
||||
float sum0i = 0.0f, sum1i = 0.0f;
|
||||
while (n >= 2) {
|
||||
const float x0r = x->real;
|
||||
const float x0i = x->imag;
|
||||
const float y0r = y->real;
|
||||
const float y0i = y->imag;
|
||||
|
||||
sum0r += x0r * y0r - x0i * y0i;
|
||||
sum0i += x0r * y0i + x0i * y0r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
const float x1r = x->real;
|
||||
const float x1i = x->imag;
|
||||
const float y1r = y->real;
|
||||
const float y1i = y->imag;
|
||||
|
||||
sum1r += x1r * y1r - x1i * y1i;
|
||||
sum1i += x1r * y1i + x1i * y1r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 2;
|
||||
}
|
||||
sumr = sum0r + sum1r;
|
||||
sumi = sum0i + sum1i;
|
||||
} else {
|
||||
v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero();
|
||||
v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero();
|
||||
while (n >= 8) {
|
||||
const v4sf xv0t = v4sf_cload(x);
|
||||
const v4sf xv0b = v4sf_cload(x+2);
|
||||
const v4sf yv0t = v4sf_cload(y);
|
||||
const v4sf yv0b = v4sf_cload(y+2);
|
||||
|
||||
const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6);
|
||||
const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7);
|
||||
const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6);
|
||||
const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7);
|
||||
|
||||
sumv0r += xv0r * yv0r - xv0i * yv0i;
|
||||
sumv0i += xv0r * yv0i + xv0i * yv0r;
|
||||
|
||||
const v4sf xv1t = v4sf_cload(x+4);
|
||||
const v4sf xv1b = v4sf_cload(x+6);
|
||||
const v4sf yv1t = v4sf_cload(y+4);
|
||||
const v4sf yv1b = v4sf_cload(y+6);
|
||||
|
||||
const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6);
|
||||
const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7);
|
||||
const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6);
|
||||
const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7);
|
||||
|
||||
sumv1r += xv1r * yv1r - xv1i * yv1i;
|
||||
sumv1i += xv1r * yv1i + xv1i * yv1r;
|
||||
|
||||
x += 8;
|
||||
y += 8;
|
||||
|
||||
n -= 8;
|
||||
}
|
||||
const v4sf sumvr = sumv0r + sumv1r;
|
||||
const v4sf sumvi = sumv0i + sumv1i;
|
||||
sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]);
|
||||
sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]);
|
||||
}
|
||||
while (n--) {
|
||||
const float xr = x->real;
|
||||
const float xi = x->imag;
|
||||
const float yr = y->real;
|
||||
const float yi = y->imag;
|
||||
|
||||
sumr += xr * yr - xi * yi;
|
||||
sumi += xr * yi + xi * yr;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
} else {
|
||||
if (bli_has_nonunit_inc2(incx, incy)) {
|
||||
float sum0r = 0.0f, sum1r = 0.0f;
|
||||
float sum0i = 0.0f, sum1i = 0.0f;
|
||||
while (n >= 2) {
|
||||
const float x0r = x->real;
|
||||
const float x0i = x->imag;
|
||||
const float y0r = y->real;
|
||||
const float y0i = y->imag;
|
||||
|
||||
sum0r += x0r * y0r + x0i * y0i;
|
||||
sum0i += x0r * y0i - x0i * y0r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
const float x1r = x->real;
|
||||
const float x1i = x->imag;
|
||||
const float y1r = y->real;
|
||||
const float y1i = y->imag;
|
||||
|
||||
sum1r += x1r * y1r + x1i * y1i;
|
||||
sum1i += x1r * y1i - x1i * y1r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 2;
|
||||
}
|
||||
sumr = sum0r + sum1r;
|
||||
sumi = sum0i + sum1i;
|
||||
} else {
|
||||
v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero();
|
||||
v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero();
|
||||
while (n >= 8) {
|
||||
const v4sf xv0t = v4sf_cload(x);
|
||||
const v4sf xv0b = v4sf_cload(x+2);
|
||||
const v4sf yv0t = v4sf_cload(y);
|
||||
const v4sf yv0b = v4sf_cload(y+2);
|
||||
|
||||
const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6);
|
||||
const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7);
|
||||
const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6);
|
||||
const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7);
|
||||
|
||||
sumv0r += xv0r * yv0r + xv0i * yv0i;
|
||||
sumv0i += xv0r * yv0i - xv0i * yv0r;
|
||||
|
||||
const v4sf xv1t = v4sf_cload(x+4);
|
||||
const v4sf xv1b = v4sf_cload(x+6);
|
||||
const v4sf yv1t = v4sf_cload(y+4);
|
||||
const v4sf yv1b = v4sf_cload(y+6);
|
||||
|
||||
const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6);
|
||||
const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7);
|
||||
const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6);
|
||||
const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7);
|
||||
|
||||
sumv1r += xv1r * yv1r + xv1i * yv1i;
|
||||
sumv1i += xv1r * yv1i - xv1i * yv1r;
|
||||
|
||||
x += 8;
|
||||
y += 8;
|
||||
|
||||
n -= 8;
|
||||
}
|
||||
const v4sf sumvr = sumv0r + sumv1r;
|
||||
const v4sf sumvi = sumv0i + sumv1i;
|
||||
sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]);
|
||||
sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]);
|
||||
}
|
||||
while (n--) {
|
||||
const float xr = x->real;
|
||||
const float xi = x->imag;
|
||||
const float yr = y->real;
|
||||
const float yi = y->imag;
|
||||
|
||||
sumr += xr * yr + xi * yi;
|
||||
sumi += xr * yi - xi * yr;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (bli_is_noconj(conjx)) {
|
||||
float sum0r = 0.0f, sum1r = 0.0f;
|
||||
float sum0i = 0.0f, sum1i = 0.0f;
|
||||
while (n >= 2) {
|
||||
const float x0r = x->real;
|
||||
const float x0i = x->imag;
|
||||
const float y0r = y->real;
|
||||
const float y0i = y->imag;
|
||||
|
||||
sum0r += x0r * y0r - x0i * y0i;
|
||||
sum0i += x0r * y0i + x0i * y0r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
const float x1r = x->real;
|
||||
const float x1i = x->imag;
|
||||
const float y1r = y->real;
|
||||
const float y1i = y->imag;
|
||||
|
||||
sum1r += x1r * y1r - x1i * y1i;
|
||||
sum1i += x1r * y1i + x1i * y1r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 2;
|
||||
}
|
||||
sumr = sum0r + sum1r;
|
||||
sumi = sum0i + sum1i;
|
||||
if (n != 0) {
|
||||
const float xr = x->real;
|
||||
const float xi = x->imag;
|
||||
const float yr = y->real;
|
||||
const float yi = y->imag;
|
||||
|
||||
sumr += xr * yr - xi * yi;
|
||||
sumi += xr * yi + xi * yr;
|
||||
}
|
||||
} else {
|
||||
float sum0r = 0.0f, sum1r = 0.0f;
|
||||
float sum0i = 0.0f, sum1i = 0.0f;
|
||||
while (n >= 2) {
|
||||
const float x0r = x->real;
|
||||
const float x0i = x->imag;
|
||||
const float y0r = y->real;
|
||||
const float y0i = y->imag;
|
||||
|
||||
sum0r += x0r * y0r + x0i * y0i;
|
||||
sum0i += x0r * y0i - x0i * y0r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
const float x1r = x->real;
|
||||
const float x1i = x->imag;
|
||||
const float y1r = y->real;
|
||||
const float y1i = y->imag;
|
||||
|
||||
sum1r += x1r * y1r + x1i * y1i;
|
||||
sum1i += x1r * y1i - x1i * y1r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 2;
|
||||
}
|
||||
sumr = sum0r + sum1r;
|
||||
sumi = sum0i + sum1i;
|
||||
if (n != 0) {
|
||||
const float xr = x->real;
|
||||
const float xi = x->imag;
|
||||
const float yr = y->real;
|
||||
const float yi = y->imag;
|
||||
|
||||
sumr += xr * yr + xi * yi;
|
||||
sumi += xr * yi - xi * yr;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
rho->real = sumr;
|
||||
rho->imag = bli_is_conj(conjy) ? -sumi : sumi;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zdotv_opt(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex x[restrict static n],
|
||||
inc_t incx,
|
||||
dcomplex y[restrict static n],
|
||||
inc_t incy,
|
||||
dcomplex rho[restrict static 1])
|
||||
{
|
||||
if (bli_is_conj(conjy)) {
|
||||
bli_toggle_conj(conjx);
|
||||
}
|
||||
|
||||
if (bli_zero_dim1(n)) {
|
||||
rho->real = 0.0;
|
||||
rho->imag = 0.0;
|
||||
return;
|
||||
}
|
||||
|
||||
double sumr;
|
||||
double sumi;
|
||||
if (bli_is_noconj(conjx)) {
|
||||
double sum0r = 0.0, sum1r = 0.0;
|
||||
double sum0i = 0.0, sum1i = 0.0;
|
||||
while (n >= 2) {
|
||||
const double x0r = x->real;
|
||||
const double x0i = x->imag;
|
||||
const double y0r = y->real;
|
||||
const double y0i = y->imag;
|
||||
|
||||
sum0r += x0r * y0r - x0i * y0i;
|
||||
sum0i += x0r * y0i + x0i * y0r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
const double x1r = x->real;
|
||||
const double x1i = x->imag;
|
||||
const double y1r = y->real;
|
||||
const double y1i = y->imag;
|
||||
|
||||
sum1r += x1r * y1r - x1i * y1i;
|
||||
sum1i += x1r * y1i + x1i * y1r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 2;
|
||||
}
|
||||
sumr = sum0r + sum1r;
|
||||
sumi = sum0i + sum1i;
|
||||
if (n != 0) {
|
||||
const double xr = x->real;
|
||||
const double xi = x->imag;
|
||||
const double yr = y->real;
|
||||
const double yi = y->imag;
|
||||
|
||||
sumr += xr * yr - xi * yi;
|
||||
sumi += xr * yi + xi * yr;
|
||||
}
|
||||
} else {
|
||||
double sum0r = 0.0, sum1r = 0.0;
|
||||
double sum0i = 0.0, sum1i = 0.0;
|
||||
while (n >= 2) {
|
||||
const double x0r = x->real;
|
||||
const double x0i = x->imag;
|
||||
const double y0r = y->real;
|
||||
const double y0i = y->imag;
|
||||
|
||||
sum0r += x0r * y0r + x0i * y0i;
|
||||
sum0i += x0r * y0i - x0i * y0r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
const double x1r = x->real;
|
||||
const double x1i = x->imag;
|
||||
const double y1r = y->real;
|
||||
const double y1i = y->imag;
|
||||
|
||||
sum1r += x1r * y1r + x1i * y1i;
|
||||
sum1i += x1r * y1i - x1i * y1r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 2;
|
||||
}
|
||||
sumr = sum0r + sum1r;
|
||||
sumi = sum0i + sum1i;
|
||||
if (n != 0) {
|
||||
const double xr = x->real;
|
||||
const double xi = x->imag;
|
||||
const double yr = y->real;
|
||||
const double yi = y->imag;
|
||||
|
||||
sumr += xr * yr + xi * yi;
|
||||
sumi += xr * yi - xi * yr;
|
||||
}
|
||||
}
|
||||
|
||||
rho->real = sumr;
|
||||
rho->imag = bli_is_conj(conjy) ? -sumi : sumi;
|
||||
}
|
||||
|
||||
386
kernels/nacl/pnacl/3/bli_gemm_opt.c
Normal file
386
kernels/nacl/pnacl/3/bli_gemm_opt.c
Normal file
@@ -0,0 +1,386 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
/* Vector of four single-precision floats (16 bytes). */
typedef float v4sf __attribute__ ((vector_size(16)));

/*
 * Return a vector with all four lanes set to x.
 *
 * Declared `static inline`: a plain `inline` definition in a .c file emits
 * no external symbol under C99 rules, so any call the compiler chooses not
 * to inline would fail at link time.
 */
static inline v4sf v4sf_splat(float x) {
    return (v4sf) { x, x, x, x };
}
|
||||
|
||||
/*
 * Load four consecutive floats starting at a into a vector.
 *
 * `static inline` so the helper has a definition in this translation unit
 * even when a call is not inlined (plain C99 `inline` would not provide one).
 */
static inline v4sf v4sf_load(const float* a) {
    return *((const v4sf*)a);
}
|
||||
|
||||
/*
 * Load 16 bytes starting at a (two scomplex values, assuming scomplex is a
 * pair of floats -- the shuffles in the callers rely on an interleaved
 * real/imag layout) into a vector.
 *
 * `static inline` so the helper has a definition in this translation unit
 * even when a call is not inlined (plain C99 `inline` would not provide one).
 */
static inline v4sf v4sf_cload(const scomplex* a) {
    return *((const v4sf*)a);
}
|
||||
|
||||
/*
 * Store the four lanes of x to four consecutive floats starting at a.
 *
 * `static inline` so the helper has a definition in this translation unit
 * even when a call is not inlined (plain C99 `inline` would not provide one).
 */
static inline void v4sf_store(float* a, v4sf x) {
    *((v4sf*)a) = x;
}
|
||||
|
||||
/*
 * Store the 16 bytes of x starting at a (two scomplex values, assuming
 * scomplex is an interleaved real/imag pair of floats).
 *
 * `static inline` so the helper has a definition in this translation unit
 * even when a call is not inlined (plain C99 `inline` would not provide one).
 */
static inline void v4sf_cstore(scomplex* a, v4sf x) {
    *((v4sf*)a) = x;
}
|
||||
|
||||
inline v4sf v4sf_zero() {
|
||||
return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
|
||||
}
|
||||
|
||||
void bli_sgemm_opt(
|
||||
dim_t k,
|
||||
float alpha[restrict static 1],
|
||||
float a[restrict static 8*k],
|
||||
float b[restrict static k*4],
|
||||
float beta[restrict static 1],
|
||||
float c[restrict static 8*4],
|
||||
inc_t rs_c,
|
||||
inc_t cs_c,
|
||||
auxinfo_t* data)
|
||||
{
|
||||
// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
|
||||
v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero();
|
||||
v4sf abv0b = v4sf_zero(), abv1b = v4sf_zero(), abv2b = v4sf_zero(), abv3b = v4sf_zero();
|
||||
for (dim_t i = 0; i < k; i += 1) {
|
||||
const v4sf avt = v4sf_load(a);
|
||||
const v4sf avb = v4sf_load(a+4);
|
||||
|
||||
const v4sf bv_xxxx = v4sf_splat(b[0]);
|
||||
abv0t += avt * bv_xxxx;
|
||||
abv0b += avb * bv_xxxx;
|
||||
|
||||
const v4sf bv_yyyy = v4sf_splat(b[1]);
|
||||
abv1t += avt * bv_yyyy;
|
||||
abv1b += avb * bv_yyyy;
|
||||
|
||||
const v4sf bv_zzzz = v4sf_splat(b[2]);
|
||||
abv2t += avt * bv_zzzz;
|
||||
abv2b += avb * bv_zzzz;
|
||||
|
||||
const v4sf bv_wwww = v4sf_splat(b[3]);
|
||||
abv3t += avt * bv_wwww;
|
||||
abv3b += avb * bv_wwww;
|
||||
|
||||
a += 8;
|
||||
b += 4;
|
||||
}
|
||||
|
||||
const v4sf alphav = v4sf_splat(*alpha);
|
||||
abv0t *= alphav;
|
||||
abv0b *= alphav;
|
||||
abv1t *= alphav;
|
||||
abv1b *= alphav;
|
||||
abv2t *= alphav;
|
||||
abv2b *= alphav;
|
||||
abv3t *= alphav;
|
||||
abv3b *= alphav;
|
||||
|
||||
if (rs_c == 1) {
|
||||
v4sf cv0t = v4sf_load(&c[0*rs_c + 0*cs_c]);
|
||||
v4sf cv1t = v4sf_load(&c[0*rs_c + 1*cs_c]);
|
||||
v4sf cv2t = v4sf_load(&c[0*rs_c + 2*cs_c]);
|
||||
v4sf cv3t = v4sf_load(&c[0*rs_c + 3*cs_c]);
|
||||
v4sf cv0b = v4sf_load(&c[4*rs_c + 0*cs_c]);
|
||||
v4sf cv1b = v4sf_load(&c[4*rs_c + 1*cs_c]);
|
||||
v4sf cv2b = v4sf_load(&c[4*rs_c + 2*cs_c]);
|
||||
v4sf cv3b = v4sf_load(&c[4*rs_c + 3*cs_c]);
|
||||
|
||||
const v4sf betav = v4sf_splat(*beta);
|
||||
cv0t = cv0t * betav + abv0t;
|
||||
cv1t = cv1t * betav + abv1t;
|
||||
cv2t = cv2t * betav + abv2t;
|
||||
cv3t = cv3t * betav + abv3t;
|
||||
cv0b = cv0b * betav + abv0b;
|
||||
cv1b = cv1b * betav + abv1b;
|
||||
cv2b = cv2b * betav + abv2b;
|
||||
cv3b = cv3b * betav + abv3b;
|
||||
|
||||
v4sf_store(&c[0*rs_c + 0*cs_c], cv0t);
|
||||
v4sf_store(&c[0*rs_c + 1*cs_c], cv1t);
|
||||
v4sf_store(&c[0*rs_c + 2*cs_c], cv2t);
|
||||
v4sf_store(&c[0*rs_c + 3*cs_c], cv3t);
|
||||
v4sf_store(&c[4*rs_c + 0*cs_c], cv0b);
|
||||
v4sf_store(&c[4*rs_c + 1*cs_c], cv1b);
|
||||
v4sf_store(&c[4*rs_c + 2*cs_c], cv2b);
|
||||
v4sf_store(&c[4*rs_c + 3*cs_c], cv3b);
|
||||
} else {
|
||||
// Load columns 0, 1, 2, 3 (top part)
|
||||
v4sf cv0t = (v4sf){ c[0*rs_c + 0*cs_c], c[1*rs_c + 0*cs_c], c[2*rs_c + 0*cs_c], c[3*rs_c + 0*cs_c] };
|
||||
v4sf cv1t = (v4sf){ c[0*rs_c + 1*cs_c], c[1*rs_c + 1*cs_c], c[2*rs_c + 1*cs_c], c[3*rs_c + 1*cs_c] };
|
||||
v4sf cv2t = (v4sf){ c[0*rs_c + 2*cs_c], c[1*rs_c + 2*cs_c], c[2*rs_c + 2*cs_c], c[3*rs_c + 2*cs_c] };
|
||||
v4sf cv3t = (v4sf){ c[0*rs_c + 3*cs_c], c[1*rs_c + 3*cs_c], c[2*rs_c + 3*cs_c], c[3*rs_c + 3*cs_c] };
|
||||
// Load columns 0, 1, 2, 3 (bottom part)
|
||||
v4sf cv0b = (v4sf){ c[4*rs_c + 0*cs_c], c[5*rs_c + 0*cs_c], c[6*rs_c + 0*cs_c], c[7*rs_c + 0*cs_c] };
|
||||
v4sf cv1b = (v4sf){ c[4*rs_c + 1*cs_c], c[5*rs_c + 1*cs_c], c[6*rs_c + 1*cs_c], c[7*rs_c + 1*cs_c] };
|
||||
v4sf cv2b = (v4sf){ c[4*rs_c + 2*cs_c], c[5*rs_c + 2*cs_c], c[6*rs_c + 2*cs_c], c[7*rs_c + 2*cs_c] };
|
||||
v4sf cv3b = (v4sf){ c[4*rs_c + 3*cs_c], c[5*rs_c + 3*cs_c], c[6*rs_c + 3*cs_c], c[7*rs_c + 3*cs_c] };
|
||||
|
||||
const v4sf betav = v4sf_splat(*beta);
|
||||
cv0t = cv0t * betav + abv0t;
|
||||
cv1t = cv1t * betav + abv1t;
|
||||
cv2t = cv2t * betav + abv2t;
|
||||
cv3t = cv3t * betav + abv3t;
|
||||
cv0b = cv0b * betav + abv0b;
|
||||
cv1b = cv1b * betav + abv1b;
|
||||
cv2b = cv2b * betav + abv2b;
|
||||
cv3b = cv3b * betav + abv3b;
|
||||
|
||||
// Store column 0
|
||||
c[0*rs_c + 0*cs_c] = cv0t[0];
|
||||
c[1*rs_c + 0*cs_c] = cv0t[1];
|
||||
c[2*rs_c + 0*cs_c] = cv0t[2];
|
||||
c[3*rs_c + 0*cs_c] = cv0t[3];
|
||||
c[4*rs_c + 0*cs_c] = cv0b[0];
|
||||
c[5*rs_c + 0*cs_c] = cv0b[1];
|
||||
c[6*rs_c + 0*cs_c] = cv0b[2];
|
||||
c[7*rs_c + 0*cs_c] = cv0b[3];
|
||||
|
||||
// Store column 1
|
||||
c[0*rs_c + 1*cs_c] = cv1t[0];
|
||||
c[1*rs_c + 1*cs_c] = cv1t[1];
|
||||
c[2*rs_c + 1*cs_c] = cv1t[2];
|
||||
c[3*rs_c + 1*cs_c] = cv1t[3];
|
||||
c[4*rs_c + 1*cs_c] = cv1b[0];
|
||||
c[5*rs_c + 1*cs_c] = cv1b[1];
|
||||
c[6*rs_c + 1*cs_c] = cv1b[2];
|
||||
c[7*rs_c + 1*cs_c] = cv1b[3];
|
||||
|
||||
// Store column 2
|
||||
c[0*rs_c + 2*cs_c] = cv2t[0];
|
||||
c[1*rs_c + 2*cs_c] = cv2t[1];
|
||||
c[2*rs_c + 2*cs_c] = cv2t[2];
|
||||
c[3*rs_c + 2*cs_c] = cv2t[3];
|
||||
c[4*rs_c + 2*cs_c] = cv2b[0];
|
||||
c[5*rs_c + 2*cs_c] = cv2b[1];
|
||||
c[6*rs_c + 2*cs_c] = cv2b[2];
|
||||
c[7*rs_c + 2*cs_c] = cv2b[3];
|
||||
|
||||
// Store column 3
|
||||
c[0*rs_c + 3*cs_c] = cv3t[0];
|
||||
c[1*rs_c + 3*cs_c] = cv3t[1];
|
||||
c[2*rs_c + 3*cs_c] = cv3t[2];
|
||||
c[3*rs_c + 3*cs_c] = cv3t[3];
|
||||
c[4*rs_c + 3*cs_c] = cv3b[0];
|
||||
c[5*rs_c + 3*cs_c] = cv3b[1];
|
||||
c[6*rs_c + 3*cs_c] = cv3b[2];
|
||||
c[7*rs_c + 3*cs_c] = cv3b[3];
|
||||
}
|
||||
}
|
||||
|
||||
void bli_cgemm_opt(
|
||||
dim_t k,
|
||||
scomplex alpha[restrict static 1],
|
||||
scomplex a[restrict static 4*k],
|
||||
scomplex b[restrict static k*4],
|
||||
scomplex beta[restrict static 1],
|
||||
scomplex c[restrict static 4*4],
|
||||
inc_t rs_c,
|
||||
inc_t cs_c,
|
||||
auxinfo_t* data)
|
||||
{
|
||||
// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
|
||||
v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero();
|
||||
v4sf abv0i = v4sf_zero(), abv1i = v4sf_zero(), abv2i = v4sf_zero(), abv3i = v4sf_zero();
|
||||
for (dim_t i = 0; i < k; i += 1) {
|
||||
const v4sf avt = v4sf_cload(a);
|
||||
const v4sf avb = v4sf_cload(a+2);
|
||||
const v4sf avr = __builtin_shufflevector(avt, avb, 0, 2, 4, 6);
|
||||
const v4sf avi = __builtin_shufflevector(avt, avb, 1, 3, 5, 7);
|
||||
|
||||
const v4sf bv0r = v4sf_splat(b[0].real);
|
||||
const v4sf bv0i = v4sf_splat(b[0].imag);
|
||||
abv0r += avr * bv0r - avi * bv0i;
|
||||
abv0i += avr * bv0i + avi * bv0r;
|
||||
|
||||
const v4sf bv1r = v4sf_splat(b[1].real);
|
||||
const v4sf bv1i = v4sf_splat(b[1].imag);
|
||||
abv1r += avr * bv1r - avi * bv1i;
|
||||
abv1i += avr * bv1i + avi * bv1r;
|
||||
|
||||
const v4sf bv2r = v4sf_splat(b[2].real);
|
||||
const v4sf bv2i = v4sf_splat(b[2].imag);
|
||||
abv2r += avr * bv2r - avi * bv2i;
|
||||
abv2i += avr * bv2i + avi * bv2r;
|
||||
|
||||
const v4sf bv3r = v4sf_splat(b[3].real);
|
||||
const v4sf bv3i = v4sf_splat(b[3].imag);
|
||||
abv3r += avr * bv3r - avi * bv3i;
|
||||
abv3i += avr * bv3i + avi * bv3r;
|
||||
|
||||
a += 4;
|
||||
b += 4;
|
||||
}
|
||||
|
||||
const v4sf alphavr = v4sf_splat(alpha->real);
|
||||
const v4sf alphavi = v4sf_splat(alpha->imag);
|
||||
v4sf temp;
|
||||
|
||||
temp = abv0r * alphavr - abv0i * alphavi;
|
||||
abv0i = abv0r * alphavi + abv0i * alphavr;
|
||||
abv0r = temp;
|
||||
|
||||
temp = abv1r * alphavr - abv1i * alphavi;
|
||||
abv1i = abv1r * alphavi + abv1i * alphavr;
|
||||
abv1r = temp;
|
||||
|
||||
temp = abv2r * alphavr - abv2i * alphavi;
|
||||
abv2i = abv2r * alphavi + abv2i * alphavr;
|
||||
abv2r = temp;
|
||||
|
||||
temp = abv3r * alphavr - abv3i * alphavi;
|
||||
abv3i = abv3r * alphavi + abv3i * alphavr;
|
||||
abv3r = temp;
|
||||
|
||||
if (rs_c == 1) {
|
||||
const v4sf cv0t = v4sf_cload(&c[0*rs_c + 0*cs_c]);
|
||||
const v4sf cv1t = v4sf_cload(&c[0*rs_c + 1*cs_c]);
|
||||
const v4sf cv2t = v4sf_cload(&c[0*rs_c + 2*cs_c]);
|
||||
const v4sf cv3t = v4sf_cload(&c[0*rs_c + 3*cs_c]);
|
||||
const v4sf cv0b = v4sf_cload(&c[2*rs_c + 0*cs_c]);
|
||||
const v4sf cv1b = v4sf_cload(&c[2*rs_c + 1*cs_c]);
|
||||
const v4sf cv2b = v4sf_cload(&c[2*rs_c + 2*cs_c]);
|
||||
const v4sf cv3b = v4sf_cload(&c[2*rs_c + 3*cs_c]);
|
||||
|
||||
v4sf cv0r = __builtin_shufflevector(cv0t, cv0b, 0, 2, 4, 6);
|
||||
v4sf cv0i = __builtin_shufflevector(cv0t, cv0b, 1, 3, 5, 7);
|
||||
v4sf cv1r = __builtin_shufflevector(cv1t, cv1b, 0, 2, 4, 6);
|
||||
v4sf cv1i = __builtin_shufflevector(cv1t, cv1b, 1, 3, 5, 7);
|
||||
v4sf cv2r = __builtin_shufflevector(cv2t, cv2b, 0, 2, 4, 6);
|
||||
v4sf cv2i = __builtin_shufflevector(cv2t, cv2b, 1, 3, 5, 7);
|
||||
v4sf cv3r = __builtin_shufflevector(cv3t, cv3b, 0, 2, 4, 6);
|
||||
v4sf cv3i = __builtin_shufflevector(cv3t, cv3b, 1, 3, 5, 7);
|
||||
|
||||
const v4sf betavr = v4sf_splat(beta->real);
|
||||
const v4sf betavi = v4sf_splat(beta->imag);
|
||||
|
||||
temp = abv0r + cv0r * betavr - cv0i * betavi;
|
||||
cv0i = abv0i + cv0r * betavi + cv0i * betavr;
|
||||
cv0r = temp;
|
||||
|
||||
temp = abv1r + cv1r * betavr - cv1i * betavi;
|
||||
cv1i = abv1i + cv1r * betavi + cv1i * betavr;
|
||||
cv1r = temp;
|
||||
|
||||
temp = abv2r + cv2r * betavr - cv2i * betavi;
|
||||
cv2i = abv2i + cv2r * betavi + cv2i * betavr;
|
||||
cv2r = temp;
|
||||
|
||||
temp = abv3r + cv3r * betavr - cv3i * betavi;
|
||||
cv3i = abv3i + cv3r * betavi + cv3i * betavr;
|
||||
cv3r = temp;
|
||||
|
||||
v4sf_cstore(&c[0*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 0, 4, 1, 5));
|
||||
v4sf_cstore(&c[2*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 2, 6, 3, 7));
|
||||
v4sf_cstore(&c[0*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 0, 4, 1, 5));
|
||||
v4sf_cstore(&c[2*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 2, 6, 3, 7));
|
||||
v4sf_cstore(&c[0*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 0, 4, 1, 5));
|
||||
v4sf_cstore(&c[2*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 2, 6, 3, 7));
|
||||
v4sf_cstore(&c[0*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 0, 4, 1, 5));
|
||||
v4sf_cstore(&c[2*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 2, 6, 3, 7));
|
||||
} else {
|
||||
// Load columns 0, 1, 2, 3 (real part)
|
||||
v4sf cv0r = (v4sf){ c[0*rs_c + 0*cs_c].real, c[1*rs_c + 0*cs_c].real, c[2*rs_c + 0*cs_c].real, c[3*rs_c + 0*cs_c].real };
|
||||
v4sf cv1r = (v4sf){ c[0*rs_c + 1*cs_c].real, c[1*rs_c + 1*cs_c].real, c[2*rs_c + 1*cs_c].real, c[3*rs_c + 1*cs_c].real };
|
||||
v4sf cv2r = (v4sf){ c[0*rs_c + 2*cs_c].real, c[1*rs_c + 2*cs_c].real, c[2*rs_c + 2*cs_c].real, c[3*rs_c + 2*cs_c].real };
|
||||
v4sf cv3r = (v4sf){ c[0*rs_c + 3*cs_c].real, c[1*rs_c + 3*cs_c].real, c[2*rs_c + 3*cs_c].real, c[3*rs_c + 3*cs_c].real };
|
||||
// Load columns 0, 1, 2, 3 (imaginary part)
|
||||
v4sf cv0i = (v4sf){ c[0*rs_c + 0*cs_c].imag, c[1*rs_c + 0*cs_c].imag, c[2*rs_c + 0*cs_c].imag, c[3*rs_c + 0*cs_c].imag };
|
||||
v4sf cv1i = (v4sf){ c[0*rs_c + 1*cs_c].imag, c[1*rs_c + 1*cs_c].imag, c[2*rs_c + 1*cs_c].imag, c[3*rs_c + 1*cs_c].imag };
|
||||
v4sf cv2i = (v4sf){ c[0*rs_c + 2*cs_c].imag, c[1*rs_c + 2*cs_c].imag, c[2*rs_c + 2*cs_c].imag, c[3*rs_c + 2*cs_c].imag };
|
||||
v4sf cv3i = (v4sf){ c[0*rs_c + 3*cs_c].imag, c[1*rs_c + 3*cs_c].imag, c[2*rs_c + 3*cs_c].imag, c[3*rs_c + 3*cs_c].imag };
|
||||
|
||||
const v4sf betavr = v4sf_splat(beta->real);
|
||||
const v4sf betavi = v4sf_splat(beta->imag);
|
||||
|
||||
temp = abv0r + cv0r * betavr - cv0i * betavi;
|
||||
cv0i = abv0i + cv0r * betavi + cv0i * betavr;
|
||||
cv0r = temp;
|
||||
|
||||
temp = abv1r + cv1r * betavr - cv1i * betavi;
|
||||
cv1i = abv1i + cv1r * betavi + cv1i * betavr;
|
||||
cv1r = temp;
|
||||
|
||||
temp = abv2r + cv2r * betavr - cv2i * betavi;
|
||||
cv2i = abv2i + cv2r * betavi + cv2i * betavr;
|
||||
cv2r = temp;
|
||||
|
||||
temp = abv3r + cv3r * betavr - cv3i * betavi;
|
||||
cv3i = abv3i + cv3r * betavi + cv3i * betavr;
|
||||
cv3r = temp;
|
||||
|
||||
// Store column 0
|
||||
c[0*rs_c + 0*cs_c].real = cv0r[0];
|
||||
c[0*rs_c + 0*cs_c].imag = cv0i[0];
|
||||
c[1*rs_c + 0*cs_c].real = cv0r[1];
|
||||
c[1*rs_c + 0*cs_c].imag = cv0i[1];
|
||||
c[2*rs_c + 0*cs_c].real = cv0r[2];
|
||||
c[2*rs_c + 0*cs_c].imag = cv0i[2];
|
||||
c[3*rs_c + 0*cs_c].real = cv0r[3];
|
||||
c[3*rs_c + 0*cs_c].imag = cv0i[3];
|
||||
|
||||
// Store column 1
|
||||
c[0*rs_c + 1*cs_c].real = cv1r[0];
|
||||
c[0*rs_c + 1*cs_c].imag = cv1i[0];
|
||||
c[1*rs_c + 1*cs_c].real = cv1r[1];
|
||||
c[1*rs_c + 1*cs_c].imag = cv1i[1];
|
||||
c[2*rs_c + 1*cs_c].real = cv1r[2];
|
||||
c[2*rs_c + 1*cs_c].imag = cv1i[2];
|
||||
c[3*rs_c + 1*cs_c].real = cv1r[3];
|
||||
c[3*rs_c + 1*cs_c].imag = cv1i[3];
|
||||
|
||||
// Store column 2
|
||||
c[0*rs_c + 2*cs_c].real = cv2r[0];
|
||||
c[0*rs_c + 2*cs_c].imag = cv2i[0];
|
||||
c[1*rs_c + 2*cs_c].real = cv2r[1];
|
||||
c[1*rs_c + 2*cs_c].imag = cv2i[1];
|
||||
c[2*rs_c + 2*cs_c].real = cv2r[2];
|
||||
c[2*rs_c + 2*cs_c].imag = cv2i[2];
|
||||
c[3*rs_c + 2*cs_c].real = cv2r[3];
|
||||
c[3*rs_c + 2*cs_c].imag = cv2i[3];
|
||||
|
||||
// Store column 3
|
||||
c[0*rs_c + 3*cs_c].real = cv3r[0];
|
||||
c[0*rs_c + 3*cs_c].imag = cv3i[0];
|
||||
c[1*rs_c + 3*cs_c].real = cv3r[1];
|
||||
c[1*rs_c + 3*cs_c].imag = cv3i[1];
|
||||
c[2*rs_c + 3*cs_c].real = cv3r[2];
|
||||
c[2*rs_c + 3*cs_c].imag = cv3i[2];
|
||||
c[3*rs_c + 3*cs_c].real = cv3r[3];
|
||||
c[3*rs_c + 3*cs_c].imag = cv3i[3];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -45,7 +45,8 @@
|
||||
#
|
||||
|
||||
.PHONY: all bin clean \
|
||||
check-env check-env-mk check-env-fragments check-env-make-defs
|
||||
check-env check-env-mk check-env-fragments check-env-make-defs \
|
||||
run run-amd64 run-x86 run-arm
|
||||
|
||||
|
||||
|
||||
@@ -241,8 +242,21 @@ TEST_OBJS := $(patsubst $(TEST_SRC_PATH)/%.c, \
|
||||
$(TEST_OBJ_PATH)/%.o, \
|
||||
$(wildcard $(TEST_SRC_PATH)/*.c))
|
||||
|
||||
# Names of the test-suite executables. Every configuration defines TEST_BIN;
# the pnacl configuration additionally names the finalized and translated
# variants that are derived from it.
ifneq ($(CONFIG_NAME),pnacl)
# Test driver executable.
TEST_BIN       := test_libblis.x
else
# Output of the link step (not yet finalized).
TEST_BIN       := test_libblis.unstable.pexe
# Finalized portable executable.
TEST_BIN_PNACL := test_libblis.pexe
# Translated executables, one per target architecture.
TEST_BIN_AMD64 := test_libblis.x86-64.nexe
TEST_BIN_X86   := test_libblis.x86.nexe
TEST_BIN_ARM   := test_libblis.arm.nexe
endif
|
||||
|
||||
# Add installed and local header paths to CFLAGS
|
||||
CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)
|
||||
@@ -257,7 +271,11 @@ CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)
|
||||
|
||||
all: check-env bin
|
||||
|
||||
# `bin` always builds the test driver; under the pnacl configuration it also
# builds the finalized executable and the three translated executables.
bin: check-env $(TEST_BIN)
ifeq ($(CONFIG_NAME),pnacl)
bin: $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM)
endif
|
||||
|
||||
|
||||
# --- Environment check rules ---
|
||||
@@ -301,9 +319,68 @@ else
|
||||
@$(LINKER) $(TEST_OBJS) $(BLIS_LIB) $(LDFLAGS) -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(CONFIG_NAME),pnacl)
|
||||
|
||||
# Finalize the linked executable (LLVM bitcode -> PNaCl bitcode).
# Quiet mode prints a one-line summary instead of the full command.
$(TEST_BIN_PNACL): $(TEST_BIN)
ifneq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
	@echo "Finalizing $@"
	@$(FINALIZER) $(FINFLAGS) -o $@ $<
else
	$(FINALIZER) $(FINFLAGS) -o $@ $<
endif
|
||||
|
||||
# Translate the finalized PNaCl executable into an x86-64 NaCl executable.
# Quiet mode prints a one-line summary instead of the full command.
$(TEST_BIN_AMD64): $(TEST_BIN_PNACL)
ifneq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
	@echo "Translating $< -> $@"
	@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@
else
	$(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@
endif
|
||||
|
||||
|
||||
# Translate the finalized PNaCl executable into an x86 NaCl executable.
# Quiet mode prints a one-line summary instead of the full command.
$(TEST_BIN_X86): $(TEST_BIN_PNACL)
ifneq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
	@echo "Translating $< -> $@"
	@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@
else
	$(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@
endif
|
||||
|
||||
# Translate the finalized PNaCl executable into an ARMv7 NaCl executable.
# Quiet mode prints a one-line summary instead of the full command.
$(TEST_BIN_ARM): $(TEST_BIN_PNACL)
ifneq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
	@echo "Translating $< -> $@"
	@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@
else
	$(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@
endif
|
||||
|
||||
endif
|
||||
|
||||
# -- Test run rules --
|
||||
|
||||
# Outside the pnacl configuration the test driver is run directly. Under
# pnacl there is one run target per translated executable, each launched
# through the matching sel_ldr and IRT image found under $(NACL_SDK_ROOT)/tools.
ifneq ($(CONFIG_NAME),pnacl)
run: $(TEST_BIN)
	./$<
else
run-amd64: $(TEST_BIN_AMD64)
	$(NACL_SDK_ROOT)/tools/sel_ldr_x86_64 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_64.nexe -- $<
run-x86: $(TEST_BIN_X86)
	$(NACL_SDK_ROOT)/tools/sel_ldr_x86_32 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_32.nexe -- $<
run-arm: $(TEST_BIN_ARM)
	$(NACL_SDK_ROOT)/tools/sel_ldr_arm -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_arm.nexe -- $<
endif
|
||||
|
||||
# -- Clean rules --
|
||||
|
||||
# Remove the test objects and every executable this makefile can produce for
# the active configuration. The leading `-` tells make to ignore a failing
# removal command.
clean:
ifeq ($(CONFIG_NAME),pnacl)
	-$(RM_F) $(TEST_OBJS) $(TEST_BIN) $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM)
else
	-$(RM_F) $(TEST_OBJS) $(TEST_BIN)
endif
|
||||
|
||||
|
||||
Reference in New Issue
Block a user