This commit is contained in:
Tyler Smith
2014-07-03 12:52:52 -05:00
40 changed files with 2856 additions and 416 deletions

3
.gitignore vendored
View File

@@ -19,11 +19,12 @@
*.a
# test executables
*.x
*.pexe
*.nexe
# -- build system files --
config.mk
version
# -- makefile fragments --

135
CHANGELOG
View File

@@ -1,4 +1,137 @@
commit 00f232f8ed1f7c41619b12ebf779ebe2c3b2d3cd (HEAD, tag: 0.1.2, origin/master, master)
commit 036cc634918463b1caa0fd89c9a211f2f5639af7 (HEAD, tag: 0.1.3, master)
Author: Field G. Van Zee <field@cs.utexas.edu>
Date: Mon Jun 23 13:48:17 2014 -0500
Version file update (0.1.3)
commit 09d9a3bf6763932d9f571085b2cfd1b8631eccba
Author: Field G. Van Zee <field@cs.utexas.edu>
Date: Mon Jun 23 13:43:26 2014 -0500
Reverting version file to test new version script.
Details:
- Changed version file contents to 0.1.2 so that I can test out a new
version file bumping script.
commit ebb33965981dcb2b0bdee5fc7fdf6c959420f311
Author: Field G. Van Zee <field@cs.utexas.edu>
Date: Mon Jun 23 11:22:50 2014 -0500
Added 'version' file.
commit 2cb9a5501a3cbeb6692cf68e896087ba73b6af69
Author: Field G. Van Zee <field@cs.utexas.edu>
Date: Mon Jun 23 10:42:29 2014 -0500
Removed 'version' from .gitignore file.
commit b40dcefc5ee31f67aa3990e2e9d2ef8ed1386a25 (origin/master)
Merge: 7101a8e b693b0c
Author: Field G. Van Zee <field@cs.utexas.edu>
Date: Mon Jun 23 10:39:05 2014 -0500
Merge pull request #11 from Maratyszcza/stable
[sc]axpy kernels for PNaCl
commit b693b0cddcfb41450e3c09a3ab97acb44c1ccdec
Author: Marat Dukhan <maratek@gmail.com>
Date: Sun Jun 22 13:44:25 2014 -0700
[SC]AXPY kernels for PNaCl
commit 7101a8eec0327d6c3a7eb36eb4b0fd45c1c6d162
Merge: ad48dca 020a831
Author: Field G. Van Zee <field@cs.utexas.edu>
Date: Thu Jun 19 21:46:50 2014 -0500
Merge pull request #10 from Maratyszcza/stable
Portable Native Client port
commit 020a831bc5f61744cb8354886aa679b99b1285f6
Author: Marat Dukhan <maratek@gmail.com>
Date: Thu Jun 19 00:58:26 2014 -0700
Code clean-up in PNaCl port
commit 491be4f91ed725522f5cc7184053857c6c376ada
Author: Marat Dukhan <maratek@gmail.com>
Date: Thu Jun 19 00:45:44 2014 -0700
Optimized dot product kernels for PNaCl
commit 4b8e71aab80182873a2e138eb07902b8d8fd5480
Author: Marat Dukhan <maratek@gmail.com>
Date: Thu Jun 19 00:43:25 2014 -0700
Use AR rcs flags for PNaCl target to avoid warning
commit 031deb2a5c718d569bde842590a791b812f4cf1d
Author: Marat Dukhan <maratek@gmail.com>
Date: Wed Jun 18 03:11:34 2014 -0700
PNaCl configuration: use pnacl-ar instead or ar (fixes build issue on Mac)
commit 68a02976e3c3638f0a9821342e269a1743e3ace3
Author: Marat Dukhan <maratek@gmail.com>
Date: Wed Jun 18 03:10:25 2014 -0700
Compile pnacl configuration in GNU11 mode to avoid warning about non-standard features
commit 6f8462eb0ec278b89731e73ef583386a3371d095
Author: Marat Dukhan <maratek@gmail.com>
Date: Wed Jun 18 03:08:46 2014 -0700
Fix inconsistent VERBOSE macro in Makefile
commit b2ffb4de8b6872cb23537ad282e557d11dcd9c8b
Author: Marat Dukhan <maratek@gmail.com>
Date: Sun Jun 15 18:41:30 2014 -0400
Reformatted PNaCl GEMM kernels
commit 6de2d472d98baa215264a776f3d5291780a6a085
Author: Marat Dukhan <maratek@gmail.com>
Date: Sun Jun 15 08:44:31 2014 -0400
CGEMM and ZGEMM kernels for PNaCl
commit f064711a5e6fb3852c17c7520909b09dc27665f2
Author: Marat Dukhan <maratek@gmail.com>
Date: Sun Jun 15 06:27:37 2014 -0400
SGEMM and DGEMM kernels for PNaCl
commit ad48dca22913a363899f0bef45553898718eebb1
Merge: ee2b679 7118f87
Author: Field G. Van Zee <field@cs.utexas.edu>
Date: Sat Jun 14 15:10:13 2014 -0500
Merge pull request #9 from tkelman/memalign_windows
Use _aligned_malloc instead of posix_memalign on Windows
commit 7118f87e18b4941423472afc00215c1d1f2a1fcd
Author: Tony Kelman <tony@kelman.net>
Date: Sat Jun 14 06:53:20 2014 -0700
Use _aligned_malloc instead of posix_memalign on Windows
commit ee2b679281ca45fb40b2198e293bc3bc3d446632
Author: Tyler Smith <tms@cs.utexas.edu>
Date: Fri Jun 6 12:41:55 2014 -0500
Only include omp.h if BLIS_ENABLE_OPENMP is set
commit 19c05dfaac43c627f86e897c8c00f1f9440754aa
Author: Field G. Van Zee <field@cs.utexas.edu>
Date: Thu Jun 5 10:54:16 2014 -0500
CHANGELOG update (for 0.1.2).
commit 00f232f8ed1f7c41619b12ebf779ebe2c3b2d3cd (tag: 0.1.2)
Author: Tyler Smith <tms@cs.utexas.edu>
Date: Mon Jun 2 13:40:57 2014 -0500

View File

@@ -174,7 +174,7 @@ VERS_CONF := $(VERSION)-$(CONFIG_NAME)
# Note: These names will be modified later to include the configuration and
# version strings.
BLIS_LIB_NAME := $(BLIS_LIB_BASE_NAME).a
#BLIS_DLL_NAME := $(BLIS_LIB_BASE_NAME).so
BLIS_DLL_NAME := $(BLIS_LIB_BASE_NAME).so
# --- BLIS framework source and object variable names ---
@@ -196,8 +196,9 @@ MK_CONFIG_OBJS :=
MK_CONFIG_NOOPT_OBJS :=
MK_CONFIG_KERNELS_OBJS :=
# Append the base library path to the library name.
# Append the base library path to the library names.
MK_ALL_BLIS_LIB := $(BASE_LIB_PATH)/$(BLIS_LIB_NAME)
MK_ALL_BLIS_DLL := $(BASE_LIB_PATH)/$(BLIS_DLL_NAME)
# --- Define install target names for static libraries ---
@@ -209,6 +210,16 @@ MK_BLIS_LIB_INST_W_VERS_CONF := $(patsubst $(BASE_LIB_PATH)/%.a, \
$(INSTALL_PREFIX)/lib/%-$(VERS_CONF).a, \
$(MK_BLIS_LIB))
# --- Define install target names for shared libraries ---
MK_BLIS_DLL := $(MK_ALL_BLIS_DLL)
MK_BLIS_DLL_INST := $(patsubst $(BASE_LIB_PATH)/%.so, \
$(INSTALL_PREFIX)/lib/%.so, \
$(MK_BLIS_DLL))
MK_BLIS_DLL_INST_W_VERS_CONF := $(patsubst $(BASE_LIB_PATH)/%.so, \
$(INSTALL_PREFIX)/lib/%-$(VERS_CONF).so, \
$(MK_BLIS_DLL))
# --- Determine which libraries to build ---
MK_LIBS :=
@@ -221,6 +232,12 @@ MK_LIBS_INST += $(MK_BLIS_LIB_INST)
MK_LIBS_INST_W_VERS_CONF += $(MK_BLIS_LIB_INST_W_VERS_CONF)
endif
ifeq ($(BLIS_ENABLE_DYNAMIC_BUILD),yes)
MK_LIBS += $(MK_BLIS_DLL)
MK_LIBS_INST += $(MK_BLIS_DLL_INST)
MK_LIBS_INST_W_VERS_CONF += $(MK_BLIS_DLL_INST_W_VERS_CONF)
endif
# Strip leading, internal, and trailing whitespace.
MK_LIBS_INST := $(strip $(MK_LIBS_INST))
MK_LIBS_INST_W_VERS_CONF := $(strip $(MK_LIBS_INST_W_VERS_CONF))
@@ -385,12 +402,12 @@ TESTSUITE_BIN := $(TESTSUITE_NAME).x
# --- Uninstall definitions ----------------------------------------------------
#
# This shell command grabs all files named "libblis-*.a" in the installation
# directory and then filters out the name of the library archive for the
# current version/configuration. We consider this remaining set of libraries
# to be "old" and eligible for removal upon running of the uninstall-old
# target.
UNINSTALL_LIBS := $(shell $(FIND) $(INSTALL_PREFIX)/lib/ -name "$(BLIS_LIB_BASE_NAME)-*.a" 2> /dev/null | $(GREP) -v "$(BLIS_LIB_BASE_NAME)-$(VERS_CONF).a" | $(GREP) -v $(BLIS_LIB_NAME))
# This shell command grabs all files named "libblis-*.a" or "libblis-*.so" in
# the installation directory and then filters out the name of the library
# archive for the current version/configuration. We consider this remaining set
# of libraries to be "old" and eligible for removal upon running of the
# uninstall-old target.
# NOTE: find's -name takes a shell glob, in which "[a|so]" is a bracket
# expression matching ONE character ('a', '|', 's' or 'o'); it would match
# ".a", ".s" and ".o" files but never ".so". We therefore use two explicit
# -name tests joined with -o, and likewise two explicit grep patterns to
# filter out the libraries belonging to the current version/configuration.
UNINSTALL_LIBS := $(shell $(FIND) $(INSTALL_PREFIX)/lib/ \( -name "$(BLIS_LIB_BASE_NAME)-*.a" -o -name "$(BLIS_LIB_BASE_NAME)-*.so" \) 2> /dev/null | $(GREP) -v -e "$(BLIS_LIB_BASE_NAME)-$(VERS_CONF)\.a" -e "$(BLIS_LIB_BASE_NAME)-$(VERS_CONF)\.so" | $(GREP) -v $(BLIS_LIB_NAME))
@@ -464,12 +481,15 @@ ifeq ($(MAKE_DEFS_MK_PRESENT),no)
endif
# --- Static library archiver rules ---
# --- All-purpose library rule (static and shared) ---
blis-lib: check-env $(MK_LIBS)
# --- Static library archiver rules ---
$(MK_ALL_BLIS_LIB): $(MK_ALL_BLIS_OBJS)
ifeq ($(FLA_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
$(AR) $(ARFLAGS) $@ $?
$(RANLIB) $@
else
@@ -479,6 +499,17 @@ else
endif
# --- Dynamic library linker rules ---
# Link the shared library from ALL of its object files.
# - We use $^ (every prerequisite) rather than $? (only the prerequisites
#   newer than the target). Unlike 'ar', which updates an existing archive in
#   place, the linker recreates the output from scratch, so passing only the
#   changed objects would yield an incomplete library on incremental rebuilds.
# - $(LDFLAGS) carries libraries (e.g. -lm), so it is placed after the objects
#   that reference them; linkers that resolve symbols left-to-right (or that
#   default to --as-needed) may otherwise discard those libraries.
$(MK_ALL_BLIS_DLL): $(MK_ALL_BLIS_OBJS)
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
	$(LINKER) $(SOFLAGS) -o $@ $^ $(LDFLAGS)
else
	@echo "Dynamically linking $@"
	@$(LINKER) $(SOFLAGS) -o $@ $^ $(LDFLAGS)
endif
# --- Test suite rules ---
testsuite: testsuite-run
@@ -540,6 +571,16 @@ else
@$(INSTALL) -m 0644 $< $@
endif
$(INSTALL_PREFIX)/lib/%-$(VERS_CONF).so: $(BASE_LIB_PATH)/%.so $(CONFIG_MK_FILE)
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
$(INSTALL) -m 0755 -d $(@D)
$(INSTALL) -m 0644 $< $@
else
@echo "Installing $(@F) into $(INSTALL_PREFIX)/lib/"
@$(INSTALL) -m 0755 -d $(@D)
@$(INSTALL) -m 0644 $< $@
endif
# --- Install-symlinks rules ---
@@ -555,6 +596,16 @@ else
@$(MV) $(@F) $(INSTALL_PREFIX)/lib/
endif
$(INSTALL_PREFIX)/lib/%.so: $(INSTALL_PREFIX)/lib/%-$(VERS_CONF).so
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
$(SYMLINK) $(<F) $(@F)
$(MV) $(@F) $(INSTALL_PREFIX)/lib/
else
@echo "Installing symlink $(@F) into $(INSTALL_PREFIX)/lib/"
@$(SYMLINK) $(<F) $(@F)
@$(MV) $(@F) $(INSTALL_PREFIX)/lib/
endif
# --- Query current configuration ---
@@ -569,6 +620,7 @@ ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
- $(FIND) $(BASE_OBJ_CONFIG_PATH) -name "*.o" | $(XARGS) $(RM_F)
- $(FIND) $(BASE_OBJ_FRAME_PATH) -name "*.o" | $(XARGS) $(RM_F)
- $(FIND) $(BASE_LIB_PATH) -name "*.a" | $(XARGS) $(RM_F)
- $(FIND) $(BASE_LIB_PATH) -name "*.so" | $(XARGS) $(RM_F)
else
@echo "Removing .o files from $(BASE_OBJ_CONFIG_PATH)."
@- $(FIND) $(BASE_OBJ_CONFIG_PATH) -name "*.o" | $(XARGS) $(RM_F)
@@ -576,6 +628,8 @@ else
@- $(FIND) $(BASE_OBJ_FRAME_PATH) -name "*.o" | $(XARGS) $(RM_F)
@echo "Removing .a files from $(BASE_LIB_PATH)."
@- $(FIND) $(BASE_LIB_PATH) -name "*.a" | $(XARGS) $(RM_F)
@echo "Removing .so files from $(BASE_LIB_PATH)."
@- $(FIND) $(BASE_LIB_PATH) -name "*.so" | $(XARGS) $(RM_F)
endif
cleantest: check-env

201
build/bump-version.sh Executable file
View File

@@ -0,0 +1,201 @@
#!/bin/bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name of The University of Texas nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
#
# bump-version.sh
#
# Field G. Van Zee
#
#
# print_usage
#
# Prints the usage/help text to standard output and terminates the script
# with exit status 1. Reads the global 'script_name', which main() sets
# before any call to this function.
#
print_usage()
{
	# Echo usage info. All expansions of script_name are quoted so that a
	# script path containing whitespace or glob characters is printed
	# verbatim.
	echo " "
	echo " ${script_name}"
	echo " "
	echo " Field G. Van Zee"
	echo " "
	echo " Performs a series of actions needed when incrementing (bumping) the"
	echo " BLIS version number."
	echo " "
	echo " Usage:"
	echo " ${script_name} [options] new_vers"
	echo " "
	echo " Arguments:"
	echo " "
	echo " new_vers The new version string."
	echo " "
	echo " Options:"
	echo " "
	echo " -d dry-run"
	echo " Go through all the motions, but don't actually make any"
	echo " changes to files or perform any git commits."
	echo " -f VERSFILE version file name"
	echo " Update VERSFILE with new version string instead of default"
	echo " 'version' file."

	# Exit with non-zero exit status
	exit 1
}
#
# main
#
# Bumps the BLIS version: writes the new version string to the version file,
# commits it, tags that commit, regenerates the CHANGELOG via 'make changelog',
# and commits the CHANGELOG. With -d (dry run) every step is announced but no
# file is modified and no git commit/tag is made.
#
# Arguments: [-d] [-f VERSFILE] new_vers
# Returns:   0 on success, or when no '.git' directory is found (nothing is
#            done in that case); 1 if any git or make step fails.
#
main()
{
	# -- BEGIN GLOBAL VARIABLE DECLARATIONS --

	# The name of the script, stripped of any preceding path.
	script_name=${0##*/}

	# The name of the CHANGELOG file.
	changelog_file='CHANGELOG'

	# The name of the default version file.
	version_file_def='version'

	# The name of the specified version file.
	version_file=''

	# Strings used during version query.
	git_commit_str=''
	new_version_str=''

	# The script name to use instead of the $0 when outputting messages.
	output_name=''

	# The git directory.
	gitdir='.git'

	# Whether we are performing a dry run or not.
	dry_run_flag=""

	# -- END GLOBAL VARIABLE DECLARATIONS --

	# Process our command line options. The leading ':' in the option string
	# selects silent error reporting: an unknown option yields '?', and an
	# option that is missing its required argument yields ':' (with the
	# option letter in OPTARG).
	while getopts ":dhf:" opt; do
		case $opt in
			d  ) dry_run_flag="1" ;;
			f  ) version_file=$OPTARG ;;
			h  ) print_usage ;;
			:  ) echo "${script_name}: option -${OPTARG} requires an argument."
			     print_usage ;;
			\? ) print_usage ;;
		esac
	done
	shift $((OPTIND - 1))

	# If a version file name was not given, set version_file to the default
	# value.
	if [ -n "${version_file}" ]; then
		echo "${script_name}: version file specified: '${version_file}'."
	else
		echo "${script_name}: no version file specified; defaulting to '${version_file_def}'."
		version_file="${version_file_def}"
	fi

	# Check the number of arguments after command line option processing.
	if [ "$#" -eq 1 ]; then
		new_version_str=$1
		echo "${script_name}: preparing to bump to version '${new_version_str}'."
	else
		print_usage
	fi

	# Check if the .git dir exists; if it does not, we do nothing.
	if [ ! -d "${gitdir}" ]; then
		echo "${script_name}: could not find '${gitdir}' directory; bailing out."
		return 0
	fi

	echo "${script_name}: found '${gitdir}' directory; assuming git clone."

	git_commit_str=$(git describe --always)
	echo "${script_name}: starting commit: ${git_commit_str}."

	echo "${script_name}: updating version file '${version_file}'."
	if [ -z "$dry_run_flag" ]; then
		echo "${new_version_str}" > "${version_file}"
	fi

	# Commit the version file. If the commit fails (for example because the
	# file content did not change) we must stop here; otherwise the tag below
	# would be attached to the previous commit.
	echo "${script_name}: executing: git commit -m \"Version file update (${new_version_str})\" ${version_file}."
	if [ -z "$dry_run_flag" ]; then
		if ! git commit -m "Version file update (${new_version_str})" "${version_file}"; then
			echo "${script_name}: git commit of '${version_file}' failed; aborting."
			return 1
		fi
	fi

	git_commit_str=$(git describe --always)
	echo "${script_name}: commit to be tagged: ${git_commit_str}."

	echo "${script_name}: executing: git tag ${new_version_str} ${git_commit_str}."
	if [ -z "$dry_run_flag" ]; then
		if ! git tag "${new_version_str}" "${git_commit_str}"; then
			echo "${script_name}: git tag '${new_version_str}' failed; aborting."
			return 1
		fi
	fi

	# Regenerate the CHANGELOG only after tagging so that the new tag is
	# already in place when 'make changelog' runs.
	echo "${script_name}: updating ${changelog_file}."
	if [ -z "$dry_run_flag" ]; then
		if ! make changelog; then
			echo "${script_name}: 'make changelog' failed; aborting."
			return 1
		fi
	fi

	echo "${script_name}: executing: git commit -m \"CHANGELOG update (${new_version_str})\" ${changelog_file}."
	if [ -z "$dry_run_flag" ]; then
		if ! git commit -m "CHANGELOG update (${new_version_str})" "${changelog_file}"; then
			echo "${script_name}: git commit of '${changelog_file}' failed; aborting."
			return 1
		fi
	fi

	git_commit_str=$(git describe --always)
	echo "${script_name}: latest commit: ${git_commit_str}."

	# Exit peacefully.
	return 0
}
# The script's main entry point, passing all parameters given.
main "$@"

View File

@@ -81,6 +81,7 @@ CC := gcc
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -O3 -mfloat-abi=hard -mfpu=vfpv3 -marm -march=armv7-a #-g
CPICFLAGS := -fPIC
CDBGFLAGS := #-g
CWARNFLAGS := -Wall
COPTFLAGS := -marm -march=armv7-a -mfpu=vfpv3 -O3 -mfloat-abi=hard #-g
@@ -90,9 +91,9 @@ CVECFLAGS := #-msse3 # -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
@@ -100,6 +101,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm

View File

@@ -82,6 +82,7 @@ CC := /bgsys/drivers/ppcfloor/comm/gcc.legacy/bin/mpixlc_r
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L \
-I/bgsys/drivers/ppcfloor -I/bgsys/drivers/ppcfloor/spi/include/kernel/cnk
CMISCFLAGS := -qthreaded -qsmp=omp -qasm=gcc -qkeyword=asm # -qreport -qsource -qlistopt -qlist
CPICFLAGS :=
CDBGFLAGS :=
CWARNFLAGS := -w
COPTFLAGS := -O3
@@ -91,9 +92,9 @@ CVECFLAGS := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunr
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar

View File

@@ -81,6 +81,7 @@ CC := gcc
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -mfloat-abi=hard -mfpu=neon
CPICFLAGS := -fPIC
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -march=armv7-a -mfpu=neon -O2
@@ -90,9 +91,9 @@ CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
@@ -100,6 +101,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm

View File

@@ -81,6 +81,7 @@ CC := gcc
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -mfloat-abi=hard -mfpu=neon
CPICFLAGS := -fPIC
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -march=armv7-a -mfpu=neon -O2 -mfloat-abi=hard
@@ -90,9 +91,9 @@ CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
@@ -100,6 +101,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm

View File

@@ -81,6 +81,7 @@ CC := gcc
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 # -fopenmp -pg
CPICFLAGS := -fPIC
CDBGFLAGS := #-g
CWARNFLAGS := -Wall
COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer
@@ -90,9 +91,9 @@ CVECFLAGS := -msse3 -march=native
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
@@ -100,6 +101,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm

View File

@@ -81,6 +81,7 @@ CC := gcc
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -mabi=64
CMISCFLAGS := -std=c99 -fopenmp #-pg
CPICFLAGS := -fPIC
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -O3 -march=loongson3a -mtune=loongson3a
@@ -90,9 +91,9 @@ CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
@@ -100,6 +101,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm

View File

@@ -79,6 +79,7 @@ GIT_LOG := $(GIT) log --decorate
CC := icc
CPPROCFLAGS :=
CMISCFLAGS := -mmic -fasm-blocks -std=c99 -openmp
CPICFLAGS := -fPIC
CDBGFLAGS :=
CWARNFLAGS := -Wall
COPTFLAGS := -O3
@@ -88,9 +89,9 @@ CVECFLAGS :=
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
@@ -98,6 +99,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -mmic -lm -openmp

View File

@@ -81,6 +81,7 @@ CC := gcc
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -fopenmp
CPICFLAGS := -fPIC
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -O0 -malign-double -funroll-all-loops
@@ -90,9 +91,9 @@ CVECFLAGS := -mavx -mfma -march=bdver2 -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
@@ -100,6 +101,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm

165
config/pnacl/bli_config.h Normal file
View File

@@ -0,0 +1,165 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H
// -- OPERATING SYSTEM ---------------------------------------------------------
// -- INTEGER PROPERTIES -------------------------------------------------------
// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
#define BLIS_INT_TYPE_SIZE 32
// -- FLOATING-POINT PROPERTIES ------------------------------------------------
// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES 4
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
//#define BLIS_ENABLE_C99_COMPLEX
// -- MULTITHREADING -----------------------------------------------------------
// The maximum number of BLIS threads that will run concurrently.
#define BLIS_MAX_NUM_THREADS 1
// -- MEMORY ALLOCATION --------------------------------------------------------
// -- Contiguous (static) memory allocator --
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
// contiguous memory pools.
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_MC_X_NC_BLOCKS 0
// The maximum preload byte offset is used to pad the end of the contiguous
// memory pools so that the micro-kernel, when computing with the end of the
// last block, can exceed the bounds of the usable portion of the memory
// region without causing a segmentation fault.
#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
// -- Memory alignment --
// It is sometimes useful to define the various memory alignments in terms
// of some other characteristics of the system, such as the cache line size
// and the page size.
#define BLIS_CACHE_LINE_SIZE 64
#define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 16
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when allocating memory dynamically from the operating
// system (eg: posix_memalign()). To disable heap alignment and just use
// malloc() instead, set this to 1.
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when sizing leading dimensions of dynamically
// allocated memory.
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
// Alignment size used when allocating entire blocks of contiguous memory
// from the contiguous memory allocator.
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
// Basic (homogeneous) datatype support always enabled.
// Enable mixed domain operations?
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
// Enable extra mixed precision operations?
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
// Stay initialized after auto-initialization, unless and until the user
// explicitly calls bli_finalize().
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
// Enable the BLAS compatibility layer?
#define BLIS_ENABLE_BLAS2BLIS
// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
// Fortran-77 name-mangling macros.
// Each macro uses the token-pasting operator (##) to assemble a symbol name
// that ends in a single trailing underscore:
//   PASTEF770(name)          ->  name_
//   PASTEF77(ch1,name)       ->  <ch1><name>_
//   PASTEF772(ch1,ch2,name)  ->  <ch1><ch2><name>_
#define PASTEF770(name) name ## _
#define PASTEF77(ch1,name) ch1 ## name ## _
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
#endif

251
config/pnacl/bli_kernel.h Normal file
View File

@@ -0,0 +1,251 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_KERNEL_H
#define BLIS_KERNEL_H
/*
* SIMD-enabled PNaCl (single precision only) shipped in Chrome 36 and is not
* backward-compatible. Therefore, if compilation targets an older Chrome
* release, we use scalar kernels. The target Chrome version is indicated by
* the PPAPI_RELEASE macro defined in the header below.
*/
#include <ppapi/c/pp_macros.h>
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Cache blocksizes --
//
// Constraints:
//
// (1) MC must be a multiple of:
// (a) MR (for zero-padding purposes)
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
// (2) NC must be a multiple of
// (a) NR (for zero-padding purposes)
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
// (3) KC must be a multiple of
// (a) MR and
// (b) NR (for triangular operations such as trmm and trsm).
//
// Only the _S and _C blocksizes depend on PPAPI_RELEASE; the _D and _Z
// values are the same for every target release. If PPAPI_RELEASE is not
// defined, the preprocessor evaluates it as 0 and the #else values apply.
//
// _S cache blocksizes: the first set pairs with MR x NR = 8 x 4 below, the
// fallback set pairs with MR x NR = 4 x 3.
#if PPAPI_RELEASE >= 36
#define BLIS_DEFAULT_MC_S 256
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 8192
#else
#define BLIS_DEFAULT_MC_S 252
#define BLIS_DEFAULT_KC_S 264
#define BLIS_DEFAULT_NC_S 8196
#endif
// _D cache blocksizes (pair with MR x NR = 4 x 3 below).
#define BLIS_DEFAULT_MC_D 1080
#define BLIS_DEFAULT_KC_D 120
#define BLIS_DEFAULT_NC_D 8400
// _C cache blocksizes: the first set pairs with MR x NR = 4 x 4 below, the
// fallback set pairs with MR x NR = 2 x 3.
#if PPAPI_RELEASE >= 36
#define BLIS_DEFAULT_MC_C 128
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4096
#else
#define BLIS_DEFAULT_MC_C 120
#define BLIS_DEFAULT_KC_C 264
#define BLIS_DEFAULT_NC_C 4092
#endif
// _Z cache blocksizes (pair with MR x NR = 2 x 3 below).
#define BLIS_DEFAULT_MC_Z 60
#define BLIS_DEFAULT_KC_Z 264
#define BLIS_DEFAULT_NC_Z 2040
// -- Register blocksizes --
// NOTE: Changing any MR/NR value requires re-checking the cache blocksizes
// above against the multiple-of-MR/NR constraints listed at the top of this
// section.
#if PPAPI_RELEASE >= 36
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
#else
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 3
#endif
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 3
#if PPAPI_RELEASE >= 36
#define BLIS_DEFAULT_MR_C 4
#define BLIS_DEFAULT_NR_C 4
#else
#define BLIS_DEFAULT_MR_C 2
#define BLIS_DEFAULT_NR_C 3
#endif
#define BLIS_DEFAULT_MR_Z 2
#define BLIS_DEFAULT_NR_Z 3
// NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Register blocksize extensions (for packed micro-panels) --
// NOTE: These register blocksize "extensions" determine whether the
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
//#define BLIS_EXTEND_MR_S 0
//#define BLIS_EXTEND_NR_S 0
//#define BLIS_EXTEND_MR_D 0
//#define BLIS_EXTEND_NR_D 0
//#define BLIS_EXTEND_MR_C 0
//#define BLIS_EXTEND_NR_C 0
//#define BLIS_EXTEND_MR_Z 0
//#define BLIS_EXTEND_NR_Z 0
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- gemm --
// The s/c gemm micro-kernel overrides are only defined when the target
// release is 36 or newer; for older targets no override is made here.
// NOTE(review): presumably the framework then falls back to its default
// (reference) micro-kernels -- confirm in the framework headers.
#if PPAPI_RELEASE >= 36
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt
#define BLIS_CGEMM_UKERNEL bli_cgemm_opt
#endif
// -- trsm-related --
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm --
// -- unpackm --
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
// -- axpy2v --
// -- dotaxpyv --
// -- axpyf --
// -- dotxf --
// -- dotxaxpyf --
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
// -- addv --
// -- axpyv --
// As with gemm, the s/c axpyv overrides are gated on PPAPI_RELEASE >= 36.
#if PPAPI_RELEASE >= 36
#define BLIS_SAXPYV_KERNEL bli_saxpyv_opt
#define BLIS_CAXPYV_KERNEL bli_caxpyv_opt
#endif
// -- copyv --
// -- dotv --
// The dotv overrides are unconditional and cover all four type prefixes
// (s, d, c, z), regardless of the target release.
#define BLIS_SDOTV_KERNEL bli_sdotv_opt
#define BLIS_DDOTV_KERNEL bli_ddotv_opt
#define BLIS_CDOTV_KERNEL bli_cdotv_opt
#define BLIS_ZDOTV_KERNEL bli_zdotv_opt
// -- dotxv --
// -- invertv --
// -- scal2v --
// -- scalv --
// -- setv --
// -- subv --
// -- swapv --
#endif

1
config/pnacl/kernels Symbolic link
View File

@@ -0,0 +1 @@
../../kernels/nacl/pnacl

119
config/pnacl/make_defs.mk Normal file
View File

@@ -0,0 +1,119 @@
#!/bin/bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name of The University of Texas nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Only include this block of code once.
ifndef MAKE_DEFS_MK_INCLUDED
MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
# This configuration builds a static library only; the PIC and shared-object
# flag variables further below are correspondingly left empty.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
# Use the PNaCl toolchain's ranlib rather than the host's.
RANLIB := pnacl-ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
CC := pnacl-clang
# Request the POSIX.1-2001 feature set (_POSIX_C_SOURCE=200112L).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
# NACL_SDK_ROOT is not set in this file and must be provided by the
# environment or the including makefile.
# NOTE(review): presumably this include path is what lets the configuration's
# bli_kernel.h find <ppapi/c/pp_macros.h> -- confirm against the SDK layout.
CMISCFLAGS := -std=gnu11 -I$(NACL_SDK_ROOT)/include
CPICFLAGS :=
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -O3
# Kernel sources additionally get -ffast-math on top of the standard
# optimization flags.
CKOPTFLAGS := $(COPTFLAGS) -ffast-math
CVECFLAGS :=
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := pnacl-ar
ARFLAGS := rcs
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS :=
LDFLAGS := -lm
# --- Determine the finalizer and related flags ---
# NOTE(review): presumably used to finalize linked test executables into
# .pexe files -- confirm where FINALIZER is consumed.
FINALIZER := pnacl-finalize
FINFLAGS :=
# --- Determine the translator and related flags ---
# One set of architecture flags per translation target (x86-64, i686, armv7).
# NOTE(review): presumably used to translate .pexe files into per-architecture
# .nexe files -- confirm where TRANSLATOR is consumed.
TRANSLATOR := pnacl-translate
TRNSFLAGS := -O3
TRNSAMD64FLAGS := -arch x86-64
TRNSX86FLAGS := -arch i686
TRNSARMFLAGS := -arch armv7
# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
endif

View File

@@ -80,8 +80,8 @@ CC := gcc
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 # -fopenmp -pg
CMISCFLAGS += -m64 -mcpu=power7
CMISCFLAGS := -std=c99 -m64 -mcpu=power7 #-fopenmp -pg
CPICFLAGS := -fPIC
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -O3 -mtune=power7
@@ -91,9 +91,9 @@ CVECFLAGS := -mvsx
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
@@ -101,6 +101,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm

View File

@@ -81,7 +81,8 @@ CC := gcc
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 # -fopenmp -pg
CDBGFLAGS := -g
CPICFLAGS := -fPIC
CDBGFLAGS := #-g
CWARNFLAGS := -Wall
COPTFLAGS := -O2
CKOPTFLAGS := $(COPTFLAGS)
@@ -90,9 +91,9 @@ CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
@@ -100,6 +101,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm

View File

@@ -81,6 +81,7 @@ CC := gcc
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -m64 -fopenmp # -fopenmp -pg
CPICFLAGS := -fPIC
CDBGFLAGS := #-g
CWARNFLAGS := -Wall
COPTFLAGS := -O3 -march=native
@@ -90,9 +91,9 @@ CVECFLAGS := -mavx -mfpmath=sse #-msse3 -march=native # -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
@@ -100,6 +101,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm

View File

@@ -81,6 +81,7 @@ CC := gcc
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 # -fopenmp -pg
CPICFLAGS := -fPIC
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -O2
@@ -90,9 +91,9 @@ CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
@@ -100,6 +101,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm

View File

@@ -38,6 +38,8 @@
#include "bli_gemm_int.h"
#include "bli_gemm_target.h"
#include "bli_gemm_ukernel.h"
#include "bli_gemm_blk_var1f.h"
#include "bli_gemm_blk_var2f.h"
#include "bli_gemm_blk_var3f.h"

View File

@@ -0,0 +1,126 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type shared by the void*-based gemm micro-kernel
// wrappers defined at the bottom of this file.
//
// FUNCPTR_T is a file-local alias that is reused, with a different
// expansion, in the sibling micro-kernel files. Those files #undef it
// before redefining it; do the same here so that an earlier definition
// reaching this point can never cause a macro redefinition diagnostic.
#undef FUNCPTR_T
#define FUNCPTR_T gemm_ukr_fp

typedef void (*FUNCPTR_T)(
                           dim_t      k,
                           void*      alpha,
                           void*      a,
                           void*      b,
                           void*      beta,
                           void*      c, inc_t rs_c, inc_t cs_c,
                           auxinfo_t* data
                         );

// Table of wrapper function pointers; bli_gemm_ukernel() indexes it with
// the num_t datatype of C.
static FUNCPTR_T GENARRAY(ftypes,gemm_ukernel_void);
//
// Object-based front-end to the gemm micro-kernel.
//
// Extracts raw buffers, strides and scalars from the operand objects and
// invokes the type-specific void* wrapper selected by the datatype of c.
//   alpha, beta - scalar objects; their buffers are fetched for c's datatype.
//   a, b        - operand objects; k is taken from the width of a.
//   c           - output object; supplies the datatype and row/column strides.
//
void bli_gemm_ukernel( obj_t*  alpha,
                       obj_t*  a,
                       obj_t*  b,
                       obj_t*  beta,
                       obj_t*  c )
{
	// The datatype of C drives both scalar lookup and kernel dispatch.
	num_t     dt_c      = bli_obj_datatype( *c );

	// Operand buffers.
	void*     a_ptr     = bli_obj_buffer_at_off( *a );
	void*     b_ptr     = bli_obj_buffer_at_off( *b );
	void*     c_ptr     = bli_obj_buffer_at_off( *c );

	// Scalars, obtained for the datatype of C.
	void*     alpha_ptr = bli_obj_buffer_for_1x1( dt_c, *alpha );
	void*     beta_ptr  = bli_obj_buffer_for_1x1( dt_c, *beta );

	// Inner dimension and the strides of C.
	dim_t     k_dim     = bli_obj_width( *a );
	inc_t     c_rs      = bli_obj_row_stride( *c );
	inc_t     c_cs      = bli_obj_col_stride( *c );

	// Panel strides of the A and B operands.
	inc_t     a_ps      = bli_obj_panel_stride( *a );
	inc_t     b_ps      = bli_obj_panel_stride( *b );

	auxinfo_t aux;
	FUNCPTR_T ukr;

	// Populate the auxiliary info in case the micro-kernel consults it.
	// The "next" panel addresses are simply the current A and B buffers.
	bli_auxinfo_set_ps_a( a_ps, aux );
	bli_auxinfo_set_ps_b( b_ps, aux );
	bli_auxinfo_set_next_a( a_ptr, aux );
	bli_auxinfo_set_next_b( b_ptr, aux );

	// Look up the wrapper for this datatype and call it.
	ukr = ftypes[dt_c];

	ukr( k_dim,
	     alpha_ptr,
	     a_ptr,
	     b_ptr,
	     beta_ptr,
	     c_ptr, c_rs, c_cs,
	     &aux );
}
// Template for the void*-based gemm micro-kernel wrappers.
// For a given type character ch, GENTFUNC defines a function named
// PASTEMAC(ch,varname) whose operands are all void* and which forwards every
// argument, unchanged and in the same order, to PASTEMAC(ch,ukrname).
// These wrappers are what populate the ftypes table used by
// bli_gemm_ukernel().
// NOTE(review): INSERT_GENTFUNC_BASIC presumably instantiates the template
// once per supported datatype -- confirm against its definition.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
void* alpha, \
void* a, \
void* b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
) \
{ \
PASTEMAC(ch,ukrname)( k, \
alpha, \
a, \
b, \
beta, \
c, rs_c, cs_c, \
data ); \
}
INSERT_GENTFUNC_BASIC( gemm_ukernel_void, GEMM_UKERNEL )

View File

@@ -0,0 +1,60 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Object-based front-end to the gemm micro-kernel: unpacks buffers, strides
// and scalars from the objects and calls the wrapper matching c's datatype.
//
void bli_gemm_ukernel( obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c );
//
// Prototype the void pointer kernel wrappers.
//
// GENTPROT declares PASTEMAC(ch,varname) for a type character ch; the
// parameter list must stay in sync with the matching GENTFUNC definition in
// the corresponding .c file.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
void* alpha, \
void* a, \
void* b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
);
INSERT_GENTPROT_BASIC( gemm_ukernel_void )

View File

@@ -0,0 +1,141 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type shared by the void*-based gemmtrsm micro-kernel
// wrappers defined at the bottom of this file. FUNCPTR_T is a file-local
// alias, so any earlier definition is discarded first.
#undef FUNCPTR_T
#define FUNCPTR_T gemmtrsm_ukr_fp
typedef void (*FUNCPTR_T)(
dim_t k,
void* alpha,
void* a1x,
void* a11,
void* bx1,
void* b11,
void* c11, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
);
// Wrapper tables for the lower (ftypes_l) and upper (ftypes_u) variants;
// bli_gemmtrsm_ukernel() indexes them with the num_t datatype of c11.
static FUNCPTR_T GENARRAY(ftypes_l,gemmtrsm_l_ukernel_void);
static FUNCPTR_T GENARRAY(ftypes_u,gemmtrsm_u_ukernel_void);
//
// Object-based front-end to the fused gemmtrsm micro-kernels.
//
// Extracts raw buffers, strides and the alpha scalar from the operand
// objects, then invokes the lower or upper wrapper (chosen by whether a11
// is lower-stored) for the datatype of c11.
//   alpha     - scalar object; its buffer is fetched for c11's datatype.
//   a1x, a11  - A operands; k is taken from the width of a1x.
//   bx1, b11  - B operands.
//   c11       - output object; supplies the datatype and row/column strides.
//
void bli_gemmtrsm_ukernel( obj_t*  alpha,
                           obj_t*  a1x,
                           obj_t*  a11,
                           obj_t*  bx1,
                           obj_t*  b11,
                           obj_t*  c11 )
{
	// The datatype of C11 drives both scalar lookup and kernel dispatch.
	num_t     dt_c      = bli_obj_datatype( *c11 );
	dim_t     k_dim     = bli_obj_width( *a1x );

	// Operand buffers.
	void*     a1x_ptr   = bli_obj_buffer_at_off( *a1x );
	void*     a11_ptr   = bli_obj_buffer_at_off( *a11 );
	void*     bx1_ptr   = bli_obj_buffer_at_off( *bx1 );
	void*     b11_ptr   = bli_obj_buffer_at_off( *b11 );
	void*     c11_ptr   = bli_obj_buffer_at_off( *c11 );

	// Strides of C11 and the alpha scalar.
	inc_t     c_rs      = bli_obj_row_stride( *c11 );
	inc_t     c_cs      = bli_obj_col_stride( *c11 );
	void*     alpha_ptr = bli_obj_buffer_for_1x1( dt_c, *alpha );

	// Panel strides come from the a1x and bx1 operands.
	inc_t     a_ps      = bli_obj_panel_stride( *a1x );
	inc_t     b_ps      = bli_obj_panel_stride( *bx1 );

	auxinfo_t aux;
	FUNCPTR_T ukr;

	// The lower/upper distinction decides two things at once: which A
	// buffer is recorded as the "next" A address in the auxiliary info
	// (a1x for lower, a11 for upper), and which wrapper table is used.
	if ( bli_obj_is_lower( *a11 ) )
	{
		bli_auxinfo_set_next_a( a1x_ptr, aux );
		ukr = ftypes_l[dt_c];
	}
	else
	{
		bli_auxinfo_set_next_a( a11_ptr, aux );
		ukr = ftypes_u[dt_c];
	}

	// Remaining auxiliary info fields, in case the micro-kernel uses them.
	bli_auxinfo_set_next_b( bx1_ptr, aux );
	bli_auxinfo_set_ps_a( a_ps, aux );
	bli_auxinfo_set_ps_b( b_ps, aux );

	// Call the selected wrapper.
	ukr( k_dim,
	     alpha_ptr,
	     a1x_ptr,
	     a11_ptr,
	     bx1_ptr,
	     b11_ptr,
	     c11_ptr, c_rs, c_cs,
	     &aux );
}
// Template for the void*-based gemmtrsm micro-kernel wrappers.
// For a given type character ch, GENTFUNC defines a function named
// PASTEMAC(ch,varname) whose operands are all void* and which forwards every
// argument, unchanged and in the same order, to PASTEMAC(ch,ukrname).
// It is instantiated twice below: once for the lower and once for the upper
// variant, populating the ftypes_l and ftypes_u tables respectively.
// NOTE(review): INSERT_GENTFUNC_BASIC presumably instantiates the template
// once per supported datatype -- confirm against its definition.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
void* alpha, \
void* a1x, \
void* a11, \
void* bx1, \
void* b11, \
void* c11, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
) \
{ \
PASTEMAC(ch,ukrname)( k, \
alpha, \
a1x, \
a11, \
bx1, \
b11, \
c11, rs_c, cs_c, \
data ); \
}
INSERT_GENTFUNC_BASIC( gemmtrsm_l_ukernel_void, GEMMTRSM_L_UKERNEL )
INSERT_GENTFUNC_BASIC( gemmtrsm_u_ukernel_void, GEMMTRSM_U_UKERNEL )

View File

@@ -0,0 +1,63 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Object-based front-end to the fused gemmtrsm micro-kernels: unpacks
// buffers, strides and alpha from the objects and calls the lower or upper
// wrapper (chosen by whether a11 is lower-stored) for c11's datatype.
//
void bli_gemmtrsm_ukernel( obj_t* alpha,
obj_t* a1x,
obj_t* a11,
obj_t* bx1,
obj_t* b11,
obj_t* c11 );
//
// Prototype the void pointer kernel wrappers.
//
// GENTPROT declares PASTEMAC(ch,varname) for a type character ch; the
// parameter list must stay in sync with the matching GENTFUNC definition in
// the corresponding .c file.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
void* alpha, \
void* a1x, \
void* a11, \
void* bx1, \
void* b11, \
void* c11, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
);
INSERT_GENTPROT_BASIC( gemmtrsm_l_ukernel_void )
INSERT_GENTPROT_BASIC( gemmtrsm_u_ukernel_void )

View File

@@ -37,6 +37,9 @@
#include "bli_trsm_front.h"
#include "bli_trsm_int.h"
#include "bli_gemmtrsm_ukernel.h"
#include "bli_trsm_ukernel.h"
#include "bli_trsm_blk_var1f.h"
#include "bli_trsm_blk_var1b.h"

View File

@@ -0,0 +1,111 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type shared by the void*-based trsm micro-kernel wrappers
// defined at the bottom of this file. FUNCPTR_T is a file-local alias, so
// any earlier definition is discarded first.
#undef FUNCPTR_T
#define FUNCPTR_T trsm_ukr_fp
typedef void (*FUNCPTR_T)(
void* a,
void* b,
void* c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
);
// Wrapper tables for the lower (ftypes_l) and upper (ftypes_u) variants;
// bli_trsm_ukernel() indexes them with the num_t datatype of c.
static FUNCPTR_T GENARRAY(ftypes_l,trsm_l_ukernel_void);
static FUNCPTR_T GENARRAY(ftypes_u,trsm_u_ukernel_void);
//
// Object-based front-end to the trsm micro-kernels.
//
// Extracts raw buffers and strides from the operand objects, then invokes
// the lower or upper wrapper (chosen by whether a is lower-stored) for the
// datatype of c.
//   a, b - operand objects.
//   c    - output object; supplies the datatype and row/column strides.
//
void bli_trsm_ukernel( obj_t*  a,
                       obj_t*  b,
                       obj_t*  c )
{
	// The datatype of C drives kernel dispatch.
	num_t     dt_c  = bli_obj_datatype( *c );

	// Operand buffers.
	void*     a_ptr = bli_obj_buffer_at_off( *a );
	void*     b_ptr = bli_obj_buffer_at_off( *b );
	void*     c_ptr = bli_obj_buffer_at_off( *c );

	// Strides of C and the panel strides of A and B.
	inc_t     c_rs  = bli_obj_row_stride( *c );
	inc_t     c_cs  = bli_obj_col_stride( *c );
	inc_t     a_ps  = bli_obj_panel_stride( *a );
	inc_t     b_ps  = bli_obj_panel_stride( *b );

	auxinfo_t aux;

	// Pick the lower or upper wrapper for this datatype.
	FUNCPTR_T ukr   = bli_obj_is_lower( *a ) ? ftypes_l[dt_c]
	                                         : ftypes_u[dt_c];

	// Populate the auxiliary info in case the micro-kernel consults it.
	// The "next" panel addresses are simply the current A and B buffers.
	bli_auxinfo_set_ps_a( a_ps, aux );
	bli_auxinfo_set_ps_b( b_ps, aux );
	bli_auxinfo_set_next_a( a_ptr, aux );
	bli_auxinfo_set_next_b( b_ptr, aux );

	// Call the selected wrapper.
	ukr( a_ptr,
	     b_ptr,
	     c_ptr, c_rs, c_cs,
	     &aux );
}
// Template for the void*-based trsm micro-kernel wrappers.
// For a given type character ch, GENTFUNC defines a function named
// PASTEMAC(ch,varname) whose operands are all void* and which forwards every
// argument, unchanged and in the same order, to PASTEMAC(ch,ukrname).
// It is instantiated twice below: once for the lower and once for the upper
// variant, populating the ftypes_l and ftypes_u tables respectively.
// NOTE(review): INSERT_GENTFUNC_BASIC presumably instantiates the template
// once per supported datatype -- confirm against its definition.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrname ) \
\
void PASTEMAC(ch,varname)( \
void* a, \
void* b, \
void* c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
) \
{ \
PASTEMAC(ch,ukrname)( a, \
b, \
c, rs_c, cs_c, \
data ); \
}
INSERT_GENTFUNC_BASIC( trsm_l_ukernel_void, TRSM_L_UKERNEL )
INSERT_GENTFUNC_BASIC( trsm_u_ukernel_void, TRSM_U_UKERNEL )

View File

@@ -0,0 +1,56 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Object-based front-end to the trsm micro-kernels: unpacks buffers and
// strides from the objects and calls the lower or upper wrapper (chosen by
// whether a is lower-stored) for c's datatype.
//
void bli_trsm_ukernel( obj_t* a,
obj_t* b,
obj_t* c );
//
// Prototype the void pointer kernel wrappers.
//
// GENTPROT declares PASTEMAC(ch,varname) for a type character ch; the
// parameter list must stay in sync with the matching GENTFUNC definition in
// the corresponding .c file.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
void* a, \
void* b, \
void* c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
);
INSERT_GENTPROT_BASIC( trsm_l_ukernel_void )
INSERT_GENTPROT_BASIC( trsm_u_ukernel_void )

View File

@@ -43,6 +43,9 @@ void* bli_malloc( siz_t size )
#if BLIS_HEAP_ADDR_ALIGN_SIZE == 1
p = malloc( ( size_t )size );
#elif defined(_WIN32)
p = _aligned_malloc( ( size_t )size,
( size_t )BLIS_HEAP_ADDR_ALIGN_SIZE );
#else
r_val = posix_memalign( &p,
( size_t )BLIS_HEAP_ADDR_ALIGN_SIZE,
@@ -58,6 +61,10 @@ void* bli_malloc( siz_t size )
void bli_free( void* p )
{
#if BLIS_HEAP_ADDR_ALIGN_SIZE == 1 || !defined(_WIN32)
free( p );
#else
_aligned_free( p );
#endif
}

View File

@@ -0,0 +1,203 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#if PPAPI_RELEASE >= 36
/* Four packed single-precision floats (GCC/Clang vector extension). */
typedef float v4sf __attribute__ ((vector_size(16)));

/*
 * File-local SIMD helpers.
 *
 * They are declared `static inline` on purpose: a plain `inline` definition
 * in C99 does not provide an external definition, so any call the compiler
 * chooses not to inline would be left unresolved at link time. Internal
 * linkage also keeps the identically named helpers in the sibling kernel
 * files from clashing.
 *
 * NOTE(review): the load/store helpers dereference a v4sf pointer obtained
 * by a cast, so the address is presumably expected to meet the alignment of
 * v4sf; none of the helpers check it -- confirm against the callers.
 */

/* Returns a vector with all four lanes set to x. */
static inline v4sf v4sf_splat(float x) {
	return (v4sf) { x, x, x, x };
}

/* Loads four consecutive floats starting at a. */
static inline v4sf v4sf_load(const float* a) {
	return *((const v4sf*)a);
}

/* Loads two consecutive scomplex values starting at a as one vector. */
static inline v4sf v4sf_cload(const scomplex* a) {
	return *((const v4sf*)a);
}

/* Stores the four lanes of x to a[0..3]. */
static inline void v4sf_store(float* a, v4sf x) {
	*((v4sf*)a) = x;
}

/* Stores the four lanes of x over two consecutive scomplex values at a. */
static inline void v4sf_cstore(scomplex* a, v4sf x) {
	*((v4sf*)a) = x;
}

/* Returns a vector with all four lanes set to 0.0f. */
static inline v4sf v4sf_zero(void) {
	return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
}
#endif
/*
 * Single-precision real axpyv: y := y + alpha * x over n elements.
 *
 * Parameters:
 *   conjx       conjugation flag; only forwarded to the reference kernel
 *               (the vectorized path never reads it).
 *   n           number of elements.
 *   alpha       pointer to the scale factor.
 *   x, incx     input vector and its element increment.
 *   y, incy     vector updated in place and its element increment.
 *
 * When PPAPI_RELEASE >= 36 and both increments are unit, the update is done
 * here four elements at a time with a scalar tail. In every other case the
 * work is delegated to BLIS_SAXPYV_KERNEL_REF.
 */
void bli_saxpyv_opt(
	conj_t conjx,
	dim_t n,
	float alpha[restrict static 1],
	float x[restrict static n],
	inc_t incx,
	float y[restrict static n],
	inc_t incy)
{
	/* Nothing to do for an empty vector or a zero scale factor. */
	if (bli_zero_dim1(n)) {
		return;
	}
	if (bli_seq0(*alpha)) {
		return;
	}

#if PPAPI_RELEASE >= 36
	if (!bli_has_nonunit_inc2(incx, incy)) {
		const float alphac = *alpha;
		const v4sf alphav = v4sf_splat(alphac);

		/* Main loop: four elements per iteration. */
		while (n >= 4) {
			const v4sf xv = v4sf_load(x);
			v4sf yv = v4sf_load(y);
			yv += xv * alphav;
			v4sf_store(y, yv);

			x += 4;
			y += 4;
			n -= 4;
		}

		/* Scalar tail for the remaining 0..3 elements. */
		while (n > 0) {
			(*y++) += (*x++) * alphac;
			n -= 1;
		}

		/*
		 * The whole vector has been processed. Return here so that we do
		 * not fall through into the reference kernel with an exhausted
		 * count and already-advanced x/y pointers.
		 */
		return;
	}
#endif

	/* General increments (or no SIMD support): use the reference kernel. */
	BLIS_SAXPYV_KERNEL_REF(
		conjx,
		n,
		alpha,
		x,
		incx,
		y,
		incy);
}
/*
 * Single-precision complex axpyv: y := y + alpha * conjx(x) over n elements.
 *
 * Parameters:
 *   conjx       whether x is used as-is or conjugated.
 *   n           number of complex elements.
 *   alpha       pointer to the complex scale factor.
 *   x, incx     input vector and its element increment.
 *   y, incy     vector updated in place and its element increment.
 *
 * When PPAPI_RELEASE >= 36 and both increments are unit, the update is done
 * here two complex elements (one 4-float vector) at a time with a scalar
 * tail. In every other case the work is delegated to BLIS_CAXPYV_KERNEL_REF.
 */
void bli_caxpyv_opt(
	conj_t conjx,
	dim_t n,
	scomplex alpha[restrict static 1],
	scomplex x[restrict static n],
	inc_t incx,
	scomplex y[restrict static n],
	inc_t incy)
{
	/* Nothing to do for an empty vector or a zero scale factor. */
	if (bli_zero_dim1(n)) {
		return;
	}
	if (bli_ceq0(*alpha)) {
		return;
	}

#if PPAPI_RELEASE >= 36
	if (!bli_has_nonunit_inc2(incx, incy)) {
		const float alphar = alpha->real;
		const float alphai = alpha->imag;

		if (bli_is_noconj(conjx)) {
			/*
			 * alpha * x = (ar*xr - ai*xi) + i*(ar*xi + ai*xr).
			 * xv0 holds (xr, xi) pairs and xv1 the swapped (xi, xr) pairs,
			 * so xv0 * ar + xv1 * (-ai, +ai) yields the product directly.
			 */
			const v4sf alphav0 = v4sf_splat(alphar);
			const v4sf alphav1 = (v4sf) { -alphai, alphai, -alphai, alphai };
			while (n >= 2) {
				const v4sf xv0 = v4sf_cload(x);
				v4sf yv = v4sf_cload(y);
				const v4sf xv1 = __builtin_shufflevector(xv0, xv0, 1, 0, 3, 2);
				yv += xv0 * alphav0 + xv1 * alphav1;
				v4sf_cstore(y, yv);

				x += 2;
				y += 2;
				n -= 2;
			}

			/* Scalar tail (at most one element remains). */
			while (n > 0) {
				const float xr = x->real;
				const float xi = x->imag;
				const float yr = y->real;
				const float yi = y->imag;
				y->real = yr + xr * alphar - xi * alphai;
				y->imag = yi + xr * alphai + xi * alphar;

				x += 1;
				y += 1;
				n -= 1;
			}
		} else {
			/*
			 * alpha * conj(x) = (ar*xr + ai*xi) + i*(ai*xr - ar*xi).
			 * Here the sign pattern moves to the real part of alpha:
			 * xv0 * (+ar, -ar) + xv1 * ai.
			 */
			const v4sf alphav0 = (v4sf) { alphar, -alphar, alphar, -alphar };
			const v4sf alphav1 = v4sf_splat(alphai);
			while (n >= 2) {
				const v4sf xv0 = v4sf_cload(x);
				v4sf yv = v4sf_cload(y);
				const v4sf xv1 = __builtin_shufflevector(xv0, xv0, 1, 0, 3, 2);
				yv += xv0 * alphav0 + xv1 * alphav1;
				v4sf_cstore(y, yv);

				x += 2;
				y += 2;
				n -= 2;
			}

			/* Scalar tail (at most one element remains). */
			while (n > 0) {
				const float xr = x->real;
				const float xi = x->imag;
				const float yr = y->real;
				const float yi = y->imag;
				y->real = yr + xr * alphar + xi * alphai;
				y->imag = yi + xr * alphai - xi * alphar;

				x += 1;
				y += 1;
				n -= 1;
			}
		}

		/*
		 * The whole vector has been processed. Return here so that we do
		 * not fall through into the reference kernel with an exhausted
		 * count and already-advanced x/y pointers.
		 */
		return;
	}
#endif

	/* General increments (or no SIMD support): use the reference kernel. */
	BLIS_CAXPYV_KERNEL_REF(
		conjx,
		n,
		alpha,
		x,
		incx,
		y,
		incy);
}

View File

@@ -0,0 +1,618 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#if PPAPI_RELEASE >= 36
/* Four packed single-precision floats (GCC/Clang vector extension). */
typedef float v4sf __attribute__ ((vector_size(16)));

/*
 * File-local SIMD helpers.
 *
 * They are declared `static inline` on purpose: a plain `inline` definition
 * in C99 does not provide an external definition, so any call the compiler
 * chooses not to inline would be left unresolved at link time. Internal
 * linkage also keeps the identically named helpers in the sibling kernel
 * files from clashing.
 *
 * NOTE(review): the load/store helpers dereference a v4sf pointer obtained
 * by a cast, so the address is presumably expected to meet the alignment of
 * v4sf; none of the helpers check it -- confirm against the callers.
 */

/* Returns a vector with all four lanes set to x. */
static inline v4sf v4sf_splat(float x) {
	return (v4sf) { x, x, x, x };
}

/* Loads four consecutive floats starting at a. */
static inline v4sf v4sf_load(const float* a) {
	return *((const v4sf*)a);
}

/* Loads two consecutive scomplex values starting at a as one vector. */
static inline v4sf v4sf_cload(const scomplex* a) {
	return *((const v4sf*)a);
}

/* Stores the four lanes of x to a[0..3]. */
static inline void v4sf_store(float* a, v4sf x) {
	*((v4sf*)a) = x;
}

/* Stores the four lanes of x over two consecutive scomplex values at a. */
static inline void v4sf_cstore(scomplex* a, v4sf x) {
	*((v4sf*)a) = x;
}

/* Returns a vector with all four lanes set to 0.0f. */
static inline v4sf v4sf_zero(void) {
	return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
}
#endif
/*
 * Scalar dot product of two strided float vectors.
 *
 * Six independent partial sums are kept while at least six elements remain;
 * they are combined as (s0 + s1 + s2) + (s3 + s4 + s5) and the leftover
 * elements are then added one at a time.
 */
static float bli_sdotv_opt_strided(
	dim_t n,
	const float* x,
	inc_t incx,
	const float* y,
	inc_t incy)
{
	float acc0 = 0.0f, acc1 = 0.0f, acc2 = 0.0f;
	float acc3 = 0.0f, acc4 = 0.0f, acc5 = 0.0f;

	for (; n >= 6; n -= 6) {
		acc0 += x[0] * y[0];
		acc1 += x[1*incx] * y[1*incy];
		acc2 += x[2*incx] * y[2*incy];
		acc3 += x[3*incx] * y[3*incy];
		acc4 += x[4*incx] * y[4*incy];
		acc5 += x[5*incx] * y[5*incy];
		x += 6*incx;
		y += 6*incy;
	}

	float total = (acc0 + acc1 + acc2) + (acc3 + acc4 + acc5);
	for (; n != 0; n -= 1) {
		total += x[0] * y[0];
		x += incx;
		y += incy;
	}
	return total;
}

/*
 * Single-precision real dotv: rho := sum of x[i] * y[i] over n elements.
 *
 * conjx and conjy are accepted but never read. Without SIMD support
 * (PPAPI_RELEASE < 36), or when either increment is non-unit, the scalar
 * strided loop above is used. With unit increments the sum is accumulated
 * in 4-float vectors: 24 elements per iteration into six accumulators, then
 * 4 elements per iteration, then a scalar tail.
 */
void bli_sdotv_opt(
	conj_t conjx,
	conj_t conjy,
	dim_t n,
	float x[restrict static n],
	inc_t incx,
	float y[restrict static n],
	inc_t incy,
	float rho[restrict static 1])
{
#if PPAPI_RELEASE >= 36
	/* An empty dot product is zero. */
	if (bli_zero_dim1(n)) {
		*rho = 0.0f;
		return;
	}

	/* Non-unit increments cannot use contiguous vector loads. */
	if (bli_has_nonunit_inc2(incx, incy)) {
		*rho = bli_sdotv_opt_strided(n, x, incx, y, incy);
		return;
	}

	v4sf vacc0 = v4sf_zero(), vacc1 = v4sf_zero(), vacc2 = v4sf_zero();
	v4sf vacc3 = v4sf_zero(), vacc4 = v4sf_zero(), vacc5 = v4sf_zero();
	for (; n >= 24; n -= 24) {
		vacc0 += v4sf_load(x)    * v4sf_load(y);
		vacc1 += v4sf_load(x+4)  * v4sf_load(y+4);
		vacc2 += v4sf_load(x+8)  * v4sf_load(y+8);
		vacc3 += v4sf_load(x+12) * v4sf_load(y+12);
		vacc4 += v4sf_load(x+16) * v4sf_load(y+16);
		vacc5 += v4sf_load(x+20) * v4sf_load(y+20);
		x += 24;
		y += 24;
	}

	v4sf vtotal = (vacc0 + vacc1 + vacc2) + (vacc3 + vacc4 + vacc5);
	for (; n >= 4; n -= 4) {
		vtotal += v4sf_load(x) * v4sf_load(y);
		x += 4;
		y += 4;
	}

	/* Horizontal reduction of the four lanes, then the scalar remainder. */
	float total = (vtotal[0] + vtotal[1]) + (vtotal[2] + vtotal[3]);
	for (; n != 0; n -= 1) {
		total += (*x++) * (*y++);
	}
	*rho = total;
#else
	*rho = bli_sdotv_opt_strided(n, x, incx, y, incy);
#endif
}
/*
 * Double-precision real dotv: rho := sum of x[i] * y[i] over n elements,
 * stepping x by incx and y by incy.
 *
 * conjx and conjy are accepted but never read. Six independent partial sums
 * are kept while at least six elements remain; they are combined as
 * (s0 + s1 + s2) + (s3 + s4 + s5) and the leftover elements are then added
 * one at a time.
 */
void bli_ddotv_opt(
	conj_t conjx,
	conj_t conjy,
	dim_t n,
	double x[restrict static n],
	inc_t incx,
	double y[restrict static n],
	inc_t incy,
	double rho[restrict static 1])
{
	double acc0 = 0.0, acc1 = 0.0, acc2 = 0.0;
	double acc3 = 0.0, acc4 = 0.0, acc5 = 0.0;

	/* Unrolled by six: one element per accumulator and iteration. */
	for (; n >= 6; n -= 6) {
		acc0 += x[0] * y[0];
		acc1 += x[1*incx] * y[1*incy];
		acc2 += x[2*incx] * y[2*incy];
		acc3 += x[3*incx] * y[3*incy];
		acc4 += x[4*incx] * y[4*incy];
		acc5 += x[5*incx] * y[5*incy];
		x += 6*incx;
		y += 6*incy;
	}

	double total = (acc0 + acc1 + acc2) + (acc3 + acc4 + acc5);

	/* Remaining 0..5 elements. */
	for (; n != 0; n -= 1) {
		total += x[0] * y[0];
		x += incx;
		y += incy;
	}

	*rho = total;
}
/*
 * Single-precision complex dotv: rho := sum of conjx(x[i]) * conjy(y[i])
 * over n elements, stepping x by incx and y by incy.
 *
 * Conjugation of y is not applied directly. Because
 * x * conj(y) == conj( conj(x) * y ), a requested conjy is handled by
 * flipping conjx up front and negating the imaginary part of the final sum.
 *
 * With PPAPI_RELEASE >= 36 and unit increments, eight complex elements are
 * processed per iteration using 4-float vectors; otherwise a scalar loop
 * unrolled by two is used.
 */
void bli_cdotv_opt(
conj_t conjx,
conj_t conjy,
dim_t n,
scomplex x[restrict static n],
inc_t incx,
scomplex y[restrict static n],
inc_t incy,
scomplex rho[restrict static 1])
{
// Fold conjy into conjx; the matching sign flip is applied when rho is
// written at the end.
// NOTE(review): this relies on bli_toggle_conj updating conjx in place
// (it appears to be a macro) -- confirm.
if (bli_is_conj(conjy)) {
bli_toggle_conj(conjx);
}
// An empty dot product is zero.
if (bli_zero_dim1(n)) {
rho->real = 0.0f;
rho->imag = 0.0f;
return;
}
// Running real and imaginary sums; every branch below assigns both.
float sumr;
float sumi;
#if PPAPI_RELEASE >= 36
if (bli_is_noconj(conjx)) {
// x * y = (xr*yr - xi*yi) + i*(xr*yi + xi*yr)
if (bli_has_nonunit_inc2(incx, incy)) {
// Strided scalar loop, two elements per iteration into separate
// accumulators.
float sum0r = 0.0f, sum1r = 0.0f;
float sum0i = 0.0f, sum1i = 0.0f;
while (n >= 2) {
const float x0r = x->real;
const float x0i = x->imag;
const float y0r = y->real;
const float y0i = y->imag;
sum0r += x0r * y0r - x0i * y0i;
sum0i += x0r * y0i + x0i * y0r;
x += incx;
y += incy;
const float x1r = x->real;
const float x1i = x->imag;
const float y1r = y->real;
const float y1i = y->imag;
sum1r += x1r * y1r - x1i * y1i;
sum1i += x1r * y1i + x1i * y1r;
x += incx;
y += incy;
n -= 2;
}
sumr = sum0r + sum1r;
sumi = sum0i + sum1i;
} else {
// Unit-stride vector loop: each half-iteration loads four complex
// values (two vectors) from x and y and de-interleaves them into
// all-real (lanes 0,2,4,6) and all-imaginary (lanes 1,3,5,7) vectors.
v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero();
v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero();
while (n >= 8) {
const v4sf xv0t = v4sf_cload(x);
const v4sf xv0b = v4sf_cload(x+2);
const v4sf yv0t = v4sf_cload(y);
const v4sf yv0b = v4sf_cload(y+2);
const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6);
const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7);
const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6);
const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7);
sumv0r += xv0r * yv0r - xv0i * yv0i;
sumv0i += xv0r * yv0i + xv0i * yv0r;
const v4sf xv1t = v4sf_cload(x+4);
const v4sf xv1b = v4sf_cload(x+6);
const v4sf yv1t = v4sf_cload(y+4);
const v4sf yv1b = v4sf_cload(y+6);
const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6);
const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7);
const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6);
const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7);
sumv1r += xv1r * yv1r - xv1i * yv1i;
sumv1i += xv1r * yv1i + xv1i * yv1r;
x += 8;
y += 8;
n -= 8;
}
// Combine the two vector accumulators, then reduce the four lanes.
const v4sf sumvr = sumv0r + sumv1r;
const v4sf sumvi = sumv0i + sumv1i;
sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]);
sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]);
}
// Scalar tail shared by both sub-branches: at most one element after
// the strided loop, up to seven after the vector loop.
while (n--) {
const float xr = x->real;
const float xi = x->imag;
const float yr = y->real;
const float yi = y->imag;
sumr += xr * yr - xi * yi;
sumi += xr * yi + xi * yr;
x += incx;
y += incy;
}
} else {
// conj(x) * y = (xr*yr + xi*yi) + i*(xr*yi - xi*yr)
if (bli_has_nonunit_inc2(incx, incy)) {
// Strided scalar loop, two elements per iteration.
float sum0r = 0.0f, sum1r = 0.0f;
float sum0i = 0.0f, sum1i = 0.0f;
while (n >= 2) {
const float x0r = x->real;
const float x0i = x->imag;
const float y0r = y->real;
const float y0i = y->imag;
sum0r += x0r * y0r + x0i * y0i;
sum0i += x0r * y0i - x0i * y0r;
x += incx;
y += incy;
const float x1r = x->real;
const float x1i = x->imag;
const float y1r = y->real;
const float y1i = y->imag;
sum1r += x1r * y1r + x1i * y1i;
sum1i += x1r * y1i - x1i * y1r;
x += incx;
y += incy;
n -= 2;
}
sumr = sum0r + sum1r;
sumi = sum0i + sum1i;
} else {
// Unit-stride vector loop; same de-interleaving as the non-conjugated
// case, with the signs of the cross terms swapped.
v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero();
v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero();
while (n >= 8) {
const v4sf xv0t = v4sf_cload(x);
const v4sf xv0b = v4sf_cload(x+2);
const v4sf yv0t = v4sf_cload(y);
const v4sf yv0b = v4sf_cload(y+2);
const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6);
const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7);
const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6);
const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7);
sumv0r += xv0r * yv0r + xv0i * yv0i;
sumv0i += xv0r * yv0i - xv0i * yv0r;
const v4sf xv1t = v4sf_cload(x+4);
const v4sf xv1b = v4sf_cload(x+6);
const v4sf yv1t = v4sf_cload(y+4);
const v4sf yv1b = v4sf_cload(y+6);
const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6);
const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7);
const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6);
const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7);
sumv1r += xv1r * yv1r + xv1i * yv1i;
sumv1i += xv1r * yv1i - xv1i * yv1r;
x += 8;
y += 8;
n -= 8;
}
const v4sf sumvr = sumv0r + sumv1r;
const v4sf sumvi = sumv0i + sumv1i;
sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]);
sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]);
}
// Scalar tail shared by both sub-branches.
while (n--) {
const float xr = x->real;
const float xi = x->imag;
const float yr = y->real;
const float yi = y->imag;
sumr += xr * yr + xi * yi;
sumi += xr * yi - xi * yr;
x += incx;
y += incy;
}
}
#else
// No SIMD support: scalar loops unrolled by two for any increments.
if (bli_is_noconj(conjx)) {
float sum0r = 0.0f, sum1r = 0.0f;
float sum0i = 0.0f, sum1i = 0.0f;
while (n >= 2) {
const float x0r = x->real;
const float x0i = x->imag;
const float y0r = y->real;
const float y0i = y->imag;
sum0r += x0r * y0r - x0i * y0i;
sum0i += x0r * y0i + x0i * y0r;
x += incx;
y += incy;
const float x1r = x->real;
const float x1i = x->imag;
const float y1r = y->real;
const float y1i = y->imag;
sum1r += x1r * y1r - x1i * y1i;
sum1i += x1r * y1i + x1i * y1r;
x += incx;
y += incy;
n -= 2;
}
sumr = sum0r + sum1r;
sumi = sum0i + sum1i;
// The loop above consumes elements in pairs, so at most one is left.
if (n != 0) {
const float xr = x->real;
const float xi = x->imag;
const float yr = y->real;
const float yi = y->imag;
sumr += xr * yr - xi * yi;
sumi += xr * yi + xi * yr;
}
} else {
float sum0r = 0.0f, sum1r = 0.0f;
float sum0i = 0.0f, sum1i = 0.0f;
while (n >= 2) {
const float x0r = x->real;
const float x0i = x->imag;
const float y0r = y->real;
const float y0i = y->imag;
sum0r += x0r * y0r + x0i * y0i;
sum0i += x0r * y0i - x0i * y0r;
x += incx;
y += incy;
const float x1r = x->real;
const float x1i = x->imag;
const float y1r = y->real;
const float y1i = y->imag;
sum1r += x1r * y1r + x1i * y1i;
sum1i += x1r * y1i - x1i * y1r;
x += incx;
y += incy;
n -= 2;
}
sumr = sum0r + sum1r;
sumi = sum0i + sum1i;
// At most one element is left after the pairwise loop.
if (n != 0) {
const float xr = x->real;
const float xi = x->imag;
const float yr = y->real;
const float yi = y->imag;
sumr += xr * yr + xi * yi;
sumi += xr * yi - xi * yr;
}
}
#endif
// Undo the conjy -> conjx folding done at the top: conjugate the result.
rho->real = sumr;
rho->imag = bli_is_conj(conjy) ? -sumi : sumi;
}
/*
 * Double-precision complex dotv: rho := sum of conjx(x[i]) * conjy(y[i])
 * over n elements, stepping x by incx and y by incy.
 *
 * A requested conjy is folded into conjx up front and compensated by
 * negating the imaginary part of the final sum, since
 * x * conj(y) == conj( conj(x) * y ). The sums are accumulated two
 * elements at a time into separate accumulators, followed by at most one
 * leftover element.
 */
void bli_zdotv_opt(
	conj_t conjx,
	conj_t conjy,
	dim_t n,
	dcomplex x[restrict static n],
	inc_t incx,
	dcomplex y[restrict static n],
	inc_t incy,
	dcomplex rho[restrict static 1])
{
	if (bli_is_conj(conjy)) {
		bli_toggle_conj(conjx);
	}

	/* An empty dot product is zero. */
	if (bli_zero_dim1(n)) {
		rho->real = 0.0;
		rho->imag = 0.0;
		return;
	}

	double acc0r = 0.0, acc0i = 0.0;
	double acc1r = 0.0, acc1i = 0.0;
	double totr;
	double toti;

	if (bli_is_noconj(conjx)) {
		/* x * y = (xr*yr - xi*yi) + i*(xr*yi + xi*yr) */
		for (; n >= 2; n -= 2) {
			const dcomplex* xa = x;
			const dcomplex* ya = y;
			const dcomplex* xb = x + incx;
			const dcomplex* yb = y + incy;

			acc0r += xa->real * ya->real - xa->imag * ya->imag;
			acc0i += xa->real * ya->imag + xa->imag * ya->real;
			acc1r += xb->real * yb->real - xb->imag * yb->imag;
			acc1i += xb->real * yb->imag + xb->imag * yb->real;

			x += 2*incx;
			y += 2*incy;
		}
		totr = acc0r + acc1r;
		toti = acc0i + acc1i;

		/* Elements are consumed in pairs, so at most one is left. */
		if (n != 0) {
			totr += x->real * y->real - x->imag * y->imag;
			toti += x->real * y->imag + x->imag * y->real;
		}
	} else {
		/* conj(x) * y = (xr*yr + xi*yi) + i*(xr*yi - xi*yr) */
		for (; n >= 2; n -= 2) {
			const dcomplex* xa = x;
			const dcomplex* ya = y;
			const dcomplex* xb = x + incx;
			const dcomplex* yb = y + incy;

			acc0r += xa->real * ya->real + xa->imag * ya->imag;
			acc0i += xa->real * ya->imag - xa->imag * ya->real;
			acc1r += xb->real * yb->real + xb->imag * yb->imag;
			acc1i += xb->real * yb->imag - xb->imag * yb->real;

			x += 2*incx;
			y += 2*incy;
		}
		totr = acc0r + acc1r;
		toti = acc0i + acc1i;

		/* Elements are consumed in pairs, so at most one is left. */
		if (n != 0) {
			totr += x->real * y->real + x->imag * y->imag;
			toti += x->real * y->imag - x->imag * y->real;
		}
	}

	/* Undo the conjy -> conjx folding: conjugate the result if needed. */
	rho->real = totr;
	rho->imag = bli_is_conj(conjy) ? -toti : toti;
}

View File

@@ -0,0 +1,386 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#if PPAPI_RELEASE >= 36
/* Four packed single-precision floats (GCC/Clang vector extension). */
typedef float v4sf __attribute__ ((vector_size(16)));

/*
 * File-local SIMD helpers.
 *
 * They are declared `static inline` on purpose: a plain `inline` definition
 * in C99 does not provide an external definition, so any call the compiler
 * chooses not to inline would be left unresolved at link time. Internal
 * linkage also keeps the identically named helpers in the sibling kernel
 * files from clashing.
 *
 * NOTE(review): the load/store helpers dereference a v4sf pointer obtained
 * by a cast, so the address is presumably expected to meet the alignment of
 * v4sf; none of the helpers check it -- confirm against the callers.
 */

/* Returns a vector with all four lanes set to x. */
static inline v4sf v4sf_splat(float x) {
	return (v4sf) { x, x, x, x };
}

/* Loads four consecutive floats starting at a. */
static inline v4sf v4sf_load(const float* a) {
	return *((const v4sf*)a);
}

/* Loads two consecutive scomplex values starting at a as one vector. */
static inline v4sf v4sf_cload(const scomplex* a) {
	return *((const v4sf*)a);
}

/* Stores the four lanes of x to a[0..3]. */
static inline void v4sf_store(float* a, v4sf x) {
	*((v4sf*)a) = x;
}

/* Stores the four lanes of x over two consecutive scomplex values at a. */
static inline void v4sf_cstore(scomplex* a, v4sf x) {
	*((v4sf*)a) = x;
}

/* Returns a vector with all four lanes set to 0.0f. */
static inline v4sf v4sf_zero(void) {
	return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
}
void bli_sgemm_opt(
dim_t k,
float alpha[restrict static 1],
float a[restrict static 8*k],
float b[restrict static k*4],
float beta[restrict static 1],
float c[restrict static 8*4],
inc_t rs_c,
inc_t cs_c,
auxinfo_t* data)
{
// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero();
v4sf abv0b = v4sf_zero(), abv1b = v4sf_zero(), abv2b = v4sf_zero(), abv3b = v4sf_zero();
for (dim_t i = 0; i < k; i += 1) {
const v4sf avt = v4sf_load(a);
const v4sf avb = v4sf_load(a+4);
const v4sf bv_xxxx = v4sf_splat(b[0]);
abv0t += avt * bv_xxxx;
abv0b += avb * bv_xxxx;
const v4sf bv_yyyy = v4sf_splat(b[1]);
abv1t += avt * bv_yyyy;
abv1b += avb * bv_yyyy;
const v4sf bv_zzzz = v4sf_splat(b[2]);
abv2t += avt * bv_zzzz;
abv2b += avb * bv_zzzz;
const v4sf bv_wwww = v4sf_splat(b[3]);
abv3t += avt * bv_wwww;
abv3b += avb * bv_wwww;
a += 8;
b += 4;
}
const v4sf alphav = v4sf_splat(*alpha);
abv0t *= alphav;
abv0b *= alphav;
abv1t *= alphav;
abv1b *= alphav;
abv2t *= alphav;
abv2b *= alphav;
abv3t *= alphav;
abv3b *= alphav;
if (rs_c == 1) {
v4sf cv0t = v4sf_load(&c[0*rs_c + 0*cs_c]);
v4sf cv1t = v4sf_load(&c[0*rs_c + 1*cs_c]);
v4sf cv2t = v4sf_load(&c[0*rs_c + 2*cs_c]);
v4sf cv3t = v4sf_load(&c[0*rs_c + 3*cs_c]);
v4sf cv0b = v4sf_load(&c[4*rs_c + 0*cs_c]);
v4sf cv1b = v4sf_load(&c[4*rs_c + 1*cs_c]);
v4sf cv2b = v4sf_load(&c[4*rs_c + 2*cs_c]);
v4sf cv3b = v4sf_load(&c[4*rs_c + 3*cs_c]);
const v4sf betav = v4sf_splat(*beta);
cv0t = cv0t * betav + abv0t;
cv1t = cv1t * betav + abv1t;
cv2t = cv2t * betav + abv2t;
cv3t = cv3t * betav + abv3t;
cv0b = cv0b * betav + abv0b;
cv1b = cv1b * betav + abv1b;
cv2b = cv2b * betav + abv2b;
cv3b = cv3b * betav + abv3b;
v4sf_store(&c[0*rs_c + 0*cs_c], cv0t);
v4sf_store(&c[0*rs_c + 1*cs_c], cv1t);
v4sf_store(&c[0*rs_c + 2*cs_c], cv2t);
v4sf_store(&c[0*rs_c + 3*cs_c], cv3t);
v4sf_store(&c[4*rs_c + 0*cs_c], cv0b);
v4sf_store(&c[4*rs_c + 1*cs_c], cv1b);
v4sf_store(&c[4*rs_c + 2*cs_c], cv2b);
v4sf_store(&c[4*rs_c + 3*cs_c], cv3b);
} else {
// Load columns 0, 1, 2, 3 (top part)
v4sf cv0t = (v4sf){ c[0*rs_c + 0*cs_c], c[1*rs_c + 0*cs_c], c[2*rs_c + 0*cs_c], c[3*rs_c + 0*cs_c] };
v4sf cv1t = (v4sf){ c[0*rs_c + 1*cs_c], c[1*rs_c + 1*cs_c], c[2*rs_c + 1*cs_c], c[3*rs_c + 1*cs_c] };
v4sf cv2t = (v4sf){ c[0*rs_c + 2*cs_c], c[1*rs_c + 2*cs_c], c[2*rs_c + 2*cs_c], c[3*rs_c + 2*cs_c] };
v4sf cv3t = (v4sf){ c[0*rs_c + 3*cs_c], c[1*rs_c + 3*cs_c], c[2*rs_c + 3*cs_c], c[3*rs_c + 3*cs_c] };
// Load columns 0, 1, 2, 3 (bottom part)
v4sf cv0b = (v4sf){ c[4*rs_c + 0*cs_c], c[5*rs_c + 0*cs_c], c[6*rs_c + 0*cs_c], c[7*rs_c + 0*cs_c] };
v4sf cv1b = (v4sf){ c[4*rs_c + 1*cs_c], c[5*rs_c + 1*cs_c], c[6*rs_c + 1*cs_c], c[7*rs_c + 1*cs_c] };
v4sf cv2b = (v4sf){ c[4*rs_c + 2*cs_c], c[5*rs_c + 2*cs_c], c[6*rs_c + 2*cs_c], c[7*rs_c + 2*cs_c] };
v4sf cv3b = (v4sf){ c[4*rs_c + 3*cs_c], c[5*rs_c + 3*cs_c], c[6*rs_c + 3*cs_c], c[7*rs_c + 3*cs_c] };
const v4sf betav = v4sf_splat(*beta);
cv0t = cv0t * betav + abv0t;
cv1t = cv1t * betav + abv1t;
cv2t = cv2t * betav + abv2t;
cv3t = cv3t * betav + abv3t;
cv0b = cv0b * betav + abv0b;
cv1b = cv1b * betav + abv1b;
cv2b = cv2b * betav + abv2b;
cv3b = cv3b * betav + abv3b;
// Store column 0
c[0*rs_c + 0*cs_c] = cv0t[0];
c[1*rs_c + 0*cs_c] = cv0t[1];
c[2*rs_c + 0*cs_c] = cv0t[2];
c[3*rs_c + 0*cs_c] = cv0t[3];
c[4*rs_c + 0*cs_c] = cv0b[0];
c[5*rs_c + 0*cs_c] = cv0b[1];
c[6*rs_c + 0*cs_c] = cv0b[2];
c[7*rs_c + 0*cs_c] = cv0b[3];
// Store column 1
c[0*rs_c + 1*cs_c] = cv1t[0];
c[1*rs_c + 1*cs_c] = cv1t[1];
c[2*rs_c + 1*cs_c] = cv1t[2];
c[3*rs_c + 1*cs_c] = cv1t[3];
c[4*rs_c + 1*cs_c] = cv1b[0];
c[5*rs_c + 1*cs_c] = cv1b[1];
c[6*rs_c + 1*cs_c] = cv1b[2];
c[7*rs_c + 1*cs_c] = cv1b[3];
// Store column 2
c[0*rs_c + 2*cs_c] = cv2t[0];
c[1*rs_c + 2*cs_c] = cv2t[1];
c[2*rs_c + 2*cs_c] = cv2t[2];
c[3*rs_c + 2*cs_c] = cv2t[3];
c[4*rs_c + 2*cs_c] = cv2b[0];
c[5*rs_c + 2*cs_c] = cv2b[1];
c[6*rs_c + 2*cs_c] = cv2b[2];
c[7*rs_c + 2*cs_c] = cv2b[3];
// Store column 3
c[0*rs_c + 3*cs_c] = cv3t[0];
c[1*rs_c + 3*cs_c] = cv3t[1];
c[2*rs_c + 3*cs_c] = cv3t[2];
c[3*rs_c + 3*cs_c] = cv3t[3];
c[4*rs_c + 3*cs_c] = cv3b[0];
c[5*rs_c + 3*cs_c] = cv3b[1];
c[6*rs_c + 3*cs_c] = cv3b[2];
c[7*rs_c + 3*cs_c] = cv3b[3];
}
}
void bli_cgemm_opt(
dim_t k,
scomplex alpha[restrict static 1],
scomplex a[restrict static 4*k],
scomplex b[restrict static k*4],
scomplex beta[restrict static 1],
scomplex c[restrict static 4*4],
inc_t rs_c,
inc_t cs_c,
auxinfo_t* data)
{
// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero();
v4sf abv0i = v4sf_zero(), abv1i = v4sf_zero(), abv2i = v4sf_zero(), abv3i = v4sf_zero();
for (dim_t i = 0; i < k; i += 1) {
const v4sf avt = v4sf_cload(a);
const v4sf avb = v4sf_cload(a+2);
const v4sf avr = __builtin_shufflevector(avt, avb, 0, 2, 4, 6);
const v4sf avi = __builtin_shufflevector(avt, avb, 1, 3, 5, 7);
const v4sf bv0r = v4sf_splat(b[0].real);
const v4sf bv0i = v4sf_splat(b[0].imag);
abv0r += avr * bv0r - avi * bv0i;
abv0i += avr * bv0i + avi * bv0r;
const v4sf bv1r = v4sf_splat(b[1].real);
const v4sf bv1i = v4sf_splat(b[1].imag);
abv1r += avr * bv1r - avi * bv1i;
abv1i += avr * bv1i + avi * bv1r;
const v4sf bv2r = v4sf_splat(b[2].real);
const v4sf bv2i = v4sf_splat(b[2].imag);
abv2r += avr * bv2r - avi * bv2i;
abv2i += avr * bv2i + avi * bv2r;
const v4sf bv3r = v4sf_splat(b[3].real);
const v4sf bv3i = v4sf_splat(b[3].imag);
abv3r += avr * bv3r - avi * bv3i;
abv3i += avr * bv3i + avi * bv3r;
a += 4;
b += 4;
}
const v4sf alphavr = v4sf_splat(alpha->real);
const v4sf alphavi = v4sf_splat(alpha->imag);
v4sf temp;
temp = abv0r * alphavr - abv0i * alphavi;
abv0i = abv0r * alphavi + abv0i * alphavr;
abv0r = temp;
temp = abv1r * alphavr - abv1i * alphavi;
abv1i = abv1r * alphavi + abv1i * alphavr;
abv1r = temp;
temp = abv2r * alphavr - abv2i * alphavi;
abv2i = abv2r * alphavi + abv2i * alphavr;
abv2r = temp;
temp = abv3r * alphavr - abv3i * alphavi;
abv3i = abv3r * alphavi + abv3i * alphavr;
abv3r = temp;
if (rs_c == 1) {
const v4sf cv0t = v4sf_cload(&c[0*rs_c + 0*cs_c]);
const v4sf cv1t = v4sf_cload(&c[0*rs_c + 1*cs_c]);
const v4sf cv2t = v4sf_cload(&c[0*rs_c + 2*cs_c]);
const v4sf cv3t = v4sf_cload(&c[0*rs_c + 3*cs_c]);
const v4sf cv0b = v4sf_cload(&c[2*rs_c + 0*cs_c]);
const v4sf cv1b = v4sf_cload(&c[2*rs_c + 1*cs_c]);
const v4sf cv2b = v4sf_cload(&c[2*rs_c + 2*cs_c]);
const v4sf cv3b = v4sf_cload(&c[2*rs_c + 3*cs_c]);
v4sf cv0r = __builtin_shufflevector(cv0t, cv0b, 0, 2, 4, 6);
v4sf cv0i = __builtin_shufflevector(cv0t, cv0b, 1, 3, 5, 7);
v4sf cv1r = __builtin_shufflevector(cv1t, cv1b, 0, 2, 4, 6);
v4sf cv1i = __builtin_shufflevector(cv1t, cv1b, 1, 3, 5, 7);
v4sf cv2r = __builtin_shufflevector(cv2t, cv2b, 0, 2, 4, 6);
v4sf cv2i = __builtin_shufflevector(cv2t, cv2b, 1, 3, 5, 7);
v4sf cv3r = __builtin_shufflevector(cv3t, cv3b, 0, 2, 4, 6);
v4sf cv3i = __builtin_shufflevector(cv3t, cv3b, 1, 3, 5, 7);
const v4sf betavr = v4sf_splat(beta->real);
const v4sf betavi = v4sf_splat(beta->imag);
temp = abv0r + cv0r * betavr - cv0i * betavi;
cv0i = abv0i + cv0r * betavi + cv0i * betavr;
cv0r = temp;
temp = abv1r + cv1r * betavr - cv1i * betavi;
cv1i = abv1i + cv1r * betavi + cv1i * betavr;
cv1r = temp;
temp = abv2r + cv2r * betavr - cv2i * betavi;
cv2i = abv2i + cv2r * betavi + cv2i * betavr;
cv2r = temp;
temp = abv3r + cv3r * betavr - cv3i * betavi;
cv3i = abv3i + cv3r * betavi + cv3i * betavr;
cv3r = temp;
v4sf_cstore(&c[0*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 0, 4, 1, 5));
v4sf_cstore(&c[2*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 2, 6, 3, 7));
v4sf_cstore(&c[0*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 0, 4, 1, 5));
v4sf_cstore(&c[2*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 2, 6, 3, 7));
v4sf_cstore(&c[0*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 0, 4, 1, 5));
v4sf_cstore(&c[2*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 2, 6, 3, 7));
v4sf_cstore(&c[0*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 0, 4, 1, 5));
v4sf_cstore(&c[2*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 2, 6, 3, 7));
} else {
// Load columns 0, 1, 2, 3 (real part)
v4sf cv0r = (v4sf){ c[0*rs_c + 0*cs_c].real, c[1*rs_c + 0*cs_c].real, c[2*rs_c + 0*cs_c].real, c[3*rs_c + 0*cs_c].real };
v4sf cv1r = (v4sf){ c[0*rs_c + 1*cs_c].real, c[1*rs_c + 1*cs_c].real, c[2*rs_c + 1*cs_c].real, c[3*rs_c + 1*cs_c].real };
v4sf cv2r = (v4sf){ c[0*rs_c + 2*cs_c].real, c[1*rs_c + 2*cs_c].real, c[2*rs_c + 2*cs_c].real, c[3*rs_c + 2*cs_c].real };
v4sf cv3r = (v4sf){ c[0*rs_c + 3*cs_c].real, c[1*rs_c + 3*cs_c].real, c[2*rs_c + 3*cs_c].real, c[3*rs_c + 3*cs_c].real };
// Load columns 0, 1, 2, 3 (imaginary part)
v4sf cv0i = (v4sf){ c[0*rs_c + 0*cs_c].imag, c[1*rs_c + 0*cs_c].imag, c[2*rs_c + 0*cs_c].imag, c[3*rs_c + 0*cs_c].imag };
v4sf cv1i = (v4sf){ c[0*rs_c + 1*cs_c].imag, c[1*rs_c + 1*cs_c].imag, c[2*rs_c + 1*cs_c].imag, c[3*rs_c + 1*cs_c].imag };
v4sf cv2i = (v4sf){ c[0*rs_c + 2*cs_c].imag, c[1*rs_c + 2*cs_c].imag, c[2*rs_c + 2*cs_c].imag, c[3*rs_c + 2*cs_c].imag };
v4sf cv3i = (v4sf){ c[0*rs_c + 3*cs_c].imag, c[1*rs_c + 3*cs_c].imag, c[2*rs_c + 3*cs_c].imag, c[3*rs_c + 3*cs_c].imag };
const v4sf betavr = v4sf_splat(beta->real);
const v4sf betavi = v4sf_splat(beta->imag);
temp = abv0r + cv0r * betavr - cv0i * betavi;
cv0i = abv0i + cv0r * betavi + cv0i * betavr;
cv0r = temp;
temp = abv1r + cv1r * betavr - cv1i * betavi;
cv1i = abv1i + cv1r * betavi + cv1i * betavr;
cv1r = temp;
temp = abv2r + cv2r * betavr - cv2i * betavi;
cv2i = abv2i + cv2r * betavi + cv2i * betavr;
cv2r = temp;
temp = abv3r + cv3r * betavr - cv3i * betavi;
cv3i = abv3i + cv3r * betavi + cv3i * betavr;
cv3r = temp;
// Store column 0
c[0*rs_c + 0*cs_c].real = cv0r[0];
c[0*rs_c + 0*cs_c].imag = cv0i[0];
c[1*rs_c + 0*cs_c].real = cv0r[1];
c[1*rs_c + 0*cs_c].imag = cv0i[1];
c[2*rs_c + 0*cs_c].real = cv0r[2];
c[2*rs_c + 0*cs_c].imag = cv0i[2];
c[3*rs_c + 0*cs_c].real = cv0r[3];
c[3*rs_c + 0*cs_c].imag = cv0i[3];
// Store column 1
c[0*rs_c + 1*cs_c].real = cv1r[0];
c[0*rs_c + 1*cs_c].imag = cv1i[0];
c[1*rs_c + 1*cs_c].real = cv1r[1];
c[1*rs_c + 1*cs_c].imag = cv1i[1];
c[2*rs_c + 1*cs_c].real = cv1r[2];
c[2*rs_c + 1*cs_c].imag = cv1i[2];
c[3*rs_c + 1*cs_c].real = cv1r[3];
c[3*rs_c + 1*cs_c].imag = cv1i[3];
// Store column 2
c[0*rs_c + 2*cs_c].real = cv2r[0];
c[0*rs_c + 2*cs_c].imag = cv2i[0];
c[1*rs_c + 2*cs_c].real = cv2r[1];
c[1*rs_c + 2*cs_c].imag = cv2i[1];
c[2*rs_c + 2*cs_c].real = cv2r[2];
c[2*rs_c + 2*cs_c].imag = cv2i[2];
c[3*rs_c + 2*cs_c].real = cv2r[3];
c[3*rs_c + 2*cs_c].imag = cv2i[3];
// Store column 3
c[0*rs_c + 3*cs_c].real = cv3r[0];
c[0*rs_c + 3*cs_c].imag = cv3i[0];
c[1*rs_c + 3*cs_c].real = cv3r[1];
c[1*rs_c + 3*cs_c].imag = cv3i[1];
c[2*rs_c + 3*cs_c].real = cv3r[2];
c[2*rs_c + 3*cs_c].imag = cv3i[2];
c[3*rs_c + 3*cs_c].real = cv3r[3];
c[3*rs_c + 3*cs_c].imag = cv3i[3];
}
}
#endif

View File

@@ -45,7 +45,8 @@
#
.PHONY: all bin clean \
check-env check-env-mk check-env-fragments check-env-make-defs
check-env check-env-mk check-env-fragments check-env-make-defs \
run run-amd64 run-x86 run-arm
@@ -241,8 +242,21 @@ TEST_OBJS := $(patsubst $(TEST_SRC_PATH)/%.c, \
$(TEST_OBJ_PATH)/%.o, \
$(wildcard $(TEST_SRC_PATH)/*.c))
ifeq ($(CONFIG_NAME),pnacl)
# Linked executable
TEST_BIN := test_libblis.unstable.pexe
# Finalized executable
TEST_BIN_PNACL := test_libblis.pexe
# Translated executable for x86-64
TEST_BIN_AMD64 := test_libblis.x86-64.nexe
# Translated executable for x86
TEST_BIN_X86 := test_libblis.x86.nexe
# Translated executable for ARM
TEST_BIN_ARM := test_libblis.arm.nexe
else
# Binary executable name.
TEST_BIN := test_libblis.x
endif
# Add installed and local header paths to CFLAGS
CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)
@@ -257,7 +271,11 @@ CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)
all: check-env bin
ifeq ($(CONFIG_NAME),pnacl)
bin: check-env $(TEST_BIN) $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM)
else
bin: check-env $(TEST_BIN)
endif
# --- Environment check rules ---
@@ -301,9 +319,68 @@ else
@$(LINKER) $(TEST_OBJS) $(BLIS_LIB) $(LDFLAGS) -o $@
endif
# PNaCl-only post-link steps. Each rule echoes its full command when
# BLIS_ENABLE_VERBOSE_MAKE_OUTPUT is "yes" and prints a one-line summary
# otherwise.
ifeq ($(CONFIG_NAME),pnacl)
# Finalize PNaCl executable (i.e. convert from LLVM bitcode to PNaCl bitcode)
$(TEST_BIN_PNACL): $(TEST_BIN)
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
$(FINALIZER) $(FINFLAGS) -o $@ $(TEST_BIN)
else
@echo "Finalizing $@"
@$(FINALIZER) $(FINFLAGS) -o $@ $(TEST_BIN)
endif
# Translate PNaCl executable to x86-64 NaCl executable
# (the finalized executable is the only prerequisite, so $< names it)
$(TEST_BIN_AMD64): $(TEST_BIN_PNACL)
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
$(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@
else
@echo "Translating $< -> $@"
@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@
endif
# Translate PNaCl executable to x86 NaCl executable
$(TEST_BIN_X86): $(TEST_BIN_PNACL)
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
$(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@
else
@echo "Translating $< -> $@"
@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@
endif
# Translate PNaCl executable to ARMv7 NaCl executable
$(TEST_BIN_ARM): $(TEST_BIN_PNACL)
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
$(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@
else
@echo "Translating $< -> $@"
@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@
endif
endif
# -- Test run rules --
# The run targets name actions, not files. They are declared phony so that a
# stray file called 'run' (or 'run-amd64', ...) in this directory can never
# make the target look up to date and silently skip the test run.
ifeq ($(CONFIG_NAME),pnacl)
.PHONY: run-amd64 run-x86 run-arm
# Each translated executable is run under the matching sel_ldr loader from
# the NaCl SDK, together with that architecture's irt_core image.
run-amd64: $(TEST_BIN_AMD64)
	$(NACL_SDK_ROOT)/tools/sel_ldr_x86_64 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_64.nexe -- $(TEST_BIN_AMD64)
run-x86: $(TEST_BIN_X86)
	$(NACL_SDK_ROOT)/tools/sel_ldr_x86_32 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_32.nexe -- $(TEST_BIN_X86)
run-arm: $(TEST_BIN_ARM)
	$(NACL_SDK_ROOT)/tools/sel_ldr_arm -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_arm.nexe -- $(TEST_BIN_ARM)
else
.PHONY: run
# Native configurations execute the test binary directly.
run: $(TEST_BIN)
	./$(TEST_BIN)
endif
# -- Clean rules --
# Remove the test objects and every executable the active configuration can
# build. The leading '-' lets make continue if the removal command fails.
clean:
ifeq ($(CONFIG_NAME),pnacl)
	- $(RM_F) $(TEST_OBJS) $(TEST_BIN) $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM)
else
	- $(RM_F) $(TEST_OBJS) $(TEST_BIN)
endif

View File

@@ -270,7 +270,7 @@ void libblis_test_gemm_ukr_impl( iface_t iface,
switch ( iface )
{
case BLIS_TEST_SEQ_UKERNEL:
bli_gemm_ukr( alpha, a, b, beta, c );
bli_gemm_ukernel( alpha, a, b, beta, c );
break;
default:
@@ -354,99 +354,3 @@ void libblis_test_gemm_ukr_check( obj_t* alpha,
bli_obj_free( &z );
}
//
// Define object-wrapper to GEMM_UKERNEL micro-kernels.
//
// FUNCPTR_T names the function-pointer type shared by the typed wrappers:
// operands are passed as untyped buffers, followed by the row and column
// strides of c and a pointer to an auxinfo_t.
#define FUNCPTR_T gemm_ukr_fp
typedef void (*FUNCPTR_T)(
dim_t k,
void* alpha,
void* a,
void* b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
);
// Table of typed wrappers; bli_gemm_ukr() indexes it with the datatype of c.
// NOTE(review): GENARRAY is defined elsewhere -- presumably it expands to an
// initializer listing one gemm_ukr wrapper per datatype; confirm.
static FUNCPTR_T GENARRAY(ftypes,gemm_ukr);
//
// bli_gemm_ukr(): object-based front end to the gemm micro-kernel.
//
// Reads the buffers, strides and scalar addresses out of the operand
// objects, records the panel information in an auxinfo_t, and calls the
// typed wrapper stored in ftypes[] for the datatype of c.
//
void bli_gemm_ukr( obj_t* alpha,
                   obj_t* a,
                   obj_t* b,
                   obj_t* beta,
                   obj_t* c )
{
    // The datatype of c drives both the scalar lookup and the dispatch.
    num_t     dt_c    = bli_obj_datatype( *c );
    FUNCPTR_T ukr     = ftypes[dt_c];

    // Scalars, fetched for datatype dt_c.
    void*     alpha_p = bli_obj_buffer_for_1x1( dt_c, *alpha );
    void*     beta_p  = bli_obj_buffer_for_1x1( dt_c, *beta );

    // Operand a: buffer, panel stride, and k (taken from the width of a).
    void*     a_p     = bli_obj_buffer_at_off( *a );
    inc_t     a_ps    = bli_obj_panel_stride( *a );
    dim_t     k       = bli_obj_width( *a );

    // Operand b: buffer and panel stride.
    void*     b_p     = bli_obj_buffer_at_off( *b );
    inc_t     b_ps    = bli_obj_panel_stride( *b );

    // Output c: buffer plus row and column strides.
    void*     c_p     = bli_obj_buffer_at_off( *c );
    inc_t     c_rs    = bli_obj_row_stride( *c );
    inc_t     c_cs    = bli_obj_col_stride( *c );

    auxinfo_t aux;

    // Populate the auxinfo_t in case the micro-kernel consults it. The
    // "next" panels are simply the current ones.
    bli_auxinfo_set_next_a( a_p, aux );
    bli_auxinfo_set_next_b( b_p, aux );
    bli_auxinfo_set_ps_a( a_ps, aux );
    bli_auxinfo_set_ps_b( b_ps, aux );

    // Hand everything to the typed wrapper.
    ukr( k,
         alpha_p,
         a_p,
         b_p,
         beta_p,
         c_p, c_rs, c_cs,
         &aux );
}
// Template for the typed wrappers referenced by the ftypes table: each
// generated function PASTEMAC(ch,varname) forwards its arguments, unchanged
// and in the same order, to PASTEMAC(ch,ukrname).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
void* alpha, \
void* a, \
void* b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
) \
{ \
PASTEMAC(ch,ukrname)( k, \
alpha, \
a, \
b, \
beta, \
c, rs_c, cs_c, \
data ); \
}
// NOTE(review): INSERT_GENTFUNC_BASIC is defined elsewhere -- presumably it
// instantiates GENTFUNC once per basic datatype; confirm.
INSERT_GENTFUNC_BASIC( gemm_ukr, GEMM_UKERNEL )

View File

@@ -34,28 +34,3 @@
// NOTE(review): presumably the test-suite entry point for the gemm
// micro-kernel test; its definition is not visible here -- confirm.
void libblis_test_gemm_ukr( test_params_t* params, test_op_t* op );
//
// Prototype wrapper interfaces to micro-kernel.
//
// Object-based wrapper: its definition extracts buffers and strides from
// the objects and calls the typed wrapper selected by the datatype of c.
void bli_gemm_ukr( obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c );
// Prototype template for the typed gemm_ukr wrappers; the parameter list
// matches the one used by the corresponding GENTFUNC definition.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
void* alpha, \
void* a, \
void* b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
);
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere -- presumably it
// emits one prototype per basic datatype; confirm.
INSERT_GENTPROT_BASIC( gemm_ukr )

View File

@@ -314,7 +314,7 @@ void libblis_test_gemmtrsm_ukr_impl( iface_t iface,
switch ( iface )
{
case BLIS_TEST_SEQ_UKERNEL:
bli_gemmtrsm_ukr( alpha, a1x, a11, bx1, b11, c11 );
bli_gemmtrsm_ukernel( alpha, a1x, a11, bx1, b11, c11 );
break;
default:
@@ -474,115 +474,3 @@ void bli_gemmtrsm_ukr_make_subparts( dim_t k,
bli_obj_set_diag_offset( 0, *a11 );
}
//
// Define object-wrapper to GEMMTRSM_L_UKERNEL, GEMMTRSM_U_UKERNEL
// micro-kernels.
//
// FUNCPTR_T is re-pointed at the function-pointer type shared by the typed
// gemmtrsm wrappers: untyped operand buffers, the row and column strides of
// c11, and a pointer to an auxinfo_t.
#undef FUNCPTR_T
#define FUNCPTR_T gemmtrsm_ukr_fp
typedef void (*FUNCPTR_T)(
dim_t k,
void* alpha,
void* a1x,
void* a11,
void* bx1,
void* b11,
void* c11, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
);
// Separate tables for the lower and upper variants; bli_gemmtrsm_ukr()
// chooses between them with bli_obj_is_lower( *a11 ) and indexes the chosen
// table with the datatype of c11.
// NOTE(review): GENARRAY is defined elsewhere -- presumably it expands to an
// initializer listing one wrapper per datatype; confirm.
static FUNCPTR_T GENARRAY(ftypes_l,gemmtrsm_l_ukr);
static FUNCPTR_T GENARRAY(ftypes_u,gemmtrsm_u_ukr);
//
// bli_gemmtrsm_ukr(): object-based front end to the gemmtrsm micro-kernels.
//
// Reads the buffers and strides out of the operand objects, records the
// panel information in an auxinfo_t, and calls the typed wrapper for the
// datatype of c11 -- taken from ftypes_l[] when a11 is lower, and from
// ftypes_u[] otherwise.
//
void bli_gemmtrsm_ukr( obj_t* alpha,
                       obj_t* a1x,
                       obj_t* a11,
                       obj_t* bx1,
                       obj_t* b11,
                       obj_t* c11 )
{
    // The datatype of c11 drives both the scalar lookup and the dispatch.
    num_t     dt_c    = bli_obj_datatype( *c11 );
    void*     alpha_p = bli_obj_buffer_for_1x1( dt_c, *alpha );

    // k is taken from the width of a1x.
    dim_t     k       = bli_obj_width( *a1x );

    // Operand buffers.
    void*     a1x_p   = bli_obj_buffer_at_off( *a1x );
    void*     a11_p   = bli_obj_buffer_at_off( *a11 );
    void*     bx1_p   = bli_obj_buffer_at_off( *bx1 );
    void*     b11_p   = bli_obj_buffer_at_off( *b11 );
    void*     c11_p   = bli_obj_buffer_at_off( *c11 );

    // Strides of the output block and of the packed panels.
    inc_t     c_rs    = bli_obj_row_stride( *c11 );
    inc_t     c_cs    = bli_obj_col_stride( *c11 );
    inc_t     a_ps    = bli_obj_panel_stride( *a1x );
    inc_t     b_ps    = bli_obj_panel_stride( *bx1 );

    FUNCPTR_T ukr;
    auxinfo_t aux;

    // The lower/upper property of a11 decides both which buffer is
    // recorded as the "next a" panel and which wrapper table is used.
    if ( bli_obj_is_lower( *a11 ) )
    {
        bli_auxinfo_set_next_a( a1x_p, aux );
        ukr = ftypes_l[dt_c];
    }
    else
    {
        bli_auxinfo_set_next_a( a11_p, aux );
        ukr = ftypes_u[dt_c];
    }

    // Remaining auxinfo_t fields, in case the micro-kernel consults them.
    bli_auxinfo_set_next_b( bx1_p, aux );
    bli_auxinfo_set_ps_a( a_ps, aux );
    bli_auxinfo_set_ps_b( b_ps, aux );

    // Hand everything to the typed wrapper.
    ukr( k,
         alpha_p,
         a1x_p,
         a11_p,
         bx1_p,
         b11_p,
         c11_p, c_rs, c_cs,
         &aux );
}
// Template for the typed gemmtrsm wrappers referenced by the ftypes_l and
// ftypes_u tables: each generated function PASTEMAC(ch,varname) forwards its
// arguments, unchanged and in the same order, to PASTEMAC(ch,ukrname).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
void* alpha, \
void* a1x, \
void* a11, \
void* bx1, \
void* b11, \
void* c11, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
) \
{ \
PASTEMAC(ch,ukrname)( k, \
alpha, \
a1x, \
a11, \
bx1, \
b11, \
c11, rs_c, cs_c, \
data ); \
}
// One instantiation for the lower variant and one for the upper variant.
// NOTE(review): INSERT_GENTFUNC_BASIC is defined elsewhere -- presumably it
// instantiates GENTFUNC once per basic datatype; confirm.
INSERT_GENTFUNC_BASIC( gemmtrsm_l_ukr, GEMMTRSM_L_UKERNEL )
INSERT_GENTFUNC_BASIC( gemmtrsm_u_ukr, GEMMTRSM_U_UKERNEL )

View File

@@ -34,30 +34,3 @@
// NOTE(review): presumably the test-suite entry point for the gemmtrsm
// micro-kernel test; its definition is not visible here -- confirm.
void libblis_test_gemmtrsm_ukr( test_params_t* params, test_op_t* op );
//
// Prototype wrapper interfaces to micro-kernel.
//
// Object-based wrapper: its definition extracts buffers and strides from
// the objects and calls the lower or upper typed wrapper, chosen by
// bli_obj_is_lower( *a11 ) and the datatype of c11.
void bli_gemmtrsm_ukr( obj_t* alpha,
obj_t* a1x,
obj_t* a11,
obj_t* bx1,
obj_t* b11,
obj_t* c11 );
// Prototype template for the typed gemmtrsm wrappers; the parameter list
// matches the one used by the corresponding GENTFUNC definition.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
void* alpha, \
void* a1x, \
void* a11, \
void* bx1, \
void* b11, \
void* c11, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
);
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere -- presumably it
// emits one prototype per basic datatype; confirm.
INSERT_GENTPROT_BASIC( gemmtrsm_l_ukr )
INSERT_GENTPROT_BASIC( gemmtrsm_u_ukr )

View File

@@ -267,7 +267,7 @@ void libblis_test_trsm_ukr_impl( iface_t iface,
switch ( iface )
{
case BLIS_TEST_SEQ_UKERNEL:
bli_trsm_ukr( a, b, c );
bli_trsm_ukernel( a, b, c );
break;
default:
@@ -367,84 +367,3 @@ void libblis_test_trsm_ukr_check( side_t side,
bli_obj_free( &z );
}
//
// Define object-wrapper to TRSM_L_UKERNEL, TRSM_U_UKERNEL micro-kernels.
//
// FUNCPTR_T is re-pointed at the function-pointer type shared by the typed
// trsm wrappers: untyped a, b and c buffers, the row and column strides of
// c, and a pointer to an auxinfo_t.
#undef FUNCPTR_T
#define FUNCPTR_T trsm_ukr_fp
typedef void (*FUNCPTR_T)(
void* a,
void* b,
void* c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
);
// Separate tables for the lower and upper variants; bli_trsm_ukr() chooses
// between them with bli_obj_is_lower( *a ) and indexes the chosen table with
// the datatype of c.
// NOTE(review): GENARRAY is defined elsewhere -- presumably it expands to an
// initializer listing one wrapper per datatype; confirm.
static FUNCPTR_T GENARRAY(ftypes_l,trsm_l_ukr);
static FUNCPTR_T GENARRAY(ftypes_u,trsm_u_ukr);
//
// bli_trsm_ukr(): object-based front end to the trsm micro-kernels.
//
// Reads the buffers and strides out of the operand objects, records the
// panel information in an auxinfo_t, and calls the typed wrapper for the
// datatype of c -- taken from ftypes_l[] when a is lower, and from
// ftypes_u[] otherwise.
//
void bli_trsm_ukr( obj_t* a,
                   obj_t* b,
                   obj_t* c )
{
    // The datatype of c selects the entry within the wrapper table.
    num_t     dt_c = bli_obj_datatype( *c );

    // Operand buffers and panel strides.
    void*     a_p  = bli_obj_buffer_at_off( *a );
    inc_t     a_ps = bli_obj_panel_stride( *a );
    void*     b_p  = bli_obj_buffer_at_off( *b );
    inc_t     b_ps = bli_obj_panel_stride( *b );

    // Output c: buffer plus row and column strides.
    void*     c_p  = bli_obj_buffer_at_off( *c );
    inc_t     c_rs = bli_obj_row_stride( *c );
    inc_t     c_cs = bli_obj_col_stride( *c );

    FUNCPTR_T ukr;
    auxinfo_t aux;

    // Populate the auxinfo_t in case the micro-kernel consults it. The
    // "next" panels are simply the current ones.
    bli_auxinfo_set_next_a( a_p, aux );
    bli_auxinfo_set_next_b( b_p, aux );
    bli_auxinfo_set_ps_a( a_ps, aux );
    bli_auxinfo_set_ps_b( b_ps, aux );

    // Lower a -> lower table; anything else -> upper table.
    ukr = bli_obj_is_lower( *a ) ? ftypes_l[dt_c] : ftypes_u[dt_c];

    // Hand everything to the typed wrapper.
    ukr( a_p,
         b_p,
         c_p, c_rs, c_cs,
         &aux );
}
// Template for the typed trsm wrappers referenced by the ftypes_l and
// ftypes_u tables: each generated function PASTEMAC(ch,varname) forwards its
// arguments, unchanged and in the same order, to PASTEMAC(ch,ukrname).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrname ) \
\
void PASTEMAC(ch,varname)( \
void* a, \
void* b, \
void* c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
) \
{ \
PASTEMAC(ch,ukrname)( a, \
b, \
c, rs_c, cs_c, \
data ); \
}
// One instantiation for the lower variant and one for the upper variant.
// NOTE(review): INSERT_GENTFUNC_BASIC is defined elsewhere -- presumably it
// instantiates GENTFUNC once per basic datatype; confirm.
INSERT_GENTFUNC_BASIC( trsm_l_ukr, TRSM_L_UKERNEL )
INSERT_GENTFUNC_BASIC( trsm_u_ukr, TRSM_U_UKERNEL )

View File

@@ -34,23 +34,3 @@
// NOTE(review): presumably the test-suite entry point for the trsm
// micro-kernel test; its definition is not visible here -- confirm.
void libblis_test_trsm_ukr( test_params_t* params, test_op_t* op );
//
// Prototype wrapper interfaces to micro-kernel.
//
// Object-based wrapper: its definition extracts buffers and strides from
// the objects and calls the lower or upper typed wrapper, chosen by
// bli_obj_is_lower( *a ) and the datatype of c.
void bli_trsm_ukr( obj_t* a,
obj_t* b,
obj_t* c );
// Prototype template for the typed trsm wrappers; the parameter list
// matches the one used by the corresponding GENTFUNC definition.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
void* a, \
void* b, \
void* c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
);
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere -- presumably it
// emits one prototype per basic datatype; confirm.
INSERT_GENTPROT_BASIC( trsm_l_ukr )
INSERT_GENTPROT_BASIC( trsm_u_ukr )

1
version Normal file
View File

@@ -0,0 +1 @@
0.1.3