mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Merge branch 'master' of http://github.com/flame/blis
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -19,11 +19,12 @@
|
||||
*.a
|
||||
# test executables
|
||||
*.x
|
||||
*.pexe
|
||||
*.nexe
|
||||
|
||||
# -- build system files --
|
||||
|
||||
config.mk
|
||||
version
|
||||
|
||||
# -- makefile fragments --
|
||||
|
||||
|
||||
135
CHANGELOG
135
CHANGELOG
@@ -1,4 +1,137 @@
|
||||
commit 00f232f8ed1f7c41619b12ebf779ebe2c3b2d3cd (HEAD, tag: 0.1.2, origin/master, master)
|
||||
commit 036cc634918463b1caa0fd89c9a211f2f5639af7 (HEAD, tag: 0.1.3, master)
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Jun 23 13:48:17 2014 -0500
|
||||
|
||||
Version file update (0.1.3)
|
||||
|
||||
commit 09d9a3bf6763932d9f571085b2cfd1b8631eccba
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Jun 23 13:43:26 2014 -0500
|
||||
|
||||
Reverting version file to test new version script.
|
||||
|
||||
Details:
|
||||
- Changed version file contents to 0.1.2 so that I can test out a new
|
||||
version file bumping script.
|
||||
|
||||
commit ebb33965981dcb2b0bdee5fc7fdf6c959420f311
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Jun 23 11:22:50 2014 -0500
|
||||
|
||||
Added 'version' file.
|
||||
|
||||
commit 2cb9a5501a3cbeb6692cf68e896087ba73b6af69
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Jun 23 10:42:29 2014 -0500
|
||||
|
||||
Removed 'version' from .gitignore file.
|
||||
|
||||
commit b40dcefc5ee31f67aa3990e2e9d2ef8ed1386a25 (origin/master)
|
||||
Merge: 7101a8e b693b0c
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Jun 23 10:39:05 2014 -0500
|
||||
|
||||
Merge pull request #11 from Maratyszcza/stable
|
||||
|
||||
[sc]axpy kernels for PNaCl
|
||||
|
||||
commit b693b0cddcfb41450e3c09a3ab97acb44c1ccdec
|
||||
Author: Marat Dukhan <maratek@gmail.com>
|
||||
Date: Sun Jun 22 13:44:25 2014 -0700
|
||||
|
||||
[SC]AXPY kernels for PNaCl
|
||||
|
||||
commit 7101a8eec0327d6c3a7eb36eb4b0fd45c1c6d162
|
||||
Merge: ad48dca 020a831
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Jun 19 21:46:50 2014 -0500
|
||||
|
||||
Merge pull request #10 from Maratyszcza/stable
|
||||
|
||||
Portable Native Client port
|
||||
|
||||
commit 020a831bc5f61744cb8354886aa679b99b1285f6
|
||||
Author: Marat Dukhan <maratek@gmail.com>
|
||||
Date: Thu Jun 19 00:58:26 2014 -0700
|
||||
|
||||
Code clean-up in PNaCl port
|
||||
|
||||
commit 491be4f91ed725522f5cc7184053857c6c376ada
|
||||
Author: Marat Dukhan <maratek@gmail.com>
|
||||
Date: Thu Jun 19 00:45:44 2014 -0700
|
||||
|
||||
Optimized dot product kernels for PNaCl
|
||||
|
||||
commit 4b8e71aab80182873a2e138eb07902b8d8fd5480
|
||||
Author: Marat Dukhan <maratek@gmail.com>
|
||||
Date: Thu Jun 19 00:43:25 2014 -0700
|
||||
|
||||
Use AR rcs flags for PNaCl target to avoid warning
|
||||
|
||||
commit 031deb2a5c718d569bde842590a791b812f4cf1d
|
||||
Author: Marat Dukhan <maratek@gmail.com>
|
||||
Date: Wed Jun 18 03:11:34 2014 -0700
|
||||
|
||||
PNaCl configuration: use pnacl-ar instead or ar (fixes build issue on Mac)
|
||||
|
||||
commit 68a02976e3c3638f0a9821342e269a1743e3ace3
|
||||
Author: Marat Dukhan <maratek@gmail.com>
|
||||
Date: Wed Jun 18 03:10:25 2014 -0700
|
||||
|
||||
Compile pnacl configuration in GNU11 mode to avoid warning about non-standard features
|
||||
|
||||
commit 6f8462eb0ec278b89731e73ef583386a3371d095
|
||||
Author: Marat Dukhan <maratek@gmail.com>
|
||||
Date: Wed Jun 18 03:08:46 2014 -0700
|
||||
|
||||
Fix inconsistent VERBOSE macro in Makefile
|
||||
|
||||
commit b2ffb4de8b6872cb23537ad282e557d11dcd9c8b
|
||||
Author: Marat Dukhan <maratek@gmail.com>
|
||||
Date: Sun Jun 15 18:41:30 2014 -0400
|
||||
|
||||
Reformatted PNaCl GEMM kernels
|
||||
|
||||
commit 6de2d472d98baa215264a776f3d5291780a6a085
|
||||
Author: Marat Dukhan <maratek@gmail.com>
|
||||
Date: Sun Jun 15 08:44:31 2014 -0400
|
||||
|
||||
CGEMM and ZGEMM kernels for PNaCl
|
||||
|
||||
commit f064711a5e6fb3852c17c7520909b09dc27665f2
|
||||
Author: Marat Dukhan <maratek@gmail.com>
|
||||
Date: Sun Jun 15 06:27:37 2014 -0400
|
||||
|
||||
SGEMM and DGEMM kernels for PNaCl
|
||||
|
||||
commit ad48dca22913a363899f0bef45553898718eebb1
|
||||
Merge: ee2b679 7118f87
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Sat Jun 14 15:10:13 2014 -0500
|
||||
|
||||
Merge pull request #9 from tkelman/memalign_windows
|
||||
|
||||
Use _aligned_malloc instead of posix_memalign on Windows
|
||||
|
||||
commit 7118f87e18b4941423472afc00215c1d1f2a1fcd
|
||||
Author: Tony Kelman <tony@kelman.net>
|
||||
Date: Sat Jun 14 06:53:20 2014 -0700
|
||||
|
||||
Use _aligned_malloc instead of posix_memalign on Windows
|
||||
|
||||
commit ee2b679281ca45fb40b2198e293bc3bc3d446632
|
||||
Author: Tyler Smith <tms@cs.utexas.edu>
|
||||
Date: Fri Jun 6 12:41:55 2014 -0500
|
||||
|
||||
Only include omp.h if BLIS_ENABLE_OPENMP is set
|
||||
|
||||
commit 19c05dfaac43c627f86e897c8c00f1f9440754aa
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Jun 5 10:54:16 2014 -0500
|
||||
|
||||
CHANGELOG update (for 0.1.2).
|
||||
|
||||
commit 00f232f8ed1f7c41619b12ebf779ebe2c3b2d3cd (tag: 0.1.2)
|
||||
Author: Tyler Smith <tms@cs.utexas.edu>
|
||||
Date: Mon Jun 2 13:40:57 2014 -0500
|
||||
|
||||
|
||||
74
Makefile
74
Makefile
@@ -174,7 +174,7 @@ VERS_CONF := $(VERSION)-$(CONFIG_NAME)
|
||||
# Note: These names will be modified later to include the configuration and
|
||||
# version strings.
|
||||
BLIS_LIB_NAME := $(BLIS_LIB_BASE_NAME).a
|
||||
#BLIS_DLL_NAME := $(BLIS_LIB_BASE_NAME).so
|
||||
BLIS_DLL_NAME := $(BLIS_LIB_BASE_NAME).so
|
||||
|
||||
# --- BLIS framework source and object variable names ---
|
||||
|
||||
@@ -196,8 +196,9 @@ MK_CONFIG_OBJS :=
|
||||
MK_CONFIG_NOOPT_OBJS :=
|
||||
MK_CONFIG_KERNELS_OBJS :=
|
||||
|
||||
# Append the base library path to the library name.
|
||||
# Append the base library path to the library names.
|
||||
MK_ALL_BLIS_LIB := $(BASE_LIB_PATH)/$(BLIS_LIB_NAME)
|
||||
MK_ALL_BLIS_DLL := $(BASE_LIB_PATH)/$(BLIS_DLL_NAME)
|
||||
|
||||
# --- Define install target names for static libraries ---
|
||||
|
||||
@@ -209,6 +210,16 @@ MK_BLIS_LIB_INST_W_VERS_CONF := $(patsubst $(BASE_LIB_PATH)/%.a, \
|
||||
$(INSTALL_PREFIX)/lib/%-$(VERS_CONF).a, \
|
||||
$(MK_BLIS_LIB))
|
||||
|
||||
# --- Define install target names for shared libraries ---
|
||||
|
||||
MK_BLIS_DLL := $(MK_ALL_BLIS_DLL)
|
||||
MK_BLIS_DLL_INST := $(patsubst $(BASE_LIB_PATH)/%.so, \
|
||||
$(INSTALL_PREFIX)/lib/%.so, \
|
||||
$(MK_BLIS_DLL))
|
||||
MK_BLIS_DLL_INST_W_VERS_CONF := $(patsubst $(BASE_LIB_PATH)/%.so, \
|
||||
$(INSTALL_PREFIX)/lib/%-$(VERS_CONF).so, \
|
||||
$(MK_BLIS_DLL))
|
||||
|
||||
# --- Determine which libraries to build ---
|
||||
|
||||
MK_LIBS :=
|
||||
@@ -221,6 +232,12 @@ MK_LIBS_INST += $(MK_BLIS_LIB_INST)
|
||||
MK_LIBS_INST_W_VERS_CONF += $(MK_BLIS_LIB_INST_W_VERS_CONF)
|
||||
endif
|
||||
|
||||
ifeq ($(BLIS_ENABLE_DYNAMIC_BUILD),yes)
|
||||
MK_LIBS += $(MK_BLIS_DLL)
|
||||
MK_LIBS_INST += $(MK_BLIS_DLL_INST)
|
||||
MK_LIBS_INST_W_VERS_CONF += $(MK_BLIS_DLL_INST_W_VERS_CONF)
|
||||
endif
|
||||
|
||||
# Strip leading, internal, and trailing whitespace.
|
||||
MK_LIBS_INST := $(strip $(MK_LIBS_INST))
|
||||
MK_LIBS_INST_W_VERS_CONF := $(strip $(MK_LIBS_INST_W_VERS_CONF))
|
||||
@@ -385,12 +402,12 @@ TESTSUITE_BIN := $(TESTSUITE_NAME).x
|
||||
# --- Uninstall definitions ----------------------------------------------------
|
||||
#
|
||||
|
||||
# This shell command grabs all files named "libblis-*.a" in the installation
|
||||
# directory and then filters out the name of the library archive for the
|
||||
# current version/configuration. We consider this remaining set of libraries
|
||||
# to be "old" and eligible for removal upon running of the uninstall-old
|
||||
# target.
|
||||
UNINSTALL_LIBS := $(shell $(FIND) $(INSTALL_PREFIX)/lib/ -name "$(BLIS_LIB_BASE_NAME)-*.a" 2> /dev/null | $(GREP) -v "$(BLIS_LIB_BASE_NAME)-$(VERS_CONF).a" | $(GREP) -v $(BLIS_LIB_NAME))
|
||||
# This shell command grabs all files named "libblis-*.a" or "libblis-*.so" in
|
||||
# the installation directory and then filters out the name of the library
|
||||
# archive for the current version/configuration. We consider this remaining set
|
||||
# of libraries to be "old" and eligible for removal upon running of the
|
||||
# uninstall-old target.
|
||||
# NOTE: Both find(1) globs and grep regular expressions read "[a|so]" as a
# one-character set (a, |, s, or o), not as an "a or so" alternation. Each
# suffix is therefore matched with its own -name test and its own -e pattern.
UNINSTALL_LIBS := $(shell $(FIND) $(INSTALL_PREFIX)/lib/ \( -name "$(BLIS_LIB_BASE_NAME)-*.a" -o -name "$(BLIS_LIB_BASE_NAME)-*.so" \) 2> /dev/null | $(GREP) -v -e "$(BLIS_LIB_BASE_NAME)-$(VERS_CONF)\.a" -e "$(BLIS_LIB_BASE_NAME)-$(VERS_CONF)\.so" | $(GREP) -v $(BLIS_LIB_NAME))
|
||||
|
||||
|
||||
|
||||
@@ -464,12 +481,15 @@ ifeq ($(MAKE_DEFS_MK_PRESENT),no)
|
||||
endif
|
||||
|
||||
|
||||
# --- Static library archiver rules ---
|
||||
# --- All-purpose library rule (static and shared) ---
|
||||
|
||||
blis-lib: check-env $(MK_LIBS)
|
||||
|
||||
|
||||
# --- Static library archiver rules ---
|
||||
|
||||
$(MK_ALL_BLIS_LIB): $(MK_ALL_BLIS_OBJS)
|
||||
ifeq ($(FLA_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
$(AR) $(ARFLAGS) $@ $?
|
||||
$(RANLIB) $@
|
||||
else
|
||||
@@ -479,6 +499,17 @@ else
|
||||
endif
|
||||
|
||||
|
||||
# --- Dynamic library linker rules ---
|
||||
|
||||
# Link the shared library from the complete set of object files. The archiver
# rule passes $? (only the prerequisites newer than the target) to $(AR), but
# the linker writes its output from scratch on every invocation, so it must be
# given every prerequisite ($^); otherwise an incremental rebuild would produce
# a library containing only the recompiled objects. $(LDFLAGS) follows the
# objects so that any libraries it names are searched after the objects that
# reference them.
$(MK_ALL_BLIS_DLL): $(MK_ALL_BLIS_OBJS)
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
	$(LINKER) $(SOFLAGS) -o $@ $^ $(LDFLAGS)
else
	@echo "Dynamically linking $@"
	@$(LINKER) $(SOFLAGS) -o $@ $^ $(LDFLAGS)
endif
|
||||
|
||||
|
||||
# --- Test suite rules ---
|
||||
|
||||
testsuite: testsuite-run
|
||||
@@ -540,6 +571,16 @@ else
|
||||
@$(INSTALL) -m 0644 $< $@
|
||||
endif
|
||||
|
||||
$(INSTALL_PREFIX)/lib/%-$(VERS_CONF).so: $(BASE_LIB_PATH)/%.so $(CONFIG_MK_FILE)
|
||||
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
$(INSTALL) -m 0755 -d $(@D)
|
||||
$(INSTALL) -m 0644 $< $@
|
||||
else
|
||||
@echo "Installing $(@F) into $(INSTALL_PREFIX)/lib/"
|
||||
@$(INSTALL) -m 0755 -d $(@D)
|
||||
@$(INSTALL) -m 0644 $< $@
|
||||
endif
|
||||
|
||||
|
||||
# --- Install-symlinks rules ---
|
||||
|
||||
@@ -555,6 +596,16 @@ else
|
||||
@$(MV) $(@F) $(INSTALL_PREFIX)/lib/
|
||||
endif
|
||||
|
||||
$(INSTALL_PREFIX)/lib/%.so: $(INSTALL_PREFIX)/lib/%-$(VERS_CONF).so
|
||||
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
$(SYMLINK) $(<F) $(@F)
|
||||
$(MV) $(@F) $(INSTALL_PREFIX)/lib/
|
||||
else
|
||||
@echo "Installing symlink $(@F) into $(INSTALL_PREFIX)/lib/"
|
||||
@$(SYMLINK) $(<F) $(@F)
|
||||
@$(MV) $(@F) $(INSTALL_PREFIX)/lib/
|
||||
endif
|
||||
|
||||
|
||||
# --- Query current configuration ---
|
||||
|
||||
@@ -569,6 +620,7 @@ ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
- $(FIND) $(BASE_OBJ_CONFIG_PATH) -name "*.o" | $(XARGS) $(RM_F)
|
||||
- $(FIND) $(BASE_OBJ_FRAME_PATH) -name "*.o" | $(XARGS) $(RM_F)
|
||||
- $(FIND) $(BASE_LIB_PATH) -name "*.a" | $(XARGS) $(RM_F)
|
||||
- $(FIND) $(BASE_LIB_PATH) -name "*.so" | $(XARGS) $(RM_F)
|
||||
else
|
||||
@echo "Removing .o files from $(BASE_OBJ_CONFIG_PATH)."
|
||||
@- $(FIND) $(BASE_OBJ_CONFIG_PATH) -name "*.o" | $(XARGS) $(RM_F)
|
||||
@@ -576,6 +628,8 @@ else
|
||||
@- $(FIND) $(BASE_OBJ_FRAME_PATH) -name "*.o" | $(XARGS) $(RM_F)
|
||||
@echo "Removing .a files from $(BASE_LIB_PATH)."
|
||||
@- $(FIND) $(BASE_LIB_PATH) -name "*.a" | $(XARGS) $(RM_F)
|
||||
@echo "Removing .so files from $(BASE_LIB_PATH)."
|
||||
@- $(FIND) $(BASE_LIB_PATH) -name "*.so" | $(XARGS) $(RM_F)
|
||||
endif
|
||||
|
||||
cleantest: check-env
|
||||
|
||||
201
build/bump-version.sh
Executable file
201
build/bump-version.sh
Executable file
@@ -0,0 +1,201 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name of The University of Texas nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
#
|
||||
# bump-version.sh
|
||||
#
|
||||
# Field G. Van Zee
|
||||
#
|
||||
|
||||
|
||||
#
# print_usage()
#
# Prints the usage/help text to standard output and then terminates the
# script with exit status 1 (this function never returns to its caller).
#
# The text refers to ${script_name}, which is NOT set here: it is a global
# assigned at the top of main(), so main() must have started running before
# this function is called.
#
print_usage()
{
	# Echo usage info. The script name is expanded inside the quotes so that
	# a name containing whitespace or glob characters is printed verbatim.
	echo " "
	echo " ${script_name}"
	echo " "
	echo " Field G. Van Zee"
	echo " "
	echo " Performs a series of actions needed when incrementing (bumping) the"
	echo " BLIS version number."
	echo " "
	echo " Usage:"
	echo " ${script_name} [options] new_vers"
	echo " "
	echo " Arguments:"
	echo " "
	echo " new_vers The new version string."
	echo " "
	echo " Options:"
	echo " "
	echo " -d dry-run"
	echo " Go through all the motions, but don't actually make any"
	echo " changes to files or perform any git commits."
	echo " -f VERSFILE version file name"
	echo " Update VERSFILE with new version string instead of default"
	echo " 'version' file."

	# Exit with non-zero exit status.
	exit 1
}
|
||||
|
||||
|
||||
#
# main()
#
# Bumps the BLIS version. In order, it:
#   1. writes the new version string to the version file and commits it,
#   2. tags the resulting commit with the new version string,
#   3. runs 'make changelog' and commits the CHANGELOG file.
#
# Arguments (after options): exactly one, the new version string; any other
# count prints the usage text and exits via print_usage().
# Options: -d (dry run: announce each step but change nothing),
#          -f VERSFILE (use VERSFILE instead of the default 'version'),
#          -h (print usage).
#
# Returns 0 on success, and also when no '.git' directory is found in the
# current directory (in which case nothing is done). Returns 1 as soon as a
# git query or a state-changing step fails, so that later steps -- notably
# tagging -- never act on a half-finished bump.
#
main()
{
	# -- BEGIN GLOBAL VARIABLE DECLARATIONS --

	# The name of the script, stripped of any preceding path.
	script_name=${0##*/}

	# The name of the CHANGELOG file.
	changelog_file='CHANGELOG'

	# The name of the default version file.
	version_file_def='version'

	# The name of the specified version file.
	version_file=''

	# Strings used during version query.
	git_commit_str=''
	new_version_str=''

	# The script name to use instead of the $0 when outputting messages.
	output_name=''

	# The git directory.
	gitdir='.git'

	# Whether we are performing a dry run or not.
	dry_run_flag=""

	# -- END GLOBAL VARIABLE DECLARATIONS --


	# Process our command line options. The leading ':' in the option string
	# selects getopts' silent mode, in which a missing option argument is
	# reported as ':' and an unrecognized option as '?'.
	while getopts ":dhf:" opt; do
		case $opt in
			d  ) dry_run_flag="1" ;;
			f  ) version_file=$OPTARG ;;
			h  ) print_usage ;;
			:  ) print_usage ;;
			\? ) print_usage ;;
		esac
	done
	shift $((OPTIND - 1))


	# If a version file name was not given, set version_file to the default
	# value.
	if [ -n "${version_file}" ]; then

		echo "${script_name}: version file specified: '${version_file}'."
	else

		echo "${script_name}: no version file specified; defaulting to '${version_file_def}'."
		version_file="${version_file_def}"
	fi


	# Check the number of arguments after command line option processing.
	if [ $# = "1" ]; then

		new_version_str=$1
		echo "${script_name}: preparing to bump to version '${new_version_str}'."

	else
		print_usage
	fi


	# Check if the .git dir exists; if it does not, we do nothing.
	if [ ! -d "${gitdir}" ]; then

		echo "${script_name}: could not find '${gitdir}' directory; bailing out."

		# Exit peacefully.
		return 0
	fi

	echo "${script_name}: found '${gitdir}' directory; assuming git clone."

	# Record the commit we are starting from.
	git_commit_str=$(git describe --always) || return 1
	echo "${script_name}: starting commit: ${git_commit_str}."

	# Overwrite the version file with the new version string.
	echo "${script_name}: updating version file '${version_file}'."
	if [ -z "$dry_run_flag" ]; then
		echo "${new_version_str}" > "${version_file}" || return 1
	fi

	# Commit the version file. If this fails (for instance, because there is
	# nothing to commit), stop here rather than tag an unrelated commit.
	echo "${script_name}: executing: git commit -m \"Version file update (${new_version_str})\" ${version_file}."
	if [ -z "$dry_run_flag" ]; then
		git commit -m "Version file update (${new_version_str})" "${version_file}" || return 1
	fi

	# Query the commit again; outside of a dry run this is the commit that
	# was just created.
	git_commit_str=$(git describe --always) || return 1
	echo "${script_name}: commit to be tagged: ${git_commit_str}."

	# Tag that commit with the new version string.
	echo "${script_name}: executing: git tag ${new_version_str} ${git_commit_str}."
	if [ -z "$dry_run_flag" ]; then
		git tag "${new_version_str}" "${git_commit_str}" || return 1
	fi

	# Regenerate the CHANGELOG via the top-level Makefile.
	echo "${script_name}: updating ${changelog_file}."
	if [ -z "$dry_run_flag" ]; then
		make changelog || return 1
	fi

	# Commit the regenerated CHANGELOG.
	echo "${script_name}: executing: git commit -m \"CHANGELOG update (${new_version_str})\" ${changelog_file}."
	if [ -z "$dry_run_flag" ]; then
		git commit -m "CHANGELOG update (${new_version_str})" "${changelog_file}" || return 1
	fi

	# Report where we ended up.
	git_commit_str=$(git describe --always) || return 1
	echo "${script_name}: latest commit: ${git_commit_str}."


	# Exit peacefully.
	return 0
}
|
||||
|
||||
|
||||
# The script's main entry point, passing all parameters given.
|
||||
main "$@"
|
||||
@@ -81,6 +81,7 @@ CC := gcc
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -O3 -mfloat-abi=hard -mfpu=vfpv3 -marm -march=armv7-a #-g
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := #-g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -marm -march=armv7-a -mfpu=vfpv3 -O3 -mfloat-abi=hard #-g
|
||||
@@ -90,9 +91,9 @@ CVECFLAGS := #-msse3 # -mfpmath=sse
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -100,6 +101,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
@@ -82,6 +82,7 @@ CC := /bgsys/drivers/ppcfloor/comm/gcc.legacy/bin/mpixlc_r
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L \
|
||||
-I/bgsys/drivers/ppcfloor -I/bgsys/drivers/ppcfloor/spi/include/kernel/cnk
|
||||
CMISCFLAGS := -qthreaded -qsmp=omp -qasm=gcc -qkeyword=asm # -qreport -qsource -qlistopt -qlist
|
||||
CPICFLAGS :=
|
||||
CDBGFLAGS :=
|
||||
CWARNFLAGS := -w
|
||||
COPTFLAGS := -O3
|
||||
@@ -91,9 +92,9 @@ CVECFLAGS := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunr
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
|
||||
@@ -81,6 +81,7 @@ CC := gcc
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -mfloat-abi=hard -mfpu=neon
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := -g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -march=armv7-a -mfpu=neon -O2
|
||||
@@ -90,9 +91,9 @@ CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -100,6 +101,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
@@ -81,6 +81,7 @@ CC := gcc
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -mfloat-abi=hard -mfpu=neon
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := -g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -march=armv7-a -mfpu=neon -O2 -mfloat-abi=hard
|
||||
@@ -90,9 +91,9 @@ CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -100,6 +101,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
@@ -81,6 +81,7 @@ CC := gcc
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 # -fopenmp -pg
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := #-g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer
|
||||
@@ -90,9 +91,9 @@ CVECFLAGS := -msse3 -march=native
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -100,6 +101,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
@@ -81,6 +81,7 @@ CC := gcc
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -mabi=64
|
||||
CMISCFLAGS := -std=c99 -fopenmp #-pg
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := -g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O3 -march=loongson3a -mtune=loongson3a
|
||||
@@ -90,9 +91,9 @@ CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -100,6 +101,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
@@ -79,6 +79,7 @@ GIT_LOG := $(GIT) log --decorate
|
||||
CC := icc
|
||||
CPPROCFLAGS :=
|
||||
CMISCFLAGS := -mmic -fasm-blocks -std=c99 -openmp
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS :=
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O3
|
||||
@@ -88,9 +89,9 @@ CVECFLAGS :=
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -98,6 +99,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -mmic -lm -openmp
|
||||
|
||||
|
||||
|
||||
@@ -81,6 +81,7 @@ CC := gcc
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -fopenmp
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := -g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O0 -malign-double -funroll-all-loops
|
||||
@@ -90,9 +91,9 @@ CVECFLAGS := -mavx -mfma -march=bdver2 -mfpmath=sse
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -100,6 +101,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
165
config/pnacl/bli_config.h
Normal file
165
config/pnacl/bli_config.h
Normal file
@@ -0,0 +1,165 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_CONFIG_H
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
|
||||
// -- OPERATING SYSTEM ---------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
// -- INTEGER PROPERTIES -------------------------------------------------------
|
||||
|
||||
// The bit size of the integer type used to track values such as dimensions,
|
||||
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
|
||||
// integers while 64 results in 64-bit integers. Any other value results in use
|
||||
// of the C99 type "long int". Note that this ONLY affects integers used
|
||||
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
|
||||
// interface.
|
||||
#define BLIS_INT_TYPE_SIZE 32
|
||||
|
||||
|
||||
|
||||
// -- FLOATING-POINT PROPERTIES ------------------------------------------------
|
||||
|
||||
// Define the number of floating-point types supported, and the size of the
|
||||
// largest type.
|
||||
#define BLIS_NUM_FP_TYPES 4
|
||||
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
|
||||
|
||||
// Enable use of built-in C99 "float complex" and "double complex" types and
|
||||
// associated overloaded operations and functions? Disabling results in
|
||||
// scomplex and dcomplex being defined in terms of simple structs.
|
||||
//#define BLIS_ENABLE_C99_COMPLEX
|
||||
|
||||
|
||||
|
||||
// -- MULTITHREADING -----------------------------------------------------------
|
||||
|
||||
// The maximum number of BLIS threads that will run concurrently.
|
||||
#define BLIS_MAX_NUM_THREADS 1
|
||||
|
||||
|
||||
|
||||
// -- MEMORY ALLOCATION --------------------------------------------------------
|
||||
|
||||
// -- Contiguous (static) memory allocator --
|
||||
|
||||
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
|
||||
// contiguous memory pools.
|
||||
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
|
||||
#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS
|
||||
#define BLIS_NUM_MC_X_NC_BLOCKS 0
|
||||
|
||||
// The maximum preload byte offset is used to pad the end of the contiguous
|
||||
// memory pools so that the micro-kernel, when computing with the end of the
|
||||
// last block, can exceed the bounds of the usable portion of the memory
|
||||
// region without causing a segmentation fault.
|
||||
#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
|
||||
|
||||
// -- Memory alignment --
|
||||
|
||||
// It is sometimes useful to define the various memory alignments in terms
|
||||
// of some other characteristics of the system, such as the cache line size
|
||||
// and the page size.
|
||||
#define BLIS_CACHE_LINE_SIZE 64
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
// instructions.
|
||||
#define BLIS_SIMD_ALIGN_SIZE 16
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
// Alignment size used when allocating memory dynamically from the operating
|
||||
// system (eg: posix_memalign()). To disable heap alignment and just use
|
||||
// malloc() instead, set this to 1.
|
||||
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
// Alignment size used when sizing leading dimensions of dynamically
|
||||
// allocated memory.
|
||||
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
|
||||
|
||||
// Alignment size used when allocating entire blocks of contiguous memory
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
// Basic (homogeneous) datatype support always enabled.
|
||||
|
||||
// Enable mixed domain operations?
|
||||
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
|
||||
// Enable extra mixed precision operations?
|
||||
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
|
||||
|
||||
|
||||
// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
|
||||
|
||||
// Stay initialized after auto-initialization, unless and until the user
|
||||
// explicitly calls bli_finalize().
|
||||
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
|
||||
|
||||
|
||||
|
||||
// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
|
||||
|
||||
// Enable the BLAS compatibility layer?
|
||||
#define BLIS_ENABLE_BLAS2BLIS
|
||||
|
||||
// The bit size of the integer type used to track values such as dimensions and
|
||||
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
|
||||
// A value of 32 results in the compatibility layer using 32-bit signed integers
|
||||
// while 64 results in 64-bit integers. Any other value results in use of the
|
||||
// C99 type "long int". Note that this ONLY affects integers used within the
|
||||
// BLAS compatibility layer.
|
||||
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
|
||||
|
||||
// Fortran-77 name-mangling macros.
|
||||
#define PASTEF770(name) name ## _
|
||||
#define PASTEF77(ch1,name) ch1 ## name ## _
|
||||
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
251
config/pnacl/bli_kernel.h
Normal file
251
config/pnacl/bli_kernel.h
Normal file
@@ -0,0 +1,251 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_KERNEL_H
|
||||
#define BLIS_KERNEL_H
|
||||
|
||||
/*
|
||||
* SIMD-enabled (SP only) PNaCl shipped in Chrome 36 and it is not backward-compatible.
|
||||
* Therefore, if compilation targets an older Chrome release, we use scalar kernels.
|
||||
* The target Chrome version is indicated by the PPAPI_RELEASE macro defined in the header below.
|
||||
*/
|
||||
#include <ppapi/c/pp_macros.h>
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
//
|
||||
// (1) MC must be a multiple of:
|
||||
// (a) MR (for zero-padding purposes)
|
||||
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (2) NC must be a multiple of
|
||||
// (a) NR (for zero-padding purposes)
|
||||
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (3) KC must be a multiple of
|
||||
// (a) MR and
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
#define BLIS_DEFAULT_MC_S 256
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 8192
|
||||
#else
|
||||
#define BLIS_DEFAULT_MC_S 252
|
||||
#define BLIS_DEFAULT_KC_S 264
|
||||
#define BLIS_DEFAULT_NC_S 8196
|
||||
#endif
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 1080
|
||||
#define BLIS_DEFAULT_KC_D 120
|
||||
#define BLIS_DEFAULT_NC_D 8400
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
#define BLIS_DEFAULT_MC_C 128
|
||||
#define BLIS_DEFAULT_KC_C 256
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
#else
|
||||
#define BLIS_DEFAULT_MC_C 120
|
||||
#define BLIS_DEFAULT_KC_C 264
|
||||
#define BLIS_DEFAULT_NC_C 4092
|
||||
#endif
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 60
|
||||
#define BLIS_DEFAULT_KC_Z 264
|
||||
#define BLIS_DEFAULT_NC_Z 2040
|
||||
|
||||
// -- Register blocksizes --
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
#else
|
||||
#define BLIS_DEFAULT_MR_S 4
|
||||
#define BLIS_DEFAULT_NR_S 3
|
||||
#endif
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 4
|
||||
#define BLIS_DEFAULT_NR_D 3
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
#define BLIS_DEFAULT_MR_C 4
|
||||
#define BLIS_DEFAULT_NR_C 4
|
||||
#else
|
||||
#define BLIS_DEFAULT_MR_C 2
|
||||
#define BLIS_DEFAULT_NR_C 3
|
||||
#endif
|
||||
|
||||
#define BLIS_DEFAULT_MR_Z 2
|
||||
#define BLIS_DEFAULT_NR_Z 3
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
// NOTE: These register blocksize "extensions" determine whether the
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt
|
||||
#define BLIS_CGEMM_UKERNEL bli_cgemm_opt
|
||||
#endif
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- packm --
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- addv --
|
||||
|
||||
// -- axpyv --
|
||||
#if PPAPI_RELEASE >= 36
|
||||
#define BLIS_SAXPYV_KERNEL bli_saxpyv_opt
|
||||
#define BLIS_CAXPYV_KERNEL bli_caxpyv_opt
|
||||
#endif
|
||||
|
||||
// -- copyv --
|
||||
|
||||
// -- dotv --
|
||||
#define BLIS_SDOTV_KERNEL bli_sdotv_opt
|
||||
#define BLIS_DDOTV_KERNEL bli_ddotv_opt
|
||||
#define BLIS_CDOTV_KERNEL bli_cdotv_opt
|
||||
#define BLIS_ZDOTV_KERNEL bli_zdotv_opt
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
// -- invertv --
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
// -- scalv --
|
||||
|
||||
// -- setv --
|
||||
|
||||
// -- subv --
|
||||
|
||||
// -- swapv --
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
1
config/pnacl/kernels
Symbolic link
1
config/pnacl/kernels
Symbolic link
@@ -0,0 +1 @@
|
||||
../../kernels/nacl/pnacl
|
||||
119
config/pnacl/make_defs.mk
Normal file
119
config/pnacl/make_defs.mk
Normal file
@@ -0,0 +1,119 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name of The University of Texas nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
# Only include this block of code once.
|
||||
ifndef MAKE_DEFS_MK_INCLUDED
|
||||
MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := pnacl-ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
|
||||
# --- Determine the C compiler and related flags ---
|
||||
CC := pnacl-clang
|
||||
# Enable IEEE Standard 1003.1-2001 (POSIX.1-2001) definitions.
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=gnu11 -I$(NACL_SDK_ROOT)/include
|
||||
CPICFLAGS :=
|
||||
CDBGFLAGS := -g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O3
|
||||
CKOPTFLAGS := $(COPTFLAGS) -ffast-math
|
||||
CVECFLAGS :=
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := pnacl-ar
|
||||
ARFLAGS := rcs
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS :=
|
||||
LDFLAGS := -lm
|
||||
|
||||
# --- Determine the finalizer and related flags ---
|
||||
FINALIZER := pnacl-finalize
|
||||
FINFLAGS :=
|
||||
|
||||
# --- Determine the translator and related flags ---
|
||||
TRANSLATOR := pnacl-translate
|
||||
TRNSFLAGS := -O3
|
||||
TRNSAMD64FLAGS := -arch x86-64
|
||||
TRNSX86FLAGS := -arch i686
|
||||
TRNSARMFLAGS := -arch armv7
|
||||
|
||||
# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
|
||||
endif
|
||||
@@ -80,8 +80,8 @@ CC := gcc
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 # -fopenmp -pg
|
||||
CMISCFLAGS += -m64 -mcpu=power7
|
||||
CMISCFLAGS := -std=c99 -m64 -mcpu=power7 #-fopenmp -pg
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := -g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O3 -mtune=power7
|
||||
@@ -91,9 +91,9 @@ CVECFLAGS := -mvsx
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -101,6 +101,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
@@ -81,7 +81,8 @@ CC := gcc
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 # -fopenmp -pg
|
||||
CDBGFLAGS := -g
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := #-g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O2
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
@@ -90,9 +91,9 @@ CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -100,6 +101,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
@@ -81,6 +81,7 @@ CC := gcc
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -m64 -fopenmp # -fopenmp -pg
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := #-g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O3 -march=native
|
||||
@@ -90,9 +91,9 @@ CVECFLAGS := -mavx -mfpmath=sse #-msse3 -march=native # -mfpmath=sse
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -100,6 +101,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
@@ -81,6 +81,7 @@ CC := gcc
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 # -fopenmp -pg
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := -g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O2
|
||||
@@ -90,9 +91,9 @@ CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -100,6 +101,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
@@ -38,6 +38,8 @@
|
||||
#include "bli_gemm_int.h"
|
||||
#include "bli_gemm_target.h"
|
||||
|
||||
#include "bli_gemm_ukernel.h"
|
||||
|
||||
#include "bli_gemm_blk_var1f.h"
|
||||
#include "bli_gemm_blk_var2f.h"
|
||||
#include "bli_gemm_blk_var3f.h"
|
||||
|
||||
126
frame/3/gemm/bli_gemm_ukernel.c
Normal file
126
frame/3/gemm/bli_gemm_ukernel.c
Normal file
@@ -0,0 +1,126 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_ukr_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a,
|
||||
void* b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,gemm_ukernel_void);
|
||||
|
||||
|
||||
// Object-based front-end for calling a gemm micro-kernel wrapper.
//
// Unpacks the obj_t operands into raw buffers, strides, and scalar
// addresses, fills an auxinfo_t, and calls the entry of ftypes selected by
// the datatype of c.
//
//   alpha, beta - scalar objects; their buffers are fetched with
//                 bli_obj_buffer_for_1x1() using the datatype of c.
//   a, b        - operand objects; their buffers and panel strides are
//                 forwarded, and k is read from the width of a.
//   c           - operand object that supplies the datatype, the buffer,
//                 and the row and column strides.
void bli_gemm_ukernel( obj_t*  alpha,
                       obj_t*  a,
                       obj_t*  b,
                       obj_t*  beta,
                       obj_t*  c )
{
    // The datatype of c selects the wrapper and is also used when fetching
    // the scalar buffers.
    num_t     dt_c    = bli_obj_datatype( *c );

    // Scalars.
    void*     alpha_p = bli_obj_buffer_for_1x1( dt_c, *alpha );
    void*     beta_p  = bli_obj_buffer_for_1x1( dt_c, *beta );

    // Operands a and b: buffer addresses and panel strides.
    void*     a_p     = bli_obj_buffer_at_off( *a );
    inc_t     a_ps    = bli_obj_panel_stride( *a );
    void*     b_p     = bli_obj_buffer_at_off( *b );
    inc_t     b_ps    = bli_obj_panel_stride( *b );

    // The k dimension is taken from the width of a.
    dim_t     k       = bli_obj_width( *a );

    // Operand c: buffer address and strides.
    void*     c_p     = bli_obj_buffer_at_off( *c );
    inc_t     c_rs    = bli_obj_row_stride( *c );
    inc_t     c_cs    = bli_obj_col_stride( *c );

    auxinfo_t aux;

    // Populate the auxinfo_t struct in case the micro-kernel reads it. The
    // "next" a and b addresses are set to the current a and b buffers.
    bli_auxinfo_set_next_a( a_p, aux );
    bli_auxinfo_set_next_b( b_p, aux );
    bli_auxinfo_set_ps_a( a_ps, aux );
    bli_auxinfo_set_ps_b( b_ps, aux );

    // Look up the wrapper for this datatype and invoke it.
    ftypes[dt_c]( k,
                  alpha_p,
                  a_p,
                  b_p,
                  beta_p,
                  c_p, c_rs, c_cs,
                  &aux );
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, ukrname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, \
|
||||
void* b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha, \
|
||||
a, \
|
||||
b, \
|
||||
beta, \
|
||||
c, rs_c, cs_c, \
|
||||
data ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( gemm_ukernel_void, GEMM_UKERNEL )
|
||||
|
||||
60
frame/3/gemm/bli_gemm_ukernel.h
Normal file
60
frame/3/gemm/bli_gemm_ukernel.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Object-based interface for invoking a gemm micro-kernel; takes the two
// scalar objects (alpha, beta) and the three operand objects (a, b, c).
void bli_gemm_ukernel
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c
     );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, \
|
||||
void* b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_ukernel_void )
|
||||
|
||||
141
frame/3/trsm/bli_gemmtrsm_ukernel.c
Normal file
141
frame/3/trsm/bli_gemmtrsm_ukernel.c
Normal file
@@ -0,0 +1,141 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef FUNCPTR_T
|
||||
#define FUNCPTR_T gemmtrsm_ukr_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a1x,
|
||||
void* a11,
|
||||
void* bx1,
|
||||
void* b11,
|
||||
void* c11, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes_l,gemmtrsm_l_ukernel_void);
|
||||
static FUNCPTR_T GENARRAY(ftypes_u,gemmtrsm_u_ukernel_void);
|
||||
|
||||
|
||||
//
// Object-based front end for the fused gemm+trsm micro-kernel.
//
// Extracts raw buffers and strides from the obj_t operands, fills an
// auxinfo_t, selects the lower- or upper-triangular void-pointer wrapper for
// the datatype of c11, and invokes it.
//
//   alpha  scalar; its buffer is obtained for the datatype of c11.
//   a1x    supplies k (its width), a buffer, and the panel stride for A.
//   a11    supplies a buffer; its uplo property selects lower vs. upper.
//   bx1    supplies a buffer and the panel stride for B.
//   b11    supplies a buffer.
//   c11    supplies the datatype, a buffer, and row/column strides.
//
void bli_gemmtrsm_ukernel( obj_t*  alpha,
                           obj_t*  a1x,
                           obj_t*  a11,
                           obj_t*  bx1,
                           obj_t*  b11,
                           obj_t*  c11 )
{
	// The datatype of c11 drives both the table lookup and the view of alpha.
	num_t     dt        = bli_obj_datatype( *c11 );
	dim_t     k         = bli_obj_width( *a1x );

	// Raw operand buffers.
	void*     buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha );
	void*     buf_a1x   = bli_obj_buffer_at_off( *a1x );
	void*     buf_a11   = bli_obj_buffer_at_off( *a11 );
	void*     buf_bx1   = bli_obj_buffer_at_off( *bx1 );
	void*     buf_b11   = bli_obj_buffer_at_off( *b11 );
	void*     buf_c11   = bli_obj_buffer_at_off( *c11 );

	// Strides of the output block and of the packed panels.
	inc_t     rs_c      = bli_obj_row_stride( *c11 );
	inc_t     cs_c      = bli_obj_col_stride( *c11 );
	inc_t     ps_a      = bli_obj_panel_stride( *a1x );
	inc_t     ps_b      = bli_obj_panel_stride( *bx1 );

	auxinfo_t data;
	FUNCPTR_T ukr;

	// The uplo of a11 decides both which address is recorded as the "next"
	// panel of A in the auxinfo_t and which wrapper table is consulted.
	if ( bli_obj_is_lower( *a11 ) )
	{
		bli_auxinfo_set_next_a( buf_a1x, data );
		ukr = ftypes_l[dt];
	}
	else
	{
		bli_auxinfo_set_next_a( buf_a11, data );
		ukr = ftypes_u[dt];
	}

	// Fill the remaining auxinfo_t fields in case the micro-kernel uses them.
	bli_auxinfo_set_next_b( buf_bx1, data );
	bli_auxinfo_set_ps_a( ps_a, data );
	bli_auxinfo_set_ps_b( ps_b, data );

	// Invoke the selected wrapper.
	ukr( k,
	     buf_alpha,
	     buf_a1x,
	     buf_a11,
	     buf_bx1,
	     buf_b11,
	     buf_c11, rs_c, cs_c,
	     &data );
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, ukrname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a1x, \
|
||||
void* a11, \
|
||||
void* bx1, \
|
||||
void* b11, \
|
||||
void* c11, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha, \
|
||||
a1x, \
|
||||
a11, \
|
||||
bx1, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c, \
|
||||
data ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( gemmtrsm_l_ukernel_void, GEMMTRSM_L_UKERNEL )
|
||||
INSERT_GENTFUNC_BASIC( gemmtrsm_u_ukernel_void, GEMMTRSM_U_UKERNEL )
|
||||
|
||||
63
frame/3/trsm/bli_gemmtrsm_ukernel.h
Normal file
63
frame/3/trsm/bli_gemmtrsm_ukernel.h
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bli_gemmtrsm_ukernel( obj_t* alpha,
|
||||
obj_t* a1x,
|
||||
obj_t* a11,
|
||||
obj_t* bx1,
|
||||
obj_t* b11,
|
||||
obj_t* c11 );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a1x, \
|
||||
void* a11, \
|
||||
void* bx1, \
|
||||
void* b11, \
|
||||
void* c11, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_l_ukernel_void )
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_u_ukernel_void )
|
||||
|
||||
@@ -37,6 +37,9 @@
|
||||
#include "bli_trsm_front.h"
|
||||
#include "bli_trsm_int.h"
|
||||
|
||||
#include "bli_gemmtrsm_ukernel.h"
|
||||
#include "bli_trsm_ukernel.h"
|
||||
|
||||
#include "bli_trsm_blk_var1f.h"
|
||||
#include "bli_trsm_blk_var1b.h"
|
||||
|
||||
|
||||
111
frame/3/trsm/bli_trsm_ukernel.c
Normal file
111
frame/3/trsm/bli_trsm_ukernel.c
Normal file
@@ -0,0 +1,111 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef FUNCPTR_T
|
||||
#define FUNCPTR_T trsm_ukr_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
void* a,
|
||||
void* b,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes_l,trsm_l_ukernel_void);
|
||||
static FUNCPTR_T GENARRAY(ftypes_u,trsm_u_ukernel_void);
|
||||
|
||||
|
||||
//
// Object-based front end for the trsm micro-kernel.
//
// Extracts raw buffers and strides from the obj_t operands, fills an
// auxinfo_t, selects the lower- or upper-triangular void-pointer wrapper for
// the datatype of c, and invokes it.
//
//   a  supplies a buffer, the panel stride for A, and the uplo property
//      that selects lower vs. upper.
//   b  supplies a buffer and the panel stride for B.
//   c  supplies the datatype, a buffer, and row/column strides.
//
void bli_trsm_ukernel( obj_t*  a,
                       obj_t*  b,
                       obj_t*  c )
{
	// The datatype of c drives the table lookup.
	num_t     dt    = bli_obj_datatype( *c );

	// Raw operand buffers.
	void*     buf_a = bli_obj_buffer_at_off( *a );
	void*     buf_b = bli_obj_buffer_at_off( *b );
	void*     buf_c = bli_obj_buffer_at_off( *c );

	// Strides of the output block and of the packed panels.
	inc_t     rs_c  = bli_obj_row_stride( *c );
	inc_t     cs_c  = bli_obj_col_stride( *c );
	inc_t     ps_a  = bli_obj_panel_stride( *a );
	inc_t     ps_b  = bli_obj_panel_stride( *b );

	auxinfo_t data;
	FUNCPTR_T ukr;

	// Fill the auxinfo_t struct in case the micro-kernel uses it. The
	// current buffers are recorded as the "next" addresses.
	bli_auxinfo_set_next_a( buf_a, data );
	bli_auxinfo_set_next_b( buf_b, data );
	bli_auxinfo_set_ps_a( ps_a, data );
	bli_auxinfo_set_ps_b( ps_b, data );

	// Choose the wrapper table from the uplo of a, then index by datatype.
	if ( bli_obj_is_lower( *a ) )
	{
		ukr = ftypes_l[dt];
	}
	else
	{
		ukr = ftypes_u[dt];
	}

	// Invoke the selected wrapper.
	ukr( buf_a,
	     buf_b,
	     buf_c, rs_c, cs_c,
	     &data );
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, ukrname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
void* a, \
|
||||
void* b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC(ch,ukrname)( a, \
|
||||
b, \
|
||||
c, rs_c, cs_c, \
|
||||
data ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trsm_l_ukernel_void, TRSM_L_UKERNEL )
|
||||
INSERT_GENTFUNC_BASIC( trsm_u_ukernel_void, TRSM_U_UKERNEL )
|
||||
|
||||
56
frame/3/trsm/bli_trsm_ukernel.h
Normal file
56
frame/3/trsm/bli_trsm_ukernel.h
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bli_trsm_ukernel( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
void* a, \
|
||||
void* b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_l_ukernel_void )
|
||||
INSERT_GENTPROT_BASIC( trsm_u_ukernel_void )
|
||||
|
||||
@@ -43,6 +43,9 @@ void* bli_malloc( siz_t size )
|
||||
|
||||
#if BLIS_HEAP_ADDR_ALIGN_SIZE == 1
|
||||
p = malloc( ( size_t )size );
|
||||
#elif defined(_WIN32)
|
||||
p = _aligned_malloc( ( size_t )size,
|
||||
( size_t )BLIS_HEAP_ADDR_ALIGN_SIZE );
|
||||
#else
|
||||
r_val = posix_memalign( &p,
|
||||
( size_t )BLIS_HEAP_ADDR_ALIGN_SIZE,
|
||||
@@ -58,6 +61,10 @@ void* bli_malloc( siz_t size )
|
||||
|
||||
// Releases a block obtained from bli_malloc(). The deallocator must match
// the allocator chosen there: only the Windows aligned-allocation path
// (_aligned_malloc) needs _aligned_free(); memory from malloc() or
// posix_memalign() is returned with free().
void bli_free( void* p )
{
#if defined(_WIN32) && BLIS_HEAP_ADDR_ALIGN_SIZE != 1
	_aligned_free( p );
#else
	free( p );
#endif
}
|
||||
|
||||
|
||||
203
kernels/nacl/pnacl/1/bli_axpyv_opt.c
Normal file
203
kernels/nacl/pnacl/1/bli_axpyv_opt.c
Normal file
@@ -0,0 +1,203 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#if PPAPI_RELEASE >= 36
/* Four packed single-precision floats (GCC/Clang vector extension). */
typedef float v4sf __attribute__ ((vector_size(16)));

/*
 * Small vector helpers. They are `static inline` rather than plain `inline`:
 * under C99 semantics a plain `inline` definition provides no external
 * definition, so any call the compiler chooses not to inline (e.g. at -O0)
 * fails to link. Internal linkage also keeps these names private to this
 * translation unit.
 *
 * NOTE(review): the load/store helpers dereference a v4sf pointer, so the
 * address presumably has to satisfy v4sf's alignment; the callers in this
 * file do not check it -- confirm against the PNaCl vector rules.
 */

/* Returns a vector with x in all four lanes. */
static inline v4sf v4sf_splat(float x) {
	return (v4sf) { x, x, x, x };
}

/* Loads four consecutive floats starting at a. */
static inline v4sf v4sf_load(const float* a) {
	return *((const v4sf*)a);
}

/* Loads two consecutive scomplex values starting at a as one vector. */
static inline v4sf v4sf_cload(const scomplex* a) {
	return *((const v4sf*)a);
}

/* Stores the four lanes of x to a[0..3]. */
static inline void v4sf_store(float* a, v4sf x) {
	*((v4sf*)a) = x;
}

/* Stores the four lanes of x over two consecutive scomplex values at a. */
static inline void v4sf_cstore(scomplex* a, v4sf x) {
	*((v4sf*)a) = x;
}

/* Returns the all-zero vector. */
static inline v4sf v4sf_zero(void) {
	return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
}
#endif
|
||||
|
||||
|
||||
void bli_saxpyv_opt(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
float alpha[restrict static 1],
|
||||
float x[restrict static n],
|
||||
inc_t incx,
|
||||
float y[restrict static n],
|
||||
inc_t incy)
|
||||
{
|
||||
if (bli_zero_dim1(n)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (bli_seq0(*alpha)) {
|
||||
return;
|
||||
}
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
if (!bli_has_nonunit_inc2(incx, incy)) {
|
||||
const v4sf alphav = v4sf_splat(*alpha);
|
||||
while (n >= 4) {
|
||||
const v4sf xv = v4sf_load(x);
|
||||
v4sf yv = v4sf_load(y);
|
||||
yv += xv * alphav;
|
||||
v4sf_store(y, yv);
|
||||
|
||||
x += 4;
|
||||
y += 4;
|
||||
n -= 4;
|
||||
}
|
||||
const float alphac = *alpha;
|
||||
while (n--) {
|
||||
(*y++) += (*x++) * alphac;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
/* Just call the reference implementation. */
|
||||
BLIS_SAXPYV_KERNEL_REF(
|
||||
conjx,
|
||||
n,
|
||||
alpha,
|
||||
x,
|
||||
incx,
|
||||
y,
|
||||
incy);
|
||||
}
|
||||
|
||||
|
||||
void bli_caxpyv_opt(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
scomplex alpha[restrict static 1],
|
||||
scomplex x[restrict static n],
|
||||
inc_t incx,
|
||||
scomplex y[restrict static n],
|
||||
inc_t incy)
|
||||
{
|
||||
if (bli_zero_dim1(n)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (bli_ceq0(*alpha)) {
|
||||
return;
|
||||
}
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
if (!bli_has_nonunit_inc2(incx, incy)) {
|
||||
if (bli_is_noconj(conjx)) {
|
||||
const v4sf alphav0 = v4sf_splat(alpha->real);
|
||||
const v4sf alphav1 = (v4sf) { -alpha->imag, alpha->imag, -alpha->imag, alpha->imag };
|
||||
while (n >= 2) {
|
||||
const v4sf xv0 = v4sf_cload(x);
|
||||
v4sf yv = v4sf_cload(y);
|
||||
const v4sf xv1 = __builtin_shufflevector(xv0, xv0, 1, 0, 3, 2);
|
||||
yv += xv0 * alphav0 + xv1 * alphav1;
|
||||
v4sf_cstore(y, yv);
|
||||
|
||||
x += 2;
|
||||
y += 2;
|
||||
n -= 2;
|
||||
}
|
||||
const float alphar = alpha->real;
|
||||
const float alphai = alpha->imag;
|
||||
while (n--) {
|
||||
const float xr = x->real;
|
||||
const float xi = x->imag;
|
||||
const float yr = y->real;
|
||||
const float yi = y->imag;
|
||||
|
||||
y->real = yr + xr * alphar - xi * alphai;
|
||||
y->imag = yi + xr * alphai + xi * alphar;
|
||||
|
||||
x += 1;
|
||||
y += 1;
|
||||
}
|
||||
} else {
|
||||
const v4sf alphav0 = (v4sf) { alpha->real, -alpha->real, alpha->real, -alpha->real };
|
||||
const v4sf alphav1 = v4sf_splat(alpha->imag);
|
||||
while (n >= 2) {
|
||||
const v4sf xv0 = v4sf_cload(x);
|
||||
v4sf yv = v4sf_cload(y);
|
||||
const v4sf xv1 = __builtin_shufflevector(xv0, xv0, 1, 0, 3, 2);
|
||||
yv += xv0 * alphav0 + xv1 * alphav1;
|
||||
v4sf_cstore(y, yv);
|
||||
|
||||
x += 2;
|
||||
y += 2;
|
||||
n -= 2;
|
||||
}
|
||||
const float alphar = alpha->real;
|
||||
const float alphai = alpha->imag;
|
||||
while (n--) {
|
||||
const float xr = x->real;
|
||||
const float xi = x->imag;
|
||||
const float yr = y->real;
|
||||
const float yi = y->imag;
|
||||
|
||||
y->real = yr + xr * alphar + xi * alphai;
|
||||
y->imag = yi + xr * alphai - xi * alphar;
|
||||
|
||||
x += 1;
|
||||
y += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* Just call the reference implementation. */
|
||||
BLIS_CAXPYV_KERNEL_REF(
|
||||
conjx,
|
||||
n,
|
||||
alpha,
|
||||
x,
|
||||
incx,
|
||||
y,
|
||||
incy);
|
||||
}
|
||||
618
kernels/nacl/pnacl/1/bli_dotv_opt.c
Normal file
618
kernels/nacl/pnacl/1/bli_dotv_opt.c
Normal file
@@ -0,0 +1,618 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#if PPAPI_RELEASE >= 36
/* Four packed single-precision floats (GCC/Clang vector extension). */
typedef float v4sf __attribute__ ((vector_size(16)));

/*
 * Small vector helpers. They are `static inline` rather than plain `inline`:
 * under C99 semantics a plain `inline` definition provides no external
 * definition, so any call the compiler chooses not to inline (e.g. at -O0)
 * fails to link. Internal linkage also keeps these names private to this
 * translation unit.
 *
 * NOTE(review): the load/store helpers dereference a v4sf pointer, so the
 * address presumably has to satisfy v4sf's alignment; the callers in this
 * file do not check it -- confirm against the PNaCl vector rules.
 */

/* Returns a vector with x in all four lanes. */
static inline v4sf v4sf_splat(float x) {
	return (v4sf) { x, x, x, x };
}

/* Loads four consecutive floats starting at a. */
static inline v4sf v4sf_load(const float* a) {
	return *((const v4sf*)a);
}

/* Loads two consecutive scomplex values starting at a as one vector. */
static inline v4sf v4sf_cload(const scomplex* a) {
	return *((const v4sf*)a);
}

/* Stores the four lanes of x to a[0..3]. */
static inline void v4sf_store(float* a, v4sf x) {
	*((v4sf*)a) = x;
}

/* Stores the four lanes of x over two consecutive scomplex values at a. */
static inline void v4sf_cstore(scomplex* a, v4sf x) {
	*((v4sf*)a) = x;
}

/* Returns the all-zero vector. */
static inline v4sf v4sf_zero(void) {
	return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
}
#endif
|
||||
|
||||
void bli_sdotv_opt(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float x[restrict static n],
|
||||
inc_t incx,
|
||||
float y[restrict static n],
|
||||
inc_t incy,
|
||||
float rho[restrict static 1])
|
||||
{
|
||||
#if PPAPI_RELEASE >= 36
|
||||
// If the vector lengths are zero, set rho to zero and return.
|
||||
if (bli_zero_dim1(n)) {
|
||||
*rho = 0.0f;
|
||||
return;
|
||||
}
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if (bli_has_nonunit_inc2(incx, incy)) {
|
||||
float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f, sum5 = 0.0f;
|
||||
while (n >= 6) {
|
||||
sum0 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum1 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum2 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum3 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum4 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum5 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 6;
|
||||
}
|
||||
float sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5);
|
||||
while (n--) {
|
||||
sum += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
*rho = sum;
|
||||
} else {
|
||||
v4sf vsum0 = v4sf_zero(), vsum1 = v4sf_zero(), vsum2 = v4sf_zero();
|
||||
v4sf vsum3 = v4sf_zero(), vsum4 = v4sf_zero(), vsum5 = v4sf_zero();
|
||||
while (n >= 24) {
|
||||
vsum0 += v4sf_load(x) * v4sf_load(y);
|
||||
vsum1 += v4sf_load(x+4) * v4sf_load(y+4);
|
||||
vsum2 += v4sf_load(x+8) * v4sf_load(y+8);
|
||||
vsum3 += v4sf_load(x+12) * v4sf_load(y+12);
|
||||
vsum4 += v4sf_load(x+16) * v4sf_load(y+16);
|
||||
vsum5 += v4sf_load(x+20) * v4sf_load(y+20);
|
||||
|
||||
x += 24;
|
||||
y += 24;
|
||||
n -= 24;
|
||||
}
|
||||
v4sf vsum = (vsum0 + vsum1 + vsum2) + (vsum3 + vsum4 + vsum5);
|
||||
while (n >= 4) {
|
||||
vsum += v4sf_load(x) * v4sf_load(y);
|
||||
|
||||
x += 4;
|
||||
y += 4;
|
||||
n -= 4;
|
||||
}
|
||||
float sum = (vsum[0] + vsum[1]) + (vsum[2] + vsum[3]);
|
||||
while (n--) {
|
||||
sum += (*x++) * (*y++);
|
||||
}
|
||||
*rho = sum;
|
||||
}
|
||||
#else
|
||||
float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f, sum5 = 0.0f;
|
||||
while (n >= 6) {
|
||||
sum0 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum1 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum2 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum3 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum4 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum5 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 6;
|
||||
}
|
||||
float sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5);
|
||||
while (n--) {
|
||||
sum += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
*rho = sum;
|
||||
#endif
|
||||
}
|
||||
|
||||
void bli_ddotv_opt(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double x[restrict static n],
|
||||
inc_t incx,
|
||||
double y[restrict static n],
|
||||
inc_t incy,
|
||||
double rho[restrict static 1])
|
||||
{
|
||||
double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0;
|
||||
while (n >= 6) {
|
||||
sum0 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum1 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum2 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum3 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum4 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
sum5 += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 6;
|
||||
}
|
||||
double sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5);
|
||||
while (n--) {
|
||||
sum += (*x) * (*y);
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
*rho = sum;
|
||||
}
|
||||
|
||||
void bli_cdotv_opt(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex x[restrict static n],
|
||||
inc_t incx,
|
||||
scomplex y[restrict static n],
|
||||
inc_t incy,
|
||||
scomplex rho[restrict static 1])
|
||||
{
|
||||
if (bli_is_conj(conjy)) {
|
||||
bli_toggle_conj(conjx);
|
||||
}
|
||||
|
||||
if (bli_zero_dim1(n)) {
|
||||
rho->real = 0.0f;
|
||||
rho->imag = 0.0f;
|
||||
return;
|
||||
}
|
||||
|
||||
float sumr;
|
||||
float sumi;
|
||||
#if PPAPI_RELEASE >= 36
|
||||
if (bli_is_noconj(conjx)) {
|
||||
if (bli_has_nonunit_inc2(incx, incy)) {
|
||||
float sum0r = 0.0f, sum1r = 0.0f;
|
||||
float sum0i = 0.0f, sum1i = 0.0f;
|
||||
while (n >= 2) {
|
||||
const float x0r = x->real;
|
||||
const float x0i = x->imag;
|
||||
const float y0r = y->real;
|
||||
const float y0i = y->imag;
|
||||
|
||||
sum0r += x0r * y0r - x0i * y0i;
|
||||
sum0i += x0r * y0i + x0i * y0r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
const float x1r = x->real;
|
||||
const float x1i = x->imag;
|
||||
const float y1r = y->real;
|
||||
const float y1i = y->imag;
|
||||
|
||||
sum1r += x1r * y1r - x1i * y1i;
|
||||
sum1i += x1r * y1i + x1i * y1r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 2;
|
||||
}
|
||||
sumr = sum0r + sum1r;
|
||||
sumi = sum0i + sum1i;
|
||||
} else {
|
||||
v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero();
|
||||
v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero();
|
||||
while (n >= 8) {
|
||||
const v4sf xv0t = v4sf_cload(x);
|
||||
const v4sf xv0b = v4sf_cload(x+2);
|
||||
const v4sf yv0t = v4sf_cload(y);
|
||||
const v4sf yv0b = v4sf_cload(y+2);
|
||||
|
||||
const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6);
|
||||
const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7);
|
||||
const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6);
|
||||
const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7);
|
||||
|
||||
sumv0r += xv0r * yv0r - xv0i * yv0i;
|
||||
sumv0i += xv0r * yv0i + xv0i * yv0r;
|
||||
|
||||
const v4sf xv1t = v4sf_cload(x+4);
|
||||
const v4sf xv1b = v4sf_cload(x+6);
|
||||
const v4sf yv1t = v4sf_cload(y+4);
|
||||
const v4sf yv1b = v4sf_cload(y+6);
|
||||
|
||||
const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6);
|
||||
const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7);
|
||||
const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6);
|
||||
const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7);
|
||||
|
||||
sumv1r += xv1r * yv1r - xv1i * yv1i;
|
||||
sumv1i += xv1r * yv1i + xv1i * yv1r;
|
||||
|
||||
x += 8;
|
||||
y += 8;
|
||||
|
||||
n -= 8;
|
||||
}
|
||||
const v4sf sumvr = sumv0r + sumv1r;
|
||||
const v4sf sumvi = sumv0i + sumv1i;
|
||||
sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]);
|
||||
sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]);
|
||||
}
|
||||
while (n--) {
|
||||
const float xr = x->real;
|
||||
const float xi = x->imag;
|
||||
const float yr = y->real;
|
||||
const float yi = y->imag;
|
||||
|
||||
sumr += xr * yr - xi * yi;
|
||||
sumi += xr * yi + xi * yr;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
} else {
|
||||
if (bli_has_nonunit_inc2(incx, incy)) {
|
||||
float sum0r = 0.0f, sum1r = 0.0f;
|
||||
float sum0i = 0.0f, sum1i = 0.0f;
|
||||
while (n >= 2) {
|
||||
const float x0r = x->real;
|
||||
const float x0i = x->imag;
|
||||
const float y0r = y->real;
|
||||
const float y0i = y->imag;
|
||||
|
||||
sum0r += x0r * y0r + x0i * y0i;
|
||||
sum0i += x0r * y0i - x0i * y0r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
const float x1r = x->real;
|
||||
const float x1i = x->imag;
|
||||
const float y1r = y->real;
|
||||
const float y1i = y->imag;
|
||||
|
||||
sum1r += x1r * y1r + x1i * y1i;
|
||||
sum1i += x1r * y1i - x1i * y1r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 2;
|
||||
}
|
||||
sumr = sum0r + sum1r;
|
||||
sumi = sum0i + sum1i;
|
||||
} else {
|
||||
v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero();
|
||||
v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero();
|
||||
while (n >= 8) {
|
||||
const v4sf xv0t = v4sf_cload(x);
|
||||
const v4sf xv0b = v4sf_cload(x+2);
|
||||
const v4sf yv0t = v4sf_cload(y);
|
||||
const v4sf yv0b = v4sf_cload(y+2);
|
||||
|
||||
const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6);
|
||||
const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7);
|
||||
const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6);
|
||||
const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7);
|
||||
|
||||
sumv0r += xv0r * yv0r + xv0i * yv0i;
|
||||
sumv0i += xv0r * yv0i - xv0i * yv0r;
|
||||
|
||||
const v4sf xv1t = v4sf_cload(x+4);
|
||||
const v4sf xv1b = v4sf_cload(x+6);
|
||||
const v4sf yv1t = v4sf_cload(y+4);
|
||||
const v4sf yv1b = v4sf_cload(y+6);
|
||||
|
||||
const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6);
|
||||
const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7);
|
||||
const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6);
|
||||
const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7);
|
||||
|
||||
sumv1r += xv1r * yv1r + xv1i * yv1i;
|
||||
sumv1i += xv1r * yv1i - xv1i * yv1r;
|
||||
|
||||
x += 8;
|
||||
y += 8;
|
||||
|
||||
n -= 8;
|
||||
}
|
||||
const v4sf sumvr = sumv0r + sumv1r;
|
||||
const v4sf sumvi = sumv0i + sumv1i;
|
||||
sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]);
|
||||
sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]);
|
||||
}
|
||||
while (n--) {
|
||||
const float xr = x->real;
|
||||
const float xi = x->imag;
|
||||
const float yr = y->real;
|
||||
const float yi = y->imag;
|
||||
|
||||
sumr += xr * yr + xi * yi;
|
||||
sumi += xr * yi - xi * yr;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (bli_is_noconj(conjx)) {
|
||||
float sum0r = 0.0f, sum1r = 0.0f;
|
||||
float sum0i = 0.0f, sum1i = 0.0f;
|
||||
while (n >= 2) {
|
||||
const float x0r = x->real;
|
||||
const float x0i = x->imag;
|
||||
const float y0r = y->real;
|
||||
const float y0i = y->imag;
|
||||
|
||||
sum0r += x0r * y0r - x0i * y0i;
|
||||
sum0i += x0r * y0i + x0i * y0r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
const float x1r = x->real;
|
||||
const float x1i = x->imag;
|
||||
const float y1r = y->real;
|
||||
const float y1i = y->imag;
|
||||
|
||||
sum1r += x1r * y1r - x1i * y1i;
|
||||
sum1i += x1r * y1i + x1i * y1r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 2;
|
||||
}
|
||||
sumr = sum0r + sum1r;
|
||||
sumi = sum0i + sum1i;
|
||||
if (n != 0) {
|
||||
const float xr = x->real;
|
||||
const float xi = x->imag;
|
||||
const float yr = y->real;
|
||||
const float yi = y->imag;
|
||||
|
||||
sumr += xr * yr - xi * yi;
|
||||
sumi += xr * yi + xi * yr;
|
||||
}
|
||||
} else {
|
||||
float sum0r = 0.0f, sum1r = 0.0f;
|
||||
float sum0i = 0.0f, sum1i = 0.0f;
|
||||
while (n >= 2) {
|
||||
const float x0r = x->real;
|
||||
const float x0i = x->imag;
|
||||
const float y0r = y->real;
|
||||
const float y0i = y->imag;
|
||||
|
||||
sum0r += x0r * y0r + x0i * y0i;
|
||||
sum0i += x0r * y0i - x0i * y0r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
const float x1r = x->real;
|
||||
const float x1i = x->imag;
|
||||
const float y1r = y->real;
|
||||
const float y1i = y->imag;
|
||||
|
||||
sum1r += x1r * y1r + x1i * y1i;
|
||||
sum1i += x1r * y1i - x1i * y1r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 2;
|
||||
}
|
||||
sumr = sum0r + sum1r;
|
||||
sumi = sum0i + sum1i;
|
||||
if (n != 0) {
|
||||
const float xr = x->real;
|
||||
const float xi = x->imag;
|
||||
const float yr = y->real;
|
||||
const float yi = y->imag;
|
||||
|
||||
sumr += xr * yr + xi * yi;
|
||||
sumi += xr * yi - xi * yr;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
rho->real = sumr;
|
||||
rho->imag = bli_is_conj(conjy) ? -sumi : sumi;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zdotv_opt(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex x[restrict static n],
|
||||
inc_t incx,
|
||||
dcomplex y[restrict static n],
|
||||
inc_t incy,
|
||||
dcomplex rho[restrict static 1])
|
||||
{
|
||||
if (bli_is_conj(conjy)) {
|
||||
bli_toggle_conj(conjx);
|
||||
}
|
||||
|
||||
if (bli_zero_dim1(n)) {
|
||||
rho->real = 0.0;
|
||||
rho->imag = 0.0;
|
||||
return;
|
||||
}
|
||||
|
||||
double sumr;
|
||||
double sumi;
|
||||
if (bli_is_noconj(conjx)) {
|
||||
double sum0r = 0.0, sum1r = 0.0;
|
||||
double sum0i = 0.0, sum1i = 0.0;
|
||||
while (n >= 2) {
|
||||
const double x0r = x->real;
|
||||
const double x0i = x->imag;
|
||||
const double y0r = y->real;
|
||||
const double y0i = y->imag;
|
||||
|
||||
sum0r += x0r * y0r - x0i * y0i;
|
||||
sum0i += x0r * y0i + x0i * y0r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
const double x1r = x->real;
|
||||
const double x1i = x->imag;
|
||||
const double y1r = y->real;
|
||||
const double y1i = y->imag;
|
||||
|
||||
sum1r += x1r * y1r - x1i * y1i;
|
||||
sum1i += x1r * y1i + x1i * y1r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 2;
|
||||
}
|
||||
sumr = sum0r + sum1r;
|
||||
sumi = sum0i + sum1i;
|
||||
if (n != 0) {
|
||||
const double xr = x->real;
|
||||
const double xi = x->imag;
|
||||
const double yr = y->real;
|
||||
const double yi = y->imag;
|
||||
|
||||
sumr += xr * yr - xi * yi;
|
||||
sumi += xr * yi + xi * yr;
|
||||
}
|
||||
} else {
|
||||
double sum0r = 0.0, sum1r = 0.0;
|
||||
double sum0i = 0.0, sum1i = 0.0;
|
||||
while (n >= 2) {
|
||||
const double x0r = x->real;
|
||||
const double x0i = x->imag;
|
||||
const double y0r = y->real;
|
||||
const double y0i = y->imag;
|
||||
|
||||
sum0r += x0r * y0r + x0i * y0i;
|
||||
sum0i += x0r * y0i - x0i * y0r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
const double x1r = x->real;
|
||||
const double x1i = x->imag;
|
||||
const double y1r = y->real;
|
||||
const double y1i = y->imag;
|
||||
|
||||
sum1r += x1r * y1r + x1i * y1i;
|
||||
sum1i += x1r * y1i - x1i * y1r;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
|
||||
n -= 2;
|
||||
}
|
||||
sumr = sum0r + sum1r;
|
||||
sumi = sum0i + sum1i;
|
||||
if (n != 0) {
|
||||
const double xr = x->real;
|
||||
const double xi = x->imag;
|
||||
const double yr = y->real;
|
||||
const double yi = y->imag;
|
||||
|
||||
sumr += xr * yr + xi * yi;
|
||||
sumi += xr * yi - xi * yr;
|
||||
}
|
||||
}
|
||||
|
||||
rho->real = sumr;
|
||||
rho->imag = bli_is_conj(conjy) ? -sumi : sumi;
|
||||
}
|
||||
|
||||
386
kernels/nacl/pnacl/3/bli_gemm_opt.c
Normal file
386
kernels/nacl/pnacl/3/bli_gemm_opt.c
Normal file
@@ -0,0 +1,386 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#if PPAPI_RELEASE >= 36
|
||||
/* Four packed single-precision floats (16 bytes), GCC/Clang vector extension. */
typedef float v4sf __attribute__ ((vector_size(16)));

/*
 * Returns a vector whose four lanes all hold x.
 *
 * Declared static: a plain C99 `inline` definition provides no external
 * symbol, so any call the compiler decides not to inline would fail to link.
 */
static inline v4sf v4sf_splat(float x) {
    return (v4sf) { x, x, x, x };
}
|
||||
|
||||
/*
 * Reads the four consecutive floats starting at a as one vector.
 * The read goes through a v4sf pointer, so a presumably has to meet the
 * alignment of v4sf -- TODO confirm against the callers.
 *
 * Declared static: a plain C99 `inline` definition provides no external
 * symbol, so any call the compiler decides not to inline would fail to link.
 */
static inline v4sf v4sf_load(const float* a) {
    return *((const v4sf*)a);
}
|
||||
|
||||
/*
 * Reads 16 bytes starting at a as one vector of four floats. Callers in this
 * file treat the lanes as two complex values laid out (real, imag, real, imag).
 * The read goes through a v4sf pointer, so a presumably has to meet the
 * alignment of v4sf -- TODO confirm against the callers.
 *
 * Declared static: a plain C99 `inline` definition provides no external
 * symbol, so any call the compiler decides not to inline would fail to link.
 */
static inline v4sf v4sf_cload(const scomplex* a) {
    return *((const v4sf*)a);
}
|
||||
|
||||
/*
 * Writes the four lanes of x to the four consecutive floats starting at a.
 * The write goes through a v4sf pointer, so a presumably has to meet the
 * alignment of v4sf -- TODO confirm against the callers.
 *
 * Declared static: a plain C99 `inline` definition provides no external
 * symbol, so any call the compiler decides not to inline would fail to link.
 */
static inline void v4sf_store(float* a, v4sf x) {
    *((v4sf*)a) = x;
}
|
||||
|
||||
/*
 * Writes the four lanes of x to the 16 bytes starting at a. Callers in this
 * file pass lanes already interleaved as (real, imag, real, imag).
 * The write goes through a v4sf pointer, so a presumably has to meet the
 * alignment of v4sf -- TODO confirm against the callers.
 *
 * Declared static: a plain C99 `inline` definition provides no external
 * symbol, so any call the compiler decides not to inline would fail to link.
 */
static inline void v4sf_cstore(scomplex* a, v4sf x) {
    *((v4sf*)a) = x;
}
|
||||
|
||||
inline v4sf v4sf_zero() {
|
||||
return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
|
||||
}
|
||||
|
||||
void bli_sgemm_opt(
|
||||
dim_t k,
|
||||
float alpha[restrict static 1],
|
||||
float a[restrict static 8*k],
|
||||
float b[restrict static k*4],
|
||||
float beta[restrict static 1],
|
||||
float c[restrict static 8*4],
|
||||
inc_t rs_c,
|
||||
inc_t cs_c,
|
||||
auxinfo_t* data)
|
||||
{
|
||||
// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
|
||||
v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero();
|
||||
v4sf abv0b = v4sf_zero(), abv1b = v4sf_zero(), abv2b = v4sf_zero(), abv3b = v4sf_zero();
|
||||
for (dim_t i = 0; i < k; i += 1) {
|
||||
const v4sf avt = v4sf_load(a);
|
||||
const v4sf avb = v4sf_load(a+4);
|
||||
|
||||
const v4sf bv_xxxx = v4sf_splat(b[0]);
|
||||
abv0t += avt * bv_xxxx;
|
||||
abv0b += avb * bv_xxxx;
|
||||
|
||||
const v4sf bv_yyyy = v4sf_splat(b[1]);
|
||||
abv1t += avt * bv_yyyy;
|
||||
abv1b += avb * bv_yyyy;
|
||||
|
||||
const v4sf bv_zzzz = v4sf_splat(b[2]);
|
||||
abv2t += avt * bv_zzzz;
|
||||
abv2b += avb * bv_zzzz;
|
||||
|
||||
const v4sf bv_wwww = v4sf_splat(b[3]);
|
||||
abv3t += avt * bv_wwww;
|
||||
abv3b += avb * bv_wwww;
|
||||
|
||||
a += 8;
|
||||
b += 4;
|
||||
}
|
||||
|
||||
const v4sf alphav = v4sf_splat(*alpha);
|
||||
abv0t *= alphav;
|
||||
abv0b *= alphav;
|
||||
abv1t *= alphav;
|
||||
abv1b *= alphav;
|
||||
abv2t *= alphav;
|
||||
abv2b *= alphav;
|
||||
abv3t *= alphav;
|
||||
abv3b *= alphav;
|
||||
|
||||
if (rs_c == 1) {
|
||||
v4sf cv0t = v4sf_load(&c[0*rs_c + 0*cs_c]);
|
||||
v4sf cv1t = v4sf_load(&c[0*rs_c + 1*cs_c]);
|
||||
v4sf cv2t = v4sf_load(&c[0*rs_c + 2*cs_c]);
|
||||
v4sf cv3t = v4sf_load(&c[0*rs_c + 3*cs_c]);
|
||||
v4sf cv0b = v4sf_load(&c[4*rs_c + 0*cs_c]);
|
||||
v4sf cv1b = v4sf_load(&c[4*rs_c + 1*cs_c]);
|
||||
v4sf cv2b = v4sf_load(&c[4*rs_c + 2*cs_c]);
|
||||
v4sf cv3b = v4sf_load(&c[4*rs_c + 3*cs_c]);
|
||||
|
||||
const v4sf betav = v4sf_splat(*beta);
|
||||
cv0t = cv0t * betav + abv0t;
|
||||
cv1t = cv1t * betav + abv1t;
|
||||
cv2t = cv2t * betav + abv2t;
|
||||
cv3t = cv3t * betav + abv3t;
|
||||
cv0b = cv0b * betav + abv0b;
|
||||
cv1b = cv1b * betav + abv1b;
|
||||
cv2b = cv2b * betav + abv2b;
|
||||
cv3b = cv3b * betav + abv3b;
|
||||
|
||||
v4sf_store(&c[0*rs_c + 0*cs_c], cv0t);
|
||||
v4sf_store(&c[0*rs_c + 1*cs_c], cv1t);
|
||||
v4sf_store(&c[0*rs_c + 2*cs_c], cv2t);
|
||||
v4sf_store(&c[0*rs_c + 3*cs_c], cv3t);
|
||||
v4sf_store(&c[4*rs_c + 0*cs_c], cv0b);
|
||||
v4sf_store(&c[4*rs_c + 1*cs_c], cv1b);
|
||||
v4sf_store(&c[4*rs_c + 2*cs_c], cv2b);
|
||||
v4sf_store(&c[4*rs_c + 3*cs_c], cv3b);
|
||||
} else {
|
||||
// Load columns 0, 1, 2, 3 (top part)
|
||||
v4sf cv0t = (v4sf){ c[0*rs_c + 0*cs_c], c[1*rs_c + 0*cs_c], c[2*rs_c + 0*cs_c], c[3*rs_c + 0*cs_c] };
|
||||
v4sf cv1t = (v4sf){ c[0*rs_c + 1*cs_c], c[1*rs_c + 1*cs_c], c[2*rs_c + 1*cs_c], c[3*rs_c + 1*cs_c] };
|
||||
v4sf cv2t = (v4sf){ c[0*rs_c + 2*cs_c], c[1*rs_c + 2*cs_c], c[2*rs_c + 2*cs_c], c[3*rs_c + 2*cs_c] };
|
||||
v4sf cv3t = (v4sf){ c[0*rs_c + 3*cs_c], c[1*rs_c + 3*cs_c], c[2*rs_c + 3*cs_c], c[3*rs_c + 3*cs_c] };
|
||||
// Load columns 0, 1, 2, 3 (bottom part)
|
||||
v4sf cv0b = (v4sf){ c[4*rs_c + 0*cs_c], c[5*rs_c + 0*cs_c], c[6*rs_c + 0*cs_c], c[7*rs_c + 0*cs_c] };
|
||||
v4sf cv1b = (v4sf){ c[4*rs_c + 1*cs_c], c[5*rs_c + 1*cs_c], c[6*rs_c + 1*cs_c], c[7*rs_c + 1*cs_c] };
|
||||
v4sf cv2b = (v4sf){ c[4*rs_c + 2*cs_c], c[5*rs_c + 2*cs_c], c[6*rs_c + 2*cs_c], c[7*rs_c + 2*cs_c] };
|
||||
v4sf cv3b = (v4sf){ c[4*rs_c + 3*cs_c], c[5*rs_c + 3*cs_c], c[6*rs_c + 3*cs_c], c[7*rs_c + 3*cs_c] };
|
||||
|
||||
const v4sf betav = v4sf_splat(*beta);
|
||||
cv0t = cv0t * betav + abv0t;
|
||||
cv1t = cv1t * betav + abv1t;
|
||||
cv2t = cv2t * betav + abv2t;
|
||||
cv3t = cv3t * betav + abv3t;
|
||||
cv0b = cv0b * betav + abv0b;
|
||||
cv1b = cv1b * betav + abv1b;
|
||||
cv2b = cv2b * betav + abv2b;
|
||||
cv3b = cv3b * betav + abv3b;
|
||||
|
||||
// Store column 0
|
||||
c[0*rs_c + 0*cs_c] = cv0t[0];
|
||||
c[1*rs_c + 0*cs_c] = cv0t[1];
|
||||
c[2*rs_c + 0*cs_c] = cv0t[2];
|
||||
c[3*rs_c + 0*cs_c] = cv0t[3];
|
||||
c[4*rs_c + 0*cs_c] = cv0b[0];
|
||||
c[5*rs_c + 0*cs_c] = cv0b[1];
|
||||
c[6*rs_c + 0*cs_c] = cv0b[2];
|
||||
c[7*rs_c + 0*cs_c] = cv0b[3];
|
||||
|
||||
// Store column 1
|
||||
c[0*rs_c + 1*cs_c] = cv1t[0];
|
||||
c[1*rs_c + 1*cs_c] = cv1t[1];
|
||||
c[2*rs_c + 1*cs_c] = cv1t[2];
|
||||
c[3*rs_c + 1*cs_c] = cv1t[3];
|
||||
c[4*rs_c + 1*cs_c] = cv1b[0];
|
||||
c[5*rs_c + 1*cs_c] = cv1b[1];
|
||||
c[6*rs_c + 1*cs_c] = cv1b[2];
|
||||
c[7*rs_c + 1*cs_c] = cv1b[3];
|
||||
|
||||
// Store column 2
|
||||
c[0*rs_c + 2*cs_c] = cv2t[0];
|
||||
c[1*rs_c + 2*cs_c] = cv2t[1];
|
||||
c[2*rs_c + 2*cs_c] = cv2t[2];
|
||||
c[3*rs_c + 2*cs_c] = cv2t[3];
|
||||
c[4*rs_c + 2*cs_c] = cv2b[0];
|
||||
c[5*rs_c + 2*cs_c] = cv2b[1];
|
||||
c[6*rs_c + 2*cs_c] = cv2b[2];
|
||||
c[7*rs_c + 2*cs_c] = cv2b[3];
|
||||
|
||||
// Store column 3
|
||||
c[0*rs_c + 3*cs_c] = cv3t[0];
|
||||
c[1*rs_c + 3*cs_c] = cv3t[1];
|
||||
c[2*rs_c + 3*cs_c] = cv3t[2];
|
||||
c[3*rs_c + 3*cs_c] = cv3t[3];
|
||||
c[4*rs_c + 3*cs_c] = cv3b[0];
|
||||
c[5*rs_c + 3*cs_c] = cv3b[1];
|
||||
c[6*rs_c + 3*cs_c] = cv3b[2];
|
||||
c[7*rs_c + 3*cs_c] = cv3b[3];
|
||||
}
|
||||
}
|
||||
|
||||
void bli_cgemm_opt(
|
||||
dim_t k,
|
||||
scomplex alpha[restrict static 1],
|
||||
scomplex a[restrict static 4*k],
|
||||
scomplex b[restrict static k*4],
|
||||
scomplex beta[restrict static 1],
|
||||
scomplex c[restrict static 4*4],
|
||||
inc_t rs_c,
|
||||
inc_t cs_c,
|
||||
auxinfo_t* data)
|
||||
{
|
||||
// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
|
||||
v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero();
|
||||
v4sf abv0i = v4sf_zero(), abv1i = v4sf_zero(), abv2i = v4sf_zero(), abv3i = v4sf_zero();
|
||||
for (dim_t i = 0; i < k; i += 1) {
|
||||
const v4sf avt = v4sf_cload(a);
|
||||
const v4sf avb = v4sf_cload(a+2);
|
||||
const v4sf avr = __builtin_shufflevector(avt, avb, 0, 2, 4, 6);
|
||||
const v4sf avi = __builtin_shufflevector(avt, avb, 1, 3, 5, 7);
|
||||
|
||||
const v4sf bv0r = v4sf_splat(b[0].real);
|
||||
const v4sf bv0i = v4sf_splat(b[0].imag);
|
||||
abv0r += avr * bv0r - avi * bv0i;
|
||||
abv0i += avr * bv0i + avi * bv0r;
|
||||
|
||||
const v4sf bv1r = v4sf_splat(b[1].real);
|
||||
const v4sf bv1i = v4sf_splat(b[1].imag);
|
||||
abv1r += avr * bv1r - avi * bv1i;
|
||||
abv1i += avr * bv1i + avi * bv1r;
|
||||
|
||||
const v4sf bv2r = v4sf_splat(b[2].real);
|
||||
const v4sf bv2i = v4sf_splat(b[2].imag);
|
||||
abv2r += avr * bv2r - avi * bv2i;
|
||||
abv2i += avr * bv2i + avi * bv2r;
|
||||
|
||||
const v4sf bv3r = v4sf_splat(b[3].real);
|
||||
const v4sf bv3i = v4sf_splat(b[3].imag);
|
||||
abv3r += avr * bv3r - avi * bv3i;
|
||||
abv3i += avr * bv3i + avi * bv3r;
|
||||
|
||||
a += 4;
|
||||
b += 4;
|
||||
}
|
||||
|
||||
const v4sf alphavr = v4sf_splat(alpha->real);
|
||||
const v4sf alphavi = v4sf_splat(alpha->imag);
|
||||
v4sf temp;
|
||||
|
||||
temp = abv0r * alphavr - abv0i * alphavi;
|
||||
abv0i = abv0r * alphavi + abv0i * alphavr;
|
||||
abv0r = temp;
|
||||
|
||||
temp = abv1r * alphavr - abv1i * alphavi;
|
||||
abv1i = abv1r * alphavi + abv1i * alphavr;
|
||||
abv1r = temp;
|
||||
|
||||
temp = abv2r * alphavr - abv2i * alphavi;
|
||||
abv2i = abv2r * alphavi + abv2i * alphavr;
|
||||
abv2r = temp;
|
||||
|
||||
temp = abv3r * alphavr - abv3i * alphavi;
|
||||
abv3i = abv3r * alphavi + abv3i * alphavr;
|
||||
abv3r = temp;
|
||||
|
||||
if (rs_c == 1) {
|
||||
const v4sf cv0t = v4sf_cload(&c[0*rs_c + 0*cs_c]);
|
||||
const v4sf cv1t = v4sf_cload(&c[0*rs_c + 1*cs_c]);
|
||||
const v4sf cv2t = v4sf_cload(&c[0*rs_c + 2*cs_c]);
|
||||
const v4sf cv3t = v4sf_cload(&c[0*rs_c + 3*cs_c]);
|
||||
const v4sf cv0b = v4sf_cload(&c[2*rs_c + 0*cs_c]);
|
||||
const v4sf cv1b = v4sf_cload(&c[2*rs_c + 1*cs_c]);
|
||||
const v4sf cv2b = v4sf_cload(&c[2*rs_c + 2*cs_c]);
|
||||
const v4sf cv3b = v4sf_cload(&c[2*rs_c + 3*cs_c]);
|
||||
|
||||
v4sf cv0r = __builtin_shufflevector(cv0t, cv0b, 0, 2, 4, 6);
|
||||
v4sf cv0i = __builtin_shufflevector(cv0t, cv0b, 1, 3, 5, 7);
|
||||
v4sf cv1r = __builtin_shufflevector(cv1t, cv1b, 0, 2, 4, 6);
|
||||
v4sf cv1i = __builtin_shufflevector(cv1t, cv1b, 1, 3, 5, 7);
|
||||
v4sf cv2r = __builtin_shufflevector(cv2t, cv2b, 0, 2, 4, 6);
|
||||
v4sf cv2i = __builtin_shufflevector(cv2t, cv2b, 1, 3, 5, 7);
|
||||
v4sf cv3r = __builtin_shufflevector(cv3t, cv3b, 0, 2, 4, 6);
|
||||
v4sf cv3i = __builtin_shufflevector(cv3t, cv3b, 1, 3, 5, 7);
|
||||
|
||||
const v4sf betavr = v4sf_splat(beta->real);
|
||||
const v4sf betavi = v4sf_splat(beta->imag);
|
||||
|
||||
temp = abv0r + cv0r * betavr - cv0i * betavi;
|
||||
cv0i = abv0i + cv0r * betavi + cv0i * betavr;
|
||||
cv0r = temp;
|
||||
|
||||
temp = abv1r + cv1r * betavr - cv1i * betavi;
|
||||
cv1i = abv1i + cv1r * betavi + cv1i * betavr;
|
||||
cv1r = temp;
|
||||
|
||||
temp = abv2r + cv2r * betavr - cv2i * betavi;
|
||||
cv2i = abv2i + cv2r * betavi + cv2i * betavr;
|
||||
cv2r = temp;
|
||||
|
||||
temp = abv3r + cv3r * betavr - cv3i * betavi;
|
||||
cv3i = abv3i + cv3r * betavi + cv3i * betavr;
|
||||
cv3r = temp;
|
||||
|
||||
v4sf_cstore(&c[0*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 0, 4, 1, 5));
|
||||
v4sf_cstore(&c[2*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 2, 6, 3, 7));
|
||||
v4sf_cstore(&c[0*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 0, 4, 1, 5));
|
||||
v4sf_cstore(&c[2*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 2, 6, 3, 7));
|
||||
v4sf_cstore(&c[0*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 0, 4, 1, 5));
|
||||
v4sf_cstore(&c[2*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 2, 6, 3, 7));
|
||||
v4sf_cstore(&c[0*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 0, 4, 1, 5));
|
||||
v4sf_cstore(&c[2*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 2, 6, 3, 7));
|
||||
} else {
|
||||
// Load columns 0, 1, 2, 3 (real part)
|
||||
v4sf cv0r = (v4sf){ c[0*rs_c + 0*cs_c].real, c[1*rs_c + 0*cs_c].real, c[2*rs_c + 0*cs_c].real, c[3*rs_c + 0*cs_c].real };
|
||||
v4sf cv1r = (v4sf){ c[0*rs_c + 1*cs_c].real, c[1*rs_c + 1*cs_c].real, c[2*rs_c + 1*cs_c].real, c[3*rs_c + 1*cs_c].real };
|
||||
v4sf cv2r = (v4sf){ c[0*rs_c + 2*cs_c].real, c[1*rs_c + 2*cs_c].real, c[2*rs_c + 2*cs_c].real, c[3*rs_c + 2*cs_c].real };
|
||||
v4sf cv3r = (v4sf){ c[0*rs_c + 3*cs_c].real, c[1*rs_c + 3*cs_c].real, c[2*rs_c + 3*cs_c].real, c[3*rs_c + 3*cs_c].real };
|
||||
// Load columns 0, 1, 2, 3 (imaginary part)
|
||||
v4sf cv0i = (v4sf){ c[0*rs_c + 0*cs_c].imag, c[1*rs_c + 0*cs_c].imag, c[2*rs_c + 0*cs_c].imag, c[3*rs_c + 0*cs_c].imag };
|
||||
v4sf cv1i = (v4sf){ c[0*rs_c + 1*cs_c].imag, c[1*rs_c + 1*cs_c].imag, c[2*rs_c + 1*cs_c].imag, c[3*rs_c + 1*cs_c].imag };
|
||||
v4sf cv2i = (v4sf){ c[0*rs_c + 2*cs_c].imag, c[1*rs_c + 2*cs_c].imag, c[2*rs_c + 2*cs_c].imag, c[3*rs_c + 2*cs_c].imag };
|
||||
v4sf cv3i = (v4sf){ c[0*rs_c + 3*cs_c].imag, c[1*rs_c + 3*cs_c].imag, c[2*rs_c + 3*cs_c].imag, c[3*rs_c + 3*cs_c].imag };
|
||||
|
||||
const v4sf betavr = v4sf_splat(beta->real);
|
||||
const v4sf betavi = v4sf_splat(beta->imag);
|
||||
|
||||
temp = abv0r + cv0r * betavr - cv0i * betavi;
|
||||
cv0i = abv0i + cv0r * betavi + cv0i * betavr;
|
||||
cv0r = temp;
|
||||
|
||||
temp = abv1r + cv1r * betavr - cv1i * betavi;
|
||||
cv1i = abv1i + cv1r * betavi + cv1i * betavr;
|
||||
cv1r = temp;
|
||||
|
||||
temp = abv2r + cv2r * betavr - cv2i * betavi;
|
||||
cv2i = abv2i + cv2r * betavi + cv2i * betavr;
|
||||
cv2r = temp;
|
||||
|
||||
temp = abv3r + cv3r * betavr - cv3i * betavi;
|
||||
cv3i = abv3i + cv3r * betavi + cv3i * betavr;
|
||||
cv3r = temp;
|
||||
|
||||
// Store column 0
|
||||
c[0*rs_c + 0*cs_c].real = cv0r[0];
|
||||
c[0*rs_c + 0*cs_c].imag = cv0i[0];
|
||||
c[1*rs_c + 0*cs_c].real = cv0r[1];
|
||||
c[1*rs_c + 0*cs_c].imag = cv0i[1];
|
||||
c[2*rs_c + 0*cs_c].real = cv0r[2];
|
||||
c[2*rs_c + 0*cs_c].imag = cv0i[2];
|
||||
c[3*rs_c + 0*cs_c].real = cv0r[3];
|
||||
c[3*rs_c + 0*cs_c].imag = cv0i[3];
|
||||
|
||||
// Store column 1
|
||||
c[0*rs_c + 1*cs_c].real = cv1r[0];
|
||||
c[0*rs_c + 1*cs_c].imag = cv1i[0];
|
||||
c[1*rs_c + 1*cs_c].real = cv1r[1];
|
||||
c[1*rs_c + 1*cs_c].imag = cv1i[1];
|
||||
c[2*rs_c + 1*cs_c].real = cv1r[2];
|
||||
c[2*rs_c + 1*cs_c].imag = cv1i[2];
|
||||
c[3*rs_c + 1*cs_c].real = cv1r[3];
|
||||
c[3*rs_c + 1*cs_c].imag = cv1i[3];
|
||||
|
||||
// Store column 2
|
||||
c[0*rs_c + 2*cs_c].real = cv2r[0];
|
||||
c[0*rs_c + 2*cs_c].imag = cv2i[0];
|
||||
c[1*rs_c + 2*cs_c].real = cv2r[1];
|
||||
c[1*rs_c + 2*cs_c].imag = cv2i[1];
|
||||
c[2*rs_c + 2*cs_c].real = cv2r[2];
|
||||
c[2*rs_c + 2*cs_c].imag = cv2i[2];
|
||||
c[3*rs_c + 2*cs_c].real = cv2r[3];
|
||||
c[3*rs_c + 2*cs_c].imag = cv2i[3];
|
||||
|
||||
// Store column 3
|
||||
c[0*rs_c + 3*cs_c].real = cv3r[0];
|
||||
c[0*rs_c + 3*cs_c].imag = cv3i[0];
|
||||
c[1*rs_c + 3*cs_c].real = cv3r[1];
|
||||
c[1*rs_c + 3*cs_c].imag = cv3i[1];
|
||||
c[2*rs_c + 3*cs_c].real = cv3r[2];
|
||||
c[2*rs_c + 3*cs_c].imag = cv3i[2];
|
||||
c[3*rs_c + 3*cs_c].real = cv3r[3];
|
||||
c[3*rs_c + 3*cs_c].imag = cv3i[3];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -45,7 +45,8 @@
|
||||
#
|
||||
|
||||
.PHONY: all bin clean \
|
||||
check-env check-env-mk check-env-fragments check-env-make-defs
|
||||
check-env check-env-mk check-env-fragments check-env-make-defs \
|
||||
run run-amd64 run-x86 run-arm
|
||||
|
||||
|
||||
|
||||
@@ -241,8 +242,21 @@ TEST_OBJS := $(patsubst $(TEST_SRC_PATH)/%.c, \
|
||||
$(TEST_OBJ_PATH)/%.o, \
|
||||
$(wildcard $(TEST_SRC_PATH)/*.c))
|
||||
|
||||
ifeq ($(CONFIG_NAME),pnacl)
|
||||
# Linked executable
|
||||
TEST_BIN := test_libblis.unstable.pexe
|
||||
# Finalized executable
|
||||
TEST_BIN_PNACL := test_libblis.pexe
|
||||
# Translated executable for x86-64
|
||||
TEST_BIN_AMD64 := test_libblis.x86-64.nexe
|
||||
# Translated executable for x86
|
||||
TEST_BIN_X86 := test_libblis.x86.nexe
|
||||
# Translated executable for ARM
|
||||
TEST_BIN_ARM := test_libblis.arm.nexe
|
||||
else
|
||||
# Binary executable name.
|
||||
TEST_BIN := test_libblis.x
|
||||
endif
|
||||
|
||||
# Add installed and local header paths to CFLAGS
|
||||
CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)
|
||||
@@ -257,7 +271,11 @@ CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)
|
||||
|
||||
all: check-env bin
|
||||
|
||||
ifeq ($(CONFIG_NAME),pnacl)
|
||||
bin: check-env $(TEST_BIN) $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM)
|
||||
else
|
||||
bin: check-env $(TEST_BIN)
|
||||
endif
|
||||
|
||||
|
||||
# --- Environment check rules ---
|
||||
@@ -301,9 +319,68 @@ else
|
||||
@$(LINKER) $(TEST_OBJS) $(BLIS_LIB) $(LDFLAGS) -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(CONFIG_NAME),pnacl)
|
||||
|
||||
# Finalize PNaCl executable (i.e. convert from LLVM bitcode to PNaCl bitcode)
|
||||
$(TEST_BIN_PNACL): $(TEST_BIN)
|
||||
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
$(FINALIZER) $(FINFLAGS) -o $@ $(TEST_BIN)
|
||||
else
|
||||
@echo "Finalizing $@"
|
||||
@$(FINALIZER) $(FINFLAGS) -o $@ $(TEST_BIN)
|
||||
endif
|
||||
|
||||
# Translate PNaCl executable to x86-64 NaCl executable
|
||||
$(TEST_BIN_AMD64): $(TEST_BIN_PNACL)
|
||||
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
$(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@
|
||||
else
|
||||
@echo "Translating $< -> $@"
|
||||
@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@
|
||||
endif
|
||||
|
||||
|
||||
# Translate PNaCl executable to x86 NaCl executable
|
||||
$(TEST_BIN_X86): $(TEST_BIN_PNACL)
|
||||
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
$(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@
|
||||
else
|
||||
@echo "Translating $< -> $@"
|
||||
@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@
|
||||
endif
|
||||
|
||||
# Translate PNaCl executable to ARMv7 NaCl executable
|
||||
$(TEST_BIN_ARM): $(TEST_BIN_PNACL)
|
||||
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
$(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@
|
||||
else
|
||||
@echo "Translating $< -> $@"
|
||||
@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
# -- Test run rules --
|
||||
|
||||
ifeq ($(CONFIG_NAME),pnacl)
|
||||
run-amd64: $(TEST_BIN_AMD64)
|
||||
$(NACL_SDK_ROOT)/tools/sel_ldr_x86_64 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_64.nexe -- $(TEST_BIN_AMD64)
|
||||
run-x86: $(TEST_BIN_X86)
|
||||
$(NACL_SDK_ROOT)/tools/sel_ldr_x86_32 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_32.nexe -- $(TEST_BIN_X86)
|
||||
run-arm: $(TEST_BIN_ARM)
|
||||
$(NACL_SDK_ROOT)/tools/sel_ldr_arm -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_arm.nexe -- $(TEST_BIN_ARM)
|
||||
else
|
||||
run: $(TEST_BIN)
|
||||
./$(TEST_BIN)
|
||||
endif
|
||||
|
||||
# -- Clean rules --
|
||||
|
||||
ifeq ($(CONFIG_NAME),pnacl)
|
||||
clean:
|
||||
- $(RM_F) $(TEST_OBJS) $(TEST_BIN) $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM)
|
||||
else
|
||||
clean:
|
||||
- $(RM_F) $(TEST_OBJS) $(TEST_BIN)
|
||||
endif
|
||||
|
||||
|
||||
@@ -270,7 +270,7 @@ void libblis_test_gemm_ukr_impl( iface_t iface,
|
||||
switch ( iface )
|
||||
{
|
||||
case BLIS_TEST_SEQ_UKERNEL:
|
||||
bli_gemm_ukr( alpha, a, b, beta, c );
|
||||
bli_gemm_ukernel( alpha, a, b, beta, c );
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -354,99 +354,3 @@ void libblis_test_gemm_ukr_check( obj_t* alpha,
|
||||
bli_obj_free( &z );
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define object-wrapper to GEMM_UKERNEL micro-kernels.
|
||||
//
|
||||
|
||||
#define FUNCPTR_T gemm_ukr_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a,
|
||||
void* b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,gemm_ukr);
|
||||
|
||||
|
||||
void bli_gemm_ukr( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c )
|
||||
{
|
||||
num_t dt = bli_obj_datatype( *c );
|
||||
|
||||
dim_t k = bli_obj_width( *a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( *a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( *b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
inc_t cs_c = bli_obj_col_stride( *c );
|
||||
|
||||
void* buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha );
|
||||
|
||||
void* buf_beta = bli_obj_buffer_for_1x1( dt, *beta );
|
||||
|
||||
inc_t ps_a = bli_obj_panel_stride( *a );
|
||||
inc_t ps_b = bli_obj_panel_stride( *b );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
auxinfo_t data;
|
||||
|
||||
|
||||
// Fill the auxinfo_t struct in case the micro-kernel uses it.
|
||||
bli_auxinfo_set_next_a( buf_a, data );
|
||||
bli_auxinfo_set_next_b( buf_b, data );
|
||||
bli_auxinfo_set_ps_a( ps_a, data );
|
||||
bli_auxinfo_set_ps_b( ps_b, data );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt];
|
||||
|
||||
// Invoke the function.
|
||||
f( k,
|
||||
buf_alpha,
|
||||
buf_a,
|
||||
buf_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
&data );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, ukrname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, \
|
||||
void* b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha, \
|
||||
a, \
|
||||
b, \
|
||||
beta, \
|
||||
c, rs_c, cs_c, \
|
||||
data ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( gemm_ukr, GEMM_UKERNEL )
|
||||
|
||||
|
||||
@@ -34,28 +34,3 @@
|
||||
|
||||
void libblis_test_gemm_ukr( test_params_t* params, test_op_t* op );
|
||||
|
||||
|
||||
//
|
||||
// Prototype wrapper interfaces to micro-kernel.
|
||||
//
|
||||
void bli_gemm_ukr( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c );
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, \
|
||||
void* b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_ukr )
|
||||
|
||||
|
||||
@@ -314,7 +314,7 @@ void libblis_test_gemmtrsm_ukr_impl( iface_t iface,
|
||||
switch ( iface )
|
||||
{
|
||||
case BLIS_TEST_SEQ_UKERNEL:
|
||||
bli_gemmtrsm_ukr( alpha, a1x, a11, bx1, b11, c11 );
|
||||
bli_gemmtrsm_ukernel( alpha, a1x, a11, bx1, b11, c11 );
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -474,115 +474,3 @@ void bli_gemmtrsm_ukr_make_subparts( dim_t k,
|
||||
bli_obj_set_diag_offset( 0, *a11 );
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define object-wrapper to GEMMTRSM_L_UKERNEL, GEMMTRSM_U_UKERNEL
|
||||
// micro-kernels.
|
||||
//
|
||||
|
||||
#undef FUNCPTR_T
|
||||
#define FUNCPTR_T gemmtrsm_ukr_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a1x,
|
||||
void* a11,
|
||||
void* bx1,
|
||||
void* b11,
|
||||
void* c11, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes_l,gemmtrsm_l_ukr);
|
||||
static FUNCPTR_T GENARRAY(ftypes_u,gemmtrsm_u_ukr);
|
||||
|
||||
|
||||
void bli_gemmtrsm_ukr( obj_t* alpha,
|
||||
obj_t* a1x,
|
||||
obj_t* a11,
|
||||
obj_t* bx1,
|
||||
obj_t* b11,
|
||||
obj_t* c11 )
|
||||
{
|
||||
dim_t k = bli_obj_width( *a1x );
|
||||
|
||||
num_t dt = bli_obj_datatype( *c11 );
|
||||
|
||||
void* buf_a1x = bli_obj_buffer_at_off( *a1x );
|
||||
|
||||
void* buf_a11 = bli_obj_buffer_at_off( *a11 );
|
||||
|
||||
void* buf_bx1 = bli_obj_buffer_at_off( *bx1 );
|
||||
|
||||
void* buf_b11 = bli_obj_buffer_at_off( *b11 );
|
||||
|
||||
void* buf_c11 = bli_obj_buffer_at_off( *c11 );
|
||||
inc_t rs_c = bli_obj_row_stride( *c11 );
|
||||
inc_t cs_c = bli_obj_col_stride( *c11 );
|
||||
|
||||
void* buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha );
|
||||
|
||||
inc_t ps_a = bli_obj_panel_stride( *a1x );
|
||||
inc_t ps_b = bli_obj_panel_stride( *bx1 );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
auxinfo_t data;
|
||||
|
||||
|
||||
// Fill the auxinfo_t struct in case the micro-kernel uses it.
|
||||
if ( bli_obj_is_lower( *a11 ) )
|
||||
{ bli_auxinfo_set_next_a( buf_a1x, data ); }
|
||||
else
|
||||
{ bli_auxinfo_set_next_a( buf_a11, data ); }
|
||||
bli_auxinfo_set_next_b( buf_bx1, data );
|
||||
|
||||
bli_auxinfo_set_ps_a( ps_a, data );
|
||||
bli_auxinfo_set_ps_b( ps_b, data );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
if ( bli_obj_is_lower( *a11 ) ) f = ftypes_l[dt];
|
||||
else f = ftypes_u[dt];
|
||||
|
||||
// Invoke the function.
|
||||
f( k,
|
||||
buf_alpha,
|
||||
buf_a1x,
|
||||
buf_a11,
|
||||
buf_bx1,
|
||||
buf_b11,
|
||||
buf_c11, rs_c, cs_c,
|
||||
&data );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, ukrname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a1x, \
|
||||
void* a11, \
|
||||
void* bx1, \
|
||||
void* b11, \
|
||||
void* c11, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha, \
|
||||
a1x, \
|
||||
a11, \
|
||||
bx1, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c, \
|
||||
data ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( gemmtrsm_l_ukr, GEMMTRSM_L_UKERNEL )
|
||||
INSERT_GENTFUNC_BASIC( gemmtrsm_u_ukr, GEMMTRSM_U_UKERNEL )
|
||||
|
||||
|
||||
@@ -34,30 +34,3 @@
|
||||
|
||||
void libblis_test_gemmtrsm_ukr( test_params_t* params, test_op_t* op );
|
||||
|
||||
//
|
||||
// Prototype wrapper interfaces to micro-kernel.
|
||||
//
|
||||
void bli_gemmtrsm_ukr( obj_t* alpha,
|
||||
obj_t* a1x,
|
||||
obj_t* a11,
|
||||
obj_t* bx1,
|
||||
obj_t* b11,
|
||||
obj_t* c11 );
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a1x, \
|
||||
void* a11, \
|
||||
void* bx1, \
|
||||
void* b11, \
|
||||
void* c11, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_l_ukr )
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_u_ukr )
|
||||
|
||||
|
||||
@@ -267,7 +267,7 @@ void libblis_test_trsm_ukr_impl( iface_t iface,
|
||||
switch ( iface )
|
||||
{
|
||||
case BLIS_TEST_SEQ_UKERNEL:
|
||||
bli_trsm_ukr( a, b, c );
|
||||
bli_trsm_ukernel( a, b, c );
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -367,84 +367,3 @@ void libblis_test_trsm_ukr_check( side_t side,
|
||||
bli_obj_free( &z );
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define object-wrapper to TRSM_L_UKERNEL, TRSM_U_UKERNEL micro-kernels.
|
||||
//
|
||||
|
||||
#undef FUNCPTR_T
|
||||
#define FUNCPTR_T trsm_ukr_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
void* a,
|
||||
void* b,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes_l,trsm_l_ukr);
|
||||
static FUNCPTR_T GENARRAY(ftypes_u,trsm_u_ukr);
|
||||
|
||||
|
||||
//
// Object-based front end to the trsm micro-kernel wrappers.
//
// Extracts the buffers of a, b and c, the row/column strides of c and the
// panel strides of a and b, records the buffer addresses and panel strides in
// an auxinfo_t, and then calls the typed wrapper selected by the datatype of
// c and by whether a is lower-stored.
//
void bli_trsm_ukr( obj_t* a,
                   obj_t* b,
                   obj_t* c )
{
	// The datatype of c selects the entry within the dispatch tables.
	num_t      dt_c  = bli_obj_datatype( *c );

	// Operand a: buffer address and panel stride.
	void*      a_ptr = bli_obj_buffer_at_off( *a );
	inc_t      a_ps  = bli_obj_panel_stride( *a );

	// Operand b: buffer address and panel stride.
	void*      b_ptr = bli_obj_buffer_at_off( *b );
	inc_t      b_ps  = bli_obj_panel_stride( *b );

	// Operand c: buffer address plus row and column strides.
	void*      c_ptr = bli_obj_buffer_at_off( *c );
	inc_t      c_rs  = bli_obj_row_stride( *c );
	inc_t      c_cs  = bli_obj_col_stride( *c );

	FUNCPTR_T  ukr;
	auxinfo_t  aux;

	// Populate the auxinfo_t struct in case the micro-kernel consults it.
	// The "next" addresses are simply the current a and b buffers.
	bli_auxinfo_set_next_a( a_ptr, aux );
	bli_auxinfo_set_next_b( b_ptr, aux );
	bli_auxinfo_set_ps_a( a_ps, aux );
	bli_auxinfo_set_ps_b( b_ps, aux );

	// Choose the lower or upper table according to a, then index by datatype.
	ukr = ( bli_obj_is_lower( *a ) ? ftypes_l[dt_c] : ftypes_u[dt_c] );

	// Call the selected typed wrapper.
	ukr( a_ptr,
	     b_ptr,
	     c_ptr, c_rs, c_cs,
	     &aux );
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, ukrname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
void* a, \
|
||||
void* b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC(ch,ukrname)( a, \
|
||||
b, \
|
||||
c, rs_c, cs_c, \
|
||||
data ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trsm_l_ukr, TRSM_L_UKERNEL )
|
||||
INSERT_GENTFUNC_BASIC( trsm_u_ukr, TRSM_U_UKERNEL )
|
||||
|
||||
|
||||
@@ -34,23 +34,3 @@
|
||||
|
||||
void libblis_test_trsm_ukr( test_params_t* params, test_op_t* op );
|
||||
|
||||
//
|
||||
// Prototype wrapper interfaces to micro-kernel.
|
||||
//
|
||||
void bli_trsm_ukr( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c );
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
void* a, \
|
||||
void* b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_l_ukr )
|
||||
INSERT_GENTPROT_BASIC( trsm_u_ukr )
|
||||
|
||||
|
||||
Reference in New Issue
Block a user