Merge branch 'amd-staging-rome2.1' of ssh://git.amd.com:29418/cpulibraries/er/blis into amd-blis-cpp

Change-Id: I97a10ab7546d475474b0ff733bafb8248843c352
This commit is contained in:
prangana
2019-11-21 00:54:16 +05:30
422 changed files with 70330 additions and 3604 deletions

View File

@@ -39,7 +39,7 @@ build_script:
- bash -lc "cd /c/projects/blis && ./configure %CONFIGURE_OPTS% --enable-threading=%THREADING% --enable-arg-max-hack --prefix=/c/blis %CONFIG%"
- bash -lc "cd /c/projects/blis && mingw32-make -j4 V=1"
- bash -lc "cd /c/projects/blis && mingw32-make install"
- ps: Compress-Archive -Path C:\blis -DestinationPath C:\blis.zip
- 7z a C:\blis.zip C:\blis
- ps: Push-AppveyorArtifact C:\blis.zip
test_script:

1789
CHANGELOG

File diff suppressed because it is too large Load Diff

11
CREDITS
View File

@@ -9,18 +9,22 @@ The BLIS framework was primarily authored by
but many others have contributed code and feedback, including
Sameer Agarwal @sandwichmaker (Google)
Murtaza Ali (Texas Instruments)
Sajid Ali @s-sajid-ali (Northwestern University)
Erling Andersen @erling-d-andersen
Alex Arslan @ararslan
Vernon Austel (IBM, T.J. Watson Research Center)
Matthew Brett @matthew-brett (University of Birmingham)
Jed Brown @jedbrown (Argonne National Laboratory)
Robin Christ @robinchrist
Kay Dewhurst @jkd2016 (Max Planck Institute, Halle, Germany)
Jeff Diamond (Oracle)
Johannes Dieterich @iotamudelta
Krzysztof Drewniak @krzysz00
Marat Dukhan @Maratyszcza (Google)
Victor Eijkhout @VictorEijkhout (Texas Advanced Computing Center)
Evgeny Epifanovsky @epifanovsky (Q-Chem)
Isuru Fernando @isuruf
Roman Gareev @gareevroman
Richard Goldschmidt @SuperFluffy
@@ -30,7 +34,7 @@ but many others have contributed code and feedback, including
Jeff Hammond @jeffhammond (Intel)
Jacob Gorm Hansen @jacobgorm
Jean-Michel Hautbois @jhautbois
Ian Henriksen @insertinterestingnamehere
Ian Henriksen @insertinterestingnamehere (The University of Texas at Austin)
Minh Quan Ho @hominhquan
Matthew Honnibal @honnibal
Stefan Husmann @stefanhusmann
@@ -53,6 +57,7 @@ but many others have contributed code and feedback, including
Ilya Polkovnichenko
Jack Poulson @poulson (Stanford)
Mathieu Poumeyrol @kali
Christos Psarras @ChrisPsa (RWTH-Aachen)
@qnerd
Michael Rader @mrader1248
Pradeep Rao @pradeeptrgit (AMD)
@@ -63,11 +68,13 @@ but many others have contributed code and feedback, including
Rene Sitt
Tony Skjellum @tonyskjellum (The University of Tennessee at Chattanooga)
Mikhail Smelyanskiy (Intel, Parallel Computing Lab)
Nathaniel Smith @njsmith
Shaden Smith @ShadenSmith
Tyler Smith @tlrmchlsmth (The University of Texas at Austin)
Paul Springer @springer13 (RWTH-Aachen)
Vladimir Sukarev
Santanu Thangaraj (AMD)
Nicholai Tukanov @nicholaiTukanov (The University of Texas at Austin)
Rhys Ulerich @RhysU (The University of Texas at Austin)
Robert van de Geijn @rvdg (The University of Texas at Austin)
Kiran Varaganti @kvaragan (AMD)
@@ -83,8 +90,10 @@ partners, including
AMD
Hewlett Packard Enterprise
Huawei
Intel
Microsoft
Oracle
Texas Instruments
as well as the National Science Foundation (NSF Awards CCF-0917167,

110
Makefile
View File

@@ -386,23 +386,22 @@ ifeq ($(IS_CONFIGURED),yes)
# named with three .so version numbers.
UNINSTALL_OLD_LIBS :=
UNINSTALL_OLD_LIBS += $(shell $(FIND) $(INSTALL_LIBDIR)/ -name "$(LIBBLIS_SO).?.?.?" 2> /dev/null | $(GREP) -v "$(LIBBLIS).$(LIBBLIS_SO_MMB_EXT)")
UNINSTALL_OLD_LIBS += $(filter-out $(INSTALL_LIBDIR)/$(LIBBLIS).$(LIBBLIS_SO_MMB_EXT),$(wildcard $(INSTALL_LIBDIR)/$(LIBBLIS_SO).?.?.?))
# These shell commands gather the filepaths to any library symlink in the
# current LIBDIR that might be left over from an old installation. We start
# with symlinks named using the .so major version number.
UNINSTALL_OLD_SYML := $(shell $(FIND) $(INSTALL_LIBDIR)/ -name "$(LIBBLIS_SO).?" 2> /dev/null | $(GREP) -v "$(LIBBLIS_SO).$(SO_MAJOR)")
UNINSTALL_OLD_SYML := $(filter-out $(INSTALL_LIBDIR)/$(LIBBLIS_SO).$(SO_MAJOR),$(wildcard $(INSTALL_LIBDIR)/$(LIBBLIS_SO).?))
# We also prepare to uninstall older-style symlinks whose names contain the
# BLIS version number and configuration family.
UNINSTALL_OLD_SYML += $(shell $(FIND) $(INSTALL_LIBDIR)/ -name "$(LIBBLIS)-*.a" 2> /dev/null | $(GREP) -v "$(LIBBLIS)-$(VERS_CONF).a")
UNINSTALL_OLD_SYML += $(shell $(FIND) $(INSTALL_LIBDIR)/ -name "$(LIBBLIS)-*.$(SHLIB_EXT)" 2> /dev/null | $(GREP) -v "$(LIBBLIS)-$(VERS_CONF).$(SHLIB_EXT)")
UNINSTALL_OLD_SYML += $(wildcard $(INSTALL_LIBDIR)/$(LIBBLIS)-*.a)
UNINSTALL_OLD_SYML += $(wildcard $(INSTALL_LIBDIR)/$(LIBBLIS)-*.$(SHLIB_EXT))
# This shell command grabs all files named "*.h" that are not blis.h or cblas.h
# in the installation directory. We consider this set of headers to be "old" and
# eligible for removal upon running of the uninstall-old-headers target.
UNINSTALL_OLD_HEADERS := $(shell $(FIND) $(INSTALL_INCDIR)/blis/ -name "*.h" 2> /dev/null | $(GREP) -v "$(BLIS_H)" | $(GREP) -v "$(CBLAS_H)")
UNINSTALL_OLD_HEADERS := $(filter-out $(BLIS_H),$(filter-out $(CBLAS_H),$(wildcard $(INSTALL_INCDIR)/blis/*.h)))
endif # IS_CONFIGURED
@@ -1027,23 +1026,24 @@ endif # ifeq ($(IS_WIN),no)
# --- Query current configuration ---
showconfig: check-env
@echo "configuration family: $(CONFIG_NAME)"
@echo "sub-configurations: $(CONFIG_LIST)"
@echo "requisite kernels: $(KERNEL_LIST)"
@echo "kernel-to-config map: $(KCONFIG_MAP)"
@echo "-----------------------"
@echo "BLIS version string: $(VERSION)"
@echo ".so major version: $(SO_MAJOR)"
@echo ".so minor.build vers: $(SO_MINORB)"
@echo "install libdir: $(INSTALL_LIBDIR)"
@echo "install includedir: $(INSTALL_INCDIR)"
@echo "debugging status: $(DEBUG_TYPE)"
@echo "multithreading status: $(THREADING_MODEL)"
@echo "enable BLAS API? $(MK_ENABLE_BLAS)"
@echo "enable CBLAS API? $(MK_ENABLE_CBLAS)"
@echo "build static library? $(MK_ENABLE_STATIC)"
@echo "build shared library? $(MK_ENABLE_SHARED)"
@echo "ARG_MAX hack enabled? $(ARG_MAX_HACK)"
@echo "configuration family: $(CONFIG_NAME)"
@echo "sub-configurations: $(CONFIG_LIST)"
@echo "requisite kernels sets: $(KERNEL_LIST)"
@echo "kernel-to-config map: $(KCONFIG_MAP)"
@echo "-------------------------"
@echo "BLIS version string: $(VERSION)"
@echo ".so major version: $(SO_MAJOR)"
@echo ".so minor.build vers: $(SO_MINORB)"
@echo "install libdir: $(INSTALL_LIBDIR)"
@echo "install includedir: $(INSTALL_INCDIR)"
@echo "install sharedir: $(INSTALL_SHAREDIR)"
@echo "debugging status: $(DEBUG_TYPE)"
@echo "multithreading status: $(THREADING_MODEL)"
@echo "enable BLAS API? $(MK_ENABLE_BLAS)"
@echo "enable CBLAS API? $(MK_ENABLE_CBLAS)"
@echo "build static library? $(MK_ENABLE_STATIC)"
@echo "build shared library? $(MK_ENABLE_SHARED)"
@echo "ARG_MAX hack enabled? $(ARG_MAX_HACK)"
# --- Clean rules ---
@@ -1059,16 +1059,16 @@ ifneq ($(SANDBOX),)
- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
endif
else
@echo "Removing makefile fragments from $(CONFIG_FRAG_PATH)."
@echo "Removing makefile fragments from $(CONFIG_FRAG_PATH)"
@- $(FIND) $(CONFIG_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
@echo "Removing makefile fragments from $(FRAME_FRAG_PATH)."
@echo "Removing makefile fragments from $(FRAME_FRAG_PATH)"
@- $(FIND) $(FRAME_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
@echo "Removing makefile fragments from $(REFKERN_FRAG_PATH)."
@echo "Removing makefile fragments from $(REFKERN_FRAG_PATH)"
@- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
@echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)."
@echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)"
@- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
ifneq ($(SANDBOX),)
@echo "Removing makefile fragments from $(SANDBOX_FRAG_PATH)."
@echo "Removing makefile fragments from $(SANDBOX_FRAG_PATH)"
@- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
endif
endif
@@ -1080,7 +1080,7 @@ ifeq ($(ENABLE_VERBOSE),yes)
$(RM_F) $(BLIS_H_FLAT)
$(RM_F) $(CBLAS_H_FLAT)
else
@echo "Removing flattened header files from $(BASE_INC_PATH)."
@echo "Removing flattened header files from $(BASE_INC_PATH)"
@$(RM_F) $(BLIS_H_FLAT)
@$(RM_F) $(CBLAS_H_FLAT)
endif
@@ -1093,9 +1093,9 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(LIBBLIS_A_PATH)
- $(RM_F) $(LIBBLIS_SO_PATH)
else
@echo "Removing object files from $(BASE_OBJ_PATH)."
@echo "Removing object files from $(BASE_OBJ_PATH)"
@- $(FIND) $(BASE_OBJ_PATH) -name "*.o" | $(XARGS) $(RM_F)
@echo "Removing libraries from $(BASE_LIB_PATH)."
@echo "Removing libraries from $(BASE_LIB_PATH)"
@- $(RM_F) $(LIBBLIS_A_PATH)
@- $(RM_F) $(LIBBLIS_SO_PATH)
endif
@@ -1117,13 +1117,13 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(BLASTEST_DRV_BIN_PATHS)
- $(RM_F) $(addprefix out.,$(BLASTEST_DRV_BASES))
else
@echo "Removing object files from $(BASE_OBJ_BLASTEST_PATH)."
@echo "Removing object files from $(BASE_OBJ_BLASTEST_PATH)"
@- $(RM_F) $(BLASTEST_F2C_OBJS) $(BLASTEST_DRV_OBJS)
@echo "Removing libf2c.a from $(BASE_OBJ_BLASTEST_PATH)."
@echo "Removing libf2c.a from $(BASE_OBJ_BLASTEST_PATH)"
@- $(RM_F) $(BLASTEST_F2C_LIB)
@echo "Removing binaries from $(BASE_OBJ_BLASTEST_PATH)."
@echo "Removing binaries from $(BASE_OBJ_BLASTEST_PATH)"
@- $(RM_F) $(BLASTEST_DRV_BIN_PATHS)
@echo "Removing driver output files 'out.*'."
@echo "Removing driver output files 'out.*'"
@- $(RM_F) $(addprefix out.,$(BLASTEST_DRV_BASES))
endif # ENABLE_VERBOSE
endif # IS_CONFIGURED
@@ -1136,13 +1136,13 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(BLASTEST_DIR)/$(BLASTEST_F2C_LIB_NAME)
- $(RM_F) $(addprefix $(BLASTEST_DIR)/out.,$(BLASTEST_DRV_BASES))
else
@echo "Removing object files from ./$(BLASTEST_DIR)/$(OBJ_DIR)."
@echo "Removing object files from ./$(BLASTEST_DIR)/$(OBJ_DIR)"
@- $(FIND) $(BLASTEST_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
@echo "Removing libf2c.a from ./$(BLASTEST_DIR)."
@echo "Removing libf2c.a from ./$(BLASTEST_DIR)"
@- $(RM_F) $(BLASTEST_DIR)/$(BLASTEST_F2C_LIB_NAME)
@echo "Removing binaries from ./$(BLASTEST_DIR)."
@echo "Removing binaries from ./$(BLASTEST_DIR)"
@- $(FIND) $(BLASTEST_DIR) -name "*.x" | $(XARGS) $(RM_F)
@echo "Removing driver output files 'out.*' from ./$(BLASTEST_DIR)."
@echo "Removing driver output files 'out.*' from ./$(BLASTEST_DIR)"
@- $(RM_F) $(addprefix $(BLASTEST_DIR)/out.,$(BLASTEST_DRV_BASES))
endif # ENABLE_VERBOSE
endif # IS_CONFIGURED
@@ -1160,11 +1160,11 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(TESTSUITE_BIN)
- $(RM_F) $(TESTSUITE_OUT_FILE)
else
@echo "Removing object files from $(BASE_OBJ_TESTSUITE_PATH)."
@echo "Removing object files from $(BASE_OBJ_TESTSUITE_PATH)"
@- $(RM_F) $(MK_TESTSUITE_OBJS)
@echo "Removing binary $(TESTSUITE_BIN)."
@echo "Removing binary $(TESTSUITE_BIN)"
@- $(RM_F) $(TESTSUITE_BIN)
@echo "Removing $(TESTSUITE_OUT_FILE)."
@echo "Removing $(TESTSUITE_OUT_FILE)"
@- $(RM_F) $(TESTSUITE_OUT_FILE)
endif # ENABLE_VERBOSE
endif # IS_CONFIGURED
@@ -1176,9 +1176,9 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
- $(MAKE) -C $(CPP_TEST_DIR) clean
else
@echo "Removing object files from $(TESTSUITE_DIR)/$(OBJ_DIR)."
@echo "Removing object files from $(TESTSUITE_DIR)/$(OBJ_DIR)"
@- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
@echo "Removing binary $(TESTSUITE_DIR)/$(TESTSUITE_BIN)."
@echo "Removing binary $(TESTSUITE_DIR)/$(TESTSUITE_BIN)"
@- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
@$(MAKE) -C $(CPP_TEST_DIR) clean
endif # ENABLE_VERBOSE
@@ -1193,15 +1193,15 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_RF) $(LIB_DIR)
- $(RM_RF) $(INCLUDE_DIR)
else
@echo "Removing $(BLIS_CONFIG_H)."
@echo "Removing $(BLIS_CONFIG_H)"
@$(RM_F) $(BLIS_CONFIG_H)
@echo "Removing $(CONFIG_MK_FILE)."
@echo "Removing $(CONFIG_MK_FILE)"
@- $(RM_F) $(CONFIG_MK_FILE)
@echo "Removing $(OBJ_DIR)."
@echo "Removing $(OBJ_DIR)"
@- $(RM_RF) $(OBJ_DIR)
@echo "Removing $(LIB_DIR)."
@echo "Removing $(LIB_DIR)"
@- $(RM_RF) $(LIB_DIR)
@echo "Removing $(INCLUDE_DIR)."
@echo "Removing $(INCLUDE_DIR)"
@- $(RM_RF) $(INCLUDE_DIR)
endif
endif
@@ -1210,7 +1210,7 @@ endif
# --- CHANGELOG rules ---
changelog:
@echo "Updating '$(DIST_PATH)/$(CHANGELOG)' via '$(GIT_LOG)'."
@echo "Updating '$(DIST_PATH)/$(CHANGELOG)' via '$(GIT_LOG)'"
@$(GIT_LOG) > $(DIST_PATH)/$(CHANGELOG)
@@ -1225,7 +1225,7 @@ uninstall-libs: check-env
ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(MK_LIBS_INST)
else
@echo "Uninstalling libraries $(notdir $(MK_LIBS_INST)) from $(dir $(firstword $(MK_LIBS_INST)))."
@echo "Uninstalling libraries $(notdir $(MK_LIBS_INST)) from $(dir $(firstword $(MK_LIBS_INST)))"
@- $(RM_F) $(MK_LIBS_INST)
endif
@@ -1233,7 +1233,7 @@ uninstall-lib-symlinks: check-env
ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(MK_LIBS_SYML)
else
@echo "Uninstalling symlinks $(notdir $(MK_LIBS_SYML)) from $(dir $(firstword $(MK_LIBS_SYML)))."
@echo "Uninstalling symlinks $(notdir $(MK_LIBS_SYML)) from $(dir $(firstword $(MK_LIBS_SYML)))"
@- $(RM_F) $(MK_LIBS_SYML)
endif
@@ -1241,7 +1241,7 @@ uninstall-headers: check-env
ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_RF) $(MK_INCL_DIR_INST)
else
@echo "Uninstalling directory '$(notdir $(MK_INCL_DIR_INST))' from $(dir $(MK_INCL_DIR_INST))."
@echo "Uninstalling directory '$(notdir $(MK_INCL_DIR_INST))' from $(dir $(MK_INCL_DIR_INST))"
@- $(RM_RF) $(MK_INCL_DIR_INST)
endif
@@ -1249,7 +1249,7 @@ uninstall-share: check-env
ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_RF) $(MK_SHARE_DIR_INST)
else
@echo "Uninstalling directory '$(notdir $(MK_SHARE_DIR_INST))' from $(dir $(MK_SHARE_DIR_INST))."
@echo "Uninstalling directory '$(notdir $(MK_SHARE_DIR_INST))' from $(dir $(MK_SHARE_DIR_INST))"
@- $(RM_RF) $(MK_SHARE_DIR_INST)
endif
@@ -1265,7 +1265,7 @@ $(UNINSTALL_OLD_LIBS) $(UNINSTALL_OLD_SYML) $(UNINSTALL_OLD_HEADERS): check-env
ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $@
else
@echo "Uninstalling $(@F) from $(@D)/."
@echo "Uninstalling $(@F) from $(@D)/"
@- $(RM_F) $@
endif

View File

@@ -6,6 +6,7 @@ Contents
--------
* **[Introduction](#introduction)**
* **[Education and Learning](#education-and-learning)**
* **[What's New](#whats-new)**
* **[What People Are Saying About BLIS](#what-people-are-saying-about-blis)**
* **[Key Features](#key-features)**
@@ -76,9 +77,38 @@ and [collaborators](http://shpc.ices.utexas.edu/collaborators.html),
[publications](http://shpc.ices.utexas.edu/publications.html),
and [other educational projects](http://www.ulaff.net/) (such as MOOCs).
Education and Learning
----------------------
Want to understand what's under the hood?
Many of the same concepts and principles employed when developing BLIS are
introduced and taught in a basic pedagogical setting as part of
[LAFF-On Programming for High Performance (LAFF-On-PfHP)](http://www.ulaff.net/),
one of several massive open online courses (MOOCs) in the
[Linear Algebra: Foundations to Frontiers](http://www.ulaff.net/) series,
all of which are available for free via the [edX platform](http://www.edx.org/).
What's New
----------
* **Small/skinny matrix support for dgemm now available!** Thanks to
contributions made possible by our partnership with AMD, we have dramatically
accelerated `gemm` for double-precision real matrix problems where one or two
dimensions is exceedingly small. A natural byproduct of this optimization is
that the traditional case of small _m = n = k_ (i.e. square matrices) is also
accelerated, even though it was not targeted specifically. And though only
`dgemm` was optimized for now, support for other datatypes, other operations,
and/or multithreading may be implemented in the future. We've also added a new
[PerformanceSmall](docs/PerformanceSmall.md) document to showcase the
improvement in performance when some matrix dimensions are small.
* **Performance comparisons now available!** We recently measured the
performance of various level-3 operations on a variety of hardware architectures,
as implemented within BLIS and other BLAS libraries for all four of the standard
floating-point datatypes. The results speak for themselves! Check out our
extensive performance graphs and background info in our new
[Performance](docs/Performance.md) document.
* **BLIS is now in Debian Unstable!** Thanks to Debian developer-maintainers
[M. Zhou](https://github.com/cdluminate) and
[Nico Schlömer](https://github.com/nschloe) for sponsoring our package in Debian.
@@ -87,7 +117,7 @@ the second-most popular Linux distribution (behind Ubuntu, which Debian packages
feed into). The Debian tracker page may be found
[here](https://tracker.debian.org/pkg/blis).
* **BLIS now supports mixed-datatype gemm.** The `gemm` operation may now be
* **BLIS now supports mixed-datatype gemm!** The `gemm` operation may now be
executed on operands of mixed domains and/or mixed precisions. Any combination
of storage datatype for A, B, and C is now supported, along with a separate
computation precision that can differ from the storage precision of A and B.
@@ -313,10 +343,20 @@ table of supported microarchitectures.
* **[Multithreading](docs/Multithreading.md).** This document describes how to
use the multithreading features of BLIS.
* **[Mixed-Datatype](docs/MixedDatatype.md).** This document provides an
* **[Mixed-Datatypes](docs/MixedDatatypes.md).** This document provides an
overview of BLIS's mixed-datatype functionality and provides a brief example
of how to take advantage of this new code.
* **[Performance](docs/Performance.md).** This document reports empirically
measured performance of a representative set of level-3 operations on a variety
of hardware architectures, as implemented within BLIS and other BLAS libraries
for all four of the standard floating-point datatypes.
* **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports
empirically measured performance of `gemm` on select hardware architectures
within BLIS and other BLAS libraries when performing matrix problems where one
or two dimensions is exceedingly small.
* **[Release Notes](docs/ReleaseNotes.md).** This document tracks a summary of
changes included with each new version of BLIS, along with contributor credits
for key features.

View File

@@ -136,7 +136,7 @@ CFLAGS += -Wno-maybe-uninitialized -Wno-parentheses -Wfatal-errors \
-I$(INC_PATH) -DHAVE_BLIS_H
# Locate the libblis library to which we will link.
LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
# Override the location of the check-blastest.sh script.
#BLASTEST_CHECK := ./check-blastest.sh

View File

@@ -135,6 +135,12 @@
#endif
#endif
#if @enable_sup_handling@
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif
#if @enable_memkind@
#define BLIS_ENABLE_MEMKIND
#else
@@ -159,4 +165,5 @@
#define BLIS_DISABLE_SHARED
#endif
#endif

View File

@@ -115,13 +115,33 @@ THREADING_MODEL := @threading_model@
# Whether the compiler supports "#pragma omp simd" via the -fopenmp-simd option.
PRAGMA_OMP_SIMD := @pragma_omp_simd@
# The install libdir, includedir, and shareddir values from configure tell
# us where to install the libraries, header files, and public makefile
# fragments, respectively. Notice that we support the use of DESTDIR so that
# advanced users may install to a temporary location.
INSTALL_LIBDIR := $(DESTDIR)@install_libdir@
INSTALL_INCDIR := $(DESTDIR)@install_incdir@
INSTALL_SHAREDIR := $(DESTDIR)@install_sharedir@
# The installation prefix, exec_prefix, libdir, includedir, and shareddir
# values from configure tell us where to install the libraries, header files,
# and public makefile fragments. We must first assign each substituted
# @anchor@ to its own variable. Why? Because the subsitutions may contain
# unevaluated variable expressions. For example, '@libdir@' may be replaced
# with '${exec_prefix}/lib'. By assigning the anchors to variables first, and
# then assigning them to their final INSTALL_* variables, we allow prefix and
# exec_prefix to be used in the definitions of exec_prefix, libdir,
# includedir, and sharedir.
prefix := @prefix@
exec_prefix := @exec_prefix@
libdir := @libdir@
includedir := @includedir@
sharedir := @sharedir@
# Notice that we support the use of DESTDIR so that advanced users may install
# to a temporary location.
INSTALL_LIBDIR := $(DESTDIR)$(libdir)
INSTALL_INCDIR := $(DESTDIR)$(includedir)
INSTALL_SHAREDIR := $(DESTDIR)$(sharedir)
#$(info prefix = $(prefix) )
#$(info exec_prefix = $(exec_prefix) )
#$(info libdir = $(libdir) )
#$(info includedir = $(includedir) )
#$(info sharedir = $(sharedir) )
#$(error .)
# Whether to output verbose command-line feedback as the Makefile is
# processed.
@@ -135,11 +155,15 @@ BUILDING_OOT := @configured_oot@
ARG_MAX_HACK := @enable_arg_max_hack@
# Whether to build the static and shared libraries.
# Note the "MK_" prefix, which helps differentiate these variables from
# NOTE: The "MK_" prefix, which helps differentiate these variables from
# their corresonding cpp macros that use the BLIS_ prefix.
MK_ENABLE_STATIC := @enable_static@
MK_ENABLE_SHARED := @enable_shared@
# Whether to export all symbols within the shared library, even those symbols
# that are considered to be for internal use only.
EXPORT_SHARED := @export_shared@
# Whether to enable either the BLAS or CBLAS compatibility layers.
MK_ENABLE_BLAS := @enable_blas@
MK_ENABLE_CBLAS := @enable_cblas@

View File

@@ -33,6 +33,7 @@
*/
#define BLIS_EXPORT_BLIS
#include "bli_system.h"
#include "bli_type_defs.h"
#include "bli_arch.h"

View File

@@ -244,10 +244,24 @@ def flatten_header( inputfile, header_dirpaths, cursp ):
# directive.
header_path = get_header_path( header, header_dirpaths )
# If the header was found, we recurse. Otherwise, we output
# the #include directive with a comment indicating that it
# was skipped.
if header_path:
# First, check if the header is our root header (and if so, ignore it).
# Otherwise, if the header was found, we recurse. Otherwise, we output
# the #include directive with a comment indicating that it as skipped
if header == root_inputfile:
markl = result.group(1)
markr = result.group(3)
echov2( "%sthis is the root header '%s'; commenting out / skipping." \
% ( cursp, header ) )
# If the header found is our root header, then we cannot
# recurse into it lest we enter an infinite loop. Output the
# line but make sure it's commented out entirely.
ostring += "%s #include %c%s%c %c" \
% ( skipstr, markl, header, markr, '\n' )
elif header_path:
echov2( "%slocated file '%s'; recursing." \
% ( cursp, header_path ) )
@@ -327,6 +341,7 @@ strip_comments = None
recursive_flag = None
verbose_flag = None
regex = None
root_inputfile = None
def main():
@@ -336,6 +351,7 @@ def main():
global recursive_flag
global verbose_flag
global regex
global root_inputfile
# Obtain the script name.
path, script_name = os.path.split(sys.argv[0])
@@ -397,6 +413,10 @@ def main():
temp_dir = args[2]
dir_list = args[3]
# Save the filename (basename) part of the input file (or root file) into a
# global variable that we can access later from within flatten_header().
root_inputfile = os.path.basename( inputfile )
# Separate the directories into distinct strings.
dir_list = dir_list.split()

View File

@@ -417,8 +417,9 @@ main()
# The arguments to this function. They'll get assigned meaningful
# values after getopts.
mkfile_frag_tmpl_path=""
root_dir=""
frag_dir=""
mkfile_frag_tmpl_path=""
suffix_file=""
ignore_file=""

View File

@@ -183,13 +183,11 @@ bli_cgemm4mb
bli_cgemm4mb_ker_var2
bli_cgemm4mh
bli_cgemm_ex
bli_cgemm_haswell_asm_3x8
bli_cgemm_haswell_asm_8x3
bli_cgemm_ker_var2
bli_cgemm_md_c2r_ref
bli_cgemm_ukernel
bli_cgemmtrsm_l_ukernel
bli_cgemmtrsm_u_ukernel
bli_cgemm_ukernel
bli_cgemv
bli_cgemv_ex
bli_cgemv_unb_var1
@@ -285,12 +283,6 @@ bli_chemv_unf_var3a
bli_cher
bli_cher2
bli_cher2_ex
bli_cher2_unb_var1
bli_cher2_unb_var2
bli_cher2_unb_var3
bli_cher2_unb_var4
bli_cher2_unf_var1
bli_cher2_unf_var4
bli_cher2k
bli_cher2k1m
bli_cher2k3m1
@@ -298,9 +290,13 @@ bli_cher2k3mh
bli_cher2k4m1
bli_cher2k4mh
bli_cher2k_ex
bli_cher2_unb_var1
bli_cher2_unb_var2
bli_cher2_unb_var3
bli_cher2_unb_var4
bli_cher2_unf_var1
bli_cher2_unf_var4
bli_cher_ex
bli_cher_unb_var1
bli_cher_unb_var2
bli_cherk
bli_cherk1m
bli_cherk3m1
@@ -310,6 +306,8 @@ bli_cherk4mh
bli_cherk_ex
bli_cherk_l_ker_var2
bli_cherk_u_ker_var2
bli_cher_unb_var1
bli_cher_unb_var2
bli_cinvertd
bli_cinvertd_ex
bli_cinvertsc
@@ -354,8 +352,8 @@ bli_cntl_copy
bli_cntl_create_node
bli_cntl_free
bli_cntl_free_node
bli_cntl_free_w_thrinfo
bli_cntl_free_wo_thrinfo
bli_cntl_free_w_thrinfo
bli_cntl_mark_family
bli_cntx_1m_stage
bli_cntx_3m1_stage
@@ -544,8 +542,8 @@ bli_ctrsm1m
bli_ctrsm3m1
bli_ctrsm4m1
bli_ctrsm_ex
bli_ctrsm_l_ukernel
bli_ctrsm_ll_ker_var2
bli_ctrsm_l_ukernel
bli_ctrsm_lu_ker_var2
bli_ctrsm_rl_ker_var2
bli_ctrsm_ru_ker_var2
@@ -591,7 +589,6 @@ bli_daddv
bli_daddv_ex
bli_damaxv
bli_damaxv_ex
bli_damaxv_zen_int
bli_dasumv
bli_dasumv_ex
bli_dasumv_unb_var1
@@ -603,14 +600,11 @@ bli_daxpyd
bli_daxpyd_ex
bli_daxpyf
bli_daxpyf_ex
bli_daxpyf_zen_int_8
bli_daxpym
bli_daxpym_ex
bli_daxpym_unb_var1
bli_daxpyv
bli_daxpyv_ex
bli_daxpyv_zen_int
bli_daxpyv_zen_int10
bli_dccastm
bli_dccastnzm
bli_dccastv
@@ -640,16 +634,12 @@ bli_ddotaxpyv
bli_ddotaxpyv_ex
bli_ddotv
bli_ddotv_ex
bli_ddotv_zen_int
bli_ddotv_zen_int10
bli_ddotxaxpyf
bli_ddotxaxpyf_ex
bli_ddotxf
bli_ddotxf_ex
bli_ddotxf_zen_int_8
bli_ddotxv
bli_ddotxv_ex
bli_ddotxv_zen_int
bli_ddpackm_blk_var1_md
bli_ddpackm_cxk_1e_md
bli_ddpackm_cxk_1r_md
@@ -673,14 +663,10 @@ bli_dgemm4mb
bli_dgemm4mb_ker_var2
bli_dgemm4mh
bli_dgemm_ex
bli_dgemm_haswell_asm_6x8
bli_dgemm_haswell_asm_8x6
bli_dgemm_ker_var2
bli_dgemm_ukernel
bli_dgemmtrsm_l_haswell_asm_6x8
bli_dgemmtrsm_l_ukernel
bli_dgemmtrsm_u_haswell_asm_6x8
bli_dgemmtrsm_u_ukernel
bli_dgemm_ukernel
bli_dgemv
bli_dgemv_ex
bli_dgemv_unb_var1
@@ -713,12 +699,6 @@ bli_dhemv_unf_var3a
bli_dher
bli_dher2
bli_dher2_ex
bli_dher2_unb_var1
bli_dher2_unb_var2
bli_dher2_unb_var3
bli_dher2_unb_var4
bli_dher2_unf_var1
bli_dher2_unf_var4
bli_dher2k
bli_dher2k1m
bli_dher2k3m1
@@ -726,9 +706,13 @@ bli_dher2k3mh
bli_dher2k4m1
bli_dher2k4mh
bli_dher2k_ex
bli_dher2_unb_var1
bli_dher2_unb_var2
bli_dher2_unb_var3
bli_dher2_unb_var4
bli_dher2_unf_var1
bli_dher2_unf_var4
bli_dher_ex
bli_dher_unb_var1
bli_dher_unb_var2
bli_dherk
bli_dherk1m
bli_dherk3m1
@@ -738,6 +722,8 @@ bli_dherk4mh
bli_dherk_ex
bli_dherk_l_ker_var2
bli_dherk_u_ker_var2
bli_dher_unb_var1
bli_dher_unb_var2
bli_dinvertd
bli_dinvertd_ex
bli_dinvertsc
@@ -746,11 +732,6 @@ bli_dinvertv_ex
bli_divsc
bli_divsc_check
bli_divsc_qfp
bli_dlamc1
bli_dlamc2
bli_dlamc3
bli_dlamc4
bli_dlamc5
bli_dlamch
bli_dmachval
bli_dmkherm
@@ -838,8 +819,6 @@ bli_dscalm_ex
bli_dscalm_unb_var1
bli_dscalv
bli_dscalv_ex
bli_dscalv_zen_int
bli_dscalv_zen_int10
bli_dscastm
bli_dscastnzm
bli_dscastv
@@ -906,11 +885,6 @@ bli_dsyrk3mh
bli_dsyrk4m1
bli_dsyrk4mh
bli_dsyrk_ex
bli_dt_size
bli_dt_size_check
bli_dt_string
bli_dt_string_check
bli_dt_union_check
bli_dtrmm
bli_dtrmm1m
bli_dtrmm3
@@ -938,8 +912,8 @@ bli_dtrsm1m
bli_dtrsm3m1
bli_dtrsm4m1
bli_dtrsm_ex
bli_dtrsm_l_ukernel
bli_dtrsm_ll_ker_var2
bli_dtrsm_l_ukernel
bli_dtrsm_lu_ker_var2
bli_dtrsm_rl_ker_var2
bli_dtrsm_ru_ker_var2
@@ -950,6 +924,11 @@ bli_dtrsv_unb_var1
bli_dtrsv_unb_var2
bli_dtrsv_unf_var1
bli_dtrsv_unf_var2
bli_dt_size
bli_dt_size_check
bli_dt_string
bli_dt_string_check
bli_dt_union_check
bli_dunpackm_blk_var1
bli_dunpackm_cxk
bli_dunpackm_unb_var1
@@ -1018,6 +997,7 @@ bli_gemm_basic_check
bli_gemm_blk_var1
bli_gemm_blk_var2
bli_gemm_blk_var3
bli_gemmbp_cntl_create
bli_gemm_check
bli_gemm_cntl_create
bli_gemm_cntl_create_node
@@ -1028,6 +1008,8 @@ bli_gemm_determine_kc_f
bli_gemm_direct
bli_gemm_ex
bli_gemm_front
bli_gemmind
bli_gemmind_get_avail
bli_gemm_int
bli_gemm_ker_var2
bli_gemm_ker_var2_md
@@ -1040,20 +1022,17 @@ bli_gemm_md_rcc
bli_gemm_md_rcr
bli_gemm_md_rrc
bli_gemm_md_rrr
bli_gemmnat
bli_gemm_packa
bli_gemm_packb
bli_gemm_prune_unref_mparts_k
bli_gemm_prune_unref_mparts_m
bli_gemm_prune_unref_mparts_n
bli_gemmtrsm_l_ukernel_qfp
bli_gemmtrsm_ukernel
bli_gemmtrsm_u_ukernel_qfp
bli_gemm_ukernel
bli_gemm_ukernel_qfp
bli_gemmbp_cntl_create
bli_gemmind
bli_gemmind_get_avail
bli_gemmnat
bli_gemmtrsm_l_ukernel_qfp
bli_gemmtrsm_u_ukernel_qfp
bli_gemmtrsm_ukernel
bli_gemv
bli_gemv_check
bli_gemv_ex
@@ -1120,30 +1099,18 @@ bli_hemv_unb_var3_qfp
bli_hemv_unb_var4
bli_hemv_unb_var4_qfp
bli_hemv_unf_var1
bli_hemv_unf_var1_qfp
bli_hemv_unf_var1a
bli_hemv_unf_var1a_qfp
bli_hemv_unf_var1_qfp
bli_hemv_unf_var3
bli_hemv_unf_var3_qfp
bli_hemv_unf_var3a
bli_hemv_unf_var3a_qfp
bli_hemv_unf_var3_qfp
bli_her
bli_her2
bli_her2_check
bli_her2_ex
bli_her2_ex_qfp
bli_her2_unb_var1
bli_her2_unb_var1_qfp
bli_her2_unb_var2
bli_her2_unb_var2_qfp
bli_her2_unb_var3
bli_her2_unb_var3_qfp
bli_her2_unb_var4
bli_her2_unb_var4_qfp
bli_her2_unf_var1
bli_her2_unf_var1_qfp
bli_her2_unf_var4
bli_her2_unf_var4_qfp
bli_her2k
bli_her2k1m
bli_her2k3m1
@@ -1157,13 +1124,21 @@ bli_her2k_front
bli_her2kind
bli_her2kind_get_avail
bli_her2knat
bli_her2_unb_var1
bli_her2_unb_var1_qfp
bli_her2_unb_var2
bli_her2_unb_var2_qfp
bli_her2_unb_var3
bli_her2_unb_var3_qfp
bli_her2_unb_var4
bli_her2_unb_var4_qfp
bli_her2_unf_var1
bli_her2_unf_var1_qfp
bli_her2_unf_var4
bli_her2_unf_var4_qfp
bli_her_check
bli_her_ex
bli_her_ex_qfp
bli_her_unb_var1
bli_her_unb_var1_qfp
bli_her_unb_var2
bli_her_unb_var2_qfp
bli_herk
bli_herk1m
bli_herk3m1
@@ -1178,15 +1153,19 @@ bli_herk_determine_kc_f
bli_herk_direct
bli_herk_ex
bli_herk_front
bli_herkind
bli_herkind_get_avail
bli_herk_l_ker_var2
bli_herknat
bli_herk_prune_unref_mparts_k
bli_herk_prune_unref_mparts_m
bli_herk_prune_unref_mparts_n
bli_herk_u_ker_var2
bli_herk_x_ker_var2
bli_herkind
bli_herkind_get_avail
bli_herknat
bli_her_unb_var1
bli_her_unb_var1_qfp
bli_her_unb_var2
bli_her_unb_var2_qfp
bli_ifprintm
bli_ifprintv
bli_igetsc
@@ -1217,9 +1196,9 @@ bli_info_get_enable_sba_pools
bli_info_get_enable_stay_auto_init
bli_info_get_enable_threading
bli_info_get_gemm_impl_string
bli_info_get_gemm_ukr_impl_string
bli_info_get_gemmtrsm_l_ukr_impl_string
bli_info_get_gemmtrsm_u_ukr_impl_string
bli_info_get_gemm_ukr_impl_string
bli_info_get_heap_addr_align_size
bli_info_get_heap_stride_align_size
bli_info_get_hemm_impl_string
@@ -1278,12 +1257,12 @@ bli_l1d_xy_check
bli_l1m_ax_check
bli_l1m_axy_check
bli_l1m_xy_check
bli_l1v_ax_check
bli_l1v_axby_check
bli_l1v_ax_check
bli_l1v_axy_check
bli_l1v_dot_check
bli_l1v_x_check
bli_l1v_xby_check
bli_l1v_x_check
bli_l1v_xi_check
bli_l1v_xy_check
bli_l3_basic_check
@@ -1452,12 +1431,10 @@ bli_pool_init
bli_pool_print
bli_pool_reinit
bli_pool_shrink
bli_pow_di
bli_pow_ri
bli_prime_factorization
bli_print_msg
bli_printm
bli_printm_ex
bli_print_msg
bli_printv
bli_printv_ex
bli_projm
@@ -1510,7 +1487,6 @@ bli_saddv
bli_saddv_ex
bli_samaxv
bli_samaxv_ex
bli_samaxv_zen_int
bli_sasumv
bli_sasumv_ex
bli_sasumv_unb_var1
@@ -1522,14 +1498,11 @@ bli_saxpyd
bli_saxpyd_ex
bli_saxpyf
bli_saxpyf_ex
bli_saxpyf_zen_int_8
bli_saxpym
bli_saxpym_ex
bli_saxpym_unb_var1
bli_saxpyv
bli_saxpyv_ex
bli_saxpyv_zen_int
bli_saxpyv_zen_int10
bli_sba_acquire
bli_sba_checkin_array
bli_sba_checkout_array
@@ -1591,16 +1564,12 @@ bli_sdotaxpyv
bli_sdotaxpyv_ex
bli_sdotv
bli_sdotv_ex
bli_sdotv_zen_int
bli_sdotv_zen_int10
bli_sdotxaxpyf
bli_sdotxaxpyf_ex
bli_sdotxf
bli_sdotxf_ex
bli_sdotxf_zen_int_8
bli_sdotxv
bli_sdotxv_ex
bli_sdotxv_zen_int
bli_sdpackm_blk_var1_md
bli_sdpackm_cxk_1e_md
bli_sdpackm_cxk_1r_md
@@ -1643,14 +1612,10 @@ bli_sgemm4mb
bli_sgemm4mb_ker_var2
bli_sgemm4mh
bli_sgemm_ex
bli_sgemm_haswell_asm_16x6
bli_sgemm_haswell_asm_6x16
bli_sgemm_ker_var2
bli_sgemm_ukernel
bli_sgemmtrsm_l_haswell_asm_6x16
bli_sgemmtrsm_l_ukernel
bli_sgemmtrsm_u_haswell_asm_6x16
bli_sgemmtrsm_u_ukernel
bli_sgemm_ukernel
bli_sgemv
bli_sgemv_ex
bli_sgemv_unb_var1
@@ -1683,12 +1648,6 @@ bli_shemv_unf_var3a
bli_sher
bli_sher2
bli_sher2_ex
bli_sher2_unb_var1
bli_sher2_unb_var2
bli_sher2_unb_var3
bli_sher2_unb_var4
bli_sher2_unf_var1
bli_sher2_unf_var4
bli_sher2k
bli_sher2k1m
bli_sher2k3m1
@@ -1696,9 +1655,13 @@ bli_sher2k3mh
bli_sher2k4m1
bli_sher2k4mh
bli_sher2k_ex
bli_sher2_unb_var1
bli_sher2_unb_var2
bli_sher2_unb_var3
bli_sher2_unb_var4
bli_sher2_unf_var1
bli_sher2_unf_var4
bli_sher_ex
bli_sher_unb_var1
bli_sher_unb_var2
bli_sherk
bli_sherk1m
bli_sherk3m1
@@ -1708,6 +1671,8 @@ bli_sherk4mh
bli_sherk_ex
bli_sherk_l_ker_var2
bli_sherk_u_ker_var2
bli_sher_unb_var1
bli_sher_unb_var2
bli_shiftd
bli_shiftd_check
bli_shiftd_ex
@@ -1717,11 +1682,6 @@ bli_sinvertd_ex
bli_sinvertsc
bli_sinvertv
bli_sinvertv_ex
bli_slamc1
bli_slamc2
bli_slamc3
bli_slamc4
bli_slamc5
bli_slamch
bli_sleep
bli_smachval
@@ -1793,8 +1753,6 @@ bli_sscalm_ex
bli_sscalm_unb_var1
bli_sscalv
bli_sscalv_ex
bli_sscalv_zen_int
bli_sscalv_zen_int10
bli_sscastm
bli_sscastnzm
bli_sscastv
@@ -1889,8 +1847,8 @@ bli_strsm1m
bli_strsm3m1
bli_strsm4m1
bli_strsm_ex
bli_strsm_l_ukernel
bli_strsm_ll_ker_var2
bli_strsm_l_ukernel
bli_strsm_lu_ker_var2
bli_strsm_rl_ker_var2
bli_strsm_ru_ker_var2
@@ -2062,17 +2020,17 @@ bli_trmm_determine_kc_f
bli_trmm_direct
bli_trmm_ex
bli_trmm_front
bli_trmmind
bli_trmmind_get_avail
bli_trmm_ll_ker_var2
bli_trmm_lu_ker_var2
bli_trmmnat
bli_trmm_prune_unref_mparts_k
bli_trmm_prune_unref_mparts_m
bli_trmm_prune_unref_mparts_n
bli_trmm_rl_ker_var2
bli_trmm_ru_ker_var2
bli_trmm_xx_ker_var2
bli_trmmind
bli_trmmind_get_avail
bli_trmmnat
bli_trmv
bli_trmv_check
bli_trmv_ex
@@ -2102,11 +2060,14 @@ bli_trsm_determine_kc_f
bli_trsm_direct
bli_trsm_ex
bli_trsm_front
bli_trsmind
bli_trsmind_get_avail
bli_trsm_int
bli_trsm_l_cntl_create
bli_trsm_l_ukernel_qfp
bli_trsm_ll_ker_var2
bli_trsm_l_ukernel_qfp
bli_trsm_lu_ker_var2
bli_trsmnat
bli_trsm_packa
bli_trsm_packb
bli_trsm_prune_unref_mparts_k
@@ -2115,12 +2076,9 @@ bli_trsm_prune_unref_mparts_n
bli_trsm_r_cntl_create
bli_trsm_rl_ker_var2
bli_trsm_ru_ker_var2
bli_trsm_u_ukernel_qfp
bli_trsm_ukernel
bli_trsm_u_ukernel_qfp
bli_trsm_xx_ker_var2
bli_trsmind
bli_trsmind_get_avail
bli_trsmnat
bli_trsv
bli_trsv_check
bli_trsv_ex
@@ -2245,13 +2203,11 @@ bli_zgemm4mb
bli_zgemm4mb_ker_var2
bli_zgemm4mh
bli_zgemm_ex
bli_zgemm_haswell_asm_3x4
bli_zgemm_haswell_asm_4x3
bli_zgemm_ker_var2
bli_zgemm_md_c2r_ref
bli_zgemm_ukernel
bli_zgemmtrsm_l_ukernel
bli_zgemmtrsm_u_ukernel
bli_zgemm_ukernel
bli_zgemv
bli_zgemv_ex
bli_zgemv_unb_var1
@@ -2284,12 +2240,6 @@ bli_zhemv_unf_var3a
bli_zher
bli_zher2
bli_zher2_ex
bli_zher2_unb_var1
bli_zher2_unb_var2
bli_zher2_unb_var3
bli_zher2_unb_var4
bli_zher2_unf_var1
bli_zher2_unf_var4
bli_zher2k
bli_zher2k1m
bli_zher2k3m1
@@ -2297,9 +2247,13 @@ bli_zher2k3mh
bli_zher2k4m1
bli_zher2k4mh
bli_zher2k_ex
bli_zher2_unb_var1
bli_zher2_unb_var2
bli_zher2_unb_var3
bli_zher2_unb_var4
bli_zher2_unf_var1
bli_zher2_unf_var4
bli_zher_ex
bli_zher_unb_var1
bli_zher_unb_var2
bli_zherk
bli_zherk1m
bli_zherk3m1
@@ -2309,6 +2263,8 @@ bli_zherk4mh
bli_zherk_ex
bli_zherk_l_ker_var2
bli_zherk_u_ker_var2
bli_zher_unb_var1
bli_zher_unb_var2
bli_zinvertd
bli_zinvertd_ex
bli_zinvertsc
@@ -2492,8 +2448,8 @@ bli_ztrsm1m
bli_ztrsm3m1
bli_ztrsm4m1
bli_ztrsm_ex
bli_ztrsm_l_ukernel
bli_ztrsm_ll_ker_var2
bli_ztrsm_l_ukernel
bli_ztrsm_lu_ker_var2
bli_ztrsm_rl_ker_var2
bli_ztrsm_ru_ker_var2
@@ -2528,19 +2484,6 @@ bli_zzpackm_struc_cxk_md
bli_zzxpbym_md
bli_zzxpbym_md_ex
bli_zzxpbym_md_unb_var1
bla_c_abs
bla_c_div
bla_d_abs
bla_d_cnjg
bla_d_imag
bla_d_sign
bla_f__cabs
bla_r_abs
bla_r_cnjg
bla_r_imag
bla_r_sign
bla_z_abs
bla_z_div
sasum_
sasumsub_
saxpy_
@@ -2567,14 +2510,14 @@ srotmg_
ssbmv_
sscal_
sspmv_
sspr2_
sspr_
sspr2_
sswap_
ssymm_
ssymv_
ssyr_
ssyr2_
ssyr2k_
ssyr_
ssyrk_
stbmv_
stbsv_
@@ -2606,14 +2549,14 @@ dscal_
dsdot_
dsdotsub_
dspmv_
dspr2_
dspr_
dspr2_
dswap_
dsymm_
dsymv_
dsyr_
dsyr2_
dsyr2k_
dsyr_
dsyrk_
dtbmv_
dtbsv_
@@ -2641,13 +2584,13 @@ cgeru_
chbmv_
chemm_
chemv_
cher_
cher2_
cher2k_
cher_
cherk_
chpmv_
chpr2_
chpr_
chpr2_
crotg_
cscal_
csrot_
@@ -2680,13 +2623,13 @@ zgeru_
zhbmv_
zhemm_
zhemv_
zher_
zher2_
zher2k_
zher_
zherk_
zhpmv_
zhpr2_
zhpr_
zhpr2_
zrotg_
zscal_
zswap_

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2019, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2019, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -5,6 +5,7 @@
# libraries.
#
# Copyright (C) 2019, The University of Texas at Austin
# Copyright (C) 2018, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are

118
common.mk
View File

@@ -118,7 +118,8 @@ get-noopt-cxxflags-for = $(strip $(CFLAGS_PRESET) \
get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
-DBLIS_CNAME=$(1) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
@@ -126,23 +127,27 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(COMPSIMDFLAGS) \
-DBLIS_CNAME=$(1) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
get-config-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
get-frame-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
$(call load-var-for,CKVECFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
# When compiling sandboxes, we use flags similar to those of general framework
@@ -153,19 +158,24 @@ get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
get-sandbox-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(CSBOXINCFLAGS) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
get-sandbox-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cxxflags-for,$(1)) \
$(CSBOXINCFLAGS) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
# Define a separate function that will return appropriate flags for use by
# applications that want to use the same basic flags as those used when BLIS
# was compiled. (This is the same as get-frame-cflags-for(), except that it
# omits the BUILD_FLAGS, which are exclusively for use when BLIS is being
# compiled.)
# was compiled. (NOTE: This is the same as the $(get-frame-cflags-for ...)
# function, except that it omits two variables that contain flags exclusively
# for use when BLIS is being compiled/built: BUILD_CPPFLAGS, which contains a
# cpp macro that confirms that BLIS is being built; and BUILD_SYMFLAGS, which
# contains symbol export flags that are only needed when a shared library is
# being compiled/linked.)
get-user-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
)
@@ -508,9 +518,9 @@ SOFLAGS := -shared
ifeq ($(IS_WIN),yes)
# Windows shared library link flags.
ifeq ($(CC_VENDOR),clang)
SOFLAGS += -Wl,-def:build/libblis-symbols.def -Wl,-implib:$(BASE_LIB_PATH)/$(LIBBLIS).lib
SOFLAGS += -Wl,-implib:$(BASE_LIB_PATH)/$(LIBBLIS).lib
else
SOFLAGS += -Wl,--export-all-symbols -Wl,--out-implib,$(BASE_LIB_PATH)/$(LIBBLIS).dll.a
SOFLAGS += -Wl,--out-implib,$(BASE_LIB_PATH)/$(LIBBLIS).dll.a
endif
else
# Linux shared library link flags.
@@ -532,6 +542,11 @@ ifeq ($(IS_WIN),no)
LDFLAGS += -Wl,-rpath,$(BASE_LIB_PATH)
endif
endif
# On windows, use the shared library even if static is created.
ifeq ($(IS_WIN),yes)
LIBBLIS_L := $(LIBBLIS_SO)
LIBBLIS_LINK := $(LIBBLIS_SO_PATH)
endif
endif
@@ -610,7 +625,7 @@ endif
$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CWARNFLAGS,$(c))))
# --- Shared library (position-independent code) flags ---
# --- Position-independent code flags (shared libraries only) ---
# Emit position-independent code for dynamic linking.
ifeq ($(IS_WIN),yes)
@@ -622,6 +637,71 @@ CPICFLAGS := -fPIC
endif
$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPICFLAGS,$(c))))
# --- Symbol exporting flags (shared libraries only) ---
# NOTE: These flags are only applied when building BLIS and not used by
# applications that import BLIS compilation flags via the
# $(get-user-cflags-for ...) function.
# Determine default export behavior / visibility of symbols for gcc.
ifeq ($(CC_VENDOR),gcc)
ifeq ($(IS_WIN),yes)
ifeq ($(EXPORT_SHARED),all)
BUILD_SYMFLAGS := -Wl,--export-all-symbols, -Wl,--enable-auto-import
else # ifeq ($(EXPORT_SHARED),public)
BUILD_SYMFLAGS := -Wl,--exclude-all-symbols
endif
else # ifeq ($(IS_WIN),no)
ifeq ($(EXPORT_SHARED),all)
# Export all symbols by default.
BUILD_SYMFLAGS := -fvisibility=default
else # ifeq ($(EXPORT_SHARED),public)
# Hide all symbols by default and export only those that have been annotated
# as needing to be exported.
BUILD_SYMFLAGS := -fvisibility=hidden
endif
endif
endif
# Determine default export behavior / visibility of symbols for icc.
# NOTE: The Windows branches have been omitted since we currently make no
# effort to support Windows builds via icc (only gcc/clang via AppVeyor).
ifeq ($(CC_VENDOR),icc)
ifeq ($(EXPORT_SHARED),all)
# Export all symbols by default.
BUILD_SYMFLAGS := -fvisibility=default
else # ifeq ($(EXPORT_SHARED),public)
# Hide all symbols by default and export only those that have been annotated
# as needing to be exported.
BUILD_SYMFLAGS := -fvisibility=hidden
endif
endif
# Determine default export behavior / visibility of symbols for clang.
ifeq ($(CC_VENDOR),clang)
ifeq ($(IS_WIN),yes)
ifeq ($(EXPORT_SHARED),all)
# NOTE: clang on Windows does not appear to support exporting all symbols
# by default, and therefore we ignore the value of EXPORT_SHARED.
BUILD_SYMFLAGS :=
else # ifeq ($(EXPORT_SHARED),public)
# NOTE: The default behavior of clang on Windows is to hide all symbols
# and only export functions and other declarations that have beenannotated
# as needing to be exported.
BUILD_SYMFLAGS :=
endif
else # ifeq ($(IS_WIN),no)
ifeq ($(EXPORT_SHARED),all)
# Export all symbols by default.
BUILD_SYMFLAGS := -fvisibility=default
else # ifeq ($(EXPORT_SHARED),public)
# Hide all symbols by default and export only those that have been annotated
# as needing to be exported.
BUILD_SYMFLAGS := -fvisibility=hidden
endif
endif
endif
# --- Language flags ---
# Enable C99.
@@ -685,8 +765,18 @@ endif
# --- #pragma omp simd flags (used for reference kernels only) ---
ifeq ($(PRAGMA_OMP_SIMD),yes)
ifeq ($(CC_VENDOR),gcc)
COMPSIMDFLAGS := -fopenmp-simd
else
ifeq ($(CC_VENDOR),clang)
COMPSIMDFLAGS := -fopenmp-simd
else
ifeq ($(CC_VENDOR),icc)
COMPSIMDFLAGS := -qopenmp-simd
endif
endif
endif
else # ifeq ($(PRAGMA_OMP_SIMD),no)
COMPSIMDFLAGS :=
endif
@@ -960,7 +1050,7 @@ VERS_DEF := -DBLIS_VERSION_STRING=\"$(VERSION)\"
# Define a C preprocessor flag that is *only* defined when BLIS is being
# compiled. (In other words, an application that #includes blis.h will not
# get this cpp macro.)
BUILD_FLAGS := -DBLIS_IS_BUILDING_LIBRARY
BUILD_CPPFLAGS := -DBLIS_IS_BUILDING_LIBRARY

View File

@@ -57,7 +57,7 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -fomit-frame-pointer
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
@@ -74,7 +74,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS)
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -57,16 +57,16 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -funroll-all-loops
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1
CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1 -mno-tbm -mno-xop -mno-lwp
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1
CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif
@@ -74,7 +74,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -57,16 +57,16 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -fomit-frame-pointer
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver4
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver4
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif
@@ -74,7 +74,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -78,7 +78,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS)
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,9 +35,12 @@
#include "blis.h"
//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )
void bli_cntx_init_haswell( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_haswell_ref( cntx );
@@ -69,6 +73,7 @@ void bli_cntx_init_haswell( cntx_t* cntx )
cntx
);
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
@@ -118,12 +123,18 @@ void bli_cntx_init_haswell( cntx_t* cntx )
#if 1
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1008, 1008, 1008, 1008 );
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 72, 36 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 75, 192 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
#else
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 6, 6, 3, 3 );
#endif
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1024, 1024, 1024, 1024 );
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 112, 64, 56, 32 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 112, 72, 56, 44 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
#endif
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 );
@@ -144,5 +155,62 @@ void bli_cntx_init_haswell( cntx_t* cntx )
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 201, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 100, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 120, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
8,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1,
-1, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -36,7 +37,6 @@
//#define BLIS_FAMILY_H
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------

View File

@@ -63,13 +63,13 @@ endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell
else
ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xCORE-AVX2
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell
else
$(error gcc, icc, or clang is required for this configuration.)
endif
@@ -78,7 +78,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) #-funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -78,7 +78,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS)
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -70,7 +70,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Override the default value for LDFLAGS.
LDFLAGS := -mmic

View File

@@ -99,7 +99,7 @@ endif
# Note: We use AVX2 for reference kernels instead of AVX-512.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd
CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations
else
ifeq ($(CC_VENDOR),icc)
CRVECFLAGS := -xMIC-AVX512

View File

@@ -57,7 +57,7 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -fomit-frame-pointer
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
@@ -78,7 +78,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -57,16 +57,16 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -fomit-frame-pointer
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver2
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver2
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif
@@ -74,7 +74,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -63,13 +63,13 @@ endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx
CKVECFLAGS := -mavx -mfpmath=sse -march=sandybridge
else
ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xAVX
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx
CKVECFLAGS := -mavx -mfpmath=sse -march=sandybridge
else
$(error gcc, icc, or clang is required for this configuration.)
endif
@@ -78,7 +78,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -89,7 +89,7 @@ endif
# to overcome the AVX-512 frequency drop". (Issue #187)
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd
CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations
else
ifeq ($(CC_VENDOR),icc)
CRVECFLAGS := -xCORE-AVX2

View File

@@ -57,16 +57,16 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -fomit-frame-pointer
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver3
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver3 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver3
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver3 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif
@@ -74,7 +74,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -57,7 +57,7 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.

View File

@@ -78,7 +78,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS)
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -35,9 +35,12 @@
#include "blis.h"
//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )
void bli_cntx_init_zen( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_zen_ref( cntx );
@@ -114,23 +117,27 @@ void bli_cntx_init_zen( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
/*
Multi Instance performance improvement of DGEMM when binded to a CCX
In Multi instance each thread runs a sequential DGEMM.
a) If BLIS is run in a multi instance mode with
CPU freq 2.6/2.2 Ghz
DDR4 clock frequency 2400Mhz
Multi Instance performance degradation on different cores
a) CPU freq 2.6 Ghz
DDR4 2400
Multi instance mode
mc = 240, kc = 512, and nc = 2040
has better performance on EPYC server, over the default block sizes.
b) CPU freq 2.4Ghz
DDR4 2400
Multi Instance mode
either
mc = 240, kc = 512 and nc = 2040
(or)
mc = 390, kc = 512 and nc = 4080
b) If BLIS is run in Single Instance mode
c) Higher frequency(3.1Ghz), single instance mode choose default value
mc = 510, kc = 1024 and nc = 4080
*/
// Zen optmized level 3 cache block sizes
#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
#if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 );
@@ -138,7 +145,6 @@ void bli_cntx_init_zen( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
#else
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );
@@ -150,9 +156,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
#endif
//bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
@@ -172,5 +176,62 @@ void bli_cntx_init_zen( cntx_t* cntx )
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 100, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 120, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
8,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1,
-1, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}

View File

@@ -39,14 +39,13 @@
// By default, it is effective to parallelize the outer loops.
// Setting these macros to 1 will force JR and IR inner loops
// to be not paralleized.
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
#define BLIS_DEFAULT_MR_THREAD_MAX 1
#define BLIS_DEFAULT_NR_THREAD_MAX 1
#define BLIS_ENABLE_ZEN_BLOCK_SIZES
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
@@ -64,6 +63,15 @@
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES 90
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22
// Allow the sup implementation to combine some small edge case iterations in
// the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the
// block-panel algorithm (NR) with the last full iteration that precedes it.
// NOTE: These cpp macros need to be explicitly set to an integer since they
// are used at compile-time to create unconditional branches or dead code
// regions.
#define BLIS_ENABLE_SUP_MR_EXT 1
#define BLIS_ENABLE_SUP_NR_EXT 0
//#endif

View File

@@ -46,10 +46,27 @@ AMD_CONFIG_FILE := amd_config.mk
AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS += -march=znver1
endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -38,7 +38,7 @@
void bli_cntx_init_zen2( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_zen2_ref( cntx );
@@ -135,5 +135,61 @@ void bli_cntx_init_zen2( cntx_t* cntx )
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 100, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 120, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
8,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1,
-1, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}

View File

@@ -33,36 +33,56 @@
#
#
# FLAGS that are specific to 'zen2' architecture are added here.
# FLAGS that are common for all the AMD architectures are present in config/zen/amd_config.mk
#
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
THIS_CONFIG := zen2
#CONFIGS_INCL += $(THIS_CONFIG)
# Include file containing common flags for all AMD architectures
AMD_CONFIG_FILE := amd_config.mk
AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3 -fomit-frame-pointer
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
# gcc 9.0 (clang ?) or later:
GCC_VERSION := $(strip $(shell gcc -dumpversion))
ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
CKVECFLAGS += -march=znver2
#CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver2
# gcc 6.0 (clang 4.0) or later:
else
CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
endif
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 -mno-avx256-split-unaligned-store
# gcc 4.9 (clang 3.5) or later:
# possibly add zen-specific instructions: -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt
#CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif
endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
CRVECFLAGS := $(CKVECFLAGS)
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

328
configure vendored
View File

@@ -51,8 +51,6 @@ print_usage()
#echo " "
#echo " BLIS ${version}"
echo " "
echo " Field G. Van Zee"
echo " "
echo " Configure BLIS's build system for compilation using a specified"
echo " configuration directory."
echo " "
@@ -72,30 +70,37 @@ print_usage()
echo " "
echo " -p PREFIX, --prefix=PREFIX"
echo " "
echo " The path to which make will install all build products."
echo " If given, this option implies the following options:"
echo " --libdir=PREFIX/lib"
echo " --incdir=PREFIX/include"
echo " The common installation prefix for all files. If given,"
echo " this option effectively implies:"
echo " --libdir=EXECPREFIX/lib"
echo " --includedir=PREFIX/include"
echo " --sharedir=PREFIX/share"
echo " If not given, PREFIX defaults to \$(HOME)/blis. If PREFIX"
echo " where EXECPREFIX defaults to PREFIX. If this option is"
echo " not given, PREFIX defaults to '${prefix_def}'. If PREFIX"
echo " refers to a directory that does not exist, it will be"
echo " created."
echo " "
echo " --exec-prefix=EXECPREFIX"
echo " "
echo " The installation prefix for libraries. Specifically, if"
echo " given, this option effectively implies:"
echo " --libdir=EXECPREFIX/lib"
echo " If not given, EXECPREFIX defaults to PREFIX, which may be"
echo " modified by the --prefix option. If EXECPREFIX refers to"
echo " a directory that does not exist, it will be created."
echo " "
echo " --libdir=LIBDIR"
echo " "
echo " The path to which make will install libraries. If given,"
echo " LIBDIR will override the corresponding directory implied"
echo "                         by --prefix; if not given, LIBDIR defaults to"
echo " PREFIX/lib. If LIBDIR refers to a directory that does"
echo " not exist, it will be created."
echo " The path to which make will install libraries. If not"
echo " given, LIBDIR defaults to PREFIX/lib. If LIBDIR refers to"
echo " a directory that does not exist, it will be created."
echo " "
echo " --includedir=INCDIR"
echo " "
echo " The path to which make will install development header"
echo " files. If given, INCDIR will override the corresponding"
echo " directory implied by --prefix; if not given, INCDIR"
echo " defaults to PREFIX/include. If INCDIR refers to a"
echo " directory that does not exist, it will be created."
echo " files. If not given, INCDIR defaults to PREFIX/include."
echo " If INCDIR refers to a directory that does not exist, it"
echo " will be created."
echo " "
echo " --sharedir=SHAREDIR"
echo " "
@@ -104,18 +109,9 @@ print_usage()
echo " and LDFLAGS). These files allow certain BLIS makefiles,"
echo " such as those in the examples or testsuite directories, to"
echo " operate on an installed copy of BLIS rather than a local"
echo " (and possibly uninstalled) copy. If given, SHAREDIR will"
echo " override the corresponding directory implied by --prefix;"
echo " if not given, SHAREDIR defaults to PREFIX/share. If"
echo " SHAREDIR refers to a directory that does not exist, it"
echo " will be created."
echo " "
echo " -d DEBUG, --enable-debug[=DEBUG]"
echo " "
echo " Enable debugging symbols in the library. If argument"
echo " DEBUG is given as 'opt', then optimization flags are"
echo " kept in the framework, otherwise optimization is"
echo " turned off."
echo " (and possibly uninstalled) copy. If not given, SHAREDIR"
echo " defaults to PREFIX/share. If SHAREDIR refers to a"
echo " directory that does not exist, it will be created."
echo " "
echo " --enable-verbose-make, --disable-verbose-make"
echo " "
@@ -129,6 +125,13 @@ print_usage()
echo " even if the command plus command line arguments exceeds"
echo " the operating system limit (ARG_MAX)."
echo " "
echo " -d DEBUG, --enable-debug[=DEBUG]"
echo " "
echo " Enable debugging symbols in the library. If argument"
echo " DEBUG is given as 'opt', then optimization flags are"
echo " kept in the framework, otherwise optimization is"
echo " turned off."
echo " "
echo " --disable-static, --enable-static"
echo " "
echo " Disable (enabled by default) building BLIS as a static"
@@ -141,6 +144,23 @@ print_usage()
echo " library. If the shared library build is disabled, the"
echo " static library build must remain enabled."
echo " "
echo " -e SYMBOLS, --export-shared[=SYMBOLS]"
echo " "
echo " Specify the subset of library symbols that are exported"
echo " within a shared library. Valid values for SYMBOLS are:"
echo " 'public' (the default) and 'all'. By default, only"
echo " functions and variables that belong to public APIs are"
echo " exported in shared libraries. However, the user may"
echo " instead export all symbols in BLIS, even those that were"
echo " intended for internal use only. Note that the public APIs"
echo " encompass all functions that almost any user would ever"
echo " want to call, including the BLAS/CBLAS compatibility APIs"
echo " as well as the basic and expert interfaces to the typed"
echo " and object APIs that are unique to BLIS. Also note that"
echo " changing this option to 'all' will have no effect in some"
echo " environments, such as when compiling with clang on"
echo " Windows."
echo " "
echo " -t MODEL, --enable-threading[=MODEL], --disable-threading"
echo " "
echo " Enable threading in the library, using threading model"
@@ -222,6 +242,16 @@ print_usage()
echo " only be enabled when mixed domain/precision support is"
echo " enabled."
echo " "
echo " --disable-sup-handling, --enable-sup-handling"
echo " "
echo " Disable (enabled by default) handling of small/skinny"
echo " matrix problems via separate code branches. When disabled,"
echo " these small/skinny level-3 operations will be performed by"
echo " the conventional implementation, which is optimized for"
echo " medium and large problems. Note that what qualifies as"
echo " \"small\" depends on thresholds that may vary by sub-"
echo " configuration."
echo " "
echo " -s NAME --enable-sandbox=NAME"
echo " "
echo " Enable a separate sandbox implementation of gemm. This"
@@ -278,6 +308,7 @@ print_usage()
echo " Environment Variables:"
echo " "
echo " CC Specifies the C compiler to use."
echo " CXX Specifies the C++ compiler to use (sandbox only)."
echo " RANLIB Specifies the ranlib executable to use."
echo " AR Specifies the archiver to use."
echo " CFLAGS Specifies additional compiler flags to use (prepended)."
@@ -1016,7 +1047,7 @@ auto_detect()
# Set the linker flags. We need pthreads because it is needed for
# parts of bli_arch.c unrelated to bli_arch_string(), which is called
# by the main() function in ${main_c}.
if [ $is_win = no ]; then
if [[ $is_win == no || "$cc_vendor" != "clang" ]]; then
ldflags="${LIBPTHREAD--lpthread}"
fi
@@ -1294,8 +1325,7 @@ get_compiler_version()
# to OS X's egrep only returning the first match.
cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM' | { read first rest ; echo $first ; })
if [ "${cc_vendor}" = "icc" -o \
"${cc_vendor}" = "gcc" -o \
"${cc_vendor}" = "clang" ]; then
"${cc_vendor}" = "gcc" ]; then
cc_version=$(${cc} -dumpversion)
else
cc_version=$(echo "${vendor_string}" | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; })
@@ -1343,7 +1373,7 @@ check_compiler()
# Specific:
#
# skx: icc 15.0.1+, gcc 6.0+, clang 3.9+
# knl: icc 14.0.1+, gcc 5.0+, clang 3.5+
# knl: icc 14.0.1+, gcc 5.0+, clang 3.9+
# haswell: any
# sandybridge: any
# penryn: any
@@ -1418,27 +1448,42 @@ check_compiler()
# clang
if [ "x${cc_vendor}" = "xclang" ]; then
if [ ${cc_major} -lt 3 ]; then
echoerr_unsupportedcc
fi
if [ ${cc_major} -eq 3 ]; then
if [ ${cc_minor} -lt 3 ]; then
if [ "$(echo ${vendor_string} | grep -o Apple)" = "Apple" ]; then
if [ ${cc_major} -lt 5 ]; then
echoerr_unsupportedcc
fi
if [ ${cc_minor} -lt 5 ]; then
# See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
if [ ${cc_major} -eq 5 ]; then
# Apple clang 5.0 is clang 3.4svn
blacklistcc_add "excavator"
blacklistcc_add "zen"
blacklistcc_add "knl"
fi
if [ ${cc_minor} -lt 9 ]; then
if [ ${cc_major} -lt 7 ]; then
blacklistcc_add "knl"
blacklistcc_add "skx"
fi
fi
if [ ${cc_major} -lt 4 ]; then
# See comment above regarding zen support.
#blacklistcc_add "zen"
: # explicit no-op since bash can't handle empty loop bodies.
else
if [ ${cc_major} -lt 3 ]; then
echoerr_unsupportedcc
fi
if [ ${cc_major} -eq 3 ]; then
if [ ${cc_minor} -lt 3 ]; then
echoerr_unsupportedcc
fi
if [ ${cc_minor} -lt 5 ]; then
blacklistcc_add "excavator"
blacklistcc_add "zen"
fi
if [ ${cc_minor} -lt 9 ]; then
blacklistcc_add "knl"
blacklistcc_add "skx"
fi
fi
if [ ${cc_major} -lt 4 ]; then
# See comment above regarding zen support.
#blacklistcc_add "zen"
: # explicit no-op since bash can't handle empty loop bodies.
fi
fi
fi
}
@@ -1496,8 +1541,8 @@ check_assembler()
#
# The assembler on OS X won't recognize AVX-512 without help.
if [ "$(uname -s)" == "Darwin" ]; then
cflags="-Wa,-march=knl"
if [ "${cc_vendor}" == "clang" ]; then
cflags="-march=knl"
fi
asm_fp=$(find ${asm_dir} -name "avx512f.s")
@@ -1513,8 +1558,8 @@ check_assembler()
#
# The assembler on OS X won't recognize AVX-512 without help.
if [ "$(uname -s)" == "Darwin" ]; then
cflags="-Wa,-march=skylake-avx512"
if [ "${cc_vendor}" == "clang" ]; then
cflags="-march=skylake-avx512"
fi
asm_fp=$(find ${asm_dir} -name "avx512dq.s")
@@ -1731,21 +1776,33 @@ main()
# -- configure options --
# The user-given install prefix and a flag indicating it was given.
#install_prefix_def="${HOME}/blis"
install_prefix_user=${HOME}/blis # default to this directory.
# Define the default prefix so that the print_usage() function can
# output it in the --help text.
prefix_def='/usr/local'
# The installation prefix, assigned its default value, and a flag to
# track whether or not it was given by the user.
prefix=${prefix_def}
prefix_flag=''
# The user-given install libdir and a flag indicating it was given.
install_libdir_user=''
# The installation exec_prefix, assigned its default value, and a flag to
# track whether or not it was given by the user.
exec_prefix='${prefix}'
exec_prefix_flag=''
# The installation libdir, assigned its default value, and a flag to
# track whether or not it was given by the user.
libdir='${exec_prefix}/lib'
libdir_flag=''
# The user-given install includedir and a flag indicating it was given.
install_incdir_user=''
incdir_flag=''
# The installation includedir, assigned its default value, and a flag to
# track whether or not it was given by the user.
includedir='${prefix}/include'
includedir_flag=''
# The user-given install sharedir and a flag indicating it was given.
install_sharedir_user=''
# The installation sharedir, assigned its default value, and a flag to
# track whether or not it was given by the user.
sharedir='${prefix}/share'
sharedir_flag=''
# The preset value of CFLAGS and LDFLAGS (ie: compiler and linker flags
@@ -1758,7 +1815,7 @@ main()
debug_flag=''
# The threading flag.
threading_model='no'
threading_model='off'
# The method of assigning micropanels to threads in the JR and JR loops.
thread_part_jrir='slab'
@@ -1772,6 +1829,7 @@ main()
enable_arg_max_hack='no'
enable_static='yes'
enable_shared='yes'
export_shared='public'
enable_pba_pools='yes'
enable_sba_pools='yes'
enable_mem_tracing='no'
@@ -1781,6 +1839,7 @@ main()
enable_cblas='no'
enable_mixed_dt='yes'
enable_mixed_dt_extra_mem='yes'
enable_sup_handling='yes'
enable_memkind='' # The default memkind value is determined later on.
force_version='no'
@@ -1821,7 +1880,7 @@ main()
# Process our command line options.
unset OPTIND
while getopts ":hp:d:s:t:r:qci:b:-:" opt; do
while getopts ":hp:d:e:s:t:r:qci:b:-:" opt; do
case $opt in
-)
case "$OPTARG" in
@@ -1833,19 +1892,23 @@ main()
;;
prefix=*)
prefix_flag=1
install_prefix_user=${OPTARG#*=}
prefix=${OPTARG#*=}
;;
exec-prefix=*)
exec_prefix_flag=1
exec_prefix=${OPTARG#*=}
;;
libdir=*)
libdir_flag=1
install_libdir_user=${OPTARG#*=}
libdir=${OPTARG#*=}
;;
includedir=*)
incdir_flag=1
install_incdir_user=${OPTARG#*=}
includedir_flag=1
includedir=${OPTARG#*=}
;;
sharedir=*)
sharedir_flag=1
install_sharedir_user=${OPTARG#*=}
sharedir=${OPTARG#*=}
;;
enable-debug)
debug_flag=1
@@ -1882,15 +1945,18 @@ main()
disable-shared)
enable_shared='no'
;;
export-shared=*)
export_shared=${OPTARG#*=}
;;
enable-threading=*)
threading_model=${OPTARG#*=}
;;
disable-threading)
threading_model='off'
;;
thread-part-jrir=*)
thread_part_jrir=${OPTARG#*=}
;;
disable-threading)
threading_model='no'
;;
enable-pba-pools)
enable_pba_pools='yes'
;;
@@ -1946,6 +2012,12 @@ main()
disable-mixed-dt-extra-mem)
enable_mixed_dt_extra_mem='no'
;;
enable-sup-handling)
enable_sup_handling='yes'
;;
disable-sup-handling)
enable_sup_handling='no'
;;
with-memkind)
enable_memkind='yes'
;;
@@ -1967,12 +2039,15 @@ main()
;;
p)
prefix_flag=1
install_prefix_user=$OPTARG
prefix=$OPTARG
;;
d)
debug_flag=1
debug_type=$OPTARG
;;
e)
export_shared=$OPTARG
;;
s)
sandbox_flag=1
sandbox=$OPTARG
@@ -2459,54 +2534,49 @@ main()
# -- Prepare variables for substitution into template files -----------------
# Parse the status of the install prefix and echo feedback.
# Parse the status of the prefix option and echo feedback.
if [ -n "${prefix_flag}" ]; then
echo "${script_name}: detected --prefix='${install_prefix_user}'."
echo "${script_name}: detected --prefix='${prefix}'."
else
echo "${script_name}: no install prefix option given; defaulting to '${install_prefix_user}'."
echo "${script_name}: no install prefix option given; defaulting to '${prefix}'."
fi
# Set initial (candidate) values for the libdir and includedir using the
# install prefix that was determined above.
install_libdir=${install_prefix_user}/lib
install_incdir=${install_prefix_user}/include
install_sharedir=${install_prefix_user}/share
# Parse the status of the exec_prefix option and echo feedback.
if [ -n "${exec_prefix_flag}" ]; then
echo "${script_name}: detected --exec-prefix='${exec_prefix}'."
else
echo "${script_name}: no install exec_prefix option given; defaulting to PREFIX."
fi
# Set the install libdir, if it was specified. Note that this will override
# the default libdir implied by the install prefix, even if both options
# were given.
# Parse the status of the libdir option and echo feedback.
if [ -n "${libdir_flag}" ]; then
echo "${script_name}: detected --libdir='${install_libdir_user}'."
install_libdir=${install_libdir_user}
echo "${script_name}: detected --libdir='${libdir}'."
else
echo "${script_name}: no install libdir option given; defaulting to PREFIX/lib."
echo "${script_name}: no install libdir option given; defaulting to EXECPREFIX/lib."
fi
# Set the install includedir, if it was specified. Note that this will
# override the default includedir implied by the install prefix, even if
# both options were given.
if [ -n "${incdir_flag}" ]; then
echo "${script_name}: detected --includedir='${install_incdir_user}'."
install_incdir=${install_incdir_user}
# Parse the status of the includedir option and echo feedback.
if [ -n "${includedir_flag}" ]; then
echo "${script_name}: detected --includedir='${includedir}'."
else
echo "${script_name}: no install includedir option given; defaulting to PREFIX/include."
fi
# Set the install sharedir, if it was specified. Note that this will
# override the default sharedir implied by the install prefix, even if
# both options were given.
# Parse the status of the sharedir option and echo feedback.
if [ -n "${sharedir_flag}" ]; then
echo "${script_name}: detected --sharedir='${install_sharedir_user}'."
install_sharedir=${install_sharedir_user}
echo "${script_name}: detected --sharedir='${sharedir}'."
else
echo "${script_name}: no install sharedir option given; defaulting to PREFIX/share."
fi
# Echo the installation directories that we settled on.
echo "${script_name}: final installation directories:"
echo "${script_name}: libdir: ${install_libdir}"
echo "${script_name}: includedir: ${install_incdir}"
echo "${script_name}: sharedir: ${install_sharedir}"
echo "${script_name}: prefix: "${prefix}
echo "${script_name}: exec_prefix: "${exec_prefix}
echo "${script_name}: libdir: "${libdir}
echo "${script_name}: includedir: "${includedir}
echo "${script_name}: sharedir: "${sharedir}
echo "${script_name}: NOTE: the variables above can be overridden when running make."
# Check if CFLAGS is non-empty.
if [ -n "${CFLAGS}" ]; then
@@ -2573,6 +2643,23 @@ main()
exit 1
fi
# Check if the "export shared" flag was specified.
if [ "x${export_shared}" = "xall" ]; then
if [ "x${enable_shared}" = "xyes" ]; then
echo "${script_name}: exporting all symbols within shared library."
else
echo "${script_name}: ignoring request to export all symbols within shared library."
fi
elif [ "x${export_shared}" = "xpublic" ]; then
if [ "x${enable_shared}" = "xyes" ]; then
echo "${script_name}: exporting only public symbols within shared library."
fi
else
echo "${script_name}: *** Invalid argument '${export_shared}' to --export-shared option given."
echo "${script_name}: *** Please use 'public' or 'all'."
exit 1
fi
# Check the threading model flag and standardize its value, if needed.
# NOTE: 'omp' is deprecated but still supported; 'openmp' is preferred.
enable_openmp='no'
@@ -2594,9 +2681,11 @@ main()
enable_pthreads='yes'
enable_pthreads_01=1
threading_model="pthreads" # Standardize the value.
elif [ "x${threading_model}" = "xno" ] ||
elif [ "x${threading_model}" = "xoff" ] ||
[ "x${threading_model}" = "xno" ] ||
[ "x${threading_model}" = "xnone" ]; then
echo "${script_name}: threading is disabled."
threading_model="off"
else
echo "${script_name}: *** Unsupported threading model: ${threading_model}."
exit 1
@@ -2707,6 +2796,13 @@ main()
enable_mixed_dt_extra_mem_01=0
enable_mixed_dt_01=0
fi
if [ "x${enable_sup_handling}" = "xyes" ]; then
echo "${script_name}: small matrix handling is enabled."
enable_sup_handling_01=1
else
echo "${script_name}: small matrix handling is disabled."
enable_sup_handling_01=0
fi
# Report integer sizes.
if [ "x${int_type_size}" = "x32" ]; then
@@ -2758,13 +2854,15 @@ main()
# Variables that may contain forward slashes, such as paths, need extra
# escaping when used in sed commands. We insert those extra escape
# characters here so that the sed commands below do the right thing.
os_name_esc=$(echo "${os_name}" | sed 's/\//\\\//g')
install_libdir_esc=$(echo "${install_libdir}" | sed 's/\//\\\//g')
install_incdir_esc=$(echo "${install_incdir}" | sed 's/\//\\\//g')
install_sharedir_esc=$(echo "${install_sharedir}" | sed 's/\//\\\//g')
dist_path_esc=$(echo "${dist_path}" | sed 's/\//\\\//g')
cc_esc=$(echo "${found_cc}" | sed 's/\//\\\//g')
cxx_esc=$(echo "${found_cxx}" | sed 's/\//\\\//g')
os_name_esc=$(echo "${os_name}" | sed 's/\//\\\//g')
prefix_esc=$(echo "${prefix}" | sed 's/\//\\\//g')
exec_prefix_esc=$(echo "${exec_prefix}" | sed 's/\//\\\//g')
libdir_esc=$(echo "${libdir}" | sed 's/\//\\\//g')
includedir_esc=$(echo "${includedir}" | sed 's/\//\\\//g')
sharedir_esc=$(echo "${sharedir}" | sed 's/\//\\\//g')
dist_path_esc=$(echo "${dist_path}" | sed 's/\//\\\//g')
cc_esc=$(echo "${found_cc}" | sed 's/\//\\\//g')
cxx_esc=$(echo "${found_cxx}" | sed 's/\//\\\//g')
#sandbox_relpath_esc=$(echo "${sandbox_relpath}" | sed 's/\//\\\//g')
# For RANLIB, if the variable is not set, we use a default value of
@@ -2779,7 +2877,7 @@ main()
# For Windows builds, clear the libpthread_esc variable so that
# no pthreads library is substituted into config.mk. (Windows builds
# employ an implementation of pthreads that is internal to BLIS.)
if [ $is_win = yes ]; then
if [[ $is_win == yes && "$cc_vendor" == "clang" ]]; then
libpthread_esc=
fi
@@ -2821,13 +2919,13 @@ main()
# -- Determine whether we are performing an out-of-tree build --------------
if [ ${dist_path} != "./" ]; then
if [ "${dist_path}" != "./" ]; then
# At this point, we know the user did not run "./configure". But we
have not yet ruled out "<fullpath>/configure" or some equivalent
# that uses relative paths. To further rule out these possibilities,
# we create a dummy file in the current build directory.
touch ./${dummy_file}
touch "./${dummy_file}"
# If the dummy file we just created in the current directory does not
# appear in the source distribution path, then we are in a different
@@ -2871,14 +2969,17 @@ main()
| sed -e "s/@ldflags_preset@/${ldflags_preset_esc}/g" \
| sed -e "s/@debug_type@/${debug_type}/g" \
| sed -e "s/@threading_model@/${threading_model}/g" \
| sed -e "s/@install_libdir@/${install_libdir_esc}/g" \
| sed -e "s/@install_incdir@/${install_incdir_esc}/g" \
| sed -e "s/@install_sharedir@/${install_sharedir_esc}/g" \
| sed -e "s/@prefix@/${prefix_esc}/g" \
| sed -e "s/@exec_prefix@/${exec_prefix_esc}/g" \
| sed -e "s/@libdir@/${libdir_esc}/g" \
| sed -e "s/@includedir@/${includedir_esc}/g" \
| sed -e "s/@sharedir@/${sharedir_esc}/g" \
| sed -e "s/@enable_verbose@/${enable_verbose}/g" \
| sed -e "s/@configured_oot@/${configured_oot}/g" \
| sed -e "s/@enable_arg_max_hack@/${enable_arg_max_hack}/g" \
| sed -e "s/@enable_static@/${enable_static}/g" \
| sed -e "s/@enable_shared@/${enable_shared}/g" \
| sed -e "s/@export_shared@/${export_shared}/g" \
| sed -e "s/@enable_blas@/${enable_blas}/g" \
| sed -e "s/@enable_cblas@/${enable_cblas}/g" \
| sed -e "s/@enable_memkind@/${enable_memkind}/g" \
@@ -2910,6 +3011,7 @@ main()
| sed -e "s/@enable_cblas@/${enable_cblas_01}/g" \
| sed -e "s/@enable_mixed_dt@/${enable_mixed_dt_01}/g" \
| sed -e "s/@enable_mixed_dt_extra_mem@/${enable_mixed_dt_extra_mem_01}/g" \
| sed -e "s/@enable_sup_handling@/${enable_sup_handling_01}/g" \
| sed -e "s/@enable_memkind@/${enable_memkind_01}/g" \
| sed -e "s/@enable_pragma_omp_simd@/${enable_pragma_omp_simd_01}/g" \
| sed -e "s/@enable_sandbox@/${enable_sandbox_01}/g" \

View File

@@ -9,6 +9,9 @@
* **[Step 3b: Testing (optional)](BuildSystem.md#step-3b-testing-optional)**
* **[Step 4: Installation](BuildSystem.md#step-4-installation)**
* **[Cleaning out build products](BuildSystem.md#cleaning-out-build-products)**
* **[Compiling with BLIS](BuildSystem.md#compiling-with-blis)**
* [Disabling BLAS prototypes](BuildSystem.md#disabling-blas-prototypes)
* [CBLAS](BuildSystem.md#cblas)
* **[Linking against BLIS](BuildSystem.md#linking-against-blis)**
* **[Uninstalling](BuildSystem.md#uninstalling)**
* **[make targets](BuildSystem.md#make-targets)**
@@ -83,11 +86,11 @@ Alternatively, `configure` can automatically select a configuration based on you
```
$ ./configure auto
```
However, as of this writing, only a limited number of architectures are detected. If the `configure` script is not able to detect your architecture, the `generic` configuration will be used.
However, as of this writing, only a limited number of architectures are detected. If the `configure` script is not able to detect your architecture, the `generic` configuration will be used.
Upon running configure, you will get output similar to the following. The exact output will depend on whether you cloned BLIS from a `git` repository or whether you obtained BLIS via a downloadable tarball from the [releases](https://github.com/flame/blis/releases) page.
```
$ ./configure haswell
$ ./configure --prefix=$HOME/blis haswell
configure: using 'gcc' compiler.
configure: found gcc version 5.4.0 (maj: 5, min: 4, rev: 0).
configure: checking for blacklisted configurations due to gcc 5.4.0.
@@ -166,17 +169,11 @@ The installation prefix can be specified via the `--prefix=PREFIX` option:
```
$ ./configure --prefix=/usr <configname>
```
This will cause libraries to eventually be installed (via `make install`) to `PREFIX/lib` and development headers to be installed to `PREFIX/include`. (The default value of `PREFIX` is `$(HOME)/blis`.) You can also specify the library install directory separately from the development header install directory with the `--libdir=LIBDIR` and `--includedir=INCDIR` options, respectively:
This will cause libraries to eventually be installed (via `make install`) to `PREFIX/lib` and development headers to be installed to `PREFIX/include`. (The default value of `PREFIX` is `/usr/local`.) You can also specify the library install directory separately from the development header install directory with the `--libdir=LIBDIR` and `--includedir=INCDIR` options, respectively:
```
$ ./configure --libdir=/usr/lib --includedir=/usr/include <configname>
```
The `--libdir=LIBDIR` and `--includedir=INCDIR` options will override any `PREFIX` path, whether it was specified explicitly via `--prefix` or implicitly (via the default). That is, `LIBDIR` defaults to `PREFIX/lib` and `INCDIR` defaults to `PREFIX/include`, but each will be overridden by their respective `--libdir`/`--includedir` options. So,
```
$ ./configure --libdir=/usr/lib <configname>
```
will configure BLIS to install libraries to `/usr/lib` and header files to the default location (`$HOME/blis/include`).
Also, note that `configure` will create any installation directories that do not already exist.
The `--libdir=LIBDIR` and `--includedir=INCDIR` options will override any path implied by `PREFIX`, whether it was specified explicitly via `--prefix` or implicitly (via the default). That is, `LIBDIR` defaults to `EXECPREFIX/lib` (where `EXECPREFIX`, set via `--exec-prefix=EXECPREFIX`, defaults to `PREFIX`) and `INCDIR` defaults to `PREFIX/include`, but `LIBDIR` and `INCDIR` will each be overridden by their respective `--libdir`/`--includedir` options. There is a third related option, `--sharedir=SHAREDIR`, where `SHAREDIR` defaults to `PREFIX/share`. This option specifies the installation directory for certain makefile fragments that contain variables determined by `configure` (e.g. `CC`, `CFLAGS`, `LDFLAGS`, etc.). These files allow certain BLIS makefiles, such as those in the `examples` or `testsuite` directories, to operate on an installed copy of BLIS rather than a local (and possibly uninstalled) copy.
For a complete list of supported `configure` options and arguments, run `configure` with the `-h` option:
```
@@ -338,6 +335,47 @@ Removing include.
Running the `distclean` target is like saying, "Remove anything ever created by the build system."
## Compiling with BLIS
All BLIS definitions and prototypes may be included in your C source file by including a single header file, `blis.h`:
```c
#include "stdio.h"
#include "stdlib.h"
#include "otherstuff.h"
#include "blis.h"
```
If the BLAS compatibility layer was enabled at configure-time (as it is by default), then `blis.h` will also provide BLAS prototypes to your source code.
### Disabling BLAS prototypes
Some applications already `#include` a header that contains BLAS prototypes. This can cause problems if those applications also try to `#include` the BLIS header file, as shown above. Suppose for a moment that `otherstuff.h` in the example above already provides BLAS prototypes.
```
$ gcc -I/path/to/blis -I/path/to/otherstuff -c main.c -o main.o
In file included from main.c:41:0:
/path/to/blis/blis.h:36900:111: error: conflicting declaration of C function int xerbla_(const bla_character*, const bla_integer*, ftnlen)
TEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);
```
If your application is already declaring (prototyping) BLAS functions, then you may disable those prototypes from being defined within `blis.h`. This prevents `blis.h` from re-declaring those prototypes, or, allows your other header to declare those functions for the first time, depending on the order that you `#include` the headers.
```c
#include "stdio.h"
#include "stdlib.h"
#include "otherstuff.h"
#define BLIS_DISABLE_BLAS_DEFS // disable BLAS prototypes within BLIS.
#include "blis.h"
```
By `#defining` the `BLIS_DISABLE_BLAS_DEFS` macro, we signal to `blis.h` that it should skip over the BLAS prototypes, but otherwise `#include` everything else as it normally would. Note that `BLIS_DISABLE_BLAS_DEFS` must be `#defined` *prior* to the `#include "blis.h"` directive in order for it to have any effect.
### CBLAS
If you build BLIS with CBLAS enabled and you wish to access CBLAS function prototypes from within your application, you will have to `#include` the `cblas.h` header separately from `blis.h`.
```
#include "blis.h"
#include "cblas.h"
```
## Linking against BLIS
Once you have instantiated (configured and compiled, and perhaps installed) a BLIS library, you can link to it in your application's makefile as you would any other library. The following is an abbreviated makefile for a small hypothetical application that has just two external dependencies: BLIS and the standard C math library. We also link against libpthread since that library has been a runtime dependency of BLIS since 70640a3 (December 2017).
@@ -357,7 +395,7 @@ OBJS = main.o util.o other.o
%.o: %.c
$(CC) $(CFLAGS) -c $< -o $@
all: $(OBJS)
all: $(OBJS)
$(LINKER) $(OBJS) $(BLIS_LIB) $(OTHER_LIBS) -o my_program.x
```
The above example assumes you will want to include BLIS definitions and function prototypes into your application via `#include "blis.h"`. (If you are only using BLIS via the BLAS compatibility layer, including `blis.h` is not necessary.) Since BLIS headers are installed into a `blis` subdirectory of `PREFIX/include`, you must make sure that the compiler knows where to find the `blis.h` header file. This is typically accomplished by inserting `#include "blis.h"` into your application's source code files and compiling the code with `-I PREFIX/include/blis`.

View File

@@ -12,8 +12,8 @@ The following table lists architectures for which there exist optimized level-3
A few remarks / reminders:
* Optimizing only the [gemm microkernel](KernelsHowTo.md#gemm-microkernel) will result in optimal performance for all [level-3 operations](BLISTypedAPI#level-3-operations) except `trsm` (which will typically achieve 60 - 80% of attainable peak performance).
* The [trsm](BLISTypedAPI#trsm) operation needs the [gemmtrsm microkernel(s)](KernelsHowTo.md#gemmtrsm-microkernels), in addition to the aforementioned [gemm microkernel](KernelsHowTo.md#gemm-microkernel), in order to reach optimal performance.
* Induced complex (1m) implementations are employed in all situations where the real domain [gemm microkernel](KernelsHowTo.md#gemm-microkernel) of the corresponding precision is available. Please see our [ACM TOMS article on the 1m method](https://github.com/flame/blis#citations) for more info on this topic.
* Some microarchitectures use the same sub-configuration. This is not a typo. For example, Haswell and Broadwell systems as well as "desktop" (non-server) versions of Skylake, Kabylake, and Coffeelake all use the `haswell` sub-configuration and the kernels registered therein.
* Induced complex (1m) implementations are employed in all situations where the real domain [gemm microkernel](KernelsHowTo.md#gemm-microkernel) of the corresponding precision is available, but the "native" complex domain gemm microkernel is unavailable. Note that the table below lists native kernels, so if a microarchitecture lists only `sd`, support for both `c` and `z` datatypes will be provided via the 1m method. (Note: most people cannot tell the difference between native and 1m-based performance.) Please see our [ACM TOMS article on the 1m method](https://github.com/flame/blis#citations) for more info on this topic.
* Some microarchitectures use the same sub-configuration. *This is not a typo.* For example, Haswell and Broadwell systems as well as "desktop" (non-server) versions of Skylake, Kaby Lake, and Coffee Lake all use the `haswell` sub-configuration and the kernels registered therein. Microkernels can be recycled in this manner because the key detail that determines level-3 performance outcomes is actually the vector ISA, not the microarchitecture. In the previous example, all of the microarchitectures listed support AVX2 (but not AVX-512), and therefore they can reuse the same microkernels.
* Remember that you (usually) don't have to choose your sub-configuration manually! Instead, you can always request configure-time hardware detection via `./configure auto`. This will defer to internal logic (based on CPUID for x86_64 systems) that will attempt to choose the appropriate sub-configuration automatically.
| Vendor/Microarchitecture | BLIS sub-configuration | `gemm` | `gemmtrsm` |
@@ -26,7 +26,7 @@ A few remarks / reminders:
| Intel Core2 (SSE3) | `penryn` | `sd` | `d` |
| Intel Sandy/Ivy Bridge (AVX/FMA3) | `sandybridge` | `sdcz` | |
| Intel Haswell, Broadwell (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
| Intel Sky/Kaby/Coffeelake (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
| Intel Sky/Kaby/CoffeeLake (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
| Intel Knights Landing (AVX-512/FMA3) | `knl` | `sd` | |
| Intel SkylakeX (AVX-512/FMA3) | `skx` | `sd` | |
| ARMv7 Cortex-A9 (NEON) | `cortex-a9` | `sd` | |

View File

@@ -23,11 +23,17 @@
# Introduction
Our paper [Anatomy of High-Performance Many-Threaded Matrix Multiplication](https://github.com/flame/blis#citations), presented at IPDPS'14, identified 5 loops around the microkernel as opportunities for parallelization within level-3 operations such as `gemm`. Within BLIS, we have enabled parallelism for 4 of those loops and have extended it to the rest of the level-3 operations except for `trsm`.
Our paper [Anatomy of High-Performance Many-Threaded Matrix Multiplication](https://github.com/flame/blis#citations), presented at IPDPS'14, identified five loops around the microkernel as opportunities for parallelization within level-3 operations such as `gemm`. Within BLIS, we have enabled parallelism for four of those loops, with the fifth planned for future work. This software architecture extends naturally to all level-3 operations except for `trsm`, where its application is necessarily limited to three of the five loops due to inter-iteration dependencies.
**IMPORTANT**: Multithreading in BLIS is disabled by default. Furthermore, even when multithreading is enabled, BLIS will default to single-threaded execution at runtime. In order to both *allow* and *invoke* parallelism from within BLIS operations, you must both *enable* multithreading at configure-time and *specify* multithreading at runtime.
To summarize: In order to observe multithreaded parallelism within a BLIS operation, you must do *both* of the following:
1. Enable multithreading at configure-time. This is discussed in the [next section](docs/Multithreading.md#enabling-multithreading).
2. Specify multithreading at runtime. This is also discussed [later on](docs/Multithreading.md#specifying-multithreading).
# Enabling multithreading
Note that BLIS disables multithreading by default. In order to extract multithreaded parallelism from BLIS, you must first enable multithreading explicitly at configure-time.
BLIS disables multithreading by default. In order to allow multithreaded parallelism from BLIS, you must first enable multithreading explicitly at configure-time.
As of this writing, BLIS optionally supports multithreading via either OpenMP or POSIX threads.
@@ -101,7 +107,7 @@ This pattern--automatic or manual--holds regardless of which of the three method
Regardless of which method is employed, and which specific way within each method, after setting the number of threads, the application may call the desired level-3 operation (via either the [typed API](docs/BLISTypedAPI.md) or the [object API](docs/BLISObjectAPI.md)) and the operation will execute in a multithreaded manner. (When calling BLIS via the BLAS API, only the first two (global) methods are available.)
NOTE: Please be aware of what happens if you try to specify both the automatic and manual ways, as it could otherwise confuse new users. Regardless of which broad method is used, **if multithreading is specified via both the automatic and manual ways, the manual way will always take precedence.** Also, specifying parallelism for even *one* loop counts as specifying the manual way (in which case the ways of parallelism for the remaining loops will be assumed to be 1).
**Note**: Please be aware of what happens if you try to specify both the automatic and manual ways, as it could otherwise confuse new users. Regardless of which broad method is used, **if multithreading is specified via both the automatic and manual ways, the manual way will always take precedence.** Also, specifying parallelism for even *one* loop counts as specifying the manual way (in which case the ways of parallelism for the remaining loops will be assumed to be 1).
## Globally via environment variables
@@ -109,6 +115,8 @@ The most common method of specifying multithreading in BLIS is globally via envi
Regardless of whether you end up using the automatic or manual way of expressing a request for multithreading, note that the environment variables are read (via `getenv()`) by BLIS **only once**, when the library is initialized. Subsequent to library initialization, the global settings for parallelization may only be changed via the [global runtime API](Multithreading.md#globally-at-runtime). If this constraint is not a problem, then environment variables may work fine for you. Otherwise, please consider [local settings](Multithreading.md#locally-at-runtime). (Local settings may used at any time, regardless of whether global settings were explicitly specified, and local settings always override global settings.)
**Note**: Regardless of which way ([automatic](Multithreading.md#environment-variables-the-automatic-way) or [manual](Multithreading.md#environment-variables-the-manual-way)) environment variables are used to specify multithreading, that specification will affect operation of BLIS through **both** the BLAS compatibility layer as well as the native [typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md) APIs that are unique to BLIS.
### Environment variables: the automatic way
The automatic way of specifying parallelism entails simply setting the total number of threads you wish BLIS to employ in its parallelization. This total number of threads is captured by the `BLIS_NUM_THREADS` environment variable. You can set this variable prior to executing your BLIS-linked executable:
@@ -119,7 +127,7 @@ $ ./my_blis_program
```
This causes BLIS to automatically determine a reasonable threading strategy based on what is known about the operation and problem size. If `BLIS_NUM_THREADS` is not set, BLIS will attempt to query the value of `OMP_NUM_THREADS`. If neither variable is set, the default number of threads is 1.
**Note:** We *highly* discourage use of the `OMP_NUM_THREADS` environment variable and may remove support for it in the future. If you wish to set parallelism globally via environment variables, please use `BLIS_NUM_THREADS`.
**Note**: We *highly* discourage use of the `OMP_NUM_THREADS` environment variable and may remove support for it in the future. If you wish to set parallelism globally via environment variables, please use `BLIS_NUM_THREADS`.
### Environment variables: the manual way
@@ -127,7 +135,7 @@ The manual way of specifying parallelism involves communicating which loops with
The below chart describes the five loops used in BLIS's matrix multiplication operations.
| Loop around microkernel | Environment variable | Direction | Notes |
| Loop around microkernel | Environment variable | Direction | Notes |
|:-------------------------|:---------------------|:----------|:------------|
| 5th loop | `BLIS_JC_NT` | `n` | |
| 4th loop | _N/A_ | `k` | Not enabled |
@@ -154,6 +162,8 @@ Next, which combinations of loops to parallelize depends on which caches are sha
If you still wish to set the parallelization scheme globally, but you want to do so at runtime, BLIS provides a thread-safe API for specifying multithreading. Think of these functions as a way to modify the same internal data structure into which the environment variables are read. (Recall that the environment variables are only read once, when BLIS is initialized).
**Note**: Regardless of which way ([automatic](Multithreading.md#globally-at-runtime-the-automatic-way) or [manual](Multithreading.md#globally-at-runtime-the-manual-way)) the global runtime API is used to specify multithreading, that specification will affect operation of BLIS through **both** the BLAS compatibility layer as well as the native [typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md) APIs that are unique to BLIS.
### Globally at runtime: the automatic way
If you simply want to specify an overall number of threads and let BLIS choose a thread factorization automatically, use the following function:
@@ -193,6 +203,8 @@ In addition to the global methods based on environment variables and runtime fun
As with environment variables and the global runtime API, there are two ways to specify parallelism: the automatic way and the manual way. Both ways involve allocating a BLIS-specific object, initializing the object and encoding the desired parallelization, and then passing a pointer to the object into one of the expert interfaces of either the [typed](docs/BLISTypedAPI.md) or [object](docs/BLISObjectAPI) APIs. We provide examples of utilizing this threading object below.
**Note**: Neither way ([automatic](Multithreading.md#locally-at-runtime-the-automatic-way) nor [manual](Multithreading.md#locally-at-runtime-the-manual-way)) of specifying multithreading via the local runtime API can be used via the BLAS interfaces. The local runtime API may *only* be used via the native [typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md) APIs, which are unique to BLIS. (Furthermore, the expert interfaces of each API must be used. This is demonstrated later on in this section.)
### Initializing a rntm_t
Before specifying the parallelism (automatically or manually), you must first allocate a special BLIS object called a `rntm_t` (runtime). The object is quite small (about 64 bytes), and so we recommend allocating it statically on the function stack:

394
docs/Performance.md Normal file
View File

@@ -0,0 +1,394 @@
# Contents
* **[Contents](Performance.md#contents)**
* **[Introduction](Performance.md#introduction)**
* **[General information](Performance.md#general-information)**
* **[Level-3 performance](Performance.md#level-3-performance)**
* **[ThunderX2](Performance.md#thunderx2)**
* **[Experiment details](Performance.md#thunderx2-experiment-details)**
* **[Results](Performance.md#thunderx2-results)**
* **[SkylakeX](Performance.md#skylakex)**
* **[Experiment details](Performance.md#skylakex-experiment-details)**
* **[Results](Performance.md#skylakex-results)**
* **[Haswell](Performance.md#haswell)**
* **[Experiment details](Performance.md#haswell-experiment-details)**
* **[Results](Performance.md#haswell-results)**
* **[Epyc](Performance.md#epyc)**
* **[Experiment details](Performance.md#epyc-experiment-details)**
* **[Results](Performance.md#epyc-results)**
* **[Feedback](Performance.md#feedback)**
# Introduction
This document showcases performance results for a representative sample of
level-3 operations on large matrices with BLIS and BLAS for several hardware
architectures.
# General information
Generally speaking, for level-3 operations on large matrices, we publish three
"panels" for each type of hardware,
each of which reports one of: single-threaded performance, multithreaded
performance on a single socket, or multithreaded performance on two sockets.
Each panel will consist of a 4x5 grid of graphs, with each row representing
a different datatype (single real, double real, single complex, and double
complex) and each column representing a different operation (`gemm`,
`hemm`/`symm`, `herk`/`syrk`, `trmm`, and `trsm`).
Each of the 20 graphs within a panel will contain an x-axis that reports
problem size, with all matrix dimensions equal to the problem size (e.g.
_m_ = _n_ = _k_), resulting in square matrices.
The y-axis will report in units GFLOPS (billions of floating-point operations
per second) in the case of single-threaded performance, or GFLOPS/core in the
case of single- or dual-socket multithreaded performance, where GFLOPS/core
is simply the total GFLOPS observed divided by the number of threads utilized.
This normalization is done intentionally in order to facilitate a visual
assessment of the drop in efficiency of multithreaded performance relative
to their single-threaded baselines.
It's also worth pointing out that the top of each graph (e.g. the maximum
y-axis value depicted) _always_ corresponds to the theoretical peak performance
under the conditions associated with that graph.
Theoretical peak performance, in units of GFLOPS/core, is calculated as the
product of:
1. the maximum sustainable clock rate in GHz; and
2. the maximum number of floating-point operations (flops) that can be
executed per cycle (per core).
Note that the maximum sustainable clock rate may change depending on the
conditions.
For example, on some systems the maximum clock rate is higher when only one
core is active (e.g. single-threaded performance) versus when all cores are
active (e.g. multithreaded performance).
The maximum number of flops executable per cycle (per core) is generally
computed as the product of:
1. the maximum number of fused multiply-add (FMA) vector instructions that
can be issued per cycle (per core);
2. the maximum number of elements that can be stored within a single vector
register (for the datatype in question); and
3. 2.0, since an FMA instruction fuses two operations (a multiply and an add).
The problem size range, represented on the x-axis, is usually sampled with 50
equally-spaced problem sizes.
For example, for single-threaded execution, we might choose to execute with
problem sizes of 48 to 2400 in increments of 48, or 56 to 2800 in increments
of 56.
These values are almost never chosen for any particular (read: sneaky) reason;
rather, we start with a "good" maximum problem size, such as 2400 or 2800, and
then divide it by 50 to obtain the appropriate starting point and increment.
Finally, each point along each curve represents the best of three trials.
# Interpretation
In general, the curves associated with higher-performing implementations
will appear higher in the graphs than lower-performing implementations.
Ideally, an implementation will climb in performance (as a function of problem
size) as quickly as possible and asymptotically approach some high fraction of
peak performance.
Occasionally, we may publish graphs with incomplete curves--for example,
only the first 25 data points in a typical 50-point series--usually because
the implementation being tested was slow enough that it was not practical to
allow it to finish.
Where along the x-axis you focus your attention will depend on the segment of
the problem size range that you care about most. Some people's applications
depend heavily on smaller problems, where "small" can mean anything from 10
to 1000 or even higher. Some people consider 1000 to be quite large, while
others insist that 5000 is merely "medium." What each of us considers to be
small, medium, or large (naturally) depends heavily on the kinds of dense
linear algebra problems we tend to encounter. No one is "right" or "wrong"
about their characterization of matrix smallness or bigness since each person's
relative frame of reference can vary greatly. That said, the
[Science of High-Performance Computing](http://shpc.ices.utexas.edu/) group at
[The University of Texas at Austin](https://www.utexas.edu/) tends to target
matrices that it classifies as "medium-to-large", and so most of the graphs
presented in this document will reflect that targeting in their x-axis range.
When corresponding with us, via email or when opening an
[issue](https://github.com/flame/blis/issues) on github, we kindly ask that
you specify as closely as possible (though a range is fine) your problem
size of interest so that we can better assist you.
# Level-3 performance
## ThunderX2
### ThunderX2 experiment details
* Location: Unknown
* Processor model: Marvell ThunderX2 CN9975
* Core topology: two sockets, 28 cores per socket, 56 cores total
* SMT status: disabled at boot-time
* Max clock rate: 2.2GHz (single-core and multicore)
* Max vector register length: 128 bits (NEON)
* Max FMA vector IPC: 2
* Peak performance:
* single-core: 17.6 GFLOPS (double-precision), 35.2 GFLOPS (single-precision)
* multicore: 17.6 GFLOPS/core (double-precision), 35.2 GFLOPS/core (single-precision)
* Operating system: Ubuntu 16.04 (Linux kernel 4.15.0)
* Compiler: gcc 7.3.0
* Results gathered: 14 February 2019
* Implementations tested:
* BLIS 075143df (0.5.1-39)
* configured with `./configure -t openmp thunderx2` (single- and multithreaded)
* sub-configuration exercised: `thunderx2`
* Single-threaded (1 core) execution requested via no change in environment variables
* Multithreaded (28 core) execution requested via `export BLIS_JC_NT=4 BLIS_IC_NT=7`
* Multithreaded (56 core) execution requested via `export BLIS_JC_NT=8 BLIS_IC_NT=7`
* OpenBLAS 52d3f7a
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=56` (multithreaded, 56 cores)
* Single-threaded (1 core) execution requested via `export OPENBLAS_NUM_THREADS=1`
* Multithreaded (28 core) execution requested via `export OPENBLAS_NUM_THREADS=28`
* Multithreaded (56 core) execution requested via `export OPENBLAS_NUM_THREADS=56`
* ARMPL 18.4
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
* Multithreaded (28 core) execution requested via `export OMP_NUM_THREADS=28`
* Multithreaded (56 core) execution requested via `export OMP_NUM_THREADS=56`
* Affinity:
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0 1 2 3 ... 55"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
* Frequency throttling (via `cpupower`):
* No changes made.
* Comments:
* ARMPL performance is remarkably uneven across datatypes and operations, though it would appear their "base" consists of OpenBLAS, which they then optimize for select, targeted routines. Unfortunately, we were unable to test the absolute latest versions of OpenBLAS and ARMPL on this hardware before we lost access. We will rerun these experiments once we gain access to a similar system.
### ThunderX2 results
#### pdf
* [ThunderX2 single-threaded](graphs/large/l3_perf_tx2_nt1.pdf)
* [ThunderX2 multithreaded (28 cores)](graphs/large/l3_perf_tx2_jc4ic7_nt28.pdf)
* [ThunderX2 multithreaded (56 cores)](graphs/large/l3_perf_tx2_jc8ic7_nt56.pdf)
#### png (inline)
* **ThunderX2 single-threaded**
![single-threaded](graphs/large/l3_perf_tx2_nt1.png)
* **ThunderX2 multithreaded (28 cores)**
![multithreaded (28 cores)](graphs/large/l3_perf_tx2_jc4ic7_nt28.png)
* **ThunderX2 multithreaded (56 cores)**
![multithreaded (56 cores)](graphs/large/l3_perf_tx2_jc8ic7_nt56.png)
---
## SkylakeX
### SkylakeX experiment details
* Location: Oracle cloud
* Processor model: Intel Xeon Platinum 8167M (SkylakeX/AVX-512)
* Core topology: two sockets, 26 cores per socket, 52 cores total
* SMT status: enabled, but not utilized
* Max clock rate: 2.0GHz (single-core and multicore)
* Max vector register length: 512 bits (AVX-512)
* Max FMA vector IPC: 2
* Peak performance:
* single-core: 64 GFLOPS (double-precision), 128 GFLOPS (single-precision)
* multicore: 64 GFLOPS/core (double-precision), 128 GFLOPS/core (single-precision)
* Operating system: Ubuntu 18.04 (Linux kernel 4.15.0)
* Compiler: gcc 7.3.0
* Results gathered: 6 March 2019, 27 March 2019
* Implementations tested:
* BLIS 9f1dbe5 (0.5.1-54)
* configured with `./configure -t openmp auto` (single- and multithreaded)
* sub-configuration exercised: `skx`
* Single-threaded (1 core) execution requested via no change in environment variables
* Multithreaded (26 core) execution requested via `export BLIS_JC_NT=2 BLIS_IC_NT=13`
* Multithreaded (52 core) execution requested via `export BLIS_JC_NT=4 BLIS_IC_NT=13`
* OpenBLAS 0.3.5
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=52` (multithreaded, 52 cores)
* Single-threaded (1 core) execution requested via `export OPENBLAS_NUM_THREADS=1`
* Multithreaded (26 core) execution requested via `export OPENBLAS_NUM_THREADS=26`
* Multithreaded (52 core) execution requested via `export OPENBLAS_NUM_THREADS=52`
* Eigen 3.3.90
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019)
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
* Multithreaded (26 core) execution requested via `export OMP_NUM_THREADS=26`
* Multithreaded (52 core) execution requested via `export OMP_NUM_THREADS=52`
* **NOTE**: This version of Eigen does not provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm`, and therefore those curves are omitted from the multithreaded graphs.
* MKL 2019 update 1
* Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1`
* Multithreaded (26 core) execution requested via `export MKL_NUM_THREADS=26`
* Multithreaded (52 core) execution requested via `export MKL_NUM_THREADS=52`
* Affinity:
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0 1 2 3 ... 51"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
* Frequency throttling (via `cpupower`):
* Driver: acpi-cpufreq
* Governor: performance
* Hardware limits: 1.0GHz - 2.0GHz
* Adjusted minimum: 2.0GHz
* Comments:
* MKL yields superb performance for most operations, though BLIS is not far behind except for `trsm`. (We understand the `trsm` underperformance and hope to address it in the future.) OpenBLAS lags far behind MKL and BLIS due to lack of full support for AVX-512, and possibly other reasons related to software architecture and register/cache blocksizes.
### SkylakeX results
#### pdf
* [SkylakeX single-threaded](graphs/large/l3_perf_skx_nt1.pdf)
* [SkylakeX multithreaded (26 cores)](graphs/large/l3_perf_skx_jc2ic13_nt26.pdf)
* [SkylakeX multithreaded (52 cores)](graphs/large/l3_perf_skx_jc4ic13_nt52.pdf)
#### png (inline)
* **SkylakeX single-threaded**
![single-threaded](graphs/large/l3_perf_skx_nt1.png)
* **SkylakeX multithreaded (26 cores)**
![multithreaded (26 cores)](graphs/large/l3_perf_skx_jc2ic13_nt26.png)
* **SkylakeX multithreaded (52 cores)**
![multithreaded (52 cores)](graphs/large/l3_perf_skx_jc4ic13_nt52.png)
---
## Haswell
### Haswell experiment details
* Location: TACC (Lonestar5)
* Processor model: Intel Xeon E5-2690 v3 (Haswell)
* Core topology: two sockets, 12 cores per socket, 24 cores total
* SMT status: enabled, but not utilized
* Max clock rate: 3.5GHz (single-core), 3.1GHz (multicore)
* Max vector register length: 256 bits (AVX2)
* Max FMA vector IPC: 2
* Peak performance:
* single-core: 56 GFLOPS (double-precision), 112 GFLOPS (single-precision)
* multicore: 49.6 GFLOPS/core (double-precision), 99.2 GFLOPS/core (single-precision)
* Operating system: Cray Linux Environment 6 (Linux kernel 4.4.103)
* Compiler: gcc 6.3.0
* Results gathered: 25-26 February 2019, 27 March 2019
* Implementations tested:
* BLIS 075143df (0.5.1-39)
* configured with `./configure -t openmp auto` (single- and multithreaded)
* sub-configuration exercised: `haswell`
* Single-threaded (1 core) execution requested via no change in environment variables
* Multithreaded (12 core) execution requested via `export BLIS_JC_NT=2 BLIS_IC_NT=3 BLIS_JR_NT=2`
* Multithreaded (24 core) execution requested via `export BLIS_JC_NT=4 BLIS_IC_NT=3 BLIS_JR_NT=2`
* OpenBLAS 0.3.5
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=24` (multithreaded, 24 cores)
* Single-threaded (1 core) execution requested via `export OPENBLAS_NUM_THREADS=1`
* Multithreaded (12 core) execution requested via `export OPENBLAS_NUM_THREADS=12`
* Multithreaded (24 core) execution requested via `export OPENBLAS_NUM_THREADS=24`
* Eigen 3.3.90
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019)
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12`
* Multithreaded (24 core) execution requested via `export OMP_NUM_THREADS=24`
* **NOTE**: This version of Eigen does not provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm`, and therefore those curves are omitted from the multithreaded graphs.
* MKL 2018 update 2
* Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1`
* Multithreaded (12 core) execution requested via `export MKL_NUM_THREADS=12`
* Multithreaded (24 core) execution requested via `export MKL_NUM_THREADS=24`
* Affinity:
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0 1 2 3 ... 23"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
* Frequency throttling (via `cpupower`):
* No changes made.
* Comments:
* We were pleasantly surprised by how competitive BLIS performs relative to MKL on this multicore Haswell system, which is a _very_ common microarchitecture, and _very_ similar to the more recent Broadwells, Skylakes (desktop), Kaby Lakes, and Coffee Lakes that succeeded it.
### Haswell results
#### pdf
* [Haswell single-threaded](graphs/large/l3_perf_has_nt1.pdf)
* [Haswell multithreaded (12 cores)](graphs/large/l3_perf_has_jc2ic3jr2_nt12.pdf)
* [Haswell multithreaded (24 cores)](graphs/large/l3_perf_has_jc4ic3jr2_nt24.pdf)
#### png (inline)
* **Haswell single-threaded**
![single-threaded](graphs/large/l3_perf_has_nt1.png)
* **Haswell multithreaded (12 cores)**
![multithreaded (12 cores)](graphs/large/l3_perf_has_jc2ic3jr2_nt12.png)
* **Haswell multithreaded (24 cores)**
![multithreaded (24 cores)](graphs/large/l3_perf_has_jc4ic3jr2_nt24.png)
---
## Epyc
### Epyc experiment details
* Location: Oracle cloud
* Processor model: AMD Epyc 7551 (Zen1)
* Core topology: two sockets, 4 dies per socket, 2 core complexes (CCX) per die, 4 cores per CCX, 64 cores total
* SMT status: enabled, but not utilized
* Max clock rate: 3.0GHz (single-core), 2.55GHz (multicore)
* Max vector register length: 256 bits (AVX2)
* Max FMA vector IPC: 1
* Alternatively, FMA vector IPC is 2 when vectors are limited to 128 bits each.
* Peak performance:
* single-core: 24 GFLOPS (double-precision), 48 GFLOPS (single-precision)
* multicore: 20.4 GFLOPS/core (double-precision), 40.8 GFLOPS/core (single-precision)
* Operating system: Ubuntu 18.04 (Linux kernel 4.15.0)
* Compiler: gcc 7.3.0
* Results gathered: 6 March 2019, 19 March 2019, 27 March 2019
* Implementations tested:
* BLIS 9f1dbe5 (0.5.1-54)
* configured with `./configure -t openmp auto` (single- and multithreaded)
* sub-configuration exercised: `zen`
* Single-threaded (1 core) execution requested via no change in environment variables
* Multithreaded (32 core) execution requested via `export BLIS_JC_NT=1 BLIS_IC_NT=8 BLIS_JR_NT=4`
* Multithreaded (64 core) execution requested via `export BLIS_JC_NT=2 BLIS_IC_NT=8 BLIS_JR_NT=4`
* OpenBLAS 0.3.5
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=64` (multithreaded, 64 cores)
* Single-threaded (1 core) execution requested via `export OPENBLAS_NUM_THREADS=1`
* Multithreaded (32 core) execution requested via `export OPENBLAS_NUM_THREADS=32`
* Multithreaded (64 core) execution requested via `export OPENBLAS_NUM_THREADS=64`
* Eigen 3.3.90
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019)
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
* Multithreaded (32 core) execution requested via `export OMP_NUM_THREADS=32`
* Multithreaded (64 core) execution requested via `export OMP_NUM_THREADS=64`
* **NOTE**: This version of Eigen does not provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm`, and therefore those curves are omitted from the multithreaded graphs.
* MKL 2019 update 1
* Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1`
* Multithreaded (32 core) execution requested via `export MKL_NUM_THREADS=32`
* Multithreaded (64 core) execution requested via `export MKL_NUM_THREADS=64`
* Affinity:
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0 1 2 3 ... 63"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
* Frequency throttling (via `cpupower`):
* Driver: acpi-cpufreq
* Governor: performance
* Hardware limits: 1.2GHz - 2.0GHz
* Adjusted minimum: 2.0GHz
* Comments:
* MKL performance is dismal, despite being linked in the same manner as on the Xeon Platinum. It's not clear what is causing the slowdown. It could be that MKL's runtime kernel/blocksize selection logic is falling back to some older, more basic implementation because CPUID is not returning Intel as the hardware vendor. Alternatively, it's possible that MKL is trying to use kernels for the closest Intel architectures--say, Haswell/Broadwell--but its implementations use Haswell-specific optimizations that, due to microarchitectural differences, degrade performance on Zen.
### Epyc results
#### pdf
* [Epyc single-threaded](graphs/large/l3_perf_epyc_nt1.pdf)
* [Epyc multithreaded (32 cores)](graphs/large/l3_perf_epyc_jc1ic8jr4_nt32.pdf)
* [Epyc multithreaded (64 cores)](graphs/large/l3_perf_epyc_jc2ic8jr4_nt64.pdf)
#### png (inline)
* **Epyc single-threaded**
![single-threaded](graphs/large/l3_perf_epyc_nt1.png)
* **Epyc multithreaded (32 cores)**
![multithreaded (32 cores)](graphs/large/l3_perf_epyc_jc1ic8jr4_nt32.png)
* **Epyc multithreaded (64 cores)**
![multithreaded (64 cores)](graphs/large/l3_perf_epyc_jc2ic8jr4_nt64.png)
---
# Feedback
Please let us know what you think of these performance results! Similarly, if you have any questions or concerns, or are interested in reproducing these performance experiments on your own hardware, we invite you to [open an issue](https://github.com/flame/blis/issues) and start a conversation with BLIS developers.
Thanks for your interest in BLIS!

224
docs/PerformanceSmall.md Normal file
View File

@@ -0,0 +1,224 @@
# Contents
* **[Contents](PerformanceSmall.md#contents)**
* **[Introduction](PerformanceSmall.md#introduction)**
* **[General information](PerformanceSmall.md#general-information)**
* **[Level-3 performance](PerformanceSmall.md#level-3-performance)**
* **[Kaby Lake](PerformanceSmall.md#kaby-lake)**
* **[Experiment details](PerformanceSmall.md#kaby-lake-experiment-details)**
* **[Results](PerformanceSmall.md#kaby-lake-results)**
* **[Epyc](PerformanceSmall.md#epyc)**
* **[Experiment details](PerformanceSmall.md#epyc-experiment-details)**
* **[Results](PerformanceSmall.md#epyc-results)**
* **[Feedback](PerformanceSmall.md#feedback)**
# Introduction
This document showcases performance results for the level-3 `gemm` operation
on small matrices with BLIS and BLAS for select hardware architectures.
# General information
Generally speaking, for level-3 operations on small matrices, we publish
two "panels" for each type of hardware, one that reflects performance on
row-stored matrices and another for column-stored matrices.
Each panel will consist of a 4x7 grid of graphs, with each row representing
a different transposition case (`nn`, `nt`, `tn`, `tt`)
and each column representing a different shape scenario, usually
with one or two matrix dimensions bound to a fixed size for all problem
sizes tested.
Each of the 28 graphs within a panel will contain an x-axis that reports
problem size, with one, two, or all three matrix dimensions equal to the
problem size (e.g. _m_ = 6; _n_ = _k_, also encoded as `m6npkp`).
The y-axis will report in units GFLOPS (billions of floating-point operations
per second) on a single core.
It's also worth pointing out that the top of each graph (e.g. the maximum
y-axis value depicted) _always_ corresponds to the theoretical peak performance
under the conditions associated with that graph.
Theoretical peak performance, in units of GFLOPS, is calculated as the
product of:
1. the maximum sustainable clock rate in GHz; and
2. the maximum number of floating-point operations (flops) that can be
executed per cycle.
Note that the maximum sustainable clock rate may change depending on the
conditions.
For example, on some systems the maximum clock rate is higher when only one
core is active (e.g. single-threaded performance) versus when all cores are
active (e.g. multithreaded performance).
The maximum number of flops executable per cycle (per core) is generally
computed as the product of:
1. the maximum number of fused multiply-add (FMA) vector instructions that
can be issued per cycle (per core);
2. the maximum number of elements that can be stored within a single vector
register (for the datatype in question); and
3. 2.0, since an FMA instruction fuses two operations (a multiply and an add).
The problem size range, represented on the x-axis, is sampled in
increments of 4 up to 800 for the cases where one or two dimensions are small
(and constant)
and up to 400 in the case where all dimensions (e.g. _m_, _n_, and _k_) are
bound to the problem size (i.e., square matrices).
Note that the constant small matrix dimensions were chosen to be _very_
small--in the neighborhood of 8--intentionally to showcase what happens when
at least one of the matrices is abnormally "skinny." Typically, organizations
and individuals only publish performance with square matrices, which can miss
the problem sizes of interest to many applications. Here, in addition to square
matrices (shown in the seventh column), we also show six other scenarios where
one or two `gemm` dimensions (of _m_, _n_, and _k_) are small.
The legend in each graph contains two entries for BLIS, corresponding to the
two black lines, one solid and one dotted. The dotted line, **"BLIS conv"**,
represents the conventional implementation that targets large matrices. This
was the only implementation available in BLIS prior to the addition of the
small/skinny matrix support. The solid line, **"BLIS sup"**, makes use of the
new small/skinny matrix implementation for certain small problems. Whenever
these results differ by any significant amount (beyond noise), it denotes a
problem size for which BLIS employed the new small/skinny implementation.
Put another way, **the delta between these two lines represents the performance
improvement between BLIS's previous status quo and the new regime.**
Finally, each point along each curve represents the best of three trials.
# Interpretation
In general, the curves associated with higher-performing implementations
will appear higher in the graphs than lower-performing implementations.
Ideally, an implementation will climb in performance (as a function of problem
size) as quickly as possible and asymptotically approach some high fraction of
peak performance.
When corresponding with us, via email or when opening an
[issue](https://github.com/flame/blis/issues) on github, we kindly ask that
you specify as closely as possible (though a range is fine) your problem
size of interest so that we can better assist you.
# Level-3 performance
## Kaby Lake
### Kaby Lake experiment details
* Location: undisclosed
* Processor model: Intel Core i5-7500 (Kaby Lake)
* Core topology: one socket, 4 cores total
* SMT status: unavailable
* Max clock rate: 3.8GHz (single-core)
* Max vector register length: 256 bits (AVX2)
* Max FMA vector IPC: 2
* Peak performance:
* single-core: 57.6 GFLOPS (double-precision), 115.2 GFLOPS (single-precision)
* Operating system: Gentoo Linux (Linux kernel 5.0.7)
* Compiler: gcc 7.3.0
* Results gathered: 31 May 2019, 3 June 2019, 19 June 2019
* Implementations tested:
* BLIS 6bf449c (0.5.2-42)
* configured with `./configure --enable-cblas auto`
* sub-configuration exercised: `haswell`
* OpenBLAS 0.3.6
* configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
* BLASFEO 2c9f312
* configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`.
* Eigen 3.3.90
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (30 May 2019)
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
* Requested threading via `export OMP_NUM_THREADS=1` (single-threaded)
* MKL 2018 update 4
* Requested threading via `export MKL_NUM_THREADS=1` (single-threaded)
* Affinity:
* N/A.
* Frequency throttling (via `cpupower`):
* Driver: intel_pstate
* Governor: performance
* Hardware limits: 800MHz - 3.8GHz
* Adjusted minimum: 3.7GHz
* Comments:
* For both row- and column-stored matrices, BLIS's new small/skinny matrix implementation is competitive with (or exceeds the performance of) the next highest-performing solution (typically MKL), except for a few cases where the _k_ dimension is very small. It is likely the case that this shape scenario begs a different kernel approach, since the BLIS microkernel is inherently designed to iterate over many _k_ dimension iterations (which leads it to incur considerable overhead for small values of _k_).
* For the classic case of `dgemm_nn` on square matrices, BLIS is the fastest implementation for the problem size range of approximately 80 to 180. BLIS is also competitive in this general range for other transpose parameter combinations (`nt`, `tn`, and `tt`).
### Kaby Lake results
#### pdf
* [Kaby Lake row-stored](graphs/sup/dgemm_rrr_kbl_nt1.pdf)
* [Kaby Lake column-stored](graphs/sup/dgemm_ccc_kbl_nt1.pdf)
#### png (inline)
* **Kaby Lake row-stored**
![row-stored](graphs/sup/dgemm_rrr_kbl_nt1.png)
* **Kaby Lake column-stored**
![column-stored](graphs/sup/dgemm_ccc_kbl_nt1.png)
---
## Epyc
### Epyc experiment details
* Location: Oracle cloud
* Processor model: AMD Epyc 7551 (Zen1)
* Core topology: two sockets, 4 dies per socket, 2 core complexes (CCX) per die, 4 cores per CCX, 64 cores total
* SMT status: enabled, but not utilized
* Max clock rate: 3.0GHz (single-core), 2.55GHz (multicore)
* Max vector register length: 256 bits (AVX2)
* Max FMA vector IPC: 1
* Alternatively, FMA vector IPC is 2 when vectors are limited to 128 bits each.
* Peak performance:
* single-core: 24 GFLOPS (double-precision), 48 GFLOPS (single-precision)
* Operating system: Ubuntu 18.04 (Linux kernel 4.15.0)
* Compiler: gcc 7.3.0
* Results gathered: 31 May 2019, 3 June 2019, 19 June 2019
* Implementations tested:
* BLIS 6bf449c (0.5.2-42)
* configured with `./configure --enable-cblas auto`
* sub-configuration exercised: `zen`
* OpenBLAS 0.3.6
* configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
* BLASFEO 2c9f312
* configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`.
* Eigen 3.3.90
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (30 May 2019)
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
* Requested threading via `export OMP_NUM_THREADS=1` (single-threaded)
* MKL 2019 update 4
* Requested threading via `export MKL_NUM_THREADS=1` (single-threaded)
* Affinity:
* N/A.
* Frequency throttling (via `cpupower`):
* Driver: acpi-cpufreq
* Governor: performance
* Hardware limits: 1.2GHz - 2.0GHz
* Adjusted minimum: 2.0GHz
* Comments:
* As with Kaby Lake, BLIS's new small/skinny matrix implementation is competitive with (or exceeds the performance of) the next highest-performing solution, except for a few cases where the _k_ dimension is very small.
* For the classic case of `dgemm_nn` on square matrices, BLIS is the fastest implementation for the problem size range of approximately 12 to 256. BLIS is also competitive in this general range for other transpose parameter combinations (`nt`, `tn`, and `tt`).
### Epyc results
#### pdf
* [Epyc row-stored](graphs/sup/dgemm_rrr_epyc_nt1.pdf)
* [Epyc column-stored](graphs/sup/dgemm_ccc_epyc_nt1.pdf)
#### png (inline)
* **Epyc row-stored**
![row-stored](graphs/sup/dgemm_rrr_epyc_nt1.png)
* **Epyc column-stored**
![column-stored](graphs/sup/dgemm_ccc_epyc_nt1.png)
---
# Feedback
Please let us know what you think of these performance results! Similarly, if you have any questions or concerns, or are interested in reproducing these performance experiments on your own hardware, we invite you to [open an issue](https://github.com/flame/blis/issues) and start a conversation with BLIS developers.
Thanks for your interest in BLIS!

View File

@@ -4,6 +4,8 @@
## Contents
* [Changes in 0.6.0](ReleaseNotes.md#changes-in-060)
* [Changes in 0.5.2](ReleaseNotes.md#changes-in-052)
* [Changes in 0.5.1](ReleaseNotes.md#changes-in-051)
* [Changes in 0.5.0](ReleaseNotes.md#changes-in-050)
* [Changes in 0.4.1](ReleaseNotes.md#changes-in-041)
@@ -33,6 +35,70 @@
* [Changes in 0.0.2](ReleaseNotes.md#changes-in-002)
* [Changes in 0.0.1](ReleaseNotes.md#changes-in-001)
## Changes in 0.6.0
June 3, 2019
Improvements present in 0.6.0:
Framework:
- Implemented small/skinny/unpacked (sup) framework for accelerated level-3 performance when at least one matrix dimension is small (or very small). For now, only `dgemm` is optimized, and this new implementation currently only targets Intel Haswell through Coffee Lake, and AMD Zen-based Ryzen/Epyc. (The existing kernels should extend without significant modification to Zen2-based Ryzen/Epyc once they are available.) Also, multithreaded parallelism is not yet implemented, though application-level threading should be fine. (AMD)
- Changed function pointer usages of `void*` to new, typedef'ed type `void_fp`.
- Allow compile-time disabling of BLAS prototypes in BLIS, in case the application already has access to prototypes.
- In `bli_system.h`, define `_POSIX_C_SOURCE` to `200809L` if the macro is not already defined. This ensures that things such as pthreads are properly defined by an application that has `#include "blis.h"` but omits the definition of `_POSIX_C_SOURCE` from the command-line compiler options. (Christos Psarras)
Kernels:
- None.
Build system:
- Updated the way configure and the top-level Makefile handle installation prefixes (`prefix`, `exec_prefix`, `libdir`, `includedir`, `sharedir`) to better conform with GNU conventions.
- Improved clang version detection. (Isuru Fernando)
- Use pthreads on MinGW and Cygwin. (Isuru Fernando)
Testing:
- Added Eigen support to test drivers in `test/3`.
- Fix inadvertently hidden `xerbla_()` in blastest drivers when building only shared libraries. (Isuru Fernando, M. Zhou)
Documentation:
- Added `docs/PerformanceSmall.md` to showcase new BLIS small/skinny `dgemm` performance on Kaby Lake and Epyc.
- Added Eigen results (3.3.90) to performance graphs showcased in `docs/Performance.md`.
- Added BLIS thread factorization info to `docs/Performance.md`.
## Changes in 0.5.2
March 19, 2019
Improvements present in 0.5.2:
Framework:
- Added support for IC loop parallelism to the `trsm` operation.
- Implemented a pool-based small block allocator and a corresponding `configure` option (enabled by default), which minimizes the number of calls to `malloc()` and `free()` for the purposes of allocating small blocks (on the order of 100 bytes). These small blocks are used by internal data structures, and the repeated allocation and freeing of these structures could, perhaps, cause memory fragmentation issues in certain application circumstances. This was never reproduced or observed, however, and remains entirely theoretical. Still, the sba should be no slower, and perhaps a little faster, than repeatedly calling `malloc()` and `free()` for these internal data structures. Also, the sba was designed to be thread-safe. (AMD)
- Refined and extended the output enabled by `--enable-mem-tracing`, which allows a developer to follow memory allocation and release performed by BLIS.
- Initialize error messages at compile-time rather than at runtime. (Minh Quan Ho)
- Fixed a potential situation whereby the multithreading parameters in a `rntm_t` object that is passed into an expert interface is ignored.
- Prevent a redefinition of `ftnlen` in the `f2c_types.h` in blastest. (Jeff Diamond)
Kernels:
- Adjusted the cache blocksizes in the `zen` sub-configuration for `float`, `scomplex`, and `dcomplex` datatypes. The previous values, taken directly from the `haswell` subconfig, were merely meant to be reasonable placeholders until more suitable values were determined, as had already taken place for the `double` datatype. (AMD)
- Rewrote reference kernels in terms of simplified indexing annotated by the `#pragma omp simd` directive, which a compiler can use to vectorize certain constant-bounded loops. The `#pragma` is disabled via a preprocessor macro layer if the compiler is found by `configure` to not support `-fopenmp-simd`. (Devin Matthews, Jeff Hammond)
Build system:
- Added symbol-export annotation macros to all of the function prototypes and global variable declarations for public symbols, and created a new `configure` option, `--export-shared=[public|all]`, that controls which symbols--only those that are meant to be public, or all symbols--are exported to the shared library. (Isuru Fernando)
- Standardized to using `-O3` in various subconfigs, and also `-funsafe-math-optimizations` for reference kernels. (Dave Love, Jeff Hammond)
- Disabled TBM, XOP, LWP instructions in all AMD subconfigs. (Devin Matthews)
- Fixed issues that prevented using BLIS on GNU Hurd. (M. Zhou)
- Relaxed python3 requirements to allow python 3.4 or later. Previously, python 3.5 or later was required if python3 was being used. (Dave Love)
- Added `thunderx2` sub-configuration. (Devangi Parikh)
- Added `power9` sub-configuration. For now, this subconfig only uses reference kernels. (Nicholai Tukanov)
- Fixed an issue with `configure` failing on OSes--including certain flavors of BSD--that contain a slash '/' character in the output of `uname -s`. (Isuru Fernando, M. Zhou)
Testing:
- Renamed `test/3m4m` directory to `test/3`.
- Lots of updates and improvements to Makefiles, shell scripts, and matlab scripts in `test/3`.
Documentation:
- Added a new `docs/Performance.md` document that showcases single-threaded, single-socket, and dual-socket performance results of `single`, `double`, `scomplex`, and `dcomplex` level-3 operations in BLIS, OpenBLAS, and MKL/ARMPL for Haswell, SkylakeX, ThunderX2, and Epyc hardware architectures. (Note: Other implementations such as Eigen and ATLAS may be added to these graphs in the future.)
- Updated `README.md` to include new language on external packages. (Dave Love)
- Updated `docs/Multithreading.md` to be more explicit about the fact that multithreading is disabled by default at configure-time, and the fact that BLIS will run single-threaded at runtime by default if no multithreading specification is given. (M. Zhou)
## Changes in 0.5.1
December 18, 2018
@@ -88,7 +154,7 @@ Kernels:
Build system:
- Added support for building Windows DLLs via AppVeyor [2], complete with a built-in implementation of pthreads for Windows, as well as an implementation of the `pthread_barrier_*()` APIs for use on OS X. (Isuru Fernando, Devin Matthews, Mathieu Poumeyrol, Matthew Honnibal)
- Defined a `cortexa53` sub-configuration, which is similar to `cortexa57` except that it uses slightly different compiler flags. (Mathieu Poumeyrol)
- Added python version checking to configure script.
- Added python version checking to `configure` script.
- Added a script to automate the regeneration of the symbols list file (now located in `build/libblis-symbols.def`).
- Various tweaks in preparation for BLIS's inclusion within Debian. (M. Zhou)
- Various fixes and cleanups.
@@ -246,16 +312,16 @@ May 2, 2017
- Implemented the 1m method for inducing complex matrix multiplication. (Please see ACM TOMS publication ["Implementing high-performance complex matrix multiplication via the 1m method"](https://github.com/flame/blis#citations) for more details.)
- Switched to simpler `trsm_r` implementation.
- Relaxed constraints that `MC % NR = 0` and `NC % MR = 0`, as this was only needed for the more sophisticated `trsm_r` implementation.
- Automatic loop thread assignment. (Devin Matthews)
- Updates to `.travis.yml` configuration file. (Devin Matthews)
- Automatic loop thread assignment. (Devin Matthews)
- Updates to `.travis.yml` configuration file. (Devin Matthews)
- Updates to non-default haswell microkernels.
- Match storage format of the temporary micro-tiles in macrokernels to that of the microkernel storage preference for edge cases.
- Added support for Intel's Knight's Landing. (Devin Matthews)
- Added more flexible options to specify multithreading via the configure script. (Devin Matthews)
- OS X compatibility fixes. (Devin Matthews)
- Other small changes and fixes.
- Added support for Intel's Knight's Landing. (Devin Matthews)
- Added more flexible options to specify multithreading via the configure script. (Devin Matthews)
- OS X compatibility fixes. (Devin Matthews)
- Other small changes and fixes.
Also, thanks to Elmar Peise, Krzysztof Drewniak, and Francisco Igual for their contributions in reporting/fixing certain bugs that were addressed in this version.
Also, thanks to Elmar Peise, Krzysztof Drewniak, and Francisco Igual for their contributions in reporting/fixing certain bugs that were addressed in this version.
## Changes in 0.2.1
October 5, 2016
@@ -439,7 +505,7 @@ While neither `bli_config.h` nor `bli_kernel.h` has changed formats since 0.0.7,
## Changes in 0.0.7
April 30, 2013
This version incorporates many small fixes and feature enhancements made during our SC13 collaboration.
This version incorporates many small fixes and feature enhancements made during our SC13 collaboration.
## Changes in 0.0.6
April 13, 2013
@@ -478,7 +544,7 @@ The compatibility layer is enabled via a configuration option in `bl2_config.h`.
## Changes in 0.0.2
February 11, 2013
Most notably, this version contains the new test suite I've been working on for the last month.
Most notably, this version contains the new test suite I've been working on for the last month.
What is the test suite? It is a highly configurable test driver that allows one to test an arbitrary set of BLIS operations, with an arbitrary set of parameter combinations, and matrix/vector storage formats, as well as whichever datatypes you are interested in. (For now, only homogeneous datatyping is supported, which is what most people want.) You can also specify an arbitrary problem size range with arbitrary increments, and arbitrary ratios between dimensions (or anchor a dimension to a single value), and you can output directly to files which store the output in matlab syntax, which makes it easy to generate performance graphs.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 108 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 115 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 78 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 96 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 96 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 81 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 104 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 101 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 88 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 92 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 100 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 70 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 169 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 195 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 171 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 203 KiB

View File

@@ -114,7 +114,7 @@ CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME))
CFLAGS += -I$(TEST_SRC_PATH)
# Locate the libblis library to which we will link.
LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
# Binary executable name.
TEST_BINS := 00obj_basic.x \

View File

@@ -102,7 +102,7 @@ CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME))
CFLAGS += -I$(TEST_SRC_PATH)
# Locate the libblis library to which we will link.
LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
# Binary executable name.
TEST_BINS := 00level1v.x \

View File

@@ -64,7 +64,7 @@ void PASTEMAC0(opname) \
bli_obj_scalar_set_dt_buffer( chi, dt_absq_c, &dt_chi, &buf_chi ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
\
f \
@@ -100,7 +100,7 @@ void PASTEMAC0(opname) \
PASTEMAC(opname,_check)( chi, psi ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \
\
f \
@@ -137,7 +137,7 @@ void PASTEMAC0(opname) \
PASTEMAC(opname,_check)( chi ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \
\
f \
@@ -170,7 +170,7 @@ void PASTEMAC0(opname) \
PASTEMAC(opname,_check)( chi, psi ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \
\
f \
@@ -213,7 +213,7 @@ void PASTEMAC0(opname) \
else dt_use = dt_chi; \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_use ); \
\
f \
@@ -247,7 +247,7 @@ void PASTEMAC0(opname) \
PASTEMAC(opname,_check)( zeta_r, zeta_i, chi ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
\
f \
@@ -290,7 +290,7 @@ void PASTEMAC0(opname) \
bli_obj_scalar_set_dt_buffer( chi, dt_zeta_c, &dt_chi, &buf_chi ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
\
f \
@@ -327,7 +327,7 @@ void PASTEMAC0(opname) \
PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
\
f \

View File

@@ -40,7 +40,7 @@
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
obj_t* absq \
@@ -53,7 +53,7 @@ GENPROT( normfsc )
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
obj_t* psi \
@@ -69,7 +69,7 @@ GENPROT( subsc )
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi \
);
@@ -80,7 +80,7 @@ GENPROT( invertsc )
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
double* zeta_r, \
@@ -93,7 +93,7 @@ GENPROT( getsc )
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
double zeta_r, \
double zeta_i, \
@@ -106,7 +106,7 @@ GENPROT( setsc )
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
obj_t* zeta_r, \
@@ -119,7 +119,7 @@ GENPROT( unzipsc )
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* zeta_r, \
obj_t* zeta_i, \

View File

@@ -40,7 +40,7 @@
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
conj_t conjchi, \
ctype* chi, \
@@ -56,7 +56,7 @@ INSERT_GENTPROT_BASIC0( subsc )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
conj_t conjchi, \
ctype* chi \
@@ -68,7 +68,7 @@ INSERT_GENTPROT_BASIC0( invertsc )
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype* chi, \
ctype_r* absq \
@@ -81,7 +81,7 @@ INSERT_GENTPROTR_BASIC0( normfsc )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype* chi, \
ctype* psi \
@@ -93,7 +93,7 @@ INSERT_GENTPROT_BASIC0( sqrtsc )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype* chi, \
double* zeta_r, \
@@ -106,7 +106,7 @@ INSERT_GENTPROT_BASIC0( getsc )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
double zeta_r, \
double zeta_i, \
@@ -119,7 +119,7 @@ INSERT_GENTPROT_BASIC0( setsc )
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype* chi, \
ctype_r* zeta_r, \
@@ -132,7 +132,7 @@ INSERT_GENTPROTR_BASIC0( unzipsc )
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype_r* zeta_r, \
ctype_r* zeta_i, \
@@ -143,14 +143,14 @@ INSERT_GENTPROTR_BASIC0( zipsc )
// -----------------------------------------------------------------------------
void bli_igetsc
BLIS_EXPORT_BLIS void bli_igetsc
(
dim_t* chi,
double* zeta_r,
double* zeta_i
);
void bli_isetsc
BLIS_EXPORT_BLIS void bli_isetsc
(
double zeta_r,
double zeta_i,

View File

@@ -40,7 +40,7 @@
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
obj_t* psi \
@@ -55,7 +55,7 @@ GENFRONT( copysc )
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
void PASTEMAC2(chx,chy,varname) \
BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
( \
conj_t conjchi, \
void* chi, \

Some files were not shown because too many files have changed in this diff Show More