Merge branch 'amd-staging-rome2.1' of ssh://git.amd.com:29418/cpulibraries/er/blis into amd-blis-cpp
Change-Id: I97a10ab7546d475474b0ff733bafb8248843c352
@@ -39,7 +39,7 @@ build_script:
|
||||
- bash -lc "cd /c/projects/blis && ./configure %CONFIGURE_OPTS% --enable-threading=%THREADING% --enable-arg-max-hack --prefix=/c/blis %CONFIG%"
|
||||
- bash -lc "cd /c/projects/blis && mingw32-make -j4 V=1"
|
||||
- bash -lc "cd /c/projects/blis && mingw32-make install"
|
||||
- ps: Compress-Archive -Path C:\blis -DestinationPath C:\blis.zip
|
||||
- 7z a C:\blis.zip C:\blis
|
||||
- ps: Push-AppveyorArtifact C:\blis.zip
|
||||
|
||||
test_script:
|
||||
|
||||
11
CREDITS
@@ -9,18 +9,22 @@ The BLIS framework was primarily authored by
|
||||
|
||||
but many others have contributed code and feedback, including
|
||||
|
||||
Sameer Agarwal @sandwichmaker (Google)
|
||||
Murtaza Ali (Texas Instruments)
|
||||
Sajid Ali @s-sajid-ali (Northwestern University)
|
||||
Erling Andersen @erling-d-andersen
|
||||
Alex Arslan @ararslan
|
||||
Vernon Austel (IBM, T.J. Watson Research Center)
|
||||
Matthew Brett @matthew-brett (University of Birmingham)
|
||||
Jed Brown @jedbrown (Argonne National Laboratory)
|
||||
Robin Christ @robinchrist
|
||||
Kay Dewhurst @jkd2016 (Max Planck Institute, Halle, Germany)
|
||||
Jeff Diamond (Oracle)
|
||||
Johannes Dieterich @iotamudelta
|
||||
Krzysztof Drewniak @krzysz00
|
||||
Marat Dukhan @Maratyszcza (Google)
|
||||
Victor Eijkhout @VictorEijkhout (Texas Advanced Computing Center)
|
||||
Evgeny Epifanovsky @epifanovsky (Q-Chem)
|
||||
Isuru Fernando @isuruf
|
||||
Roman Gareev @gareevroman
|
||||
Richard Goldschmidt @SuperFluffy
|
||||
@@ -30,7 +34,7 @@ but many others have contributed code and feedback, including
|
||||
Jeff Hammond @jeffhammond (Intel)
|
||||
Jacob Gorm Hansen @jacobgorm
|
||||
Jean-Michel Hautbois @jhautbois
|
||||
Ian Henriksen @insertinterestingnamehere
|
||||
Ian Henriksen @insertinterestingnamehere (The University of Texas at Austin)
|
||||
Minh Quan Ho @hominhquan
|
||||
Matthew Honnibal @honnibal
|
||||
Stefan Husmann @stefanhusmann
|
||||
@@ -53,6 +57,7 @@ but many others have contributed code and feedback, including
|
||||
Ilya Polkovnichenko
|
||||
Jack Poulson @poulson (Stanford)
|
||||
Mathieu Poumeyrol @kali
|
||||
Christos Psarras @ChrisPsa (RWTH-Aachen)
|
||||
@qnerd
|
||||
Michael Rader @mrader1248
|
||||
Pradeep Rao @pradeeptrgit (AMD)
|
||||
@@ -63,11 +68,13 @@ but many others have contributed code and feedback, including
|
||||
Rene Sitt
|
||||
Tony Skjellum @tonyskjellum (The University of Tennessee at Chattanooga)
|
||||
Mikhail Smelyanskiy (Intel, Parallel Computing Lab)
|
||||
Nathaniel Smith @njsmith
|
||||
Shaden Smith @ShadenSmith
|
||||
Tyler Smith @tlrmchlsmth (The University of Texas at Austin)
|
||||
Paul Springer @springer13 (RWTH-Aachen)
|
||||
Vladimir Sukarev
|
||||
Santanu Thangaraj (AMD)
|
||||
Nicholai Tukanov @nicholaiTukanov (The University of Texas at Austin)
|
||||
Rhys Ulerich @RhysU (The University of Texas at Austin)
|
||||
Robert van de Geijn @rvdg (The University of Texas at Austin)
|
||||
Kiran Varaganti @kvaragan (AMD)
|
||||
@@ -83,8 +90,10 @@ partners, including
|
||||
|
||||
AMD
|
||||
Hewlett Packard Enterprise
|
||||
Huawei
|
||||
Intel
|
||||
Microsoft
|
||||
Oracle
|
||||
Texas Instruments
|
||||
|
||||
as well as the National Science Foundation (NSF Awards CCF-0917167,
|
||||
|
||||
110
Makefile
@@ -386,23 +386,22 @@ ifeq ($(IS_CONFIGURED),yes)
|
||||
# named with three .so version numbers.
|
||||
UNINSTALL_OLD_LIBS :=
|
||||
|
||||
UNINSTALL_OLD_LIBS += $(shell $(FIND) $(INSTALL_LIBDIR)/ -name "$(LIBBLIS_SO).?.?.?" 2> /dev/null | $(GREP) -v "$(LIBBLIS).$(LIBBLIS_SO_MMB_EXT)")
|
||||
UNINSTALL_OLD_LIBS += $(filter-out $(INSTALL_LIBDIR)/$(LIBBLIS).$(LIBBLIS_SO_MMB_EXT),$(wildcard $(INSTALL_LIBDIR)/$(LIBBLIS_SO).?.?.?))
|
||||
|
||||
# These shell commands gather the filepaths to any library symlink in the
|
||||
# current LIBDIR that might be left over from an old installation. We start
|
||||
# with symlinks named using the .so major version number.
|
||||
UNINSTALL_OLD_SYML := $(shell $(FIND) $(INSTALL_LIBDIR)/ -name "$(LIBBLIS_SO).?" 2> /dev/null | $(GREP) -v "$(LIBBLIS_SO).$(SO_MAJOR)")
|
||||
UNINSTALL_OLD_SYML := $(filter-out $(INSTALL_LIBDIR)/$(LIBBLIS_SO).$(SO_MAJOR),$(wildcard $(INSTALL_LIBDIR)/$(LIBBLIS_SO).?))
|
||||
|
||||
# We also prepare to uninstall older-style symlinks whose names contain the
|
||||
# BLIS version number and configuration family.
|
||||
UNINSTALL_OLD_SYML += $(shell $(FIND) $(INSTALL_LIBDIR)/ -name "$(LIBBLIS)-*.a" 2> /dev/null | $(GREP) -v "$(LIBBLIS)-$(VERS_CONF).a")
|
||||
|
||||
UNINSTALL_OLD_SYML += $(shell $(FIND) $(INSTALL_LIBDIR)/ -name "$(LIBBLIS)-*.$(SHLIB_EXT)" 2> /dev/null | $(GREP) -v "$(LIBBLIS)-$(VERS_CONF).$(SHLIB_EXT)")
|
||||
UNINSTALL_OLD_SYML += $(wildcard $(INSTALL_LIBDIR)/$(LIBBLIS)-*.a)
|
||||
UNINSTALL_OLD_SYML += $(wildcard $(INSTALL_LIBDIR)/$(LIBBLIS)-*.$(SHLIB_EXT))
|
||||
|
||||
# This shell command grabs all files named "*.h" that are not blis.h or cblas.h
|
||||
# in the installation directory. We consider this set of headers to be "old" and
|
||||
# eligible for removal upon running of the uninstall-old-headers target.
|
||||
UNINSTALL_OLD_HEADERS := $(shell $(FIND) $(INSTALL_INCDIR)/blis/ -name "*.h" 2> /dev/null | $(GREP) -v "$(BLIS_H)" | $(GREP) -v "$(CBLAS_H)")
|
||||
UNINSTALL_OLD_HEADERS := $(filter-out $(BLIS_H),$(filter-out $(CBLAS_H),$(wildcard $(INSTALL_INCDIR)/blis/*.h)))
|
||||
|
||||
endif # IS_CONFIGURED
|
||||
|
||||
@@ -1027,23 +1026,24 @@ endif # ifeq ($(IS_WIN),no)
|
||||
# --- Query current configuration ---
|
||||
|
||||
showconfig: check-env
|
||||
@echo "configuration family: $(CONFIG_NAME)"
|
||||
@echo "sub-configurations: $(CONFIG_LIST)"
|
||||
@echo "requisite kernels: $(KERNEL_LIST)"
|
||||
@echo "kernel-to-config map: $(KCONFIG_MAP)"
|
||||
@echo "-----------------------"
|
||||
@echo "BLIS version string: $(VERSION)"
|
||||
@echo ".so major version: $(SO_MAJOR)"
|
||||
@echo ".so minor.build vers: $(SO_MINORB)"
|
||||
@echo "install libdir: $(INSTALL_LIBDIR)"
|
||||
@echo "install includedir: $(INSTALL_INCDIR)"
|
||||
@echo "debugging status: $(DEBUG_TYPE)"
|
||||
@echo "multithreading status: $(THREADING_MODEL)"
|
||||
@echo "enable BLAS API? $(MK_ENABLE_BLAS)"
|
||||
@echo "enable CBLAS API? $(MK_ENABLE_CBLAS)"
|
||||
@echo "build static library? $(MK_ENABLE_STATIC)"
|
||||
@echo "build shared library? $(MK_ENABLE_SHARED)"
|
||||
@echo "ARG_MAX hack enabled? $(ARG_MAX_HACK)"
|
||||
@echo "configuration family: $(CONFIG_NAME)"
|
||||
@echo "sub-configurations: $(CONFIG_LIST)"
|
||||
@echo "requisite kernels sets: $(KERNEL_LIST)"
|
||||
@echo "kernel-to-config map: $(KCONFIG_MAP)"
|
||||
@echo "-------------------------"
|
||||
@echo "BLIS version string: $(VERSION)"
|
||||
@echo ".so major version: $(SO_MAJOR)"
|
||||
@echo ".so minor.build vers: $(SO_MINORB)"
|
||||
@echo "install libdir: $(INSTALL_LIBDIR)"
|
||||
@echo "install includedir: $(INSTALL_INCDIR)"
|
||||
@echo "install sharedir: $(INSTALL_SHAREDIR)"
|
||||
@echo "debugging status: $(DEBUG_TYPE)"
|
||||
@echo "multithreading status: $(THREADING_MODEL)"
|
||||
@echo "enable BLAS API? $(MK_ENABLE_BLAS)"
|
||||
@echo "enable CBLAS API? $(MK_ENABLE_CBLAS)"
|
||||
@echo "build static library? $(MK_ENABLE_STATIC)"
|
||||
@echo "build shared library? $(MK_ENABLE_SHARED)"
|
||||
@echo "ARG_MAX hack enabled? $(ARG_MAX_HACK)"
|
||||
|
||||
|
||||
# --- Clean rules ---
|
||||
@@ -1059,16 +1059,16 @@ ifneq ($(SANDBOX),)
|
||||
- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
endif
|
||||
else
|
||||
@echo "Removing makefile fragments from $(CONFIG_FRAG_PATH)."
|
||||
@echo "Removing makefile fragments from $(CONFIG_FRAG_PATH)"
|
||||
@- $(FIND) $(CONFIG_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
@echo "Removing makefile fragments from $(FRAME_FRAG_PATH)."
|
||||
@echo "Removing makefile fragments from $(FRAME_FRAG_PATH)"
|
||||
@- $(FIND) $(FRAME_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
@echo "Removing makefile fragments from $(REFKERN_FRAG_PATH)."
|
||||
@echo "Removing makefile fragments from $(REFKERN_FRAG_PATH)"
|
||||
@- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
@echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)."
|
||||
@echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)"
|
||||
@- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
ifneq ($(SANDBOX),)
|
||||
@echo "Removing makefile fragments from $(SANDBOX_FRAG_PATH)."
|
||||
@echo "Removing makefile fragments from $(SANDBOX_FRAG_PATH)"
|
||||
@- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
endif
|
||||
endif
|
||||
@@ -1080,7 +1080,7 @@ ifeq ($(ENABLE_VERBOSE),yes)
|
||||
$(RM_F) $(BLIS_H_FLAT)
|
||||
$(RM_F) $(CBLAS_H_FLAT)
|
||||
else
|
||||
@echo "Removing flattened header files from $(BASE_INC_PATH)."
|
||||
@echo "Removing flattened header files from $(BASE_INC_PATH)"
|
||||
@$(RM_F) $(BLIS_H_FLAT)
|
||||
@$(RM_F) $(CBLAS_H_FLAT)
|
||||
endif
|
||||
@@ -1093,9 +1093,9 @@ ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_F) $(LIBBLIS_A_PATH)
|
||||
- $(RM_F) $(LIBBLIS_SO_PATH)
|
||||
else
|
||||
@echo "Removing object files from $(BASE_OBJ_PATH)."
|
||||
@echo "Removing object files from $(BASE_OBJ_PATH)"
|
||||
@- $(FIND) $(BASE_OBJ_PATH) -name "*.o" | $(XARGS) $(RM_F)
|
||||
@echo "Removing libraries from $(BASE_LIB_PATH)."
|
||||
@echo "Removing libraries from $(BASE_LIB_PATH)"
|
||||
@- $(RM_F) $(LIBBLIS_A_PATH)
|
||||
@- $(RM_F) $(LIBBLIS_SO_PATH)
|
||||
endif
|
||||
@@ -1117,13 +1117,13 @@ ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_F) $(BLASTEST_DRV_BIN_PATHS)
|
||||
- $(RM_F) $(addprefix out.,$(BLASTEST_DRV_BASES))
|
||||
else
|
||||
@echo "Removing object files from $(BASE_OBJ_BLASTEST_PATH)."
|
||||
@echo "Removing object files from $(BASE_OBJ_BLASTEST_PATH)"
|
||||
@- $(RM_F) $(BLASTEST_F2C_OBJS) $(BLASTEST_DRV_OBJS)
|
||||
@echo "Removing libf2c.a from $(BASE_OBJ_BLASTEST_PATH)."
|
||||
@echo "Removing libf2c.a from $(BASE_OBJ_BLASTEST_PATH)"
|
||||
@- $(RM_F) $(BLASTEST_F2C_LIB)
|
||||
@echo "Removing binaries from $(BASE_OBJ_BLASTEST_PATH)."
|
||||
@echo "Removing binaries from $(BASE_OBJ_BLASTEST_PATH)"
|
||||
@- $(RM_F) $(BLASTEST_DRV_BIN_PATHS)
|
||||
@echo "Removing driver output files 'out.*'."
|
||||
@echo "Removing driver output files 'out.*'"
|
||||
@- $(RM_F) $(addprefix out.,$(BLASTEST_DRV_BASES))
|
||||
endif # ENABLE_VERBOSE
|
||||
endif # IS_CONFIGURED
|
||||
@@ -1136,13 +1136,13 @@ ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_F) $(BLASTEST_DIR)/$(BLASTEST_F2C_LIB_NAME)
|
||||
- $(RM_F) $(addprefix $(BLASTEST_DIR)/out.,$(BLASTEST_DRV_BASES))
|
||||
else
|
||||
@echo "Removing object files from ./$(BLASTEST_DIR)/$(OBJ_DIR)."
|
||||
@echo "Removing object files from ./$(BLASTEST_DIR)/$(OBJ_DIR)"
|
||||
@- $(FIND) $(BLASTEST_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
|
||||
@echo "Removing libf2c.a from ./$(BLASTEST_DIR)."
|
||||
@echo "Removing libf2c.a from ./$(BLASTEST_DIR)"
|
||||
@- $(RM_F) $(BLASTEST_DIR)/$(BLASTEST_F2C_LIB_NAME)
|
||||
@echo "Removing binaries from ./$(BLASTEST_DIR)."
|
||||
@echo "Removing binaries from ./$(BLASTEST_DIR)"
|
||||
@- $(FIND) $(BLASTEST_DIR) -name "*.x" | $(XARGS) $(RM_F)
|
||||
@echo "Removing driver output files 'out.*' from ./$(BLASTEST_DIR)."
|
||||
@echo "Removing driver output files 'out.*' from ./$(BLASTEST_DIR)"
|
||||
@- $(RM_F) $(addprefix $(BLASTEST_DIR)/out.,$(BLASTEST_DRV_BASES))
|
||||
endif # ENABLE_VERBOSE
|
||||
endif # IS_CONFIGURED
|
||||
@@ -1160,11 +1160,11 @@ ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_F) $(TESTSUITE_BIN)
|
||||
- $(RM_F) $(TESTSUITE_OUT_FILE)
|
||||
else
|
||||
@echo "Removing object files from $(BASE_OBJ_TESTSUITE_PATH)."
|
||||
@echo "Removing object files from $(BASE_OBJ_TESTSUITE_PATH)"
|
||||
@- $(RM_F) $(MK_TESTSUITE_OBJS)
|
||||
@echo "Removing binary $(TESTSUITE_BIN)."
|
||||
@echo "Removing binary $(TESTSUITE_BIN)"
|
||||
@- $(RM_F) $(TESTSUITE_BIN)
|
||||
@echo "Removing $(TESTSUITE_OUT_FILE)."
|
||||
@echo "Removing $(TESTSUITE_OUT_FILE)"
|
||||
@- $(RM_F) $(TESTSUITE_OUT_FILE)
|
||||
endif # ENABLE_VERBOSE
|
||||
endif # IS_CONFIGURED
|
||||
@@ -1176,9 +1176,9 @@ ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
|
||||
- $(MAKE) -C $(CPP_TEST_DIR) clean
|
||||
else
|
||||
@echo "Removing object files from $(TESTSUITE_DIR)/$(OBJ_DIR)."
|
||||
@echo "Removing object files from $(TESTSUITE_DIR)/$(OBJ_DIR)"
|
||||
@- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
|
||||
@echo "Removing binary $(TESTSUITE_DIR)/$(TESTSUITE_BIN)."
|
||||
@echo "Removing binary $(TESTSUITE_DIR)/$(TESTSUITE_BIN)"
|
||||
@- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
|
||||
@$(MAKE) -C $(CPP_TEST_DIR) clean
|
||||
endif # ENABLE_VERBOSE
|
||||
@@ -1193,15 +1193,15 @@ ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_RF) $(LIB_DIR)
|
||||
- $(RM_RF) $(INCLUDE_DIR)
|
||||
else
|
||||
@echo "Removing $(BLIS_CONFIG_H)."
|
||||
@echo "Removing $(BLIS_CONFIG_H)"
|
||||
@$(RM_F) $(BLIS_CONFIG_H)
|
||||
@echo "Removing $(CONFIG_MK_FILE)."
|
||||
@echo "Removing $(CONFIG_MK_FILE)"
|
||||
@- $(RM_F) $(CONFIG_MK_FILE)
|
||||
@echo "Removing $(OBJ_DIR)."
|
||||
@echo "Removing $(OBJ_DIR)"
|
||||
@- $(RM_RF) $(OBJ_DIR)
|
||||
@echo "Removing $(LIB_DIR)."
|
||||
@echo "Removing $(LIB_DIR)"
|
||||
@- $(RM_RF) $(LIB_DIR)
|
||||
@echo "Removing $(INCLUDE_DIR)."
|
||||
@echo "Removing $(INCLUDE_DIR)"
|
||||
@- $(RM_RF) $(INCLUDE_DIR)
|
||||
endif
|
||||
endif
|
||||
@@ -1210,7 +1210,7 @@ endif
|
||||
# --- CHANGELOG rules ---
|
||||
|
||||
changelog:
|
||||
@echo "Updating '$(DIST_PATH)/$(CHANGELOG)' via '$(GIT_LOG)'."
|
||||
@echo "Updating '$(DIST_PATH)/$(CHANGELOG)' via '$(GIT_LOG)'"
|
||||
@$(GIT_LOG) > $(DIST_PATH)/$(CHANGELOG)
|
||||
|
||||
|
||||
@@ -1225,7 +1225,7 @@ uninstall-libs: check-env
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_F) $(MK_LIBS_INST)
|
||||
else
|
||||
@echo "Uninstalling libraries $(notdir $(MK_LIBS_INST)) from $(dir $(firstword $(MK_LIBS_INST)))."
|
||||
@echo "Uninstalling libraries $(notdir $(MK_LIBS_INST)) from $(dir $(firstword $(MK_LIBS_INST)))"
|
||||
@- $(RM_F) $(MK_LIBS_INST)
|
||||
endif
|
||||
|
||||
@@ -1233,7 +1233,7 @@ uninstall-lib-symlinks: check-env
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_F) $(MK_LIBS_SYML)
|
||||
else
|
||||
@echo "Uninstalling symlinks $(notdir $(MK_LIBS_SYML)) from $(dir $(firstword $(MK_LIBS_SYML)))."
|
||||
@echo "Uninstalling symlinks $(notdir $(MK_LIBS_SYML)) from $(dir $(firstword $(MK_LIBS_SYML)))"
|
||||
@- $(RM_F) $(MK_LIBS_SYML)
|
||||
endif
|
||||
|
||||
@@ -1241,7 +1241,7 @@ uninstall-headers: check-env
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_RF) $(MK_INCL_DIR_INST)
|
||||
else
|
||||
@echo "Uninstalling directory '$(notdir $(MK_INCL_DIR_INST))' from $(dir $(MK_INCL_DIR_INST))."
|
||||
@echo "Uninstalling directory '$(notdir $(MK_INCL_DIR_INST))' from $(dir $(MK_INCL_DIR_INST))"
|
||||
@- $(RM_RF) $(MK_INCL_DIR_INST)
|
||||
endif
|
||||
|
||||
@@ -1249,7 +1249,7 @@ uninstall-share: check-env
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_RF) $(MK_SHARE_DIR_INST)
|
||||
else
|
||||
@echo "Uninstalling directory '$(notdir $(MK_SHARE_DIR_INST))' from $(dir $(MK_SHARE_DIR_INST))."
|
||||
@echo "Uninstalling directory '$(notdir $(MK_SHARE_DIR_INST))' from $(dir $(MK_SHARE_DIR_INST))"
|
||||
@- $(RM_RF) $(MK_SHARE_DIR_INST)
|
||||
endif
|
||||
|
||||
@@ -1265,7 +1265,7 @@ $(UNINSTALL_OLD_LIBS) $(UNINSTALL_OLD_SYML) $(UNINSTALL_OLD_HEADERS): check-env
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_F) $@
|
||||
else
|
||||
@echo "Uninstalling $(@F) from $(@D)/."
|
||||
@echo "Uninstalling $(@F) from $(@D)/"
|
||||
@- $(RM_F) $@
|
||||
endif
|
||||
|
||||
|
||||
44
README.md
@@ -6,6 +6,7 @@ Contents
|
||||
--------
|
||||
|
||||
* **[Introduction](#introduction)**
|
||||
* **[Education and Learning](#education-and-learning)**
|
||||
* **[What's New](#whats-new)**
|
||||
* **[What People Are Saying About BLIS](#what-people-are-saying-about-blis)**
|
||||
* **[Key Features](#key-features)**
|
||||
@@ -76,9 +77,38 @@ and [collaborators](http://shpc.ices.utexas.edu/collaborators.html),
|
||||
[publications](http://shpc.ices.utexas.edu/publications.html),
|
||||
and [other educational projects](http://www.ulaff.net/) (such as MOOCs).
|
||||
|
||||
Education and Learning
|
||||
----------------------
|
||||
|
||||
Want to understand what's under the hood?
|
||||
Many of the same concepts and principles employed when developing BLIS are
|
||||
introduced and taught in a basic pedagogical setting as part of
|
||||
[LAFF-On Programming for High Performance (LAFF-On-PfHP)](http://www.ulaff.net/),
|
||||
one of several massive open online courses (MOOCs) in the
|
||||
[Linear Algebra: Foundations to Frontiers](http://www.ulaff.net/) series,
|
||||
all of which are available for free via the [edX platform](http://www.edx.org/).
|
||||
|
||||
What's New
|
||||
----------
|
||||
|
||||
* **Small/skinny matrix support for dgemm now available!** Thanks to
|
||||
contributions made possible by our partnership with AMD, we have dramatically
|
||||
accelerated `gemm` for double-precision real matrix problems where one or two
|
||||
dimensions are exceedingly small. A natural byproduct of this optimization is
|
||||
that the traditional case of small _m = n = k_ (i.e. square matrices) is also
|
||||
accelerated, even though it was not targeted specifically. And though only
|
||||
`dgemm` was optimized for now, support for other datatypes, other operations,
|
||||
and/or multithreading may be implemented in the future. We've also added a new
|
||||
[PerformanceSmall](docs/PerformanceSmall.md) document to showcase the
|
||||
improvement in performance when some matrix dimensions are small.
|
||||
|
||||
* **Performance comparisons now available!** We recently measured the
|
||||
performance of various level-3 operations on a variety of hardware architectures,
|
||||
as implemented within BLIS and other BLAS libraries for all four of the standard
|
||||
floating-point datatypes. The results speak for themselves! Check out our
|
||||
extensive performance graphs and background info in our new
|
||||
[Performance](docs/Performance.md) document.
|
||||
|
||||
* **BLIS is now in Debian Unstable!** Thanks to Debian developer-maintainers
|
||||
[M. Zhou](https://github.com/cdluminate) and
|
||||
[Nico Schlömer](https://github.com/nschloe) for sponsoring our package in Debian.
|
||||
@@ -87,7 +117,7 @@ the second-most popular Linux distribution (behind Ubuntu, which Debian packages
|
||||
feed into). The Debian tracker page may be found
|
||||
[here](https://tracker.debian.org/pkg/blis).
|
||||
|
||||
* **BLIS now supports mixed-datatype gemm.** The `gemm` operation may now be
|
||||
* **BLIS now supports mixed-datatype gemm!** The `gemm` operation may now be
|
||||
executed on operands of mixed domains and/or mixed precisions. Any combination
|
||||
of storage datatype for A, B, and C is now supported, along with a separate
|
||||
computation precision that can differ from the storage precision of A and B.
|
||||
@@ -313,10 +343,20 @@ table of supported microarchitectures.
|
||||
* **[Multithreading](docs/Multithreading.md).** This document describes how to
|
||||
use the multithreading features of BLIS.
|
||||
|
||||
* **[Mixed-Datatype](docs/MixedDatatype.md).** This document provides an
|
||||
* **[Mixed-Datatypes](docs/MixedDatatypes.md).** This document provides an
|
||||
overview of BLIS's mixed-datatype functionality and provides a brief example
|
||||
of how to take advantage of this new code.
|
||||
|
||||
* **[Performance](docs/Performance.md).** This document reports empirically
|
||||
measured performance of a representative set of level-3 operations on a variety
|
||||
of hardware architectures, as implemented within BLIS and other BLAS libraries
|
||||
for all four of the standard floating-point datatypes.
|
||||
|
||||
* **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports
|
||||
empirically measured performance of `gemm` on select hardware architectures
|
||||
within BLIS and other BLAS libraries when performing matrix problems where one
|
||||
or two dimensions are exceedingly small.
|
||||
|
||||
* **[Release Notes](docs/ReleaseNotes.md).** This document tracks a summary of
|
||||
changes included with each new version of BLIS, along with contributor credits
|
||||
for key features.
|
||||
|
||||
@@ -136,7 +136,7 @@ CFLAGS += -Wno-maybe-uninitialized -Wno-parentheses -Wfatal-errors \
|
||||
-I$(INC_PATH) -DHAVE_BLIS_H
|
||||
|
||||
# Locate the libblis library to which we will link.
|
||||
LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
|
||||
#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
|
||||
|
||||
# Override the location of the check-blastest.sh script.
|
||||
#BLASTEST_CHECK := ./check-blastest.sh
|
||||
|
||||
@@ -135,6 +135,12 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if @enable_sup_handling@
|
||||
#define BLIS_ENABLE_SUP_HANDLING
|
||||
#else
|
||||
#define BLIS_DISABLE_SUP_HANDLING
|
||||
#endif
|
||||
|
||||
#if @enable_memkind@
|
||||
#define BLIS_ENABLE_MEMKIND
|
||||
#else
|
||||
@@ -159,4 +165,5 @@
|
||||
#define BLIS_DISABLE_SHARED
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -115,13 +115,33 @@ THREADING_MODEL := @threading_model@
|
||||
# Whether the compiler supports "#pragma omp simd" via the -fopenmp-simd option.
|
||||
PRAGMA_OMP_SIMD := @pragma_omp_simd@
|
||||
|
||||
# The install libdir, includedir, and shareddir values from configure tell
|
||||
# us where to install the libraries, header files, and public makefile
|
||||
# fragments, respectively. Notice that we support the use of DESTDIR so that
|
||||
# advanced users may install to a temporary location.
|
||||
INSTALL_LIBDIR := $(DESTDIR)@install_libdir@
|
||||
INSTALL_INCDIR := $(DESTDIR)@install_incdir@
|
||||
INSTALL_SHAREDIR := $(DESTDIR)@install_sharedir@
|
||||
# The installation prefix, exec_prefix, libdir, includedir, and sharedir
|
||||
# values from configure tell us where to install the libraries, header files,
|
||||
# and public makefile fragments. We must first assign each substituted
|
||||
# @anchor@ to its own variable. Why? Because the substitutions may contain
|
||||
# unevaluated variable expressions. For example, '@libdir@' may be replaced
|
||||
# with '${exec_prefix}/lib'. By assigning the anchors to variables first, and
|
||||
# then assigning them to their final INSTALL_* variables, we allow prefix and
|
||||
# exec_prefix to be used in the definitions of exec_prefix, libdir,
|
||||
# includedir, and sharedir.
|
||||
prefix := @prefix@
|
||||
exec_prefix := @exec_prefix@
|
||||
libdir := @libdir@
|
||||
includedir := @includedir@
|
||||
sharedir := @sharedir@
|
||||
|
||||
# Notice that we support the use of DESTDIR so that advanced users may install
|
||||
# to a temporary location.
|
||||
INSTALL_LIBDIR := $(DESTDIR)$(libdir)
|
||||
INSTALL_INCDIR := $(DESTDIR)$(includedir)
|
||||
INSTALL_SHAREDIR := $(DESTDIR)$(sharedir)
|
||||
|
||||
#$(info prefix = $(prefix) )
|
||||
#$(info exec_prefix = $(exec_prefix) )
|
||||
#$(info libdir = $(libdir) )
|
||||
#$(info includedir = $(includedir) )
|
||||
#$(info sharedir = $(sharedir) )
|
||||
#$(error .)
|
||||
|
||||
# Whether to output verbose command-line feedback as the Makefile is
|
||||
# processed.
|
||||
@@ -135,11 +155,15 @@ BUILDING_OOT := @configured_oot@
|
||||
ARG_MAX_HACK := @enable_arg_max_hack@
|
||||
|
||||
# Whether to build the static and shared libraries.
|
||||
# Note the "MK_" prefix, which helps differentiate these variables from
|
||||
# NOTE: The "MK_" prefix helps differentiate these variables from
|
||||
# their corresponding cpp macros that use the BLIS_ prefix.
|
||||
MK_ENABLE_STATIC := @enable_static@
|
||||
MK_ENABLE_SHARED := @enable_shared@
|
||||
|
||||
# Whether to export all symbols within the shared library, even those symbols
|
||||
# that are considered to be for internal use only.
|
||||
EXPORT_SHARED := @export_shared@
|
||||
|
||||
# Whether to enable either the BLAS or CBLAS compatibility layers.
|
||||
MK_ENABLE_BLAS := @enable_blas@
|
||||
MK_ENABLE_CBLAS := @enable_cblas@
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
|
||||
*/
|
||||
|
||||
#define BLIS_EXPORT_BLIS
|
||||
#include "bli_system.h"
|
||||
#include "bli_type_defs.h"
|
||||
#include "bli_arch.h"
|
||||
|
||||
@@ -244,10 +244,24 @@ def flatten_header( inputfile, header_dirpaths, cursp ):
|
||||
# directive.
|
||||
header_path = get_header_path( header, header_dirpaths )
|
||||
|
||||
# If the header was found, we recurse. Otherwise, we output
|
||||
# the #include directive with a comment indicating that it
|
||||
# was skipped.
|
||||
if header_path:
|
||||
# First, check if the header is our root header (and if so, ignore it).
|
||||
# Otherwise, if the header was found, we recurse. Otherwise, we output
|
||||
# the #include directive with a comment indicating that it was skipped.
|
||||
if header == root_inputfile:
|
||||
|
||||
markl = result.group(1)
|
||||
markr = result.group(3)
|
||||
|
||||
echov2( "%sthis is the root header '%s'; commenting out / skipping." \
|
||||
% ( cursp, header ) )
|
||||
|
||||
# If the header found is our root header, then we cannot
|
||||
# recurse into it lest we enter an infinite loop. Output the
|
||||
# line but make sure it's commented out entirely.
|
||||
ostring += "%s #include %c%s%c %c" \
|
||||
% ( skipstr, markl, header, markr, '\n' )
|
||||
|
||||
elif header_path:
|
||||
|
||||
echov2( "%slocated file '%s'; recursing." \
|
||||
% ( cursp, header_path ) )
|
||||
@@ -327,6 +341,7 @@ strip_comments = None
|
||||
recursive_flag = None
|
||||
verbose_flag = None
|
||||
regex = None
|
||||
root_inputfile = None
|
||||
|
||||
def main():
|
||||
|
||||
@@ -336,6 +351,7 @@ def main():
|
||||
global recursive_flag
|
||||
global verbose_flag
|
||||
global regex
|
||||
global root_inputfile
|
||||
|
||||
# Obtain the script name.
|
||||
path, script_name = os.path.split(sys.argv[0])
|
||||
@@ -397,6 +413,10 @@ def main():
|
||||
temp_dir = args[2]
|
||||
dir_list = args[3]
|
||||
|
||||
# Save the filename (basename) part of the input file (or root file) into a
|
||||
# global variable that we can access later from within flatten_header().
|
||||
root_inputfile = os.path.basename( inputfile )
|
||||
|
||||
# Separate the directories into distinct strings.
|
||||
dir_list = dir_list.split()
|
||||
|
||||
|
||||
@@ -417,8 +417,9 @@ main()
|
||||
|
||||
# The arguments to this function. They'll get assigned meaningful
|
||||
# values after getopts.
|
||||
mkfile_frag_tmpl_path=""
|
||||
root_dir=""
|
||||
frag_dir=""
|
||||
mkfile_frag_tmpl_path=""
|
||||
suffix_file=""
|
||||
ignore_file=""
|
||||
|
||||
|
||||
@@ -183,13 +183,11 @@ bli_cgemm4mb
|
||||
bli_cgemm4mb_ker_var2
|
||||
bli_cgemm4mh
|
||||
bli_cgemm_ex
|
||||
bli_cgemm_haswell_asm_3x8
|
||||
bli_cgemm_haswell_asm_8x3
|
||||
bli_cgemm_ker_var2
|
||||
bli_cgemm_md_c2r_ref
|
||||
bli_cgemm_ukernel
|
||||
bli_cgemmtrsm_l_ukernel
|
||||
bli_cgemmtrsm_u_ukernel
|
||||
bli_cgemm_ukernel
|
||||
bli_cgemv
|
||||
bli_cgemv_ex
|
||||
bli_cgemv_unb_var1
|
||||
@@ -285,12 +283,6 @@ bli_chemv_unf_var3a
|
||||
bli_cher
|
||||
bli_cher2
|
||||
bli_cher2_ex
|
||||
bli_cher2_unb_var1
|
||||
bli_cher2_unb_var2
|
||||
bli_cher2_unb_var3
|
||||
bli_cher2_unb_var4
|
||||
bli_cher2_unf_var1
|
||||
bli_cher2_unf_var4
|
||||
bli_cher2k
|
||||
bli_cher2k1m
|
||||
bli_cher2k3m1
|
||||
@@ -298,9 +290,13 @@ bli_cher2k3mh
|
||||
bli_cher2k4m1
|
||||
bli_cher2k4mh
|
||||
bli_cher2k_ex
|
||||
bli_cher2_unb_var1
|
||||
bli_cher2_unb_var2
|
||||
bli_cher2_unb_var3
|
||||
bli_cher2_unb_var4
|
||||
bli_cher2_unf_var1
|
||||
bli_cher2_unf_var4
|
||||
bli_cher_ex
|
||||
bli_cher_unb_var1
|
||||
bli_cher_unb_var2
|
||||
bli_cherk
|
||||
bli_cherk1m
|
||||
bli_cherk3m1
|
||||
@@ -310,6 +306,8 @@ bli_cherk4mh
|
||||
bli_cherk_ex
|
||||
bli_cherk_l_ker_var2
|
||||
bli_cherk_u_ker_var2
|
||||
bli_cher_unb_var1
|
||||
bli_cher_unb_var2
|
||||
bli_cinvertd
|
||||
bli_cinvertd_ex
|
||||
bli_cinvertsc
|
||||
@@ -354,8 +352,8 @@ bli_cntl_copy
|
||||
bli_cntl_create_node
|
||||
bli_cntl_free
|
||||
bli_cntl_free_node
|
||||
bli_cntl_free_w_thrinfo
|
||||
bli_cntl_free_wo_thrinfo
|
||||
bli_cntl_free_w_thrinfo
|
||||
bli_cntl_mark_family
|
||||
bli_cntx_1m_stage
|
||||
bli_cntx_3m1_stage
|
||||
@@ -544,8 +542,8 @@ bli_ctrsm1m
|
||||
bli_ctrsm3m1
|
||||
bli_ctrsm4m1
|
||||
bli_ctrsm_ex
|
||||
bli_ctrsm_l_ukernel
|
||||
bli_ctrsm_ll_ker_var2
|
||||
bli_ctrsm_l_ukernel
|
||||
bli_ctrsm_lu_ker_var2
|
||||
bli_ctrsm_rl_ker_var2
|
||||
bli_ctrsm_ru_ker_var2
|
||||
@@ -591,7 +589,6 @@ bli_daddv
|
||||
bli_daddv_ex
|
||||
bli_damaxv
|
||||
bli_damaxv_ex
|
||||
bli_damaxv_zen_int
|
||||
bli_dasumv
|
||||
bli_dasumv_ex
|
||||
bli_dasumv_unb_var1
|
||||
@@ -603,14 +600,11 @@ bli_daxpyd
|
||||
bli_daxpyd_ex
|
||||
bli_daxpyf
|
||||
bli_daxpyf_ex
|
||||
bli_daxpyf_zen_int_8
|
||||
bli_daxpym
|
||||
bli_daxpym_ex
|
||||
bli_daxpym_unb_var1
|
||||
bli_daxpyv
|
||||
bli_daxpyv_ex
|
||||
bli_daxpyv_zen_int
|
||||
bli_daxpyv_zen_int10
|
||||
bli_dccastm
|
||||
bli_dccastnzm
|
||||
bli_dccastv
|
||||
@@ -640,16 +634,12 @@ bli_ddotaxpyv
|
||||
bli_ddotaxpyv_ex
|
||||
bli_ddotv
|
||||
bli_ddotv_ex
|
||||
bli_ddotv_zen_int
|
||||
bli_ddotv_zen_int10
|
||||
bli_ddotxaxpyf
|
||||
bli_ddotxaxpyf_ex
|
||||
bli_ddotxf
|
||||
bli_ddotxf_ex
|
||||
bli_ddotxf_zen_int_8
|
||||
bli_ddotxv
|
||||
bli_ddotxv_ex
|
||||
bli_ddotxv_zen_int
|
||||
bli_ddpackm_blk_var1_md
|
||||
bli_ddpackm_cxk_1e_md
|
||||
bli_ddpackm_cxk_1r_md
|
||||
@@ -673,14 +663,10 @@ bli_dgemm4mb
|
||||
bli_dgemm4mb_ker_var2
|
||||
bli_dgemm4mh
|
||||
bli_dgemm_ex
|
||||
bli_dgemm_haswell_asm_6x8
|
||||
bli_dgemm_haswell_asm_8x6
|
||||
bli_dgemm_ker_var2
|
||||
bli_dgemm_ukernel
|
||||
bli_dgemmtrsm_l_haswell_asm_6x8
|
||||
bli_dgemmtrsm_l_ukernel
|
||||
bli_dgemmtrsm_u_haswell_asm_6x8
|
||||
bli_dgemmtrsm_u_ukernel
|
||||
bli_dgemm_ukernel
|
||||
bli_dgemv
|
||||
bli_dgemv_ex
|
||||
bli_dgemv_unb_var1
|
||||
@@ -713,12 +699,6 @@ bli_dhemv_unf_var3a
|
||||
bli_dher
|
||||
bli_dher2
|
||||
bli_dher2_ex
|
||||
bli_dher2_unb_var1
|
||||
bli_dher2_unb_var2
|
||||
bli_dher2_unb_var3
|
||||
bli_dher2_unb_var4
|
||||
bli_dher2_unf_var1
|
||||
bli_dher2_unf_var4
|
||||
bli_dher2k
|
||||
bli_dher2k1m
|
||||
bli_dher2k3m1
|
||||
@@ -726,9 +706,13 @@ bli_dher2k3mh
|
||||
bli_dher2k4m1
|
||||
bli_dher2k4mh
|
||||
bli_dher2k_ex
|
||||
bli_dher2_unb_var1
|
||||
bli_dher2_unb_var2
|
||||
bli_dher2_unb_var3
|
||||
bli_dher2_unb_var4
|
||||
bli_dher2_unf_var1
|
||||
bli_dher2_unf_var4
|
||||
bli_dher_ex
|
||||
bli_dher_unb_var1
|
||||
bli_dher_unb_var2
|
||||
bli_dherk
|
||||
bli_dherk1m
|
||||
bli_dherk3m1
|
||||
@@ -738,6 +722,8 @@ bli_dherk4mh
|
||||
bli_dherk_ex
|
||||
bli_dherk_l_ker_var2
|
||||
bli_dherk_u_ker_var2
|
||||
bli_dher_unb_var1
|
||||
bli_dher_unb_var2
|
||||
bli_dinvertd
|
||||
bli_dinvertd_ex
|
||||
bli_dinvertsc
|
||||
@@ -746,11 +732,6 @@ bli_dinvertv_ex
|
||||
bli_divsc
|
||||
bli_divsc_check
|
||||
bli_divsc_qfp
|
||||
bli_dlamc1
|
||||
bli_dlamc2
|
||||
bli_dlamc3
|
||||
bli_dlamc4
|
||||
bli_dlamc5
|
||||
bli_dlamch
|
||||
bli_dmachval
|
||||
bli_dmkherm
|
||||
@@ -838,8 +819,6 @@ bli_dscalm_ex
|
||||
bli_dscalm_unb_var1
|
||||
bli_dscalv
|
||||
bli_dscalv_ex
|
||||
bli_dscalv_zen_int
|
||||
bli_dscalv_zen_int10
|
||||
bli_dscastm
|
||||
bli_dscastnzm
|
||||
bli_dscastv
|
||||
@@ -906,11 +885,6 @@ bli_dsyrk3mh
|
||||
bli_dsyrk4m1
|
||||
bli_dsyrk4mh
|
||||
bli_dsyrk_ex
|
||||
bli_dt_size
|
||||
bli_dt_size_check
|
||||
bli_dt_string
|
||||
bli_dt_string_check
|
||||
bli_dt_union_check
|
||||
bli_dtrmm
|
||||
bli_dtrmm1m
|
||||
bli_dtrmm3
|
||||
@@ -938,8 +912,8 @@ bli_dtrsm1m
|
||||
bli_dtrsm3m1
|
||||
bli_dtrsm4m1
|
||||
bli_dtrsm_ex
|
||||
bli_dtrsm_l_ukernel
|
||||
bli_dtrsm_ll_ker_var2
|
||||
bli_dtrsm_l_ukernel
|
||||
bli_dtrsm_lu_ker_var2
|
||||
bli_dtrsm_rl_ker_var2
|
||||
bli_dtrsm_ru_ker_var2
|
||||
@@ -950,6 +924,11 @@ bli_dtrsv_unb_var1
|
||||
bli_dtrsv_unb_var2
|
||||
bli_dtrsv_unf_var1
|
||||
bli_dtrsv_unf_var2
|
||||
bli_dt_size
|
||||
bli_dt_size_check
|
||||
bli_dt_string
|
||||
bli_dt_string_check
|
||||
bli_dt_union_check
|
||||
bli_dunpackm_blk_var1
|
||||
bli_dunpackm_cxk
|
||||
bli_dunpackm_unb_var1
|
||||
@@ -1018,6 +997,7 @@ bli_gemm_basic_check
|
||||
bli_gemm_blk_var1
|
||||
bli_gemm_blk_var2
|
||||
bli_gemm_blk_var3
|
||||
bli_gemmbp_cntl_create
|
||||
bli_gemm_check
|
||||
bli_gemm_cntl_create
|
||||
bli_gemm_cntl_create_node
|
||||
@@ -1028,6 +1008,8 @@ bli_gemm_determine_kc_f
|
||||
bli_gemm_direct
|
||||
bli_gemm_ex
|
||||
bli_gemm_front
|
||||
bli_gemmind
|
||||
bli_gemmind_get_avail
|
||||
bli_gemm_int
|
||||
bli_gemm_ker_var2
|
||||
bli_gemm_ker_var2_md
|
||||
@@ -1040,20 +1022,17 @@ bli_gemm_md_rcc
|
||||
bli_gemm_md_rcr
|
||||
bli_gemm_md_rrc
|
||||
bli_gemm_md_rrr
|
||||
bli_gemmnat
|
||||
bli_gemm_packa
|
||||
bli_gemm_packb
|
||||
bli_gemm_prune_unref_mparts_k
|
||||
bli_gemm_prune_unref_mparts_m
|
||||
bli_gemm_prune_unref_mparts_n
|
||||
bli_gemmtrsm_l_ukernel_qfp
|
||||
bli_gemmtrsm_ukernel
|
||||
bli_gemmtrsm_u_ukernel_qfp
|
||||
bli_gemm_ukernel
|
||||
bli_gemm_ukernel_qfp
|
||||
bli_gemmbp_cntl_create
|
||||
bli_gemmind
|
||||
bli_gemmind_get_avail
|
||||
bli_gemmnat
|
||||
bli_gemmtrsm_l_ukernel_qfp
|
||||
bli_gemmtrsm_u_ukernel_qfp
|
||||
bli_gemmtrsm_ukernel
|
||||
bli_gemv
|
||||
bli_gemv_check
|
||||
bli_gemv_ex
|
||||
@@ -1120,30 +1099,18 @@ bli_hemv_unb_var3_qfp
|
||||
bli_hemv_unb_var4
|
||||
bli_hemv_unb_var4_qfp
|
||||
bli_hemv_unf_var1
|
||||
bli_hemv_unf_var1_qfp
|
||||
bli_hemv_unf_var1a
|
||||
bli_hemv_unf_var1a_qfp
|
||||
bli_hemv_unf_var1_qfp
|
||||
bli_hemv_unf_var3
|
||||
bli_hemv_unf_var3_qfp
|
||||
bli_hemv_unf_var3a
|
||||
bli_hemv_unf_var3a_qfp
|
||||
bli_hemv_unf_var3_qfp
|
||||
bli_her
|
||||
bli_her2
|
||||
bli_her2_check
|
||||
bli_her2_ex
|
||||
bli_her2_ex_qfp
|
||||
bli_her2_unb_var1
|
||||
bli_her2_unb_var1_qfp
|
||||
bli_her2_unb_var2
|
||||
bli_her2_unb_var2_qfp
|
||||
bli_her2_unb_var3
|
||||
bli_her2_unb_var3_qfp
|
||||
bli_her2_unb_var4
|
||||
bli_her2_unb_var4_qfp
|
||||
bli_her2_unf_var1
|
||||
bli_her2_unf_var1_qfp
|
||||
bli_her2_unf_var4
|
||||
bli_her2_unf_var4_qfp
|
||||
bli_her2k
|
||||
bli_her2k1m
|
||||
bli_her2k3m1
|
||||
@@ -1157,13 +1124,21 @@ bli_her2k_front
|
||||
bli_her2kind
|
||||
bli_her2kind_get_avail
|
||||
bli_her2knat
|
||||
bli_her2_unb_var1
|
||||
bli_her2_unb_var1_qfp
|
||||
bli_her2_unb_var2
|
||||
bli_her2_unb_var2_qfp
|
||||
bli_her2_unb_var3
|
||||
bli_her2_unb_var3_qfp
|
||||
bli_her2_unb_var4
|
||||
bli_her2_unb_var4_qfp
|
||||
bli_her2_unf_var1
|
||||
bli_her2_unf_var1_qfp
|
||||
bli_her2_unf_var4
|
||||
bli_her2_unf_var4_qfp
|
||||
bli_her_check
|
||||
bli_her_ex
|
||||
bli_her_ex_qfp
|
||||
bli_her_unb_var1
|
||||
bli_her_unb_var1_qfp
|
||||
bli_her_unb_var2
|
||||
bli_her_unb_var2_qfp
|
||||
bli_herk
|
||||
bli_herk1m
|
||||
bli_herk3m1
|
||||
@@ -1178,15 +1153,19 @@ bli_herk_determine_kc_f
|
||||
bli_herk_direct
|
||||
bli_herk_ex
|
||||
bli_herk_front
|
||||
bli_herkind
|
||||
bli_herkind_get_avail
|
||||
bli_herk_l_ker_var2
|
||||
bli_herknat
|
||||
bli_herk_prune_unref_mparts_k
|
||||
bli_herk_prune_unref_mparts_m
|
||||
bli_herk_prune_unref_mparts_n
|
||||
bli_herk_u_ker_var2
|
||||
bli_herk_x_ker_var2
|
||||
bli_herkind
|
||||
bli_herkind_get_avail
|
||||
bli_herknat
|
||||
bli_her_unb_var1
|
||||
bli_her_unb_var1_qfp
|
||||
bli_her_unb_var2
|
||||
bli_her_unb_var2_qfp
|
||||
bli_ifprintm
|
||||
bli_ifprintv
|
||||
bli_igetsc
|
||||
@@ -1217,9 +1196,9 @@ bli_info_get_enable_sba_pools
|
||||
bli_info_get_enable_stay_auto_init
|
||||
bli_info_get_enable_threading
|
||||
bli_info_get_gemm_impl_string
|
||||
bli_info_get_gemm_ukr_impl_string
|
||||
bli_info_get_gemmtrsm_l_ukr_impl_string
|
||||
bli_info_get_gemmtrsm_u_ukr_impl_string
|
||||
bli_info_get_gemm_ukr_impl_string
|
||||
bli_info_get_heap_addr_align_size
|
||||
bli_info_get_heap_stride_align_size
|
||||
bli_info_get_hemm_impl_string
|
||||
@@ -1278,12 +1257,12 @@ bli_l1d_xy_check
|
||||
bli_l1m_ax_check
|
||||
bli_l1m_axy_check
|
||||
bli_l1m_xy_check
|
||||
bli_l1v_ax_check
|
||||
bli_l1v_axby_check
|
||||
bli_l1v_ax_check
|
||||
bli_l1v_axy_check
|
||||
bli_l1v_dot_check
|
||||
bli_l1v_x_check
|
||||
bli_l1v_xby_check
|
||||
bli_l1v_x_check
|
||||
bli_l1v_xi_check
|
||||
bli_l1v_xy_check
|
||||
bli_l3_basic_check
|
||||
@@ -1452,12 +1431,10 @@ bli_pool_init
|
||||
bli_pool_print
|
||||
bli_pool_reinit
|
||||
bli_pool_shrink
|
||||
bli_pow_di
|
||||
bli_pow_ri
|
||||
bli_prime_factorization
|
||||
bli_print_msg
|
||||
bli_printm
|
||||
bli_printm_ex
|
||||
bli_print_msg
|
||||
bli_printv
|
||||
bli_printv_ex
|
||||
bli_projm
|
||||
@@ -1510,7 +1487,6 @@ bli_saddv
|
||||
bli_saddv_ex
|
||||
bli_samaxv
|
||||
bli_samaxv_ex
|
||||
bli_samaxv_zen_int
|
||||
bli_sasumv
|
||||
bli_sasumv_ex
|
||||
bli_sasumv_unb_var1
|
||||
@@ -1522,14 +1498,11 @@ bli_saxpyd
|
||||
bli_saxpyd_ex
|
||||
bli_saxpyf
|
||||
bli_saxpyf_ex
|
||||
bli_saxpyf_zen_int_8
|
||||
bli_saxpym
|
||||
bli_saxpym_ex
|
||||
bli_saxpym_unb_var1
|
||||
bli_saxpyv
|
||||
bli_saxpyv_ex
|
||||
bli_saxpyv_zen_int
|
||||
bli_saxpyv_zen_int10
|
||||
bli_sba_acquire
|
||||
bli_sba_checkin_array
|
||||
bli_sba_checkout_array
|
||||
@@ -1591,16 +1564,12 @@ bli_sdotaxpyv
|
||||
bli_sdotaxpyv_ex
|
||||
bli_sdotv
|
||||
bli_sdotv_ex
|
||||
bli_sdotv_zen_int
|
||||
bli_sdotv_zen_int10
|
||||
bli_sdotxaxpyf
|
||||
bli_sdotxaxpyf_ex
|
||||
bli_sdotxf
|
||||
bli_sdotxf_ex
|
||||
bli_sdotxf_zen_int_8
|
||||
bli_sdotxv
|
||||
bli_sdotxv_ex
|
||||
bli_sdotxv_zen_int
|
||||
bli_sdpackm_blk_var1_md
|
||||
bli_sdpackm_cxk_1e_md
|
||||
bli_sdpackm_cxk_1r_md
|
||||
@@ -1643,14 +1612,10 @@ bli_sgemm4mb
|
||||
bli_sgemm4mb_ker_var2
|
||||
bli_sgemm4mh
|
||||
bli_sgemm_ex
|
||||
bli_sgemm_haswell_asm_16x6
|
||||
bli_sgemm_haswell_asm_6x16
|
||||
bli_sgemm_ker_var2
|
||||
bli_sgemm_ukernel
|
||||
bli_sgemmtrsm_l_haswell_asm_6x16
|
||||
bli_sgemmtrsm_l_ukernel
|
||||
bli_sgemmtrsm_u_haswell_asm_6x16
|
||||
bli_sgemmtrsm_u_ukernel
|
||||
bli_sgemm_ukernel
|
||||
bli_sgemv
|
||||
bli_sgemv_ex
|
||||
bli_sgemv_unb_var1
|
||||
@@ -1683,12 +1648,6 @@ bli_shemv_unf_var3a
|
||||
bli_sher
|
||||
bli_sher2
|
||||
bli_sher2_ex
|
||||
bli_sher2_unb_var1
|
||||
bli_sher2_unb_var2
|
||||
bli_sher2_unb_var3
|
||||
bli_sher2_unb_var4
|
||||
bli_sher2_unf_var1
|
||||
bli_sher2_unf_var4
|
||||
bli_sher2k
|
||||
bli_sher2k1m
|
||||
bli_sher2k3m1
|
||||
@@ -1696,9 +1655,13 @@ bli_sher2k3mh
|
||||
bli_sher2k4m1
|
||||
bli_sher2k4mh
|
||||
bli_sher2k_ex
|
||||
bli_sher2_unb_var1
|
||||
bli_sher2_unb_var2
|
||||
bli_sher2_unb_var3
|
||||
bli_sher2_unb_var4
|
||||
bli_sher2_unf_var1
|
||||
bli_sher2_unf_var4
|
||||
bli_sher_ex
|
||||
bli_sher_unb_var1
|
||||
bli_sher_unb_var2
|
||||
bli_sherk
|
||||
bli_sherk1m
|
||||
bli_sherk3m1
|
||||
@@ -1708,6 +1671,8 @@ bli_sherk4mh
|
||||
bli_sherk_ex
|
||||
bli_sherk_l_ker_var2
|
||||
bli_sherk_u_ker_var2
|
||||
bli_sher_unb_var1
|
||||
bli_sher_unb_var2
|
||||
bli_shiftd
|
||||
bli_shiftd_check
|
||||
bli_shiftd_ex
|
||||
@@ -1717,11 +1682,6 @@ bli_sinvertd_ex
|
||||
bli_sinvertsc
|
||||
bli_sinvertv
|
||||
bli_sinvertv_ex
|
||||
bli_slamc1
|
||||
bli_slamc2
|
||||
bli_slamc3
|
||||
bli_slamc4
|
||||
bli_slamc5
|
||||
bli_slamch
|
||||
bli_sleep
|
||||
bli_smachval
|
||||
@@ -1793,8 +1753,6 @@ bli_sscalm_ex
|
||||
bli_sscalm_unb_var1
|
||||
bli_sscalv
|
||||
bli_sscalv_ex
|
||||
bli_sscalv_zen_int
|
||||
bli_sscalv_zen_int10
|
||||
bli_sscastm
|
||||
bli_sscastnzm
|
||||
bli_sscastv
|
||||
@@ -1889,8 +1847,8 @@ bli_strsm1m
|
||||
bli_strsm3m1
|
||||
bli_strsm4m1
|
||||
bli_strsm_ex
|
||||
bli_strsm_l_ukernel
|
||||
bli_strsm_ll_ker_var2
|
||||
bli_strsm_l_ukernel
|
||||
bli_strsm_lu_ker_var2
|
||||
bli_strsm_rl_ker_var2
|
||||
bli_strsm_ru_ker_var2
|
||||
@@ -2062,17 +2020,17 @@ bli_trmm_determine_kc_f
|
||||
bli_trmm_direct
|
||||
bli_trmm_ex
|
||||
bli_trmm_front
|
||||
bli_trmmind
|
||||
bli_trmmind_get_avail
|
||||
bli_trmm_ll_ker_var2
|
||||
bli_trmm_lu_ker_var2
|
||||
bli_trmmnat
|
||||
bli_trmm_prune_unref_mparts_k
|
||||
bli_trmm_prune_unref_mparts_m
|
||||
bli_trmm_prune_unref_mparts_n
|
||||
bli_trmm_rl_ker_var2
|
||||
bli_trmm_ru_ker_var2
|
||||
bli_trmm_xx_ker_var2
|
||||
bli_trmmind
|
||||
bli_trmmind_get_avail
|
||||
bli_trmmnat
|
||||
bli_trmv
|
||||
bli_trmv_check
|
||||
bli_trmv_ex
|
||||
@@ -2102,11 +2060,14 @@ bli_trsm_determine_kc_f
|
||||
bli_trsm_direct
|
||||
bli_trsm_ex
|
||||
bli_trsm_front
|
||||
bli_trsmind
|
||||
bli_trsmind_get_avail
|
||||
bli_trsm_int
|
||||
bli_trsm_l_cntl_create
|
||||
bli_trsm_l_ukernel_qfp
|
||||
bli_trsm_ll_ker_var2
|
||||
bli_trsm_l_ukernel_qfp
|
||||
bli_trsm_lu_ker_var2
|
||||
bli_trsmnat
|
||||
bli_trsm_packa
|
||||
bli_trsm_packb
|
||||
bli_trsm_prune_unref_mparts_k
|
||||
@@ -2115,12 +2076,9 @@ bli_trsm_prune_unref_mparts_n
|
||||
bli_trsm_r_cntl_create
|
||||
bli_trsm_rl_ker_var2
|
||||
bli_trsm_ru_ker_var2
|
||||
bli_trsm_u_ukernel_qfp
|
||||
bli_trsm_ukernel
|
||||
bli_trsm_u_ukernel_qfp
|
||||
bli_trsm_xx_ker_var2
|
||||
bli_trsmind
|
||||
bli_trsmind_get_avail
|
||||
bli_trsmnat
|
||||
bli_trsv
|
||||
bli_trsv_check
|
||||
bli_trsv_ex
|
||||
@@ -2245,13 +2203,11 @@ bli_zgemm4mb
|
||||
bli_zgemm4mb_ker_var2
|
||||
bli_zgemm4mh
|
||||
bli_zgemm_ex
|
||||
bli_zgemm_haswell_asm_3x4
|
||||
bli_zgemm_haswell_asm_4x3
|
||||
bli_zgemm_ker_var2
|
||||
bli_zgemm_md_c2r_ref
|
||||
bli_zgemm_ukernel
|
||||
bli_zgemmtrsm_l_ukernel
|
||||
bli_zgemmtrsm_u_ukernel
|
||||
bli_zgemm_ukernel
|
||||
bli_zgemv
|
||||
bli_zgemv_ex
|
||||
bli_zgemv_unb_var1
|
||||
@@ -2284,12 +2240,6 @@ bli_zhemv_unf_var3a
|
||||
bli_zher
|
||||
bli_zher2
|
||||
bli_zher2_ex
|
||||
bli_zher2_unb_var1
|
||||
bli_zher2_unb_var2
|
||||
bli_zher2_unb_var3
|
||||
bli_zher2_unb_var4
|
||||
bli_zher2_unf_var1
|
||||
bli_zher2_unf_var4
|
||||
bli_zher2k
|
||||
bli_zher2k1m
|
||||
bli_zher2k3m1
|
||||
@@ -2297,9 +2247,13 @@ bli_zher2k3mh
|
||||
bli_zher2k4m1
|
||||
bli_zher2k4mh
|
||||
bli_zher2k_ex
|
||||
bli_zher2_unb_var1
|
||||
bli_zher2_unb_var2
|
||||
bli_zher2_unb_var3
|
||||
bli_zher2_unb_var4
|
||||
bli_zher2_unf_var1
|
||||
bli_zher2_unf_var4
|
||||
bli_zher_ex
|
||||
bli_zher_unb_var1
|
||||
bli_zher_unb_var2
|
||||
bli_zherk
|
||||
bli_zherk1m
|
||||
bli_zherk3m1
|
||||
@@ -2309,6 +2263,8 @@ bli_zherk4mh
|
||||
bli_zherk_ex
|
||||
bli_zherk_l_ker_var2
|
||||
bli_zherk_u_ker_var2
|
||||
bli_zher_unb_var1
|
||||
bli_zher_unb_var2
|
||||
bli_zinvertd
|
||||
bli_zinvertd_ex
|
||||
bli_zinvertsc
|
||||
@@ -2492,8 +2448,8 @@ bli_ztrsm1m
|
||||
bli_ztrsm3m1
|
||||
bli_ztrsm4m1
|
||||
bli_ztrsm_ex
|
||||
bli_ztrsm_l_ukernel
|
||||
bli_ztrsm_ll_ker_var2
|
||||
bli_ztrsm_l_ukernel
|
||||
bli_ztrsm_lu_ker_var2
|
||||
bli_ztrsm_rl_ker_var2
|
||||
bli_ztrsm_ru_ker_var2
|
||||
@@ -2528,19 +2484,6 @@ bli_zzpackm_struc_cxk_md
|
||||
bli_zzxpbym_md
|
||||
bli_zzxpbym_md_ex
|
||||
bli_zzxpbym_md_unb_var1
|
||||
bla_c_abs
|
||||
bla_c_div
|
||||
bla_d_abs
|
||||
bla_d_cnjg
|
||||
bla_d_imag
|
||||
bla_d_sign
|
||||
bla_f__cabs
|
||||
bla_r_abs
|
||||
bla_r_cnjg
|
||||
bla_r_imag
|
||||
bla_r_sign
|
||||
bla_z_abs
|
||||
bla_z_div
|
||||
sasum_
|
||||
sasumsub_
|
||||
saxpy_
|
||||
@@ -2567,14 +2510,14 @@ srotmg_
|
||||
ssbmv_
|
||||
sscal_
|
||||
sspmv_
|
||||
sspr2_
|
||||
sspr_
|
||||
sspr2_
|
||||
sswap_
|
||||
ssymm_
|
||||
ssymv_
|
||||
ssyr_
|
||||
ssyr2_
|
||||
ssyr2k_
|
||||
ssyr_
|
||||
ssyrk_
|
||||
stbmv_
|
||||
stbsv_
|
||||
@@ -2606,14 +2549,14 @@ dscal_
|
||||
dsdot_
|
||||
dsdotsub_
|
||||
dspmv_
|
||||
dspr2_
|
||||
dspr_
|
||||
dspr2_
|
||||
dswap_
|
||||
dsymm_
|
||||
dsymv_
|
||||
dsyr_
|
||||
dsyr2_
|
||||
dsyr2k_
|
||||
dsyr_
|
||||
dsyrk_
|
||||
dtbmv_
|
||||
dtbsv_
|
||||
@@ -2641,13 +2584,13 @@ cgeru_
|
||||
chbmv_
|
||||
chemm_
|
||||
chemv_
|
||||
cher_
|
||||
cher2_
|
||||
cher2k_
|
||||
cher_
|
||||
cherk_
|
||||
chpmv_
|
||||
chpr2_
|
||||
chpr_
|
||||
chpr2_
|
||||
crotg_
|
||||
cscal_
|
||||
csrot_
|
||||
@@ -2680,13 +2623,13 @@ zgeru_
|
||||
zhbmv_
|
||||
zhemm_
|
||||
zhemv_
|
||||
zher_
|
||||
zher2_
|
||||
zher2k_
|
||||
zher_
|
||||
zherk_
|
||||
zhpmv_
|
||||
zhpr2_
|
||||
zhpr_
|
||||
zhpr2_
|
||||
zrotg_
|
||||
zscal_
|
||||
zswap_
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2019, The University of Texas at Austin
|
||||
# Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
|
||||
118
common.mk
@@ -118,7 +118,8 @@ get-noopt-cxxflags-for = $(strip $(CFLAGS_PRESET) \
|
||||
get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
||||
$(call get-noopt-cflags-for,$(1)) \
|
||||
-DBLIS_CNAME=$(1) \
|
||||
$(BUILD_FLAGS) \
|
||||
$(BUILD_CPPFLAGS) \
|
||||
$(BUILD_SYMFLAGS) \
|
||||
)
|
||||
|
||||
get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
|
||||
@@ -126,23 +127,27 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
|
||||
$(call get-noopt-cflags-for,$(1)) \
|
||||
$(COMPSIMDFLAGS) \
|
||||
-DBLIS_CNAME=$(1) \
|
||||
$(BUILD_FLAGS) \
|
||||
$(BUILD_CPPFLAGS) \
|
||||
$(BUILD_SYMFLAGS) \
|
||||
)
|
||||
|
||||
get-config-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
||||
$(call get-noopt-cflags-for,$(1)) \
|
||||
$(BUILD_FLAGS) \
|
||||
$(BUILD_CPPFLAGS) \
|
||||
$(BUILD_SYMFLAGS) \
|
||||
)
|
||||
|
||||
get-frame-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
||||
$(call get-noopt-cflags-for,$(1)) \
|
||||
$(BUILD_FLAGS) \
|
||||
$(BUILD_CPPFLAGS) \
|
||||
$(BUILD_SYMFLAGS) \
|
||||
)
|
||||
|
||||
get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
|
||||
$(call load-var-for,CKVECFLAGS,$(1)) \
|
||||
$(call get-noopt-cflags-for,$(1)) \
|
||||
$(BUILD_FLAGS) \
|
||||
$(BUILD_CPPFLAGS) \
|
||||
$(BUILD_SYMFLAGS) \
|
||||
)
|
||||
|
||||
# When compiling sandboxes, we use flags similar to those of general framework
|
||||
@@ -153,19 +158,24 @@ get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
|
||||
get-sandbox-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
||||
$(call get-noopt-cflags-for,$(1)) \
|
||||
$(CSBOXINCFLAGS) \
|
||||
$(BUILD_FLAGS) \
|
||||
$(BUILD_CPPFLAGS) \
|
||||
$(BUILD_SYMFLAGS) \
|
||||
)
|
||||
get-sandbox-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
||||
$(call get-noopt-cxxflags-for,$(1)) \
|
||||
$(CSBOXINCFLAGS) \
|
||||
$(BUILD_FLAGS) \
|
||||
$(BUILD_CPPFLAGS) \
|
||||
$(BUILD_SYMFLAGS) \
|
||||
)
|
||||
|
||||
# Define a separate function that will return appropriate flags for use by
|
||||
# applications that want to use the same basic flags as those used when BLIS
|
||||
# was compiled. (This is the same as get-frame-cflags-for(), except that it
|
||||
# omits the BUILD_FLAGS, which are exclusively for use when BLIS is being
|
||||
# compiled.)
|
||||
# was compiled. (NOTE: This is the same as the $(get-frame-cflags-for ...)
|
||||
# function, except that it omits two variables that contain flags exclusively
|
||||
# for use when BLIS is being compiled/built: BUILD_CPPFLAGS, which contains a
|
||||
# cpp macro that confirms that BLIS is being built; and BUILD_SYMFLAGS, which
|
||||
# contains symbol export flags that are only needed when a shared library is
|
||||
# being compiled/linked.)
|
||||
get-user-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
||||
$(call get-noopt-cflags-for,$(1)) \
|
||||
)
|
||||
@@ -508,9 +518,9 @@ SOFLAGS := -shared
|
||||
ifeq ($(IS_WIN),yes)
|
||||
# Windows shared library link flags.
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
SOFLAGS += -Wl,-def:build/libblis-symbols.def -Wl,-implib:$(BASE_LIB_PATH)/$(LIBBLIS).lib
|
||||
SOFLAGS += -Wl,-implib:$(BASE_LIB_PATH)/$(LIBBLIS).lib
|
||||
else
|
||||
SOFLAGS += -Wl,--export-all-symbols -Wl,--out-implib,$(BASE_LIB_PATH)/$(LIBBLIS).dll.a
|
||||
SOFLAGS += -Wl,--out-implib,$(BASE_LIB_PATH)/$(LIBBLIS).dll.a
|
||||
endif
|
||||
else
|
||||
# Linux shared library link flags.
|
||||
@@ -532,6 +542,11 @@ ifeq ($(IS_WIN),no)
|
||||
LDFLAGS += -Wl,-rpath,$(BASE_LIB_PATH)
|
||||
endif
|
||||
endif
|
||||
# On Windows, use the shared library even if a static library is also created.
|
||||
ifeq ($(IS_WIN),yes)
|
||||
LIBBLIS_L := $(LIBBLIS_SO)
|
||||
LIBBLIS_LINK := $(LIBBLIS_SO_PATH)
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
@@ -610,7 +625,7 @@ endif
|
||||
|
||||
$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CWARNFLAGS,$(c))))
|
||||
|
||||
# --- Shared library (position-independent code) flags ---
|
||||
# --- Position-independent code flags (shared libraries only) ---
|
||||
|
||||
# Emit position-independent code for dynamic linking.
|
||||
ifeq ($(IS_WIN),yes)
|
||||
@@ -622,6 +637,71 @@ CPICFLAGS := -fPIC
|
||||
endif
|
||||
$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPICFLAGS,$(c))))
|
||||
|
||||
# --- Symbol exporting flags (shared libraries only) ---
|
||||
|
||||
# NOTE: These flags are only applied when building BLIS and not used by
|
||||
# applications that import BLIS compilation flags via the
|
||||
# $(get-user-cflags-for ...) function.
|
||||
|
||||
# Determine default export behavior / visibility of symbols for gcc.
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
ifeq ($(IS_WIN),yes)
|
||||
ifeq ($(EXPORT_SHARED),all)
|
||||
# gcc on Windows with EXPORT_SHARED=all: ask the linker to export every symbol
# and to enable auto-import. Each linker option gets its own -Wl, group; a
# trailing comma would make gcc forward an extra, empty argument to the linker.
BUILD_SYMFLAGS := -Wl,--export-all-symbols -Wl,--enable-auto-import
|
||||
else # ifeq ($(EXPORT_SHARED),public)
|
||||
BUILD_SYMFLAGS := -Wl,--exclude-all-symbols
|
||||
endif
|
||||
else # ifeq ($(IS_WIN),no)
|
||||
ifeq ($(EXPORT_SHARED),all)
|
||||
# Export all symbols by default.
|
||||
BUILD_SYMFLAGS := -fvisibility=default
|
||||
else # ifeq ($(EXPORT_SHARED),public)
|
||||
# Hide all symbols by default and export only those that have been annotated
|
||||
# as needing to be exported.
|
||||
BUILD_SYMFLAGS := -fvisibility=hidden
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Determine default export behavior / visibility of symbols for icc.
|
||||
# NOTE: The Windows branches have been omitted since we currently make no
|
||||
# effort to support Windows builds via icc (only gcc/clang via AppVeyor).
|
||||
ifeq ($(CC_VENDOR),icc)
|
||||
ifeq ($(EXPORT_SHARED),all)
|
||||
# Export all symbols by default.
|
||||
BUILD_SYMFLAGS := -fvisibility=default
|
||||
else # ifeq ($(EXPORT_SHARED),public)
|
||||
# Hide all symbols by default and export only those that have been annotated
|
||||
# as needing to be exported.
|
||||
BUILD_SYMFLAGS := -fvisibility=hidden
|
||||
endif
|
||||
endif
|
||||
|
||||
# Determine default export behavior / visibility of symbols for clang.
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
ifeq ($(IS_WIN),yes)
|
||||
ifeq ($(EXPORT_SHARED),all)
|
||||
# NOTE: clang on Windows does not appear to support exporting all symbols
|
||||
# by default, and therefore we ignore the value of EXPORT_SHARED.
|
||||
BUILD_SYMFLAGS :=
|
||||
else # ifeq ($(EXPORT_SHARED),public)
|
||||
# NOTE: The default behavior of clang on Windows is to hide all symbols
|
||||
# and only export functions and other declarations that have been annotated
|
||||
# as needing to be exported.
|
||||
BUILD_SYMFLAGS :=
|
||||
endif
|
||||
else # ifeq ($(IS_WIN),no)
|
||||
ifeq ($(EXPORT_SHARED),all)
|
||||
# Export all symbols by default.
|
||||
BUILD_SYMFLAGS := -fvisibility=default
|
||||
else # ifeq ($(EXPORT_SHARED),public)
|
||||
# Hide all symbols by default and export only those that have been annotated
|
||||
# as needing to be exported.
|
||||
BUILD_SYMFLAGS := -fvisibility=hidden
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# --- Language flags ---
|
||||
|
||||
# Enable C99.
|
||||
@@ -685,8 +765,18 @@ endif
|
||||
# --- #pragma omp simd flags (used for reference kernels only) ---
|
||||
|
||||
ifeq ($(PRAGMA_OMP_SIMD),yes)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
COMPSIMDFLAGS := -fopenmp-simd
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
COMPSIMDFLAGS := -fopenmp-simd
|
||||
else
|
||||
ifeq ($(CC_VENDOR),icc)
|
||||
COMPSIMDFLAGS := -qopenmp-simd
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else # ifeq ($(PRAGMA_OMP_SIMD),no)
|
||||
COMPSIMDFLAGS :=
|
||||
endif
|
||||
|
||||
@@ -960,7 +1050,7 @@ VERS_DEF := -DBLIS_VERSION_STRING=\"$(VERSION)\"
|
||||
# Define a C preprocessor flag that is *only* defined when BLIS is being
|
||||
# compiled. (In other words, an application that #includes blis.h will not
|
||||
# get this cpp macro.)
|
||||
BUILD_FLAGS := -DBLIS_IS_BUILDING_LIBRARY
|
||||
BUILD_CPPFLAGS := -DBLIS_IS_BUILDING_LIBRARY
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -57,7 +57,7 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O2 -fomit-frame-pointer
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
@@ -74,7 +74,11 @@ endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
|
||||
@@ -57,16 +57,16 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O2 -funroll-all-loops
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1 -mno-tbm -mno-xop -mno-lwp
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1 -mno-tbm -mno-xop -mno-lwp
|
||||
else
|
||||
$(error gcc or clang are required for this configuration.)
|
||||
endif
|
||||
@@ -74,7 +74,11 @@ endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
|
||||
@@ -57,16 +57,16 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O2 -fomit-frame-pointer
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver4
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver4
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
|
||||
else
|
||||
$(error gcc or clang are required for this configuration.)
|
||||
endif
|
||||
@@ -74,7 +74,11 @@ endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
|
||||
@@ -78,7 +78,11 @@ endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -34,9 +35,12 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )
|
||||
|
||||
void bli_cntx_init_haswell( cntx_t* cntx )
|
||||
{
|
||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
||||
|
||||
// Set default kernel blocksizes and functions.
|
||||
bli_cntx_init_haswell_ref( cntx );
|
||||
@@ -69,6 +73,7 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized level-1f kernels.
|
||||
bli_cntx_set_l1f_kers
|
||||
(
|
||||
4,
|
||||
@@ -118,12 +123,18 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
||||
#if 1
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1008, 1008, 1008, 1008 );
|
||||
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 72, 36 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 75, 192 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
|
||||
#else
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 16, 8, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 6, 6, 3, 3 );
|
||||
#endif
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
|
||||
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1024, 1024, 1024, 1024 );
|
||||
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 112, 64, 56, 32 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 112, 72, 56, 44 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
|
||||
#endif
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 );
|
||||
@@ -144,5 +155,62 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||
cntx
|
||||
);
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 201, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 100, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 120, -1, -1 );
|
||||
|
||||
// Initialize the context with the sup thresholds.
|
||||
bli_cntx_set_l3_sup_thresh
|
||||
(
|
||||
3,
|
||||
BLIS_MT, &thresh[ BLIS_MT ],
|
||||
BLIS_NT, &thresh[ BLIS_NT ],
|
||||
BLIS_KT, &thresh[ BLIS_KT ],
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized small/unpacked gemm kernels.
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
8,
|
||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||
// values.
|
||||
// s d c z
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1,
|
||||
-1, 9, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for small/unpacked level-3 problems.
|
||||
bli_cntx_set_l3_sup_blkszs
|
||||
(
|
||||
5,
|
||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -36,7 +37,6 @@
|
||||
//#define BLIS_FAMILY_H
|
||||
|
||||
|
||||
|
||||
#if 0
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------
|
||||
|
||||
|
||||
@@ -63,13 +63,13 @@ endif
|
||||
# Flags specific to optimized kernels.
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2
|
||||
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell
|
||||
else
|
||||
ifeq ($(CC_VENDOR),icc)
|
||||
CKVECFLAGS := -xCORE-AVX2
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2
|
||||
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell
|
||||
else
|
||||
$(error gcc, icc, or clang is required for this configuration.)
|
||||
endif
|
||||
@@ -78,7 +78,11 @@ endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS) #-funsafe-math-optimizations
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
|
||||
@@ -78,7 +78,11 @@ endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
|
||||
@@ -70,7 +70,11 @@ endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Override the default value for LDFLAGS.
|
||||
LDFLAGS := -mmic
|
||||
|
||||
@@ -99,7 +99,7 @@ endif
|
||||
# Note: We use AVX2 for reference kernels instead of AVX-512.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd
|
||||
CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations
|
||||
else
|
||||
ifeq ($(CC_VENDOR),icc)
|
||||
CRVECFLAGS := -xMIC-AVX512
|
||||
|
||||
@@ -57,7 +57,7 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O2 -fomit-frame-pointer
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
@@ -78,7 +78,11 @@ endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
|
||||
@@ -57,16 +57,16 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O2 -fomit-frame-pointer
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver2
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver2
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
|
||||
else
|
||||
$(error gcc or clang are required for this configuration.)
|
||||
endif
|
||||
@@ -74,7 +74,11 @@ endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
|
||||
@@ -63,13 +63,13 @@ endif
|
||||
# Flags specific to optimized kernels.
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx
|
||||
CKVECFLAGS := -mavx -mfpmath=sse -march=sandybridge
|
||||
else
|
||||
ifeq ($(CC_VENDOR),icc)
|
||||
CKVECFLAGS := -xAVX
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CKVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx
|
||||
CKVECFLAGS := -mavx -mfpmath=sse -march=sandybridge
|
||||
else
|
||||
$(error gcc, icc, or clang is required for this configuration.)
|
||||
endif
|
||||
@@ -78,7 +78,11 @@ endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
|
||||
@@ -89,7 +89,7 @@ endif
|
||||
# to overcome the AVX-512 frequency drop". (Issue #187)
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd
|
||||
CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations
|
||||
else
|
||||
ifeq ($(CC_VENDOR),icc)
|
||||
CRVECFLAGS := -xCORE-AVX2
|
||||
|
||||
@@ -57,16 +57,16 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O2 -fomit-frame-pointer
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver3
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver3 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver3
|
||||
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver3 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
|
||||
else
|
||||
$(error gcc or clang are required for this configuration.)
|
||||
endif
|
||||
@@ -74,7 +74,11 @@ endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
|
||||
@@ -57,7 +57,7 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O2
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
|
||||
@@ -78,7 +78,11 @@ endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
|
||||
@@ -35,9 +35,12 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )
|
||||
|
||||
void bli_cntx_init_zen( cntx_t* cntx )
|
||||
{
|
||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
||||
|
||||
// Set default kernel blocksizes and functions.
|
||||
bli_cntx_init_zen_ref( cntx );
|
||||
@@ -114,23 +117,27 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
|
||||
/*
|
||||
Multi Instance performance improvement of DGEMM when bound to a CCX
|
||||
In Multi instance each thread runs a sequential DGEMM.
|
||||
|
||||
a) If BLIS is run in a multi instance mode with
|
||||
CPU freq 2.6/2.2 Ghz
|
||||
DDR4 clock frequency 2400Mhz
|
||||
Multi Instance performance degradation on different cores
|
||||
a) CPU freq 2.6 Ghz
|
||||
DDR4 2400
|
||||
Multi instance mode
|
||||
mc = 240, kc = 512, and nc = 2040
|
||||
has better performance on EPYC server, over the default block sizes.
|
||||
|
||||
b) CPU freq 2.4Ghz
|
||||
DDR4 2400
|
||||
Multi Instance mode
|
||||
either
|
||||
mc = 240, kc = 512 and nc = 2040
|
||||
(or)
|
||||
mc = 390, kc = 512 and nc = 4080
|
||||
|
||||
b) If BLIS is run in Single Instance mode
|
||||
c) Higher frequency(3.1Ghz), single instance mode choose default value
|
||||
mc = 510, kc = 1024 and nc = 4080
|
||||
|
||||
*/
|
||||
|
||||
// Zen optimized level 3 cache block sizes
|
||||
#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
|
||||
|
||||
#if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES
|
||||
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 );
|
||||
@@ -138,7 +145,6 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
|
||||
|
||||
#else
|
||||
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );
|
||||
@@ -150,9 +156,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
//bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||
|
||||
@@ -172,5 +176,62 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||
cntx
|
||||
);
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 100, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 120, -1, -1 );
|
||||
|
||||
// Initialize the context with the sup thresholds.
|
||||
bli_cntx_set_l3_sup_thresh
|
||||
(
|
||||
3,
|
||||
BLIS_MT, &thresh[ BLIS_MT ],
|
||||
BLIS_NT, &thresh[ BLIS_NT ],
|
||||
BLIS_KT, &thresh[ BLIS_KT ],
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized small/unpacked gemm kernels.
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
8,
|
||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||
// values.
|
||||
// s d c z
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1,
|
||||
-1, 9, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for small/unpacked level-3 problems.
|
||||
bli_cntx_set_l3_sup_blkszs
|
||||
(
|
||||
5,
|
||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -39,14 +39,13 @@
|
||||
// By default, it is effective to parallelize the outer loops.
|
||||
// Setting these macros to 1 will force JR and IR inner loops
|
||||
// to not be parallelized.
|
||||
#define BLIS_THREAD_MAX_IR 1
|
||||
#define BLIS_THREAD_MAX_JR 1
|
||||
#define BLIS_DEFAULT_MR_THREAD_MAX 1
|
||||
#define BLIS_DEFAULT_NR_THREAD_MAX 1
|
||||
|
||||
#define BLIS_ENABLE_ZEN_BLOCK_SIZES
|
||||
#define BLIS_ENABLE_SMALL_MATRIX
|
||||
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
|
||||
|
||||
// This will select the threshold below which small matrix code will be called.
|
||||
#define BLIS_SMALL_MATRIX_THRES 700
|
||||
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
|
||||
@@ -64,6 +63,15 @@
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES 90
|
||||
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22
|
||||
// Allow the sup implementation to combine some small edge case iterations in
|
||||
// the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the
|
||||
// block-panel algorithm (NR) with the last full iteration that precedes it.
|
||||
// NOTE: These cpp macros need to be explicitly set to an integer since they
|
||||
// are used at compile-time to create unconditional branches or dead code
|
||||
// regions.
|
||||
#define BLIS_ENABLE_SUP_MR_EXT 1
|
||||
#define BLIS_ENABLE_SUP_NR_EXT 0
|
||||
|
||||
|
||||
|
||||
//#endif
|
||||
|
||||
@@ -46,10 +46,27 @@ AMD_CONFIG_FILE := amd_config.mk
|
||||
AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
|
||||
-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
|
||||
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS += -march=znver1
|
||||
endif
|
||||
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
$(eval $(call store-make-defs,$(THIS_CONFIG)))
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
{
|
||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
|
||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
||||
// Set default kernel blocksizes and functions.
|
||||
bli_cntx_init_zen2_ref( cntx );
|
||||
|
||||
@@ -135,5 +135,61 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||
cntx
|
||||
);
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 100, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 120, -1, -1 );
|
||||
|
||||
// Initialize the context with the sup thresholds.
|
||||
bli_cntx_set_l3_sup_thresh
|
||||
(
|
||||
3,
|
||||
BLIS_MT, &thresh[ BLIS_MT ],
|
||||
BLIS_NT, &thresh[ BLIS_NT ],
|
||||
BLIS_KT, &thresh[ BLIS_KT ],
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized small/unpacked gemm kernels.
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
8,
|
||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||
// values.
|
||||
// s d c z
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1,
|
||||
-1, 9, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for small/unpacked level-3 problems.
|
||||
bli_cntx_set_l3_sup_blkszs
|
||||
(
|
||||
5,
|
||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -33,36 +33,56 @@
|
||||
#
|
||||
#
|
||||
|
||||
# FLAGS that are specific to 'zen2' architecture are added here.
|
||||
# FLAGS that are common for all the AMD architectures are present in config/zen/amd_config.mk
|
||||
#
|
||||
|
||||
# Declare the name of the current configuration and add it to the
|
||||
# running list of configurations included by common.mk.
|
||||
THIS_CONFIG := zen2
|
||||
#CONFIGS_INCL += $(THIS_CONFIG)
|
||||
|
||||
# Include file containing common flags for all AMD architectures
|
||||
AMD_CONFIG_FILE := amd_config.mk
|
||||
AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
|
||||
-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
|
||||
#
|
||||
# --- Determine the C compiler and related flags ---
|
||||
#
|
||||
|
||||
# NOTE: The build system will append these variables with various
|
||||
# general-purpose/configuration-agnostic flags in common.mk. You
|
||||
# may specify additional flags here as needed.
|
||||
CPPROCFLAGS :=
|
||||
CMISCFLAGS :=
|
||||
CPICFLAGS :=
|
||||
CWARNFLAGS :=
|
||||
|
||||
ifneq ($(DEBUG_TYPE),off)
|
||||
CDBGFLAGS := -g
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O3 -fomit-frame-pointer
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
# gcc 9.0 (clang ?) or later:
|
||||
GCC_VERSION := $(strip $(shell gcc -dumpversion))
|
||||
ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
|
||||
CKVECFLAGS += -march=znver2
|
||||
#CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver2
|
||||
# gcc 6.0 (clang 4.0) or later:
|
||||
else
|
||||
CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
|
||||
endif
|
||||
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 -mno-avx256-split-unaligned-store
|
||||
# gcc 4.9 (clang 3.5) or later:
|
||||
# possibly add zen-specific instructions: -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt
|
||||
#CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
|
||||
else
|
||||
$(error gcc or clang are required for this configuration.)
|
||||
endif
|
||||
endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
$(eval $(call store-make-defs,$(THIS_CONFIG)))
|
||||
|
||||
328
configure
vendored
@@ -51,8 +51,6 @@ print_usage()
|
||||
#echo " "
|
||||
#echo " BLIS ${version}"
|
||||
echo " "
|
||||
echo " Field G. Van Zee"
|
||||
echo " "
|
||||
echo " Configure BLIS's build system for compilation using a specified"
|
||||
echo " configuration directory."
|
||||
echo " "
|
||||
@@ -72,30 +70,37 @@ print_usage()
|
||||
echo " "
|
||||
echo " -p PREFIX, --prefix=PREFIX"
|
||||
echo " "
|
||||
echo " The path to which make will install all build products."
|
||||
echo " If given, this option implies the following options:"
|
||||
echo " --libdir=PREFIX/lib"
|
||||
echo " --incdir=PREFIX/include"
|
||||
echo " The common installation prefix for all files. If given,"
|
||||
echo " this option effectively implies:"
|
||||
echo " --libdir=EXECPREFIX/lib"
|
||||
echo " --includedir=PREFIX/include"
|
||||
echo " --sharedir=PREFIX/share"
|
||||
echo " If not given, PREFIX defaults to \$(HOME)/blis. If PREFIX"
|
||||
echo " where EXECPREFIX defaults to PREFIX. If this option is"
|
||||
echo " not given, PREFIX defaults to '${prefix_def}'. If PREFIX"
|
||||
echo " refers to a directory that does not exist, it will be"
|
||||
echo " created."
|
||||
echo " "
|
||||
echo " --exec-prefix=EXECPREFIX"
|
||||
echo " "
|
||||
echo " The installation prefix for libraries. Specifically, if"
|
||||
echo " given, this option effectively implies:"
|
||||
echo " --libdir=EXECPREFIX/lib"
|
||||
echo " If not given, EXECPREFIX defaults to PREFIX, which may be"
|
||||
echo " modified by the --prefix option. If EXECPREFIX refers to"
|
||||
echo " a directory that does not exist, it will be created."
|
||||
echo " "
|
||||
echo " --libdir=LIBDIR"
|
||||
echo " "
|
||||
echo " The path to which make will install libraries. If given,"
|
||||
echo " LIBDIR will override the corresponding directory implied"
|
||||
echo " by --prefix; if not not given, LIBDIR defaults to"
|
||||
echo " PREFIX/lib. If LIBDIR refers to a directory that does"
|
||||
echo " not exist, it will be created."
|
||||
echo " The path to which make will install libraries. If not"
|
||||
echo " given, LIBDIR defaults to PREFIX/lib. If LIBDIR refers to"
|
||||
echo " a directory that does not exist, it will be created."
|
||||
echo " "
|
||||
echo " --includedir=INCDIR"
|
||||
echo " "
|
||||
echo " The path to which make will install development header"
|
||||
echo " files. If given, INCDIR will override the corresponding"
|
||||
echo " directory implied by --prefix; if not given, INCDIR"
|
||||
echo " defaults to PREFIX/include. If INCDIR refers to a"
|
||||
echo " directory that does not exist, it will be created."
|
||||
echo " files. If not given, INCDIR defaults to PREFIX/include."
|
||||
echo " If INCDIR refers to a directory that does not exist, it"
|
||||
echo " will be created."
|
||||
echo " "
|
||||
echo " --sharedir=SHAREDIR"
|
||||
echo " "
|
||||
@@ -104,18 +109,9 @@ print_usage()
|
||||
echo " and LDFLAGS). These files allow certain BLIS makefiles,"
|
||||
echo " such as those in the examples or testsuite directories, to"
|
||||
echo " operate on an installed copy of BLIS rather than a local"
|
||||
echo " (and possibly uninstalled) copy. If given, SHAREDIR will"
|
||||
echo " override the corresponding directory implied by --prefix;"
|
||||
echo " if not given, SHAREDIR defaults to PREFIX/share. If"
|
||||
echo " SHAREDIR refers to a directory that does not exist, it"
|
||||
echo " will be created."
|
||||
echo " "
|
||||
echo " -d DEBUG, --enable-debug[=DEBUG]"
|
||||
echo " "
|
||||
echo " Enable debugging symbols in the library. If argument"
|
||||
echo " DEBUG is given as 'opt', then optimization flags are"
|
||||
echo " kept in the framework, otherwise optimization is"
|
||||
echo " turned off."
|
||||
echo " (and possibly uninstalled) copy. If not given, SHAREDIR"
|
||||
echo " defaults to PREFIX/share. If SHAREDIR refers to a"
|
||||
echo " directory that does not exist, it will be created."
|
||||
echo " "
|
||||
echo " --enable-verbose-make, --disable-verbose-make"
|
||||
echo " "
|
||||
@@ -129,6 +125,13 @@ print_usage()
|
||||
echo " even if the command plus command line arguments exceeds"
|
||||
echo " the operating system limit (ARG_MAX)."
|
||||
echo " "
|
||||
echo " -d DEBUG, --enable-debug[=DEBUG]"
|
||||
echo " "
|
||||
echo " Enable debugging symbols in the library. If argument"
|
||||
echo " DEBUG is given as 'opt', then optimization flags are"
|
||||
echo " kept in the framework, otherwise optimization is"
|
||||
echo " turned off."
|
||||
echo " "
|
||||
echo " --disable-static, --enable-static"
|
||||
echo " "
|
||||
echo " Disable (enabled by default) building BLIS as a static"
|
||||
@@ -141,6 +144,23 @@ print_usage()
|
||||
echo " library. If the shared library build is disabled, the"
|
||||
echo " static library build must remain enabled."
|
||||
echo " "
|
||||
echo " -e SYMBOLS, --export-shared[=SYMBOLS]"
|
||||
echo " "
|
||||
echo " Specify the subset of library symbols that are exported"
|
||||
echo " within a shared library. Valid values for SYMBOLS are:"
|
||||
echo " 'public' (the default) and 'all'. By default, only"
|
||||
echo " functions and variables that belong to public APIs are"
|
||||
echo " exported in shared libraries. However, the user may"
|
||||
echo " instead export all symbols in BLIS, even those that were"
|
||||
echo " intended for internal use only. Note that the public APIs"
|
||||
echo " encompass all functions that almost any user would ever"
|
||||
echo " want to call, including the BLAS/CBLAS compatibility APIs"
|
||||
echo " as well as the basic and expert interfaces to the typed"
|
||||
echo " and object APIs that are unique to BLIS. Also note that"
|
||||
echo " changing this option to 'all' will have no effect in some"
|
||||
echo " environments, such as when compiling with clang on"
|
||||
echo " Windows."
|
||||
echo " "
|
||||
echo " -t MODEL, --enable-threading[=MODEL], --disable-threading"
|
||||
echo " "
|
||||
echo " Enable threading in the library, using threading model"
|
||||
@@ -222,6 +242,16 @@ print_usage()
|
||||
echo " only be enabled when mixed domain/precision support is"
|
||||
echo " enabled."
|
||||
echo " "
|
||||
echo " --disable-sup-handling, --enable-sup-handling"
|
||||
echo " "
|
||||
echo " Disable (enabled by default) handling of small/skinny"
|
||||
echo " matrix problems via separate code branches. When disabled,"
|
||||
echo " these small/skinny level-3 operations will be performed by"
|
||||
echo " the conventional implementation, which is optimized for"
|
||||
echo " medium and large problems. Note that what qualifies as"
|
||||
echo " \"small\" depends on thresholds that may vary by sub-"
|
||||
echo " configuration."
|
||||
echo " "
|
||||
echo " -s NAME --enable-sandbox=NAME"
|
||||
echo " "
|
||||
echo " Enable a separate sandbox implementation of gemm. This"
|
||||
@@ -278,6 +308,7 @@ print_usage()
|
||||
echo " Environment Variables:"
|
||||
echo " "
|
||||
echo " CC Specifies the C compiler to use."
|
||||
echo " CXX Specifies the C++ compiler to use (sandbox only)."
|
||||
echo " RANLIB Specifies the ranlib executable to use."
|
||||
echo " AR Specifies the archiver to use."
|
||||
echo " CFLAGS Specifies additional compiler flags to use (prepended)."
|
||||
@@ -1016,7 +1047,7 @@ auto_detect()
|
||||
# Set the linker flags. We need pthreads because it is needed for
|
||||
# parts of bli_arch.c unrelated to bli_arch_string(), which is called
|
||||
# by the main() function in ${main_c}.
|
||||
if [ $is_win = no ]; then
|
||||
if [[ $is_win == no || "$cc_vendor" != "clang" ]]; then
|
||||
ldflags="${LIBPTHREAD--lpthread}"
|
||||
fi
|
||||
|
||||
@@ -1294,8 +1325,7 @@ get_compiler_version()
|
||||
# to OS X's egrep only returning the first match.
|
||||
cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM' | { read first rest ; echo $first ; })
|
||||
if [ "${cc_vendor}" = "icc" -o \
|
||||
"${cc_vendor}" = "gcc" -o \
|
||||
"${cc_vendor}" = "clang" ]; then
|
||||
"${cc_vendor}" = "gcc" ]; then
|
||||
cc_version=$(${cc} -dumpversion)
|
||||
else
|
||||
cc_version=$(echo "${vendor_string}" | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; })
|
||||
@@ -1343,7 +1373,7 @@ check_compiler()
|
||||
# Specific:
|
||||
#
|
||||
# skx: icc 15.0.1+, gcc 6.0+, clang 3.9+
|
||||
# knl: icc 14.0.1+, gcc 5.0+, clang 3.5+
|
||||
# knl: icc 14.0.1+, gcc 5.0+, clang 3.9+
|
||||
# haswell: any
|
||||
# sandybridge: any
|
||||
# penryn: any
|
||||
@@ -1418,27 +1448,42 @@ check_compiler()
|
||||
|
||||
# clang
|
||||
if [ "x${cc_vendor}" = "xclang" ]; then
|
||||
|
||||
if [ ${cc_major} -lt 3 ]; then
|
||||
echoerr_unsupportedcc
|
||||
fi
|
||||
if [ ${cc_major} -eq 3 ]; then
|
||||
if [ ${cc_minor} -lt 3 ]; then
|
||||
if [ "$(echo ${vendor_string} | grep -o Apple)" = "Apple" ]; then
|
||||
if [ ${cc_major} -lt 5 ]; then
|
||||
echoerr_unsupportedcc
|
||||
fi
|
||||
if [ ${cc_minor} -lt 5 ]; then
|
||||
# See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
|
||||
if [ ${cc_major} -eq 5 ]; then
|
||||
# Apple clang 5.0 is clang 3.4svn
|
||||
blacklistcc_add "excavator"
|
||||
blacklistcc_add "zen"
|
||||
blacklistcc_add "knl"
|
||||
fi
|
||||
if [ ${cc_minor} -lt 9 ]; then
|
||||
if [ ${cc_major} -lt 7 ]; then
|
||||
blacklistcc_add "knl"
|
||||
blacklistcc_add "skx"
|
||||
fi
|
||||
fi
|
||||
if [ ${cc_major} -lt 4 ]; then
|
||||
# See comment above regarding zen support.
|
||||
#blacklistcc_add "zen"
|
||||
: # explicit no-op since bash can't handle empty loop bodies.
|
||||
else
|
||||
if [ ${cc_major} -lt 3 ]; then
|
||||
echoerr_unsupportedcc
|
||||
fi
|
||||
if [ ${cc_major} -eq 3 ]; then
|
||||
if [ ${cc_minor} -lt 3 ]; then
|
||||
echoerr_unsupportedcc
|
||||
fi
|
||||
if [ ${cc_minor} -lt 5 ]; then
|
||||
blacklistcc_add "excavator"
|
||||
blacklistcc_add "zen"
|
||||
fi
|
||||
if [ ${cc_minor} -lt 9 ]; then
|
||||
blacklistcc_add "knl"
|
||||
blacklistcc_add "skx"
|
||||
fi
|
||||
fi
|
||||
if [ ${cc_major} -lt 4 ]; then
|
||||
# See comment above regarding zen support.
|
||||
#blacklistcc_add "zen"
|
||||
: # explicit no-op since bash can't handle empty loop bodies.
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
}
|
||||
@@ -1496,8 +1541,8 @@ check_assembler()
|
||||
#
|
||||
|
||||
# The assembler on OS X won't recognize AVX-512 without help.
|
||||
if [ "$(uname -s)" == "Darwin" ]; then
|
||||
cflags="-Wa,-march=knl"
|
||||
if [ "${cc_vendor}" == "clang" ]; then
|
||||
cflags="-march=knl"
|
||||
fi
|
||||
|
||||
asm_fp=$(find ${asm_dir} -name "avx512f.s")
|
||||
@@ -1513,8 +1558,8 @@ check_assembler()
|
||||
#
|
||||
|
||||
# The assembler on OS X won't recognize AVX-512 without help.
|
||||
if [ "$(uname -s)" == "Darwin" ]; then
|
||||
cflags="-Wa,-march=skylake-avx512"
|
||||
if [ "${cc_vendor}" == "clang" ]; then
|
||||
cflags="-march=skylake-avx512"
|
||||
fi
|
||||
|
||||
asm_fp=$(find ${asm_dir} -name "avx512dq.s")
|
||||
@@ -1731,21 +1776,33 @@ main()
|
||||
|
||||
# -- configure options --
|
||||
|
||||
# The user-given install prefix and a flag indicating it was given.
|
||||
#install_prefix_def="${HOME}/blis"
|
||||
install_prefix_user=${HOME}/blis # default to this directory.
|
||||
# Define the default prefix so that the print_usage() function can
|
||||
# output it in the --help text.
|
||||
prefix_def='/usr/local'
|
||||
|
||||
# The installation prefix, assigned its default value, and a flag to
|
||||
# track whether or not it was given by the user.
|
||||
prefix=${prefix_def}
|
||||
prefix_flag=''
|
||||
|
||||
# The user-given install libdir and a flag indicating it was given.
|
||||
install_libdir_user=''
|
||||
# The installation exec_prefix, assigned its default value, and a flag to
|
||||
# track whether or not it was given by the user.
|
||||
exec_prefix='${prefix}'
|
||||
exec_prefix_flag=''
|
||||
|
||||
# The installation libdir, assigned its default value, and a flag to
|
||||
# track whether or not it was given by the user.
|
||||
libdir='${exec_prefix}/lib'
|
||||
libdir_flag=''
|
||||
|
||||
# The user-given install includedir and a flag indicating it was given.
|
||||
install_incdir_user=''
|
||||
incdir_flag=''
|
||||
# The installation includedir, assigned its default value, and a flag to
|
||||
# track whether or not it was given by the user.
|
||||
includedir='${prefix}/include'
|
||||
includedir_flag=''
|
||||
|
||||
# The user-given install sharedir and a flag indicating it was given.
|
||||
install_sharedir_user=''
|
||||
# The installation sharedir, assigned its default value, and a flag to
|
||||
# track whether or not it was given by the user.
|
||||
sharedir='${prefix}/share'
|
||||
sharedir_flag=''
|
||||
|
||||
# The preset value of CFLAGS and LDFLAGS (ie: compiler and linker flags
|
||||
@@ -1758,7 +1815,7 @@ main()
|
||||
debug_flag=''
|
||||
|
||||
# The threading flag.
|
||||
threading_model='no'
|
||||
threading_model='off'
|
||||
|
||||
# The method of assigning micropanels to threads in the JR and IR loops.
|
||||
thread_part_jrir='slab'
|
||||
@@ -1772,6 +1829,7 @@ main()
|
||||
enable_arg_max_hack='no'
|
||||
enable_static='yes'
|
||||
enable_shared='yes'
|
||||
export_shared='public'
|
||||
enable_pba_pools='yes'
|
||||
enable_sba_pools='yes'
|
||||
enable_mem_tracing='no'
|
||||
@@ -1781,6 +1839,7 @@ main()
|
||||
enable_cblas='no'
|
||||
enable_mixed_dt='yes'
|
||||
enable_mixed_dt_extra_mem='yes'
|
||||
enable_sup_handling='yes'
|
||||
enable_memkind='' # The default memkind value is determined later on.
|
||||
force_version='no'
|
||||
|
||||
@@ -1821,7 +1880,7 @@ main()
|
||||
|
||||
# Process our command line options.
|
||||
unset OPTIND
|
||||
while getopts ":hp:d:s:t:r:qci:b:-:" opt; do
|
||||
while getopts ":hp:d:e:s:t:r:qci:b:-:" opt; do
|
||||
case $opt in
|
||||
-)
|
||||
case "$OPTARG" in
|
||||
@@ -1833,19 +1892,23 @@ main()
|
||||
;;
|
||||
prefix=*)
|
||||
prefix_flag=1
|
||||
install_prefix_user=${OPTARG#*=}
|
||||
prefix=${OPTARG#*=}
|
||||
;;
|
||||
exec-prefix=*)
|
||||
exec_prefix_flag=1
|
||||
exec_prefix=${OPTARG#*=}
|
||||
;;
|
||||
libdir=*)
|
||||
libdir_flag=1
|
||||
install_libdir_user=${OPTARG#*=}
|
||||
libdir=${OPTARG#*=}
|
||||
;;
|
||||
includedir=*)
|
||||
incdir_flag=1
|
||||
install_incdir_user=${OPTARG#*=}
|
||||
includedir_flag=1
|
||||
includedir=${OPTARG#*=}
|
||||
;;
|
||||
sharedir=*)
|
||||
sharedir_flag=1
|
||||
install_sharedir_user=${OPTARG#*=}
|
||||
sharedir=${OPTARG#*=}
|
||||
;;
|
||||
enable-debug)
|
||||
debug_flag=1
|
||||
@@ -1882,15 +1945,18 @@ main()
|
||||
disable-shared)
|
||||
enable_shared='no'
|
||||
;;
|
||||
export-shared=*)
|
||||
export_shared=${OPTARG#*=}
|
||||
;;
|
||||
enable-threading=*)
|
||||
threading_model=${OPTARG#*=}
|
||||
;;
|
||||
disable-threading)
|
||||
threading_model='off'
|
||||
;;
|
||||
thread-part-jrir=*)
|
||||
thread_part_jrir=${OPTARG#*=}
|
||||
;;
|
||||
disable-threading)
|
||||
threading_model='no'
|
||||
;;
|
||||
enable-pba-pools)
|
||||
enable_pba_pools='yes'
|
||||
;;
|
||||
@@ -1946,6 +2012,12 @@ main()
|
||||
disable-mixed-dt-extra-mem)
|
||||
enable_mixed_dt_extra_mem='no'
|
||||
;;
|
||||
enable-sup-handling)
|
||||
enable_sup_handling='yes'
|
||||
;;
|
||||
disable-sup-handling)
|
||||
enable_sup_handling='no'
|
||||
;;
|
||||
with-memkind)
|
||||
enable_memkind='yes'
|
||||
;;
|
||||
@@ -1967,12 +2039,15 @@ main()
|
||||
;;
|
||||
p)
|
||||
prefix_flag=1
|
||||
install_prefix_user=$OPTARG
|
||||
prefix=$OPTARG
|
||||
;;
|
||||
d)
|
||||
debug_flag=1
|
||||
debug_type=$OPTARG
|
||||
;;
|
||||
e)
|
||||
export_shared=$OPTARG
|
||||
;;
|
||||
s)
|
||||
sandbox_flag=1
|
||||
sandbox=$OPTARG
|
||||
@@ -2459,54 +2534,49 @@ main()
|
||||
|
||||
# -- Prepare variables for substitution into template files ----------------
|
||||
|
||||
# Parse the status of the install prefix and echo feedback.
|
||||
# Parse the status of the prefix option and echo feedback.
|
||||
if [ -n "${prefix_flag}" ]; then
|
||||
echo "${script_name}: detected --prefix='${install_prefix_user}'."
|
||||
echo "${script_name}: detected --prefix='${prefix}'."
|
||||
else
|
||||
echo "${script_name}: no install prefix option given; defaulting to '${install_prefix_user}'."
|
||||
echo "${script_name}: no install prefix option given; defaulting to '${prefix}'."
|
||||
fi
|
||||
|
||||
# Set initial (candidate) values for the libdir and includedir using the
|
||||
# install prefix that was determined above.
|
||||
install_libdir=${install_prefix_user}/lib
|
||||
install_incdir=${install_prefix_user}/include
|
||||
install_sharedir=${install_prefix_user}/share
|
||||
# Parse the status of the exec_prefix option and echo feedback.
|
||||
if [ -n "${exec_prefix_flag}" ]; then
|
||||
echo "${script_name}: detected --exec-prefix='${exec_prefix}'."
|
||||
else
|
||||
echo "${script_name}: no install exec_prefix option given; defaulting to PREFIX."
|
||||
fi
|
||||
|
||||
# Set the install libdir, if it was specified. Note that this will override
|
||||
# the default libdir implied by the install prefix, even if both options
|
||||
# were given.
|
||||
# Parse the status of the libdir option and echo feedback.
|
||||
if [ -n "${libdir_flag}" ]; then
|
||||
echo "${script_name}: detected --libdir='${install_libdir_user}'."
|
||||
install_libdir=${install_libdir_user}
|
||||
echo "${script_name}: detected --libdir='${libdir}'."
|
||||
else
|
||||
echo "${script_name}: no install libdir option given; defaulting to PREFIX/lib."
|
||||
echo "${script_name}: no install libdir option given; defaulting to EXECPREFIX/lib."
|
||||
fi
|
||||
|
||||
# Set the install includedir, if it was specified. Note that this will
|
||||
# override the default includedir implied by the install prefix, even if
|
||||
# both options were given.
|
||||
if [ -n "${incdir_flag}" ]; then
|
||||
echo "${script_name}: detected --includedir='${install_incdir_user}'."
|
||||
install_incdir=${install_incdir_user}
|
||||
# Parse the status of the includedir option and echo feedback.
|
||||
if [ -n "${includedir_flag}" ]; then
|
||||
echo "${script_name}: detected --includedir='${includedir}'."
|
||||
else
|
||||
echo "${script_name}: no install includedir option given; defaulting to PREFIX/include."
|
||||
fi
|
||||
|
||||
# Set the install sharedir, if it was specified. Note that this will
|
||||
# override the default sharedir implied by the install prefix, even if
|
||||
# both options were given.
|
||||
# Parse the status of the sharedir option and echo feedback.
|
||||
if [ -n "${sharedir_flag}" ]; then
|
||||
echo "${script_name}: detected --sharedir='${install_sharedir_user}'."
|
||||
install_sharedir=${install_sharedir_user}
|
||||
echo "${script_name}: detected --sharedir='${sharedir}'."
|
||||
else
|
||||
echo "${script_name}: no install sharedir option given; defaulting to PREFIX/share."
|
||||
fi
|
||||
|
||||
# Echo the installation directories that we settled on.
|
||||
echo "${script_name}: final installation directories:"
|
||||
echo "${script_name}: libdir: ${install_libdir}"
|
||||
echo "${script_name}: includedir: ${install_incdir}"
|
||||
echo "${script_name}: sharedir: ${install_sharedir}"
|
||||
echo "${script_name}: prefix: "${prefix}
|
||||
echo "${script_name}: exec_prefix: "${exec_prefix}
|
||||
echo "${script_name}: libdir: "${libdir}
|
||||
echo "${script_name}: includedir: "${includedir}
|
||||
echo "${script_name}: sharedir: "${sharedir}
|
||||
echo "${script_name}: NOTE: the variables above can be overridden when running make."
|
||||
|
||||
# Check if CFLAGS is non-empty.
|
||||
if [ -n "${CFLAGS}" ]; then
|
||||
@@ -2573,6 +2643,23 @@ main()
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if the "export shared" flag was specified.
|
||||
if [ "x${export_shared}" = "xall" ]; then
|
||||
if [ "x${enable_shared}" = "xyes" ]; then
|
||||
echo "${script_name}: exporting all symbols within shared library."
|
||||
else
|
||||
echo "${script_name}: ignoring request to export all symbols within shared library."
|
||||
fi
|
||||
elif [ "x${export_shared}" = "xpublic" ]; then
|
||||
if [ "x${enable_shared}" = "xyes" ]; then
|
||||
echo "${script_name}: exporting only public symbols within shared library."
|
||||
fi
|
||||
else
|
||||
echo "${script_name}: *** Invalid argument '${export_shared}' to --export-shared option given."
|
||||
echo "${script_name}: *** Please use 'public' or 'all'."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check the threading model flag and standardize its value, if needed.
|
||||
# NOTE: 'omp' is deprecated but still supported; 'openmp' is preferred.
|
||||
enable_openmp='no'
|
||||
@@ -2594,9 +2681,11 @@ main()
|
||||
enable_pthreads='yes'
|
||||
enable_pthreads_01=1
|
||||
threading_model="pthreads" # Standardize the value.
|
||||
elif [ "x${threading_model}" = "xno" ] ||
|
||||
elif [ "x${threading_model}" = "xoff" ] ||
|
||||
[ "x${threading_model}" = "xno" ] ||
|
||||
[ "x${threading_model}" = "xnone" ]; then
|
||||
echo "${script_name}: threading is disabled."
|
||||
threading_model="off"
|
||||
else
|
||||
echo "${script_name}: *** Unsupported threading model: ${threading_model}."
|
||||
exit 1
|
||||
@@ -2707,6 +2796,13 @@ main()
|
||||
enable_mixed_dt_extra_mem_01=0
|
||||
enable_mixed_dt_01=0
|
||||
fi
|
||||
if [ "x${enable_sup_handling}" = "xyes" ]; then
|
||||
echo "${script_name}: small matrix handling is enabled."
|
||||
enable_sup_handling_01=1
|
||||
else
|
||||
echo "${script_name}: small matrix handling is disabled."
|
||||
enable_sup_handling_01=0
|
||||
fi
|
||||
|
||||
# Report integer sizes.
|
||||
if [ "x${int_type_size}" = "x32" ]; then
|
||||
@@ -2758,13 +2854,15 @@ main()
|
||||
# Variables that may contain forward slashes, such as paths, need extra
|
||||
# escaping when used in sed commands. We insert those extra escape
|
||||
# characters here so that the sed commands below do the right thing.
|
||||
os_name_esc=$(echo "${os_name}" | sed 's/\//\\\//g')
|
||||
install_libdir_esc=$(echo "${install_libdir}" | sed 's/\//\\\//g')
|
||||
install_incdir_esc=$(echo "${install_incdir}" | sed 's/\//\\\//g')
|
||||
install_sharedir_esc=$(echo "${install_sharedir}" | sed 's/\//\\\//g')
|
||||
dist_path_esc=$(echo "${dist_path}" | sed 's/\//\\\//g')
|
||||
cc_esc=$(echo "${found_cc}" | sed 's/\//\\\//g')
|
||||
cxx_esc=$(echo "${found_cxx}" | sed 's/\//\\\//g')
|
||||
os_name_esc=$(echo "${os_name}" | sed 's/\//\\\//g')
|
||||
prefix_esc=$(echo "${prefix}" | sed 's/\//\\\//g')
|
||||
exec_prefix_esc=$(echo "${exec_prefix}" | sed 's/\//\\\//g')
|
||||
libdir_esc=$(echo "${libdir}" | sed 's/\//\\\//g')
|
||||
includedir_esc=$(echo "${includedir}" | sed 's/\//\\\//g')
|
||||
sharedir_esc=$(echo "${sharedir}" | sed 's/\//\\\//g')
|
||||
dist_path_esc=$(echo "${dist_path}" | sed 's/\//\\\//g')
|
||||
cc_esc=$(echo "${found_cc}" | sed 's/\//\\\//g')
|
||||
cxx_esc=$(echo "${found_cxx}" | sed 's/\//\\\//g')
|
||||
#sandbox_relpath_esc=$(echo "${sandbox_relpath}" | sed 's/\//\\\//g')
|
||||
|
||||
# For RANLIB, if the variable is not set, we use a default value of
|
||||
@@ -2779,7 +2877,7 @@ main()
|
||||
# For Windows builds, clear the libpthread_esc variable so that
|
||||
# no pthreads library is substituted into config.mk. (Windows builds
|
||||
# employ an implementation of pthreads that is internal to BLIS.)
|
||||
if [ $is_win = yes ]; then
|
||||
if [[ $is_win == yes && "$cc_vendor" == "clang" ]]; then
|
||||
libpthread_esc=
|
||||
fi
|
||||
|
||||
@@ -2821,13 +2919,13 @@ main()
|
||||
|
||||
# -- Determine whether we are performing an out-of-tree build --------------
|
||||
|
||||
if [ ${dist_path} != "./" ]; then
|
||||
if [ "${dist_path}" != "./" ]; then
|
||||
|
||||
# At this point, we know the user did not run "./configure". But we
|
||||
# have not yet ruled out "<fullpath>/configure" or some equivalent
|
||||
# that uses relative paths. To further rule out these possibilities,
|
||||
# we create a dummy file in the current build directory.
|
||||
touch ./${dummy_file}
|
||||
touch "./${dummy_file}"
|
||||
|
||||
# If the dummy file we just created in the current directory does not
|
||||
# appear in the source distribution path, then we are in a different
|
||||
@@ -2871,14 +2969,17 @@ main()
|
||||
| sed -e "s/@ldflags_preset@/${ldflags_preset_esc}/g" \
|
||||
| sed -e "s/@debug_type@/${debug_type}/g" \
|
||||
| sed -e "s/@threading_model@/${threading_model}/g" \
|
||||
| sed -e "s/@install_libdir@/${install_libdir_esc}/g" \
|
||||
| sed -e "s/@install_incdir@/${install_incdir_esc}/g" \
|
||||
| sed -e "s/@install_sharedir@/${install_sharedir_esc}/g" \
|
||||
| sed -e "s/@prefix@/${prefix_esc}/g" \
|
||||
| sed -e "s/@exec_prefix@/${exec_prefix_esc}/g" \
|
||||
| sed -e "s/@libdir@/${libdir_esc}/g" \
|
||||
| sed -e "s/@includedir@/${includedir_esc}/g" \
|
||||
| sed -e "s/@sharedir@/${sharedir_esc}/g" \
|
||||
| sed -e "s/@enable_verbose@/${enable_verbose}/g" \
|
||||
| sed -e "s/@configured_oot@/${configured_oot}/g" \
|
||||
| sed -e "s/@enable_arg_max_hack@/${enable_arg_max_hack}/g" \
|
||||
| sed -e "s/@enable_static@/${enable_static}/g" \
|
||||
| sed -e "s/@enable_shared@/${enable_shared}/g" \
|
||||
| sed -e "s/@export_shared@/${export_shared}/g" \
|
||||
| sed -e "s/@enable_blas@/${enable_blas}/g" \
|
||||
| sed -e "s/@enable_cblas@/${enable_cblas}/g" \
|
||||
| sed -e "s/@enable_memkind@/${enable_memkind}/g" \
|
||||
@@ -2910,6 +3011,7 @@ main()
|
||||
| sed -e "s/@enable_cblas@/${enable_cblas_01}/g" \
|
||||
| sed -e "s/@enable_mixed_dt@/${enable_mixed_dt_01}/g" \
|
||||
| sed -e "s/@enable_mixed_dt_extra_mem@/${enable_mixed_dt_extra_mem_01}/g" \
|
||||
| sed -e "s/@enable_sup_handling@/${enable_sup_handling_01}/g" \
|
||||
| sed -e "s/@enable_memkind@/${enable_memkind_01}/g" \
|
||||
| sed -e "s/@enable_pragma_omp_simd@/${enable_pragma_omp_simd_01}/g" \
|
||||
| sed -e "s/@enable_sandbox@/${enable_sandbox_01}/g" \
|
||||
|
||||
@@ -9,6 +9,9 @@
|
||||
* **[Step 3b: Testing (optional)](BuildSystem.md#step-3b-testing-optional)**
|
||||
* **[Step 4: Installation](BuildSystem.md#step-4-installation)**
|
||||
* **[Cleaning out build products](BuildSystem.md#cleaning-out-build-products)**
|
||||
* **[Compiling with BLIS](BuildSystem.md#compiling-with-blis)**
|
||||
* [Disabling BLAS prototypes](BuildSystem.md#disabling-blas-prototypes)
|
||||
* [CBLAS](BuildSystem.md#cblas)
|
||||
* **[Linking against BLIS](BuildSystem.md#linking-against-blis)**
|
||||
* **[Uninstalling](BuildSystem.md#uninstalling)**
|
||||
* **[make targets](BuildSystem.md#make-targets)**
|
||||
@@ -83,11 +86,11 @@ Alternatively, `configure` can automatically select a configuration based on you
|
||||
```
|
||||
$ ./configure auto
|
||||
```
|
||||
However, as of this writing, only a limited number of architectures are detected. If the `configure` script is not able to detect your architecture, the `generic` configuration will be used.
|
||||
However, as of this writing, only a limited number of architectures are detected. If the `configure` script is not able to detect your architecture, the `generic` configuration will be used.
|
||||
|
||||
Upon running configure, you will get output similar to the following. The exact output will depend on whether you cloned BLIS from a `git` repository or whether you obtained BLIS via a downloadable tarball from the [releases](https://github.com/flame/blis/releases) page.
|
||||
```
|
||||
$ ./configure haswell
|
||||
$ ./configure --prefix=$HOME/blis haswell
|
||||
configure: using 'gcc' compiler.
|
||||
configure: found gcc version 5.4.0 (maj: 5, min: 4, rev: 0).
|
||||
configure: checking for blacklisted configurations due to gcc 5.4.0.
|
||||
@@ -166,17 +169,11 @@ The installation prefix can be specified via the `--prefix=PREFIX` option:
|
||||
```
|
||||
$ ./configure --prefix=/usr <configname>
|
||||
```
|
||||
This will cause libraries to eventually be installed (via `make install`) to `PREFIX/lib` and development headers to be installed to `PREFIX/include`. (The default value of `PREFIX` is `$(HOME)/blis`.) You can also specify the library install directory separately from the development header install directory with the `--libdir=LIBDIR` and `--includedir=INCDIR` options, respectively:
|
||||
This will cause libraries to eventually be installed (via `make install`) to `PREFIX/lib` and development headers to be installed to `PREFIX/include`. (The default value of `PREFIX` is `/usr/local`.) You can also specify the library install directory separately from the development header install directory with the `--libdir=LIBDIR` and `--includedir=INCDIR` options, respectively:
|
||||
```
|
||||
$ ./configure --libdir=/usr/lib --includedir=/usr/include <configname>
|
||||
```
|
||||
The `--libdir=LIBDIR` and `--includedir=INCDIR` options will override any `PREFIX` path, whether it was specified explicitly via `--prefix` or implicitly (via the default). That is, `LIBDIR` defaults to `PREFIX/lib` and `INCDIR` defaults to `PREFIX/include`, but each will be overridden by their respective `--libdir`/`--includedir` options. So,
|
||||
```
|
||||
$ ./configure --libdir=/usr/lib <configname>
|
||||
|
||||
```
|
||||
will configure BLIS to install libraries to `/usr/lib` and header files to the default location (`$HOME/blis/include`).
|
||||
Also, note that `configure` will create any installation directories that do not already exist.
|
||||
The `--libdir=LIBDIR` and `--includedir=INCDIR` options will override any path implied by `PREFIX`, whether it was specified explicitly via `--prefix` or implicitly (via the default). That is, `LIBDIR` defaults to `EXECPREFIX/lib` (where `EXECPREFIX`, set via `--exec-prefix=EXECPREFIX`, defaults to `PREFIX`) and `INCDIR` defaults to `PREFIX/include`, but `LIBDIR` and `INCDIR` will each be overridden by their respective `--libdir`/`--includedir` options. There is a third related option, `--sharedir=SHAREDIR`, where `SHAREDIR` defaults to `PREFIX/share`. This option specifies the installation directory for certain makefile fragments that contain variables determined by `configure` (e.g. `CC`, `CFLAGS`, `LDFLAGS`, etc.). These files allow certain BLIS makefiles, such as those in the `examples` or `testsuite` directories, to operate on an installed copy of BLIS rather than a local (and possibly uninstalled) copy.
|
||||
|
||||
For a complete list of supported `configure` options and arguments, run `configure` with the `-h` option:
|
||||
```
|
||||
@@ -338,6 +335,47 @@ Removing include.
|
||||
Running the `distclean` target is like saying, "Remove anything ever created by the build system."
|
||||
|
||||
|
||||
## Compiling with BLIS
|
||||
|
||||
All BLIS definitions and prototypes may be included in your C source file by including a single header file, `blis.h`:
|
||||
```c
|
||||
#include "stdio.h"
|
||||
#include "stdlib.h"
|
||||
#include "otherstuff.h"
|
||||
#include "blis.h"
|
||||
```
|
||||
If the BLAS compatibility layer was enabled at configure-time (as it is by default), then `blis.h` will also provide BLAS prototypes to your source code.
|
||||
|
||||
|
||||
### Disabling BLAS prototypes
|
||||
|
||||
Some applications already `#include` a header that contains BLAS prototypes. This can cause problems if those applications also try to `#include` the BLIS header file, as shown above. Suppose for a moment that `otherstuff.h` in the example above already provides BLAS prototypes.
|
||||
```
|
||||
$ gcc -I/path/to/blis -I/path/to/otherstuff -c main.c -o main.o
|
||||
In file included from main.c:41:0:
|
||||
/path/to/blis/blis.h:36900:111: error: conflicting declaration of C function ‘int xerbla_(const bla_character*, const bla_integer*, ftnlen)’
|
||||
TEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);
|
||||
```
|
||||
If your application is already declaring (prototyping) BLAS functions, then you may disable those prototypes from being defined within `blis.h`. This prevents `blis.h` from re-declaring those prototypes or allows your other header to declare those functions for the first time, depending on the order in which you `#include` the headers.
|
||||
```c
|
||||
#include "stdio.h"
|
||||
#include "stdlib.h"
|
||||
#include "otherstuff.h"
|
||||
#define BLIS_DISABLE_BLAS_DEFS // disable BLAS prototypes within BLIS.
|
||||
#include "blis.h"
|
||||
```
|
||||
By `#defining` the `BLIS_DISABLE_BLAS_DEFS` macro, we signal to `blis.h` that it should skip over the BLAS prototypes, but otherwise `#include` everything else as it normally would. Note that `BLIS_DISABLE_BLAS_DEFS` must be `#defined` *prior* to the `#include "blis.h"` directive in order for it to have any effect.
|
||||
|
||||
|
||||
### CBLAS
|
||||
|
||||
If you build BLIS with CBLAS enabled and you wish to access CBLAS function prototypes from within your application, you will have to `#include` the `cblas.h` header separately from `blis.h`.
|
||||
```
|
||||
#include "blis.h"
|
||||
#include "cblas.h"
|
||||
```
|
||||
|
||||
|
||||
## Linking against BLIS
|
||||
|
||||
Once you have instantiated (configured and compiled, and perhaps installed) a BLIS library, you can link to it in your application's makefile as you would any other library. The following is an abbreviated makefile for a small hypothetical application that has just two external dependencies: BLIS and the standard C math library. We also link against libpthread since that library has been a runtime dependency of BLIS since 70640a3 (December 2017).
|
||||
@@ -357,7 +395,7 @@ OBJS = main.o util.o other.o
|
||||
%.o: %.c
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
all: $(OBJS)
|
||||
all: $(OBJS)
|
||||
$(LINKER) $(OBJS) $(BLIS_LIB) $(OTHER_LIBS) -o my_program.x
|
||||
```
|
||||
The above example assumes you will want to include BLIS definitions and function prototypes into your application via `#include "blis.h"`. (If you are only using BLIS via the BLAS compatibility layer, including `blis.h` is not necessary.) Since BLIS headers are installed into a `blis` subdirectory of `PREFIX/include`, you must make sure that the compiler knows where to find the `blis.h` header file. This is typically accomplished by inserting `#include "blis.h"` into your application's source code files and compiling the code with `-I PREFIX/include/blis`.
|
||||
|
||||
@@ -12,8 +12,8 @@ The following table lists architectures for which there exist optimized level-3
|
||||
A few remarks / reminders:
|
||||
* Optimizing only the [gemm microkernel](KernelsHowTo.md#gemm-microkernel) will result in optimal performance for all [level-3 operations](BLISTypedAPI#level-3-operations) except `trsm` (which will typically achieve 60 - 80% of attainable peak performance).
|
||||
* The [trsm](BLISTypedAPI#trsm) operation needs the [gemmtrsm microkernel(s)](KernelsHowTo.md#gemmtrsm-microkernels), in addition to the aforementioned [gemm microkernel](KernelsHowTo.md#gemm-microkernel), in order to reach optimal performance.
|
||||
* Induced complex (1m) implementations are employed in all situations where the real domain [gemm microkernel](KernelsHowTo.md#gemm-microkernel) of the corresponding precision is available. Please see our [ACM TOMS article on the 1m method](https://github.com/flame/blis#citations) for more info on this topic.
|
||||
* Some microarchitectures use the same sub-configuration. This is not a typo. For example, Haswell and Broadwell systems as well as "desktop" (non-server) versions of Skylake, Kabylake, and Coffeelake all use the `haswell` sub-configuration and the kernels registered therein.
|
||||
* Induced complex (1m) implementations are employed in all situations where the real domain [gemm microkernel](KernelsHowTo.md#gemm-microkernel) of the corresponding precision is available, but the "native" complex domain gemm microkernel is unavailable. Note that the table below lists native kernels, so if a microarchitecture lists only `sd`, support for both `c` and `z` datatypes will be provided via the 1m method. (Note: most people cannot tell the difference between native and 1m-based performance.) Please see our [ACM TOMS article on the 1m method](https://github.com/flame/blis#citations) for more info on this topic.
|
||||
* Some microarchitectures use the same sub-configuration. *This is not a typo.* For example, Haswell and Broadwell systems as well as "desktop" (non-server) versions of Skylake, Kaby Lake, and Coffee Lake all use the `haswell` sub-configuration and the kernels registered therein. Microkernels can be recycled in this manner because the key detail that determines level-3 performance outcomes is actually the vector ISA, not the microarchitecture. In the previous example, all of the microarchitectures listed support AVX2 (but not AVX-512), and therefore they can reuse the same microkernels.
|
||||
* Remember that you (usually) don't have to choose your sub-configuration manually! Instead, you can always request configure-time hardware detection via `./configure auto`. This will defer to internal logic (based on CPUID for x86_64 systems) that will attempt to choose the appropriate sub-configuration automatically.
|
||||
|
||||
| Vendor/Microarchitecture | BLIS sub-configuration | `gemm` | `gemmtrsm` |
|
||||
@@ -26,7 +26,7 @@ A few remarks / reminders:
|
||||
| Intel Core2 (SSE3) | `penryn` | `sd` | `d` |
|
||||
| Intel Sandy/Ivy Bridge (AVX/FMA3) | `sandybridge` | `sdcz` | |
|
||||
| Intel Haswell, Broadwell (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
|
||||
| Intel Sky/Kaby/Coffeelake (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
|
||||
| Intel Sky/Kaby/CoffeeLake (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
|
||||
| Intel Knights Landing (AVX-512/FMA3) | `knl` | `sd` | |
|
||||
| Intel SkylakeX (AVX-512/FMA3) | `skx` | `sd` | |
|
||||
| ARMv7 Cortex-A9 (NEON) | `cortex-a9` | `sd` | |
|
||||
|
||||
@@ -23,11 +23,17 @@
|
||||
|
||||
# Introduction
|
||||
|
||||
Our paper [Anatomy of High-Performance Many-Threaded Matrix Multiplication](https://github.com/flame/blis#citations), presented at IPDPS'14, identified 5 loops around the microkernel as opportunities for parallelization within level-3 operations such as `gemm`. Within BLIS, we have enabled parallelism for 4 of those loops and have extended it to the rest of the level-3 operations except for `trsm`.
|
||||
Our paper [Anatomy of High-Performance Many-Threaded Matrix Multiplication](https://github.com/flame/blis#citations), presented at IPDPS'14, identified five loops around the microkernel as opportunities for parallelization within level-3 operations such as `gemm`. Within BLIS, we have enabled parallelism for four of those loops, with the fifth planned for future work. This software architecture extends naturally to all level-3 operations except for `trsm`, where its application is necessarily limited to three of the five loops due to inter-iteration dependencies.
|
||||
|
||||
**IMPORTANT**: Multithreading in BLIS is disabled by default. Furthermore, even when multithreading is enabled, BLIS will default to single-threaded execution at runtime. In order to both *allow* and *invoke* parallelism from within BLIS operations, you must both *enable* multithreading at configure-time and *specify* multithreading at runtime.
|
||||
|
||||
To summarize: In order to observe multithreaded parallelism within a BLIS operation, you must do *both* of the following:
|
||||
1. Enable multithreading at configure-time. This is discussed in the [next section](docs/Multithreading.md#enabling-multithreading).
|
||||
2. Specify multithreading at runtime. This is also discussed [later on](docs/Multithreading.md#specifying-multithreading).
|
||||
|
||||
# Enabling multithreading
|
||||
|
||||
Note that BLIS disables multithreading by default. In order to extract multithreaded parallelism from BLIS, you must first enable multithreading explicitly at configure-time.
|
||||
BLIS disables multithreading by default. In order to allow multithreaded parallelism from BLIS, you must first enable multithreading explicitly at configure-time.
|
||||
|
||||
As of this writing, BLIS optionally supports multithreading via either OpenMP or POSIX threads.
|
||||
|
||||
@@ -101,7 +107,7 @@ This pattern--automatic or manual--holds regardless of which of the three method
|
||||
|
||||
Regardless of which method is employed, and which specific way within each method, after setting the number of threads, the application may call the desired level-3 operation (via either the [typed API](docs/BLISTypedAPI.md) or the [object API](docs/BLISObjectAPI.md)) and the operation will execute in a multithreaded manner. (When calling BLIS via the BLAS API, only the first two (global) methods are available.)
|
||||
|
||||
NOTE: Please be aware of what happens if you try to specify both the automatic and manual ways, as it could otherwise confuse new users. Regardless of which broad method is used, **if multithreading is specified via both the automatic and manual ways, the manual way will always take precedence.** Also, specifying parallelism for even *one* loop counts as specifying the manual way (in which case the ways of parallelism for the remaining loops will be assumed to be 1).
|
||||
**Note**: Please be aware of what happens if you try to specify both the automatic and manual ways, as it could otherwise confuse new users. Regardless of which broad method is used, **if multithreading is specified via both the automatic and manual ways, the manual way will always take precedence.** Also, specifying parallelism for even *one* loop counts as specifying the manual way (in which case the ways of parallelism for the remaining loops will be assumed to be 1).
|
||||
|
||||
## Globally via environment variables
|
||||
|
||||
@@ -109,6 +115,8 @@ The most common method of specifying multithreading in BLIS is globally via envi
|
||||
|
||||
Regardless of whether you end up using the automatic or manual way of expressing a request for multithreading, note that the environment variables are read (via `getenv()`) by BLIS **only once**, when the library is initialized. Subsequent to library initialization, the global settings for parallelization may only be changed via the [global runtime API](Multithreading.md#globally-at-runtime). If this constraint is not a problem, then environment variables may work fine for you. Otherwise, please consider [local settings](Multithreading.md#locally-at-runtime). (Local settings may be used at any time, regardless of whether global settings were explicitly specified, and local settings always override global settings.)
|
||||
|
||||
**Note**: Regardless of which way ([automatic](Multithreading.md#environment-variables-the-automatic-way) or [manual](Multithreading.md#environment-variables-the-manual-way)) environment variables are used to specify multithreading, that specification will affect operation of BLIS through **both** the BLAS compatibility layer as well as the native [typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md) APIs that are unique to BLIS.
|
||||
|
||||
### Environment variables: the automatic way
|
||||
|
||||
The automatic way of specifying parallelism entails simply setting the total number of threads you wish BLIS to employ in its parallelization. This total number of threads is captured by the `BLIS_NUM_THREADS` environment variable. You can set this variable prior to executing your BLIS-linked executable:
|
||||
@@ -119,7 +127,7 @@ $ ./my_blis_program
|
||||
```
|
||||
This causes BLIS to automatically determine a reasonable threading strategy based on what is known about the operation and problem size. If `BLIS_NUM_THREADS` is not set, BLIS will attempt to query the value of `OMP_NUM_THREADS`. If neither variable is set, the default number of threads is 1.
|
||||
|
||||
**Note:** We *highly* discourage use of the `OMP_NUM_THREADS` environment variable and may remove support for it in the future. If you wish to set parallelism globally via environment variables, please use `BLIS_NUM_THREADS`.
|
||||
**Note**: We *highly* discourage use of the `OMP_NUM_THREADS` environment variable and may remove support for it in the future. If you wish to set parallelism globally via environment variables, please use `BLIS_NUM_THREADS`.
|
||||
|
||||
### Environment variables: the manual way
|
||||
|
||||
@@ -127,7 +135,7 @@ The manual way of specifying parallelism involves communicating which loops with
|
||||
|
||||
The below chart describes the five loops used in BLIS's matrix multiplication operations.
|
||||
|
||||
| Loop around microkernel | Environment variable | Direction | Notes |
|
||||
| Loop around microkernel | Environment variable | Direction | Notes |
|
||||
|:-------------------------|:---------------------|:----------|:------------|
|
||||
| 5th loop | `BLIS_JC_NT` | `n` | |
|
||||
| 4th loop | _N/A_ | `k` | Not enabled |
|
||||
@@ -154,6 +162,8 @@ Next, which combinations of loops to parallelize depends on which caches are sha
|
||||
|
||||
If you still wish to set the parallelization scheme globally, but you want to do so at runtime, BLIS provides a thread-safe API for specifying multithreading. Think of these functions as a way to modify the same internal data structure into which the environment variables are read. (Recall that the environment variables are only read once, when BLIS is initialized).
|
||||
|
||||
**Note**: Regardless of which way ([automatic](Multithreading.md#globally-at-runtime-the-automatic-way) or [manual](Multithreading.md#globally-at-runtime-the-manual-way)) the global runtime API is used to specify multithreading, that specification will affect operation of BLIS through **both** the BLAS compatibility layer as well as the native [typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md) APIs that are unique to BLIS.
|
||||
|
||||
### Globally at runtime: the automatic way
|
||||
|
||||
If you simply want to specify an overall number of threads and let BLIS choose a thread factorization automatically, use the following function:
|
||||
@@ -193,6 +203,8 @@ In addition to the global methods based on environment variables and runtime fun
|
||||
|
||||
As with environment variables and the global runtime API, there are two ways to specify parallelism: the automatic way and the manual way. Both ways involve allocating a BLIS-specific object, initializing the object and encoding the desired parallelization, and then passing a pointer to the object into one of the expert interfaces of either the [typed](docs/BLISTypedAPI.md) or [object](docs/BLISObjectAPI.md) APIs. We provide examples of utilizing this threading object below.
|
||||
|
||||
**Note**: Neither way ([automatic](Multithreading.md#locally-at-runtime-the-automatic-way) nor [manual](Multithreading.md#locally-at-runtime-the-manual-way)) of specifying multithreading via the local runtime API can be used via the BLAS interfaces. The local runtime API may *only* be used via the native [typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md) APIs, which are unique to BLIS. (Furthermore, the expert interfaces of each API must be used. This is demonstrated later on in this section.)
|
||||
|
||||
### Initializing a rntm_t
|
||||
|
||||
Before specifying the parallelism (automatically or manually), you must first allocate a special BLIS object called a `rntm_t` (runtime). The object is quite small (about 64 bytes), and so we recommend allocating it statically on the function stack:
|
||||
|
||||
394
docs/Performance.md
Normal file
@@ -0,0 +1,394 @@
|
||||
# Contents
|
||||
|
||||
* **[Contents](Performance.md#contents)**
|
||||
* **[Introduction](Performance.md#introduction)**
|
||||
* **[General information](Performance.md#general-information)**
|
||||
* **[Level-3 performance](Performance.md#level-3-performance)**
|
||||
* **[ThunderX2](Performance.md#thunderx2)**
|
||||
* **[Experiment details](Performance.md#thunderx2-experiment-details)**
|
||||
* **[Results](Performance.md#thunderx2-results)**
|
||||
* **[SkylakeX](Performance.md#skylakex)**
|
||||
* **[Experiment details](Performance.md#skylakex-experiment-details)**
|
||||
* **[Results](Performance.md#skylakex-results)**
|
||||
* **[Haswell](Performance.md#haswell)**
|
||||
* **[Experiment details](Performance.md#haswell-experiment-details)**
|
||||
* **[Results](Performance.md#haswell-results)**
|
||||
* **[Epyc](Performance.md#epyc)**
|
||||
* **[Experiment details](Performance.md#epyc-experiment-details)**
|
||||
* **[Results](Performance.md#epyc-results)**
|
||||
* **[Feedback](Performance.md#feedback)**
|
||||
|
||||
# Introduction
|
||||
|
||||
This document showcases performance results for a representative sample of
|
||||
level-3 operations on large matrices with BLIS and BLAS for several hardware
|
||||
architectures.
|
||||
|
||||
# General information
|
||||
|
||||
Generally speaking, for level-3 operations on large matrices, we publish three
|
||||
"panels" for each type of hardware,
|
||||
each of which reports one of: single-threaded performance, multithreaded
|
||||
performance on a single socket, or multithreaded performance on two sockets.
|
||||
Each panel will consist of a 4x5 grid of graphs, with each row representing
|
||||
a different datatype (single real, double real, single complex, and double
|
||||
complex) and each column representing a different operation (`gemm`,
|
||||
`hemm`/`symm`, `herk`/`syrk`, `trmm`, and `trsm`).
|
||||
Each of the 20 graphs within a panel will contain an x-axis that reports
|
||||
problem size, with all matrix dimensions equal to the problem size (e.g.
|
||||
_m_ = _n_ = _k_), resulting in square matrices.
|
||||
The y-axis will report in units of GFLOPS (billions of floating-point operations
|
||||
per second) in the case of single-threaded performance, or GFLOPS/core in the
|
||||
case of single- or dual-socket multithreaded performance, where GFLOPS/core
|
||||
is simply the total GFLOPS observed divided by the number of threads utilized.
|
||||
This normalization is done intentionally in order to facilitate a visual
|
||||
assessment of the drop in efficiency of multithreaded performance relative
|
||||
to their single-threaded baselines.
|
||||
|
||||
It's also worth pointing out that the top of each graph (e.g. the maximum
|
||||
y-axis value depicted) _always_ corresponds to the theoretical peak performance
|
||||
under the conditions associated with that graph.
|
||||
Theoretical peak performance, in units of GFLOPS/core, is calculated as the
|
||||
product of:
|
||||
1. the maximum sustainable clock rate in GHz; and
|
||||
2. the maximum number of floating-point operations (flops) that can be
|
||||
executed per cycle (per core).
|
||||
|
||||
Note that the maximum sustainable clock rate may change depending on the
|
||||
conditions.
|
||||
For example, on some systems the maximum clock rate is higher when only one
|
||||
core is active (e.g. single-threaded performance) versus when all cores are
|
||||
active (e.g. multithreaded performance).
|
||||
The maximum number of flops executable per cycle (per core) is generally
|
||||
computed as the product of:
|
||||
1. the maximum number of fused multiply-add (FMA) vector instructions that
|
||||
can be issued per cycle (per core);
|
||||
2. the maximum number of elements that can be stored within a single vector
|
||||
register (for the datatype in question); and
|
||||
3. 2.0, since an FMA instruction fuses two operations (a multiply and an add).
|
||||
|
||||
The problem size range, represented on the x-axis, is usually sampled with 50
|
||||
equally-spaced problem sizes.
|
||||
For example, for single-threaded execution, we might choose to execute with
|
||||
problem sizes of 48 to 2400 in increments of 48, or 56 to 2800 in increments
|
||||
of 56.
|
||||
These values are almost never chosen for any particular (read: sneaky) reason;
|
||||
rather, we start with a "good" maximum problem size, such as 2400 or 2800, and
|
||||
then divide it by 50 to obtain the appropriate starting point and increment.
|
||||
|
||||
Finally, each point along each curve represents the best of three trials.
|
||||
|
||||
# Interpretation
|
||||
|
||||
In general, the curves associated with higher-performing implementations
|
||||
will appear higher in the graphs than lower-performing implementations.
|
||||
Ideally, an implementation will climb in performance (as a function of problem
|
||||
size) as quickly as possible and asymptotically approach some high fraction of
|
||||
peak performance.
|
||||
|
||||
Occasionally, we may publish graphs with incomplete curves--for example,
|
||||
only the first 25 data points in a typical 50-point series--usually because
|
||||
the implementation being tested was slow enough that it was not practical to
|
||||
allow it to finish.
|
||||
|
||||
Where along the x-axis you focus your attention will depend on the segment of
|
||||
the problem size range that you care about most. Some people's applications
|
||||
depend heavily on smaller problems, where "small" can mean anything from 10
|
||||
to 1000 or even higher. Some people consider 1000 to be quite large, while
|
||||
others insist that 5000 is merely "medium." What each of us considers to be
|
||||
small, medium, or large (naturally) depends heavily on the kinds of dense
|
||||
linear algebra problems we tend to encounter. No one is "right" or "wrong"
|
||||
about their characterization of matrix smallness or bigness since each person's
|
||||
relative frame of reference can vary greatly. That said, the
|
||||
[Science of High-Performance Computing](http://shpc.ices.utexas.edu/) group at
|
||||
[The University of Texas at Austin](https://www.utexas.edu/) tends to target
|
||||
matrices that it classifies as "medium-to-large", and so most of the graphs
|
||||
presented in this document will reflect that targeting in their x-axis range.
|
||||
|
||||
When corresponding with us, via email or when opening an
|
||||
[issue](https://github.com/flame/blis/issues) on github, we kindly ask that
|
||||
you specify as closely as possible (though a range is fine) your problem
|
||||
size of interest so that we can better assist you.
|
||||
|
||||
# Level-3 performance
|
||||
|
||||
## ThunderX2
|
||||
|
||||
### ThunderX2 experiment details
|
||||
|
||||
* Location: Unknown
|
||||
* Processor model: Marvell ThunderX2 CN9975
|
||||
* Core topology: two sockets, 28 cores per socket, 56 cores total
|
||||
* SMT status: disabled at boot-time
|
||||
* Max clock rate: 2.2GHz (single-core and multicore)
|
||||
* Max vector register length: 128 bits (NEON)
|
||||
* Max FMA vector IPC: 2
|
||||
* Peak performance:
|
||||
* single-core: 17.6 GFLOPS (double-precision), 35.2 GFLOPS (single-precision)
|
||||
* multicore: 17.6 GFLOPS/core (double-precision), 35.2 GFLOPS/core (single-precision)
|
||||
* Operating system: Ubuntu 16.04 (Linux kernel 4.15.0)
|
||||
* Compiler: gcc 7.3.0
|
||||
* Results gathered: 14 February 2019
|
||||
* Implementations tested:
|
||||
* BLIS 075143df (0.5.1-39)
|
||||
* configured with `./configure -t openmp thunderx2` (single- and multithreaded)
|
||||
* sub-configuration exercised: `thunderx2`
|
||||
* Single-threaded (1 core) execution requested via no change in environment variables
|
||||
* Multithreaded (28 core) execution requested via `export BLIS_JC_NT=4 BLIS_IC_NT=7`
|
||||
* Multithreaded (56 core) execution requested via `export BLIS_JC_NT=8 BLIS_IC_NT=7`
|
||||
* OpenBLAS 52d3f7a
|
||||
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
|
||||
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=56` (multithreaded, 56 cores)
|
||||
* Single-threaded (1 core) execution requested via `export OPENBLAS_NUM_THREADS=1`
|
||||
* Multithreaded (28 core) execution requested via `export OPENBLAS_NUM_THREADS=28`
|
||||
* Multithreaded (56 core) execution requested via `export OPENBLAS_NUM_THREADS=56`
|
||||
* ARMPL 18.4
|
||||
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
|
||||
* Multithreaded (28 core) execution requested via `export OMP_NUM_THREADS=28`
|
||||
* Multithreaded (56 core) execution requested via `export OMP_NUM_THREADS=56`
|
||||
* Affinity:
|
||||
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0 1 2 3 ... 55"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
|
||||
* Frequency throttling (via `cpupower`):
|
||||
* No changes made.
|
||||
* Comments:
|
||||
* ARMPL performance is remarkably uneven across datatypes and operations, though it would appear their "base" consists of OpenBLAS, which they then optimize for select, targeted routines. Unfortunately, we were unable to test the absolute latest versions of OpenBLAS and ARMPL on this hardware before we lost access. We will rerun these experiments once we gain access to a similar system.
|
||||
|
||||
### ThunderX2 results
|
||||
|
||||
#### pdf
|
||||
|
||||
* [ThunderX2 single-threaded](graphs/large/l3_perf_tx2_nt1.pdf)
|
||||
* [ThunderX2 multithreaded (28 cores)](graphs/large/l3_perf_tx2_jc4ic7_nt28.pdf)
|
||||
* [ThunderX2 multithreaded (56 cores)](graphs/large/l3_perf_tx2_jc8ic7_nt56.pdf)
|
||||
|
||||
#### png (inline)
|
||||
|
||||
* **ThunderX2 single-threaded**
|
||||

|
||||
* **ThunderX2 multithreaded (28 cores)**
|
||||

|
||||
* **ThunderX2 multithreaded (56 cores)**
|
||||

|
||||
|
||||
---
|
||||
|
||||
## SkylakeX
|
||||
|
||||
### SkylakeX experiment details
|
||||
|
||||
* Location: Oracle cloud
|
||||
* Processor model: Intel Xeon Platinum 8167M (SkylakeX/AVX-512)
|
||||
* Core topology: two sockets, 26 cores per socket, 52 cores total
|
||||
* SMT status: enabled, but not utilized
|
||||
* Max clock rate: 2.0GHz (single-core and multicore)
|
||||
* Max vector register length: 512 bits (AVX-512)
|
||||
* Max FMA vector IPC: 2
|
||||
* Peak performance:
|
||||
* single-core: 64 GFLOPS (double-precision), 128 GFLOPS (single-precision)
|
||||
* multicore: 64 GFLOPS/core (double-precision), 128 GFLOPS/core (single-precision)
|
||||
* Operating system: Ubuntu 18.04 (Linux kernel 4.15.0)
|
||||
* Compiler: gcc 7.3.0
|
||||
* Results gathered: 6 March 2019, 27 March 2019
|
||||
* Implementations tested:
|
||||
* BLIS 9f1dbe5 (0.5.1-54)
|
||||
* configured with `./configure -t openmp auto` (single- and multithreaded)
|
||||
* sub-configuration exercised: `skx`
|
||||
* Single-threaded (1 core) execution requested via no change in environment variables
|
||||
* Multithreaded (26 core) execution requested via `export BLIS_JC_NT=2 BLIS_IC_NT=13`
|
||||
* Multithreaded (52 core) execution requested via `export BLIS_JC_NT=4 BLIS_IC_NT=13`
|
||||
* OpenBLAS 0.3.5
|
||||
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
|
||||
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=52` (multithreaded, 52 cores)
|
||||
* Single-threaded (1 core) execution requested via `export OPENBLAS_NUM_THREADS=1`
|
||||
* Multithreaded (26 core) execution requested via `export OPENBLAS_NUM_THREADS=26`
|
||||
* Multithreaded (52 core) execution requested via `export OPENBLAS_NUM_THREADS=52`
|
||||
* Eigen 3.3.90
|
||||
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019)
|
||||
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
|
||||
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
|
||||
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
|
||||
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
|
||||
* Multithreaded (26 core) execution requested via `export OMP_NUM_THREADS=26`
|
||||
* Multithreaded (52 core) execution requested via `export OMP_NUM_THREADS=52`
|
||||
* **NOTE**: This version of Eigen does not provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm`, and therefore those curves are omitted from the multithreaded graphs.
|
||||
* MKL 2019 update 1
|
||||
* Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1`
|
||||
* Multithreaded (26 core) execution requested via `export MKL_NUM_THREADS=26`
|
||||
* Multithreaded (52 core) execution requested via `export MKL_NUM_THREADS=52`
|
||||
* Affinity:
|
||||
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0 1 2 3 ... 51"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
|
||||
* Frequency throttling (via `cpupower`):
|
||||
* Driver: acpi-cpufreq
|
||||
* Governor: performance
|
||||
* Hardware limits: 1.0GHz - 2.0GHz
|
||||
* Adjusted minimum: 2.0GHz
|
||||
* Comments:
|
||||
* MKL yields superb performance for most operations, though BLIS is not far behind except for `trsm`. (We understand the `trsm` underperformance and hope to address it in the future.) OpenBLAS lags far behind MKL and BLIS due to lack of full support for AVX-512, and possibly other reasons related to software architecture and register/cache blocksizes.
|
||||
|
||||
### SkylakeX results
|
||||
|
||||
#### pdf
|
||||
|
||||
* [SkylakeX single-threaded](graphs/large/l3_perf_skx_nt1.pdf)
|
||||
* [SkylakeX multithreaded (26 cores)](graphs/large/l3_perf_skx_jc2ic13_nt26.pdf)
|
||||
* [SkylakeX multithreaded (52 cores)](graphs/large/l3_perf_skx_jc4ic13_nt52.pdf)
|
||||
|
||||
#### png (inline)
|
||||
|
||||
* **SkylakeX single-threaded**
|
||||

|
||||
* **SkylakeX multithreaded (26 cores)**
|
||||

|
||||
* **SkylakeX multithreaded (52 cores)**
|
||||

|
||||
|
||||
---
|
||||
|
||||
## Haswell
|
||||
|
||||
### Haswell experiment details
|
||||
|
||||
* Location: TACC (Lonestar5)
|
||||
* Processor model: Intel Xeon E5-2690 v3 (Haswell)
|
||||
* Core topology: two sockets, 12 cores per socket, 24 cores total
|
||||
* SMT status: enabled, but not utilized
|
||||
* Max clock rate: 3.5GHz (single-core), 3.1GHz (multicore)
|
||||
* Max vector register length: 256 bits (AVX2)
|
||||
* Max FMA vector IPC: 2
|
||||
* Peak performance:
|
||||
* single-core: 56 GFLOPS (double-precision), 112 GFLOPS (single-precision)
|
||||
* multicore: 49.6 GFLOPS/core (double-precision), 99.2 GFLOPS/core (single-precision)
|
||||
* Operating system: Cray Linux Environment 6 (Linux kernel 4.4.103)
|
||||
* Compiler: gcc 6.3.0
|
||||
* Results gathered: 25-26 February 2019, 27 March 2019
|
||||
* Implementations tested:
|
||||
* BLIS 075143df (0.5.1-39)
|
||||
* configured with `./configure -t openmp auto` (single- and multithreaded)
|
||||
* sub-configuration exercised: `haswell`
|
||||
* Single-threaded (1 core) execution requested via no change in environment variables
|
||||
* Multithreaded (12 core) execution requested via `export BLIS_JC_NT=2 BLIS_IC_NT=3 BLIS_JR_NT=2`
|
||||
* Multithreaded (24 core) execution requested via `export BLIS_JC_NT=4 BLIS_IC_NT=3 BLIS_JR_NT=2`
|
||||
* OpenBLAS 0.3.5
|
||||
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
|
||||
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=24` (multithreaded, 24 cores)
|
||||
* Single-threaded (1 core) execution requested via `export OPENBLAS_NUM_THREADS=1`
|
||||
* Multithreaded (12 core) execution requested via `export OPENBLAS_NUM_THREADS=12`
|
||||
* Multithreaded (24 core) execution requested via `export OPENBLAS_NUM_THREADS=24`
|
||||
* Eigen 3.3.90
|
||||
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019)
|
||||
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
|
||||
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
|
||||
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
|
||||
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
|
||||
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12`
|
||||
* Multithreaded (24 core) execution requested via `export OMP_NUM_THREADS=24`
|
||||
* **NOTE**: This version of Eigen does not provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm`, and therefore those curves are omitted from the multithreaded graphs.
|
||||
* MKL 2018 update 2
|
||||
* Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1`
|
||||
* Multithreaded (12 core) execution requested via `export MKL_NUM_THREADS=12`
|
||||
* Multithreaded (24 core) execution requested via `export MKL_NUM_THREADS=24`
|
||||
* Affinity:
|
||||
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0 1 2 3 ... 23"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
|
||||
* Frequency throttling (via `cpupower`):
|
||||
* No changes made.
|
||||
* Comments:
|
||||
* We were pleasantly surprised by how competitively BLIS performs relative to MKL on this multicore Haswell system, which is a _very_ common microarchitecture, and _very_ similar to the more recent Broadwells, Skylakes (desktop), Kaby Lakes, and Coffee Lakes that succeeded it.
|
||||
|
||||
### Haswell results
|
||||
|
||||
#### pdf
|
||||
|
||||
* [Haswell single-threaded](graphs/large/l3_perf_has_nt1.pdf)
|
||||
* [Haswell multithreaded (12 cores)](graphs/large/l3_perf_has_jc2ic3jr2_nt12.pdf)
|
||||
* [Haswell multithreaded (24 cores)](graphs/large/l3_perf_has_jc4ic3jr2_nt24.pdf)
|
||||
|
||||
#### png (inline)
|
||||
|
||||
* **Haswell single-threaded**
|
||||

|
||||
* **Haswell multithreaded (12 cores)**
|
||||

|
||||
* **Haswell multithreaded (24 cores)**
|
||||

|
||||
|
||||
---
|
||||
|
||||
## Epyc
|
||||
|
||||
### Epyc experiment details
|
||||
|
||||
* Location: Oracle cloud
|
||||
* Processor model: AMD Epyc 7551 (Zen1)
|
||||
* Core topology: two sockets, 4 dies per socket, 2 core complexes (CCX) per die, 4 cores per CCX, 64 cores total
|
||||
* SMT status: enabled, but not utilized
|
||||
* Max clock rate: 3.0GHz (single-core), 2.55GHz (multicore)
|
||||
* Max vector register length: 256 bits (AVX2)
|
||||
* Max FMA vector IPC: 1
|
||||
* Alternatively, FMA vector IPC is 2 when vectors are limited to 128 bits each.
|
||||
* Peak performance:
|
||||
* single-core: 24 GFLOPS (double-precision), 48 GFLOPS (single-precision)
|
||||
* multicore: 20.4 GFLOPS/core (double-precision), 40.8 GFLOPS/core (single-precision)
|
||||
* Operating system: Ubuntu 18.04 (Linux kernel 4.15.0)
|
||||
* Compiler: gcc 7.3.0
|
||||
* Results gathered: 6 March 2019, 19 March 2019, 27 March 2019
|
||||
* Implementations tested:
|
||||
* BLIS 9f1dbe5 (0.5.1-54)
|
||||
* configured with `./configure -t openmp auto` (single- and multithreaded)
|
||||
* sub-configuration exercised: `zen`
|
||||
* Single-threaded (1 core) execution requested via no change in environment variables
|
||||
* Multithreaded (32 core) execution requested via `export BLIS_JC_NT=1 BLIS_IC_NT=8 BLIS_JR_NT=4`
|
||||
* Multithreaded (64 core) execution requested via `export BLIS_JC_NT=2 BLIS_IC_NT=8 BLIS_JR_NT=4`
|
||||
* OpenBLAS 0.3.5
|
||||
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
|
||||
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=64` (multithreaded, 64 cores)
|
||||
* Single-threaded (1 core) execution requested via `export OPENBLAS_NUM_THREADS=1`
|
||||
* Multithreaded (32 core) execution requested via `export OPENBLAS_NUM_THREADS=32`
|
||||
* Multithreaded (64 core) execution requested via `export OPENBLAS_NUM_THREADS=64`
|
||||
* Eigen 3.3.90
|
||||
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019)
|
||||
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
|
||||
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
|
||||
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
|
||||
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
|
||||
* Multithreaded (32 core) execution requested via `export OMP_NUM_THREADS=32`
|
||||
* Multithreaded (64 core) execution requested via `export OMP_NUM_THREADS=64`
|
||||
* **NOTE**: This version of Eigen does not provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm`, and therefore those curves are omitted from the multithreaded graphs.
|
||||
* MKL 2019 update 1
|
||||
* Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1`
|
||||
* Multithreaded (32 core) execution requested via `export MKL_NUM_THREADS=32`
|
||||
* Multithreaded (64 core) execution requested via `export MKL_NUM_THREADS=64`
|
||||
* Affinity:
|
||||
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0 1 2 3 ... 63"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
|
||||
* Frequency throttling (via `cpupower`):
|
||||
* Driver: acpi-cpufreq
|
||||
* Governor: performance
|
||||
* Hardware limits: 1.2GHz - 2.0GHz
|
||||
* Adjusted minimum: 2.0GHz
|
||||
* Comments:
|
||||
* MKL performance is dismal, despite being linked in the same manner as on the Xeon Platinum. It's not clear what is causing the slowdown. It could be that MKL's runtime kernel/blocksize selection logic is falling back to some older, more basic implementation because CPUID is not returning Intel as the hardware vendor. Alternatively, it's possible that MKL is trying to use kernels for the closest Intel architectures--say, Haswell/Broadwell--but its implementations use Haswell-specific optimizations that, due to microarchitectural differences, degrade performance on Zen.
|
||||
|
||||
### Epyc results
|
||||
|
||||
#### pdf
|
||||
|
||||
* [Epyc single-threaded](graphs/large/l3_perf_epyc_nt1.pdf)
|
||||
* [Epyc multithreaded (32 cores)](graphs/large/l3_perf_epyc_jc1ic8jr4_nt32.pdf)
|
||||
* [Epyc multithreaded (64 cores)](graphs/large/l3_perf_epyc_jc2ic8jr4_nt64.pdf)
|
||||
|
||||
#### png (inline)
|
||||
|
||||
* **Epyc single-threaded**
|
||||

|
||||
* **Epyc multithreaded (32 cores)**
|
||||

|
||||
* **Epyc multithreaded (64 cores)**
|
||||

|
||||
|
||||
---
|
||||
|
||||
# Feedback
|
||||
|
||||
Please let us know what you think of these performance results! Similarly, if you have any questions or concerns, or are interested in reproducing these performance experiments on your own hardware, we invite you to [open an issue](https://github.com/flame/blis/issues) and start a conversation with BLIS developers.
|
||||
|
||||
Thanks for your interest in BLIS!
|
||||
|
||||
224
docs/PerformanceSmall.md
Normal file
@@ -0,0 +1,224 @@
|
||||
# Contents
|
||||
|
||||
* **[Contents](PerformanceSmall.md#contents)**
|
||||
* **[Introduction](PerformanceSmall.md#introduction)**
|
||||
* **[General information](PerformanceSmall.md#general-information)**
|
||||
* **[Level-3 performance](PerformanceSmall.md#level-3-performance)**
|
||||
* **[Kaby Lake](PerformanceSmall.md#kaby-lake)**
|
||||
* **[Experiment details](PerformanceSmall.md#kaby-lake-experiment-details)**
|
||||
* **[Results](PerformanceSmall.md#kaby-lake-results)**
|
||||
* **[Epyc](PerformanceSmall.md#epyc)**
|
||||
* **[Experiment details](PerformanceSmall.md#epyc-experiment-details)**
|
||||
* **[Results](PerformanceSmall.md#epyc-results)**
|
||||
* **[Feedback](PerformanceSmall.md#feedback)**
|
||||
|
||||
# Introduction
|
||||
|
||||
This document showcases performance results for the level-3 `gemm` operation
|
||||
on small matrices with BLIS and BLAS for select hardware architectures.
|
||||
|
||||
# General information
|
||||
|
||||
Generally speaking, for level-3 operations on small matrices, we publish
|
||||
two "panels" for each type of hardware, one that reflects performance on
|
||||
row-stored matrices and another for column-stored matrices.
|
||||
Each panel will consist of a 4x7 grid of graphs, with each row representing
|
||||
a different transposition case (`nn`, `nt`, `tn`, `tt`)
|
||||
and each column representing a different shape scenario, usually
|
||||
with one or two matrix dimensions bound to a fixed size for all problem
|
||||
sizes tested.
|
||||
Each of the 28 graphs within a panel will contain an x-axis that reports
|
||||
problem size, with one, two, or all three matrix dimensions equal to the
|
||||
problem size (e.g. _m_ = 6; _n_ = _k_, also encoded as `m6npkp`).
|
||||
The y-axis will report in units of GFLOPS (billions of floating-point operations
|
||||
per second) on a single core.
|
||||
|
||||
It's also worth pointing out that the top of each graph (e.g. the maximum
|
||||
y-axis value depicted) _always_ corresponds to the theoretical peak performance
|
||||
under the conditions associated with that graph.
|
||||
Theoretical peak performance, in units of GFLOPS, is calculated as the
|
||||
product of:
|
||||
1. the maximum sustainable clock rate in GHz; and
|
||||
2. the maximum number of floating-point operations (flops) that can be
|
||||
executed per cycle.
|
||||
|
||||
Note that the maximum sustainable clock rate may change depending on the
|
||||
conditions.
|
||||
For example, on some systems the maximum clock rate is higher when only one
|
||||
core is active (e.g. single-threaded performance) versus when all cores are
|
||||
active (e.g. multithreaded performance).
|
||||
The maximum number of flops executable per cycle (per core) is generally
|
||||
computed as the product of:
|
||||
1. the maximum number of fused multiply-add (FMA) vector instructions that
|
||||
can be issued per cycle (per core);
|
||||
2. the maximum number of elements that can be stored within a single vector
|
||||
register (for the datatype in question); and
|
||||
3. 2.0, since an FMA instruction fuses two operations (a multiply and an add).
|
||||
|
||||
The problem size range, represented on the x-axis, is sampled in
|
||||
increments of 4 up to 800 for the cases where one or two dimensions are small
|
||||
(and constant)
|
||||
and up to 400 in the case where all dimensions (e.g. _m_, _n_, and _k_) are
|
||||
bound to the problem size (i.e., square matrices).
|
||||
|
||||
Note that the constant small matrix dimensions were chosen to be _very_
|
||||
small--in the neighborhood of 8--intentionally to showcase what happens when
|
||||
at least one of the matrices is abnormally "skinny." Typically, organizations
|
||||
and individuals only publish performance with square matrices, which can miss
|
||||
the problem sizes of interest to many applications. Here, in addition to square
|
||||
matrices (shown in the seventh column), we also show six other scenarios where
|
||||
one or two `gemm` dimensions (of _m_, _n_, and _k_) are small.
|
||||
|
||||
The legend in each graph contains two entries for BLIS, corresponding to the
|
||||
two black lines, one solid and one dotted. The dotted line, **"BLIS conv"**,
|
||||
represents the conventional implementation that targets large matrices. This
|
||||
was the only implementation available in BLIS prior to the addition of the
|
||||
small/skinny matrix support. The solid line, **"BLIS sup"**, makes use of the
|
||||
new small/skinny matrix implementation for certain small problems. Whenever
|
||||
these results differ by any significant amount (beyond noise), it denotes a
|
||||
problem size for which BLIS employed the new small/skinny implementation.
|
||||
Put another way, **the delta between these two lines represents the performance
|
||||
improvement between BLIS's previous status quo and the new regime.**
|
||||
|
||||
Finally, each point along each curve represents the best of three trials.
|
||||
|
||||
# Interpretation
|
||||
|
||||
In general, the curves associated with higher-performing implementations
|
||||
will appear higher in the graphs than lower-performing implementations.
|
||||
Ideally, an implementation will climb in performance (as a function of problem
|
||||
size) as quickly as possible and asymptotically approach some high fraction of
|
||||
peak performance.
|
||||
|
||||
When corresponding with us, via email or when opening an
|
||||
[issue](https://github.com/flame/blis/issues) on github, we kindly ask that
|
||||
you specify as closely as possible (though a range is fine) your problem
|
||||
size of interest so that we can better assist you.
|
||||
|
||||
# Level-3 performance
|
||||
|
||||
## Kaby Lake
|
||||
|
||||
### Kaby Lake experiment details
|
||||
|
||||
* Location: undisclosed
|
||||
* Processor model: Intel Core i5-7500 (Kaby Lake)
|
||||
* Core topology: one socket, 4 cores total
|
||||
* SMT status: unavailable
|
||||
* Max clock rate: 3.8GHz (single-core)
|
||||
* Max vector register length: 256 bits (AVX2)
|
||||
* Max FMA vector IPC: 2
|
||||
* Peak performance:
|
||||
* single-core: 57.6 GFLOPS (double-precision), 115.2 GFLOPS (single-precision)
|
||||
* Operating system: Gentoo Linux (Linux kernel 5.0.7)
|
||||
* Compiler: gcc 7.3.0
|
||||
* Results gathered: 31 May 2019, 3 June 2019, 19 June 2019
|
||||
* Implementations tested:
|
||||
* BLIS 6bf449c (0.5.2-42)
|
||||
* configured with `./configure --enable-cblas auto`
|
||||
* sub-configuration exercised: `haswell`
|
||||
* OpenBLAS 0.3.6
|
||||
* configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
|
||||
* BLASFEO 2c9f312
|
||||
* configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`.
|
||||
* Eigen 3.3.90
|
||||
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (30 May 2019)
|
||||
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
|
||||
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
|
||||
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
|
||||
* Requested threading via `export OMP_NUM_THREADS=1` (single-threaded)
|
||||
* MKL 2018 update 4
|
||||
* Requested threading via `export MKL_NUM_THREADS=1` (single-threaded)
|
||||
* Affinity:
|
||||
* N/A.
|
||||
* Frequency throttling (via `cpupower`):
|
||||
* Driver: intel_pstate
|
||||
* Governor: performance
|
||||
* Hardware limits: 800MHz - 3.8GHz
|
||||
* Adjusted minimum: 3.7GHz
|
||||
* Comments:
|
||||
* For both row- and column-stored matrices, BLIS's new small/skinny matrix implementation is competitive with (or exceeds the performance of) the next highest-performing solution (typically MKL), except for a few cases where the _k_ dimension is very small. It is likely the case that this shape scenario calls for a different kernel approach, since the BLIS microkernel is inherently designed to iterate over many _k_ dimension iterations (which leads it to incur considerable overhead for small values of _k_).
|
||||
* For the classic case of `dgemm_nn` on square matrices, BLIS is the fastest implementation for the problem size range of approximately 80 to 180. BLIS is also competitive in this general range for other transpose parameter combinations (`nt`, `tn`, and `tt`).
|
||||
|
||||
### Kaby Lake results
|
||||
|
||||
#### pdf
|
||||
|
||||
* [Kaby Lake row-stored](graphs/sup/dgemm_rrr_kbl_nt1.pdf)
|
||||
* [Kaby Lake column-stored](graphs/sup/dgemm_ccc_kbl_nt1.pdf)
|
||||
|
||||
#### png (inline)
|
||||
|
||||
* **Kaby Lake row-stored**
|
||||

|
||||
* **Kaby Lake column-stored**
|
||||

|
||||
|
||||
---
|
||||
|
||||
## Epyc
|
||||
|
||||
### Epyc experiment details
|
||||
|
||||
* Location: Oracle cloud
|
||||
* Processor model: AMD Epyc 7551 (Zen1)
|
||||
* Core topology: two sockets, 4 dies per socket, 2 core complexes (CCX) per die, 4 cores per CCX, 64 cores total
|
||||
* SMT status: enabled, but not utilized
|
||||
* Max clock rate: 3.0GHz (single-core), 2.55GHz (multicore)
|
||||
* Max vector register length: 256 bits (AVX2)
|
||||
* Max FMA vector IPC: 1
|
||||
* Alternatively, FMA vector IPC is 2 when vectors are limited to 128 bits each.
|
||||
* Peak performance:
|
||||
* single-core: 24 GFLOPS (double-precision), 48 GFLOPS (single-precision)
|
||||
* Operating system: Ubuntu 18.04 (Linux kernel 4.15.0)
|
||||
* Compiler: gcc 7.3.0
|
||||
* Results gathered: 31 May 2019, 3 June 2019, 19 June 2019
|
||||
* Implementations tested:
|
||||
* BLIS 6bf449c (0.5.2-42)
|
||||
* configured with `./configure --enable-cblas auto`
|
||||
* sub-configuration exercised: `zen`
|
||||
* OpenBLAS 0.3.6
|
||||
* configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
|
||||
* BLASFEO 2c9f312
|
||||
* configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`.
|
||||
* Eigen 3.3.90
|
||||
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (30 May 2019)
|
||||
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
|
||||
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
|
||||
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
|
||||
* Requested threading via `export OMP_NUM_THREADS=1` (single-threaded)
|
||||
* MKL 2019 update 4
|
||||
* Requested threading via `export MKL_NUM_THREADS=1` (single-threaded)
|
||||
* Affinity:
|
||||
* N/A.
|
||||
* Frequency throttling (via `cpupower`):
|
||||
* Driver: acpi-cpufreq
|
||||
* Governor: performance
|
||||
* Hardware limits: 1.2GHz - 2.0GHz
|
||||
* Adjusted minimum: 2.0GHz
|
||||
* Comments:
|
||||
* As with Kaby Lake, BLIS's new small/skinny matrix implementation is competitive with (or exceeds the performance of) the next highest-performing solution, except for a few cases where the _k_ dimension is very small.
|
||||
* For the classic case of `dgemm_nn` on square matrices, BLIS is the fastest implementation for the problem size range of approximately 12 to 256. BLIS is also competitive in this general range for other transpose parameter combinations (`nt`, `tn`, and `tt`).
|
||||
|
||||
### Epyc results
|
||||
|
||||
#### pdf
|
||||
|
||||
* [Epyc row-stored](graphs/sup/dgemm_rrr_epyc_nt1.pdf)
|
||||
* [Epyc column-stored](graphs/sup/dgemm_ccc_epyc_nt1.pdf)
|
||||
|
||||
#### png (inline)
|
||||
|
||||
* **Epyc row-stored**
|
||||

|
||||
* **Epyc column-stored**
|
||||

|
||||
|
||||
---
|
||||
|
||||
# Feedback
|
||||
|
||||
Please let us know what you think of these performance results! Similarly, if you have any questions or concerns, or are interested in reproducing these performance experiments on your own hardware, we invite you to [open an issue](https://github.com/flame/blis/issues) and start a conversation with BLIS developers.
|
||||
|
||||
Thanks for your interest in BLIS!
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
|
||||
## Contents
|
||||
|
||||
* [Changes in 0.6.0](ReleaseNotes.md#changes-in-060)
|
||||
* [Changes in 0.5.2](ReleaseNotes.md#changes-in-052)
|
||||
* [Changes in 0.5.1](ReleaseNotes.md#changes-in-051)
|
||||
* [Changes in 0.5.0](ReleaseNotes.md#changes-in-050)
|
||||
* [Changes in 0.4.1](ReleaseNotes.md#changes-in-041)
|
||||
@@ -33,6 +35,70 @@
|
||||
* [Changes in 0.0.2](ReleaseNotes.md#changes-in-002)
|
||||
* [Changes in 0.0.1](ReleaseNotes.md#changes-in-001)
|
||||
|
||||
## Changes in 0.6.0
|
||||
June 3, 2019
|
||||
|
||||
Improvements present in 0.6.0:
|
||||
|
||||
Framework:
|
||||
- Implemented small/skinny/unpacked (sup) framework for accelerated level-3 performance when at least one matrix dimension is small (or very small). For now, only `dgemm` is optimized, and this new implementation currently only targets Intel Haswell through Coffee Lake, and AMD Zen-based Ryzen/Epyc. (The existing kernels should extend without significant modification to Zen2-based Ryzen/Epyc once they are available.) Also, multithreaded parallelism is not yet implemented, though application-level threading should be fine. (AMD)
|
||||
- Changed function pointer usages of `void*` to new, typedef'ed type `void_fp`.
|
||||
- Allow compile-time disabling of BLAS prototypes in BLIS, in case the application already has access to prototypes.
|
||||
- In `bli_system.h`, define `_POSIX_C_SOURCE` to `200809L` if the macro is not already defined. This ensures that things such as pthreads are properly defined by an application that has `#include "blis.h"` but omits the definition of `_POSIX_C_SOURCE` from the command-line compiler options. (Christos Psarras)
|
||||
|
||||
Kernels:
|
||||
- None.
|
||||
|
||||
Build system:
|
||||
- Updated the way configure and the top-level Makefile handle installation prefixes (`prefix`, `exec_prefix`, `libdir`, `includedir`, `sharedir`) to better conform with GNU conventions.
|
||||
- Improved clang version detection. (Isuru Fernando)
|
||||
- Use pthreads on MinGW and Cygwin. (Isuru Fernando)
|
||||
|
||||
Testing:
|
||||
- Added Eigen support to test drivers in `test/3`.
|
||||
- Fix inadvertently hidden `xerbla_()` in blastest drivers when building only shared libraries. (Isuru Fernando, M. Zhou)
|
||||
|
||||
Documentation:
|
||||
- Added `docs/PerformanceSmall.md` to showcase new BLIS small/skinny `dgemm` performance on Kaby Lake and Epyc.
|
||||
- Added Eigen results (3.3.90) to performance graphs showcased in `docs/Performance.md`.
|
||||
- Added BLIS thread factorization info to `docs/Performance.md`.
|
||||
|
||||
## Changes in 0.5.2
|
||||
March 19, 2019
|
||||
|
||||
Improvements present in 0.5.2:
|
||||
|
||||
Framework:
|
||||
- Added support for IC loop parallelism to the `trsm` operation.
|
||||
- Implemented a pool-based small block allocator and a corresponding `configure` option (enabled by default), which minimizes the number of calls to `malloc()` and `free()` for the purposes of allocating small blocks (on the order of 100 bytes). These small blocks are used by internal data structures, and the repeated allocation and freeing of these structures could, perhaps, cause memory fragmentation issues in certain application circumstances. This was never reproduced and observed, however, and remains entirely theoretical. Still, the sba should be no slower, and perhaps a little faster, than repeatedly calling `malloc()` and `free()` for these internal data structures. Also, the sba was designed to be thread-safe. (AMD)
|
||||
- Refined and extended the output enabled by `--enable-mem-tracing`, which allows a developer to follow memory allocation and release performed by BLIS.
|
||||
- Initialize error messages at compile-time rather than at runtime. (Minh Quan Ho)
|
||||
- Fixed a potential situation whereby the multithreading parameters in a `rntm_t` object that is passed into an expert interface are ignored.
|
||||
- Prevent a redefinition of `ftnlen` in the `f2c_types.h` in blastest. (Jeff Diamond)
|
||||
|
||||
Kernels:
|
||||
- Adjusted the cache blocksizes in the `zen` sub-configuration for `float`, `scomplex`, and `dcomplex` datatypes. The previous values, taken directly from the `haswell` subconfig, were merely meant to be reasonable placeholders until more suitable values were determined, as had already taken place for the `double` datatype. (AMD)
|
||||
- Rewrote reference kernels in terms of simplified indexing annotated by the `#pragma omp simd` directive, which a compiler can use to vectorize certain constant-bounded loops. The `#pragma` is disabled via a preprocessor macro layer if the compiler is found by `configure` to not support `-fopenmp-simd`. (Devin Matthews, Jeff Hammond)
|
||||
|
||||
Build system:
|
||||
- Added symbol-export annotation macros to all of the function prototypes and global variable declarations for public symbols, and created a new `configure` option, `--export-shared=[public|all]`, that controls which symbols--only those that are meant to be public, or all symbols--are exported to the shared library. (Isuru Fernando)
|
||||
- Standardized to using `-O3` in various subconfigs, and also `-funsafe-math-optimizations` for reference kernels. (Dave Love, Jeff Hammond)
|
||||
- Disabled TBM, XOP, LWP instructions in all AMD subconfigs. (Devin Matthews)
|
||||
- Fixed issues that prevented using BLIS on GNU Hurd. (M. Zhou)
|
||||
- Relaxed python3 requirements to allow python 3.4 or later. Previously, python 3.5 or later was required if python3 was being used. (Dave Love)
|
||||
- Added `thunderx2` sub-configuration. (Devangi Parikh)
|
||||
- Added `power9` sub-configuration. For now, this subconfig only uses reference kernels. (Nicholai Tukanov)
|
||||
- Fixed an issue with `configure` failing on OSes--including certain flavors of BSD--that contain a slash '/' character in the output of `uname -s`. (Isuru Fernando, M. Zhou)
|
||||
|
||||
Testing:
|
||||
- Renamed `test/3m4m` directory to `test/3`.
|
||||
- Lots of updates and improvements to Makefiles, shell scripts, and matlab scripts in `test/3`.
|
||||
|
||||
Documentation:
|
||||
- Added a new `docs/Performance.md` document that showcases single-threaded, single-socket, and dual-socket performance results of `single`, `double`, `scomplex`, and `dcomplex` level-3 operations in BLIS, OpenBLAS, and MKL/ARMPL for Haswell, SkylakeX, ThunderX2, and Epyc hardware architectures. (Note: Other implementations such as Eigen and ATLAS may be added to these graphs in the future.)
|
||||
- Updated `README.md` to include new language on external packages. (Dave Love)
|
||||
- Updated `docs/Multithreading.md` to be more explicit about the fact that multithreading is disabled by default at configure-time, and the fact that BLIS will execute single-threaded at runtime by default if no multithreaded specification is given. (M. Zhou)
|
||||
|
||||
## Changes in 0.5.1
|
||||
December 18, 2018
|
||||
|
||||
@@ -88,7 +154,7 @@ Kernels:
|
||||
Build system:
|
||||
- Added support for building Windows DLLs via AppVeyor [2], complete with a built-in implementation of pthreads for Windows, as well as an implementation of the `pthread_barrier_*()` APIs for use on OS X. (Isuru Fernando, Devin Matthews, Mathieu Poumeyrol, Matthew Honnibal)
|
||||
- Defined a `cortexa53` sub-configuration, which is similar to `cortexa57` except that it uses slightly different compiler flags. (Mathieu Poumeyrol)
|
||||
- Added python version checking to configure script.
|
||||
- Added python version checking to `configure` script.
|
||||
- Added a script to automate the regeneration of the symbols list file (now located in `build/libblis-symbols.def`).
|
||||
- Various tweaks in preparation for BLIS's inclusion within Debian. (M. Zhou)
|
||||
- Various fixes and cleanups.
|
||||
@@ -246,16 +312,16 @@ May 2, 2017
|
||||
- Implemented the 1m method for inducing complex matrix multiplication. (Please see ACM TOMS publication ["Implementing high-performance complex matrix multiplication via the 1m method"](https://github.com/flame/blis#citations) for more details.)
|
||||
- Switched to simpler `trsm_r` implementation.
|
||||
- Relaxed constraints that `MC % NR = 0` and `NC % MR = 0`, as this was only needed for the more sophisticated `trsm_r` implementation.
|
||||
- Automatic loop thread assignment. (Devin Matthews)
|
||||
- Updates to `.travis.yml` configuration file. (Devin Matthews)
|
||||
- Automatic loop thread assignment. (Devin Matthews)
|
||||
- Updates to `.travis.yml` configuration file. (Devin Matthews)
|
||||
- Updates to non-default haswell microkernels.
|
||||
- Match storage format of the temporary micro-tiles in macrokernels to that of the microkernel storage preference for edge cases.
|
||||
- Added support for Intel's Knight's Landing. (Devin Matthews)
|
||||
- Added more flexible options to specify multithreading via the configure script. (Devin Matthews)
|
||||
- OS X compatibility fixes. (Devin Matthews)
|
||||
- Other small changes and fixes.
|
||||
- Added support for Intel's Knight's Landing. (Devin Matthews)
|
||||
- Added more flexible options to specify multithreading via the configure script. (Devin Matthews)
|
||||
- OS X compatibility fixes. (Devin Matthews)
|
||||
- Other small changes and fixes.
|
||||
|
||||
Also, thanks to Elmar Peise, Krzysztof Drewniak, and Francisco Igual for their contributions in reporting/fixing certain bugs that were addressed in this version.
|
||||
Also, thanks to Elmar Peise, Krzysztof Drewniak, and Francisco Igual for their contributions in reporting/fixing certain bugs that were addressed in this version.
|
||||
|
||||
## Changes in 0.2.1
|
||||
October 5, 2016
|
||||
@@ -439,7 +505,7 @@ While neither `bli_config.h` nor `bli_kernel.h` has changed formats since 0.0.7,
|
||||
## Changes in 0.0.7
|
||||
April 30, 2013
|
||||
|
||||
This version incorporates many small fixes and feature enhancements made during our SC13 collaboration.
|
||||
This version incorporates many small fixes and feature enhancements made during our SC13 collaboration.
|
||||
|
||||
## Changes in 0.0.6
|
||||
April 13, 2013
|
||||
@@ -478,7 +544,7 @@ The compatibility layer is enabled via a configuration option in `bl2_config.h`.
|
||||
## Changes in 0.0.2
|
||||
February 11, 2013
|
||||
|
||||
Most notably, this version contains the new test suite I've been working on for the last month.
|
||||
Most notably, this version contains the new test suite I've been working on for the last month.
|
||||
|
||||
What is the test suite? It is a highly configurable test driver that allows one to test an arbitrary set of BLIS operations, with an arbitrary set of parameter combinations, and matrix/vector storage formats, as well as whichever datatypes you are interested in. (For now, only homogeneous datatyping is supported, which is what most people want.) You can also specify an arbitrary problem size range with arbitrary increments, and arbitrary ratios between dimensions (or anchor a dimension to a single value), and you can output directly to files which store the output in matlab syntax, which makes it easy to generate performance graphs.
|
||||
|
||||
|
||||
BIN
docs/graphs/large/l3_perf_epyc_jc1ic8jr4_nt32.pdf
Normal file
BIN
docs/graphs/large/l3_perf_epyc_jc1ic8jr4_nt32.png
Normal file
|
After Width: | Height: | Size: 108 KiB |
BIN
docs/graphs/large/l3_perf_epyc_jc2ic8jr4_nt64.pdf
Normal file
BIN
docs/graphs/large/l3_perf_epyc_jc2ic8jr4_nt64.png
Normal file
|
After Width: | Height: | Size: 115 KiB |
BIN
docs/graphs/large/l3_perf_epyc_nt1.pdf
Normal file
BIN
docs/graphs/large/l3_perf_epyc_nt1.png
Normal file
|
After Width: | Height: | Size: 78 KiB |
BIN
docs/graphs/large/l3_perf_has_jc2ic3jr2_nt12.pdf
Normal file
BIN
docs/graphs/large/l3_perf_has_jc2ic3jr2_nt12.png
Normal file
|
After Width: | Height: | Size: 96 KiB |
BIN
docs/graphs/large/l3_perf_has_jc4ic3jr2_nt24.pdf
Normal file
BIN
docs/graphs/large/l3_perf_has_jc4ic3jr2_nt24.png
Normal file
|
After Width: | Height: | Size: 96 KiB |
BIN
docs/graphs/large/l3_perf_has_nt1.pdf
Normal file
BIN
docs/graphs/large/l3_perf_has_nt1.png
Normal file
|
After Width: | Height: | Size: 81 KiB |
BIN
docs/graphs/large/l3_perf_skx_jc2ic13_nt26.pdf
Normal file
BIN
docs/graphs/large/l3_perf_skx_jc2ic13_nt26.png
Normal file
|
After Width: | Height: | Size: 104 KiB |
BIN
docs/graphs/large/l3_perf_skx_jc4ic13_nt52.pdf
Normal file
BIN
docs/graphs/large/l3_perf_skx_jc4ic13_nt52.png
Normal file
|
After Width: | Height: | Size: 101 KiB |
BIN
docs/graphs/large/l3_perf_skx_nt1.pdf
Normal file
BIN
docs/graphs/large/l3_perf_skx_nt1.png
Normal file
|
After Width: | Height: | Size: 88 KiB |
BIN
docs/graphs/large/l3_perf_tx2_jc4ic7_nt28.pdf
Normal file
BIN
docs/graphs/large/l3_perf_tx2_jc4ic7_nt28.png
Normal file
|
After Width: | Height: | Size: 92 KiB |
BIN
docs/graphs/large/l3_perf_tx2_jc8ic7_nt56.pdf
Normal file
BIN
docs/graphs/large/l3_perf_tx2_jc8ic7_nt56.png
Normal file
|
After Width: | Height: | Size: 100 KiB |
BIN
docs/graphs/large/l3_perf_tx2_nt1.pdf
Normal file
BIN
docs/graphs/large/l3_perf_tx2_nt1.png
Normal file
|
After Width: | Height: | Size: 70 KiB |
BIN
docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf
Normal file
BIN
docs/graphs/sup/dgemm_ccc_epyc_nt1.png
Normal file
|
After Width: | Height: | Size: 169 KiB |
BIN
docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf
Normal file
BIN
docs/graphs/sup/dgemm_ccc_kbl_nt1.png
Normal file
|
After Width: | Height: | Size: 195 KiB |
BIN
docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf
Normal file
BIN
docs/graphs/sup/dgemm_rrr_epyc_nt1.png
Normal file
|
After Width: | Height: | Size: 171 KiB |
BIN
docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf
Normal file
BIN
docs/graphs/sup/dgemm_rrr_kbl_nt1.png
Normal file
|
After Width: | Height: | Size: 203 KiB |
@@ -114,7 +114,7 @@ CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME))
|
||||
CFLAGS += -I$(TEST_SRC_PATH)
|
||||
|
||||
# Locate the libblis library to which we will link.
|
||||
LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
|
||||
#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
|
||||
|
||||
# Binary executable name.
|
||||
TEST_BINS := 00obj_basic.x \
|
||||
|
||||
@@ -102,7 +102,7 @@ CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME))
|
||||
CFLAGS += -I$(TEST_SRC_PATH)
|
||||
|
||||
# Locate the libblis library to which we will link.
|
||||
LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
|
||||
#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
|
||||
|
||||
# Binary executable name.
|
||||
TEST_BINS := 00level1v.x \
|
||||
|
||||
@@ -64,7 +64,7 @@ void PASTEMAC0(opname) \
|
||||
bli_obj_scalar_set_dt_buffer( chi, dt_absq_c, &dt_chi, &buf_chi ); \
|
||||
\
|
||||
/* Query a type-specific function pointer, except one that uses
|
||||
void* instead of typed pointers. */ \
|
||||
void* for function arguments instead of typed pointers. */ \
|
||||
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
|
||||
\
|
||||
f \
|
||||
@@ -100,7 +100,7 @@ void PASTEMAC0(opname) \
|
||||
PASTEMAC(opname,_check)( chi, psi ); \
|
||||
\
|
||||
/* Query a type-specific function pointer, except one that uses
|
||||
void* instead of typed pointers. */ \
|
||||
void* for function arguments instead of typed pointers. */ \
|
||||
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \
|
||||
\
|
||||
f \
|
||||
@@ -137,7 +137,7 @@ void PASTEMAC0(opname) \
|
||||
PASTEMAC(opname,_check)( chi ); \
|
||||
\
|
||||
/* Query a type-specific function pointer, except one that uses
|
||||
void* instead of typed pointers. */ \
|
||||
void* for function arguments instead of typed pointers. */ \
|
||||
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \
|
||||
\
|
||||
f \
|
||||
@@ -170,7 +170,7 @@ void PASTEMAC0(opname) \
|
||||
PASTEMAC(opname,_check)( chi, psi ); \
|
||||
\
|
||||
/* Query a type-specific function pointer, except one that uses
|
||||
void* instead of typed pointers. */ \
|
||||
void* for function arguments instead of typed pointers. */ \
|
||||
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \
|
||||
\
|
||||
f \
|
||||
@@ -213,7 +213,7 @@ void PASTEMAC0(opname) \
|
||||
else dt_use = dt_chi; \
|
||||
\
|
||||
/* Query a type-specific function pointer, except one that uses
|
||||
void* instead of typed pointers. */ \
|
||||
void* for function arguments instead of typed pointers. */ \
|
||||
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_use ); \
|
||||
\
|
||||
f \
|
||||
@@ -247,7 +247,7 @@ void PASTEMAC0(opname) \
|
||||
PASTEMAC(opname,_check)( zeta_r, zeta_i, chi ); \
|
||||
\
|
||||
/* Query a type-specific function pointer, except one that uses
|
||||
void* instead of typed pointers. */ \
|
||||
void* for function arguments instead of typed pointers. */ \
|
||||
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
|
||||
\
|
||||
f \
|
||||
@@ -290,7 +290,7 @@ void PASTEMAC0(opname) \
|
||||
bli_obj_scalar_set_dt_buffer( chi, dt_zeta_c, &dt_chi, &buf_chi ); \
|
||||
\
|
||||
/* Query a type-specific function pointer, except one that uses
|
||||
void* instead of typed pointers. */ \
|
||||
void* for function arguments instead of typed pointers. */ \
|
||||
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
|
||||
\
|
||||
f \
|
||||
@@ -327,7 +327,7 @@ void PASTEMAC0(opname) \
|
||||
PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \
|
||||
\
|
||||
/* Query a type-specific function pointer, except one that uses
|
||||
void* instead of typed pointers. */ \
|
||||
void* for function arguments instead of typed pointers. */ \
|
||||
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
|
||||
\
|
||||
f \
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC0(opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
|
||||
( \
|
||||
obj_t* chi, \
|
||||
obj_t* absq \
|
||||
@@ -53,7 +53,7 @@ GENPROT( normfsc )
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC0(opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
|
||||
( \
|
||||
obj_t* chi, \
|
||||
obj_t* psi \
|
||||
@@ -69,7 +69,7 @@ GENPROT( subsc )
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC0(opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
|
||||
( \
|
||||
obj_t* chi \
|
||||
);
|
||||
@@ -80,7 +80,7 @@ GENPROT( invertsc )
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC0(opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
|
||||
( \
|
||||
obj_t* chi, \
|
||||
double* zeta_r, \
|
||||
@@ -93,7 +93,7 @@ GENPROT( getsc )
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC0(opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
|
||||
( \
|
||||
double zeta_r, \
|
||||
double zeta_i, \
|
||||
@@ -106,7 +106,7 @@ GENPROT( setsc )
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC0(opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
|
||||
( \
|
||||
obj_t* chi, \
|
||||
obj_t* zeta_r, \
|
||||
@@ -119,7 +119,7 @@ GENPROT( unzipsc )
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC0(opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
|
||||
( \
|
||||
obj_t* zeta_r, \
|
||||
obj_t* zeta_i, \
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
conj_t conjchi, \
|
||||
ctype* chi, \
|
||||
@@ -56,7 +56,7 @@ INSERT_GENTPROT_BASIC0( subsc )
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
conj_t conjchi, \
|
||||
ctype* chi \
|
||||
@@ -68,7 +68,7 @@ INSERT_GENTPROT_BASIC0( invertsc )
|
||||
#undef GENTPROTR
|
||||
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
ctype* chi, \
|
||||
ctype_r* absq \
|
||||
@@ -81,7 +81,7 @@ INSERT_GENTPROTR_BASIC0( normfsc )
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
ctype* chi, \
|
||||
ctype* psi \
|
||||
@@ -93,7 +93,7 @@ INSERT_GENTPROT_BASIC0( sqrtsc )
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
ctype* chi, \
|
||||
double* zeta_r, \
|
||||
@@ -106,7 +106,7 @@ INSERT_GENTPROT_BASIC0( getsc )
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
double zeta_r, \
|
||||
double zeta_i, \
|
||||
@@ -119,7 +119,7 @@ INSERT_GENTPROT_BASIC0( setsc )
|
||||
#undef GENTPROTR
|
||||
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
ctype* chi, \
|
||||
ctype_r* zeta_r, \
|
||||
@@ -132,7 +132,7 @@ INSERT_GENTPROTR_BASIC0( unzipsc )
|
||||
#undef GENTPROTR
|
||||
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
ctype_r* zeta_r, \
|
||||
ctype_r* zeta_i, \
|
||||
@@ -143,14 +143,14 @@ INSERT_GENTPROTR_BASIC0( zipsc )
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_igetsc
|
||||
BLIS_EXPORT_BLIS void bli_igetsc
|
||||
(
|
||||
dim_t* chi,
|
||||
double* zeta_r,
|
||||
double* zeta_i
|
||||
);
|
||||
|
||||
void bli_isetsc
|
||||
BLIS_EXPORT_BLIS void bli_isetsc
|
||||
(
|
||||
double zeta_r,
|
||||
double zeta_i,
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname ) \
|
||||
\
|
||||
void PASTEMAC0(opname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
|
||||
( \
|
||||
obj_t* chi, \
|
||||
obj_t* psi \
|
||||
@@ -55,7 +55,7 @@ GENFRONT( copysc )
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname) \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjchi, \
|
||||
void* chi, \
|
||||
|
||||