Merge branch 'amd-staging-rome2.1' of ssh://git.amd.com:29418/cpulibraries/er/blis into amd-blis-cpp

Change-Id: I97a10ab7546d475474b0ff733bafb8248843c352
This commit is contained in:
prangana
2019-11-21 00:54:16 +05:30
422 changed files with 70330 additions and 3604 deletions

View File

@@ -39,7 +39,7 @@ build_script:
- bash -lc "cd /c/projects/blis && ./configure %CONFIGURE_OPTS% --enable-threading=%THREADING% --enable-arg-max-hack --prefix=/c/blis %CONFIG%"
- bash -lc "cd /c/projects/blis && mingw32-make -j4 V=1"
- bash -lc "cd /c/projects/blis && mingw32-make install"
- ps: Compress-Archive -Path C:\blis -DestinationPath C:\blis.zip
- 7z a C:\blis.zip C:\blis
- ps: Push-AppveyorArtifact C:\blis.zip
test_script:

1789
CHANGELOG

File diff suppressed because it is too large Load Diff

11
CREDITS
View File

@@ -9,18 +9,22 @@ The BLIS framework was primarily authored by
but many others have contributed code and feedback, including
Sameer Agarwal @sandwichmaker (Google)
Murtaza Ali (Texas Instruments)
Sajid Ali @s-sajid-ali (Northwestern University)
Erling Andersen @erling-d-andersen
Alex Arslan @ararslan
Vernon Austel (IBM, T.J. Watson Research Center)
Matthew Brett @matthew-brett (University of Birmingham)
Jed Brown @jedbrown (Argonne National Laboratory)
Robin Christ @robinchrist
Kay Dewhurst @jkd2016 (Max Planck Institute, Halle, Germany)
Jeff Diamond (Oracle)
Johannes Dieterich @iotamudelta
Krzysztof Drewniak @krzysz00
Marat Dukhan @Maratyszcza (Google)
Victor Eijkhout @VictorEijkhout (Texas Advanced Computing Center)
Evgeny Epifanovsky @epifanovsky (Q-Chem)
Isuru Fernando @isuruf
Roman Gareev @gareevroman
Richard Goldschmidt @SuperFluffy
@@ -30,7 +34,7 @@ but many others have contributed code and feedback, including
Jeff Hammond @jeffhammond (Intel)
Jacob Gorm Hansen @jacobgorm
Jean-Michel Hautbois @jhautbois
Ian Henriksen @insertinterestingnamehere
Ian Henriksen @insertinterestingnamehere (The University of Texas at Austin)
Minh Quan Ho @hominhquan
Matthew Honnibal @honnibal
Stefan Husmann @stefanhusmann
@@ -53,6 +57,7 @@ but many others have contributed code and feedback, including
Ilya Polkovnichenko
Jack Poulson @poulson (Stanford)
Mathieu Poumeyrol @kali
Christos Psarras @ChrisPsa (RWTH-Aachen)
@qnerd
Michael Rader @mrader1248
Pradeep Rao @pradeeptrgit (AMD)
@@ -63,11 +68,13 @@ but many others have contributed code and feedback, including
Rene Sitt
Tony Skjellum @tonyskjellum (The University of Tennessee at Chattanooga)
Mikhail Smelyanskiy (Intel, Parallel Computing Lab)
Nathaniel Smith @njsmith
Shaden Smith @ShadenSmith
Tyler Smith @tlrmchlsmth (The University of Texas at Austin)
Paul Springer @springer13 (RWTH-Aachen)
Vladimir Sukarev
Santanu Thangaraj (AMD)
Nicholai Tukanov @nicholaiTukanov (The University of Texas at Austin)
Rhys Ulerich @RhysU (The University of Texas at Austin)
Robert van de Geijn @rvdg (The University of Texas at Austin)
Kiran Varaganti @kvaragan (AMD)
@@ -83,8 +90,10 @@ partners, including
AMD
Hewlett Packard Enterprise
Huawei
Intel
Microsoft
Oracle
Texas Instruments
as well as the National Science Foundation (NSF Awards CCF-0917167,

110
Makefile
View File

@@ -386,23 +386,22 @@ ifeq ($(IS_CONFIGURED),yes)
# named with three .so version numbers.
UNINSTALL_OLD_LIBS :=
UNINSTALL_OLD_LIBS += $(shell $(FIND) $(INSTALL_LIBDIR)/ -name "$(LIBBLIS_SO).?.?.?" 2> /dev/null | $(GREP) -v "$(LIBBLIS).$(LIBBLIS_SO_MMB_EXT)")
UNINSTALL_OLD_LIBS += $(filter-out $(INSTALL_LIBDIR)/$(LIBBLIS).$(LIBBLIS_SO_MMB_EXT),$(wildcard $(INSTALL_LIBDIR)/$(LIBBLIS_SO).?.?.?))
# These shell commands gather the filepaths to any library symlink in the
# current LIBDIR that might be left over from an old installation. We start
# with symlinks named using the .so major version number.
UNINSTALL_OLD_SYML := $(shell $(FIND) $(INSTALL_LIBDIR)/ -name "$(LIBBLIS_SO).?" 2> /dev/null | $(GREP) -v "$(LIBBLIS_SO).$(SO_MAJOR)")
UNINSTALL_OLD_SYML := $(filter-out $(INSTALL_LIBDIR)/$(LIBBLIS_SO).$(SO_MAJOR),$(wildcard $(INSTALL_LIBDIR)/$(LIBBLIS_SO).?))
# We also prepare to uninstall older-style symlinks whose names contain the
# BLIS version number and configuration family.
UNINSTALL_OLD_SYML += $(shell $(FIND) $(INSTALL_LIBDIR)/ -name "$(LIBBLIS)-*.a" 2> /dev/null | $(GREP) -v "$(LIBBLIS)-$(VERS_CONF).a")
UNINSTALL_OLD_SYML += $(shell $(FIND) $(INSTALL_LIBDIR)/ -name "$(LIBBLIS)-*.$(SHLIB_EXT)" 2> /dev/null | $(GREP) -v "$(LIBBLIS)-$(VERS_CONF).$(SHLIB_EXT)")
UNINSTALL_OLD_SYML += $(wildcard $(INSTALL_LIBDIR)/$(LIBBLIS)-*.a)
UNINSTALL_OLD_SYML += $(wildcard $(INSTALL_LIBDIR)/$(LIBBLIS)-*.$(SHLIB_EXT))
# This shell command grabs all files named "*.h" that are not blis.h or cblas.h
# in the installation directory. We consider this set of headers to be "old" and
# eligible for removal upon running of the uninstall-old-headers target.
UNINSTALL_OLD_HEADERS := $(shell $(FIND) $(INSTALL_INCDIR)/blis/ -name "*.h" 2> /dev/null | $(GREP) -v "$(BLIS_H)" | $(GREP) -v "$(CBLAS_H)")
UNINSTALL_OLD_HEADERS := $(filter-out $(BLIS_H),$(filter-out $(CBLAS_H),$(wildcard $(INSTALL_INCDIR)/blis/*.h)))
endif # IS_CONFIGURED
@@ -1027,23 +1026,24 @@ endif # ifeq ($(IS_WIN),no)
# --- Query current configuration ---
showconfig: check-env
@echo "configuration family: $(CONFIG_NAME)"
@echo "sub-configurations: $(CONFIG_LIST)"
@echo "requisite kernels: $(KERNEL_LIST)"
@echo "kernel-to-config map: $(KCONFIG_MAP)"
@echo "-----------------------"
@echo "BLIS version string: $(VERSION)"
@echo ".so major version: $(SO_MAJOR)"
@echo ".so minor.build vers: $(SO_MINORB)"
@echo "install libdir: $(INSTALL_LIBDIR)"
@echo "install includedir: $(INSTALL_INCDIR)"
@echo "debugging status: $(DEBUG_TYPE)"
@echo "multithreading status: $(THREADING_MODEL)"
@echo "enable BLAS API? $(MK_ENABLE_BLAS)"
@echo "enable CBLAS API? $(MK_ENABLE_CBLAS)"
@echo "build static library? $(MK_ENABLE_STATIC)"
@echo "build shared library? $(MK_ENABLE_SHARED)"
@echo "ARG_MAX hack enabled? $(ARG_MAX_HACK)"
@echo "configuration family: $(CONFIG_NAME)"
@echo "sub-configurations: $(CONFIG_LIST)"
@echo "requisite kernels sets: $(KERNEL_LIST)"
@echo "kernel-to-config map: $(KCONFIG_MAP)"
@echo "-------------------------"
@echo "BLIS version string: $(VERSION)"
@echo ".so major version: $(SO_MAJOR)"
@echo ".so minor.build vers: $(SO_MINORB)"
@echo "install libdir: $(INSTALL_LIBDIR)"
@echo "install includedir: $(INSTALL_INCDIR)"
@echo "install sharedir: $(INSTALL_SHAREDIR)"
@echo "debugging status: $(DEBUG_TYPE)"
@echo "multithreading status: $(THREADING_MODEL)"
@echo "enable BLAS API? $(MK_ENABLE_BLAS)"
@echo "enable CBLAS API? $(MK_ENABLE_CBLAS)"
@echo "build static library? $(MK_ENABLE_STATIC)"
@echo "build shared library? $(MK_ENABLE_SHARED)"
@echo "ARG_MAX hack enabled? $(ARG_MAX_HACK)"
# --- Clean rules ---
@@ -1059,16 +1059,16 @@ ifneq ($(SANDBOX),)
- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
endif
else
@echo "Removing makefile fragments from $(CONFIG_FRAG_PATH)."
@echo "Removing makefile fragments from $(CONFIG_FRAG_PATH)"
@- $(FIND) $(CONFIG_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
@echo "Removing makefile fragments from $(FRAME_FRAG_PATH)."
@echo "Removing makefile fragments from $(FRAME_FRAG_PATH)"
@- $(FIND) $(FRAME_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
@echo "Removing makefile fragments from $(REFKERN_FRAG_PATH)."
@echo "Removing makefile fragments from $(REFKERN_FRAG_PATH)"
@- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
@echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)."
@echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)"
@- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
ifneq ($(SANDBOX),)
@echo "Removing makefile fragments from $(SANDBOX_FRAG_PATH)."
@echo "Removing makefile fragments from $(SANDBOX_FRAG_PATH)"
@- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
endif
endif
@@ -1080,7 +1080,7 @@ ifeq ($(ENABLE_VERBOSE),yes)
$(RM_F) $(BLIS_H_FLAT)
$(RM_F) $(CBLAS_H_FLAT)
else
@echo "Removing flattened header files from $(BASE_INC_PATH)."
@echo "Removing flattened header files from $(BASE_INC_PATH)"
@$(RM_F) $(BLIS_H_FLAT)
@$(RM_F) $(CBLAS_H_FLAT)
endif
@@ -1093,9 +1093,9 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(LIBBLIS_A_PATH)
- $(RM_F) $(LIBBLIS_SO_PATH)
else
@echo "Removing object files from $(BASE_OBJ_PATH)."
@echo "Removing object files from $(BASE_OBJ_PATH)"
@- $(FIND) $(BASE_OBJ_PATH) -name "*.o" | $(XARGS) $(RM_F)
@echo "Removing libraries from $(BASE_LIB_PATH)."
@echo "Removing libraries from $(BASE_LIB_PATH)"
@- $(RM_F) $(LIBBLIS_A_PATH)
@- $(RM_F) $(LIBBLIS_SO_PATH)
endif
@@ -1117,13 +1117,13 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(BLASTEST_DRV_BIN_PATHS)
- $(RM_F) $(addprefix out.,$(BLASTEST_DRV_BASES))
else
@echo "Removing object files from $(BASE_OBJ_BLASTEST_PATH)."
@echo "Removing object files from $(BASE_OBJ_BLASTEST_PATH)"
@- $(RM_F) $(BLASTEST_F2C_OBJS) $(BLASTEST_DRV_OBJS)
@echo "Removing libf2c.a from $(BASE_OBJ_BLASTEST_PATH)."
@echo "Removing libf2c.a from $(BASE_OBJ_BLASTEST_PATH)"
@- $(RM_F) $(BLASTEST_F2C_LIB)
@echo "Removing binaries from $(BASE_OBJ_BLASTEST_PATH)."
@echo "Removing binaries from $(BASE_OBJ_BLASTEST_PATH)"
@- $(RM_F) $(BLASTEST_DRV_BIN_PATHS)
@echo "Removing driver output files 'out.*'."
@echo "Removing driver output files 'out.*'"
@- $(RM_F) $(addprefix out.,$(BLASTEST_DRV_BASES))
endif # ENABLE_VERBOSE
endif # IS_CONFIGURED
@@ -1136,13 +1136,13 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(BLASTEST_DIR)/$(BLASTEST_F2C_LIB_NAME)
- $(RM_F) $(addprefix $(BLASTEST_DIR)/out.,$(BLASTEST_DRV_BASES))
else
@echo "Removing object files from ./$(BLASTEST_DIR)/$(OBJ_DIR)."
@echo "Removing object files from ./$(BLASTEST_DIR)/$(OBJ_DIR)"
@- $(FIND) $(BLASTEST_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
@echo "Removing libf2c.a from ./$(BLASTEST_DIR)."
@echo "Removing libf2c.a from ./$(BLASTEST_DIR)"
@- $(RM_F) $(BLASTEST_DIR)/$(BLASTEST_F2C_LIB_NAME)
@echo "Removing binaries from ./$(BLASTEST_DIR)."
@echo "Removing binaries from ./$(BLASTEST_DIR)"
@- $(FIND) $(BLASTEST_DIR) -name "*.x" | $(XARGS) $(RM_F)
@echo "Removing driver output files 'out.*' from ./$(BLASTEST_DIR)."
@echo "Removing driver output files 'out.*' from ./$(BLASTEST_DIR)"
@- $(RM_F) $(addprefix $(BLASTEST_DIR)/out.,$(BLASTEST_DRV_BASES))
endif # ENABLE_VERBOSE
endif # IS_CONFIGURED
@@ -1160,11 +1160,11 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(TESTSUITE_BIN)
- $(RM_F) $(TESTSUITE_OUT_FILE)
else
@echo "Removing object files from $(BASE_OBJ_TESTSUITE_PATH)."
@echo "Removing object files from $(BASE_OBJ_TESTSUITE_PATH)"
@- $(RM_F) $(MK_TESTSUITE_OBJS)
@echo "Removing binary $(TESTSUITE_BIN)."
@echo "Removing binary $(TESTSUITE_BIN)"
@- $(RM_F) $(TESTSUITE_BIN)
@echo "Removing $(TESTSUITE_OUT_FILE)."
@echo "Removing $(TESTSUITE_OUT_FILE)"
@- $(RM_F) $(TESTSUITE_OUT_FILE)
endif # ENABLE_VERBOSE
endif # IS_CONFIGURED
@@ -1176,9 +1176,9 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
- $(MAKE) -C $(CPP_TEST_DIR) clean
else
@echo "Removing object files from $(TESTSUITE_DIR)/$(OBJ_DIR)."
@echo "Removing object files from $(TESTSUITE_DIR)/$(OBJ_DIR)"
@- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
@echo "Removing binary $(TESTSUITE_DIR)/$(TESTSUITE_BIN)."
@echo "Removing binary $(TESTSUITE_DIR)/$(TESTSUITE_BIN)"
@- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
@$(MAKE) -C $(CPP_TEST_DIR) clean
endif # ENABLE_VERBOSE
@@ -1193,15 +1193,15 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_RF) $(LIB_DIR)
- $(RM_RF) $(INCLUDE_DIR)
else
@echo "Removing $(BLIS_CONFIG_H)."
@echo "Removing $(BLIS_CONFIG_H)"
@$(RM_F) $(BLIS_CONFIG_H)
@echo "Removing $(CONFIG_MK_FILE)."
@echo "Removing $(CONFIG_MK_FILE)"
@- $(RM_F) $(CONFIG_MK_FILE)
@echo "Removing $(OBJ_DIR)."
@echo "Removing $(OBJ_DIR)"
@- $(RM_RF) $(OBJ_DIR)
@echo "Removing $(LIB_DIR)."
@echo "Removing $(LIB_DIR)"
@- $(RM_RF) $(LIB_DIR)
@echo "Removing $(INCLUDE_DIR)."
@echo "Removing $(INCLUDE_DIR)"
@- $(RM_RF) $(INCLUDE_DIR)
endif
endif
@@ -1210,7 +1210,7 @@ endif
# --- CHANGELOG rules ---
changelog:
@echo "Updating '$(DIST_PATH)/$(CHANGELOG)' via '$(GIT_LOG)'."
@echo "Updating '$(DIST_PATH)/$(CHANGELOG)' via '$(GIT_LOG)'"
@$(GIT_LOG) > $(DIST_PATH)/$(CHANGELOG)
@@ -1225,7 +1225,7 @@ uninstall-libs: check-env
ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(MK_LIBS_INST)
else
@echo "Uninstalling libraries $(notdir $(MK_LIBS_INST)) from $(dir $(firstword $(MK_LIBS_INST)))."
@echo "Uninstalling libraries $(notdir $(MK_LIBS_INST)) from $(dir $(firstword $(MK_LIBS_INST)))"
@- $(RM_F) $(MK_LIBS_INST)
endif
@@ -1233,7 +1233,7 @@ uninstall-lib-symlinks: check-env
ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(MK_LIBS_SYML)
else
@echo "Uninstalling symlinks $(notdir $(MK_LIBS_SYML)) from $(dir $(firstword $(MK_LIBS_SYML)))."
@echo "Uninstalling symlinks $(notdir $(MK_LIBS_SYML)) from $(dir $(firstword $(MK_LIBS_SYML)))"
@- $(RM_F) $(MK_LIBS_SYML)
endif
@@ -1241,7 +1241,7 @@ uninstall-headers: check-env
ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_RF) $(MK_INCL_DIR_INST)
else
@echo "Uninstalling directory '$(notdir $(MK_INCL_DIR_INST))' from $(dir $(MK_INCL_DIR_INST))."
@echo "Uninstalling directory '$(notdir $(MK_INCL_DIR_INST))' from $(dir $(MK_INCL_DIR_INST))"
@- $(RM_RF) $(MK_INCL_DIR_INST)
endif
@@ -1249,7 +1249,7 @@ uninstall-share: check-env
ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_RF) $(MK_SHARE_DIR_INST)
else
@echo "Uninstalling directory '$(notdir $(MK_SHARE_DIR_INST))' from $(dir $(MK_SHARE_DIR_INST))."
@echo "Uninstalling directory '$(notdir $(MK_SHARE_DIR_INST))' from $(dir $(MK_SHARE_DIR_INST))"
@- $(RM_RF) $(MK_SHARE_DIR_INST)
endif
@@ -1265,7 +1265,7 @@ $(UNINSTALL_OLD_LIBS) $(UNINSTALL_OLD_SYML) $(UNINSTALL_OLD_HEADERS): check-env
ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $@
else
@echo "Uninstalling $(@F) from $(@D)/."
@echo "Uninstalling $(@F) from $(@D)/"
@- $(RM_F) $@
endif

View File

@@ -6,6 +6,7 @@ Contents
--------
* **[Introduction](#introduction)**
* **[Education and Learning](#education-and-learning)**
* **[What's New](#whats-new)**
* **[What People Are Saying About BLIS](#what-people-are-saying-about-blis)**
* **[Key Features](#key-features)**
@@ -76,9 +77,38 @@ and [collaborators](http://shpc.ices.utexas.edu/collaborators.html),
[publications](http://shpc.ices.utexas.edu/publications.html),
and [other educational projects](http://www.ulaff.net/) (such as MOOCs).
Education and Learning
----------------------
Want to understand what's under the hood?
Many of the same concepts and principles employed when developing BLIS are
introduced and taught in a basic pedagogical setting as part of
[LAFF-On Programming for High Performance (LAFF-On-PfHP)](http://www.ulaff.net/),
one of several massive open online courses (MOOCs) in the
[Linear Algebra: Foundations to Frontiers](http://www.ulaff.net/) series,
all of which are available for free via the [edX platform](http://www.edx.org/).
What's New
----------
* **Small/skinny matrix support for dgemm now available!** Thanks to
contributions made possible by our partnership with AMD, we have dramatically
accelerated `gemm` for double-precision real matrix problems where one or two
dimensions is exceedingly small. A natural byproduct of this optimization is
that the traditional case of small _m = n = k_ (i.e. square matrices) is also
accelerated, even though it was not targeted specifically. And though only
`dgemm` was optimized for now, support for other datatypes, other operations,
and/or multithreading may be implemented in the future. We've also added a new
[PerformanceSmall](docs/PerformanceSmall.md) document to showcase the
improvement in performance when some matrix dimensions are small.
* **Performance comparisons now available!** We recently measured the
performance of various level-3 operations on a variety of hardware architectures,
as implemented within BLIS and other BLAS libraries for all four of the standard
floating-point datatypes. The results speak for themselves! Check out our
extensive performance graphs and background info in our new
[Performance](docs/Performance.md) document.
* **BLIS is now in Debian Unstable!** Thanks to Debian developer-maintainers
[M. Zhou](https://github.com/cdluminate) and
[Nico Schlömer](https://github.com/nschloe) for sponsoring our package in Debian.
@@ -87,7 +117,7 @@ the second-most popular Linux distribution (behind Ubuntu, which Debian packages
feed into). The Debian tracker page may be found
[here](https://tracker.debian.org/pkg/blis).
* **BLIS now supports mixed-datatype gemm.** The `gemm` operation may now be
* **BLIS now supports mixed-datatype gemm!** The `gemm` operation may now be
executed on operands of mixed domains and/or mixed precisions. Any combination
of storage datatype for A, B, and C is now supported, along with a separate
computation precision that can differ from the storage precision of A and B.
@@ -313,10 +343,20 @@ table of supported microarchitectures.
* **[Multithreading](docs/Multithreading.md).** This document describes how to
use the multithreading features of BLIS.
* **[Mixed-Datatype](docs/MixedDatatype.md).** This document provides an
* **[Mixed-Datatypes](docs/MixedDatatypes.md).** This document provides an
overview of BLIS's mixed-datatype functionality and provides a brief example
of how to take advantage of this new code.
* **[Performance](docs/Performance.md).** This document reports empirically
measured performance of a representative set of level-3 operations on a variety
of hardware architectures, as implemented within BLIS and other BLAS libraries
for all four of the standard floating-point datatypes.
* **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports
empirically measured performance of `gemm` on select hardware architectures
within BLIS and other BLAS libraries when performing matrix problems where one
or two dimensions is exceedingly small.
* **[Release Notes](docs/ReleaseNotes.md).** This document tracks a summary of
changes included with each new version of BLIS, along with contributor credits
for key features.

View File

@@ -136,7 +136,7 @@ CFLAGS += -Wno-maybe-uninitialized -Wno-parentheses -Wfatal-errors \
-I$(INC_PATH) -DHAVE_BLIS_H
# Locate the libblis library to which we will link.
LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
# Override the location of the check-blastest.sh script.
#BLASTEST_CHECK := ./check-blastest.sh

View File

@@ -135,6 +135,12 @@
#endif
#endif
#if @enable_sup_handling@
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif
#if @enable_memkind@
#define BLIS_ENABLE_MEMKIND
#else
@@ -159,4 +165,5 @@
#define BLIS_DISABLE_SHARED
#endif
#endif

View File

@@ -115,13 +115,33 @@ THREADING_MODEL := @threading_model@
# Whether the compiler supports "#pragma omp simd" via the -fopenmp-simd option.
PRAGMA_OMP_SIMD := @pragma_omp_simd@
# The install libdir, includedir, and shareddir values from configure tell
# us where to install the libraries, header files, and public makefile
# fragments, respectively. Notice that we support the use of DESTDIR so that
# advanced users may install to a temporary location.
INSTALL_LIBDIR := $(DESTDIR)@install_libdir@
INSTALL_INCDIR := $(DESTDIR)@install_incdir@
INSTALL_SHAREDIR := $(DESTDIR)@install_sharedir@
# The installation prefix, exec_prefix, libdir, includedir, and shareddir
# values from configure tell us where to install the libraries, header files,
# and public makefile fragments. We must first assign each substituted
# @anchor@ to its own variable. Why? Because the subsitutions may contain
# unevaluated variable expressions. For example, '@libdir@' may be replaced
# with '${exec_prefix}/lib'. By assigning the anchors to variables first, and
# then assigning them to their final INSTALL_* variables, we allow prefix and
# exec_prefix to be used in the definitions of exec_prefix, libdir,
# includedir, and sharedir.
prefix := @prefix@
exec_prefix := @exec_prefix@
libdir := @libdir@
includedir := @includedir@
sharedir := @sharedir@
# Notice that we support the use of DESTDIR so that advanced users may install
# to a temporary location.
INSTALL_LIBDIR := $(DESTDIR)$(libdir)
INSTALL_INCDIR := $(DESTDIR)$(includedir)
INSTALL_SHAREDIR := $(DESTDIR)$(sharedir)
#$(info prefix = $(prefix) )
#$(info exec_prefix = $(exec_prefix) )
#$(info libdir = $(libdir) )
#$(info includedir = $(includedir) )
#$(info sharedir = $(sharedir) )
#$(error .)
# Whether to output verbose command-line feedback as the Makefile is
# processed.
@@ -135,11 +155,15 @@ BUILDING_OOT := @configured_oot@
ARG_MAX_HACK := @enable_arg_max_hack@
# Whether to build the static and shared libraries.
# Note the "MK_" prefix, which helps differentiate these variables from
# NOTE: The "MK_" prefix, which helps differentiate these variables from
# their corresonding cpp macros that use the BLIS_ prefix.
MK_ENABLE_STATIC := @enable_static@
MK_ENABLE_SHARED := @enable_shared@
# Whether to export all symbols within the shared library, even those symbols
# that are considered to be for internal use only.
EXPORT_SHARED := @export_shared@
# Whether to enable either the BLAS or CBLAS compatibility layers.
MK_ENABLE_BLAS := @enable_blas@
MK_ENABLE_CBLAS := @enable_cblas@

View File

@@ -33,6 +33,7 @@
*/
#define BLIS_EXPORT_BLIS
#include "bli_system.h"
#include "bli_type_defs.h"
#include "bli_arch.h"

View File

@@ -244,10 +244,24 @@ def flatten_header( inputfile, header_dirpaths, cursp ):
# directive.
header_path = get_header_path( header, header_dirpaths )
# If the header was found, we recurse. Otherwise, we output
# the #include directive with a comment indicating that it
# was skipped.
if header_path:
# First, check if the header is our root header (and if so, ignore it).
# Otherwise, if the header was found, we recurse. Otherwise, we output
# the #include directive with a comment indicating that it as skipped
if header == root_inputfile:
markl = result.group(1)
markr = result.group(3)
echov2( "%sthis is the root header '%s'; commenting out / skipping." \
% ( cursp, header ) )
# If the header found is our root header, then we cannot
# recurse into it lest we enter an infinite loop. Output the
# line but make sure it's commented out entirely.
ostring += "%s #include %c%s%c %c" \
% ( skipstr, markl, header, markr, '\n' )
elif header_path:
echov2( "%slocated file '%s'; recursing." \
% ( cursp, header_path ) )
@@ -327,6 +341,7 @@ strip_comments = None
recursive_flag = None
verbose_flag = None
regex = None
root_inputfile = None
def main():
@@ -336,6 +351,7 @@ def main():
global recursive_flag
global verbose_flag
global regex
global root_inputfile
# Obtain the script name.
path, script_name = os.path.split(sys.argv[0])
@@ -397,6 +413,10 @@ def main():
temp_dir = args[2]
dir_list = args[3]
# Save the filename (basename) part of the input file (or root file) into a
# global variable that we can access later from within flatten_header().
root_inputfile = os.path.basename( inputfile )
# Separate the directories into distinct strings.
dir_list = dir_list.split()

View File

@@ -417,8 +417,9 @@ main()
# The arguments to this function. They'll get assigned meaningful
# values after getopts.
mkfile_frag_tmpl_path=""
root_dir=""
frag_dir=""
mkfile_frag_tmpl_path=""
suffix_file=""
ignore_file=""

View File

@@ -183,13 +183,11 @@ bli_cgemm4mb
bli_cgemm4mb_ker_var2
bli_cgemm4mh
bli_cgemm_ex
bli_cgemm_haswell_asm_3x8
bli_cgemm_haswell_asm_8x3
bli_cgemm_ker_var2
bli_cgemm_md_c2r_ref
bli_cgemm_ukernel
bli_cgemmtrsm_l_ukernel
bli_cgemmtrsm_u_ukernel
bli_cgemm_ukernel
bli_cgemv
bli_cgemv_ex
bli_cgemv_unb_var1
@@ -285,12 +283,6 @@ bli_chemv_unf_var3a
bli_cher
bli_cher2
bli_cher2_ex
bli_cher2_unb_var1
bli_cher2_unb_var2
bli_cher2_unb_var3
bli_cher2_unb_var4
bli_cher2_unf_var1
bli_cher2_unf_var4
bli_cher2k
bli_cher2k1m
bli_cher2k3m1
@@ -298,9 +290,13 @@ bli_cher2k3mh
bli_cher2k4m1
bli_cher2k4mh
bli_cher2k_ex
bli_cher2_unb_var1
bli_cher2_unb_var2
bli_cher2_unb_var3
bli_cher2_unb_var4
bli_cher2_unf_var1
bli_cher2_unf_var4
bli_cher_ex
bli_cher_unb_var1
bli_cher_unb_var2
bli_cherk
bli_cherk1m
bli_cherk3m1
@@ -310,6 +306,8 @@ bli_cherk4mh
bli_cherk_ex
bli_cherk_l_ker_var2
bli_cherk_u_ker_var2
bli_cher_unb_var1
bli_cher_unb_var2
bli_cinvertd
bli_cinvertd_ex
bli_cinvertsc
@@ -354,8 +352,8 @@ bli_cntl_copy
bli_cntl_create_node
bli_cntl_free
bli_cntl_free_node
bli_cntl_free_w_thrinfo
bli_cntl_free_wo_thrinfo
bli_cntl_free_w_thrinfo
bli_cntl_mark_family
bli_cntx_1m_stage
bli_cntx_3m1_stage
@@ -544,8 +542,8 @@ bli_ctrsm1m
bli_ctrsm3m1
bli_ctrsm4m1
bli_ctrsm_ex
bli_ctrsm_l_ukernel
bli_ctrsm_ll_ker_var2
bli_ctrsm_l_ukernel
bli_ctrsm_lu_ker_var2
bli_ctrsm_rl_ker_var2
bli_ctrsm_ru_ker_var2
@@ -591,7 +589,6 @@ bli_daddv
bli_daddv_ex
bli_damaxv
bli_damaxv_ex
bli_damaxv_zen_int
bli_dasumv
bli_dasumv_ex
bli_dasumv_unb_var1
@@ -603,14 +600,11 @@ bli_daxpyd
bli_daxpyd_ex
bli_daxpyf
bli_daxpyf_ex
bli_daxpyf_zen_int_8
bli_daxpym
bli_daxpym_ex
bli_daxpym_unb_var1
bli_daxpyv
bli_daxpyv_ex
bli_daxpyv_zen_int
bli_daxpyv_zen_int10
bli_dccastm
bli_dccastnzm
bli_dccastv
@@ -640,16 +634,12 @@ bli_ddotaxpyv
bli_ddotaxpyv_ex
bli_ddotv
bli_ddotv_ex
bli_ddotv_zen_int
bli_ddotv_zen_int10
bli_ddotxaxpyf
bli_ddotxaxpyf_ex
bli_ddotxf
bli_ddotxf_ex
bli_ddotxf_zen_int_8
bli_ddotxv
bli_ddotxv_ex
bli_ddotxv_zen_int
bli_ddpackm_blk_var1_md
bli_ddpackm_cxk_1e_md
bli_ddpackm_cxk_1r_md
@@ -673,14 +663,10 @@ bli_dgemm4mb
bli_dgemm4mb_ker_var2
bli_dgemm4mh
bli_dgemm_ex
bli_dgemm_haswell_asm_6x8
bli_dgemm_haswell_asm_8x6
bli_dgemm_ker_var2
bli_dgemm_ukernel
bli_dgemmtrsm_l_haswell_asm_6x8
bli_dgemmtrsm_l_ukernel
bli_dgemmtrsm_u_haswell_asm_6x8
bli_dgemmtrsm_u_ukernel
bli_dgemm_ukernel
bli_dgemv
bli_dgemv_ex
bli_dgemv_unb_var1
@@ -713,12 +699,6 @@ bli_dhemv_unf_var3a
bli_dher
bli_dher2
bli_dher2_ex
bli_dher2_unb_var1
bli_dher2_unb_var2
bli_dher2_unb_var3
bli_dher2_unb_var4
bli_dher2_unf_var1
bli_dher2_unf_var4
bli_dher2k
bli_dher2k1m
bli_dher2k3m1
@@ -726,9 +706,13 @@ bli_dher2k3mh
bli_dher2k4m1
bli_dher2k4mh
bli_dher2k_ex
bli_dher2_unb_var1
bli_dher2_unb_var2
bli_dher2_unb_var3
bli_dher2_unb_var4
bli_dher2_unf_var1
bli_dher2_unf_var4
bli_dher_ex
bli_dher_unb_var1
bli_dher_unb_var2
bli_dherk
bli_dherk1m
bli_dherk3m1
@@ -738,6 +722,8 @@ bli_dherk4mh
bli_dherk_ex
bli_dherk_l_ker_var2
bli_dherk_u_ker_var2
bli_dher_unb_var1
bli_dher_unb_var2
bli_dinvertd
bli_dinvertd_ex
bli_dinvertsc
@@ -746,11 +732,6 @@ bli_dinvertv_ex
bli_divsc
bli_divsc_check
bli_divsc_qfp
bli_dlamc1
bli_dlamc2
bli_dlamc3
bli_dlamc4
bli_dlamc5
bli_dlamch
bli_dmachval
bli_dmkherm
@@ -838,8 +819,6 @@ bli_dscalm_ex
bli_dscalm_unb_var1
bli_dscalv
bli_dscalv_ex
bli_dscalv_zen_int
bli_dscalv_zen_int10
bli_dscastm
bli_dscastnzm
bli_dscastv
@@ -906,11 +885,6 @@ bli_dsyrk3mh
bli_dsyrk4m1
bli_dsyrk4mh
bli_dsyrk_ex
bli_dt_size
bli_dt_size_check
bli_dt_string
bli_dt_string_check
bli_dt_union_check
bli_dtrmm
bli_dtrmm1m
bli_dtrmm3
@@ -938,8 +912,8 @@ bli_dtrsm1m
bli_dtrsm3m1
bli_dtrsm4m1
bli_dtrsm_ex
bli_dtrsm_l_ukernel
bli_dtrsm_ll_ker_var2
bli_dtrsm_l_ukernel
bli_dtrsm_lu_ker_var2
bli_dtrsm_rl_ker_var2
bli_dtrsm_ru_ker_var2
@@ -950,6 +924,11 @@ bli_dtrsv_unb_var1
bli_dtrsv_unb_var2
bli_dtrsv_unf_var1
bli_dtrsv_unf_var2
bli_dt_size
bli_dt_size_check
bli_dt_string
bli_dt_string_check
bli_dt_union_check
bli_dunpackm_blk_var1
bli_dunpackm_cxk
bli_dunpackm_unb_var1
@@ -1018,6 +997,7 @@ bli_gemm_basic_check
bli_gemm_blk_var1
bli_gemm_blk_var2
bli_gemm_blk_var3
bli_gemmbp_cntl_create
bli_gemm_check
bli_gemm_cntl_create
bli_gemm_cntl_create_node
@@ -1028,6 +1008,8 @@ bli_gemm_determine_kc_f
bli_gemm_direct
bli_gemm_ex
bli_gemm_front
bli_gemmind
bli_gemmind_get_avail
bli_gemm_int
bli_gemm_ker_var2
bli_gemm_ker_var2_md
@@ -1040,20 +1022,17 @@ bli_gemm_md_rcc
bli_gemm_md_rcr
bli_gemm_md_rrc
bli_gemm_md_rrr
bli_gemmnat
bli_gemm_packa
bli_gemm_packb
bli_gemm_prune_unref_mparts_k
bli_gemm_prune_unref_mparts_m
bli_gemm_prune_unref_mparts_n
bli_gemmtrsm_l_ukernel_qfp
bli_gemmtrsm_ukernel
bli_gemmtrsm_u_ukernel_qfp
bli_gemm_ukernel
bli_gemm_ukernel_qfp
bli_gemmbp_cntl_create
bli_gemmind
bli_gemmind_get_avail
bli_gemmnat
bli_gemmtrsm_l_ukernel_qfp
bli_gemmtrsm_u_ukernel_qfp
bli_gemmtrsm_ukernel
bli_gemv
bli_gemv_check
bli_gemv_ex
@@ -1120,30 +1099,18 @@ bli_hemv_unb_var3_qfp
bli_hemv_unb_var4
bli_hemv_unb_var4_qfp
bli_hemv_unf_var1
bli_hemv_unf_var1_qfp
bli_hemv_unf_var1a
bli_hemv_unf_var1a_qfp
bli_hemv_unf_var1_qfp
bli_hemv_unf_var3
bli_hemv_unf_var3_qfp
bli_hemv_unf_var3a
bli_hemv_unf_var3a_qfp
bli_hemv_unf_var3_qfp
bli_her
bli_her2
bli_her2_check
bli_her2_ex
bli_her2_ex_qfp
bli_her2_unb_var1
bli_her2_unb_var1_qfp
bli_her2_unb_var2
bli_her2_unb_var2_qfp
bli_her2_unb_var3
bli_her2_unb_var3_qfp
bli_her2_unb_var4
bli_her2_unb_var4_qfp
bli_her2_unf_var1
bli_her2_unf_var1_qfp
bli_her2_unf_var4
bli_her2_unf_var4_qfp
bli_her2k
bli_her2k1m
bli_her2k3m1
@@ -1157,13 +1124,21 @@ bli_her2k_front
bli_her2kind
bli_her2kind_get_avail
bli_her2knat
bli_her2_unb_var1
bli_her2_unb_var1_qfp
bli_her2_unb_var2
bli_her2_unb_var2_qfp
bli_her2_unb_var3
bli_her2_unb_var3_qfp
bli_her2_unb_var4
bli_her2_unb_var4_qfp
bli_her2_unf_var1
bli_her2_unf_var1_qfp
bli_her2_unf_var4
bli_her2_unf_var4_qfp
bli_her_check
bli_her_ex
bli_her_ex_qfp
bli_her_unb_var1
bli_her_unb_var1_qfp
bli_her_unb_var2
bli_her_unb_var2_qfp
bli_herk
bli_herk1m
bli_herk3m1
@@ -1178,15 +1153,19 @@ bli_herk_determine_kc_f
bli_herk_direct
bli_herk_ex
bli_herk_front
bli_herkind
bli_herkind_get_avail
bli_herk_l_ker_var2
bli_herknat
bli_herk_prune_unref_mparts_k
bli_herk_prune_unref_mparts_m
bli_herk_prune_unref_mparts_n
bli_herk_u_ker_var2
bli_herk_x_ker_var2
bli_herkind
bli_herkind_get_avail
bli_herknat
bli_her_unb_var1
bli_her_unb_var1_qfp
bli_her_unb_var2
bli_her_unb_var2_qfp
bli_ifprintm
bli_ifprintv
bli_igetsc
@@ -1217,9 +1196,9 @@ bli_info_get_enable_sba_pools
bli_info_get_enable_stay_auto_init
bli_info_get_enable_threading
bli_info_get_gemm_impl_string
bli_info_get_gemm_ukr_impl_string
bli_info_get_gemmtrsm_l_ukr_impl_string
bli_info_get_gemmtrsm_u_ukr_impl_string
bli_info_get_gemm_ukr_impl_string
bli_info_get_heap_addr_align_size
bli_info_get_heap_stride_align_size
bli_info_get_hemm_impl_string
@@ -1278,12 +1257,12 @@ bli_l1d_xy_check
bli_l1m_ax_check
bli_l1m_axy_check
bli_l1m_xy_check
bli_l1v_ax_check
bli_l1v_axby_check
bli_l1v_ax_check
bli_l1v_axy_check
bli_l1v_dot_check
bli_l1v_x_check
bli_l1v_xby_check
bli_l1v_x_check
bli_l1v_xi_check
bli_l1v_xy_check
bli_l3_basic_check
@@ -1452,12 +1431,10 @@ bli_pool_init
bli_pool_print
bli_pool_reinit
bli_pool_shrink
bli_pow_di
bli_pow_ri
bli_prime_factorization
bli_print_msg
bli_printm
bli_printm_ex
bli_print_msg
bli_printv
bli_printv_ex
bli_projm
@@ -1510,7 +1487,6 @@ bli_saddv
bli_saddv_ex
bli_samaxv
bli_samaxv_ex
bli_samaxv_zen_int
bli_sasumv
bli_sasumv_ex
bli_sasumv_unb_var1
@@ -1522,14 +1498,11 @@ bli_saxpyd
bli_saxpyd_ex
bli_saxpyf
bli_saxpyf_ex
bli_saxpyf_zen_int_8
bli_saxpym
bli_saxpym_ex
bli_saxpym_unb_var1
bli_saxpyv
bli_saxpyv_ex
bli_saxpyv_zen_int
bli_saxpyv_zen_int10
bli_sba_acquire
bli_sba_checkin_array
bli_sba_checkout_array
@@ -1591,16 +1564,12 @@ bli_sdotaxpyv
bli_sdotaxpyv_ex
bli_sdotv
bli_sdotv_ex
bli_sdotv_zen_int
bli_sdotv_zen_int10
bli_sdotxaxpyf
bli_sdotxaxpyf_ex
bli_sdotxf
bli_sdotxf_ex
bli_sdotxf_zen_int_8
bli_sdotxv
bli_sdotxv_ex
bli_sdotxv_zen_int
bli_sdpackm_blk_var1_md
bli_sdpackm_cxk_1e_md
bli_sdpackm_cxk_1r_md
@@ -1643,14 +1612,10 @@ bli_sgemm4mb
bli_sgemm4mb_ker_var2
bli_sgemm4mh
bli_sgemm_ex
bli_sgemm_haswell_asm_16x6
bli_sgemm_haswell_asm_6x16
bli_sgemm_ker_var2
bli_sgemm_ukernel
bli_sgemmtrsm_l_haswell_asm_6x16
bli_sgemmtrsm_l_ukernel
bli_sgemmtrsm_u_haswell_asm_6x16
bli_sgemmtrsm_u_ukernel
bli_sgemm_ukernel
bli_sgemv
bli_sgemv_ex
bli_sgemv_unb_var1
@@ -1683,12 +1648,6 @@ bli_shemv_unf_var3a
bli_sher
bli_sher2
bli_sher2_ex
bli_sher2_unb_var1
bli_sher2_unb_var2
bli_sher2_unb_var3
bli_sher2_unb_var4
bli_sher2_unf_var1
bli_sher2_unf_var4
bli_sher2k
bli_sher2k1m
bli_sher2k3m1
@@ -1696,9 +1655,13 @@ bli_sher2k3mh
bli_sher2k4m1
bli_sher2k4mh
bli_sher2k_ex
bli_sher2_unb_var1
bli_sher2_unb_var2
bli_sher2_unb_var3
bli_sher2_unb_var4
bli_sher2_unf_var1
bli_sher2_unf_var4
bli_sher_ex
bli_sher_unb_var1
bli_sher_unb_var2
bli_sherk
bli_sherk1m
bli_sherk3m1
@@ -1708,6 +1671,8 @@ bli_sherk4mh
bli_sherk_ex
bli_sherk_l_ker_var2
bli_sherk_u_ker_var2
bli_sher_unb_var1
bli_sher_unb_var2
bli_shiftd
bli_shiftd_check
bli_shiftd_ex
@@ -1717,11 +1682,6 @@ bli_sinvertd_ex
bli_sinvertsc
bli_sinvertv
bli_sinvertv_ex
bli_slamc1
bli_slamc2
bli_slamc3
bli_slamc4
bli_slamc5
bli_slamch
bli_sleep
bli_smachval
@@ -1793,8 +1753,6 @@ bli_sscalm_ex
bli_sscalm_unb_var1
bli_sscalv
bli_sscalv_ex
bli_sscalv_zen_int
bli_sscalv_zen_int10
bli_sscastm
bli_sscastnzm
bli_sscastv
@@ -1889,8 +1847,8 @@ bli_strsm1m
bli_strsm3m1
bli_strsm4m1
bli_strsm_ex
bli_strsm_l_ukernel
bli_strsm_ll_ker_var2
bli_strsm_l_ukernel
bli_strsm_lu_ker_var2
bli_strsm_rl_ker_var2
bli_strsm_ru_ker_var2
@@ -2062,17 +2020,17 @@ bli_trmm_determine_kc_f
bli_trmm_direct
bli_trmm_ex
bli_trmm_front
bli_trmmind
bli_trmmind_get_avail
bli_trmm_ll_ker_var2
bli_trmm_lu_ker_var2
bli_trmmnat
bli_trmm_prune_unref_mparts_k
bli_trmm_prune_unref_mparts_m
bli_trmm_prune_unref_mparts_n
bli_trmm_rl_ker_var2
bli_trmm_ru_ker_var2
bli_trmm_xx_ker_var2
bli_trmmind
bli_trmmind_get_avail
bli_trmmnat
bli_trmv
bli_trmv_check
bli_trmv_ex
@@ -2102,11 +2060,14 @@ bli_trsm_determine_kc_f
bli_trsm_direct
bli_trsm_ex
bli_trsm_front
bli_trsmind
bli_trsmind_get_avail
bli_trsm_int
bli_trsm_l_cntl_create
bli_trsm_l_ukernel_qfp
bli_trsm_ll_ker_var2
bli_trsm_l_ukernel_qfp
bli_trsm_lu_ker_var2
bli_trsmnat
bli_trsm_packa
bli_trsm_packb
bli_trsm_prune_unref_mparts_k
@@ -2115,12 +2076,9 @@ bli_trsm_prune_unref_mparts_n
bli_trsm_r_cntl_create
bli_trsm_rl_ker_var2
bli_trsm_ru_ker_var2
bli_trsm_u_ukernel_qfp
bli_trsm_ukernel
bli_trsm_u_ukernel_qfp
bli_trsm_xx_ker_var2
bli_trsmind
bli_trsmind_get_avail
bli_trsmnat
bli_trsv
bli_trsv_check
bli_trsv_ex
@@ -2245,13 +2203,11 @@ bli_zgemm4mb
bli_zgemm4mb_ker_var2
bli_zgemm4mh
bli_zgemm_ex
bli_zgemm_haswell_asm_3x4
bli_zgemm_haswell_asm_4x3
bli_zgemm_ker_var2
bli_zgemm_md_c2r_ref
bli_zgemm_ukernel
bli_zgemmtrsm_l_ukernel
bli_zgemmtrsm_u_ukernel
bli_zgemm_ukernel
bli_zgemv
bli_zgemv_ex
bli_zgemv_unb_var1
@@ -2284,12 +2240,6 @@ bli_zhemv_unf_var3a
bli_zher
bli_zher2
bli_zher2_ex
bli_zher2_unb_var1
bli_zher2_unb_var2
bli_zher2_unb_var3
bli_zher2_unb_var4
bli_zher2_unf_var1
bli_zher2_unf_var4
bli_zher2k
bli_zher2k1m
bli_zher2k3m1
@@ -2297,9 +2247,13 @@ bli_zher2k3mh
bli_zher2k4m1
bli_zher2k4mh
bli_zher2k_ex
bli_zher2_unb_var1
bli_zher2_unb_var2
bli_zher2_unb_var3
bli_zher2_unb_var4
bli_zher2_unf_var1
bli_zher2_unf_var4
bli_zher_ex
bli_zher_unb_var1
bli_zher_unb_var2
bli_zherk
bli_zherk1m
bli_zherk3m1
@@ -2309,6 +2263,8 @@ bli_zherk4mh
bli_zherk_ex
bli_zherk_l_ker_var2
bli_zherk_u_ker_var2
bli_zher_unb_var1
bli_zher_unb_var2
bli_zinvertd
bli_zinvertd_ex
bli_zinvertsc
@@ -2492,8 +2448,8 @@ bli_ztrsm1m
bli_ztrsm3m1
bli_ztrsm4m1
bli_ztrsm_ex
bli_ztrsm_l_ukernel
bli_ztrsm_ll_ker_var2
bli_ztrsm_l_ukernel
bli_ztrsm_lu_ker_var2
bli_ztrsm_rl_ker_var2
bli_ztrsm_ru_ker_var2
@@ -2528,19 +2484,6 @@ bli_zzpackm_struc_cxk_md
bli_zzxpbym_md
bli_zzxpbym_md_ex
bli_zzxpbym_md_unb_var1
bla_c_abs
bla_c_div
bla_d_abs
bla_d_cnjg
bla_d_imag
bla_d_sign
bla_f__cabs
bla_r_abs
bla_r_cnjg
bla_r_imag
bla_r_sign
bla_z_abs
bla_z_div
sasum_
sasumsub_
saxpy_
@@ -2567,14 +2510,14 @@ srotmg_
ssbmv_
sscal_
sspmv_
sspr2_
sspr_
sspr2_
sswap_
ssymm_
ssymv_
ssyr_
ssyr2_
ssyr2k_
ssyr_
ssyrk_
stbmv_
stbsv_
@@ -2606,14 +2549,14 @@ dscal_
dsdot_
dsdotsub_
dspmv_
dspr2_
dspr_
dspr2_
dswap_
dsymm_
dsymv_
dsyr_
dsyr2_
dsyr2k_
dsyr_
dsyrk_
dtbmv_
dtbsv_
@@ -2641,13 +2584,13 @@ cgeru_
chbmv_
chemm_
chemv_
cher_
cher2_
cher2k_
cher_
cherk_
chpmv_
chpr2_
chpr_
chpr2_
crotg_
cscal_
csrot_
@@ -2680,13 +2623,13 @@ zgeru_
zhbmv_
zhemm_
zhemv_
zher_
zher2_
zher2k_
zher_
zherk_
zhpmv_
zhpr2_
zhpr_
zhpr2_
zrotg_
zscal_
zswap_

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2019, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2019, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -5,6 +5,7 @@
# libraries.
#
# Copyright (C) 2019, The University of Texas at Austin
# Copyright (C) 2018, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are

118
common.mk
View File

@@ -118,7 +118,8 @@ get-noopt-cxxflags-for = $(strip $(CFLAGS_PRESET) \
get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
-DBLIS_CNAME=$(1) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
@@ -126,23 +127,27 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(COMPSIMDFLAGS) \
-DBLIS_CNAME=$(1) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
get-config-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
get-frame-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
$(call load-var-for,CKVECFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
# When compiling sandboxes, we use flags similar to those of general framework
@@ -153,19 +158,24 @@ get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
get-sandbox-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(CSBOXINCFLAGS) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
get-sandbox-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cxxflags-for,$(1)) \
$(CSBOXINCFLAGS) \
$(BUILD_FLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
# Define a separate function that will return appropriate flags for use by
# applications that want to use the same basic flags as those used when BLIS
# was compiled. (This is the same as get-frame-cflags-for(), except that it
# omits the BUILD_FLAGS, which are exclusively for use when BLIS is being
# compiled.)
# was compiled. (NOTE: This is the same as the $(get-frame-cflags-for ...)
# function, except that it omits two variables that contain flags exclusively
# for use when BLIS is being compiled/built: BUILD_CPPFLAGS, which contains a
# cpp macro that confirms that BLIS is being built; and BUILD_SYMFLAGS, which
# contains symbol export flags that are only needed when a shared library is
# being compiled/linked.)
get-user-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
)
@@ -508,9 +518,9 @@ SOFLAGS := -shared
ifeq ($(IS_WIN),yes)
# Windows shared library link flags.
ifeq ($(CC_VENDOR),clang)
SOFLAGS += -Wl,-def:build/libblis-symbols.def -Wl,-implib:$(BASE_LIB_PATH)/$(LIBBLIS).lib
SOFLAGS += -Wl,-implib:$(BASE_LIB_PATH)/$(LIBBLIS).lib
else
SOFLAGS += -Wl,--export-all-symbols -Wl,--out-implib,$(BASE_LIB_PATH)/$(LIBBLIS).dll.a
SOFLAGS += -Wl,--out-implib,$(BASE_LIB_PATH)/$(LIBBLIS).dll.a
endif
else
# Linux shared library link flags.
@@ -532,6 +542,11 @@ ifeq ($(IS_WIN),no)
LDFLAGS += -Wl,-rpath,$(BASE_LIB_PATH)
endif
endif
# On windows, use the shared library even if static is created.
ifeq ($(IS_WIN),yes)
LIBBLIS_L := $(LIBBLIS_SO)
LIBBLIS_LINK := $(LIBBLIS_SO_PATH)
endif
endif
@@ -610,7 +625,7 @@ endif
$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CWARNFLAGS,$(c))))
# --- Shared library (position-independent code) flags ---
# --- Position-independent code flags (shared libraries only) ---
# Emit position-independent code for dynamic linking.
ifeq ($(IS_WIN),yes)
@@ -622,6 +637,71 @@ CPICFLAGS := -fPIC
endif
$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPICFLAGS,$(c))))
# --- Symbol exporting flags (shared libraries only) ---
# NOTE: These flags are only applied when building BLIS and not used by
# applications that import BLIS compilation flags via the
# $(get-user-cflags-for ...) function.
# Determine default export behavior / visibility of symbols for gcc.
ifeq ($(CC_VENDOR),gcc)
ifeq ($(IS_WIN),yes)
ifeq ($(EXPORT_SHARED),all)
BUILD_SYMFLAGS := -Wl,--export-all-symbols, -Wl,--enable-auto-import
else # ifeq ($(EXPORT_SHARED),public)
BUILD_SYMFLAGS := -Wl,--exclude-all-symbols
endif
else # ifeq ($(IS_WIN),no)
ifeq ($(EXPORT_SHARED),all)
# Export all symbols by default.
BUILD_SYMFLAGS := -fvisibility=default
else # ifeq ($(EXPORT_SHARED),public)
# Hide all symbols by default and export only those that have been annotated
# as needing to be exported.
BUILD_SYMFLAGS := -fvisibility=hidden
endif
endif
endif
# Determine default export behavior / visibility of symbols for icc.
# NOTE: The Windows branches have been omitted since we currently make no
# effort to support Windows builds via icc (only gcc/clang via AppVeyor).
ifeq ($(CC_VENDOR),icc)
ifeq ($(EXPORT_SHARED),all)
# Export all symbols by default.
BUILD_SYMFLAGS := -fvisibility=default
else # ifeq ($(EXPORT_SHARED),public)
# Hide all symbols by default and export only those that have been annotated
# as needing to be exported.
BUILD_SYMFLAGS := -fvisibility=hidden
endif
endif
# Determine default export behavior / visibility of symbols for clang.
ifeq ($(CC_VENDOR),clang)
ifeq ($(IS_WIN),yes)
ifeq ($(EXPORT_SHARED),all)
# NOTE: clang on Windows does not appear to support exporting all symbols
# by default, and therefore we ignore the value of EXPORT_SHARED.
BUILD_SYMFLAGS :=
else # ifeq ($(EXPORT_SHARED),public)
# NOTE: The default behavior of clang on Windows is to hide all symbols
# and only export functions and other declarations that have beenannotated
# as needing to be exported.
BUILD_SYMFLAGS :=
endif
else # ifeq ($(IS_WIN),no)
ifeq ($(EXPORT_SHARED),all)
# Export all symbols by default.
BUILD_SYMFLAGS := -fvisibility=default
else # ifeq ($(EXPORT_SHARED),public)
# Hide all symbols by default and export only those that have been annotated
# as needing to be exported.
BUILD_SYMFLAGS := -fvisibility=hidden
endif
endif
endif
# --- Language flags ---
# Enable C99.
@@ -685,8 +765,18 @@ endif
# --- #pragma omp simd flags (used for reference kernels only) ---
ifeq ($(PRAGMA_OMP_SIMD),yes)
ifeq ($(CC_VENDOR),gcc)
COMPSIMDFLAGS := -fopenmp-simd
else
ifeq ($(CC_VENDOR),clang)
COMPSIMDFLAGS := -fopenmp-simd
else
ifeq ($(CC_VENDOR),icc)
COMPSIMDFLAGS := -qopenmp-simd
endif
endif
endif
else # ifeq ($(PRAGMA_OMP_SIMD),no)
COMPSIMDFLAGS :=
endif
@@ -960,7 +1050,7 @@ VERS_DEF := -DBLIS_VERSION_STRING=\"$(VERSION)\"
# Define a C preprocessor flag that is *only* defined when BLIS is being
# compiled. (In other words, an application that #includes blis.h will not
# get this cpp macro.)
BUILD_FLAGS := -DBLIS_IS_BUILDING_LIBRARY
BUILD_CPPFLAGS := -DBLIS_IS_BUILDING_LIBRARY

View File

@@ -57,7 +57,7 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -fomit-frame-pointer
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
@@ -74,7 +74,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS)
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -57,16 +57,16 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -funroll-all-loops
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1
CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1 -mno-tbm -mno-xop -mno-lwp
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1
CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif
@@ -74,7 +74,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -57,16 +57,16 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -fomit-frame-pointer
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver4
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver4
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif
@@ -74,7 +74,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -78,7 +78,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS)
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,9 +35,12 @@
#include "blis.h"
//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )
void bli_cntx_init_haswell( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_haswell_ref( cntx );
@@ -69,6 +73,7 @@ void bli_cntx_init_haswell( cntx_t* cntx )
cntx
);
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
@@ -118,12 +123,18 @@ void bli_cntx_init_haswell( cntx_t* cntx )
#if 1
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1008, 1008, 1008, 1008 );
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 72, 36 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 75, 192 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
#else
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 6, 6, 3, 3 );
#endif
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1024, 1024, 1024, 1024 );
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 112, 64, 56, 32 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 112, 72, 56, 44 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
#endif
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 );
@@ -144,5 +155,62 @@ void bli_cntx_init_haswell( cntx_t* cntx )
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 201, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 100, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 120, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
8,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1,
-1, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -36,7 +37,6 @@
//#define BLIS_FAMILY_H
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------

View File

@@ -63,13 +63,13 @@ endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell
else
ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xCORE-AVX2
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell
else
$(error gcc, icc, or clang is required for this configuration.)
endif
@@ -78,7 +78,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) #-funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -78,7 +78,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS)
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -70,7 +70,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Override the default value for LDFLAGS.
LDFLAGS := -mmic

View File

@@ -99,7 +99,7 @@ endif
# Note: We use AVX2 for reference kernels instead of AVX-512.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd
CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations
else
ifeq ($(CC_VENDOR),icc)
CRVECFLAGS := -xMIC-AVX512

View File

@@ -57,7 +57,7 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -fomit-frame-pointer
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
@@ -78,7 +78,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -57,16 +57,16 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -fomit-frame-pointer
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver2
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver2
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif
@@ -74,7 +74,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -63,13 +63,13 @@ endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx
CKVECFLAGS := -mavx -mfpmath=sse -march=sandybridge
else
ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xAVX
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx
CKVECFLAGS := -mavx -mfpmath=sse -march=sandybridge
else
$(error gcc, icc, or clang is required for this configuration.)
endif
@@ -78,7 +78,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -89,7 +89,7 @@ endif
# to overcome the AVX-512 frequency drop". (Issue #187)
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd
CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations
else
ifeq ($(CC_VENDOR),icc)
CRVECFLAGS := -xCORE-AVX2

View File

@@ -57,16 +57,16 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -fomit-frame-pointer
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver3
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver3 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver3
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver3 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif
@@ -74,7 +74,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -57,7 +57,7 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.

View File

@@ -78,7 +78,11 @@ endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS)
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -35,9 +35,12 @@
#include "blis.h"
//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )
void bli_cntx_init_zen( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_zen_ref( cntx );
@@ -114,23 +117,27 @@ void bli_cntx_init_zen( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
/*
Multi Instance performance improvement of DGEMM when binded to a CCX
In Multi instance each thread runs a sequential DGEMM.
a) If BLIS is run in a multi instance mode with
CPU freq 2.6/2.2 Ghz
DDR4 clock frequency 2400Mhz
Multi Instance performance degradation on different cores
a) CPU freq 2.6 Ghz
DDR4 2400
Multi instance mode
mc = 240, kc = 512, and nc = 2040
has better performance on EPYC server, over the default block sizes.
b) CPU freq 2.4Ghz
DDR4 2400
Multi Instance mode
either
mc = 240, kc = 512 and nc = 2040
(or)
mc = 390, kc = 512 and nc = 4080
b) If BLIS is run in Single Instance mode
c) Higher frequency(3.1Ghz), single instance mode choose default value
mc = 510, kc = 1024 and nc = 4080
*/
// Zen optmized level 3 cache block sizes
#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
#if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 );
@@ -138,7 +145,6 @@ void bli_cntx_init_zen( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
#else
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );
@@ -150,9 +156,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
#endif
//bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
@@ -172,5 +176,62 @@ void bli_cntx_init_zen( cntx_t* cntx )
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 100, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 120, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
8,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1,
-1, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}

View File

@@ -39,14 +39,13 @@
// By default, it is effective to parallelize the outer loops.
// Setting these macros to 1 will force JR and IR inner loops
// to be not paralleized.
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
#define BLIS_DEFAULT_MR_THREAD_MAX 1
#define BLIS_DEFAULT_NR_THREAD_MAX 1
#define BLIS_ENABLE_ZEN_BLOCK_SIZES
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
@@ -64,6 +63,15 @@
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES 90
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22
// Allow the sup implementation to combine some small edge case iterations in
// the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the
// block-panel algorithm (NR) with the last full iteration that precedes it.
// NOTE: These cpp macros need to be explicitly set to an integer since they
// are used at compile-time to create unconditional branches or dead code
// regions.
#define BLIS_ENABLE_SUP_MR_EXT 1
#define BLIS_ENABLE_SUP_NR_EXT 0
//#endif

View File

@@ -46,10 +46,27 @@ AMD_CONFIG_FILE := amd_config.mk
AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS += -march=znver1
endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -38,7 +38,7 @@
void bli_cntx_init_zen2( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_zen2_ref( cntx );
@@ -135,5 +135,61 @@ void bli_cntx_init_zen2( cntx_t* cntx )
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 100, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 120, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
8,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1,
-1, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}

View File

@@ -33,36 +33,56 @@
#
#
# FLAGS that are specific to 'zen2' architecture are added here.
# FLAGS that are common for all the AMD architectures are present in config/zen/amd_config.mk
#
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
THIS_CONFIG := zen2
#CONFIGS_INCL += $(THIS_CONFIG)
# Include file containing common flags for all AMD architectures
AMD_CONFIG_FILE := amd_config.mk
AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3 -fomit-frame-pointer
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
# gcc 9.0 (clang ?) or later:
GCC_VERSION := $(strip $(shell gcc -dumpversion))
ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
CKVECFLAGS += -march=znver2
#CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver2
# gcc 6.0 (clang 4.0) or later:
else
CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
endif
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 -mno-avx256-split-unaligned-store
# gcc 4.9 (clang 3.5) or later:
# possibly add zen-specific instructions: -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt
#CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif
endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
CRVECFLAGS := $(CKVECFLAGS)
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

328
configure vendored
View File

@@ -51,8 +51,6 @@ print_usage()
#echo " "
#echo " BLIS ${version}"
echo " "
echo " Field G. Van Zee"
echo " "
echo " Configure BLIS's build system for compilation using a specified"
echo " configuration directory."
echo " "
@@ -72,30 +70,37 @@ print_usage()
echo " "
echo " -p PREFIX, --prefix=PREFIX"
echo " "
echo " The path to which make will install all build products."
echo " If given, this option implies the following options:"
echo " --libdir=PREFIX/lib"
echo " --incdir=PREFIX/include"
echo " The common installation prefix for all files. If given,"
echo " this option effectively implies:"
echo " --libdir=EXECPREFIX/lib"
echo " --includedir=PREFIX/include"
echo " --sharedir=PREFIX/share"
echo " If not given, PREFIX defaults to \$(HOME)/blis. If PREFIX"
echo " where EXECPREFIX defaults to PREFIX. If this option is"
echo " not given, PREFIX defaults to '${prefix_def}'. If PREFIX"
echo " refers to a directory that does not exist, it will be"
echo " created."
echo " "
echo " --exec-prefix=EXECPREFIX"
echo " "
echo " The installation prefix for libraries. Specifically, if"
echo " given, this option effectively implies:"
echo " --libdir=EXECPREFIX/lib"
echo " If not given, EXECPREFIX defaults to PREFIX, which may be"
echo " modified by the --prefix option. If EXECPREFIX refers to"
echo " a directory that does not exist, it will be created."
echo " "
echo " --libdir=LIBDIR"
echo " "
echo " The path to which make will install libraries. If given,"
echo " LIBDIR will override the corresponding directory implied"
echo "                         by --prefix; if not given, LIBDIR defaults to"
echo " PREFIX/lib. If LIBDIR refers to a directory that does"
echo " not exist, it will be created."
echo " The path to which make will install libraries. If not"
echo " given, LIBDIR defaults to PREFIX/lib. If LIBDIR refers to"
echo " a directory that does not exist, it will be created."
echo " "
echo " --includedir=INCDIR"
echo " "
echo " The path to which make will install development header"
echo " files. If given, INCDIR will override the corresponding"
echo " directory implied by --prefix; if not given, INCDIR"
echo " defaults to PREFIX/include. If INCDIR refers to a"
echo " directory that does not exist, it will be created."
echo " files. If not given, INCDIR defaults to PREFIX/include."
echo " If INCDIR refers to a directory that does not exist, it"
echo " will be created."
echo " "
echo " --sharedir=SHAREDIR"
echo " "
@@ -104,18 +109,9 @@ print_usage()
echo " and LDFLAGS). These files allow certain BLIS makefiles,"
echo " such as those in the examples or testsuite directories, to"
echo " operate on an installed copy of BLIS rather than a local"
echo " (and possibly uninstalled) copy. If given, SHAREDIR will"
echo " override the corresponding directory implied by --prefix;"
echo " if not given, SHAREDIR defaults to PREFIX/share. If"
echo " SHAREDIR refers to a directory that does not exist, it"
echo " will be created."
echo " "
echo " -d DEBUG, --enable-debug[=DEBUG]"
echo " "
echo " Enable debugging symbols in the library. If argument"
echo " DEBUG is given as 'opt', then optimization flags are"
echo " kept in the framework, otherwise optimization is"
echo " turned off."
echo " (and possibly uninstalled) copy. If not given, SHAREDIR"
echo " defaults to PREFIX/share. If SHAREDIR refers to a"
echo " directory that does not exist, it will be created."
echo " "
echo " --enable-verbose-make, --disable-verbose-make"
echo " "
@@ -129,6 +125,13 @@ print_usage()
echo " even if the command plus command line arguments exceeds"
echo " the operating system limit (ARG_MAX)."
echo " "
echo " -d DEBUG, --enable-debug[=DEBUG]"
echo " "
echo " Enable debugging symbols in the library. If argument"
echo " DEBUG is given as 'opt', then optimization flags are"
echo " kept in the framework, otherwise optimization is"
echo " turned off."
echo " "
echo " --disable-static, --enable-static"
echo " "
echo " Disable (enabled by default) building BLIS as a static"
@@ -141,6 +144,23 @@ print_usage()
echo " library. If the shared library build is disabled, the"
echo " static library build must remain enabled."
echo " "
echo " -e SYMBOLS, --export-shared[=SYMBOLS]"
echo " "
echo " Specify the subset of library symbols that are exported"
echo " within a shared library. Valid values for SYMBOLS are:"
echo " 'public' (the default) and 'all'. By default, only"
echo " functions and variables that belong to public APIs are"
echo " exported in shared libraries. However, the user may"
echo " instead export all symbols in BLIS, even those that were"
echo " intended for internal use only. Note that the public APIs"
echo " encompass all functions that almost any user would ever"
echo " want to call, including the BLAS/CBLAS compatibility APIs"
echo " as well as the basic and expert interfaces to the typed"
echo " and object APIs that are unique to BLIS. Also note that"
echo " changing this option to 'all' will have no effect in some"
echo " environments, such as when compiling with clang on"
echo " Windows."
echo " "
echo " -t MODEL, --enable-threading[=MODEL], --disable-threading"
echo " "
echo " Enable threading in the library, using threading model"
@@ -222,6 +242,16 @@ print_usage()
echo " only be enabled when mixed domain/precision support is"
echo " enabled."
echo " "
echo " --disable-sup-handling, --enable-sup-handling"
echo " "
echo " Disable (enabled by default) handling of small/skinny"
echo " matrix problems via separate code branches. When disabled,"
echo " these small/skinny level-3 operations will be performed by"
echo " the conventional implementation, which is optimized for"
echo " medium and large problems. Note that what qualifies as"
echo " \"small\" depends on thresholds that may vary by sub-"
echo " configuration."
echo " "
echo " -s NAME --enable-sandbox=NAME"
echo " "
echo " Enable a separate sandbox implementation of gemm. This"
@@ -278,6 +308,7 @@ print_usage()
echo " Environment Variables:"
echo " "
echo " CC Specifies the C compiler to use."
echo " CXX Specifies the C++ compiler to use (sandbox only)."
echo " RANLIB Specifies the ranlib executable to use."
echo " AR Specifies the archiver to use."
echo " CFLAGS Specifies additional compiler flags to use (prepended)."
@@ -1016,7 +1047,7 @@ auto_detect()
# Set the linker flags. We need pthreads because it is needed for
# parts of bli_arch.c unrelated to bli_arch_string(), which is called
# by the main() function in ${main_c}.
if [ $is_win = no ]; then
if [[ $is_win == no || "$cc_vendor" != "clang" ]]; then
ldflags="${LIBPTHREAD--lpthread}"
fi
@@ -1294,8 +1325,7 @@ get_compiler_version()
# to OS X's egrep only returning the first match.
cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM' | { read first rest ; echo $first ; })
if [ "${cc_vendor}" = "icc" -o \
"${cc_vendor}" = "gcc" -o \
"${cc_vendor}" = "clang" ]; then
"${cc_vendor}" = "gcc" ]; then
cc_version=$(${cc} -dumpversion)
else
cc_version=$(echo "${vendor_string}" | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; })
@@ -1343,7 +1373,7 @@ check_compiler()
# Specific:
#
# skx: icc 15.0.1+, gcc 6.0+, clang 3.9+
# knl: icc 14.0.1+, gcc 5.0+, clang 3.5+
# knl: icc 14.0.1+, gcc 5.0+, clang 3.9+
# haswell: any
# sandybridge: any
# penryn: any
@@ -1418,27 +1448,42 @@ check_compiler()
# clang
if [ "x${cc_vendor}" = "xclang" ]; then
if [ ${cc_major} -lt 3 ]; then
echoerr_unsupportedcc
fi
if [ ${cc_major} -eq 3 ]; then
if [ ${cc_minor} -lt 3 ]; then
if [ "$(echo ${vendor_string} | grep -o Apple)" = "Apple" ]; then
if [ ${cc_major} -lt 5 ]; then
echoerr_unsupportedcc
fi
if [ ${cc_minor} -lt 5 ]; then
# See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
if [ ${cc_major} -eq 5 ]; then
# Apple clang 5.0 is clang 3.4svn
blacklistcc_add "excavator"
blacklistcc_add "zen"
blacklistcc_add "knl"
fi
if [ ${cc_minor} -lt 9 ]; then
if [ ${cc_major} -lt 7 ]; then
blacklistcc_add "knl"
blacklistcc_add "skx"
fi
fi
if [ ${cc_major} -lt 4 ]; then
# See comment above regarding zen support.
#blacklistcc_add "zen"
: # explicit no-op since bash can't handle empty loop bodies.
else
if [ ${cc_major} -lt 3 ]; then
echoerr_unsupportedcc
fi
if [ ${cc_major} -eq 3 ]; then
if [ ${cc_minor} -lt 3 ]; then
echoerr_unsupportedcc
fi
if [ ${cc_minor} -lt 5 ]; then
blacklistcc_add "excavator"
blacklistcc_add "zen"
fi
if [ ${cc_minor} -lt 9 ]; then
blacklistcc_add "knl"
blacklistcc_add "skx"
fi
fi
if [ ${cc_major} -lt 4 ]; then
# See comment above regarding zen support.
#blacklistcc_add "zen"
: # explicit no-op since bash can't handle empty loop bodies.
fi
fi
fi
}
@@ -1496,8 +1541,8 @@ check_assembler()
#
# The assembler on OS X won't recognize AVX-512 without help.
if [ "$(uname -s)" == "Darwin" ]; then
cflags="-Wa,-march=knl"
if [ "${cc_vendor}" == "clang" ]; then
cflags="-march=knl"
fi
asm_fp=$(find ${asm_dir} -name "avx512f.s")
@@ -1513,8 +1558,8 @@ check_assembler()
#
# The assembler on OS X won't recognize AVX-512 without help.
if [ "$(uname -s)" == "Darwin" ]; then
cflags="-Wa,-march=skylake-avx512"
if [ "${cc_vendor}" == "clang" ]; then
cflags="-march=skylake-avx512"
fi
asm_fp=$(find ${asm_dir} -name "avx512dq.s")
@@ -1731,21 +1776,33 @@ main()
# -- configure options --
# The user-given install prefix and a flag indicating it was given.
#install_prefix_def="${HOME}/blis"
install_prefix_user=${HOME}/blis # default to this directory.
# Define the default prefix so that the print_usage() function can
# output it in the --help text.
prefix_def='/usr/local'
# The installation prefix, assigned its default value, and a flag to
# track whether or not it was given by the user.
prefix=${prefix_def}
prefix_flag=''
# The user-given install libdir and a flag indicating it was given.
install_libdir_user=''
# The installation exec_prefix, assigned its default value, and a flag to
# track whether or not it was given by the user.
exec_prefix='${prefix}'
exec_prefix_flag=''
# The installation libdir, assigned its default value, and a flag to
# track whether or not it was given by the user.
libdir='${exec_prefix}/lib'
libdir_flag=''
# The user-given install includedir and a flag indicating it was given.
install_incdir_user=''
incdir_flag=''
# The installation includedir, assigned its default value, and a flag to
# track whether or not it was given by the user.
includedir='${prefix}/include'
includedir_flag=''
# The user-given install sharedir and a flag indicating it was given.
install_sharedir_user=''
# The installation sharedir, assigned its default value, and a flag to
# track whether or not it was given by the user.
sharedir='${prefix}/share'
sharedir_flag=''
# The preset value of CFLAGS and LDFLAGS (ie: compiler and linker flags
@@ -1758,7 +1815,7 @@ main()
debug_flag=''
# The threading flag.
threading_model='no'
threading_model='off'
# The method of assigning micropanels to threads in the JR and JR loops.
thread_part_jrir='slab'
@@ -1772,6 +1829,7 @@ main()
enable_arg_max_hack='no'
enable_static='yes'
enable_shared='yes'
export_shared='public'
enable_pba_pools='yes'
enable_sba_pools='yes'
enable_mem_tracing='no'
@@ -1781,6 +1839,7 @@ main()
enable_cblas='no'
enable_mixed_dt='yes'
enable_mixed_dt_extra_mem='yes'
enable_sup_handling='yes'
enable_memkind='' # The default memkind value is determined later on.
force_version='no'
@@ -1821,7 +1880,7 @@ main()
# Process our command line options.
unset OPTIND
while getopts ":hp:d:s:t:r:qci:b:-:" opt; do
while getopts ":hp:d:e:s:t:r:qci:b:-:" opt; do
case $opt in
-)
case "$OPTARG" in
@@ -1833,19 +1892,23 @@ main()
;;
prefix=*)
prefix_flag=1
install_prefix_user=${OPTARG#*=}
prefix=${OPTARG#*=}
;;
exec-prefix=*)
exec_prefix_flag=1
exec_prefix=${OPTARG#*=}
;;
libdir=*)
libdir_flag=1
install_libdir_user=${OPTARG#*=}
libdir=${OPTARG#*=}
;;
includedir=*)
incdir_flag=1
install_incdir_user=${OPTARG#*=}
includedir_flag=1
includedir=${OPTARG#*=}
;;
sharedir=*)
sharedir_flag=1
install_sharedir_user=${OPTARG#*=}
sharedir=${OPTARG#*=}
;;
enable-debug)
debug_flag=1
@@ -1882,15 +1945,18 @@ main()
disable-shared)
enable_shared='no'
;;
export-shared=*)
export_shared=${OPTARG#*=}
;;
enable-threading=*)
threading_model=${OPTARG#*=}
;;
disable-threading)
threading_model='off'
;;
thread-part-jrir=*)
thread_part_jrir=${OPTARG#*=}
;;
disable-threading)
threading_model='no'
;;
enable-pba-pools)
enable_pba_pools='yes'
;;
@@ -1946,6 +2012,12 @@ main()
disable-mixed-dt-extra-mem)
enable_mixed_dt_extra_mem='no'
;;
enable-sup-handling)
enable_sup_handling='yes'
;;
disable-sup-handling)
enable_sup_handling='no'
;;
with-memkind)
enable_memkind='yes'
;;
@@ -1967,12 +2039,15 @@ main()
;;
p)
prefix_flag=1
install_prefix_user=$OPTARG
prefix=$OPTARG
;;
d)
debug_flag=1
debug_type=$OPTARG
;;
e)
export_shared=$OPTARG
;;
s)
sandbox_flag=1
sandbox=$OPTARG
@@ -2459,54 +2534,49 @@ main()
# -- Prepare variables for substitution into template files -----------------
# Parse the status of the install prefix and echo feedback.
# Parse the status of the prefix option and echo feedback.
if [ -n "${prefix_flag}" ]; then
echo "${script_name}: detected --prefix='${install_prefix_user}'."
echo "${script_name}: detected --prefix='${prefix}'."
else
echo "${script_name}: no install prefix option given; defaulting to '${install_prefix_user}'."
echo "${script_name}: no install prefix option given; defaulting to '${prefix}'."
fi
# Set initial (candidate) values for the libdir and includedir using the
# install prefix that was determined above.
install_libdir=${install_prefix_user}/lib
install_incdir=${install_prefix_user}/include
install_sharedir=${install_prefix_user}/share
# Parse the status of the exec_prefix option and echo feedback.
if [ -n "${exec_prefix_flag}" ]; then
echo "${script_name}: detected --exec-prefix='${exec_prefix}'."
else
echo "${script_name}: no install exec_prefix option given; defaulting to PREFIX."
fi
# Set the install libdir, if it was specified. Note that this will override
# the default libdir implied by the install prefix, even if both options
# were given.
# Parse the status of the libdir option and echo feedback.
if [ -n "${libdir_flag}" ]; then
echo "${script_name}: detected --libdir='${install_libdir_user}'."
install_libdir=${install_libdir_user}
echo "${script_name}: detected --libdir='${libdir}'."
else
echo "${script_name}: no install libdir option given; defaulting to PREFIX/lib."
echo "${script_name}: no install libdir option given; defaulting to EXECPREFIX/lib."
fi
# Set the install includedir, if it was specified. Note that this will
# override the default includedir implied by the install prefix, even if
# both options were given.
if [ -n "${incdir_flag}" ]; then
echo "${script_name}: detected --includedir='${install_incdir_user}'."
install_incdir=${install_incdir_user}
# Parse the status of the includedir option and echo feedback.
if [ -n "${includedir_flag}" ]; then
echo "${script_name}: detected --includedir='${includedir}'."
else
echo "${script_name}: no install includedir option given; defaulting to PREFIX/include."
fi
# Set the install sharedir, if it was specified. Note that this will
# override the default sharedir implied by the install prefix, even if
# both options were given.
# Parse the status of the sharedir option and echo feedback.
if [ -n "${sharedir_flag}" ]; then
echo "${script_name}: detected --sharedir='${install_sharedir_user}'."
install_sharedir=${install_sharedir_user}
echo "${script_name}: detected --sharedir='${sharedir}'."
else
echo "${script_name}: no install sharedir option given; defaulting to PREFIX/share."
fi
# Echo the installation directories that we settled on.
echo "${script_name}: final installation directories:"
echo "${script_name}: libdir: ${install_libdir}"
echo "${script_name}: includedir: ${install_incdir}"
echo "${script_name}: sharedir: ${install_sharedir}"
echo "${script_name}: prefix: "${prefix}
echo "${script_name}: exec_prefix: "${exec_prefix}
echo "${script_name}: libdir: "${libdir}
echo "${script_name}: includedir: "${includedir}
echo "${script_name}: sharedir: "${sharedir}
echo "${script_name}: NOTE: the variables above can be overridden when running make."
# Check if CFLAGS is non-empty.
if [ -n "${CFLAGS}" ]; then
@@ -2573,6 +2643,23 @@ main()
exit 1
fi
# Check if the "export shared" flag was specified.
if [ "x${export_shared}" = "xall" ]; then
if [ "x${enable_shared}" = "xyes" ]; then
echo "${script_name}: exporting all symbols within shared library."
else
echo "${script_name}: ignoring request to export all symbols within shared library."
fi
elif [ "x${export_shared}" = "xpublic" ]; then
if [ "x${enable_shared}" = "xyes" ]; then
echo "${script_name}: exporting only public symbols within shared library."
fi
else
echo "${script_name}: *** Invalid argument '${export_shared}' to --export-shared option given."
echo "${script_name}: *** Please use 'public' or 'all'."
exit 1
fi
# Check the threading model flag and standardize its value, if needed.
# NOTE: 'omp' is deprecated but still supported; 'openmp' is preferred.
enable_openmp='no'
@@ -2594,9 +2681,11 @@ main()
enable_pthreads='yes'
enable_pthreads_01=1
threading_model="pthreads" # Standardize the value.
elif [ "x${threading_model}" = "xno" ] ||
elif [ "x${threading_model}" = "xoff" ] ||
[ "x${threading_model}" = "xno" ] ||
[ "x${threading_model}" = "xnone" ]; then
echo "${script_name}: threading is disabled."
threading_model="off"
else
echo "${script_name}: *** Unsupported threading model: ${threading_model}."
exit 1
@@ -2707,6 +2796,13 @@ main()
enable_mixed_dt_extra_mem_01=0
enable_mixed_dt_01=0
fi
if [ "x${enable_sup_handling}" = "xyes" ]; then
echo "${script_name}: small matrix handling is enabled."
enable_sup_handling_01=1
else
echo "${script_name}: small matrix handling is disabled."
enable_sup_handling_01=0
fi
# Report integer sizes.
if [ "x${int_type_size}" = "x32" ]; then
@@ -2758,13 +2854,15 @@ main()
# Variables that may contain forward slashes, such as paths, need extra
# escaping when used in sed commands. We insert those extra escape
# characters here so that the sed commands below do the right thing.
os_name_esc=$(echo "${os_name}" | sed 's/\//\\\//g')
install_libdir_esc=$(echo "${install_libdir}" | sed 's/\//\\\//g')
install_incdir_esc=$(echo "${install_incdir}" | sed 's/\//\\\//g')
install_sharedir_esc=$(echo "${install_sharedir}" | sed 's/\//\\\//g')
dist_path_esc=$(echo "${dist_path}" | sed 's/\//\\\//g')
cc_esc=$(echo "${found_cc}" | sed 's/\//\\\//g')
cxx_esc=$(echo "${found_cxx}" | sed 's/\//\\\//g')
os_name_esc=$(echo "${os_name}" | sed 's/\//\\\//g')
prefix_esc=$(echo "${prefix}" | sed 's/\//\\\//g')
exec_prefix_esc=$(echo "${exec_prefix}" | sed 's/\//\\\//g')
libdir_esc=$(echo "${libdir}" | sed 's/\//\\\//g')
includedir_esc=$(echo "${includedir}" | sed 's/\//\\\//g')
sharedir_esc=$(echo "${sharedir}" | sed 's/\//\\\//g')
dist_path_esc=$(echo "${dist_path}" | sed 's/\//\\\//g')
cc_esc=$(echo "${found_cc}" | sed 's/\//\\\//g')
cxx_esc=$(echo "${found_cxx}" | sed 's/\//\\\//g')
#sandbox_relpath_esc=$(echo "${sandbox_relpath}" | sed 's/\//\\\//g')
# For RANLIB, if the variable is not set, we use a default value of
@@ -2779,7 +2877,7 @@ main()
# For Windows builds, clear the libpthread_esc variable so that
# no pthreads library is substituted into config.mk. (Windows builds
# employ an implementation of pthreads that is internal to BLIS.)
if [ $is_win = yes ]; then
if [[ $is_win == yes && "$cc_vendor" == "clang" ]]; then
libpthread_esc=
fi
@@ -2821,13 +2919,13 @@ main()
# -- Determine whether we are performing an out-of-tree build --------------
if [ ${dist_path} != "./" ]; then
if [ "${dist_path}" != "./" ]; then
# At this point, we know the user did not run "./configure". But we
have not yet ruled out "<fullpath>/configure" or some equivalent
# that uses relative paths. To further rule out these possibilities,
# we create a dummy file in the current build directory.
touch ./${dummy_file}
touch "./${dummy_file}"
# If the dummy file we just created in the current directory does not
# appear in the source distribution path, then we are in a different
@@ -2871,14 +2969,17 @@ main()
| sed -e "s/@ldflags_preset@/${ldflags_preset_esc}/g" \
| sed -e "s/@debug_type@/${debug_type}/g" \
| sed -e "s/@threading_model@/${threading_model}/g" \
| sed -e "s/@install_libdir@/${install_libdir_esc}/g" \
| sed -e "s/@install_incdir@/${install_incdir_esc}/g" \
| sed -e "s/@install_sharedir@/${install_sharedir_esc}/g" \
| sed -e "s/@prefix@/${prefix_esc}/g" \
| sed -e "s/@exec_prefix@/${exec_prefix_esc}/g" \
| sed -e "s/@libdir@/${libdir_esc}/g" \
| sed -e "s/@includedir@/${includedir_esc}/g" \
| sed -e "s/@sharedir@/${sharedir_esc}/g" \
| sed -e "s/@enable_verbose@/${enable_verbose}/g" \
| sed -e "s/@configured_oot@/${configured_oot}/g" \
| sed -e "s/@enable_arg_max_hack@/${enable_arg_max_hack}/g" \
| sed -e "s/@enable_static@/${enable_static}/g" \
| sed -e "s/@enable_shared@/${enable_shared}/g" \
| sed -e "s/@export_shared@/${export_shared}/g" \
| sed -e "s/@enable_blas@/${enable_blas}/g" \
| sed -e "s/@enable_cblas@/${enable_cblas}/g" \
| sed -e "s/@enable_memkind@/${enable_memkind}/g" \
@@ -2910,6 +3011,7 @@ main()
| sed -e "s/@enable_cblas@/${enable_cblas_01}/g" \
| sed -e "s/@enable_mixed_dt@/${enable_mixed_dt_01}/g" \
| sed -e "s/@enable_mixed_dt_extra_mem@/${enable_mixed_dt_extra_mem_01}/g" \
| sed -e "s/@enable_sup_handling@/${enable_sup_handling_01}/g" \
| sed -e "s/@enable_memkind@/${enable_memkind_01}/g" \
| sed -e "s/@enable_pragma_omp_simd@/${enable_pragma_omp_simd_01}/g" \
| sed -e "s/@enable_sandbox@/${enable_sandbox_01}/g" \

View File

@@ -9,6 +9,9 @@
* **[Step 3b: Testing (optional)](BuildSystem.md#step-3b-testing-optional)**
* **[Step 4: Installation](BuildSystem.md#step-4-installation)**
* **[Cleaning out build products](BuildSystem.md#cleaning-out-build-products)**
* **[Compiling with BLIS](BuildSystem.md#compiling-with-blis)**
* [Disabling BLAS prototypes](BuildSystem.md#disabling-blas-prototypes)
* [CBLAS](BuildSystem.md#cblas)
* **[Linking against BLIS](BuildSystem.md#linking-against-blis)**
* **[Uninstalling](BuildSystem.md#uninstalling)**
* **[make targets](BuildSystem.md#make-targets)**
@@ -83,11 +86,11 @@ Alternatively, `configure` can automatically select a configuration based on you
```
$ ./configure auto
```
However, as of this writing, only a limited number of architectures are detected. If the `configure` script is not able to detect your architecture, the `generic` configuration will be used.
However, as of this writing, only a limited number of architectures are detected. If the `configure` script is not able to detect your architecture, the `generic` configuration will be used.
Upon running configure, you will get output similar to the following. The exact output will depend on whether you cloned BLIS from a `git` repository or whether you obtained BLIS via a downloadable tarball from the [releases](https://github.com/flame/blis/releases) page.
```
$ ./configure haswell
$ ./configure --prefix=$HOME/blis haswell
configure: using 'gcc' compiler.
configure: found gcc version 5.4.0 (maj: 5, min: 4, rev: 0).
configure: checking for blacklisted configurations due to gcc 5.4.0.
@@ -166,17 +169,11 @@ The installation prefix can be specified via the `--prefix=PREFIX` option:
```
$ ./configure --prefix=/usr <configname>
```
This will cause libraries to eventually be installed (via `make install`) to `PREFIX/lib` and development headers to be installed to `PREFIX/include`. (The default value of `PREFIX` is `$(HOME)/blis`.) You can also specify the library install directory separately from the development header install directory with the `--libdir=LIBDIR` and `--includedir=INCDIR` options, respectively:
This will cause libraries to eventually be installed (via `make install`) to `PREFIX/lib` and development headers to be installed to `PREFIX/include`. (The default value of `PREFIX` is `/usr/local`.) You can also specify the library install directory separately from the development header install directory with the `--libdir=LIBDIR` and `--includedir=INCDIR` options, respectively:
```
$ ./configure --libdir=/usr/lib --includedir=/usr/include <configname>
```
The `--libdir=LIBDIR` and `--includedir=INCDIR` options will override any `PREFIX` path, whether it was specified explicitly via `--prefix` or implicitly (via the default). That is, `LIBDIR` defaults to `PREFIX/lib` and `INCDIR` defaults to `PREFIX/include`, but each will be overridden by their respective `--libdir`/`--includedir` options. So,
```
$ ./configure --libdir=/usr/lib <configname>
```
will configure BLIS to install libraries to `/usr/lib` and header files to the default location (`$HOME/blis/include`).
Also, note that `configure` will create any installation directories that do not already exist.
The `--libdir=LIBDIR` and `--includedir=INCDIR` options will override any path implied by `PREFIX`, whether it was specified explicitly via `--prefix` or implicitly (via the default). That is, `LIBDIR` defaults to `EXECPREFIX/lib` (where `EXECPREFIX`, set via `--exec-prefix=EXECPREFIX`, defaults to `PREFIX`) and `INCDIR` defaults to `PREFIX/include`, but `LIBDIR` and `INCDIR` will each be overridden by their respective `--libdir`/`--includedir` options. There is a third related option, `--sharedir=SHAREDIR`, where `SHAREDIR` defaults to `PREFIX/share`. This option specifies the installation directory for certain makefile fragments that contain variables determined by `configure` (e.g. `CC`, `CFLAGS`, `LDFLAGS`, etc.). These files allow certain BLIS makefiles, such as those in the `examples` or `testsuite` directories, to operate on an installed copy of BLIS rather than a local (and possibly uninstalled) copy.
For a complete list of supported `configure` options and arguments, run `configure` with the `-h` option:
```
@@ -338,6 +335,47 @@ Removing include.
Running the `distclean` target is like saying, "Remove anything ever created by the build system."
## Compiling with BLIS
All BLIS definitions and prototypes may be included in your C source file by including a single header file, `blis.h`:
```c
#include "stdio.h"
#include "stdlib.h"
#include "otherstuff.h"
#include "blis.h"
```
If the BLAS compatibility layer was enabled at configure-time (as it is by default), then `blis.h` will also provide BLAS prototypes to your source code.
### Disabling BLAS prototypes
Some applications already `#include` a header that contains BLAS prototypes. This can cause problems if those applications also try to `#include` the BLIS header file, as shown above. Suppose for a moment that `otherstuff.h` in the example above already provides BLAS prototypes.
```
$ gcc -I/path/to/blis -I/path/to/otherstuff -c main.c -o main.o
In file included from main.c:41:0:
/path/to/blis/blis.h:36900:111: error: conflicting declaration of C function int xerbla_(const bla_character*, const bla_integer*, ftnlen)
TEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);
```
If your application is already declaring (prototyping) BLAS functions, then you may disable those prototypes from being defined within `blis.h`. This prevents `blis.h` from re-declaring those prototypes, or, allows your other header to declare those functions for the first time, depending on the order that you `#include` the headers.
```c
#include "stdio.h"
#include "stdlib.h"
#include "otherstuff.h"
#define BLIS_DISABLE_BLAS_DEFS // disable BLAS prototypes within BLIS.
#include "blis.h"
```
By `#defining` the `BLIS_DISABLE_BLAS_DEFS` macro, we signal to `blis.h` that it should skip over the BLAS prototypes, but otherwise `#include` everything else as it normally would. Note that `BLIS_DISABLE_BLAS_DEFS` must be `#defined` *prior* to the `#include "blis.h"` directive in order for it to have any effect.
### CBLAS
If you build BLIS with CBLAS enabled and you wish to access CBLAS function prototypes from within your application, you will have to `#include` the `cblas.h` header separately from `blis.h`.
```
#include "blis.h"
#include "cblas.h"
```
## Linking against BLIS
Once you have instantiated (configured and compiled, and perhaps installed) a BLIS library, you can link to it in your application's makefile as you would any other library. The following is an abbreviated makefile for a small hypothetical application that has just two external dependencies: BLIS and the standard C math library. We also link against libpthread since that library has been a runtime dependency of BLIS since 70640a3 (December 2017).
@@ -357,7 +395,7 @@ OBJS = main.o util.o other.o
%.o: %.c
$(CC) $(CFLAGS) -c $< -o $@
all: $(OBJS)
all: $(OBJS)
$(LINKER) $(OBJS) $(BLIS_LIB) $(OTHER_LIBS) -o my_program.x
```
The above example assumes you will want to include BLIS definitions and function prototypes into your application via `#include "blis.h"`. (If you are only using BLIS via the BLAS compatibility layer, including `blis.h` is not necessary.) Since BLIS headers are installed into a `blis` subdirectory of `PREFIX/include`, you must make sure that the compiler knows where to find the `blis.h` header file. This is typically accomplished by inserting `#include "blis.h"` into your application's source code files and compiling the code with `-I PREFIX/include/blis`.

View File

@@ -12,8 +12,8 @@ The following table lists architectures for which there exist optimized level-3
A few remarks / reminders:
* Optimizing only the [gemm microkernel](KernelsHowTo.md#gemm-microkernel) will result in optimal performance for all [level-3 operations](BLISTypedAPI#level-3-operations) except `trsm` (which will typically achieve 60 - 80% of attainable peak performance).
* The [trsm](BLISTypedAPI#trsm) operation needs the [gemmtrsm microkernel(s)](KernelsHowTo.md#gemmtrsm-microkernels), in addition to the aforementioned [gemm microkernel](KernelsHowTo.md#gemm-microkernel), in order to reach optimal performance.
* Induced complex (1m) implementations are employed in all situations where the real domain [gemm microkernel](KernelsHowTo.md#gemm-microkernel) of the corresponding precision is available. Please see our [ACM TOMS article on the 1m method](https://github.com/flame/blis#citations) for more info on this topic.
* Some microarchitectures use the same sub-configuration. This is not a typo. For example, Haswell and Broadwell systems as well as "desktop" (non-server) versions of Skylake, Kabylake, and Coffeelake all use the `haswell` sub-configuration and the kernels registered therein.
* Induced complex (1m) implementations are employed in all situations where the real domain [gemm microkernel](KernelsHowTo.md#gemm-microkernel) of the corresponding precision is available, but the "native" complex domain gemm microkernel is unavailable. Note that the table below lists native kernels, so if a microarchitecture lists only `sd`, support for both `c` and `z` datatypes will be provided via the 1m method. (Note: most people cannot tell the difference between native and 1m-based performance.) Please see our [ACM TOMS article on the 1m method](https://github.com/flame/blis#citations) for more info on this topic.
* Some microarchitectures use the same sub-configuration. *This is not a typo.* For example, Haswell and Broadwell systems as well as "desktop" (non-server) versions of Skylake, Kaby Lake, and Coffee Lake all use the `haswell` sub-configuration and the kernels registered therein. Microkernels can be recycled in this manner because the key detail that determines level-3 performance outcomes is actually the vector ISA, not the microarchitecture. In the previous example, all of the microarchitectures listed support AVX2 (but not AVX-512), and therefore they can reuse the same microkernels.
* Remember that you (usually) don't have to choose your sub-configuration manually! Instead, you can always request configure-time hardware detection via `./configure auto`. This will defer to internal logic (based on CPUID for x86_64 systems) that will attempt to choose the appropriate sub-configuration automatically.
| Vendor/Microarchitecture | BLIS sub-configuration | `gemm` | `gemmtrsm` |
@@ -26,7 +26,7 @@ A few remarks / reminders:
| Intel Core2 (SSE3) | `penryn` | `sd` | `d` |
| Intel Sandy/Ivy Bridge (AVX/FMA3) | `sandybridge` | `sdcz` | |
| Intel Haswell, Broadwell (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
| Intel Sky/Kaby/Coffeelake (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
| Intel Sky/Kaby/CoffeeLake (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
| Intel Knights Landing (AVX-512/FMA3) | `knl` | `sd` | |
| Intel SkylakeX (AVX-512/FMA3) | `skx` | `sd` | |
| ARMv7 Cortex-A9 (NEON) | `cortex-a9` | `sd` | |

View File

@@ -23,11 +23,17 @@
# Introduction
Our paper [Anatomy of High-Performance Many-Threaded Matrix Multiplication](https://github.com/flame/blis#citations), presented at IPDPS'14, identified 5 loops around the microkernel as opportunities for parallelization within level-3 operations such as `gemm`. Within BLIS, we have enabled parallelism for 4 of those loops and have extended it to the rest of the level-3 operations except for `trsm`.
Our paper [Anatomy of High-Performance Many-Threaded Matrix Multiplication](https://github.com/flame/blis#citations), presented at IPDPS'14, identified five loops around the microkernel as opportunities for parallelization within level-3 operations such as `gemm`. Within BLIS, we have enabled parallelism for four of those loops, with the fifth planned for future work. This software architecture extends naturally to all level-3 operations except for `trsm`, where its application is necessarily limited to three of the five loops due to inter-iteration dependencies.
**IMPORTANT**: Multithreading in BLIS is disabled by default. Furthermore, even when multithreading is enabled, BLIS will default to single-threaded execution at runtime. In order to both *allow* and *invoke* parallelism from within BLIS operations, you must both *enable* multithreading at configure-time and *specify* multithreading at runtime.
To summarize: In order to observe multithreaded parallelism within a BLIS operation, you must do *both* of the following:
1. Enable multithreading at configure-time. This is discussed in the [next section](docs/Multithreading.md#enabling-multithreading).
2. Specify multithreading at runtime. This is also discussed [later on](docs/Multithreading.md#specifying-multithreading).
# Enabling multithreading
Note that BLIS disables multithreading by default. In order to extract multithreaded parallelism from BLIS, you must first enable multithreading explicitly at configure-time.
BLIS disables multithreading by default. In order to allow multithreaded parallelism from BLIS, you must first enable multithreading explicitly at configure-time.
As of this writing, BLIS optionally supports multithreading via either OpenMP or POSIX threads.
@@ -101,7 +107,7 @@ This pattern--automatic or manual--holds regardless of which of the three method
Regardless of which method is employed, and which specific way within each method, after setting the number of threads, the application may call the desired level-3 operation (via either the [typed API](docs/BLISTypedAPI.md) or the [object API](docs/BLISObjectAPI.md)) and the operation will execute in a multithreaded manner. (When calling BLIS via the BLAS API, only the first two (global) methods are available.)
NOTE: Please be aware of what happens if you try to specify both the automatic and manual ways, as it could otherwise confuse new users. Regardless of which broad method is used, **if multithreading is specified via both the automatic and manual ways, the manual way will always take precedence.** Also, specifying parallelism for even *one* loop counts as specifying the manual way (in which case the ways of parallelism for the remaining loops will be assumed to be 1).
**Note**: Please be aware of what happens if you try to specify both the automatic and manual ways, as it could otherwise confuse new users. Regardless of which broad method is used, **if multithreading is specified via both the automatic and manual ways, the manual way will always take precedence.** Also, specifying parallelism for even *one* loop counts as specifying the manual way (in which case the ways of parallelism for the remaining loops will be assumed to be 1).
## Globally via environment variables
@@ -109,6 +115,8 @@ The most common method of specifying multithreading in BLIS is globally via envi
Regardless of whether you end up using the automatic or manual way of expressing a request for multithreading, note that the environment variables are read (via `getenv()`) by BLIS **only once**, when the library is initialized. Subsequent to library initialization, the global settings for parallelization may only be changed via the [global runtime API](Multithreading.md#globally-at-runtime). If this constraint is not a problem, then environment variables may work fine for you. Otherwise, please consider [local settings](Multithreading.md#locally-at-runtime). (Local settings may used at any time, regardless of whether global settings were explicitly specified, and local settings always override global settings.)
**Note**: Regardless of which way ([automatic](Multithreading.md#environment-variables-the-automatic-way) or [manual](Multithreading.md#environment-variables-the-manual-way)) environment variables are used to specify multithreading, that specification will affect operation of BLIS through **both** the BLAS compatibility layer as well as the native [typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md) APIs that are unique to BLIS.
### Environment variables: the automatic way
The automatic way of specifying parallelism entails simply setting the total number of threads you wish BLIS to employ in its parallelization. This total number of threads is captured by the `BLIS_NUM_THREADS` environment variable. You can set this variable prior to executing your BLIS-linked executable:
@@ -119,7 +127,7 @@ $ ./my_blis_program
```
This causes BLIS to automatically determine a reasonable threading strategy based on what is known about the operation and problem size. If `BLIS_NUM_THREADS` is not set, BLIS will attempt to query the value of `OMP_NUM_THREADS`. If neither variable is set, the default number of threads is 1.
**Note:** We *highly* discourage use of the `OMP_NUM_THREADS` environment variable and may remove support for it in the future. If you wish to set parallelism globally via environment variables, please use `BLIS_NUM_THREADS`.
**Note**: We *highly* discourage use of the `OMP_NUM_THREADS` environment variable and may remove support for it in the future. If you wish to set parallelism globally via environment variables, please use `BLIS_NUM_THREADS`.
### Environment variables: the manual way
@@ -127,7 +135,7 @@ The manual way of specifying parallelism involves communicating which loops with
The below chart describes the five loops used in BLIS's matrix multiplication operations.
| Loop around microkernel | Environment variable | Direction | Notes |
| Loop around microkernel | Environment variable | Direction | Notes |
|:-------------------------|:---------------------|:----------|:------------|
| 5th loop | `BLIS_JC_NT` | `n` | |
| 4th loop | _N/A_ | `k` | Not enabled |
@@ -154,6 +162,8 @@ Next, which combinations of loops to parallelize depends on which caches are sha
If you still wish to set the parallelization scheme globally, but you want to do so at runtime, BLIS provides a thread-safe API for specifying multithreading. Think of these functions as a way to modify the same internal data structure into which the environment variables are read. (Recall that the environment variables are only read once, when BLIS is initialized).
**Note**: Regardless of which way ([automatic](Multithreading.md#globally-at-runtime-the-automatic-way) or [manual](Multithreading.md#globally-at-runtime-the-manual-way)) the global runtime API is used to specify multithreading, that specification will affect operation of BLIS through **both** the BLAS compatibility layer as well as the native [typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md) APIs that are unique to BLIS.
### Globally at runtime: the automatic way
If you simply want to specify an overall number of threads and let BLIS choose a thread factorization automatically, use the following function:
@@ -193,6 +203,8 @@ In addition to the global methods based on environment variables and runtime fun
As with environment variables and the global runtime API, there are two ways to specify parallelism: the automatic way and the manual way. Both ways involve allocating a BLIS-specific object, initializing the object and encoding the desired parallelization, and then passing a pointer to the object into one of the expert interfaces of either the [typed](docs/BLISTypedAPI.md) or [object](docs/BLISObjectAPI) APIs. We provide examples of utilizing this threading object below.
**Note**: Neither way ([automatic](Multithreading.md#locally-at-runtime-the-automatic-way) nor [manual](Multithreading.md#locally-at-runtime-the-manual-way)) of specifying multithreading via the local runtime API can be used via the BLAS interfaces. The local runtime API may *only* be used via the native [typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md) APIs, which are unique to BLIS. (Furthermore, the expert interfaces of each API must be used. This is demonstrated later on in this section.)
### Initializing a rntm_t
Before specifying the parallelism (automatically or manually), you must first allocate a special BLIS object called a `rntm_t` (runtime). The object is quite small (about 64 bytes), and so we recommend allocating it statically on the function stack:

394
docs/Performance.md Normal file
View File

@@ -0,0 +1,394 @@
# Contents
* **[Contents](Performance.md#contents)**
* **[Introduction](Performance.md#introduction)**
* **[General information](Performance.md#general-information)**
* **[Level-3 performance](Performance.md#level-3-performance)**
* **[ThunderX2](Performance.md#thunderx2)**
* **[Experiment details](Performance.md#thunderx2-experiment-details)**
* **[Results](Performance.md#thunderx2-results)**
* **[SkylakeX](Performance.md#skylakex)**
* **[Experiment details](Performance.md#skylakex-experiment-details)**
* **[Results](Performance.md#skylakex-results)**
* **[Haswell](Performance.md#haswell)**
* **[Experiment details](Performance.md#haswell-experiment-details)**
* **[Results](Performance.md#haswell-results)**
* **[Epyc](Performance.md#epyc)**
* **[Experiment details](Performance.md#epyc-experiment-details)**
* **[Results](Performance.md#epyc-results)**
* **[Feedback](Performance.md#feedback)**
# Introduction
This document showcases performance results for a representative sample of
level-3 operations on large matrices with BLIS and BLAS for several hardware
architectures.
# General information
Generally speaking, for level-3 operations on large matrices, we publish three
"panels" for each type of hardware,
each of which reports one of: single-threaded performance, multithreaded
performance on a single socket, or multithreaded performance on two sockets.
Each panel will consist of a 4x5 grid of graphs, with each row representing
a different datatype (single real, double real, single complex, and double
complex) and each column representing a different operation (`gemm`,
`hemm`/`symm`, `herk`/`syrk`, `trmm`, and `trsm`).
Each of the 20 graphs within a panel will contain an x-axis that reports
problem size, with all matrix dimensions equal to the problem size (e.g.
_m_ = _n_ = _k_), resulting in square matrices.
The y-axis will report in units GFLOPS (billions of floating-point operations
per second) in the case of single-threaded performance, or GFLOPS/core in the
case of single- or dual-socket multithreaded performance, where GFLOPS/core
is simply the total GFLOPS observed divided by the number of threads utilized.
This normalization is done intentionally in order to facilitate a visual
assessment of the drop in efficiency of multithreaded performance relative
to their single-threaded baselines.
It's also worth pointing out that the top of each graph (e.g. the maximum
y-axis value depicted) _always_ corresponds to the theoretical peak performance
under the conditions associated with that graph.
Theoretical peak performance, in units of GFLOPS/core, is calculated as the
product of:
1. the maximum sustainable clock rate in GHz; and
2. the maximum number of floating-point operations (flops) that can be
executed per cycle (per core).
Note that the maximum sustainable clock rate may change depending on the
conditions.
For example, on some systems the maximum clock rate is higher when only one
core is active (e.g. single-threaded performance) versus when all cores are
active (e.g. multithreaded performance).
The maximum number of flops executable per cycle (per core) is generally
computed as the product of:
1. the maximum number of fused multiply-add (FMA) vector instructions that
can be issued per cycle (per core);
2. the maximum number of elements that can be stored within a single vector
register (for the datatype in question); and
3. 2.0, since an FMA instruction fuses two operations (a multiply and an add).
The problem size range, represented on the x-axis, is usually sampled with 50
equally-spaced problem sizes.
For example, for single-threaded execution, we might choose to execute with
problem sizes of 48 to 2400 in increments of 48, or 56 to 2800 in increments
of 56.
These values are almost never chosen for any particular (read: sneaky) reason;
rather, we start with a "good" maximum problem size, such as 2400 or 2800, and
then divide it by 50 to obtain the appropriate starting point and increment.
Finally, each point along each curve represents the best of three trials.
# Interpretation
In general, the curves associated with higher-performing implementations
will appear higher in the graphs than lower-performing implementations.
Ideally, an implementation will climb in performance (as a function of problem
size) as quickly as possible and asymptotically approach some high fraction of
peak performance.
Occasionally, we may publish graphs with incomplete curves--for example,
only the first 25 data points in a typical 50-point series--usually because
the implementation being tested was slow enough that it was not practical to
allow it to finish.
Where along the x-axis you focus your attention will depend on the segment of
the problem size range that you care about most. Some people's applications
depend heavily on smaller problems, where "small" can mean anything from 10
to 1000 or even higher. Some people consider 1000 to be quite large, while
others insist that 5000 is merely "medium." What each of us considers to be
small, medium, or large (naturally) depends heavily on the kinds of dense
linear algebra problems we tend to encounter. No one is "right" or "wrong"
about their characterization of matrix smallness or bigness since each person's
relative frame of reference can vary greatly. That said, the
[Science of High-Performance Computing](http://shpc.ices.utexas.edu/) group at
[The University of Texas at Austin](https://www.utexas.edu/) tends to target
matrices that it classifies as "medium-to-large", and so most of the graphs
presented in this document will reflect that targeting in their x-axis range.
When corresponding with us, via email or when opening an
[issue](https://github.com/flame/blis/issues) on github, we kindly ask that
you specify as closely as possible (though a range is fine) your problem
size of interest so that we can better assist you.
# Level-3 performance
## ThunderX2
### ThunderX2 experiment details
* Location: Unknown
* Processor model: Marvell ThunderX2 CN9975
* Core topology: two sockets, 28 cores per socket, 56 cores total
* SMT status: disabled at boot-time
* Max clock rate: 2.2GHz (single-core and multicore)
* Max vector register length: 128 bits (NEON)
* Max FMA vector IPC: 2
* Peak performance:
* single-core: 17.6 GFLOPS (double-precision), 35.2 GFLOPS (single-precision)
* multicore: 17.6 GFLOPS/core (double-precision), 35.2 GFLOPS/core (single-precision)
* Operating system: Ubuntu 16.04 (Linux kernel 4.15.0)
* Compiler: gcc 7.3.0
* Results gathered: 14 February 2019
* Implementations tested:
* BLIS 075143df (0.5.1-39)
* configured with `./configure -t openmp thunderx2` (single- and multithreaded)
* sub-configuration exercised: `thunderx2`
* Single-threaded (1 core) execution requested via no change in environment variables
* Multithreaded (28 core) execution requested via `export BLIS_JC_NT=4 BLIS_IC_NT=7`
* Multithreaded (56 core) execution requested via `export BLIS_JC_NT=8 BLIS_IC_NT=7`
* OpenBLAS 52d3f7a
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=56` (multithreaded, 56 cores)
* Single-threaded (1 core) execution requested via `export OPENBLAS_NUM_THREADS=1`
* Multithreaded (28 core) execution requested via `export OPENBLAS_NUM_THREADS=28`
* Multithreaded (56 core) execution requested via `export OPENBLAS_NUM_THREADS=56`
* ARMPL 18.4
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
* Multithreaded (28 core) execution requested via `export OMP_NUM_THREADS=28`
* Multithreaded (56 core) execution requested via `export OMP_NUM_THREADS=56`
* Affinity:
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0 1 2 3 ... 55"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
* Frequency throttling (via `cpupower`):
* No changes made.
* Comments:
* ARMPL performance is remarkably uneven across datatypes and operations, though it would appear their "base" consists of OpenBLAS, which they then optimize for select, targeted routines. Unfortunately, we were unable to test the absolute latest versions of OpenBLAS and ARMPL on this hardware before we lost access. We will rerun these experiments once we gain access to a similar system.
### ThunderX2 results
#### pdf
* [ThunderX2 single-threaded](graphs/large/l3_perf_tx2_nt1.pdf)
* [ThunderX2 multithreaded (28 cores)](graphs/large/l3_perf_tx2_jc4ic7_nt28.pdf)
* [ThunderX2 multithreaded (56 cores)](graphs/large/l3_perf_tx2_jc8ic7_nt56.pdf)
#### png (inline)
* **ThunderX2 single-threaded**
![single-threaded](graphs/large/l3_perf_tx2_nt1.png)
* **ThunderX2 multithreaded (28 cores)**
![multithreaded (28 cores)](graphs/large/l3_perf_tx2_jc4ic7_nt28.png)
* **ThunderX2 multithreaded (56 cores)**
![multithreaded (56 cores)](graphs/large/l3_perf_tx2_jc8ic7_nt56.png)
---
## SkylakeX
### SkylakeX experiment details
* Location: Oracle cloud
* Processor model: Intel Xeon Platinum 8167M (SkylakeX/AVX-512)
* Core topology: two sockets, 26 cores per socket, 52 cores total
* SMT status: enabled, but not utilized
* Max clock rate: 2.0GHz (single-core and multicore)
* Max vector register length: 512 bits (AVX-512)
* Max FMA vector IPC: 2
* Peak performance:
* single-core: 64 GFLOPS (double-precision), 128 GFLOPS (single-precision)
* multicore: 64 GFLOPS/core (double-precision), 128 GFLOPS/core (single-precision)
* Operating system: Ubuntu 18.04 (Linux kernel 4.15.0)
* Compiler: gcc 7.3.0
* Results gathered: 6 March 2019, 27 March 2019
* Implementations tested:
* BLIS 9f1dbe5 (0.5.1-54)
* configured with `./configure -t openmp auto` (single- and multithreaded)
* sub-configuration exercised: `skx`
* Single-threaded (1 core) execution requested via no change in environment variables
* Multithreaded (26 core) execution requested via `export BLIS_JC_NT=2 BLIS_IC_NT=13`
* Multithreaded (52 core) execution requested via `export BLIS_JC_NT=4 BLIS_IC_NT=13`
* OpenBLAS 0.3.5
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=52` (multithreaded, 52 cores)
* Single-threaded (1 core) execution requested via `export OPENBLAS_NUM_THREADS=1`
* Multithreaded (26 core) execution requested via `export OPENBLAS_NUM_THREADS=26`
* Multithreaded (52 core) execution requested via `export OPENBLAS_NUM_THREADS=52`
* Eigen 3.3.90
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019)
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
* Multithreaded (26 core) execution requested via `export OMP_NUM_THREADS=26`
* Multithreaded (52 core) execution requested via `export OMP_NUM_THREADS=52`
* **NOTE**: This version of Eigen does not provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm`, and therefore those curves are omitted from the multithreaded graphs.
* MKL 2019 update 1
* Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1`
* Multithreaded (26 core) execution requested via `export MKL_NUM_THREADS=26`
* Multithreaded (52 core) execution requested via `export MKL_NUM_THREADS=52`
* Affinity:
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0 1 2 3 ... 51"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
* Frequency throttling (via `cpupower`):
* Driver: acpi-cpufreq
* Governor: performance
* Hardware limits: 1.0GHz - 2.0GHz
* Adjusted minimum: 2.0GHz
* Comments:
* MKL yields superb performance for most operations, though BLIS is not far behind except for `trsm`. (We understand the `trsm` underperformance and hope to address it in the future.) OpenBLAS lags far behind MKL and BLIS due to lack of full support for AVX-512, and possibly other reasons related to software architecture and register/cache blocksizes.
### SkylakeX results
#### pdf
* [SkylakeX single-threaded](graphs/large/l3_perf_skx_nt1.pdf)
* [SkylakeX multithreaded (26 cores)](graphs/large/l3_perf_skx_jc2ic13_nt26.pdf)
* [SkylakeX multithreaded (52 cores)](graphs/large/l3_perf_skx_jc4ic13_nt52.pdf)
#### png (inline)
* **SkylakeX single-threaded**
![single-threaded](graphs/large/l3_perf_skx_nt1.png)
* **SkylakeX multithreaded (26 cores)**
![multithreaded (26 cores)](graphs/large/l3_perf_skx_jc2ic13_nt26.png)
* **SkylakeX multithreaded (52 cores)**
![multithreaded (52 cores)](graphs/large/l3_perf_skx_jc4ic13_nt52.png)
---
## Haswell
### Haswell experiment details
* Location: TACC (Lonestar5)
* Processor model: Intel Xeon E5-2690 v3 (Haswell)
* Core topology: two sockets, 12 cores per socket, 24 cores total
* SMT status: enabled, but not utilized
* Max clock rate: 3.5GHz (single-core), 3.1GHz (multicore)
* Max vector register length: 256 bits (AVX2)
* Max FMA vector IPC: 2
* Peak performance:
* single-core: 56 GFLOPS (double-precision), 112 GFLOPS (single-precision)
* multicore: 49.6 GFLOPS/core (double-precision), 99.2 GFLOPS/core (single-precision)
* Operating system: Cray Linux Environment 6 (Linux kernel 4.4.103)
* Compiler: gcc 6.3.0
* Results gathered: 25-26 February 2019, 27 March 2019
* Implementations tested:
* BLIS 075143df (0.5.1-39)
* configured with `./configure -t openmp auto` (single- and multithreaded)
* sub-configuration exercised: `haswell`
* Single-threaded (1 core) execution requested via no change in environment variables
* Multithreaded (12 core) execution requested via `export BLIS_JC_NT=2 BLIS_IC_NT=3 BLIS_JR_NT=2`
* Multithreaded (24 core) execution requested via `export BLIS_JC_NT=4 BLIS_IC_NT=3 BLIS_JR_NT=2`
* OpenBLAS 0.3.5
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=24` (multithreaded, 24 cores)
* Single-threaded (1 core) execution requested via `export OPENBLAS_NUM_THREADS=1`
* Multithreaded (12 core) execution requested via `export OPENBLAS_NUM_THREADS=12`
* Multithreaded (24 core) execution requested via `export OPENBLAS_NUM_THREADS=24`
* Eigen 3.3.90
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019)
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12`
* Multithreaded (24 core) execution requested via `export OMP_NUM_THREADS=24`
* **NOTE**: This version of Eigen does not provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm`, and therefore those curves are omitted from the multithreaded graphs.
* MKL 2018 update 2
* Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1`
* Multithreaded (12 core) execution requested via `export MKL_NUM_THREADS=12`
* Multithreaded (24 core) execution requested via `export MKL_NUM_THREADS=24`
* Affinity:
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0 1 2 3 ... 23"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
* Frequency throttling (via `cpupower`):
* No changes made.
* Comments:
* We were pleasantly surprised by how competitive BLIS performs relative to MKL on this multicore Haswell system, which is a _very_ common microarchitecture, and _very_ similar to the more recent Broadwells, Skylakes (desktop), Kaby Lakes, and Coffee Lakes that succeeded it.
### Haswell results
#### pdf
* [Haswell single-threaded](graphs/large/l3_perf_has_nt1.pdf)
* [Haswell multithreaded (12 cores)](graphs/large/l3_perf_has_jc2ic3jr2_nt12.pdf)
* [Haswell multithreaded (24 cores)](graphs/large/l3_perf_has_jc4ic3jr2_nt24.pdf)
#### png (inline)
* **Haswell single-threaded**
![single-threaded](graphs/large/l3_perf_has_nt1.png)
* **Haswell multithreaded (12 cores)**
![multithreaded (12 cores)](graphs/large/l3_perf_has_jc2ic3jr2_nt12.png)
* **Haswell multithreaded (24 cores)**
![multithreaded (24 cores)](graphs/large/l3_perf_has_jc4ic3jr2_nt24.png)
---
## Epyc
### Epyc experiment details
* Location: Oracle cloud
* Processor model: AMD Epyc 7551 (Zen1)
* Core topology: two sockets, 4 dies per socket, 2 core complexes (CCX) per die, 4 cores per CCX, 64 cores total
* SMT status: enabled, but not utilized
* Max clock rate: 3.0GHz (single-core), 2.55GHz (multicore)
* Max vector register length: 256 bits (AVX2)
* Max FMA vector IPC: 1
* Alternatively, FMA vector IPC is 2 when vectors are limited to 128 bits each.
* Peak performance:
* single-core: 24 GFLOPS (double-precision), 48 GFLOPS (single-precision)
* multicore: 20.4 GFLOPS/core (double-precision), 40.8 GFLOPS/core (single-precision)
* Operating system: Ubuntu 18.04 (Linux kernel 4.15.0)
* Compiler: gcc 7.3.0
* Results gathered: 6 March 2019, 19 March 2019, 27 March 2019
* Implementations tested:
* BLIS 9f1dbe5 (0.5.1-54)
* configured with `./configure -t openmp auto` (single- and multithreaded)
* sub-configuration exercised: `zen`
* Single-threaded (1 core) execution requested via no change in environment variables
* Multithreaded (32 core) execution requested via `export BLIS_JC_NT=1 BLIS_IC_NT=8 BLIS_JR_NT=4`
* Multithreaded (64 core) execution requested via `export BLIS_JC_NT=2 BLIS_IC_NT=8 BLIS_JR_NT=4`
* OpenBLAS 0.3.5
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
* configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=64` (multithreaded, 64 cores)
* Single-threaded (1 core) execution requested via `export OPENBLAS_NUM_THREADS=1`
* Multithreaded (32 core) execution requested via `export OPENBLAS_NUM_THREADS=32`
* Multithreaded (64 core) execution requested via `export OPENBLAS_NUM_THREADS=64`
* Eigen 3.3.90
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019)
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
* Multithreaded (32 core) execution requested via `export OMP_NUM_THREADS=32`
* Multithreaded (64 core) execution requested via `export OMP_NUM_THREADS=64`
* **NOTE**: This version of Eigen does not provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm`, and therefore those curves are omitted from the multithreaded graphs.
* MKL 2019 update 1
* Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1`
* Multithreaded (32 core) execution requested via `export MKL_NUM_THREADS=32`
* Multithreaded (64 core) execution requested via `export MKL_NUM_THREADS=64`
* Affinity:
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0 1 2 3 ... 63"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
* Frequency throttling (via `cpupower`):
* Driver: acpi-cpufreq
* Governor: performance
* Hardware limits: 1.2GHz - 2.0GHz
* Adjusted minimum: 2.0GHz
* Comments:
* MKL performance is dismal, despite being linked in the same manner as on the Xeon Platinum. It's not clear what is causing the slowdown. It could be that MKL's runtime kernel/blocksize selection logic is falling back to some older, more basic implementation because CPUID is not returning Intel as the hardware vendor. Alternatively, it's possible that MKL is trying to use kernels for the closest Intel architectures--say, Haswell/Broadwell--but its implementations use Haswell-specific optimizations that, due to microarchitectural differences, degrade performance on Zen.
### Epyc results
#### pdf
* [Epyc single-threaded](graphs/large/l3_perf_epyc_nt1.pdf)
* [Epyc multithreaded (32 cores)](graphs/large/l3_perf_epyc_jc1ic8jr4_nt32.pdf)
* [Epyc multithreaded (64 cores)](graphs/large/l3_perf_epyc_jc2ic8jr4_nt64.pdf)
#### png (inline)
* **Epyc single-threaded**
![single-threaded](graphs/large/l3_perf_epyc_nt1.png)
* **Epyc multithreaded (32 cores)**
![multithreaded (32 cores)](graphs/large/l3_perf_epyc_jc1ic8jr4_nt32.png)
* **Epyc multithreaded (64 cores)**
![multithreaded (64 cores)](graphs/large/l3_perf_epyc_jc2ic8jr4_nt64.png)
---
# Feedback
Please let us know what you think of these performance results! Similarly, if you have any questions or concerns, or are interested in reproducing these performance experiments on your own hardware, we invite you to [open an issue](https://github.com/flame/blis/issues) and start a conversation with BLIS developers.
Thanks for your interest in BLIS!

224
docs/PerformanceSmall.md Normal file
View File

@@ -0,0 +1,224 @@
# Contents
* **[Contents](PerformanceSmall.md#contents)**
* **[Introduction](PerformanceSmall.md#introduction)**
* **[General information](PerformanceSmall.md#general-information)**
* **[Level-3 performance](PerformanceSmall.md#level-3-performance)**
* **[Kaby Lake](PerformanceSmall.md#kaby-lake)**
* **[Experiment details](PerformanceSmall.md#kaby-lake-experiment-details)**
* **[Results](PerformanceSmall.md#kaby-lake-results)**
* **[Epyc](PerformanceSmall.md#epyc)**
* **[Experiment details](PerformanceSmall.md#epyc-experiment-details)**
* **[Results](PerformanceSmall.md#epyc-results)**
* **[Feedback](PerformanceSmall.md#feedback)**
# Introduction
This document showcases performance results for the level-3 `gemm` operation
on small matrices with BLIS and BLAS for select hardware architectures.
# General information
Generally speaking, for level-3 operations on small matrices, we publish
two "panels" for each type of hardware, one that reflects performance on
row-stored matrices and another for column-stored matrices.
Each panel will consist of a 4x7 grid of graphs, with each row representing
a different transposition case (`nn`, `nt`, `tn`, `tt`)
and each column representing a different shape scenario, usually
with one or two matrix dimensions bound to a fixed size for all problem
sizes tested.
Each of the 28 graphs within a panel will contain an x-axis that reports
problem size, with one, two, or all three matrix dimensions equal to the
problem size (e.g. _m_ = 6; _n_ = _k_, also encoded as `m6npkp`).
The y-axis will report in units GFLOPS (billions of floating-point operations
per second) on a single core.
It's also worth pointing out that the top of each graph (e.g. the maximum
y-axis value depicted) _always_ corresponds to the theoretical peak performance
under the conditions associated with that graph.
Theoretical peak performance, in units of GFLOPS, is calculated as the
product of:
1. the maximum sustainable clock rate in GHz; and
2. the maximum number of floating-point operations (flops) that can be
executed per cycle.
Note that the maximum sustainable clock rate may change depending on the
conditions.
For example, on some systems the maximum clock rate is higher when only one
core is active (e.g. single-threaded performance) versus when all cores are
active (e.g. multithreaded performance).
The maximum number of flops executable per cycle (per core) is generally
computed as the product of:
1. the maximum number of fused multiply-add (FMA) vector instructions that
can be issued per cycle (per core);
2. the maximum number of elements that can be stored within a single vector
register (for the datatype in question); and
3. 2.0, since an FMA instruction fuses two operations (a multiply and an add).
The problem size range, represented on the x-axis, is sampled in
increments of 4 up to 800 for the cases where one or two dimensions are small
(and constant)
and up to 400 in the case where all dimensions (e.g. _m_, _n_, and _k_) are
bound to the problem size (i.e., square matrices).
Note that the constant small matrix dimensions were chosen to be _very_
small--in the neighborhood of 8--intentionally to showcase what happens when
at least one of the matrices is abnormally "skinny." Typically, organizations
and individuals only publish performance with square matrices, which can miss
the problem sizes of interest to many applications. Here, in addition to square
matrices (shown in the seventh column), we also show six other scenarios where
one or two `gemm` dimensions (of _m_, _n_, and _k_) are small.
The legend in each graph contains two entries for BLIS, corresponding to the
two black lines, one solid and one dotted. The dotted line, **"BLIS conv"**,
represents the conventional implementation that targets large matrices. This
was the only implementation available in BLIS prior to the addition of the
small/skinny matrix support. The solid line, **"BLIS sup"**, makes use of the
new small/skinny matrix implementation for certain small problems. Whenever
these results differ by any significant amount (beyond noise), it denotes a
problem size for which BLIS employed the new small/skinny implementation.
Put another way, **the delta between these two lines represents the performance
improvement between BLIS's previous status quo and the new regime.**
Finally, each point along each curve represents the best of three trials.
# Interpretation
In general, the curves associated with higher-performing implementations
will appear higher in the graphs than lower-performing implementations.
Ideally, an implementation will climb in performance (as a function of problem
size) as quickly as possible and asymptotically approach some high fraction of
peak performance.
When corresponding with us, via email or when opening an
[issue](https://github.com/flame/blis/issues) on github, we kindly ask that
you specify as closely as possible (though a range is fine) your problem
size of interest so that we can better assist you.
# Level-3 performance
## Kaby Lake
### Kaby Lake experiment details
* Location: undisclosed
* Processor model: Intel Core i5-7500 (Kaby Lake)
* Core topology: one socket, 4 cores total
* SMT status: unavailable
* Max clock rate: 3.8GHz (single-core)
* Max vector register length: 256 bits (AVX2)
* Max FMA vector IPC: 2
* Peak performance:
* single-core: 57.6 GFLOPS (double-precision), 115.2 GFLOPS (single-precision)
* Operating system: Gentoo Linux (Linux kernel 5.0.7)
* Compiler: gcc 7.3.0
* Results gathered: 31 May 2019, 3 June 2019, 19 June 2019
* Implementations tested:
* BLIS 6bf449c (0.5.2-42)
* configured with `./configure --enable-cblas auto`
* sub-configuration exercised: `haswell`
* OpenBLAS 0.3.6
* configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
* BLASFEO 2c9f312
* configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`.
* Eigen 3.3.90
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (30 May 2019)
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
* Requested threading via `export OMP_NUM_THREADS=1` (single-threaded)
* MKL 2018 update 4
* Requested threading via `export MKL_NUM_THREADS=1` (single-threaded)
* Affinity:
* N/A.
* Frequency throttling (via `cpupower`):
* Driver: intel_pstate
* Governor: performance
* Hardware limits: 800MHz - 3.8GHz
* Adjusted minimum: 3.7GHz
* Comments:
* For both row- and column-stored matrices, BLIS's new small/skinny matrix implementation is competitive with (or exceeds the performance of) the next highest-performing solution (typically MKL), except for a few cases where the _k_ dimension is very small. It is likely the case that this shape scenario begs a different kernel approach, since the BLIS microkernel is inherently designed to iterate over many _k_ dimension iterations (which leads it to incur considerable overhead for small values of _k_).
* For the classic case of `dgemm_nn` on square matrices, BLIS is the fastest implementation for the problem size range of approximately 80 to 180. BLIS is also competitive in this general range for other transpose parameter combinations (`nt`, `tn`, and `tt`).
### Kaby Lake results
#### pdf
* [Kaby Lake row-stored](graphs/sup/dgemm_rrr_kbl_nt1.pdf)
* [Kaby Lake column-stored](graphs/sup/dgemm_ccc_kbl_nt1.pdf)
#### png (inline)
* **Kaby Lake row-stored**
![row-stored](graphs/sup/dgemm_rrr_kbl_nt1.png)
* **Kaby Lake column-stored**
![column-stored](graphs/sup/dgemm_ccc_kbl_nt1.png)
---
## Epyc
### Epyc experiment details
* Location: Oracle cloud
* Processor model: AMD Epyc 7551 (Zen1)
* Core topology: two sockets, 4 dies per socket, 2 core complexes (CCX) per die, 4 cores per CCX, 64 cores total
* SMT status: enabled, but not utilized
* Max clock rate: 3.0GHz (single-core), 2.55GHz (multicore)
* Max vector register length: 256 bits (AVX2)
* Max FMA vector IPC: 1
* Alternatively, FMA vector IPC is 2 when vectors are limited to 128 bits each.
* Peak performance:
* single-core: 24 GFLOPS (double-precision), 48 GFLOPS (single-precision)
* Operating system: Ubuntu 18.04 (Linux kernel 4.15.0)
* Compiler: gcc 7.3.0
* Results gathered: 31 May 2019, 3 June 2019, 19 June 2019
* Implementations tested:
* BLIS 6bf449c (0.5.2-42)
* configured with `./configure --enable-cblas auto`
* sub-configuration exercised: `zen`
* OpenBLAS 0.3.6
* configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
* BLASFEO 2c9f312
* configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`.
* Eigen 3.3.90
* Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (30 May 2019)
* Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
* Requested threading via `export OMP_NUM_THREADS=1` (single-threaded)
* MKL 2019 update 4
* Requested threading via `export MKL_NUM_THREADS=1` (single-threaded)
* Affinity:
* N/A.
* Frequency throttling (via `cpupower`):
* Driver: acpi-cpufreq
* Governor: performance
* Hardware limits: 1.2GHz - 2.0GHz
* Adjusted minimum: 2.0GHz
* Comments:
* As with Kaby Lake, BLIS's new small/skinny matrix implementation is competitive with (or exceeds the performance of) the next highest-performing solution, except for a few cases where the _k_ dimension is very small.
* For the classic case of `dgemm_nn` on square matrices, BLIS is the fastest implementation for the problem size range of approximately 12 to 256. BLIS is also competitive in this general range for other transpose parameter combinations (`nt`, `tn`, and `tt`).
### Epyc results
#### pdf
* [Epyc row-stored](graphs/sup/dgemm_rrr_epyc_nt1.pdf)
* [Epyc column-stored](graphs/sup/dgemm_ccc_epyc_nt1.pdf)
#### png (inline)
* **Epyc row-stored**
![row-stored](graphs/sup/dgemm_rrr_epyc_nt1.png)
* **Epyc column-stored**
![column-stored](graphs/sup/dgemm_ccc_epyc_nt1.png)
---
# Feedback
Please let us know what you think of these performance results! Similarly, if you have any questions or concerns, or are interested in reproducing these performance experiments on your own hardware, we invite you to [open an issue](https://github.com/flame/blis/issues) and start a conversation with BLIS developers.
Thanks for your interest in BLIS!

View File

@@ -4,6 +4,8 @@
## Contents
* [Changes in 0.6.0](ReleaseNotes.md#changes-in-060)
* [Changes in 0.5.2](ReleaseNotes.md#changes-in-052)
* [Changes in 0.5.1](ReleaseNotes.md#changes-in-051)
* [Changes in 0.5.0](ReleaseNotes.md#changes-in-050)
* [Changes in 0.4.1](ReleaseNotes.md#changes-in-041)
@@ -33,6 +35,70 @@
* [Changes in 0.0.2](ReleaseNotes.md#changes-in-002)
* [Changes in 0.0.1](ReleaseNotes.md#changes-in-001)
## Changes in 0.6.0
June 3, 2019
Improvements present in 0.6.0:
Framework:
- Implemented small/skinny/unpacked (sup) framework for accelerated level-3 performance when at least one matrix dimension is small (or very small). For now, only `dgemm` is optimized, and this new implementation currently only targets Intel Haswell through Coffee Lake, and AMD Zen-based Ryzen/Epyc. (The existing kernels should extend without significant modification to Zen2-based Ryzen/Epyc once they are available.) Also, multithreaded parallelism is not yet implemented, though application-level threading should be fine. (AMD)
- Changed function pointer usages of `void*` to new, typedef'ed type `void_fp`.
- Allow compile-time disabling of BLAS prototypes in BLIS, in case the application already has access to prototypes.
- In `bli_system.h`, define `_POSIX_C_SOURCE` to `200809L` if the macro is not already defined. This ensures that things such as pthreads are properly defined by an application that has `#include "blis.h"` but omits the definition of `_POSIX_C_SOURCE` from the command-line compiler options. (Christos Psarras)
Kernels:
- None.
Build system:
- Updated the way configure and the top-level Makefile handle installation prefixes (`prefix`, `exec_prefix`, `libdir`, `includedir`, `sharedir`) to better conform with GNU conventions.
- Improved clang version detection. (Isuru Fernando)
- Use pthreads on MinGW and Cygwin. (Isuru Fernando)
Testing:
- Added Eigen support to test drivers in `test/3`.
- Fix inadvertently hidden `xerbla_()` in blastest drivers when building only shared libraries. (Isuru Fernando, M. Zhou)
Documentation:
- Added `docs/PerformanceSmall.md` to showcase new BLIS small/skinny `dgemm` performance on Kaby Lake and Epyc.
- Added Eigen results (3.3.90) to performance graphs showcased in `docs/Performance.md`.
- Added BLIS thread factorization info to `docs/Performance.md`.
## Changes in 0.5.2
March 19, 2019
Improvements present in 0.5.2:
Framework:
- Added support for IC loop parallelism to the `trsm` operation.
- Implemented a pool-based small block allocator and a corresponding `configure` option (enabled by default), which minimizes the number of calls to `malloc()` and `free()` for the purposes of allocating small blocks (on the order of 100 bytes). These small blocks are used by internal data structures, and the repeated allocation and freeing of these structures could, perhaps, cause memory fragmentation issues in certain application circumstances. This was never reproduced or observed, however, and remains entirely theoretical. Still, the sba should be no slower, and perhaps a little faster, than repeatedly calling `malloc()` and `free()` for these internal data structures. Also, the sba was designed to be thread-safe. (AMD)
- Refined and extended the output enabled by `--enable-mem-tracing`, which allows a developer to follow memory allocation and release performed by BLIS.
- Initialize error messages at compile-time rather than at runtime. (Minh Quan Ho)
- Fixed a potential situation whereby the multithreading parameters in a `rntm_t` object that is passed into an expert interface is ignored.
- Prevent a redefinition of `ftnlen` in the `f2c_types.h` in blastest. (Jeff Diamond)
Kernels:
- Adjusted the cache blocksizes in the `zen` sub-configuration for `float`, `scomplex`, and `dcomplex` datatypes. The previous values, taken directly from the `haswell` subconfig, were merely meant to be reasonable placeholders until more suitable values were determined, as had already taken place for the `double` datatype. (AMD)
- Rewrote reference kernels in terms of simplified indexing annotated by the `#pragma omp simd` directive, which a compiler can use to vectorize certain constant-bounded loops. The `#pragma` is disabled via a preprocessor macro layer if the compiler is found by `configure` to not support `-fopenmp-simd`. (Devin Matthews, Jeff Hammond)
Build system:
- Added symbol-export annotation macros to all of the function prototypes and global variable declarations for public symbols, and created a new `configure` option, `--export-shared=[public|all]`, that controls which symbols--only those that are meant to be public, or all symbols--are exported to the shared library. (Isuru Fernando)
- Standardized to using `-O3` in various subconfigs, and also `-funsafe-math-optimizations` for reference kernels. (Dave Love, Jeff Hammond)
- Disabled TBM, XOP, LWP instructions in all AMD subconfigs. (Devin Matthews)
- Fixed issues that prevented using BLIS on GNU Hurd. (M. Zhou)
- Relaxed python3 requirements to allow python 3.4 or later. Previously, python 3.5 or later was required if python3 was being used. (Dave Love)
- Added `thunderx2` sub-configuration. (Devangi Parikh)
- Added `power9` sub-configuration. For now, this subconfig only uses reference kernels. (Nicholai Tukanov)
- Fixed an issue with `configure` failing on OSes--including certain flavors of BSD--that contain a slash '/' character in the output of `uname -s`. (Isuru Fernando, M. Zhou)
Testing:
- Renamed `test/3m4m` directory to `test/3`.
- Lots of updates and improvements to Makefiles, shell scripts, and matlab scripts in `test/3`.
Documentation:
- Added a new `docs/Performance.md` document that showcases single-threaded, single-socket, and dual-socket performance results of `single`, `double`, `scomplex`, and `dcomplex` level-3 operations in BLIS, OpenBLAS, and MKL/ARMPL for Haswell, SkylakeX, ThunderX2, and Epyc hardware architectures. (Note: Other implementations such as Eigen and ATLAS may be added to these graphs in the future.)
- Updated `README.md` to include new language on external packages. (Dave Love)
- Updated `docs/Multithreading.md` to be more explicit about the fact that multithreading is disabled by default at configure-time, and the fact that BLIS will run single-threaded at runtime by default if no multithreading specification is given. (M. Zhou)
## Changes in 0.5.1
December 18, 2018
@@ -88,7 +154,7 @@ Kernels:
Build system:
- Added support for building Windows DLLs via AppVeyor [2], complete with a built-in implementation of pthreads for Windows, as well as an implementation of the `pthread_barrier_*()` APIs for use on OS X. (Isuru Fernando, Devin Matthews, Mathieu Poumeyrol, Matthew Honnibal)
- Defined a `cortexa53` sub-configuration, which is similar to `cortexa57` except that it uses slightly different compiler flags. (Mathieu Poumeyrol)
- Added python version checking to configure script.
- Added python version checking to `configure` script.
- Added a script to automate the regeneration of the symbols list file (now located in `build/libblis-symbols.def`).
- Various tweaks in preparation for BLIS's inclusion within Debian. (M. Zhou)
- Various fixes and cleanups.
@@ -246,16 +312,16 @@ May 2, 2017
- Implemented the 1m method for inducing complex matrix multiplication. (Please see ACM TOMS publication ["Implementing high-performance complex matrix multiplication via the 1m method"](https://github.com/flame/blis#citations) for more details.)
- Switched to simpler `trsm_r` implementation.
- Relaxed constraints that `MC % NR = 0` and `NC % MR = 0`, as this was only needed for the more sophisticated `trsm_r` implementation.
- Automatic loop thread assignment. (Devin Matthews)
- Updates to `.travis.yml` configuration file. (Devin Matthews)
- Automatic loop thread assignment. (Devin Matthews)
- Updates to `.travis.yml` configuration file. (Devin Matthews)
- Updates to non-default haswell microkernels.
- Match storage format of the temporary micro-tiles in macrokernels to that of the microkernel storage preference for edge cases.
- Added support for Intel's Knight's Landing. (Devin Matthews)
- Added more flexible options to specify multithreading via the configure script. (Devin Matthews)
- OS X compatibility fixes. (Devin Matthews)
- Other small changes and fixes.
- Added support for Intel's Knight's Landing. (Devin Matthews)
- Added more flexible options to specify multithreading via the configure script. (Devin Matthews)
- OS X compatibility fixes. (Devin Matthews)
- Other small changes and fixes.
Also, thanks to Elmar Peise, Krzysztof Drewniak, and Francisco Igual for their contributions in reporting/fixing certain bugs that were addressed in this version.
Also, thanks to Elmar Peise, Krzysztof Drewniak, and Francisco Igual for their contributions in reporting/fixing certain bugs that were addressed in this version.
## Changes in 0.2.1
October 5, 2016
@@ -439,7 +505,7 @@ While neither `bli_config.h` nor `bli_kernel.h` has changed formats since 0.0.7,
## Changes in 0.0.7
April 30, 2013
This version incorporates many small fixes and feature enhancements made during our SC13 collaboration.
This version incorporates many small fixes and feature enhancements made during our SC13 collaboration.
## Changes in 0.0.6
April 13, 2013
@@ -478,7 +544,7 @@ The compatibility layer is enabled via a configuration option in `bl2_config.h`.
## Changes in 0.0.2
February 11, 2013
Most notably, this version contains the new test suite I've been working on for the last month.
Most notably, this version contains the new test suite I've been working on for the last month.
What is the test suite? It is a highly configurable test driver that allows one to test an arbitrary set of BLIS operations, with an arbitrary set of parameter combinations, and matrix/vector storage formats, as well as whichever datatypes you are interested in. (For now, only homogeneous datatyping is supported, which is what most people want.) You can also specify an arbitrary problem size range with arbitrary increments, and arbitrary ratios between dimensions (or anchor a dimension to a single value), and you can output directly to files which store the output in matlab syntax, which makes it easy to generate performance graphs.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 108 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 115 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 78 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 96 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 96 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 81 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 104 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 101 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 88 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 92 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 100 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 70 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 169 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 195 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 171 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 203 KiB

View File

@@ -114,7 +114,7 @@ CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME))
CFLAGS += -I$(TEST_SRC_PATH)
# Locate the libblis library to which we will link.
LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
# Binary executable name.
TEST_BINS := 00obj_basic.x \

View File

@@ -102,7 +102,7 @@ CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME))
CFLAGS += -I$(TEST_SRC_PATH)
# Locate the libblis library to which we will link.
LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
# Binary executable name.
TEST_BINS := 00level1v.x \

View File

@@ -64,7 +64,7 @@ void PASTEMAC0(opname) \
bli_obj_scalar_set_dt_buffer( chi, dt_absq_c, &dt_chi, &buf_chi ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
\
f \
@@ -100,7 +100,7 @@ void PASTEMAC0(opname) \
PASTEMAC(opname,_check)( chi, psi ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \
\
f \
@@ -137,7 +137,7 @@ void PASTEMAC0(opname) \
PASTEMAC(opname,_check)( chi ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \
\
f \
@@ -170,7 +170,7 @@ void PASTEMAC0(opname) \
PASTEMAC(opname,_check)( chi, psi ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \
\
f \
@@ -213,7 +213,7 @@ void PASTEMAC0(opname) \
else dt_use = dt_chi; \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_use ); \
\
f \
@@ -247,7 +247,7 @@ void PASTEMAC0(opname) \
PASTEMAC(opname,_check)( zeta_r, zeta_i, chi ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
\
f \
@@ -290,7 +290,7 @@ void PASTEMAC0(opname) \
bli_obj_scalar_set_dt_buffer( chi, dt_zeta_c, &dt_chi, &buf_chi ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
\
f \
@@ -327,7 +327,7 @@ void PASTEMAC0(opname) \
PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \
\
/* Query a type-specific function pointer, except one that uses
void* instead of typed pointers. */ \
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
\
f \

View File

@@ -40,7 +40,7 @@
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
obj_t* absq \
@@ -53,7 +53,7 @@ GENPROT( normfsc )
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
obj_t* psi \
@@ -69,7 +69,7 @@ GENPROT( subsc )
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi \
);
@@ -80,7 +80,7 @@ GENPROT( invertsc )
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
double* zeta_r, \
@@ -93,7 +93,7 @@ GENPROT( getsc )
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
double zeta_r, \
double zeta_i, \
@@ -106,7 +106,7 @@ GENPROT( setsc )
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
obj_t* zeta_r, \
@@ -119,7 +119,7 @@ GENPROT( unzipsc )
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* zeta_r, \
obj_t* zeta_i, \

View File

@@ -40,7 +40,7 @@
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
conj_t conjchi, \
ctype* chi, \
@@ -56,7 +56,7 @@ INSERT_GENTPROT_BASIC0( subsc )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
conj_t conjchi, \
ctype* chi \
@@ -68,7 +68,7 @@ INSERT_GENTPROT_BASIC0( invertsc )
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype* chi, \
ctype_r* absq \
@@ -81,7 +81,7 @@ INSERT_GENTPROTR_BASIC0( normfsc )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype* chi, \
ctype* psi \
@@ -93,7 +93,7 @@ INSERT_GENTPROT_BASIC0( sqrtsc )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype* chi, \
double* zeta_r, \
@@ -106,7 +106,7 @@ INSERT_GENTPROT_BASIC0( getsc )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
double zeta_r, \
double zeta_i, \
@@ -119,7 +119,7 @@ INSERT_GENTPROT_BASIC0( setsc )
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype* chi, \
ctype_r* zeta_r, \
@@ -132,7 +132,7 @@ INSERT_GENTPROTR_BASIC0( unzipsc )
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype_r* zeta_r, \
ctype_r* zeta_i, \
@@ -143,14 +143,14 @@ INSERT_GENTPROTR_BASIC0( zipsc )
// -----------------------------------------------------------------------------
void bli_igetsc
BLIS_EXPORT_BLIS void bli_igetsc
(
dim_t* chi,
double* zeta_r,
double* zeta_i
);
void bli_isetsc
BLIS_EXPORT_BLIS void bli_isetsc
(
double zeta_r,
double zeta_i,

View File

@@ -40,7 +40,7 @@
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
obj_t* psi \
@@ -55,7 +55,7 @@ GENFRONT( copysc )
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
void PASTEMAC2(chx,chy,varname) \
BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
( \
conj_t conjchi, \
void* chi, \

Some files were not shown because too many files have changed in this diff Show More