Merge branch 'ref/heads/amd-staging-rome-2.2' of ssh://git.amd.com:29418/cpulibraries/er/blis into amd-staging-rome-2.2

Change-Id: I46acf48354ff73fb4eaeac255132d21095ea4d98
2026-05-14 03:02:08 +00:00 · 2020-05-30 10:31:10 +05:30
parent bb7eeec843 72443e7173
commit 0c52aaefe1
128 changed files with 6945 additions and 2494 deletions
--- a/2261
+++ b/2261
--- a/12
+++ b/12
@@ -15,9 +15,11 @@ but many others have contributed code and feedback, including
  Erling Andersen          @erling-d-andersen
  Alex Arslan              @ararslan
  Vernon Austel                                (IBM, T.J. Watson Research Center)
+  Satish Balay             @balay              (Argonne National Laboratory)
  Matthew Brett            @matthew-brett      (University of Birmingham)
  Jed Brown                @jedbrown           (Argonne National Laboratory)
  Robin Christ             @robinchrist
+  Mat Cross                @matcross           (NAG)
  Kay Dewhurst             @jkd2016            (Max Planck Institute, Halle, Germany)
  Jeff Diamond                                 (Oracle)
  Johannes Dieterich       @iotamudelta
@@ -43,23 +45,27 @@ but many others have contributed code and feedback, including
  Tony Kelman              @tkelman
  Lee Killough             @leekillough        (Cray)
  Mike Kistler             @mkistler           (IBM, Austin Research Laboratory)
+  Kyungmin Lee             @kyungminlee        (Ohio State University)
  Michael Lehn             @michael-lehn
-                           @ShmuelLevine
+  Shmuel Levine            @ShmuelLevine
  Dave Love                @loveshack
  Tze Meng Low                                 (The University of Texas at Austin)
  Ye Luo                   @ye-luo             (Argonne National Laboratory)
  Ricardo Magana           @magania            (Hewlett Packard Enterprise)
  Bryan Marker             @bamarker           (The University of Texas at Austin)
+  Simon Lukas Märtens      @ACSimon33          (RWTH Aachen University)
  Devin Matthews           @devinamatthews     (The University of Texas at Austin)
  Stefanos Mavros          @smavros
+  Bhaskar Nallani          @BhaskarNallani     (AMD)
  Nisanth Padinharepatt                        (AMD)
+  Ajay Panyala             @ajaypanyala
  Devangi Parikh           @dnparikh           (The University of Texas at Austin)
  Elmar Peise              @elmar-peise        (RWTH-Aachen)
  Clément Pernet           @ClementPernet
  Ilya Polkovnichenko
  Jack Poulson             @poulson            (Stanford)
  Mathieu Poumeyrol        @kali
-  Christos Psarras         @ChrisPsa           (RWTH-Aachen)
+  Christos Psarras         @ChrisPsa           (RWTH Aachen University)
                           @qnerd
  Michael Rader            @mrader1248
  Pradeep Rao              @pradeeptrgit       (AMD)
@@ -73,7 +79,7 @@ but many others have contributed code and feedback, including
  Nathaniel Smith          @njsmith
  Shaden Smith             @ShadenSmith
  Tyler Smith              @tlrmchlsmth        (The University of Texas at Austin)
-  Paul Springer            @springer13         (RWTH-Aachen)
+  Paul Springer            @springer13         (RWTH Aachen University)
  Adam J. Stewart          @adamjstewart       (University of Illinois at Urbana-Champaign)
  Vladimir Sukarev
  Santanu Thangaraj                            (AMD)
--- a/README.md
+++ b/README.md
@@ -113,16 +113,16 @@ and high performance." Their statement continues, "The framework will continue
 having an important influence on the design and the instantiation of dense linear
 algebra libraries."

- * **Small/skinny matrix support for dgemm now available!** Thanks to
+ * **Multithreaded small/skinny matrix support for dgemm now available!** Thanks to
 contributions made possible by our partnership with AMD, we have dramatically
 accelerated `gemm` for double-precision real matrix problems where one or two
 dimensions are exceedingly small. A natural byproduct of this optimization is
 that the traditional case of small _m = n = k_ (i.e. square matrices) is also
 accelerated, even though it was not targeted specifically. And though only
-`dgemm` was optimized for now, support for other datatypes, other operations,
-and/or multithreading may be implemented in the future. We've also added a new
-[PerformanceSmall](docs/PerformanceSmall.md) document to showcase the
-improvement in performance when some matrix dimensions are small.
+`dgemm` was optimized for now, support for other datatypes and/or other operations
+may be implemented in the future. We've also added new graphs to the
+[PerformanceSmall](docs/PerformanceSmall.md) document to showcase multithreaded
+performance when one or more matrix dimensions are small.

 * **Performance comparisons now available!** We recently measured the
 performance of various level-3 operations on a variety of hardware architectures,
@@ -489,6 +489,12 @@ Debian package tracker can be found [here](https://tracker.debian.org/pkg/blis).
 (Also, thanks to [Nico Schlömer](https://github.com/nschloe) for previously
 volunteering his time to set up a standalone PPA.)

+ * **Gentoo**. [M. Zhou](https://github.com/cdluminate) also maintains the
+[BLIS package](https://packages.gentoo.org/packages/sci-libs/blis) entry for
+[Gentoo](https://www.gentoo.org/), a Linux distribution known for its
+source-based [portage](https://wiki.gentoo.org/wiki/Portage) package manager
+and distribution system.
+
 * **EPEL/Fedora**. There are official BLIS packages in Fedora and EPEL (for
 RHEL7+ and compatible distributions) with versions for 64-bit integers, OpenMP,
 and pthreads, and shims which can be dynamically linked instead of reference
@@ -637,13 +643,13 @@ A fifth paper, submitted to ACM TOMS, begins the study of so-called
 ``` 

 A sixth paper, submitted to SISC, revisits the topic of the previous
-article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev2.pdf):
+article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_sisc_rev1.pdf):

 ```
@article{BLIS6,
   author      = {Field G. {V}an~{Z}ee},
   title       = {Implementing High-Performance Complex Matrix Multiplication via the 1m Method},
-   journal     = {ACM Transactions on Mathematical Software},
+   journal     = {SIAM Journal on Scientific Computing},
   note        = {submitted}
 }
 ``` 
--- a/11
+++ b/11
@@ -26,14 +26,19 @@ Here are the steps to follow to create a new release (version) of BLIS:
 6. Update docs/ReleaseNotes.md file with body of finalized announcement
   and the date of the release.

-7. Bump the version number:
+7. Commit changes from steps 5 and 6.
+
+8. Bump the version number:

   $ ./build/bump-version.sh "0.3.2"

-8. Push the new commits and new tag associated with the new version:
+   This will result in two new commits: a version file update and a CHANGELOG
+   file update.
+
+9. Push the new commits and new tag associated with the new version:

   $ git push
   $ git push --tag

-9. Send finalized announcement to blis-devel.
+10. Send finalized announcement to blis-devel.

--- a/attic/windows/Makefile
+++ b/attic/windows/Makefile
@@ -1,341 +0,0 @@
-#
-#
-#  BLIS    
-#  An object-based framework for developing high-performance BLAS-like
-#  libraries.
-#
-#  Copyright (C) 2014, The University of Texas at Austin
-#
-#  Redistribution and use in source and binary forms, with or without
-#  modification, are permitted provided that the following conditions are
-#  met:
-#   - Redistributions of source code must retain the above copyright
-#     notice, this list of conditions and the following disclaimer.
-#   - Redistributions in binary form must reproduce the above copyright
-#     notice, this list of conditions and the following disclaimer in the
-#     documentation and/or other materials provided with the distribution.
-#   - Neither the name(s) of the copyright holder(s) nor the names of its
-#     contributors may be used to endorse or promote products derived
-#     from this software without specific prior written permission.
-#
-#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-
-
-#
-# --- Include variables determined at configure-time --------------------------
-#
-CONFIGURE_DEFS = config\config.mk
-
-!if exist ( $(CONFIGURE_DEFS) )
-!include $(CONFIGURE_DEFS)
-!else
-!error nmake: $(CONFIGURE_DEFS) does not exist! Run configure.cmd first.
-!endif
-
-
-
-#
-# --- Include environment- and build-specific definitions ----------------------
-#
-
-MAKE_DEFS = build\defs.mk
-
-# Include build definitions
-!if exist ( $(MAKE_DEFS) )
-!include $(MAKE_DEFS)
-!else
-!error nmake: $(MAKE_DEFS) does not exist! Your libblis distribution may be incomplete.
-!endif
-
-
-
-#
-# --- Variable modifications ---------------------------------------------------
-#
-
-
-
-#
-# --- High-level rules ---------------------------------------------------------
-#
-
-all: libblis
-
-libblis: libblis-lib
-
-libblis-objs: $(BLIS_OBJS)
-
-libblis-lib: $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS_LIB)
-
-libblis-dll: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS_DLL)
-
-lib: libblis-lib
-
-dll: libblis-dll
-
-install: install-lib install-headers
-
-install-lib: $(INSTALL_PREFIX_LIB)\$(LIBBLIS).lib
-
-install-dll: $(INSTALL_PREFIX_DLL)\$(LIBBLIS).dll \
-             $(INSTALL_PREFIX_DLL)\$(LIBBLIS).lib \
-             $(INSTALL_PREFIX_DLL)\$(LIBBLIS).exp
-
-install-headers: $(INSTALL_PREFIX_INC)\$(BLIS_H)
-
-clean: clean-build clean-log
-
-distclean: clean-config clean-build clean-log
-
-
-
-#
-# --- Source code (inference) rules --------------------------------------------
-#
-
-# --- C source files in flamec directory ---
-{$(SRC_BLI_DIRPATH)}.c{$(OBJ_BLI_DIRPATH)}.obj:
-!ifdef VERBOSE
-	if not exist $(OBJ_BLI_DIRPATH) \
-	   ( $(MKDIR) $(OBJ_BLI_DIRPATH) )
-	$(CC) $(CFLAGS) /c $< /Fo$@
-!else
-	@if not exist $(OBJ_BLI_DIRPATH) \
-	 (  ( $(ECHO) nmake: Creating $(OBJ_BLI_DIRPATH) directory ) & \
-	    ( $(MKDIR) $(OBJ_BLI_DIRPATH) ) )
-	@$(ECHO) nmake: Compiling $<
-	@$(CC) $(CFLAGS) /c $< /Fo$@ >> $(CC_LOG_FILE)
-!endif
-
-
-
-#
-# --- Library generation rules -------------------------------------------------
-#
-
-# --- Static library ---
-$(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS_LIB): libblis-objs
-!ifdef VERBOSE
-	if not exist $(LIB_LIBBLIS_DIRPATH) \
-	   ( $(MKDIR) $(LIB_LIBBLIS_DIRPATH) )
-	$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(LIB_LIBBLIS_DIRPATH)
-	$(CD) $(LIB_LIBBLIS_DIRPATH)
-	$(LIB) $(LIB_OPTIONS) $(LIB_BLI_OUTPUT_ARG) $(LIB_BLI_INPUT_ARGS)
-	$(DEL) *.obj
-	$(CD) $(TOP_BUILD_DIR_ABS)
-!else
-	@if not exist $(LIB_LIBBLIS_DIRPATH) \
-	 (  ( $(ECHO) nmake: Creating $(LIB_LIBBLIS_DIRPATH) directory ) & \
-	    ( $(MKDIR) $(LIB_LIBBLIS_DIRPATH) ) )
-	@$(ECHO) nmake: Creating static library $@
-	@$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(LIB_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)
-	@$(CD) $(LIB_LIBBLIS_DIRPATH)
-	@$(LIB) /VERBOSE $(LIB_OPTIONS) $(LIB_BLI_OUTPUT_ARG) $(LIB_BLI_INPUT_ARGS)
-	@$(DEL) *.obj
-	@$(CD) $(TOP_BUILD_DIR_ABS)
-!endif
-
-# --- Dynamic library (object code file, import library, and export file) ---
-$(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS_DLL): libblis-objs
-!ifdef VERBOSE
-	if not exist $(DLL_LIBBLIS_DIRPATH) \
-	   ( $(MKDIR) $(DLL_LIBBLIS_DIRPATH) )
-	$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(DLL_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)
-	$(CD) $(DLL_LIBBLIS_DIRPATH)
-	$(DIR) /B *.obj > $(OBJ_LIST_FILE)
-	$(GENDLL) $(LIBBLIS) $(LIBBLIS) $(CC) $(LINKARGS_FILEPATH) $(SYM_DEF_FILEPATH) /objlist $(OBJ_LIST_FILE)
-	$(DEL) $(OBJ_LIST_FILE)
-	$(DEL) *.obj
-	$(CD) $(TOP_BUILD_DIR_ABS)
-!else
-	@if not exist $(DLL_LIBBLIS_DIRPATH) \
-	 (  ( $(ECHO) nmake: Creating $(DLL_LIBBLIS_DIRPATH) directory ) & \
-	    ( $(MKDIR) $(DLL_LIBBLIS_DIRPATH) ) )
-	@$(ECHO) nmake: Creating dynamic library $@
-	@$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(DLL_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)
-	@$(CD) $(DLL_LIBBLIS_DIRPATH)
-	@$(DIR) /B *.obj > $(OBJ_LIST_FILE)
-	@$(GENDLL) $(LIBBLIS) $(LIBBLIS) $(CC) $(LINKARGS_FILEPATH) $(SYM_DEF_FILEPATH) /objlist $(OBJ_LIST_FILE)
-	@$(DEL) $(OBJ_LIST_FILE)
-	@$(DEL) *.obj
-	@$(CD) $(TOP_BUILD_DIR_ABS)
-!endif
-
-
-
-#
-# --- Install rules ------------------------------------------------------------
-#
-
-# --- Header files ---
-$(INSTALL_PREFIX_INC)\$(BLIS_H): $(INC_BLI_DIRPATH)\$(BLIS_H) \
-                                  $(BUILD_DIRNAME)\$(BLI_CONFIG_H)
-!ifdef VERBOSE
-	if not exist $(INSTALL_PREFIX_INC) \
-	   ( $(MKDIR) $(INSTALL_PREFIX_INC) )
-    $(COPY) $(BUILD_DIRNAME)\$(BLI_CONFIG_H) $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)
-    $(COPY) $(INC_BLI_DIRPATH)\*.h $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)
-!else
-	@if not exist $(INSTALL_PREFIX_INC) \
-	    ( $(MKDIR) $(INSTALL_PREFIX_INC) )
-    @$(ECHO) nmake: Installing libblis header files to $(INSTALL_PREFIX_INC)
-    @$(COPY) $(BUILD_DIRNAME)\$(BLI_CONFIG_H) $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)
-    @$(COPY) $(INC_BLI_DIRPATH)\*.h $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)
-!endif
-
-# --- Static library ---
-$(INSTALL_PREFIX_LIB)\$(LIBBLIS).lib: $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib
-!ifdef VERBOSE
-	if not exist $(INSTALL_PREFIX_LIB) ( $(MKDIR) $(INSTALL_PREFIX_LIB) )
-	if     exist $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \
-	   ( $(COPY) $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_LIB) >> $(COPY_LOG_FILE) )
-!else
-	@if not exist $(INSTALL_PREFIX_LIB) ( $(MKDIR) $(INSTALL_PREFIX_LIB) )
-	@if     exist $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \
-	 (  ( $(ECHO) nmake: Installing $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib to $(INSTALL_PREFIX_LIB) ) & \
-	    ( $(COPY) $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_LIB) >> $(COPY_LOG_FILE) ) )
-!endif
-
-# --- Dynamic library (object code) ---
-$(INSTALL_PREFIX_DLL)\$(LIBBLIS).dll: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll
-!ifdef VERBOSE
-	if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
-	if     exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll \
-	   ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )
-!else
-	@if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
-	@if     exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll \
-	 (  ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll to $(INSTALL_PREFIX_DLL) ) & \
-	    ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )
-!endif
-
-# --- Dynamic library (import library) ---
-$(INSTALL_PREFIX_DLL)\$(LIBBLIS).lib: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib
-!ifdef VERBOSE
-	if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
-	if     exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \
-	   ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )
-!else
-	@if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
-	@if     exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \
-	 (  ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib to $(INSTALL_PREFIX_DLL) ) & \
-	    ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )
-!endif
-
-# --- Dynamic library (export file) ---
-$(INSTALL_PREFIX_DLL)\$(LIBBLIS).exp: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp
-!ifdef VERBOSE
-	if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
-	if     exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp \
-	   ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )
-!else
-	@if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
-	@if     exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp \
-	 (  ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp to $(INSTALL_PREFIX_DLL) ) & \
-	    ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )
-!endif
-
-
-
-#
-# --- Clean rules --------------------------------------------------------------
-#
-
-clean-log:
-!ifdef VERBOSE
-	if exist $(CC_LOG_FILE) \
-	   ( $(DEL) $(CC_LOG_FILE) )
-	if exist $(FC_LOG_FILE) \
-	   ( $(DEL) $(FC_LOG_FILE) )
-	if exist $(COPY_LOG_FILE) \
-	   ( $(DEL) $(COPY_LOG_FILE) )
-!else
-	@if exist $(CC_LOG_FILE) \
-	 (  ( $(ECHO) nmake: Deleting $(CC_LOG_FILE) ) & \
-	    ( $(DEL) $(CC_LOG_FILE) ) )
-	@if exist $(FC_LOG_FILE) \
-	 (  ( $(ECHO) nmake: Deleting $(FC_LOG_FILE) ) & \
-	    ( $(DEL) $(FC_LOG_FILE) ) )
-	@if exist $(COPY_LOG_FILE) \
-	 (  ( $(ECHO) nmake: Deleting $(COPY_LOG_FILE) ) & \
-	    ( $(DEL) $(COPY_LOG_FILE) ) )
-!endif
-
-clean-config:
-!ifdef VERBOSE
-	if exist $(CNF_DIRNAME) \
-	   ( $(RMDIR) $(CNF_DIRNAME) )
-	if exist $(INC_DIRNAME) \
-	   ( $(RMDIR) $(INC_DIRNAME) )
-	if exist $(SRC_DIRNAME) \
-	   ( $(RMDIR) $(SRC_DIRNAME) )
-!else
-	@if exist $(CNF_DIRNAME) \
-	 (  ( $(ECHO) nmake: Deleting $(CNF_DIRNAME) directory ) & \
-	    ( $(RMDIR) $(CNF_DIRNAME) ) )
-	@if exist $(INC_DIRNAME) \
-	 (  ( $(ECHO) nmake: Deleting $(INC_DIRNAME) directory ) & \
-	    ( $(RMDIR) $(INC_DIRNAME) ) )
-	@if exist $(SRC_DIRNAME) \
-	 (  ( $(ECHO) nmake: Deleting $(SRC_DIRNAME) directory ) & \
-	    ( $(RMDIR) $(SRC_DIRNAME) ) )
-!endif
-
-clean-build:
-!ifdef VERBOSE
-	if exist $(OBJ_DIRNAME) \
-	   ( $(RMDIR) $(OBJ_DIRNAME) )
-	if exist $(LIB_DIRNAME) \
-	   ( $(RMDIR) $(LIB_DIRNAME) )
-	if exist $(DLL_DIRNAME) \
-	   ( $(RMDIR) $(DLL_DIRNAME) )
-!else
-	@if exist $(OBJ_DIRNAME) \
-	 (  ( $(ECHO) nmake: Deleting $(OBJ_DIRNAME) directory ) & \
-	    ( $(RMDIR) $(OBJ_DIRNAME) ) )
-	@if exist $(LIB_DIRNAME) \
-	 (  ( $(ECHO) nmake: Deleting $(LIB_DIRNAME) directory ) & \
-	    ( $(RMDIR) $(LIB_DIRNAME) ) )
-	@if exist $(DLL_DIRNAME) \
-	 (  ( $(ECHO) nmake: Deleting $(DLL_DIRNAME) directory ) & \
-	    ( $(RMDIR) $(DLL_DIRNAME) ) )
-!endif
-
-# Useful for developing when all we want to do is remove the library products.
-clean-lib:
-!ifdef VERBOSE
-	if exist $(LIB_DIRNAME) \
-	   ( $(RMDIR) $(LIB_DIRNAME) )
-	if exist $(DLL_DIRNAME) \
-	   ( $(RMDIR) $(DLL_DIRNAME) )
-!else
-	@if exist $(LIB_DIRNAME) \
-	 (  ( $(ECHO) nmake: Deleting $(LIB_DIRNAME) directory ) & \
-	    ( $(RMDIR) $(LIB_DIRNAME) ) )
-	@if exist $(DLL_DIRNAME) \
-	 (  ( $(ECHO) nmake: Deleting $(DLL_DIRNAME) directory ) & \
-	    ( $(RMDIR) $(DLL_DIRNAME) ) )
-!endif
-
-
-
-#
-# --- Help target --------------------------------------------------------------
-#
-
-help:
-	@$(NMAKE_HELP)
-
--- a/attic/windows/build/config.mk.in
+++ b/attic/windows/build/config.mk.in
@@ -1,52 +0,0 @@
-#
-#
-#  BLIS    
-#  An object-based framework for developing high-performance BLAS-like
-#  libraries.
-#
-#  Copyright (C) 2014, The University of Texas at Austin
-#
-#  Redistribution and use in source and binary forms, with or without
-#  modification, are permitted provided that the following conditions are
-#  met:
-#   - Redistributions of source code must retain the above copyright
-#     notice, this list of conditions and the following disclaimer.
-#   - Redistributions in binary form must reproduce the above copyright
-#     notice, this list of conditions and the following disclaimer in the
-#     documentation and/or other materials provided with the distribution.
-#   - Neither the name(s) of the copyright holder(s) nor the names of its
-#     contributors may be used to endorse or promote products derived
-#     from this software without specific prior written permission.
-#
-#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-#
-# --- Configuration variable definitions ---------------------------------------
-#
-# Environment-related variables:
-#   REVISION            - The code's revision number.
-#   PWD                 - The path to current working directory.
-#   ARCH_STR            - A string to identify the requested build architecture.
-#   BUILD_STR           - A string to identify the requested build type.
-#   CCOMPILER_STR       - A string to identify the requested C compiler.
-#
-# Target-related variables:
-#   FLAMEC_OBJS         - List of paths to flamec object files.
-#   LAPACK2FLAMEC_OBJS  - List of paths to lapack2flamec object files.
-#
-# Note: these variables are not present in the .in template file. Instead, they
-# are appended to the contents of the .in file by a build script and output to
-# a separate file (by the same name, without the .in extension).
-#
--- a/attic/windows/build/defs.mk
+++ b/attic/windows/build/defs.mk
@@ -1,240 +0,0 @@
-#
-#
-#  BLIS    
-#  An object-based framework for developing high-performance BLAS-like
-#  libraries.
-#
-#  Copyright (C) 2014, The University of Texas at Austin
-#
-#  Redistribution and use in source and binary forms, with or without
-#  modification, are permitted provided that the following conditions are
-#  met:
-#   - Redistributions of source code must retain the above copyright
-#     notice, this list of conditions and the following disclaimer.
-#   - Redistributions in binary form must reproduce the above copyright
-#     notice, this list of conditions and the following disclaimer in the
-#     documentation and/or other materials provided with the distribution.
-#   - Neither the name(s) of the copyright holder(s) nor the names of its
-#     contributors may be used to endorse or promote products derived
-#     from this software without specific prior written permission.
-#
-#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-
-#
-# --- General build system options --------------------------------------------
-#
-
-# Uncomment this for verbose output from nmake.
-# VERBOSE = 1
-
-# Assign this variable to be the full path to the directory to which you would
-# like the BLIS build products to be installed upon running "nmake install".
-# The nmake install target will create the install directory and all requisite
-# subdirectories if they do not already exist (in which case the user must have
-# permission to create these directories).
-INSTALL_PREFIX = c:\field\lib
-
-
-#
-# --- Important build system filenames ----------------------------------------
-#
-
-# DLL link arguments. The contents of this file should be customized when
-# building a dynamically-linked library. The lines of the file should contain
-# linker options, library names, and library paths. Note that the library
-# paths must be declared in the following form:
-#
-#   /link /LIBPATH:<path1>
-#   /link /LIBPATH:<path2>
-#   /link /LIBPATH:<path3>
-#
-# where <path1>, <path2>, and <path3> are library paths to add to the list
-# of paths to search when the linker attempts to locate other libraries
-# listed in the file.
-LINKARGS_FILENAME = linkargs.txt
-LINKARGS_FILEPATH = $(PWD)\$(LINKARGS_FILENAME)
-
-# Various log file names that capture standard output when VERBOSE is undefined.
-CC_LOG_FILE   = nmake-cc.log
-FC_LOG_FILE   = nmake-fc.log
-COPY_LOG_FILE = nmake-copy.log
-
-
-#
-# --- General name and directory definitions -----------------------------------
-#
-
-# The relative and absolute locations of the top-level Windows build directory.
-# This is the directory in which nmake is run (not the directory named "build").
-TOP_BUILD_DIR_REL = .
-TOP_BUILD_DIR_ABS = $(PWD)
-
-# The revision string.
-REV_STR           = r$(REVISION)
-
-# The names of the libraries.
-LIBBLIS_NAME_ONLY = libblis
-LIBBLIS           = $(LIBBLIS_NAME_ONLY)-$(ARCH_STR)-$(REV_STR)
-
-# Directories that reside within the top-level Windows directory.
-CNF_DIRNAME       = config
-INC_DIRNAME       = include
-SRC_DIRNAME       = frame
-OBJ_DIRNAME       = obj
-LIB_DIRNAME       = lib
-DLL_DIRNAME       = dll
-
-# Leaves of interest for Windows.
-
-# Relative directory paths to each of the above subdirectories.
-INC_DIRPATH       = $(TOP_BUILD_DIR_REL)\$(INC_DIRNAME)
-SRC_DIRPATH       = $(TOP_BUILD_DIR_REL)\$(SRC_DIRNAME)
-OBJ_DIRPATH       = $(TOP_BUILD_DIR_REL)\$(OBJ_DIRNAME)
-LIB_DIRPATH       = $(TOP_BUILD_DIR_REL)\$(LIB_DIRNAME)
-DLL_DIRPATH       = $(TOP_BUILD_DIR_REL)\$(DLL_DIRNAME)
-
-# We only have header files for flamec leaves.
-INC_BLI_DIRPATH   = $(INC_DIRPATH)
-
-# We have source code for flamec and lapack2flamec leaves.
-SRC_BLI_DIRPATH   = $(SRC_DIRPATH)
-
-
-# And we have object file paths corresponding to those source leaves defined
-# above.
-OBJ_BLI_DIRPATH   = $(OBJ_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)
-
-# Separate directories into which we'll move object files when we create the
-# static libraries.
-LIB_LIBBLIS_DIRPATH = $(LIB_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)
-
-# Separate directories into which we'll move object files when we create the
-# dynamic libraries.
-DLL_LIBBLIS_DIRPATH = $(DLL_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)
-
-# The install subdirectories.
-INSTALL_PREFIX_LIB = $(INSTALL_PREFIX)\libblis\lib
-INSTALL_PREFIX_DLL = $(INSTALL_PREFIX)\libblis\dll
-INSTALL_PREFIX_INC = $(INSTALL_PREFIX)\libblis\include-$(ARCH_STR)-$(REV_STR)
-
-# Definitions for important header files used in the install-headers rule.
-BUILD_DIRNAME      = build
-BLIS_H             = blis.h
-
-
-#
-# --- General shell definitions ------------------------------------------------
-#
-
-CD     = cd
-DIR    = dir
-COPY   = copy
-DEL    = del /F /Q
-MKDIR  = mkdir
-RMDIR  = rd /S /Q
-ECHO   = echo
-
-
-#
-# --- Helper scripts -----------------------------------------------------------
-#
-
-NMAKE_HELP = .\build\nmake-help.cmd
-
-
-
-#
-# --- Compiler-related definitions ---------------------------------------------
-#
-
-#!include $(VERSION_FILE)
-
-# --- C compiler definitions ---
-
-WINDOWS_BUILD = BLIS_ENABLE_WINDOWS_BUILD
-VERS_STR      = 0.0.9
-VERSION       = BLIS_VERSION_STRING=\"$(VERS_STR)\"
-
-!if "$(CCOMPILER_STR)"=="icl"
-
-!if "$(BUILD_STR)"=="debug"
-CDEBUG = /Zi
-COPTIM = /Od
-!elseif "$(BUILD_STR)"=="release"
-CDEBUG =
-COPTIM = /Ox
-!endif
-
-CC            = icl.exe
-CMISCFLAGS    = /nologo
-CLANGFLAGS    =
-CPPROCFLAGS   = /I.\build /I$(INC_BLI_DIRPATH) /D$(WINDOWS_BUILD) /D$(VERSION)
-CWARNFLAGS    = /w
-CDBGFLAGS     = $(CDEBUG)
-COPTFLAGS     = $(COPTIM)
-CRTIMEFLAGS   = /MT
-CMTHREADFLAGS = /Qopenmp
-CFLAGS        = $(CMISCFLAGS) $(CLANGFLAGS) $(CPPROCFLAGS) $(CWARNFLAGS) \
-                $(CDBGFLAGS) $(COPTFLAGS) $(CRTIMEFLAGS) $(CMTHREADFLAGS)
-
-!elseif "$(CCOMPILER_STR)"=="cl"
-
-!if "$(BUILD_STR)"=="debug"
-CDEBUG = /Zi
-COPTIM = /Od
-!elseif "$(BUILD_STR)"=="release"
-CDEBUG =
-COPTIM = /Ox
-!endif
-
-CC            = cl.exe
-CMISCFLAGS    = /nologo
-CLANGFLAGS    =
-CPPROCFLAGS   = /I.\build /I$(INC_BLI_DIRPATH) /D$(WINDOWS_BUILD) /D$(VERSION)
-CWARNFLAGS    = /w
-CDBGFLAGS     = $(CDEBUG)
-COPTFLAGS     = $(COPTIM)
-CRTIMEFLAGS   = /MT
-CMTHREADFLAGS = /openmp
-CFLAGS        = $(CMISCFLAGS) $(CLANGFLAGS) $(CPPROCFLAGS) $(CWARNFLAGS) \
-                $(CDBGFLAGS) $(COPTFLAGS) $(CRTIMEFLAGS) $(CMTHREADFLAGS)
-
-!endif
-
-
-
-#
-# --- Library-related definitions ----------------------------------------------
-#
-
-# --- Static library definitions ---
-
-LIBBLIS_LIB          = $(LIBBLIS).lib
-
-LIB                   = lib
-LIB_OPTIONS           = /nologo
-LIB_BLI_OUTPUT_ARG    = /out:$(LIBBLIS_LIB)
-LIB_BLI_INPUT_ARGS    = *.obj
-
-# --- Dynamic library definitions ---
-
-LIBBLIS_DLL          = $(LIBBLIS).dll
-
-GENDLL                = $(TOP_BUILD_DIR_ABS)\gendll.cmd
-OBJ_LIST_FILE         = libblis-objects.txt
-
-SYM_DEF_FILEPATH      = $(TOP_BUILD_DIR_ABS)\$(BUILD_DIRNAME)\libblis-symbols.def
-
--- a/attic/windows/build/gather-src-for-windows.py
+++ b/attic/windows/build/gather-src-for-windows.py
@@ -1,351 +0,0 @@
-#! /usr/bin/env python
-#
-#  BLIS    
-#  An object-based framework for developing high-performance BLAS-like
-#  libraries.
-#
-#  Copyright (C) 2014, The University of Texas at Austin
-#
-#  Redistribution and use in source and binary forms, with or without
-#  modification, are permitted provided that the following conditions are
-#  met:
-#   - Redistributions of source code must retain the above copyright
-#     notice, this list of conditions and the following disclaimer.
-#   - Redistributions in binary form must reproduce the above copyright
-#     notice, this list of conditions and the following disclaimer in the
-#     documentation and/or other materials provided with the distribution.
-#   - Neither the name(s) of the copyright holder(s) nor the names of its
-#     contributors may be used to endorse or promote products derived
-#     from this software without specific prior written permission.
-#
-#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-# ------------------------------------------------------------------------------
-
-# Import modules
-import sys
-import os
-import os.path
-import getopt
-import shutil
-import string
-
-# Global variables for command line options, with default settings.
-script_name  = ""
-dry_run_flag = False
-verbose_flag = False
-
-# Global constants
-flat_config_dirname  = "config"
-flat_header_dirname  = "include"
-flat_source_dirname  = "frame"
-leaf_list_path       = "build/leaf_list"
-ignore_list_path     = "build/ignore_list"
-ignore_list_win_path = "build/ignore_list.windows"
-
-# ------------------------------------------------------------------------------
-
-def print_usage():
-	
-	# Print help information.
-	print " "
-	print " %s" % script_name
-	print " "
-	print " Field G. Van Zee"
-	print " "
-	print " Walk the BLIS source tree and copy all sources necessary for"
-	print " building BLIS under Windows into a single flat directory with"
-	print " no subdirectory hierarchy."
-	print " "
-	print " Usage:"
-	print "   %s [options] tree_dir flat_dir" % script_name
-	print " "
-	print " The following options are accepted:"
-	print " "
-	print "   -d          dry-run"
-	print "                 Go through all the motions, but don't actually copy any"
-	print "                 files."
-	print "   -v          verbose"
-	print "                 Be verbose about actions (one line of output her action)."
-	print " "
-
-	# Exit the script.
-	sys.exit()
-
-# ------------------------------------------------------------------------------
-
-def main():
-
-	# Declare our global variables.
-	global script_name
-	global dry_run_flag
-	global verbose_flag
-
-	# Get the script name so we can use it in our output.
-	( script_dir, script_name ) = os.path.split( sys.argv[0] )
-	
-	try:
-		
-		# Get the command line options.
-		options, args = getopt.getopt( sys.argv[1:], "dv")
-	
-	except getopt.GetoptError, err:
-	
-		# print help information and exit:
-		print str( err ) # will print something like "option -a not recognized"
-		print_usage()
-	
-	# Parse our expected command line options.
-	print 'checking options'
-	for o, a in options:
-		
-		if o == "-d":
-			print 'found dry run'
-			dry_run_flag = True
-		elif o == "-v":
-			verbose_flag = True
-		else:
-			assert False, "unhandled option"
-	
-	# Check the number of arguments after command line option processing.
-	n_args = len( args )
-	if n_args != 2:
-		print_usage() 
-
-	# Acquire the non-optional arguments.
-	tree_dir = args[0]
-	flat_dir = args[1]
-
-	# Acquire the list of directories we will ignore.
-	ignore_list = read_ignore_list()
-
-	# Acquire the list of leaf-type directories we will descend into.
-	leaf_list = read_leaf_list()
-
-	# Create strings for each of the base subdirectories in the flat
-	# destination directory.
-	flat_config_base_dirpath = os.path.join( flat_dir, flat_config_dirname )
-	flat_header_base_dirpath = os.path.join( flat_dir, flat_header_dirname )
-	flat_source_base_dirpath = os.path.join( flat_dir, flat_source_dirname )
-
-	# Start a list of directories to create.
-	dirs_to_create = []
-
-	# Append the config directory. We do this outside of the for loop because
-	# we don't need subdirectories for each leaf type.
-	dirs_to_create.append( flat_config_base_dirpath )
-
-	# For each of the leaf specifications, make the full pathnames of the
-	# subdirectories that will reside within the root destination directory.
-	for leaf_spec in leaf_list:
-		
-		# Unpack the leaf_spec tuple.
-		src_exts, hdr_exts = leaf_spec
-		
-		# Append the directory path name to our list. 
-		dirs_to_create.append( flat_header_base_dirpath )
-		dirs_to_create.append( flat_source_base_dirpath )
-
-	# Iterate over the directory list we just created.
-	for dirpath in dirs_to_create:
-
-		# Make the subdirectories within the root destination directory, but
-		# only if they are not existing directories.
-		if os.path.isdir( dirpath ) == False:
-	
-			# Take action only if this is not a dry run.
-			if dry_run_flag == False:
-	
-				# Be verbose if verbosity was requested.
-				if verbose_flag == True:
-					print "%s: creating directory %s" % ( script_name, dirpath )
-			
-				# Make the directory, and parent directories, for dirpath.
-				os.makedirs( dirpath )
-	
-			else:
-	
-				# Be verbose if verbosity was requested.
-				if verbose_flag == True:
-					print "%s: (dry-run) creating directory %s" % ( script_name, dirpath )
-
-
-	# Walk the directory structure top-down.
-	for dirpath, dirnames, filenames in os.walk( tree_dir ):
-		
-		# Remove directories that appear in the ignore list.
-		for item in ignore_list:
-			if item in dirnames:
-				dirnames.remove( item )
-
-		# Consider each leaf specification. If we find the name in the directory
-		# path, then copy the files with its designated extensions into the flat
-		# source directory.
-		for leaf_spec in leaf_list:
-
-			# Unpack the leaf_spec tuple.
-			src_exts, hdr_exts = leaf_spec
-
-			# At this point the following line can probably be removed.
-			type_dir_name = os.sep + ''
-
-			flat_source_leaf_dirpath = flat_source_base_dirpath
-			flat_header_leaf_dirpath = flat_header_base_dirpath
-
-			if dirpath.find( type_dir_name ) != -1:
-				copy_files_to_flat_subdirs( dirpath, filenames, src_exts, hdr_exts,
-				                            flat_source_leaf_dirpath,
-				                            flat_header_leaf_dirpath )
-
-# ------------------------------------------------------------------------------
-
-def copy_files_to_flat_subdirs( dirpath, filenames, src_exts, hdr_exts, src_dirpath, hdr_dirpath ):
-
-	# Consider all files in dirpath.
-	for filename in filenames:
-		
-		# Construct the full file path for the current file.
-		filepath = os.path.join( dirpath, filename )
-
-		# Iterate over the valid source extensions for the current directory
-		# path.
-		for src_ext in src_exts:
-
-			# If the filename/filepath ends with the source extension, copy it
-			# to the source subdirectory within the flat destination directory.
-			if filepath.endswith( src_ext ):
-				
-				# Take action only if this is not a dry run.
-				if dry_run_flag == False:
-	
-					# Be verbose if verbosity was requested.
-					if verbose_flag == True:
-						print "%s: copying to %s from %s" % ( script_name, src_dirpath, filepath )
-				
-					# Copy the source file to the source subdirectory.
-					shutil.copy2( filepath, src_dirpath )
-	
-				else:
-	
-					# Be verbose if verbosity was requested.
-					if verbose_flag == True:
-						print "%s: (dry-run) copying to %s from %s" % ( script_name, src_dirpath, filepath )
-	
-		# Iterate over the valid header extensions for the current directory
-		# path.
-		for hdr_ext in hdr_exts:
-
-			# If the filename/filepath ends with the header extension, copy it
-			# to the include subdirectory within the flat destination directory.
-			if filepath.endswith( hdr_ext ):
-	
-				# Take action only if this is not a dry run.
-				if dry_run_flag == False:
-	
-					# Be verbose if verbosity was requested.
-					if verbose_flag == True:
-						print "%s: copying to %s from %s" % ( script_name, hdr_dirpath, filepath )
-				
-					# Copy the header file to the header subdirectory.
-					shutil.copy2( filepath, hdr_dirpath )
-	
-				else:
-
-					# Be verbose if verbosity was requested.
-					if verbose_flag == True:
-						print "%s: (dry-run) copying to %s from %s" % ( script_name, hdr_dirpath, filepath )
-
-# ------------------------------------------------------------------------------
-
-def read_ignore_list():
-
-	# Open the ignore list files as read-only.
-	ignore_file     = open( ignore_list_path, 'r' )
-	ignore_file_win = open( ignore_list_win_path, 'r' )
-
-	# Read all lines in the ignore list files. The items in these lists contain
-	# newlines, which we'll strip out shortly.
-	raw_list     = ignore_file.readlines()
-	raw_win_list = ignore_file_win.readlines()
-
-	# Close the files.
-	ignore_file.close()
-	ignore_file_win.close()
-
-	# Initialize an empty ignore list for the stripped version of the raw list.
-	ignore_list = []
-
-	# Iterate over the first raw list.
-	for line in raw_list:
-		
-		# Append the stripped line to a new list.
-		ignore_list.append( line.strip() )
-
-	# Iterate over the second raw list.
-	for line in raw_win_list:
-		
-		# Append the stripped line to a new list.
-		ignore_list.append( line.strip() )
-
-	# Return the list of stripped lines.
-	return ignore_list
-
-# ------------------------------------------------------------------------------
-
-def read_leaf_list():
-
-	# Open the leaf list file.
-	leaf_file = open( leaf_list_path, 'r' )
-
-	# Read the lines in the file.
-	line_list = leaf_file.readlines()
-
-	# Start with a blank list.
-	leaf_list = []
-
-	# Iterate over the lines.
-	for line in line_list:
-
-		# Split the specification by colon to separate the fields.
-		fields = string.split( string.strip( line ), ':' )
-
-		# Get the individual fields of the specification.
-		src_exts = string.split( fields[0], ',' )
-		hdr_exts = string.split( fields[1], ',' )
-
-		# If it's a singleton list of an empty string, make it an empty list.
-		if len(src_exts) == 1:
-			if src_exts[0] == '':
-				src_exts = []
-		
-		# If it's a singleton list of an empty string, make it an empty list.
-		if len(hdr_exts) == 1:
-			if hdr_exts[0] == '':
-				hdr_exts = []
-		
-		# Pack the fields into a tuple.
-		leaf_spec = ( src_exts, hdr_exts )
-
-		
-		# Append the tuple to our list.
-		leaf_list.append( leaf_spec )
-
-	# Return the list.
-	return leaf_list
-
-# ------------------------------------------------------------------------------
-
-# Begin by executing main().
-main()
--- a/attic/windows/build/gen-check-rev-file.py
+++ b/attic/windows/build/gen-check-rev-file.py
@@ -1,252 +0,0 @@
-#! /usr/bin/env python
-#
-#  BLIS    
-#  An object-based framework for developing high-performance BLAS-like
-#  libraries.
-#
-#  Copyright (C) 2014, The University of Texas at Austin
-#
-#  Redistribution and use in source and binary forms, with or without
-#  modification, are permitted provided that the following conditions are
-#  met:
-#   - Redistributions of source code must retain the above copyright
-#     notice, this list of conditions and the following disclaimer.
-#   - Redistributions in binary form must reproduce the above copyright
-#     notice, this list of conditions and the following disclaimer in the
-#     documentation and/or other materials provided with the distribution.
-#   - Neither the name(s) of the copyright holder(s) nor the names of its
-#     contributors may be used to endorse or promote products derived
-#     from this software without specific prior written permission.
-#
-#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-# ------------------------------------------------------------------------------
-
-# Import modules
-import sys
-import os
-import os.path
-import getopt
-
-# Global variables for command line options, with default settings.
-script_name  = ""
-verbose_flag = False
-
-# Global constants
-toplevel_dirpath  = "."
-svn_dirname       = ".svn"
-entries_filename  = "entries"
-revision_filename = "revision"
-dummy_rev_string  = "unknown"
-
-
-# ------------------------------------------------------------------------------
-
-def print_usage():
-	
-	# Print help information.
-	print " "
-	print " %s" % script_name
-	print " "
-	print " Field G. Van Zee"
-	print " "
-	print " This script ensures that a revision file exists so nmake can include the"
-	print " revision number in the subdirectory paths to the build products."
-	print " "
-	print " If a .svn directory exists, the revision file is created (or updated)"
-	print " to contain the revision number contained in .svn\entries file."
-	print " Otherwise, if a .svn directory does not exist, the revision file is"
-	print " left untouched if it exists, and created with a dummy value if it does"
-	print " not."
-	print " "
-	print " This script is typically invoked by configure.cmd, but it can also be"
-	print " run manually."
-	print " "
-	print " Usage:"
-	print "   %s" % script_name
-	print " "
-	print " The following options are accepted:"
-	print " "
-	print "   -v          verbose"
-	print "                 Be verbose. Output what's happening."
-	print " "
-
-	# Exit the script.
-	sys.exit()
-
-# ------------------------------------------------------------------------------
-
-def main():
-
-	# Declare our global variables.
-	global script_name
-	global verbose_flag
-
-	# Get the script name so we can use it in our output.
-	( script_dir, script_name ) = os.path.split( sys.argv[0] )
-	
-	try:
-		
-		# Get the command line options.
-		options, args = getopt.getopt( sys.argv[1:], "v")
-	
-	except getopt.GetoptError, err:
-	
-		# print help information and exit:
-		print str( err ) # will print something like "option -a not recognized"
-		print_usage()
-	
-	# Parse our expected command line options.
-	for o, a in options:
-		
-		if o == "-v":
-			verbose_flag = True
-		else:
-			assert False, "unhandled option"
-	
-	# Check the number of arguments after command line option processing.
-	n_args = len( args )
-	if n_args != 0:
-		print_usage() 
-
-	# Construct the filepaths to the entries and revision files.
-	entries_filepath  = os.path.join( toplevel_dirpath, svn_dirname, entries_filename )
-	revision_filepath = os.path.join( toplevel_dirpath, revision_filename )
-
-	# Test for the existence of the entries file (and by proxy, a working copy).
-	entries_file_exists = file_exists( entries_filepath )
-
-	# If the entries file exists, we are in a working copy, and thus we can
-	# overwrite the revision file with a potentially new value.
-	if entries_file_exists == True:
-
-		# Read the revision number from the entries file.
-		rev_num_str = read_revision_from_entries( entries_filepath )
-
-		# Be verbose if verbosity was requested.
-		if verbose_flag == True:
-			print "%s: Found working copy; writing revision string \"%s\" to %s" % ( script_name, rev_num_str, revision_filepath )
-			
-		# Write the revision number to the revision file.
-		write_revision_to_file( rev_num_str, revision_filepath )
-
-	# If we can't find the entries file, we probably are in an exported
-	# copy: either an official snapshot, or a copy that someone exported
-	# manually--hopefully (and likely) the former.
-	else:
-
-		# Be verbose if verbosity was requested.
-		if verbose_flag == True:
-			print "%s: Found export. Checking for revision file..." % ( script_name )
-		
-		# Test for the existence of the revision file.
-		rev_file_exists = file_exists( revision_filepath )
-
-		# If the revision file does not exist, create a dummy file so the
-		# configure script has something to work with.
-		if rev_file_exists == False:
-
-			# Be verbose if verbosity was requested.
-			if verbose_flag == True:
-				print "%s: Revision file not found. Writing dummy revision string \"%s\" to %s" % ( script_name, dummy_rev_string, revision_filepath )
-			
-			# Write the dummy string to the revision file.
-			write_revision_to_file( dummy_rev_string, revision_filepath )
-
-		else:
-
-			# Get the revision number from the file just for the purposes of
-			# being verbose, if it was requested.
-			rev_num_str = read_revision_file( revision_filepath )
-
-			# Be verbose if verbosity was requested.
-			if verbose_flag == True:
-				print "%s: Revision file found containing revision string \"%s\". Export is valid snapshot!" % ( script_name, rev_num_str )
-
-
-# ------------------------------------------------------------------------------
-
-def file_exists( filepath ):
-
-	# Try to open the file read-only.
-	try:
-		
-		fp = open( filepath, 'r' )
-		fp.close()
-		exists = True
-	
-	except IOError, err:
-		
-		exists = False
-	
-	return exists
-
-
-# ------------------------------------------------------------------------------
-
-def read_revision_from_entries( entries_filepath ):
-
-	# Open the entries file as read-only.
-	entries_file = open( entries_filepath, 'r' )
-
-	# Read all lines in the entries file.
-	raw_list     = entries_file.readlines()
-
-	# Close the file.
-	entries_file.close()
-
-	# Grab the fourth line, which is where the revision number lives, and strip
-	# it of whitespace (probably just a newline).
-	rev_num_str = raw_list[3].strip()
-
-	# Return the revision number string.
-	return rev_num_str
-
-# ------------------------------------------------------------------------------
-
-def write_revision_to_file( rev_string, revision_filepath ):
-
-	# Open the revision file for writing.
-	revision_file = open( revision_filepath, 'w' )
-
-	# Write the revision string to the file.
-	revision_file.write( rev_string )
-
-	# Close the file.
-	revision_file.close()
-
-# ------------------------------------------------------------------------------
-
-def read_revision_file( revision_filepath ):
-
-	# Open the revision file.
-	revision_file = open( revision_filepath, 'r' )
-
-	# Read the first (and only) line.
-	line = revision_file.readline()
-
-	# Close the file.
-	revision_file.close()
-
-	# Grab the string and strip it of whitespace (should just be a newline).
-	rev_num_str = line.strip()
-
-	# Return the revision number string.
-	return rev_num_str
-
-# ------------------------------------------------------------------------------
-
-# Begin by executing main().
-main()
--- a/attic/windows/build/gen-config-file.py
+++ b/attic/windows/build/gen-config-file.py
@@ -1,360 +0,0 @@
-#! /usr/bin/env python
-#
-#  BLIS    
-#  An object-based framework for developing high-performance BLAS-like
-#  libraries.
-#
-#  Copyright (C) 2014, The University of Texas at Austin
-#
-#  Redistribution and use in source and binary forms, with or without
-#  modification, are permitted provided that the following conditions are
-#  met:
-#   - Redistributions of source code must retain the above copyright
-#     notice, this list of conditions and the following disclaimer.
-#   - Redistributions in binary form must reproduce the above copyright
-#     notice, this list of conditions and the following disclaimer in the
-#     documentation and/or other materials provided with the distribution.
-#   - Neither the name(s) of the copyright holder(s) nor the names of its
-#     contributors may be used to endorse or promote products derived
-#     from this software without specific prior written permission.
-#
-#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-# ------------------------------------------------------------------------------
-
-# Import modules
-import sys
-import os
-import os.path
-import getopt
-import re
-import string
-
-# Global variables for command line options, with default settings.
-script_name  = ""
-dry_run_flag = False
-verbose_flag = False
-
-# Global constants
-config_dirname     = "config"
-source_dirname     = "frame"
-object_dirname     = "obj"
-object_extension   = ".obj"
-leaf_list_path     = "build/leaf_list"
-revision_filename  = "revision"
-rev_varname        = "REVISION"
-pwd_varname        = "PWD"
-arch_varname       = "ARCH_STR"
-build_varname      = "BUILD_STR"
-ccompiler_varname  = "CCOMPILER_STR"
-
-
-# ------------------------------------------------------------------------------
-
-def print_usage():
-	
-	# Print help information.
-	print " "
-	print " %s" % script_name
-	print " "
-	print " Field G. Van Zee"
-	print " "
-	print " Create a config.mk file that is to be included by the nmake Makefile."
-	print " This config.mk file is based on a template, but also includes variable"
-	print " definitions that are needed for the specific build were are performing."
-	print " The variables which are currently appended to config.mk at runtime are:"
-	print "   - the revision string"
-	print "   - the path to the current working directory"
-	print "   - the build string (e.g. debug, release)"
-	print "   - the architecture string (e.g. x86, x64)"
-	print "   - the C compiler to use (e.g. icl, cl)"
-	print "   - a list of paths to the object files to be compiled"
-	print " The config.mk file is placed within the config subdirectory." 
-	print " "
-	print " Usage:"
-	print "   %s [options] flat_dir arch build ccompiler path\\to\\config.mk.in" % script_name
-	print " "
-	print " The following options are accepted:"
-	print " "
-	print "   -d          dry-run"
-	print "                 Go through all the motions, but don't actually output"
-	print "                 the nmake definition file."
-	print "   -v          verbose"
-	print "                 Be verbose about actions (one line of output her action)."
-	print " "
-
-	# Exit the script.
-	sys.exit()
-
-# ------------------------------------------------------------------------------
-
-def main():
-
-	# Declare our global variables.
-	global script_name
-	global dry_run_flag
-	global verbose_flag
-
-	# Get the script name so we can use it in our output.
-	( script_dir, script_name ) = os.path.split( sys.argv[0] )
-	
-	try:
-		
-		# Get the command line options.
-		options, args = getopt.getopt( sys.argv[1:], "dv")
-	
-	except getopt.GetoptError, err:
-	
-		# print help information and exit:
-		print str( err ) # will print something like "option -a not recognized"
-		print_usage()
-	
-	# Parse our expected command line options.
-	for o, a in options:
-		
-		if o == "-d":
-			dry_run_flag = True
-		elif o == "-v":
-			verbose_flag = True
-		else:
-			assert False, "unhandled option"
-	
-	# Check the number of arguments after command line option processing.
-	n_args = len( args )
-	if n_args != 5:
-		print_usage() 
-
-	# Acquire the non-optional arguments.
-	flat_dir         = args[0]
-	arch_string      = args[1]
-	build_string     = args[2]
-	ccompiler_string = args[3]
-	input_filepath   = args[4]
-
-	# Acquire the list of leaf-type directories we will descend into.
-	leaf_list = read_leaf_list()
-
-	# Read the contents of the template file.
-	template_file_line_list = read_template_file( input_filepath )
-
-	# Reuse the template's line list as the list of lines to be output.
-	output_file_line_list = template_file_line_list
-
-	# Read the revision number from the revision file.
-	rev_num_str = read_revision_file( revision_filename )
-
-	# Add a variable for the revision number of the code we're working with.
-	rev_var_value = rev_varname + " = " + rev_num_str + "\n"
-	output_file_line_list.append( rev_var_value )
-	
-	# Add a variable for the path to the current working directory and append
-	# it to our list.
-	pwd_var_value = pwd_varname + " = " + os.getcwd() + "\n"
-	output_file_line_list.append( pwd_var_value )
-	
-	# Add a variable for the architecture string and append it to our list.
-	arch_var_value = arch_varname + " = " + arch_string + "\n"
-	output_file_line_list.append( arch_var_value )
-	
-	# Add a variable for the build type string and append it to our list.
-	build_var_value = build_varname + " = " + build_string + "\n"
-	output_file_line_list.append( build_var_value )
-	
-	# Add a variable for the C compiler string and append it to our list.
-	ccompiler_var_value = ccompiler_varname + " = " + ccompiler_string + "\n"
-	output_file_line_list.append( ccompiler_var_value )
-	
-	# Walk the flat subdirectories for each of the leaves.
-	for leaf_spec in leaf_list:
-		
-		# Unpack the leaf_spec tuple.
-		src_exts, hdr_exts = leaf_spec
-
-		# Create the paths to the source and object subdirectories.
-		src_dirpath = os.path.join( flat_dir, source_dirname )
-		obj_dirpath = os.path.join( flat_dir, object_dirname, arch_string, build_string )
-
-		# Get a list of files from the leaf subdirectory.
-		src_filenames = os.listdir( src_dirpath )
-		
-		# This will be the nmake variable name to which we will assign the list
-		# of object files.
-		nmake_varname = "BLIS_OBJS"
-		
-		# Generate the line to output.
-		leaf_line = generate_object_list( nmake_varname, src_filenames, src_exts, obj_dirpath )
-
-		# Accumulate the lines.
-		output_file_line_list.append( leaf_line )
-	
-	# Get the filename part of the input filepath.
-	input_filedir, input_filename = os.path.split( input_filepath )
-
-	# Remove the .in extension in the output filename.
-	output_filename = re.sub( '.mk.in', '.mk', input_filename )
-	
-	# Construct the filepath for the output file.
-	output_filepath = os.path.join( flat_dir, config_dirname, output_filename )
-
-	# Write the output lines.
-	write_output_file( output_filepath, output_file_line_list )
-
-# ------------------------------------------------------------------------------
-
-def read_revision_file( filepath ):
-
-	# Try to open the revision file.
-	try:
-		
-		revision_file = open( filepath, 'r' )
-	
-	except IOError, err:
-		
-		print "%s: Couldn't open revision file %s" % ( script_name, filepath )
-		sys.exit(1)
-
-	# Read the first (and only) line.
-	line = revision_file.readline()
-
-	# Close the file.
-	revision_file.close()
-
-	# Grab the string and strip it of whitespace (should just be a newline).
-	rev_num_str = line.strip()
-
-	# Return the revision number string.
-	return rev_num_str
-
-# ------------------------------------------------------------------------------
-
-def generate_object_list( nmake_varname, src_filenames, src_exts, obj_dirpath ):
-
-	# Initialize the string as an assignment operation.
-	the_line = nmake_varname + " = "
-	
-	# Return early if there are no source extensions for this leaf spec.
-	if src_exts == []:
-		return ""
-
-	# Construct a pattern to match any file ending with any of the source file
-	# extensions given. This string is going to look something like "\.[cf]".
-	src_pattern = '\.['
-	for src_ext in src_exts:
-		src_pattern = src_pattern + src_ext
-	src_pattern = src_pattern + ']'
-
-	# Consider all source files.
-	for src_filename in src_filenames:
-		
-		obj_filename = re.sub( src_pattern, '.obj', src_filename )
-		
-		# Create the full path to the file.
-		obj_filepath = os.path.join( obj_dirpath, obj_filename )
-		
-		# Be verbose if verbosity was requested.
-		if verbose_flag == True:
-			print "%s: adding file %s" % ( script_name, obj_filepath )
-				
-		# And then add it to the list.
-		the_line = the_line + obj_filepath + " "
-
-	# Be verbose if verbosity was requested.
-	if verbose_flag == True:
-		print "%s: %s" % ( script_name, the_line )
-	
-	# Append a newline to the end of the line, for file.writelines().
-	the_line = the_line + "\n"
-
-	# Return the new line.
-	return the_line
-
-# ------------------------------------------------------------------------------
-
-def read_template_file( template_filepath ):
-	
-	# Open the template file as read-only.
-	template_file = open( template_filepath, 'r' )
-
-	# Read all lines in the template file.
-	template_file_lines = template_file.readlines()
-
-	# Close the file.
-	template_file.close()
-
-	# Return the list of lines in the template file.
-	return template_file_lines
-
-# ------------------------------------------------------------------------------
-
-def write_output_file( output_filepath, output_lines ):
-
-	# Take action only if this is not a dry run.
-	if dry_run_flag == False:
-
-		# Open the output file as writable.
-		output_file = open( output_filepath, 'w' )
-
-		# Write the lines.
-		output_file.writelines( output_lines )
-
-		# Close the file.
-		output_file.close()
-
-# ------------------------------------------------------------------------------
-
-def read_leaf_list():
-
-	# Open the leaf list file.
-	leaf_file = open( leaf_list_path, 'r' )
-
-	# Read the lines in the file.
-	line_list = leaf_file.readlines()
-
-	# Start with a blank list.
-	leaf_list = []
-
-	# Iterate over the lines.
-	for line in line_list:
-
-		# Split the specification by colon to separate the fields.
-		fields = string.split( string.strip( line ), ':' )
-
-		# Get the individual fields of the specification.
-		src_exts = string.split( fields[0], ',' )
-		hdr_exts = string.split( fields[1], ',' )
-		
-		# If it's a singleton list of an empty string, make it an empty list.
-		if len(src_exts) == 1:
-			if src_exts[0] == '':
-				src_exts = []
-		
-		# If it's a singleton list of an empty string, make it an empty list.
-		if len(hdr_exts) == 1:
-			if hdr_exts[0] == '':
-				hdr_exts = []
-
-		# Pack the fields into a tuple.
-		leaf_spec = ( src_exts, hdr_exts )
-
-		# Append the tuple to our list.
-		leaf_list.append( leaf_spec )
-
-	# Return the list.
-	return leaf_list
-
-# ------------------------------------------------------------------------------
-
-# Begin by executing main().
-main()
--- a/attic/windows/build/ignore_list
+++ b/attic/windows/build/ignore_list
@@ -1,7 +0,0 @@
-attic
-broken
-old
-other
-temp
-tmp
-test
--- a/attic/windows/build/ignore_list.windows
+++ b/attic/windows/build/ignore_list.windows
@@ -1 +0,0 @@
-.git
--- a/attic/windows/build/leaf_list
+++ b/attic/windows/build/leaf_list
@@ -1 +0,0 @@
-c:h
--- a/attic/windows/build/nmake-help.cmd
+++ b/attic/windows/build/nmake-help.cmd
@@ -1,72 +0,0 @@
-::
-::
-::  BLIS    
-::  An object-based framework for developing high-performance BLAS-like
-::  libraries.
-::
-::  Copyright (C) 2014, The University of Texas at Austin
-::
-::  Redistribution and use in source and binary forms, with or without
-::  modification, are permitted provided that the following conditions are
-::  met:
-::   - Redistributions of source code must retain the above copyright
-::     notice, this list of conditions and the following disclaimer.
-::   - Redistributions in binary form must reproduce the above copyright
-::     notice, this list of conditions and the following disclaimer in the
-::     documentation and/or other materials provided with the distribution.
-::   - Neither the name(s) of the copyright holder(s) nor the names of its
-::     contributors may be used to endorse or promote products derived
-::     from this software without specific prior written permission.
-::
-::  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-::  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-::  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-::  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-::  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-::  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-::  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-::  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-::  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-::  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-::  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-::
-::
-
-@echo off
-
-echo. 
-echo  Makefile
-echo. 
-echo  Field G. Van Zee
-echo.  
-echo  nmake Makefile for building BLIS for Microsoft Windows. nmake targets
-echo  may be invoked after running the configure.cmd script. Valid targets are:
-echo. 
-echo    all          - Invoke the lib and dll targets.
-echo    lib          - Build BLIS as a static library.
-echo    dll          - Build BLIS as a dynamically-linked library.
-echo    help         - Output help and usage information.
-echo    clean        - Invoke clean-log and clean-build targets.
-echo    clean-log    - Remove any log files present.
-echo    clean-config - Remove all products of configure.cmd. Namely, remove the
-echo                   config, include, and src directories.
-echo    clean-build  - Remove all products of the compilation portion of the build
-echo                   process. Namely, remove the obj, lib, and dll directories.
-echo    distclean    - Invoke clean-log, clean-config, and clean-build targets.
-echo.
-echo  The Makefile also recognizes configuration options corresponding to the
-echo  following Makefile variables:
-echo.
-echo    VERBOSE               - When defined, nmake outputs the actual commands
-echo                            executed instead of more concise one-line progress
-echo                            indicators. (Undefined by default.)
-echo.
-echo  Typically, these options are specified by commenting or uncommenting the
-echo  corresponding lines in the Makefile. However, if the Makefile currently does
-echo  not define one of the options, and you wish to enable the corresponding
-echo  feature without editing the Makefile, you may define the variable at the
-echo  command line when nmake is invoked. For example, you may enable verboseness
-echo  while invoking the lib target as follows:
-echo.
-echo    nmake lib VERBOSE=1
-echo.
--- a/attic/windows/configure.cmd
+++ b/attic/windows/configure.cmd
@@ -1,87 +0,0 @@
-::
-::
-::  BLIS    
-::  An object-based framework for developing high-performance BLAS-like
-::  libraries.
-::
-::  Copyright (C) 2014, The University of Texas at Austin
-::
-::  Redistribution and use in source and binary forms, with or without
-::  modification, are permitted provided that the following conditions are
-::  met:
-::   - Redistributions of source code must retain the above copyright
-::     notice, this list of conditions and the following disclaimer.
-::   - Redistributions in binary form must reproduce the above copyright
-::     notice, this list of conditions and the following disclaimer in the
-::     documentation and/or other materials provided with the distribution.
-::   - Neither the name(s) of the copyright holder(s) nor the names of its
-::     contributors may be used to endorse or promote products derived
-::     from this software without specific prior written permission.
-::
-::  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-::  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-::  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-::  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-::  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-::  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-::  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-::  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-::  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-::  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-::  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-::
-::
-
-@echo off
-
-:ENVIRONMENT
-	set GEN_CHECK_REV_FILE=.\build\gen-check-rev-file.py
-	set GATHER_SRC=.\build\gather-src-for-windows.py
-	set GEN_CONFIG_FILE=.\build\gen-config-file.py
-	set CONFIG_DEFS_TEMPL=.\build\config.mk.in
-	set SRC_TREE_DIR=..\frame
-	set TOP_BUILD_DIR=.
-
-:PARAMS
-	if "%1"=="" (goto USAGE)
-	if "%2"=="" (goto USAGE)
-	if "%3"=="" (goto USAGE)
-
-	set ARCH=%1
-	set BUILD=%2
-	set CCOMPILER=%3
-	
-:TASK_UNIT
-	echo %0: Checking/updating revision file.
-	%GEN_CHECK_REV_FILE% -v
-	echo %0: Gathering source files into local flat directories.
-	%GATHER_SRC% %SRC_TREE_DIR% %TOP_BUILD_DIR%
-	echo %0: Creating configure definitions file.
-	%GEN_CONFIG_FILE% %TOP_BUILD_DIR% %ARCH% %BUILD% %CCOMPILER% %CONFIG_DEFS_TEMPL%
-	echo %0: Configuration and setup complete. You may now run nmake. 
-
-	goto END
-
-:USAGE
-	echo. 
-	echo  configure.cmd
-	echo. 
-	echo  A wrapper script for various configuration and setup scripts that need
-	echo. to be run before nmake when building BLIS for Microsoft Windows.
-	echo. 
-	echo  USAGE:
-	echo     %0 [arch] [build] [cc]
-	echo.
-	echo        arch     -- The architecture string to build.
-	echo                    Supported values: {x86,x64}
-	echo        build    -- The kind of build.
-	echo                    Supported values: {debug,release}
-	echo        cc       -- The C compiler to use.
-	echo                    Supported values: {icl,cl}
-	echo. 
-	echo  examples:
-	echo     %0 x86 debug icl
-	echo     %0 x64 release cl
-	echo.
-
-:END
--- a/attic/windows/gendll.cmd
+++ b/attic/windows/gendll.cmd
@@ -1,128 +0,0 @@
-@echo off
-@setlocal enabledelayedexpansion
-
-rem --------------------------------------------------------------------
-rem Build a dll out of a set of object files specified by the 
-rem argument /objlist.
-rem
-rem The .lib file thus created is an "import" library, which one links
-rem with, but the bulk of the code ends up in the associated .dll file.
-rem ---------------------------------------------------------------------
-
-set THIS_SCRIPT=%~dp0%~nx0
-
-if "%1"=="" goto USAGE
-if "%2"=="" goto USAGE
-if "%3"=="" goto USAGE
-if "%4"=="" goto USAGE
-if "%5"=="" goto USAGE
-
-set gd_lib_name=%1
-set gd_link=%gd_lib_name%-static.link
-set LINKER=%3
-set LINKARGSFILE=%4
-set gd_def=%5
-
-:PARSE_ARGS
-set IMPORT=
-set OBJLIST=
-:ARGLOOP
-if "%6"=="" goto ENDARGLOOP
-if /i not "%6"=="/import" goto OBJARG
-set IMPORT=!IMPORT! %7
-goto SHIFT
-:OBJARG
-if /i not "%6"=="/objlist" goto ENDARGLOOP
-set OBJLIST=%7
-:SHIFT
-shift /4
-shift /4
-goto ARGLOOP
-:ENDARGLOOP
-
-if defined OBJLIST goto COMPILER_SETUP
-echo Error: must supply /objlist <file with list of object names>
-goto USAGE
-
-:COMPILER_SETUP
-set gd_path=%2
-set gd_dll_path=%gd_path%.dll
-set gd_main_c=dll_main__%gd_lib_name%.c
-set gd_main_obj=dll_main__%gd_lib_name%.obj
-
-rem create C file for dll_main
-for /F "tokens=*" %%i in ("#include <windows.h>") do echo %%i >%gd_main_c%
-echo. >>%gd_main_c%
-echo BOOLEAN WINAPI DllMain( >>%gd_main_c%
-echo 	HINSTANCE hDllHandle, >>%gd_main_c%
-echo 	DWORD     nReason,    >>%gd_main_c%
-echo 	LPVOID    Reserved){  >>%gd_main_c%
-echo.                        >>%gd_main_c%
-echo BOOLEAN bSuccess = TRUE;>>%gd_main_c%
-echo.                        >>%gd_main_c%
-echo	switch (nReason){     >>%gd_main_c%
-echo		case DLL_PROCESS_ATTACH: >>%gd_main_c%
-echo			DisableThreadLibraryCalls( hDllHandle ); >>%gd_main_c%
-echo		break; >>%gd_main_c%
-echo		case DLL_PROCESS_DETACH: >>%gd_main_c%
-echo		break; >>%gd_main_c%
-echo.            >>%gd_main_c%
-echo	}; >>%gd_main_c%
-echo.   >>%gd_main_c%
-echo	return bSuccess; >>%gd_main_c%
-echo }; >>%gd_main_c%
-echo.>>%gd_main_c%
-
-rem set up link file by specifying dll filepath and main object
-echo /Fe%gd_dll_path% > %gd_link%
-echo %gd_main_obj% >> %gd_link%
-
-rem add contents of linkargs file; most of the link argument action is
-rem in this file
-type %LINKARGSFILE% >> %gd_link%
-
-rem add command-line import libraries, if any
-if defined IMPORT echo !IMPORT! >> %gd_link%
-
-rem add export specification
-echo %gd_def% >> %gd_link%
-
-rem add contents of OBJLIST file
-type %OBJLIST% >> %gd_link%
-
-rem create dll, import lib, and export file
-%LINKER% /nologo /c /O2 /Fo%gd_main_obj% %gd_main_c% >> gendll-cl.log
-%LINKER% @%gd_link%
-
-:CLEANUP
-del /F /Q %gd_link% %gd_main_c% %gd_main_obj% gendll-cl.log
-goto END
-
-
-:USAGE
-echo. 
-echo. gendll.cmd
-echo. 
-echo. Generate a dynamically-linked library from a set of object files
-echo. specified in objlist_file.
-echo. 
-echo. Usage:
-echo.   %0 dllname dllpath linker linkargs_file symbols_file {/import importlib} /objlist objlist_file
-echo.
-echo.     dllname       -- the name of the DLL being created, with no extension.
-echo.     dllpath       -- the path to the DLL being created, with no extension.
-echo.     linker        -- the compiler to use to link the DLL.
-echo.     linkargs_file -- the path to a file containing a list of all linker
-echo.                      arguments--link options, libraries, and library paths--
-echo.                      that that may be needed to successfully link the DLL
-echo.                      being created.
-echo.     symbols_file  -- the path to a file containing a list of symbols to
-echo.                      export in the DLL.
-echo.     importlib     -- the path to a .lib library that you wish to import into
-echo.                      the DLL being created. Optional.
-echo.     objlist_file  -- the path to a file containing the list of object files
-echo.                      that make up the bulk of the DLL being created.
-echo.
-
-:END
-endlocal
--- a/attic/windows/linkargs.txt
+++ b/attic/windows/linkargs.txt
@@ -1,11 +0,0 @@
-/nologo
-/LD /MT
-/LIBPATH:"C:\Program Files\Microsoft SDKs\Windows\v6.0A\Lib"
-/LIBPATH:"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\lib"
-/nodefaultlib:libcmt /nodefaultlib:libc /nodefaultlib:libmmt
-msvcrt.lib
-/LIBPATH:"C:\Program Files (x86)\Intel\Compiler\11.1\048\lib\ia32"
-/LIBPATH:"C:\Program Files (x86)\Intel\Compiler\11.1\048\mkl\ia32\lib"
-mkl_intel_c.lib
-mkl_sequential.lib
-mkl_core.lib
--- a/attic/windows/linkargs64.txt
+++ b/attic/windows/linkargs64.txt
@@ -1,11 +0,0 @@
-/nologo
-/LD /MT 
-/LIBPATH:"C:\Program Files\Microsoft SDKs\Windows\v6.0A\Lib\x64"
-/LIBPATH:"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\lib\amd64"
-/nodefaultlib:libcmt /nodefaultlib:libc /nodefaultlib:libmmt
-msvcrt.lib
-/LIBPATH:"C:\Program Files (x86)\Intel\Compiler\11.1\048\lib\intel64"
-/LIBPATH:"C:\Program Files (x86)\Intel\Compiler\11.1\048\mkl\em64t\lib"
-mkl_intel_lp64.lib
-mkl_sequential.lib
-mkl_core.lib
--- a/attic/windows/revision
+++ b/attic/windows/revision
@@ -1 +0,0 @@
-unknown
--- a/attic/windows/vc110.pdb
+++ b/attic/windows/vc110.pdb
--- a/common.mk
+++ b/common.mk
@@ -212,6 +212,11 @@ get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)"
 files-that-contain      = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),$(f),)))
 files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),,$(f))))

+# Define a function that removes duplicate words from a list.
+# NOTE: This function was obtained via [1]; thanks bobbogo for this
+# concise definition.
+# [1] https://stackoverflow.com/questions/16144115/makefile-remove-duplicate-words-without-sorting
+rm-dupls = $(if $1,$(firstword $1) $(call rm-dupls,$(filter-out $(firstword $1),$1)))


 #
@@ -535,7 +540,7 @@ endif
 ifeq ($(OS_NAME),Darwin)
 # OS X shared library link flags.
 SOFLAGS    := -dynamiclib
-SOFLAGS    += -Wl,-install_name,$(LIBBLIS_SONAME)
+SOFLAGS    += -Wl,-install_name,$(libdir)/$(LIBBLIS_SONAME)
 else
 SOFLAGS    := -shared
 ifeq ($(IS_WIN),yes)
@@ -833,9 +838,6 @@ endif
 # --- LDFLAGS cleanup ----------------------------------------------------------
 #

-# Remove duplicate flags/options in LDFLAGS (such as -lpthread) by sorting.
-LDFLAGS := $(sort $(LDFLAGS))
-


 #
@@ -1080,4 +1082,3 @@ BUILD_CPPFLAGS := -DBLIS_IS_BUILDING_LIBRARY
 # end of ifndef COMMON_MK_INCLUDED conditional block
 endif

-
--- a/config/haswell/bli_cntx_init_haswell.c
+++ b/config/haswell/bli_cntx_init_haswell.c
@@ -90,9 +90,11 @@ void bli_cntx_init_haswell( cntx_t* cntx )
 	bli_cntx_set_l1v_kers
 	(
 	  10,
+#if 1
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
+#endif
 	  // axpyv
 #if 0
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
--- a/config/knl/bli_cntx_init_knl.c
+++ b/config/knl/bli_cntx_init_knl.c
@@ -79,9 +79,11 @@ void bli_cntx_init_knl( cntx_t* cntx )
 	bli_cntx_set_l1v_kers
 	(
 	  10,
+#if 1
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
+#endif
 	  // axpyv
 #if 0
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
--- a/config/old/haswellbb/bli_cntx_init_haswell.c
+++ b/config/old/haswellbb/bli_cntx_init_haswell.c
@@ -150,9 +150,11 @@ void bli_cntx_init_haswell( cntx_t* cntx )
 	bli_cntx_set_l1v_kers
 	(
 	  10,
+#if 1
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
+#endif
 	  // axpyv
 #if 0
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
--- a/config/power9/bli_cntx_init_power9.c
+++ b/config/power9/bli_cntx_init_power9.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2019, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -35,11 +35,33 @@
 #include "blis.h"

 // Instantiate prototypes for packm kernels.
+PACKM_KER_PROT(    float,  s, packm_6xk_bb4_power9_ref )
 PACKM_KER_PROT(    double, d, packm_6xk_bb2_power9_ref )

 // Instantiate prototypes for level-3 kernels.
-//GEMM_UKR_PROT(     double, d, gemmbb_power9_ref )
+GEMM_UKR_PROT(     float,  s, gemmbb_power9_ref )
+GEMMTRSM_UKR_PROT( float,  s, gemmtrsmbb_l_power9_ref )
+GEMMTRSM_UKR_PROT( float,  s, gemmtrsmbb_u_power9_ref )
+TRSM_UKR_PROT(     float,  s, trsmbb_l_power9_ref )
+TRSM_UKR_PROT(     float,  s, trsmbb_u_power9_ref )

+GEMM_UKR_PROT(     double, d, gemmbb_power9_ref )
+GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power9_ref )
+GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power9_ref )
+TRSM_UKR_PROT(     double, d, trsmbb_l_power9_ref )
+TRSM_UKR_PROT(     double, d, trsmbb_u_power9_ref )
+
+GEMM_UKR_PROT(     scomplex, c, gemmbb_power9_ref )
+GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power9_ref )
+GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power9_ref )
+TRSM_UKR_PROT(     scomplex, c, trsmbb_l_power9_ref )
+TRSM_UKR_PROT(     scomplex, c, trsmbb_u_power9_ref )
+
+GEMM_UKR_PROT(     dcomplex, z, gemmbb_power9_ref )
+GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power9_ref )
+GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power9_ref )
+TRSM_UKR_PROT(     dcomplex, z, trsmbb_l_power9_ref )
+TRSM_UKR_PROT(     dcomplex, z, trsmbb_u_power9_ref )

 void bli_cntx_init_power9( cntx_t* cntx )
 {
@@ -47,25 +69,56 @@ void bli_cntx_init_power9( cntx_t* cntx )

 	// Set default kernel blocksizes and functions.
 	bli_cntx_init_power9_ref( cntx );
-	
+
+	// -------------------------------------------------------------------------
+
 	// Update the context with optimized native gemm micro-kernels and
 	// their storage preferences.
 	bli_cntx_set_l3_nat_ukrs
 	(
-	  1,
-	  //BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemmbb_power9_ref,        FALSE,
-	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_power9_asm_12x6,        FALSE,
+	  12,
+	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemmbb_power9_ref,        FALSE,
+	  BLIS_TRSM_L_UKR,     BLIS_FLOAT,    bli_strsmbb_l_power9_ref,      FALSE,
+	  BLIS_TRSM_U_UKR,     BLIS_FLOAT,    bli_strsmbb_u_power9_ref,      FALSE,
+
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_power9_asm_12x6,     FALSE,
+	  
+	  BLIS_TRSM_L_UKR,     BLIS_DOUBLE,   bli_dtrsmbb_l_power9_ref,      FALSE,
+	  BLIS_TRSM_U_UKR,     BLIS_DOUBLE,   bli_dtrsmbb_u_power9_ref,      FALSE,
+	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemmbb_power9_ref,        FALSE,
+	  BLIS_TRSM_L_UKR,     BLIS_SCOMPLEX, bli_ctrsmbb_l_power9_ref,      FALSE,
+	  BLIS_TRSM_U_UKR,     BLIS_SCOMPLEX, bli_ctrsmbb_u_power9_ref,      FALSE,
+	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemmbb_power9_ref,        FALSE,
+	  BLIS_TRSM_L_UKR,     BLIS_DCOMPLEX, bli_ztrsmbb_l_power9_ref,      FALSE,
+	  BLIS_TRSM_U_UKR,     BLIS_DCOMPLEX, bli_ztrsmbb_u_power9_ref,      FALSE,
+	  cntx
+	);
+
+	// Update the context with customized virtual [gemm]trsm micro-kernels.
+	bli_cntx_set_l3_vir_ukrs
+	(
+	  8,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsmbb_l_power9_ref,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsmbb_u_power9_ref,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsmbb_l_power9_ref,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsmbb_u_power9_ref,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power9_ref,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power9_ref,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power9_ref,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power9_ref,
 	  cntx
 	);

 	// Update the context with optimized packm kernels.
 	bli_cntx_set_packm_kers
 	(
-	  1,
+	  2,
+	  BLIS_PACKM_6XK_KER,  BLIS_FLOAT,    bli_spackm_6xk_bb4_power9_ref,
 	  BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_6xk_bb2_power9_ref,
 	  cntx
 	);

+
 	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1,    12,    -1,    -1 );
 	bli_blksz_init     ( &blkszs[ BLIS_NR ],    -1,     6,    -1,    -1,
 	                                            -1,    12,    -1,    -1 );
@@ -73,6 +126,9 @@ void bli_cntx_init_power9( cntx_t* cntx )
 	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,  1408,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,  8190,    -1,    -1 );

+
+	// Update the context with the current architecture's register and cache
+	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
 	  BLIS_NAT, 5,
@@ -84,5 +140,5 @@ void bli_cntx_init_power9( cntx_t* cntx )
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
 	  cntx
 	);
-}

+}
--- a/config/power9/bli_family_power9.h
+++ b/config/power9/bli_family_power9.h
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2019, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -38,4 +38,9 @@
 #define BLIS_POOL_ADDR_OFFSET_SIZE_A 192
 #define BLIS_POOL_ADDR_OFFSET_SIZE_B 152

-
+// Disable right-side hemm, symm, and trmm[3] to accommodate the broadcasting of
+// elements within the packed matrix B.
+#define BLIS_DISABLE_HEMM_RIGHT
+#define BLIS_DISABLE_SYMM_RIGHT
+#define BLIS_DISABLE_TRMM_RIGHT
+#define BLIS_DISABLE_TRMM3_RIGHT
--- a/config/power9/make_defs.mk
+++ b/config/power9/make_defs.mk
@@ -5,7 +5,7 @@
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
-#  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2019, The University of Texas at Austin
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
--- a/config/skx/bli_cntx_init_skx.c
+++ b/config/skx/bli_cntx_init_skx.c
@@ -71,9 +71,11 @@ void bli_cntx_init_skx( cntx_t* cntx )
 	bli_cntx_set_l1v_kers
 	(
 	  10,
+#if 1
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
+#endif
 	  // axpyv
 #if 0
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
--- a/config/zen/bli_cntx_init_zen.c
+++ b/config/zen/bli_cntx_init_zen.c
@@ -83,9 +83,11 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	bli_cntx_set_l1v_kers
 	(
 	  16,
+#if 1
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
+#endif
 	  // axpyv
 #if 0
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
--- a/config/zen2/bli_cntx_init_zen2.c
+++ b/config/zen2/bli_cntx_init_zen2.c
@@ -89,9 +89,12 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	bli_cntx_set_l1v_kers
 	(
 	  16,
+#if 1
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
+#endif
+	  // axpyv

 	  // axpyv
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
--- a/2
+++ b/2
@@ -4,7 +4,7 @@
 # Please refer to the BLIS wiki on configurations for information on the
 # syntax and semantics of this file [1].
 #
-# [1] https://github.com/flame/blis/wiki/ConfigurationHowTo
+# [1] https://github.com/flame/blis/blob/master/docs/ConfigurationHowTo.md
 #

 # Processor families.
--- a/43
+++ b/43
@@ -1323,14 +1323,18 @@ get_compiler_version()
 	# isolate the version number.
 	# The last part ({ read first rest ; echo $first ; }) is a workaround
 	# to OS X's egrep only returning the first match.
-	cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM' | { read first rest ; echo $first ; })
+	cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM|oneAPI' | { read first rest ; echo $first ; })
 	if [ "${cc_vendor}" = "icc" -o \
 	     "${cc_vendor}" = "gcc" ]; then
 		cc_version=$(${cc} -dumpversion)
 	#if compiler is AOCC, first grep for clang and then the version number.
 	elif [ "${cc_vendor}" = "clang" ]; then
            cc_version=$(echo "${vendor_string}" | egrep -o 'clang version [0-9]+\.[0-9]+\.?[0-9]*' | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')
-    else
+	elif [ "${cc_vendor}" = "oneAPI" ]; then
+		# Treat Intel oneAPI's clang as clang, not icc.
+		cc_vendor="clang"
+		cc_version=$(echo "${vendor_string}" | egrep -o '[0-9]+\.[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; })
+	else
 		cc_version=$(echo "${vendor_string}" | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; })
 	fi

@@ -1449,6 +1453,15 @@ check_compiler()
 				blacklistcc_add "skx"
 			fi
 		fi
+		if [ ${cc_major} -eq 18 ]; then
+			echo "${script_name}: ${cc} ${cc_version} is known to cause erroneous results. See https://github.com/flame/blis/issues/371 for details."
+			blacklistcc_add "knl"
+			blacklistcc_add "skx"
+		fi
+		if [ ${cc_major} -ge 19 ]; then
+			echo "${script_name}: ${cc} ${cc_version} is known to cause erroneous results. See https://github.com/flame/blis/issues/371 for details."
+			echoerr_unsupportedcc
+		fi
 	fi

 	# clang
@@ -2421,8 +2434,22 @@ main()
 		# Call the auto_detect() function and save the returned string in
 		# config_name.
 		config_name=$(auto_detect)
+		#config_name="generic"

 		echo "${script_name}: hardware detection driver returned '${config_name}'."
+
+		# If the auto-detect code returned the "generic" string, it means we
+		# were unable to automatically detect the user's hardware type. While
+		# this is going to be a rare event, it will likely lead the user to
+		# experience much lower performance than expected, and thus we will
+		# warn them about it at the end of the configure output (to increase
+		# the chances that they see it).
+		if [ "${config_name}" = "generic" ]; then
+
+			warn_user_generic=1
+		else
+			warn_user_generic=0
+		fi
 	else

 		# Use the command line argument as the configuration name.
@@ -3476,6 +3503,18 @@ main()
 		echo "${script_name}: configured to build within top-level directory of source distribution."
 	fi

+	if [ "${warn_user_generic}" = "1" ]; then
+
+		echo "${script_name}: "
+		echo "${script_name}: *** Unable to automatically detect hardware type! ***"
+		echo "${script_name}: "
+		echo "${script_name}: NOTE: configure was unable to identify a subconfiguration"
+		echo "${script_name}: optimized for your hardware. As a result, the 'generic'"
+		echo "${script_name}: subconfiguration (with low-performance reference kernels)"
+		echo "${script_name}: will be used. For support, please open an issue on GitHub"
+		echo "${script_name}: at https://github.com/flame/blis/issues."
+		echo "${script_name}: "
+	fi

 	# Exit peacefully.
 	return 0
--- a/docs/BLISTypedAPI.md
+++ b/docs/BLISTypedAPI.md
@@ -1051,6 +1051,7 @@ void bli_?her2
     (
       uplo_t  uploa,
       conj_t  conjx,
+       conj_t  conjy,
       dim_t   m,
       ctype*  alpha,
       ctype*  x, inc_t incx,
@@ -1115,6 +1116,7 @@ void bli_?syr2
     (
       uplo_t  uploa,
       conj_t  conjx,
+       conj_t  conjy,
       dim_t   m,
       ctype*  alpha,
       ctype*  x, inc_t incx,
--- a/docs/BuildSystem.md
+++ b/docs/BuildSystem.md
@@ -27,6 +27,7 @@ The BLIS build system was designed for use with GNU/Linux (or some other sane UN
  * GNU `bash` (3.2 or later)
  * GNU `make` (3.81 or later)
  * a working C99 compiler
+  * Perl (any version)

 BLIS also requires a POSIX threads library at link-time (`-lpthread` or `libpthread.so`). This requirement holds even when configuring BLIS with multithreading disabled (the default) or with multithreading via OpenMP (`--enable-multithreading=openmp`). (Note: BLIS implements basic pthreads functionality automatically for Windows builds via [AppVeyor](https://ci.appveyor.com/project/shpc/blis/).)

--- a/docs/HardwareSupport.md
+++ b/docs/HardwareSupport.md
@@ -15,6 +15,7 @@ A few remarks / reminders:
  * Induced complex (1m) implementations are employed in all situations where the real domain [gemm microkernel](KernelsHowTo.md#gemm-microkernel) of the corresponding precision is available, but the "native" complex domain gemm microkernel is unavailable. Note that the table below lists native kernels, so if a microarchitecture lists only `sd`, support for both `c` and `z` datatypes will be provided via the 1m method. (Note: most people cannot tell the difference between native and 1m-based performance.) Please see our [ACM TOMS article on the 1m method](https://github.com/flame/blis#citations) for more info on this topic.
  * Some microarchitectures use the same sub-configuration. *This is not a typo.* For example, Haswell and Broadwell systems as well as "desktop" (non-server) versions of Skylake, Kaby Lake, and Coffee Lake all use the `haswell` sub-configuration and the kernels registered therein. Microkernels can be recycled in this manner because the key detail that determines level-3 performance outcomes is actually the vector ISA, not the microarchitecture. In the previous example, all of the microarchitectures listed support AVX2 (but not AVX-512), and therefore they can reuse the same microkernels.
  * Remember that you (usually) don't have to choose your sub-configuration manually! Instead, you can always request configure-time hardware detection via `./configure auto`. This will defer to internal logic (based on CPUID for x86_64 systems) that will attempt to choose the appropriate sub-configuration automatically.
+  * There is a difficulty in automatically choosing the ideal sub-configuration for use on Skylake-X systems, which may have one or two FMA units. The `skx` sub-configuration is only beneficial when used on hardware with two FMA units. Otherwise the hardware is treated as a "desktop" Skylake system, which uses the `haswell` sub-configuration. Furthermore, the number of units can't be queried directly; instead, we rely on a manually-maintained list of CPU models (via logic in `frame/base/bli_cpuid.c`), which may be incorrect for new processors, particularly Gold models. In that case, you can either fix the code (and please raise an issue!) or manually target the `skx` sub-configuration at configure-time (i.e., `./configure [options] skx`). If your performance seems low, you can set `export BLIS_ARCH_DEBUG=1`, which will cause BLIS to output some basic debugging info to `stderr` that will reveal whether your system was detected as having one or two VPUs (FMA units).

 | Vendor/Microarchitecture             | BLIS sub-configuration | `gemm` | `gemmtrsm` |
 |:-------------------------------------|:-----------------------|:-------|:-----------|
@@ -28,7 +29,8 @@ A few remarks / reminders:
 | Intel Haswell, Broadwell (AVX/FMA3)  | `haswell`              | `sdcz` |  `sd`      |
 | Intel Sky/Kaby/CoffeeLake (AVX/FMA3) | `haswell`              | `sdcz` |  `sd`      |
 | Intel Knights Landing (AVX-512/FMA3) | `knl`                  | `sd`   |            |
-| Intel SkylakeX (AVX-512/FMA3)        | `skx`                  | `sd`   |            |
+| Intel SkylakeX (AVX-512/2×FMA3)      | `skx`                  | `sd`   |            |
+| Intel SkylakeX (AVX-512/1×FMA3)      | `haswell`              | `sdcz` |  `sd`      |
 | ARMv7 Cortex-A9 (NEON)               | `cortex-a9`            | `sd`   |            |
 | ARMv7 Cortex-A15 (NEON)              | `cortex-a15`           | `sd`   |            |
 | ARMv8 Cortex-A53 (NEON)              | `cortex-a53`           | `sd`   |            |
--- a/docs/KernelsHowTo.md
+++ b/docs/KernelsHowTo.md
@@ -278,7 +278,7 @@ void bli_?gemm_ukernel
 The `gemm` microkernel, sometimes simply referred to as "the BLIS microkernel" or "the microkernel", performs the following operation:

 ```
-  C11 := beta * C11 + A1 * B1
+  C11 := beta * C11 + alpha * A1 * B1
 ```

 where `A1` is an _MR x k_ "micropanel" matrix stored in packed (column-wise) format, `B1` is a _k x NR_ "micropanel" matrix stored in packed (row-wise) format, `C11` is an _MR x NR_ general matrix stored according to its row and column strides `rsc` and `csc`, and `alpha` and beta are scalars.
--- a/docs/PerformanceSmall.md
+++ b/docs/PerformanceSmall.md
@@ -35,14 +35,18 @@ sizes tested.
 Each of the 28 graphs within a panel will contain an x-axis that reports
 problem size, with one, two, or all three matrix dimensions equal to the
 problem size (e.g. _m_ = 6; _n_ = _k_, also encoded as `m6npkp`).
-The y-axis will report in units GFLOPS (billions of floating-point operations
-per second) on a single core.
+The y-axis will report in units GFLOPS (or billions of floating-point operations
+per second) per core.

-It's also worth pointing out that the top of each graph (e.g. the maximum
-y-axis value depicted) _always_ corresponds to the theoretical peak performance
-under the conditions associated with that graph.
-Theoretical peak performance, in units of GFLOPS, is calculated as the
-product of:
+It's also worth pointing out that the top of some graphs (i.e. the maximum
+y-axis value depicted) corresponds to the theoretical peak performance
+under the conditions associated with that graph, while in other graphs the
+y-axis has been adjusted to better show the difference between the various
+curves. (We *strongly* prefer to always use peak performance as the top of
+the graph; however, this is one of the few exceptions where we feel some
+scaling is warranted.)
+Theoretical peak performance on a single core, in units of GFLOPS, is
+calculated as the product of:
 1. the maximum sustainable clock rate in GHz; and
 2. the maximum number of floating-point operations (flops) that can be
 executed per cycle.
@@ -60,30 +64,32 @@ can be issued per cycle (per core);
 register (for the datatype in question); and
 3. 2.0, since an FMA instruction fuses two operations (a multiply and an add).

-The problem size range, represented on the x-axis, is sampled in
-increments of 4 up to 800 for the cases where one or two dimensions is small
-(and constant)
-and up to 400 in the case where all dimensions (e.g. _m_, _n_, and _k_) are
-bound to the problem size (i.e., square matrices).
+Typically, organizations and individuals publish performance with square
+matrices, which can miss the problem sizes of interest to many applications.
+Here, in addition to square matrices (shown in the seventh column), we also
+show six other scenarios where one or two `gemm` dimensions (of _m_, _n_, and
+_k_) are small. In these six columns, the constant small matrix dimensions were
+chosen to be _very_ small--in the neighborhood of 8--intentionally to showcase
+what happens when at least one of the matrices is abnormally "skinny."

-Note that the constant small matrix dimensions were chosen to be _very_
-small--in the neighborhood of 8--intentionally to showcase what happens when
-at least one of the matrices is abnormally "skinny." Typically, organizations
-and individuals only publish performance with square matrices, which can miss
-the problem sizes of interest to many applications. Here, in addition to square
-matrices (shown in the seventh column), we also show six other scenarios where
-one or two `gemm` dimensions (of _m,_ _n_, and _k_) is small.
+The problem size range, represented on the x-axis, is sampled in
+increments that vary. These increments (and the overall range) are generally
+large for the cases where two dimensions are small (and constant), medium for
+cases where one dimension is small (and constant), and small for cases where
+all dimensions (e.g. _m_, _n_, and _k_) are variable and bound to the problem
+size (i.e., square matrices).

 The legend in each graph contains two entries for BLIS, corresponding to the
 two black lines, one solid and one dotted. The dotted line, **"BLIS conv"**,
 represents the conventional implementation that targets large matrices. This
 was the only implementation available in BLIS prior to the addition to the
 small/skinny matrix support. The solid line, **"BLIS sup"**, makes use of the
-new small/skinny matrix implementation for certain small problems. Whenever
-these results differ by any significant amount (beyond noise), it denotes a
-problem size for which BLIS employed the new small/skinny implementation.
-Put another way, **the delta between these two lines represents the performance
-improvement between BLIS's previous status quo and the new regime.**
+new small/skinny matrix implementation. Sometimes, the performance of
+**"BLIS sup"** drops below that of **"BLIS conv"** for somewhat larger problems.
+However, in practice, we use a threshold to determine when to switch from the
+former to the latter, and therefore the goal is for the performance of
+**"BLIS conv"** to serve as an approximate floor below which BLIS performance
+never drops.

 Finally, each point along each curve represents the best of three trials.

@@ -119,7 +125,8 @@ and/or install some (or all) of the implementations shown (e.g.
 [BLASFEO](https://github.com/giaf/blasfeo), and
 [libxsmm](https://github.com/hfp/libxsmm)), including BLIS. Be sure to consult
 the detailed notes provided below; they should be *very* helpful in successfully
-building the libraries. The `runme.sh` script in `test/sup` will help you run
+building the libraries. The `runme.sh` script in `test/sup` (or `test/supmt`)
+will help you run
 some (or all) of the test drivers produced by the `Makefile`, and the
 Matlab/Octave function `plot_panel_trxsh()` defined in the `octave` directory
 will help you turn the output of those test drivers into a PDF file of graphs.
@@ -140,20 +147,25 @@ The `runthese.m` file will contain example invocations of the function.
 * Max FMA vector IPC: 2
 * Peak performance:
  * single-core: 57.6 GFLOPS (double-precision), 115.2 GFLOPS (single-precision)
+  * multicore: 57.6 GFLOPS/core (double-precision), 115.2 GFLOPS/core (single-precision)
 * Operating system: Gentoo Linux (Linux kernel 5.2.4)
 * Page size: 4096 bytes
 * Compiler: gcc 8.3.0
-* Results gathered: 23-28 August 2019
+* Results gathered: 3 March 2020
 * Implementations tested:
-  * BLIS 4a0a6e8 (0.6.0-28)
-    * configured with `./configure --enable-cblas auto`
+  * BLIS 90db88e (0.6.1-8)
+    * configured with `./configure --enable-cblas auto` (single-threaded)
+    * configured with `./configure --enable-cblas -t openmp auto` (multithreaded)
    * sub-configuration exercised: `haswell`
-  * OpenBLAS 0.3.7
-    * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
-  * BLASFEO 01f6b7f
+    * Multithreaded (4 cores) execution requested via `export BLIS_NUM_THREADS=4`
+  * OpenBLAS 0.3.8
+    * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0 USE_LOCKING=1` (single-threaded)
+    * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=4` (multithreaded)
+    * Multithreaded (4 cores) execution requested via `export OPENBLAS_NUM_THREADS=4`
+  * BLASFEO f9b78c6
    * configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`.
  * Eigen 3.3.90
-    * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (28 August 2019)
+    * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (36b9596)
    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal):
         ```
         # These lines added after line 67.
@@ -165,18 +177,20 @@ The `runthese.m` file will contain example invocations of the function.
    * configured and built BLAS library via `mkdir build; cd build; CC=gcc cmake ..; make blas`
    * installed headers via `cmake . -DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install`
    * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
-    * Requested threading via `export OMP_NUM_THREADS=1` (single-threaded)
-  * MKL 2019 update 4
-    * Requested threading via `export MKL_NUM_THREADS=1` (single-threaded)
-  * libxsmm 77a295c (1.6.5-6679)
+    * Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
+    * Multithreaded (4 cores) execution requested via `export OMP_NUM_THREADS=4`
+  * MKL 2020 initial release
+    * Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1`
+    * Multithreaded (4 cores) execution requested via `export MKL_NUM_THREADS=4`
+  * libxsmm a40a833 (post-1.14)
    * compiled with `make AVX=2`; linked with [netlib BLAS](http://www.netlib.org/blas/) 3.6.0 as the fallback library to better show where libxsmm stops handling the computation internally.
 * Affinity:
-  * N/A.
+  * Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0-3"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
 * Frequency throttling (via `cpupower`):
  * Driver: intel_pstate
  * Governor: performance
  * Hardware limits: 800MHz - 3.8GHz
-  * Adjusted minimum: 3.7GHz
+  * Adjusted minimum: 3.8GHz
 * Comments:
  * libxsmm is highly competitive for very small problems, but quickly gives up once the "large" dimension exceeds about 180-240 (or 64 in the case where all operands are square). Also, libxsmm's `gemm` cannot handle a transposition on matrix A and similarly dispatches the fallback implementation for those cases. libxsmm also does not export CBLAS interfaces, and therefore only appears on the graphs for column-stored matrices.

@@ -184,15 +198,21 @@ The `runthese.m` file will contain example invocations of the function.

 #### pdf

-* [Kaby Lake row-stored](graphs/sup/dgemm_rrr_kbl_nt1.pdf)
-* [Kaby Lake column-stored](graphs/sup/dgemm_ccc_kbl_nt1.pdf)
+* [Kaby Lake single-threaded row-stored](graphs/sup/dgemm_rrr_kbl_nt1.pdf)
+* [Kaby Lake single-threaded column-stored](graphs/sup/dgemm_ccc_kbl_nt1.pdf)
+* [Kaby Lake multithreaded (4 cores) row-stored](graphs/sup/dgemm_rrr_kbl_nt4.pdf)
+* [Kaby Lake multithreaded (4 cores) column-stored](graphs/sup/dgemm_ccc_kbl_nt4.pdf)

 #### png (inline)

-* **Kaby Lake row-stored**
-![row-stored](graphs/sup/dgemm_rrr_kbl_nt1.png)
-* **Kaby Lake column-stored**
-![column-stored](graphs/sup/dgemm_ccc_kbl_nt1.png)
+* **Kaby Lake single-threaded row-stored**
+![single-threaded row-stored](graphs/sup/dgemm_rrr_kbl_nt1.png)
+* **Kaby Lake single-threaded column-stored**
+![single-threaded column-stored](graphs/sup/dgemm_ccc_kbl_nt1.png)
+* **Kaby Lake multithreaded (4 cores) row-stored**
+![multithreaded row-stored](graphs/sup/dgemm_rrr_kbl_nt4.png)
+* **Kaby Lake multithreaded (4 cores) column-stored**
+![multithreaded column-stored](graphs/sup/dgemm_ccc_kbl_nt4.png)

 ---

@@ -209,20 +229,25 @@ The `runthese.m` file will contain example invocations of the function.
 * Max FMA vector IPC: 2
 * Peak performance:
  * single-core: 56 GFLOPS (double-precision), 112 GFLOPS (single-precision)
+  * multicore: 49.6 GFLOPS/core (double-precision), 99.2 GFLOPS/core (single-precision)
 * Operating system: Cray Linux Environment 6 (Linux kernel 4.4.103)
 * Page size: 4096 bytes
 * Compiler: gcc 7.3.0
-* Results gathered: 23-28 August 2019
+* Results gathered: 3 March 2020
 * Implementations tested:
-  * BLIS 4a0a6e8 (0.6.0-28)
-    * configured with `./configure --enable-cblas auto`
+  * BLIS 90db88e (0.6.1-8)
+    * configured with `./configure --enable-cblas auto` (single-threaded)
+    * configured with `./configure --enable-cblas -t openmp auto` (multithreaded)
    * sub-configuration exercised: `haswell`
-  * OpenBLAS 0.3.7
-    * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
-  * BLASFEO 01f6b7f
+    * Multithreaded (12 cores) execution requested via `export BLIS_NUM_THREADS=12`
+  * OpenBLAS 0.3.8
+    * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0 USE_LOCKING=1` (single-threaded)
+    * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=12` (multithreaded)
+    * Multithreaded (12 cores) execution requested via `export OPENBLAS_NUM_THREADS=12`
+  * BLASFEO f9b78c6
    * configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`.
  * Eigen 3.3.90
-    * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (28 August 2019)
+    * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (36b9596)
    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal):
         ```
         # These lines added after line 67.
@@ -234,13 +259,15 @@ The `runthese.m` file will contain example invocations of the function.
    * configured and built BLAS library via `mkdir build; cd build; CC=gcc cmake ..; make blas`
    * installed headers via `cmake . -DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install`
    * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
-    * Requested threading via `export OMP_NUM_THREADS=1` (single-threaded)
-  * MKL 2019 update 4
-    * Requested threading via `export MKL_NUM_THREADS=1` (single-threaded)
-  * libxsmm 77a295c (1.6.5-6679)
+    * Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
+    * Multithreaded (12 cores) execution requested via `export OMP_NUM_THREADS=12`
+  * MKL 2020 initial release
+    * Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1`
+    * Multithreaded (12 cores) execution requested via `export MKL_NUM_THREADS=12`
+  * libxsmm a40a833 (post-1.14)
    * compiled with `make AVX=2`; linked with [netlib BLAS](http://www.netlib.org/blas/) 3.6.0 as the fallback library to better show where libxsmm stops handling the computation internally.
 * Affinity:
-  * N/A.
+  * Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0-11"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
 * Frequency throttling (via `cpupower`):
  * No changes made.
 * Comments:
@@ -250,15 +277,21 @@ The `runthese.m` file will contain example invocations of the function.

 #### pdf

-* [Haswell row-stored](graphs/sup/dgemm_rrr_has_nt1.pdf)
-* [Haswell column-stored](graphs/sup/dgemm_ccc_has_nt1.pdf)
+* [Haswell single-threaded row-stored](graphs/sup/dgemm_rrr_has_nt1.pdf)
+* [Haswell single-threaded column-stored](graphs/sup/dgemm_ccc_has_nt1.pdf)
+* [Haswell multithreaded (12 cores) row-stored](graphs/sup/dgemm_rrr_has_nt12.pdf)
+* [Haswell multithreaded (12 cores) column-stored](graphs/sup/dgemm_ccc_has_nt12.pdf)

 #### png (inline)

-* **Haswell row-stored**
-![row-stored](graphs/sup/dgemm_rrr_has_nt1.png)
-* **Haswell column-stored**
-![column-stored](graphs/sup/dgemm_ccc_has_nt1.png)
+* **Haswell single-threaded row-stored**
+![single-threaded row-stored](graphs/sup/dgemm_rrr_has_nt1.png)
+* **Haswell single-threaded column-stored**
+![single-threaded column-stored](graphs/sup/dgemm_ccc_has_nt1.png)
+* **Haswell multithreaded (12 cores) row-stored**
+![multithreaded row-stored](graphs/sup/dgemm_rrr_has_nt12.png)
+* **Haswell multithreaded (12 cores) column-stored**
+![multithreaded column-stored](graphs/sup/dgemm_ccc_has_nt12.png)

 ---

@@ -276,20 +309,26 @@ The `runthese.m` file will contain example invocations of the function.
  * Alternatively, FMA vector IPC is 2 when vectors are limited to 128 bits each.
 * Peak performance:
  * single-core: 24 GFLOPS (double-precision), 48 GFLOPS (single-precision)
+  * multicore: 20.4 GFLOPS/core (double-precision), 40.8 GFLOPS/core (single-precision)
 * Operating system: Ubuntu 18.04 (Linux kernel 4.15.0)
 * Page size: 4096 bytes
 * Compiler: gcc 7.4.0
-* Results gathered: 23-28 August 2019
+* Results gathered: 3 March 2020
 * Implementations tested:
-  * BLIS 4a0a6e8 (0.6.0-28)
-    * configured with `./configure --enable-cblas auto`
-    * sub-configuration exercised: `zen`
-  * OpenBLAS 0.3.7
-    * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
-  * BLASFEO 01f6b7f
+  * BLIS 90db88e (0.6.1-8)
+    * configured with `./configure --enable-cblas auto` (single-threaded)
+    * configured with `./configure --enable-cblas -t openmp auto` (multithreaded)
+    * sub-configuration exercised: `haswell`
+    * Multithreaded (32 cores) execution requested via `export BLIS_NUM_THREADS=32`
+  * OpenBLAS 0.3.8
+    * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0 USE_LOCKING=1` (single-threaded)
+    * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=32` (multithreaded)
+    * Multithreaded (32 cores) execution requested via `export OPENBLAS_NUM_THREADS=32`
+  * BLASFEO f9b78c6
    * configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`.
+    * built BLAS library via `make CC=gcc`
  * Eigen 3.3.90
-    * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (28 August 2019)
+    * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (36b9596)
    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal):
         ```
         # These lines added after line 67.
@@ -301,13 +340,15 @@ The `runthese.m` file will contain example invocations of the function.
    * configured and built BLAS library via `mkdir build; cd build; CC=gcc cmake ..; make blas`
    * installed headers via `cmake . -DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install`
    * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
-    * Requested threading via `export OMP_NUM_THREADS=1` (single-threaded)
-  * MKL 2019 update 4
-    * Requested threading via `export MKL_NUM_THREADS=1` (single-threaded)
-  * libxsmm 77a295c (1.6.5-6679)
+    * Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
+    * Multithreaded (32 cores) execution requested via `export OMP_NUM_THREADS=32`
+  * MKL 2020 initial release
+    * Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1`
+    * Multithreaded (32 cores) execution requested via `export MKL_NUM_THREADS=32`
+  * libxsmm a40a833 (post-1.14)
    * compiled with `make AVX=2`; linked with [netlib BLAS](http://www.netlib.org/blas/) 3.6.0 as the fallback library to better show where libxsmm stops handling the computation internally.
 * Affinity:
-  * N/A.
+  * Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0-31"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset.
 * Frequency throttling (via `cpupower`):
  * Driver: acpi-cpufreq
  * Governor: performance
@@ -320,15 +361,21 @@ The `runthese.m` file will contain example invocations of the function.

 #### pdf

-* [Epyc row-stored](graphs/sup/dgemm_rrr_epyc_nt1.pdf)
-* [Epyc column-stored](graphs/sup/dgemm_ccc_epyc_nt1.pdf)
+* [Epyc single-threaded row-stored](graphs/sup/dgemm_rrr_epyc_nt1.pdf)
+* [Epyc single-threaded column-stored](graphs/sup/dgemm_ccc_epyc_nt1.pdf)
+* [Epyc multithreaded (32 cores) row-stored](graphs/sup/dgemm_rrr_epyc_nt32.pdf)
+* [Epyc multithreaded (32 cores) column-stored](graphs/sup/dgemm_ccc_epyc_nt32.pdf)

 #### png (inline)

-* **Epyc row-stored**
-![row-stored](graphs/sup/dgemm_rrr_epyc_nt1.png)
-* **Epyc column-stored**
-![column-stored](graphs/sup/dgemm_ccc_epyc_nt1.png)
+* **Epyc single-threaded row-stored**
+![single-threaded row-stored](graphs/sup/dgemm_rrr_epyc_nt1.png)
+* **Epyc single-threaded column-stored**
+![single-threaded column-stored](graphs/sup/dgemm_ccc_epyc_nt1.png)
+* **Epyc multithreaded (32 cores) row-stored**
+![multithreaded row-stored](graphs/sup/dgemm_rrr_epyc_nt32.png)
+* **Epyc multithreaded (32 cores) column-stored**
+![multithreaded column-stored](graphs/sup/dgemm_ccc_epyc_nt32.png)

 ---

--- a/docs/ReleaseNotes.md
+++ b/docs/ReleaseNotes.md
@@ -4,6 +4,8 @@

 ## Contents

+* [Changes in 0.7.0](ReleaseNotes.md#changes-in-070)
+* [Changes in 0.6.1](ReleaseNotes.md#changes-in-061)
 * [Changes in 0.6.0](ReleaseNotes.md#changes-in-060)
 * [Changes in 0.5.2](ReleaseNotes.md#changes-in-052)
 * [Changes in 0.5.1](ReleaseNotes.md#changes-in-051)
@@ -35,6 +37,90 @@
 * [Changes in 0.0.2](ReleaseNotes.md#changes-in-002)
 * [Changes in 0.0.1](ReleaseNotes.md#changes-in-001)

+## Changes in 0.7.0
+April 7, 2020
+
+Improvements present in 0.7.0:
+
+Framework:
+- Implemented support for multithreading within the sup (skinny/small/unpacked) framework, which previously was single-threaded only. Note that this feature works harmoniously with the selective packing introduced into the sup framework in 0.6.1. (AMD)
+- Renamed `bli_thread_obarrier()` and `bli_thread_obroadcast()` functions to drop the 'o', which was left over from when `thrcomm_t` objects tracked both "inner" and "outer" communicators.
+- Fixed an obscure `int`-to-`packbuf_t` type conversion error that only affects certain C++ compilers (including g++) when compiling application code that includes the BLIS header file `blis.h`. (Ajay Panyala)
+- Added a missing early `return` statement in `bli_thread_partition_2x2()`, which provides a slight optimization. (Kiran Varaganti)
+
+Kernels:
+- Fixed the semantics of the `bli_amaxv()` kernels ('s' and 'd') within the `zen` kernel set. Previously, the kernels (incorrectly) returned the index of the last element whose absolute value was largest (in the event there were multiple of equal value); now, they (correctly) return the index of the first of such elements. The kernels also now return the index of the first NaN, if one is encountered. (Mat Cross, Devin Matthews)
+
+Build system:
+- Warn the user at configure-time when hardware auto-detection returns the `generic` subconfiguration since this is probably not what they were expecting. (Devin Matthews)
+- Removed unnecessary sorting (and duplicate removal) on `LDFLAGS` in `common.mk`. (Isuru Fernando)
+- Specify the full path to the location of the dynamic library on OSX so that other dynamic libraries that depend on BLIS know where to find the library. (Satish Balay, Jed Brown)
+
+Testing:
+- Updated and reorganized test drivers in `test/sup` so that they work for either single-threaded or multithreaded purposes. (AMD)
+- Updated/optimized octave scripts in `test/sup` for use with octave 5.2.0.
+- Minor updates/tweaks to `test/1m4m`.
+
+Documentation:
+- Updated existing single-threaded sup performance graphs with new data and added multithreaded sup graphs to `docs/PerformanceSmall.md`.
+- Added mention of Gentoo support under the external packages section of the `README.md`.
+- Tweaks to `docs/Multithreading.md` that clarify that setting any `BLIS_*_NT` variable to 1 will be considered manual specification for the purposes of determining whether to auto-factorize via `BLIS_NUM_THREADS`. (AMD)
+
+## Changes in 0.6.1
+January 14, 2020
+
+Improvements present in 0.6.1:
+
+Framework:
+- Added support for pre-broadcast when packing B. This causes elements of B to be repeated (broadcast) in the packed copy of B so that subsequent vector loads will result in the element already being pre-broadcast into the vector register.
+- Added support for selective packing to `gemmsup` (controlled via environment variables and/or the `rntm_t` object). (AMD)
+- Fixed a bug in `sdsdot_sub()` that redundantly added the "alpha" scalar and a separate bug in the order of typecasting intermediate products in `sdsdot_()`. (Simon Lukas Märtens, Devin Matthews)
+- Fixed an obscure bug in `bli_acquire_mpart_mdim()`/`bli_acquire_mpart_ndim()`. (Minh Quan Ho)
+- Fixed a subtle and complicated bug that only manifested via the BLAS test drivers in the `generic` subconfiguration, and possibly any other subconfiguration that did not register complex-domain `gemm` ukernels, or registered ONLY real-domain ukernels as row-preferential. (Dave Love)
+- Always use `sumsqv` to compute `normfv` instead of the "dot product trick" that was previously employed for performance reasons. (Roman Yurchak, Devin Matthews, and Isuru Fernando)
+- Fixed bug in `thrinfo_t` debugging/printing code.
+
+Kernels:
+- Implemented and registered an optimized `dgemm` microkernel for the `power9` kernel set. (Nicholai Tukanov)
+- Pacify a `restrict` warning in the `gemmtrsm4m1` reference ukernel. (Dave Love, Devin Matthews)
+
+Build system:
+- Fixed parsing in `vpu_count()` on some SkylakeX workstations. (Dave Love)
+- Reimplemented `bli_cpuid_query()` for ARM to use `stdio`-based functions instead of `popen()`. (Dave Love)
+- Use `-march=znver1` for clang on `zen2` subconfig.
+- Updated `-march` flags for `sandybridge`, `haswell` subconfigurations to use newer syntax (e.g. `haswell` instead of `core-avx2` and `sandybridge` instead of `corei7-avx`).
+- Correctly use `-qopenmp-simd` for reference kernels when compiling with icc. (Victor Eijkhout)
+- Added `-march` support for select gcc version ranges where flag syntax changes or new flags are added. The ranges we identify are: versions older than 4.9.0; versions older than 6.1.0 (but newer than 4.9.0); versions older than 9.1.0 (but newer than 6.1.0).
+- Use `-funsafe-math-optimizations` and `-ffp-contract=fast` for all reference kernels when using gcc or clang.
+- Updated MC cache blocksizes used by `haswell` subconfig.
+- Updated NC cache blocksizes used by `zen` subconfig.
+- Fixed a typo in the context registration of the `cortexa53` subconfiguration in `bli_gks.c`. (Francisco Igual)
+- Output a more informative error when the user manually targets a subconfiguration that configure places in the configuration blacklist. (Tze Meng Low)
+- Set execute bits of shared library at install-time. (Adam J. Stewart)
+- Added missing thread-related symbols for export to shared libraries. (Kyungmin Lee)
+- Removed (finally) the `attic/windows` directory since we offer Windows DLL support via AppVeyor's build artifacts, and thus that directory was only likely confusing people.
+
+Testing:
+- Fixed latent testsuite microkernel module bug for `power9` subconfig. (Jeff Hammond)
+- Added `test/1m4m` driver directory for test drivers related to the 1m paper.
+- Added libxsmm support to `test/sup` drivers. (Robert van de Geijn)
+- Updated `.travis.yml` and `do_sde.sh` to automatically accept SDE license and download SDE directly from Intel. (Devin Matthews, Jeff Hammond)
+- Updated standalone test drivers to iterate backwards through the specified problem space. This often helps avoid the situation whereby the CPU doesn't immediately throttle up to its maximum clock frequency, which can produce strange discontinuities (sharply rising "cliffs") in performance graphs.
+- Pacify an unused variable warning in `blastest/f2c/lread.c`. (Jeff Hammond)
+- Various other minor fixes/tweaks to test drivers.
+
+Documentation:
+- Added libxsmm results to `docs/PerformanceSmall.md`.
+- Added BLASFEO results to `docs/PerformanceSmall.md`.
+- Added the page size and location of the performance drivers to `docs/Performance.md` and `docs/PerformanceSmall.md`. (Dave Love)
+- Added notes to `docs/Multithreading.md` regarding the nuances of setting multithreading parameters the manual way vs. the automatic way. (Jérémie du Boisberranger)
+- Added a section on reproduction to `docs/Performance.md` and `docs/PerformanceSmall.md`. (Dave Love)
+- Documented Eigen `-march=native` hack in `docs/Performance.md` and `docs/PerformanceSmall.md`. (Sameer Agarwal)
+- Inserted multithreading links and disclaimers to `BuildSystem.md`. (Jeff Diamond)
+- Fixed typo in description for `bli_?axpy2v()` in `docs/BLISTypedAPI.md`. (Shmuel Levine)
+- Added "How to Download BLIS" section to `README.md`. (Jeff Diamond)
+- Various other minor documentation fixes.
+
 ## Changes in 0.6.0
 June 3, 2019

--- a/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf
+++ b/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf
--- a/docs/graphs/sup/dgemm_ccc_epyc_nt1.png
+++ b/docs/graphs/sup/dgemm_ccc_epyc_nt1.png
--- a/docs/graphs/sup/dgemm_ccc_epyc_nt32.pdf
+++ b/docs/graphs/sup/dgemm_ccc_epyc_nt32.pdf
--- a/docs/graphs/sup/dgemm_ccc_epyc_nt32.png
+++ b/docs/graphs/sup/dgemm_ccc_epyc_nt32.png
--- a/docs/graphs/sup/dgemm_ccc_has_nt1.pdf
+++ b/docs/graphs/sup/dgemm_ccc_has_nt1.pdf
--- a/docs/graphs/sup/dgemm_ccc_has_nt1.png
+++ b/docs/graphs/sup/dgemm_ccc_has_nt1.png
--- a/docs/graphs/sup/dgemm_ccc_has_nt12.pdf
+++ b/docs/graphs/sup/dgemm_ccc_has_nt12.pdf
--- a/docs/graphs/sup/dgemm_ccc_has_nt12.png
+++ b/docs/graphs/sup/dgemm_ccc_has_nt12.png
--- a/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf
+++ b/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf
--- a/docs/graphs/sup/dgemm_ccc_kbl_nt1.png
+++ b/docs/graphs/sup/dgemm_ccc_kbl_nt1.png
--- a/docs/graphs/sup/dgemm_ccc_kbl_nt4.pdf
+++ b/docs/graphs/sup/dgemm_ccc_kbl_nt4.pdf
--- a/docs/graphs/sup/dgemm_ccc_kbl_nt4.png
+++ b/docs/graphs/sup/dgemm_ccc_kbl_nt4.png
--- a/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf
+++ b/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf
--- a/docs/graphs/sup/dgemm_rrr_epyc_nt1.png
+++ b/docs/graphs/sup/dgemm_rrr_epyc_nt1.png
--- a/docs/graphs/sup/dgemm_rrr_epyc_nt32.pdf
+++ b/docs/graphs/sup/dgemm_rrr_epyc_nt32.pdf
--- a/docs/graphs/sup/dgemm_rrr_epyc_nt32.png
+++ b/docs/graphs/sup/dgemm_rrr_epyc_nt32.png
--- a/docs/graphs/sup/dgemm_rrr_has_nt1.pdf
+++ b/docs/graphs/sup/dgemm_rrr_has_nt1.pdf
--- a/docs/graphs/sup/dgemm_rrr_has_nt1.png
+++ b/docs/graphs/sup/dgemm_rrr_has_nt1.png
--- a/docs/graphs/sup/dgemm_rrr_has_nt12.pdf
+++ b/docs/graphs/sup/dgemm_rrr_has_nt12.pdf
--- a/docs/graphs/sup/dgemm_rrr_has_nt12.png
+++ b/docs/graphs/sup/dgemm_rrr_has_nt12.png
--- a/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf
+++ b/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf
--- a/docs/graphs/sup/dgemm_rrr_kbl_nt1.png
+++ b/docs/graphs/sup/dgemm_rrr_kbl_nt1.png
--- a/docs/graphs/sup/dgemm_rrr_kbl_nt4.pdf
+++ b/docs/graphs/sup/dgemm_rrr_kbl_nt4.pdf
--- a/docs/graphs/sup/dgemm_rrr_kbl_nt4.png
+++ b/docs/graphs/sup/dgemm_rrr_kbl_nt4.png
--- a/frame/1m/packm/bli_packm_blk_var1.c
+++ b/frame/1m/packm/bli_packm_blk_var1.c
@@ -667,7 +667,7 @@ if ( col_stored ) { \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_obarrier( thread ); \
+bli_thread_barrier( thread ); \
 	if ( bli_thread_work_id( thread ) == 1 ) \
 	{ \
 	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -678,7 +678,7 @@ bli_thread_obarrier( thread ); \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_obarrier( thread ); \
+bli_thread_barrier( thread ); \
 } \
 else { \
 	if ( bli_thread_work_id( thread ) == 0 ) \
@@ -691,7 +691,7 @@ else { \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_obarrier( thread ); \
+bli_thread_barrier( thread ); \
 	if ( bli_thread_work_id( thread ) == 1 ) \
 	{ \
 	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -702,7 +702,7 @@ bli_thread_obarrier( thread ); \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_obarrier( thread ); \
+bli_thread_barrier( thread ); \
 } \
 */
 /*
--- a/frame/1m/unpackm/bli_unpackm_int.c
+++ b/frame/1m/unpackm/bli_unpackm_int.c
@@ -73,6 +73,6 @@ void bli_unpackm_int
    }

 	// Barrier so that unpacking is done before computation.
-    bli_thread_obarrier( thread );
+    bli_thread_barrier( thread );
 }

--- a/frame/3/bli_l3_packm.c
+++ b/frame/3/bli_l3_packm.c
@@ -50,7 +50,7 @@ void bli_l3_packm
 	siz_t     size_needed;

 	// FGVZ: Not sure why we need this barrier, but we do.
-	bli_thread_obarrier( thread );
+	bli_thread_barrier( thread );

 	// Every thread initializes x_pack and determines the size of memory
 	// block needed (which gets embedded into the otherwise "blank" mem_t
@@ -102,7 +102,7 @@ void bli_l3_packm

 		// Broadcast the address of the chief thread's local mem_t entry to
 		// all threads.
-		local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
+		local_mem_p = bli_thread_broadcast( thread, &local_mem_s );

 		// Save the contents of the chief thread's local mem_t entry to the
 		// mem_t field in this thread's control tree node.
@@ -146,7 +146,7 @@ void bli_l3_packm

 			// Broadcast the address of the chief thread's local mem_t entry to
 			// all threads.
-			local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
+			local_mem_p = bli_thread_broadcast( thread, &local_mem_s );

 			// Save the chief thread's local mem_t entry to the mem_t field in
 			// this thread's control tree node.
@@ -159,7 +159,7 @@ void bli_l3_packm
 			// will already have the cached values in their local control
 			// trees' mem_t entries, currently pointed to by cntl_mem_p.

-			bli_thread_obarrier( thread );
+			bli_thread_barrier( thread );
 		}
 	}

@@ -182,6 +182,6 @@ void bli_l3_packm
 	);

 	// Barrier so that packing is done before computation.
-	bli_thread_obarrier( thread );
+	bli_thread_barrier( thread );
 }

--- a/frame/3/bli_l3_sup_packm_a.c
+++ b/frame/3/bli_l3_sup_packm_a.c
@@ -67,7 +67,7 @@ void PASTEMAC(ch,opname) \
 \
 		/* Barrier to make sure all threads are caught up and ready to begin
 		   the packm stage. */ \
-		bli_thread_obarrier( thread ); \
+		bli_thread_barrier( thread ); \
 \
 		/* Compute the size of the memory block eneded. */ \
 		siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
@@ -97,7 +97,7 @@ void PASTEMAC(ch,opname) \
 \
 			/* Broadcast the address of the chief thread's passed-in mem_t
 			   to all threads. */ \
-			mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
+			mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
 \
 			/* Non-chief threads: Copy the contents of the chief thread's
 			   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -146,7 +146,7 @@ void PASTEMAC(ch,opname) \
 \
 				/* Broadcast the address of the chief thread's passed-in mem_t
 				   to all threads. */ \
-				mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
+				mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
 \
 				/* Non-chief threads: Copy the contents of the chief thread's
 				   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -422,7 +422,7 @@ void PASTEMAC(ch,opname) \
 		} \
 \
 		/* Barrier so that packing is done before computation. */ \
-		bli_thread_obarrier( thread ); \
+		bli_thread_barrier( thread ); \
 	} \
 }

--- a/frame/3/bli_l3_sup_packm_b.c
+++ b/frame/3/bli_l3_sup_packm_b.c
@@ -67,7 +67,7 @@ void PASTEMAC(ch,opname) \
 \
 		/* Barrier to make sure all threads are caught up and ready to begin
 		   the packm stage. */ \
-		bli_thread_obarrier( thread ); \
+		bli_thread_barrier( thread ); \
 \
 		/* Compute the size of the memory block eneded. */ \
 		siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
@@ -97,7 +97,7 @@ void PASTEMAC(ch,opname) \
 \
 			/* Broadcast the address of the chief thread's passed-in mem_t
 			   to all threads. */ \
-			mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
+			mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
 \
 			/* Non-chief threads: Copy the contents of the chief thread's
 			   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -146,7 +146,7 @@ void PASTEMAC(ch,opname) \
 \
 				/* Broadcast the address of the chief thread's passed-in mem_t
 				   to all threads. */ \
-				mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
+				mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
 \
 				/* Non-chief threads: Copy the contents of the chief thread's
 				   passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -422,7 +422,7 @@ void PASTEMAC(ch,opname) \
 		} \
 \
 		/* Barrier so that packing is done before computation. */ \
-		bli_thread_obarrier( thread ); \
+		bli_thread_barrier( thread ); \
 	} \
 }

--- a/frame/3/bli_l3_sup_packm_var.c
+++ b/frame/3/bli_l3_sup_packm_var.c
@@ -237,7 +237,7 @@ if ( col_stored ) { \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_obarrier( thread ); \
+bli_thread_barrier( thread ); \
 	if ( bli_thread_work_id( thread ) == 1 ) \
 	{ \
 	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -248,7 +248,7 @@ bli_thread_obarrier( thread ); \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_obarrier( thread ); \
+bli_thread_barrier( thread ); \
 } \
 else { \
 	if ( bli_thread_work_id( thread ) == 0 ) \
@@ -261,7 +261,7 @@ else { \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_obarrier( thread ); \
+bli_thread_barrier( thread ); \
 	if ( bli_thread_work_id( thread ) == 1 ) \
 	{ \
 	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -272,7 +272,7 @@ bli_thread_obarrier( thread ); \
 	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
 	fflush( stdout ); \
 	} \
-bli_thread_obarrier( thread ); \
+bli_thread_barrier( thread ); \
 } \
 */
 /*
--- a/frame/3/bli_l3_sup_var1n2m.c
+++ b/frame/3/bli_l3_sup_var1n2m.c
@@ -674,7 +674,7 @@ void PASTEMAC(ch,varname) \
 \
 			/* NOTE: This barrier is only needed if we are packing A (since
 			   that matrix is packed within the pc loop of this variant). */ \
-			if ( packa ) bli_thread_obarrier( thread_pa ); \
+			if ( packa ) bli_thread_barrier( thread_pa ); \
 		} \
 	} \
 \
@@ -1292,7 +1292,7 @@ void PASTEMAC(ch,varname) \
 \
 			/* NOTE: This barrier is only needed if we are packing B (since
 			   that matrix is packed within the pc loop of this variant). */ \
-			if ( packb ) bli_thread_obarrier( thread_pb ); \
+			if ( packb ) bli_thread_barrier( thread_pb ); \
 		} \
 	} \
 \
--- a/frame/3/gemm/bli_gemm_blk_var3.c
+++ b/frame/3/gemm/bli_gemm_blk_var3.c
@@ -84,7 +84,7 @@ void bli_gemm_blk_var3
 		  bli_thrinfo_sub_node( thread )
 		);

-		bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
+		bli_thread_barrier( bli_thrinfo_sub_node( thread ) );

 		// This variant executes multiple rank-k updates. Therefore, if the
 		// internal beta scalar on matrix C is non-zero, we must use it
--- a/frame/3/gemm/bli_gemm_int.c
+++ b/frame/3/gemm/bli_gemm_int.c
@@ -66,7 +66,7 @@ void bli_gemm_int
 	{
        if ( bli_thread_am_ochief( thread ) )
 		    bli_scalm( beta, c );
-        bli_thread_obarrier( thread );
+        bli_thread_barrier( thread );
 		return;
 	}

@@ -80,7 +80,7 @@ void bli_gemm_int

        if ( bli_thread_am_ochief( thread ) )
 		    bli_scalm( beta, c );
-        bli_thread_obarrier( thread );
+        bli_thread_barrier( thread );
 		return;
 	}

--- a/frame/3/trsm/bli_trsm_blk_var1.c
+++ b/frame/3/trsm/bli_trsm_blk_var1.c
@@ -123,7 +123,7 @@ void bli_trsm_blk_var1
 	// We must execute a barrier here because the upcoming rank-k update
 	// requires the packed matrix B to be fully updated by the trsm
 	// subproblem.
-	bli_thread_obarrier( thread );
+	bli_thread_barrier( thread );

 	// Isolate the remaining part of the column panel matrix A, which we do by
 	// acquiring the subpartition ahead of A11 (that is, A21 or A01, depending
--- a/frame/3/trsm/bli_trsm_blk_var3.c
+++ b/frame/3/trsm/bli_trsm_blk_var3.c
@@ -85,7 +85,7 @@ void bli_trsm_blk_var3
 		);

 		//bli_thread_ibarrier( thread );
-		bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
+		bli_thread_barrier( bli_thrinfo_sub_node( thread ) );

 		// This variant executes multiple rank-k updates. Therefore, if the
 		// internal alpha scalars on A/B and C are non-zero, we must ensure
--- a/frame/3/trsm/bli_trsm_int.c
+++ b/frame/3/trsm/bli_trsm_int.c
@@ -68,7 +68,7 @@ void bli_trsm_int
 	{
 		if ( bli_thread_am_ochief( thread ) )
 		    bli_scalm( beta, c );
-		bli_thread_obarrier( thread );
+		bli_thread_barrier( thread );
 		return;
 	}

@@ -119,7 +119,7 @@ void bli_trsm_int
 	}

 	// FGVZ->TMS: Is this barrier still needed?
-	bli_thread_obarrier( thread );
+	bli_thread_barrier( thread );

 	// Create the next node in the thrinfo_t structure.
 	bli_thrinfo_grow( rntm, cntl, thread );
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -74,6 +74,12 @@ void bli_arch_set_id_once( void )

 void bli_arch_set_id( void )
 {
+	// NOTE: Change this usage of getenv() to bli_env_get_var() after
+	// merging #351.
+	//bool_t do_logging = bli_env_get_var( "BLIS_ARCH_DEBUG", 0 );
+	bool_t do_logging = getenv( "BLIS_ARCH_DEBUG" ) != NULL;
+	bli_arch_set_logging( do_logging );
+
 	// Architecture families.
 #if defined BLIS_FAMILY_INTEL64 || \
    defined BLIS_FAMILY_AMD64   || \
@@ -156,6 +162,10 @@ void bli_arch_set_id( void )
 	id = BLIS_ARCH_GENERIC;
 #endif

+	if ( bli_arch_get_logging() )
+		fprintf( stderr, "libblis: selecting sub-configuration '%s'.\n",
+				 bli_arch_string( id ) );
+
 	//printf( "blis_arch_query_id(): id = %u\n", id );
 	//exit(1);
 }
@@ -200,3 +210,37 @@ char* bli_arch_string( arch_t id )
 	return config_name[ id ];
 }

+// -----------------------------------------------------------------------------
+
+static bool_t arch_dolog = 0;
+// Store the flag that bli_arch_get_logging() reports (nonzero = log).
+void bli_arch_set_logging( bool_t dolog )
+{
+	arch_dolog = dolog;
+}
+// Return the flag most recently stored by bli_arch_set_logging().
+bool_t bli_arch_get_logging( void )
+{
+	return arch_dolog;
+}
+
+// Print a printf()-style message to stderr with a "libblis: " prefix, but
+// only when logging is enabled and fmt is non-NULL.
+void bli_arch_log( char* fmt, ... )
+{
+	// Validate fmt before strlen() dereferences it.
+	if ( !bli_arch_get_logging() || fmt == NULL ) return;
+
+	const char   prefix[]   = "libblis: ";
+	const size_t n_chars    = strlen( prefix ) + strlen( fmt ) + 1;
+	char*        prefix_fmt = malloc( n_chars );
+	// If the allocation failed, fall back to the unprefixed format string.
+	if ( prefix_fmt ) snprintf( prefix_fmt, n_chars, "%s%s", prefix, fmt );
+
+	va_list ap;
+	va_start( ap, fmt );
+	vfprintf( stderr, prefix_fmt ? prefix_fmt : fmt, ap );
+	va_end( ap );
+	free( prefix_fmt );
+}
+
--- a/frame/base/bli_arch.h
+++ b/frame/base/bli_arch.h
@@ -42,6 +42,9 @@ void   bli_arch_set_id( void );

 BLIS_EXPORT_BLIS char*  bli_arch_string( arch_t id );

+void   bli_arch_set_logging( bool_t dolog );
+bool_t bli_arch_get_logging( void );
+void   bli_arch_log( char*, ... );

 #endif

--- a/frame/base/bli_cpuid.c
+++ b/frame/base/bli_cpuid.c
@@ -6,6 +6,7 @@

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019, Dave Love, University of Manchester

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -52,6 +53,7 @@
  #include "bli_cpuid.h"
 #else
  #include "blis.h"
+  #include "bli_arch.h"
 #endif

 // -----------------------------------------------------------------------------
@@ -167,7 +169,22 @@ bool_t bli_cpuid_is_skx

 	int nvpu = vpu_count();

-	if ( !bli_cpuid_has_features( features, expected ) || nvpu != 2 )
+	if ( bli_cpuid_has_features( features, expected ) )
+	{
+		switch ( nvpu )
+		{
+		case 1:
+			bli_arch_log( "Hardware has 1 FMA unit; using 'haswell' (not 'skx') sub-config.\n" );
+			return FALSE;
+		case 2:
+			bli_arch_log( "Hardware has 2 FMA units; using 'skx' sub-config.\n" );
+			return TRUE;
+		default:
+			bli_arch_log( "Number of FMA units unknown; using 'haswell' (not 'skx') config.\n" );
+			return FALSE;
+		}
+	}
+	else
 		return FALSE;

 	return TRUE;
@@ -893,6 +910,10 @@ void get_cpu_name( char *cpu_name )
 	*( uint32_t* )&cpu_name[32+12] = edx;
 }

+// Return the number of FMA units _assuming avx512 is supported_.
+// This needs updating for new processor types, sigh.
+// See https://ark.intel.com/content/www/us/en/ark.html#@Processors
+// and also https://github.com/jeffhammond/vpu-count
 int vpu_count( void )
 {
 	char  cpu_name[48] = {};
@@ -904,49 +925,59 @@ int vpu_count( void )

 	if ( strstr( cpu_name, "Intel(R) Xeon(R)" ) != NULL )
 	{
-		loc = strstr( cpu_name, "Platinum" );
+		if (( loc = strstr( cpu_name, "Platinum" ) ))
+			return 2;
 		if ( loc == NULL )
-			loc = strstr( cpu_name, "Gold" );
+			loc = strstr( cpu_name, "Gold" ); // 1 or 2, tested below
 		if ( loc == NULL )
-			loc = strstr( cpu_name, "Silver" );
+			if (( loc = strstr( cpu_name, "Silver" ) ))
+				return 1;
 		if ( loc == NULL )
-			loc = strstr( cpu_name, "Bronze" );
+			if (( loc = strstr( cpu_name, "Bronze" ) ))
+				return 1;
 		if ( loc == NULL )
 			loc = strstr( cpu_name, "W" );
+		if ( loc == NULL )
+			if (( loc = strstr( cpu_name, "D" ) ))
+				// Fixme:  May be wrong
+				// <https://github.com/jeffhammond/vpu-count/issues/3#issuecomment-542044651>
+				return 1;
 		if ( loc == NULL )
 			return -1;

-		loc = strstr( loc+1, " " );
+		// We may have W-nnnn rather than, say, Gold nnnn
+		if ( 'W' == *loc && '-' == *(loc+1) )
+			loc++;
+		else
+			loc = strstr( loc+1, " " );
 		if ( loc == NULL )
 			return -1;

 		strncpy( model_num, loc+1, 4 );
-		model_num[4] = '\0';
+		model_num[4] = '\0'; // Things like i9-10900X matched above

 		sku = atoi( model_num );

+		// These were derived from ARK listings as of 2019-10-09, but
+		// may not be complete, especially as the ARK Skylake listing
+		// seems to be limited.
 		if      ( 8199 >= sku && sku >= 8100 ) return 2;
 		else if ( 6199 >= sku && sku >= 6100 ) return 2;
 		else if (                sku == 5122 ) return 2;
+		else if ( 6299 >= sku && sku >= 6200 ) return 2; // Cascade Lake Gold
+		else if ( 5299 >= sku && sku >= 5200 ) return 1; // Cascade Lake Gold
 		else if ( 5199 >= sku && sku >= 5100 ) return 1;
 		else if ( 4199 >= sku && sku >= 4100 ) return 1;
 		else if ( 3199 >= sku && sku >= 3100 ) return 1;
+		else if ( 3299 >= sku && sku >= 3200 ) return 2; // Cascade Lake W
+		else if ( 2299 >= sku && sku >= 2200 ) return 2; // Cascade Lake W
 		else if ( 2199 >= sku && sku >= 2120 ) return 2;
+		else if ( 2102 == sku || sku == 2104 ) return 2; // Gold exceptions
 		else if ( 2119 >= sku && sku >= 2100 ) return 1;
 		else return -1;
 	}
-	else if ( strstr( cpu_name, "Intel(R) Core(TM) i9" ) != NULL )
-	{
-		return 1;
-	}
-	else if ( strstr( cpu_name, "Intel(R) Core(TM) i7" ) != NULL )
-	{
-		if ( strstr( cpu_name, "7800X" ) != NULL ||
-		     strstr( cpu_name, "7820X" ) != NULL )
-			return 1;
-		else
-			return -1;
-	}
+	else if ( strstr( cpu_name, "Intel(R) Core(TM)" ) != NULL )
+		return 2; // All i7/i9 with avx512?
 	else
 	{
 		return -1;
@@ -1082,3 +1113,4 @@ char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath
 }

 #endif
+
--- a/frame/base/bli_cpuid.h
+++ b/frame/base/bli_cpuid.h
@@ -126,8 +126,8 @@ static bool_t bli_cpuid_has_features( uint32_t have, uint32_t want )
 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)

 // cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393
-// // for more information why this move was made.
-// //#include "cpuid.h"
+// for more information why this move was made.
+//#include "cpuid.h"

 void get_cpu_name( char *cpu_name );
 int  vpu_count( void );
--- a/frame/base/bli_mem.h
+++ b/frame/base/bli_mem.h
@@ -153,7 +153,7 @@ static void bli_mem_clear( mem_t* mem )
 	//Pass actual type instead
 	bli_mem_set_buf_type ( pb, mem );
 #else
-	bli_mem_set_buf_type( -1, mem );
+	bli_mem_set_buf_type( ( packbuf_t )-1, mem );
 #endif
 	bli_mem_set_pool( NULL, mem );
 	bli_mem_set_size( 0, mem );
--- a/frame/compat/bla_dot.c
+++ b/frame/compat/bla_dot.c
@@ -264,10 +264,16 @@ float PASTEF77(sd,sdot)
       const float*   y, const f77_int* incy
     )
 {
-	float r = ( float )PASTEF77(d,sdot)( n,
-	                                     x, incx,
-	                                     y, incy );
-	return r + *sb;
+	return ( float )
+	       (
+	         ( double )(*sb) +
+	         PASTEF77(d,sdot)
+	         (
+	           n,
+	           x, incx,
+	           y, incy
+	         )
+	       );
 }

 // Input vectors stored in single precision, computed in double precision,
--- a/frame/compat/bli_blas.h
+++ b/frame/compat/bli_blas.h
@@ -99,6 +99,7 @@

 #include "bla_lsame.h"
 #include "bla_xerbla.h"
+#include "bla_xerbla_array.h"


 // -- Level-0 BLAS prototypes --
--- a/frame/compat/cblas/f77_sub/f77_dot_sub.c
+++ b/frame/compat/cblas/f77_sub/f77_dot_sub.c
@@ -75,7 +75,7 @@ void PASTEF772(sds,dot,sub)
             float*   rval
     )
 {
-	*rval = *sb + PASTEF77(sds,dot)
+	*rval = PASTEF77(sds,dot)
 	(
 	  n,
 	  sb,
--- a/frame/compat/f2c/bla_xerbla_array.c
+++ b/frame/compat/f2c/bla_xerbla_array.c
@@ -0,0 +1,74 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_BLAS
+
+#define MAX_NUM_CHARS 32
+
+// Copy srname_array into a NULL-terminated string and pass it to xerbla_().
+int PASTEF770(xerbla_array)(const bla_character *srname_array, const bla_integer srname_len, const bla_integer *info)
+{
+	// Local copy of the routine name, pre-filled with MAX_NUM_CHARS blanks.
+	//                                  01234567890123456789012345678901
+	char srname[ MAX_NUM_CHARS + 1 ] = "                                ";
+	int  i;
+
+	// Compute the number of chars to copy as srname_len clamped to the
+	// range [0, MAX_NUM_CHARS], so that neither a negative nor an oversized
+	// length can index outside of srname.
+	int n_copy = 0;
+
+	if ( srname_len > 0 )
+		n_copy = ( int )bli_min( srname_len, MAX_NUM_CHARS );
+
+	// Copy over each element of srname_array.
+	for ( i = 0; i < n_copy; ++i )
+	{
+		srname[i] = srname_array[i];
+	}
+
+	// NULL terminate.
+	srname[n_copy] = '\0';
+
+	// Call xerbla_(), passing the length of the string actually stored in
+	// srname rather than srname_len, which may exceed the buffer size.
+	PASTEF770(xerbla)( srname, info, ( ftnlen )n_copy );
+
+	return 0;
+}
+
+#endif
+
--- a/frame/compat/f2c/bla_xerbla_array.h
+++ b/frame/compat/f2c/bla_xerbla_array.h
@@ -0,0 +1,39 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef BLIS_ENABLE_BLAS
+
+BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);
+
+#endif
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -265,6 +265,9 @@ CNTX_INIT_PROTS( generic )

 // -- ARM architectures --

+#ifdef BLIS_KERNELS_ARMSVE
+#include "bli_kernels_armsve.h"
+#endif
 #ifdef BLIS_KERNELS_ARMV8A
 #include "bli_kernels_armv8a.h"
 #endif
--- a/frame/thread/bli_thrcomm.h
+++ b/frame/thread/bli_thrcomm.h
@@ -56,8 +56,9 @@ thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
 void       bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
 void       bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
 void       bli_thrcomm_cleanup( thrcomm_t* comm );
-void       bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
-void*      bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );
+
+BLIS_EXPORT_BLIS void  bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
+BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );

 void       bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );

--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -1062,7 +1062,6 @@ void bli_thread_partition_2x2
    {
        *nt1 = ( work1 >= work2 ? n_thread : 1 );
        *nt2 = ( work1 <  work2 ? n_thread : 1 );
-
 		return;
    }

--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -67,6 +67,7 @@ void bli_thread_finalize( void );

 // Thread range-related prototypes.

+BLIS_EXPORT_BLIS
 void bli_thread_range_sub
     (
       thrinfo_t* thread,
--- a/frame/thread/bli_thrinfo.c
+++ b/frame/thread/bli_thrinfo.c
@@ -340,7 +340,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl

 	// Broadcast the temporary array to all threads in the parent's
 	// communicator.
-	new_comms = bli_thread_obroadcast( thread_par, new_comms );
+	new_comms = bli_thread_broadcast( thread_par, new_comms );

 	// Chiefs in the child communicator allocate the communicator
 	// object and store it in the array element corresponding to the
@@ -348,7 +348,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
 	if ( child_comm_id == 0 )
 		new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );

-	bli_thread_obarrier( thread_par );
+	bli_thread_barrier( thread_par );

 	// All threads create a new thrinfo_t node using the communicator
 	// that was created by their chief, as identified by parent_work_id.
@@ -364,7 +364,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
 	  NULL                         // sub_node
 	);

-	bli_thread_obarrier( thread_par );
+	bli_thread_barrier( thread_par );

 	// The parent's chief thread frees the temporary array of thrcomm_t
 	// pointers.
@@ -477,7 +477,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
 	const dim_t child_comm_id = parent_comm_id % child_nt_in;
 	const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );

-	bli_thread_obarrier( thread_par );
+	bli_thread_barrier( thread_par );

 	// NOTE: Recall that parent_comm_id == child_comm_id, so checking for the
 	// parent's chief-ness is equivalent to checking for chief-ness in the new
@@ -488,7 +488,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode

 	// Broadcast the new thrcomm_t address to the other threads in the
 	// parent's group.
-	new_comm = bli_thread_obroadcast( thread_par, new_comm );
+	new_comm = bli_thread_broadcast( thread_par, new_comm );

 	// All threads create a new thrinfo_t node using the communicator
 	// that was created by their chief, as identified by parent_work_id.
@@ -504,7 +504,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
 	  NULL           // sub_node
 	);

-	bli_thread_obarrier( thread_par );
+	bli_thread_barrier( thread_par );

 	return thread_chl;
 }
--- a/frame/thread/bli_thrinfo.h
+++ b/frame/thread/bli_thrinfo.h
@@ -171,12 +171,12 @@ static void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t )

 // other thrinfo_t-related functions

-static void* bli_thread_obroadcast( thrinfo_t* t, void* p )
+static void* bli_thread_broadcast( thrinfo_t* t, void* p )
 {
 	return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm );
 }

-static void bli_thread_obarrier( thrinfo_t* t )
+static void bli_thread_barrier( thrinfo_t* t )
 {
 	bli_thrcomm_barrier( t->ocomm_id, t->ocomm );
 }
--- a/frame/thread/bli_thrinfo_sup.c
+++ b/frame/thread/bli_thrinfo_sup.c
@@ -205,7 +205,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl

 	// Broadcast the temporary array to all threads in the parent's
 	// communicator.
-	new_comms = bli_thread_obroadcast( thread_par, new_comms );
+	new_comms = bli_thread_broadcast( thread_par, new_comms );

 	// Chiefs in the child communicator allocate the communicator
 	// object and store it in the array element corresponding to the
@@ -213,7 +213,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
 	if ( child_comm_id == 0 )
 		new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );

-	bli_thread_obarrier( thread_par );
+	bli_thread_barrier( thread_par );

 	// All threads create a new thrinfo_t node using the communicator
 	// that was created by their chief, as identified by parent_work_id.
@@ -229,7 +229,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
 	  NULL                         // sub_node
 	);

-	bli_thread_obarrier( thread_par );
+	bli_thread_barrier( thread_par );

 	// The parent's chief thread frees the temporary array of thrcomm_t
 	// pointers.
--- a/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c
@@ -0,0 +1,235 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Linaro Limited
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#else
+#error "No Arm SVE intrinsics support in compiler"
+#endif // __ARM_FEATURE_SVE
+
+// assumption:
+//   SVE vector length = 256 bits.
+//
+// Packs a panel of up to 8 x n doubles from a into p, scaled by *kappa.
+void bli_dpackm_armsve256_asm_8xk
+     (
+       conj_t           conja,
+       pack_t           schema,
+       dim_t            cdim_,
+       dim_t            n_,
+       dim_t            n_max_,
+       void*   restrict kappa_,
+       void*   restrict a_, inc_t inca_, inc_t lda_,
+       void*   restrict p_,              inc_t ldp_,
+       cntx_t* restrict cntx
+     )
+{
+    double*       a     = ( double* )a_;
+    double*       p     = ( double* )p_;
+    double*       kappa = ( double* )kappa_;
+    const int64_t cdim  = cdim_;
+    const int64_t mnr   = 8;
+    const int64_t n     = n_;
+    const int64_t n_max = n_max_;
+    const int64_t inca  = inca_;
+    const int64_t lda   = lda_;
+    const int64_t ldp   = ldp_;
+
+    double* restrict alpha1     = a;
+    double* restrict alpha1_4   = alpha1 + 4 * inca;
+    double* restrict pi1        = p;
+    const   svbool_t all_active = svptrue_b64();
+    svfloat64_t      z_a0;
+    svfloat64_t      z_a4;
+    svuint64_t       z_index;
+
+    // creating index for the gather loads
+    //   with each element a byte offset: (0, 1, 2, 3) * inca * sizeof( double )
+    z_index = svindex_u64( 0, inca * sizeof( double ) );
+
+    if ( cdim == mnr )
+    {
+        if ( bli_deq1( *kappa ) )
+        {
+            if ( inca == 1 )  // contiguous memory. packA style
+            {
+                for ( dim_t k = n; k != 0; --k )
+                {
+                    // NOTE: svld1_f64 was observed to load all zeros into
+                    //   z_a0 and z_a4, which is not correct. The
+                    //   qemu-aarch64 or gcc handling of svld1_f64 is the
+                    //   suspected cause.
+
+                    // load 8 contiguous elements from *a
+                    // z_a0 = svld1_f64( all_active, alpha1 );
+                    // z_a4 = svld1_vnum_f64( all_active, alpha1, 1 );
+
+                    // as a workaround, using gather load
+                    // gather load from *a
+                    z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
+                    z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index );
+
+                    // store them into *p
+                    svst1_f64( all_active, pi1, z_a0 );
+                    svst1_vnum_f64( all_active, pi1, 1, z_a4 );
+
+                    alpha1   += lda;
+                    alpha1_4  = alpha1 + 4 * inca;
+                    pi1      += ldp;
+                }
+            }
+            else  // gather load, contiguous store. packB style
+            {
+                for ( dim_t k = n; k != 0; --k )
+                {
+                    // gather load from *a
+                    z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
+                    z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index );
+
+                    // contiguous store into *p
+                    svst1_f64( all_active, pi1, z_a0 );
+                    svst1_vnum_f64( all_active, pi1, 1, z_a4 );
+
+                    alpha1   += lda;
+                    alpha1_4  = alpha1 + 4 * inca;
+                    pi1      += ldp;
+                }
+            }
+        }
+        else  // *kappa != 1.0
+        {
+            // load kappa into vector
+            svfloat64_t z_kappa;
+
+            z_kappa = svdup_f64( *kappa );
+
+            if ( inca == 1 )  // contiguous memory. packA style
+            {
+                for ( dim_t k = n; k != 0; --k )
+                {
+                    // load 8 contiguous elements from *a
+                    // z_a0 = svld1_f64( all_active, alpha1 );
+                    // z_a4 = svld1_vnum_f64( all_active, alpha1, 1 );
+                    // same reason as above. as a workaround, using gather load
+                    // gather load from *a
+                    z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
+                    z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index );
+
+                    // multiply by *kappa
+                    z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
+                    z_a4 = svmul_lane_f64( z_a4, z_kappa, 0 );
+
+                    // store them into *p
+                    svst1_f64( all_active, pi1, z_a0 );
+                    svst1_vnum_f64( all_active, pi1, 1, z_a4 );
+
+                    alpha1   += lda;
+                    alpha1_4  = alpha1 + 4 * inca;
+                    pi1      += ldp;
+                }
+            }
+            else  // gather load, contiguous store. packB style
+            {
+                for ( dim_t k = n; k != 0; --k )
+                {
+                    // gather load from *a
+                    z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
+                    z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index );
+
+                    // multiply by *kappa
+                    z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
+                    z_a4 = svmul_lane_f64( z_a4, z_kappa, 0 );
+
+                    // contiguous store into *p
+                    svst1_f64( all_active, pi1, z_a0 );
+                    svst1_vnum_f64( all_active, pi1, 1, z_a4 );
+
+                    alpha1   += lda;
+                    alpha1_4  = alpha1 + 4 * inca;
+                    pi1      += ldp;
+                }
+            }
+        } // end of if ( *kappa == 1.0 )
+    }
+    else // if ( cdim < mnr )
+    {
+        bli_dscal2m_ex
+        (
+          0,
+          BLIS_NONUNIT_DIAG,
+          BLIS_DENSE,
+          ( trans_t )conja,
+          cdim,
+          n,
+          kappa,
+          a, inca, lda,
+          p, 1,    ldp,
+          cntx,
+          NULL
+        );
+
+        // if ( cdim < mnr )
+        {
+            const dim_t      i      = cdim;
+            const dim_t      m_edge = mnr - i;
+            const dim_t      n_edge = n_max;
+            double* restrict p_edge = p + (i  )*1;
+
+            bli_dset0s_mxn
+            (
+              m_edge,
+              n_edge,
+              p_edge, 1, ldp
+            );
+        }
+    }
+
+    if ( n < n_max )
+    {
+        const dim_t      j      = n;
+        const dim_t      m_edge = mnr;
+        const dim_t      n_edge = n_max - j;
+        double* restrict p_edge = p + (j  )*ldp;
+
+        bli_dset0s_mxn
+        (
+          m_edge,
+          n_edge,
+          p_edge, 1, ldp
+        );
+    }
+}
--- a/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c
+++ b/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c
@@ -0,0 +1,809 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Linaro Limited
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+#include "blis.h"
+
+/*
+   o 8x8 double-precision micro-kernel.
+   o Runs on ARMv8-A with 256-bit SVE; compiled with aarch64 GCC.
+   o Tested on qemu-aarch64 and ArmIE (Arm Instruction Emulator) for SVE.
+
+   Preconditions:
+    - SVE vector length must be exactly 256 bits (byte strides are hard-coded).
+
+   April 2020.
+*/
+void bli_dgemm_armsve256_asm_8x8
+     (
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a,
+       double*    restrict b,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+	void* a_next = bli_auxinfo_next_a( data );
+	void* b_next = bli_auxinfo_next_b( data );
+
+	// Typecast local copies of integers in case dim_t and inc_t are a
+	// different size than is expected by load instructions.
+	uint64_t k_iter = k0 / 4;
+	uint64_t k_left = k0 % 4;
+	uint64_t rs_c   = rs_c0;
+	uint64_t cs_c   = cs_c0;
+
+__asm__ volatile
+(
+"                                            \n\t" 
+" ldr x0,%[aaddr]                            \n\t" // Load address of A 
+" ldr x1,%[baddr]                            \n\t" // Load address of B
+" ldr x2,%[caddr]                            \n\t" // Load address of C
+"                                            \n\t"
+" ldr x3,%[a_next]                           \n\t" // Move pointer
+" ldr x4,%[b_next]                           \n\t" // Move pointer
+"                                            \n\t"
+" ldr x5,%[k_iter]                           \n\t" // Init guard (k_iter)
+" ldr x6,%[k_left]                           \n\t" // Init guard (k_left)
+"                                            \n\t" 
+" ldr x7,%[alpha]                            \n\t" // Alpha address      
+" ldr x8,%[beta]                             \n\t" // Beta address      
+"                                            \n\t" 
+" ldr x9,%[cs_c]                             \n\t" // Load cs_c
+" lsl x10,x9,#3                              \n\t" // cs_c * sizeof(double)
+"                                            \n\t"
+" ldr x13,%[rs_c]                            \n\t" // Load rs_c.
+" lsl x14,x13,#3                             \n\t" // rs_c * sizeof(double). 
+"                                            \n\t"
+" add x20,x2,x10                             \n\t" //Load address Column 1 of C
+" add x21,x20,x10                            \n\t" //Load address Column 2 of C
+" add x22,x21,x10                            \n\t" //Load address Column 3 of C
+" add x23,x22,x10                            \n\t" //Load address Column 4 of C
+" add x24,x23,x10                            \n\t" //Load address Column 5 of C
+" add x25,x24,x10                            \n\t" //Load address Column 6 of C
+" add x26,x25,x10                            \n\t" //Load address Column 7 of C
+"                                            \n\t"
+" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
+" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
+"                                            \n\t"
+" ldr  z0, [x0]                              \n\t" // Load a
+" ldr  z1, [x0, #1, MUL VL]                  \n\t"
+"                                            \n\t"
+" ptrue   p0.d, all                          \n\t"
+" ld1rqd  {z2.d}, p0/z, [x1]                 \n\t" // load b( l,0:1 )
+" ld1rqd  {z3.d}, p0/z, [x1, #16]            \n\t" // load b( l,2:3 )
+" ld1rqd  {z4.d}, p0/z, [x1, #32]            \n\t" // load b( l,4:5 )
+" ld1rqd  {z5.d}, p0/z, [x1, #48]            \n\t" // load b( l,6:7 )
+"                                            \n\t"
+"                                            \n\t" // PRFM, the following prefetch on [x1] and [x0]
+"                                            \n\t" //   is for b rows 4..7 and a columns 4..7.
+"                                            \n\t" //   both of them will be used in next iteration
+"                                            \n\t" //   of k_iter (unrolled per 4 loops)
+"                                            \n\t"
+" dup  z16.d, #0                             \n\t" // Vector for accummulating column 0
+" prfm    PLDL1KEEP, [x1, #256]              \n\t" // prefetch b row no.4
+" dup  z17.d, #0                             \n\t" // Vector for accummulating column 0
+" prfm    PLDL1KEEP, [x1, #320]              \n\t" // prefetch b row no.5
+" dup  z18.d, #0                             \n\t" // Vector for accummulating column 1
+" prfm    PLDL1KEEP, [x1, #384]              \n\t" // prefetch b row no.6
+" dup  z19.d, #0                             \n\t" // Vector for accummulating column 1
+" prfm    PLDL1KEEP, [x1, #448]              \n\t" // preftech b row no.7
+" dup  z20.d, #0                             \n\t" // Vector for accummulating column 2 
+" dup  z21.d, #0                             \n\t" // Vector for accummulating column 2
+"                                            \n\t"
+" dup  z22.d, #0                             \n\t" // Vector for accummulating column 3
+" prfm    PLDL1KEEP, [x0, #256]              \n\t" // prefetch a col. no.4
+" dup  z23.d, #0                             \n\t" // Vector for accummulating column 3
+" prfm    PLDL1KEEP, [x0, #320]              \n\t" // prefetch a col. no.5
+" dup  z24.d, #0                             \n\t" // Vector for accummulating column 4
+" prfm    PLDL1KEEP, [x0, #384]              \n\t" // prefetch a col. no.6
+" dup  z25.d, #0                             \n\t" // Vector for accummulating column 4
+" prfm    PLDL1KEEP, [x0, #448]              \n\t" // prefetch a col. no.7
+" dup  z26.d, #0                             \n\t" // Vector for accummulating column 5 
+" dup  z27.d, #0                             \n\t" // Vector for accummulating column 5
+"                                            \n\t"
+" dup  z28.d, #0                             \n\t" // Vector for accummulating column 6
+" dup  z29.d, #0                             \n\t" // Vector for accummulating column 6
+" dup  z30.d, #0                             \n\t" // Vector for accummulating column 7
+" dup  z31.d, #0                             \n\t" // Vector for accummulating column 7
+"                                            \n\t"
+"                                            \n\t"
+" cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
+" beq .DCONSIDERKLEFT                        \n\t"
+"                                            \n\t"
+" add x0, x0, #64                            \n\t" //update address of A
+" add x1, x1, #64                            \n\t" //update address of B
+"                                            \n\t"
+" cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one. 
+" beq .DLASTITER                             \n\t" // (as loop is do-while-like).
+"                                            \n\t"
+" DLOOP:                                     \n\t" // Body
+"                                            \n\t"
+" fmla z16.d, z0.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
+" prfm    PLDL1KEEP, [x1, #448]              \n\t" // prefetch b row no.8, 512-64=448
+" fmla z17.d, z1.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
+" prfm    PLDL1KEEP, [x1, #512]              \n\t" // prefetch b row no.9
+" fmla z18.d, z0.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
+" prfm    PLDL1KEEP, [x1, #576]              \n\t" // prefetch b row no.10
+"                                            \n\t"
+" fmla z19.d, z1.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
+" fmla z20.d, z0.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
+" ldr  z6, [x0]                              \n\t" // Load a( 0:3,l )
+"                                            \n\t"
+" fmla z21.d, z1.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
+" fmla z22.d, z0.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
+" ldr  z7, [x0, #1, MUL VL]                  \n\t" // load a( 4:7,l )
+"                                            \n\t"
+" fmla z23.d, z1.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
+" fmla z24.d, z0.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
+" ld1rqd  {z2.d}, p0/z, [x1]                 \n\t" // load b( l,0:1 )
+"                                            \n\t"
+" fmla z25.d, z1.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
+" fmla z26.d, z0.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
+" fmla z27.d, z1.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
+" ld1rqd  {z3.d}, p0/z, [x1, #16]            \n\t" // load b( l,2:3 )
+"                                            \n\t"
+" fmla z28.d, z0.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
+" fmla z29.d, z1.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
+" ld1rqd  {z4.d}, p0/z, [x1, #32]            \n\t" // load b( l,4:5 )
+"                                            \n\t"
+" fmla z30.d, z0.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
+" fmla z31.d, z1.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
+" ld1rqd  {z5.d}, p0/z, [x1, #48]            \n\t" // load b( l,6:7 )
+"                                            \n\t"
+"                                            \n\t"                  // End it 1
+"                                            \n\t"
+" fmla z16.d, z6.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
+" prfm    PLDL1KEEP, [x1, #640]              \n\t" // prefetch b row no.11
+" fmla z17.d, z7.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
+" prfm    PLDL1KEEP, [x0, #448]              \n\t" // prefetch a col. no.8
+" fmla z18.d, z6.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
+" prfm    PLDL1KEEP, [x0, #512]              \n\t" // prefetch a col. no.9
+"                                            \n\t"
+" fmla z19.d, z7.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
+" fmla z20.d, z6.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
+" ldr  z0, [x0, #2, MUL VL]                  \n\t" // Load a( 0:3,l )
+"                                            \n\t"
+" fmla z21.d, z7.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
+" fmla z22.d, z6.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
+" ldr  z1, [x0, #3, MUL VL]                  \n\t" // load a( 4:7,l )
+"                                            \n\t"
+" fmla z23.d, z7.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
+" fmla z24.d, z6.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
+" ld1rqd  {z2.d}, p0/z, [x1, #64]            \n\t" // load b( l,0:1 )
+"                                            \n\t"
+" fmla z25.d, z7.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
+" fmla z26.d, z6.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
+" fmla z27.d, z7.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
+" ld1rqd  {z3.d}, p0/z, [x1, #80]            \n\t" // load b( l,2:3 )
+"                                            \n\t"
+" fmla z28.d, z6.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
+" fmla z29.d, z7.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
+" ld1rqd  {z4.d}, p0/z, [x1, #96]            \n\t" // load b( l,4:5 )
+"                                            \n\t"
+" fmla z30.d, z6.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
+" fmla z31.d, z7.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
+" ld1rqd  {z5.d}, p0/z, [x1, #112]           \n\t" // load b( l,6:7 )
+"                                            \n\t"
+"                                            \n\t"
+"                                            \n\t"                  //End it 2
+"                                            \n\t"
+" fmla z16.d, z0.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
+" prfm    PLDL1KEEP, [x0, #576]              \n\t" // prefetch a col. no.10
+" fmla z17.d, z1.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
+" prfm    PLDL1KEEP, [x0, #640]              \n\t" // prefetch a col. no.11
+"                                            \n\t"
+" fmla z18.d, z0.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
+"                                            \n\t"
+" add x1, x1, #128                           \n\t" // because immediate in 'ld1rqd' must be
+"                                            \n\t" //   in range -128 to 112
+"                                            \n\t"
+" fmla z19.d, z1.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
+" fmla z20.d, z0.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
+" ldr  z6, [x0, #4, MUL VL]                  \n\t" // Load a( 0:3,l )
+"                                            \n\t"
+" fmla z21.d, z1.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
+" fmla z22.d, z0.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
+" ldr  z7, [x0, #5, MUL VL]                  \n\t" // load a( 4:7,l )
+"                                            \n\t"
+" fmla z23.d, z1.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
+" fmla z24.d, z0.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
+" ld1rqd  {z2.d}, p0/z, [x1, #0]             \n\t" // load b( l,0:1 )
+"                                            \n\t"
+" fmla z25.d, z1.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
+" fmla z26.d, z0.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
+" fmla z27.d, z1.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
+" ld1rqd  {z3.d}, p0/z, [x1, #16]            \n\t" // load b( l,2:3 )
+"                                            \n\t"
+" fmla z28.d, z0.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
+" fmla z29.d, z1.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
+" ld1rqd  {z4.d}, p0/z, [x1, #32]            \n\t" // load b( l,4:5 )
+"                                            \n\t"
+" fmla z30.d, z0.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
+" fmla z31.d, z1.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
+" ld1rqd  {z5.d}, p0/z, [x1, #48]            \n\t" // load b( l,6:7 )
+"                                            \n\t"
+"                                            \n\t"                  // End it 3
+"                                            \n\t"
+" fmla z16.d, z6.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
+" fmla z17.d, z7.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
+" fmla z18.d, z6.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
+" ldr  z0, [x0, #6, MUL VL]                  \n\t" // Load a( 0:3,l )
+"                                            \n\t"
+" fmla z19.d, z7.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
+" fmla z20.d, z6.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
+" fmla z21.d, z7.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
+" ldr  z1, [x0, #7, MUL VL]                  \n\t" // load a( 4:7,l )
+"                                            \n\t"
+" fmla z22.d, z6.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
+" fmla z23.d, z7.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
+" fmla z24.d, z6.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
+" ld1rqd  {z2.d}, p0/z, [x1, #64]            \n\t" // load b( l,0:1 )
+"                                            \n\t"
+" fmla z25.d, z7.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
+" fmla z26.d, z6.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
+" fmla z27.d, z7.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
+" ld1rqd  {z3.d}, p0/z, [x1, #80]            \n\t" // load b( l,2:3 )
+"                                            \n\t"
+" fmla z28.d, z6.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
+" fmla z29.d, z7.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
+" ld1rqd  {z4.d}, p0/z, [x1, #96]            \n\t" // load b( l,4:5 )
+"                                            \n\t"
+" fmla z30.d, z6.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
+" fmla z31.d, z7.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
+" ld1rqd  {z5.d}, p0/z, [x1, #112]           \n\t" // load b( l,6:7 )
+"                                            \n\t"
+"                                            \n\t"                  //End it 4
+" add x0, x0, #256                           \n\t"
+" add x1, x1, #128                           \n\t"
+"                                            \n\t"
+" sub x5,x5,1                                \n\t" // i-=1
+" cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
+" bne DLOOP                                  \n\t"
+"                                            \n\t"
+".DLASTITER:                                 \n\t"
+"                                            \n\t"
+" fmla z16.d, z0.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
+" fmla z17.d, z1.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
+" fmla z18.d, z0.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
+" ldr  z6, [x0]                              \n\t" // Load a( 0:3,l )
+"                                            \n\t"
+" fmla z19.d, z1.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
+" fmla z20.d, z0.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
+" fmla z21.d, z1.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
+" ldr  z7, [x0, #1, MUL VL]                  \n\t" // load a( 4:7,l )
+"                                            \n\t"
+" fmla z22.d, z0.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
+" fmla z23.d, z1.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
+" fmla z24.d, z0.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
+" ld1rqd  {z2.d}, p0/z, [x1]                 \n\t" // load b( l,0:1 )
+"                                            \n\t"
+" fmla z25.d, z1.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
+" fmla z26.d, z0.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
+" fmla z27.d, z1.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
+" ld1rqd  {z3.d}, p0/z, [x1, #16]            \n\t" // load b( l,2:3 )
+"                                            \n\t"
+" fmla z28.d, z0.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
+" fmla z29.d, z1.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
+" ld1rqd  {z4.d}, p0/z, [x1, #32]            \n\t" // load b( l,4:5 )
+"                                            \n\t"
+" fmla z30.d, z0.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
+" fmla z31.d, z1.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
+" ld1rqd  {z5.d}, p0/z, [x1, #48]            \n\t" // load b( l,6:7 )
+"                                            \n\t"
+"                                            \n\t"                  // End it 1
+"                                            \n\t"
+" fmla z16.d, z6.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
+" fmla z17.d, z7.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
+" fmla z18.d, z6.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
+" ldr  z0, [x0, #2, MUL VL]                  \n\t" // Load a( 0:3,l )
+"                                            \n\t"
+" fmla z19.d, z7.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
+" fmla z20.d, z6.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
+" fmla z21.d, z7.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
+" ldr  z1, [x0, #3, MUL VL]                  \n\t" // load a( 4:7,l )
+"                                            \n\t"
+" fmla z22.d, z6.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
+" fmla z23.d, z7.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
+" fmla z24.d, z6.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
+" ld1rqd  {z2.d}, p0/z, [x1, #64]            \n\t" // load b( l,0:1 )
+"                                            \n\t"
+" fmla z25.d, z7.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
+" fmla z26.d, z6.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
+" fmla z27.d, z7.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
+" ld1rqd  {z3.d}, p0/z, [x1, #80]            \n\t" // load b( l,2:3 )
+"                                            \n\t"
+" fmla z28.d, z6.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
+" fmla z29.d, z7.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
+" ld1rqd  {z4.d}, p0/z, [x1, #96]            \n\t" // load b( l,4:5 )
+"                                            \n\t"
+" fmla z30.d, z6.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
+" fmla z31.d, z7.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
+" ld1rqd  {z5.d}, p0/z, [x1, #112]           \n\t" // load b( l,6:7 )
+"                                            \n\t"
+"                                            \n\t"
+"                                            \n\t"                  //End it 2
+"                                            \n\t"
+" fmla z16.d, z0.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
+" fmla z17.d, z1.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
+" fmla z18.d, z0.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
+" ldr  z6, [x0, #4, MUL VL]                  \n\t" // Load a( 0:3,l )
+"                                            \n\t"
+" fmla z19.d, z1.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
+" fmla z20.d, z0.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
+" fmla z21.d, z1.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
+" ldr  z7, [x0, #5, MUL VL]                  \n\t" // load a( 4:7,l )
+"                                            \n\t"
+" fmla z22.d, z0.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
+" add x1, x1, #128                           \n\t" // because immediate in 'ld1rqd' must be
+"                                            \n\t" //   in range -128 to 112
+" fmla z23.d, z1.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
+" fmla z24.d, z0.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
+" ld1rqd  {z2.d}, p0/z, [x1, #0]             \n\t" // load b( l,0:1 )
+"                                            \n\t"
+" fmla z25.d, z1.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
+" fmla z26.d, z0.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
+" fmla z27.d, z1.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
+" ld1rqd  {z3.d}, p0/z, [x1, #16]            \n\t" // load b( l,2:3 )
+"                                            \n\t"
+" fmla z28.d, z0.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
+" fmla z29.d, z1.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
+" ld1rqd  {z4.d}, p0/z, [x1, #32]            \n\t" // load b( l,4:5 )
+"                                            \n\t"
+" fmla z30.d, z0.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
+" fmla z31.d, z1.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
+" ld1rqd  {z5.d}, p0/z, [x1, #48]            \n\t" // load b( l,6:7 )
+"                                            \n\t"
+"                                            \n\t"                  // End it 3
+"                                            \n\t"
+" fmla z16.d, z6.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
+" fmla z17.d, z7.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
+"                                            \n\t"
+" fmla z18.d, z6.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
+" fmla z19.d, z7.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
+"                                            \n\t"
+" fmla z20.d, z6.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
+" fmla z21.d, z7.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
+"                                            \n\t"
+" fmla z22.d, z6.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
+" fmla z23.d, z7.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
+"                                            \n\t"
+" fmla z24.d, z6.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
+" fmla z25.d, z7.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
+"                                            \n\t"
+" fmla z26.d, z6.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
+" fmla z27.d, z7.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
+" add x1, x1, #64                            \n\t"
+"                                            \n\t"
+" fmla z28.d, z6.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
+" fmla z29.d, z7.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
+"                                            \n\t"
+" fmla z30.d, z6.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
+" fmla z31.d, z7.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
+"                                            \n\t"
+"                                            \n\t"                  //End it 4
+" add x0, x0, #192                           \n\t"
+"                                            \n\t"
+" .DCONSIDERKLEFT:                           \n\t" 
+" cmp x6,0                                   \n\t" // If k_left == 0, we are done.
+" beq .DPOSTACCUM                            \n\t" // else, we enter the k_left loop.
+"                                            \n\t"
+".DLOOPKLEFT:                                \n\t"
+"                                            \n\t"
+" ldr  z0, [x0]                              \n\t" // Load a
+" ldr  z1, [x0, #1, MUL VL]                  \n\t"
+" add x0, x0, #64                            \n\t"
+"                                            \n\t"
+" ld1rqd  {z2.d}, p0/z, [x1]                 \n\t" // load b( l,0:1 )
+" ld1rqd  {z3.d}, p0/z, [x1, #16]            \n\t" // load b( l,2:3 )
+" ld1rqd  {z4.d}, p0/z, [x1, #32]            \n\t" // load b( l,4:5 )
+" ld1rqd  {z5.d}, p0/z, [x1, #48]            \n\t" // load b( l,6:7 )
+" add x1, x1, #64                            \n\t"
+"                                            \n\t"
+" sub x6,x6,1                                \n\t"
+"                                            \n\t"
+" fmla z16.d, z0.d, z2.d[0]                  \n\t" // Accumulate  c(0:3,0)+=a(0:3,l)*b(l,0)
+" fmla z17.d, z1.d, z2.d[0]                  \n\t" // Accumulate  c(4:7,0)+=a(4:7,l)*b(l,0)
+"                                            \n\t"
+" fmla z18.d, z0.d, z2.d[1]                  \n\t" // Accumulate  c(0:3,1)+=a(0:3,l)*b(l,1)
+" fmla z19.d, z1.d, z2.d[1]                  \n\t" // Accumulate  c(4:7,1)+=a(4:7,l)*b(l,1)
+"                                            \n\t"
+" fmla z20.d, z0.d, z3.d[0]                  \n\t" // Accumulate  c(0:3,2)+=a(0:3,l)*b(l,2)
+" fmla z21.d, z1.d, z3.d[0]                  \n\t" // Accumulate  c(4:7,2)+=a(4:7,l)*b(l,2)
+"                                            \n\t"
+" fmla z22.d, z0.d, z3.d[1]                  \n\t" // Accumulate  c(0:3,3)+=a(0:3,l)*b(l,3)
+" fmla z23.d, z1.d, z3.d[1]                  \n\t" // Accumulate  c(4:7,3)+=a(4:7,l)*b(l,3)
+"                                            \n\t"
+" fmla z24.d, z0.d, z4.d[0]                  \n\t" // Accumulate  c(0:3,4)+=a(0:3,l)*b(l,4)
+" fmla z25.d, z1.d, z4.d[0]                  \n\t" // Accumulate  c(4:7,4)+=a(4:7,l)*b(l,4)
+"                                            \n\t"
+" fmla z26.d, z0.d, z4.d[1]                  \n\t" // Accumulate  c(0:3,5)+=a(0:3,l)*b(l,5)
+" fmla z27.d, z1.d, z4.d[1]                  \n\t" // Accumulate  c(4:7,5)+=a(4:7,l)*b(l,5)
+"                                            \n\t"
+" fmla z28.d, z0.d, z5.d[0]                  \n\t" // Accumulate  c(0:3,6)+=a(0:3,l)*b(l,6)
+" fmla z29.d, z1.d, z5.d[0]                  \n\t" // Accumulate  c(4:7,6)+=a(4:7,l)*b(l,6)
+"                                            \n\t"
+" fmla z30.d, z0.d, z5.d[1]                  \n\t" // Accumulate  c(0:3,7)+=a(0:3,l)*b(l,7)
+" fmla z31.d, z1.d, z5.d[1]                  \n\t" // Accumulate  c(4:7,7)+=a(4:7,l)*b(l,7)
+"                                            \n\t"
+" cmp x6,0                                   \n\t" // Iterate again.
+" bne .DLOOPKLEFT                            \n\t" // if i!=0.
+"                                            \n\t"
+" .DPOSTACCUM:                               \n\t"
+"                                            \n\t"
+" ld1rd {z6.d}, p0/z, [x7]                   \n\t" // Load alpha.
+" ld1rd {z7.d}, p0/z, [x8]                   \n\t" // Load beta
+"                                            \n\t"
+" cmp x13,#1                                 \n\t" // If rs_c == 1, C is column-major: fall through.
+" bne .DGENSTORED                            \n\t" // Otherwise take the general-stride path.
+"                                            \n\t"
+" .DCOLSTORED:                               \n\t" // C is column-major.
+"                                            \n\t"
+" dup  z0.d, #0                              \n\t"
+" dup  z1.d, #0                              \n\t"
+" dup  z2.d, #0                              \n\t"
+" dup  z3.d, #0                              \n\t"
+"                                            \n\t"
+" fcmp d7,#0.0                               \n\t"
+" beq .DBETAZEROCOLSTOREDS1                  \n\t" // Taking care of the beta==0 case.
+"                                            \n\t"
+" ldr z0, [x2]                               \n\t" //Load column 0 of C
+" ldr z1, [x2, #1, MUL VL]                   \n\t"
+"                                            \n\t"
+" ldr z2, [x20]                              \n\t" //Load column 1 of C
+" ldr z3, [x20, #1, MUL VL]                  \n\t"
+"                                            \n\t"
+" fmul z0.d, z0.d, z7.d                      \n\t" // Scale by beta
+" fmul z1.d, z1.d, z7.d                      \n\t" // Scale by beta
+" fmul z2.d, z2.d, z7.d                      \n\t" // Scale by beta
+" fmul z3.d, z3.d, z7.d                      \n\t" // Scale by beta
+"                                            \n\t"
+" .DBETAZEROCOLSTOREDS1:                     \n\t"
+"                                            \n\t"
+" fmla z0.d, z16.d, z6.d[0]                  \n\t" // Scale by alpha
+" fmla z1.d, z17.d, z6.d[0]                  \n\t" // Scale by alpha
+" fmla z2.d, z18.d, z6.d[0]                  \n\t" // Scale by alpha
+" fmla z3.d, z19.d, z6.d[0]                  \n\t" // Scale by alpha
+"                                            \n\t"
+" str z0, [x2]                               \n\t" //Store column 0 of C
+" str z1, [x2, #1, MUL VL]                   \n\t"
+"                                            \n\t"
+" str z2, [x20]                              \n\t" //Store column 1 of C
+" str z3, [x20, #1, MUL VL]                  \n\t"
+"                                            \n\t"
+" dup  z8.d,  #0                             \n\t"
+" dup  z9.d,  #0                             \n\t"
+" dup  z10.d, #0                             \n\t"
+" dup  z11.d, #0                             \n\t"
+"                                            \n\t"
+" fcmp d7,#0.0                               \n\t"
+" beq .DBETAZEROCOLSTOREDS2                  \n\t" // Taking care of the beta==0 case.
+"                                            \n\t"
+" ldr z8, [x21]                              \n\t" //Load column 2 of C
+" ldr z9, [x21, #1, MUL VL]                  \n\t"
+"                                            \n\t"
+" ldr z10, [x22]                             \n\t" //Load column 3 of C
+" ldr z11, [x22, #1, MUL VL]                 \n\t"
+"                                            \n\t"
+" fmul z8.d,  z8.d,  z7.d                    \n\t" // Scale by beta
+" fmul z9.d,  z9.d,  z7.d                    \n\t" // Scale by beta
+" fmul z10.d, z10.d, z7.d                    \n\t" // Scale by beta
+" fmul z11.d, z11.d, z7.d                    \n\t" // Scale by beta
+"                                            \n\t"
+" .DBETAZEROCOLSTOREDS2:                     \n\t"
+"                                            \n\t"
+" fmla z8.d,  z20.d, z6.d[0]                 \n\t" // Scale by alpha
+" fmla z9.d,  z21.d, z6.d[0]                 \n\t" // Scale by alpha
+" fmla z10.d, z22.d, z6.d[0]                 \n\t" // Scale by alpha
+" fmla z11.d, z23.d, z6.d[0]                 \n\t" // Scale by alpha
+"                                            \n\t"
+" str z8, [x21]                              \n\t" //Store column 2 of C
+" str z9, [x21, #1, MUL VL]                  \n\t"
+"                                            \n\t"
+" str z10, [x22]                             \n\t" //Store column 3 of C
+" str z11, [x22, #1, MUL VL]                 \n\t"
+"                                            \n\t"
+" dup  z0.d, #0                              \n\t"
+" dup  z1.d, #0                              \n\t"
+" dup  z2.d, #0                              \n\t"
+" dup  z3.d, #0                              \n\t"
+"                                            \n\t"
+" fcmp d7,#0.0                               \n\t"
+" beq .DBETAZEROCOLSTOREDS3                  \n\t" // Taking care of the beta==0 case.
+"                                            \n\t"
+" ldr z0, [x23]                              \n\t" //Load column 4 of C
+" ldr z1, [x23, #1, MUL VL]                  \n\t"
+"                                            \n\t"
+" ldr z2, [x24]                              \n\t" //Load column 5 of C
+" ldr z3, [x24, #1, MUL VL]                  \n\t"
+"                                            \n\t"
+" fmul z0.d, z0.d, z7.d                      \n\t" // Scale by beta
+" fmul z1.d, z1.d, z7.d                      \n\t" // Scale by beta
+" fmul z2.d, z2.d, z7.d                      \n\t" // Scale by beta
+" fmul z3.d, z3.d, z7.d                      \n\t" // Scale by beta
+"                                            \n\t"
+" .DBETAZEROCOLSTOREDS3:                     \n\t"
+"                                            \n\t"
+" fmla z0.d, z24.d, z6.d[0]                  \n\t" // Scale by alpha
+" fmla z1.d, z25.d, z6.d[0]                  \n\t" // Scale by alpha
+" fmla z2.d, z26.d, z6.d[0]                  \n\t" // Scale by alpha
+" fmla z3.d, z27.d, z6.d[0]                  \n\t" // Scale by alpha
+"                                            \n\t"
+" str z0, [x23]                              \n\t" //Store column 4 of C
+" str z1, [x23, #1, MUL VL]                  \n\t"
+"                                            \n\t"
+" str z2, [x24]                              \n\t" //Store column 5 of C
+" str z3, [x24, #1, MUL VL]                  \n\t"
+"                                            \n\t"
+" dup  z8.d,  #0                             \n\t"
+" dup  z9.d,  #0                             \n\t"
+" dup  z10.d, #0                             \n\t"
+" dup  z11.d, #0                             \n\t"
+"                                            \n\t"
+" fcmp d7,#0.0                               \n\t"
+" beq .DBETAZEROCOLSTOREDS4                  \n\t" // Taking care of the beta==0 case.
+"                                            \n\t"
+" ldr z8, [x25]                              \n\t" //Load column 6 of C
+" ldr z9, [x25, #1, MUL VL]                  \n\t"
+"                                            \n\t"
+" ldr z10, [x26]                             \n\t" //Load column 7 of C
+" ldr z11, [x26, #1, MUL VL]                 \n\t"
+"                                            \n\t"
+" fmul z8.d,  z8.d,  z7.d                    \n\t" // Scale by beta
+" fmul z9.d,  z9.d,  z7.d                    \n\t" // Scale by beta
+" fmul z10.d, z10.d, z7.d                    \n\t" // Scale by beta
+" fmul z11.d, z11.d, z7.d                    \n\t" // Scale by beta
+"                                            \n\t"
+" .DBETAZEROCOLSTOREDS4:                     \n\t"
+"                                            \n\t"
+" prfm pldl2keep,[x3]                        \n\t"
+" prfm pldl2keep,[x4]                        \n\t"
+"                                            \n\t"
+" fmla z8.d,  z28.d, z6.d[0]                 \n\t" // Scale by alpha
+" fmla z9.d,  z29.d, z6.d[0]                 \n\t" // Scale by alpha
+" fmla z10.d, z30.d, z6.d[0]                 \n\t" // Scale by alpha
+" fmla z11.d, z31.d, z6.d[0]                 \n\t" // Scale by alpha
+"                                            \n\t"
+" str z8, [x25]                              \n\t" //Store column 6 of C
+" str z9, [x25, #1, MUL VL]                  \n\t"
+"                                            \n\t"
+" str z10, [x26]                             \n\t" //Store column 7 of C
+" str z11, [x26, #1, MUL VL]                 \n\t"
+"                                            \n\t"
+" b .DEND                                    \n\t"
+"                                            \n\t"
+" .DGENSTORED:                               \n\t" // C is general-stride stored.
+"                                            \n\t"
+"                                            \n\t" // x14 is row-stride in number of bytes.
+" lsl x15,x14,#2                             \n\t" // x15 is 4-row-stride, i.e. the address offset
+"                                            \n\t" //     between c(4,*) and c(0,*)
+" index z4.d, xzr, x14                       \n\t" // z4  is address offsets of four contiguous elements
+"                                            \n\t" //     in a column. such as c( 0:3,* ).
+"                                            \n\t" //     z4 is used as vector index for gather/scatter
+"                                            \n\t" //     loading/storing from column of *c
+"                                            \n\t"
+"                                            \n\t" // C's each column's address:
+"                                            \n\t" //     x2, x20, x21, x22, x23, x24, x25, x26: are addresses of c(0,0:7)
+"                                            \n\t" //     x5, x6,  x7,  x8,  x16, x17, x18, x19: are addresses of c(4,0:7)
+" add  x5,  x15, x2                          \n\t" // x5  is address of c(4,0)
+" add  x6,  x15, x20                         \n\t" // x6  is address of c(4,1)
+" add  x7,  x15, x21                         \n\t" // x7  is address of c(4,2)
+" add  x8,  x15, x22                         \n\t" // x8  is address of c(4,3)
+" add  x16, x15, x23                         \n\t" // x16 is address of c(4,4)
+" add  x17, x15, x24                         \n\t" // x17 is address of c(4,5)
+" add  x18, x15, x25                         \n\t" // x18 is address of c(4,6)
+" add  x19, x15, x26                         \n\t" // x19 is address of c(4,7)
+"                                            \n\t"
+" dup  z0.d, #0                              \n\t" // C column 0, 1
+" dup  z1.d, #0                              \n\t"
+" dup  z2.d, #0                              \n\t"
+" dup  z3.d, #0                              \n\t"
+"                                            \n\t"
+" fcmp d7,#0.0                               \n\t"
+" beq .DBETAZEROGENSTOREDS1                  \n\t" // Taking care of the beta==0 case.
+"                                            \n\t"
+"                                            \n\t" // x2  is address of c(0,0)
+"                                            \n\t" // x5  is address of c(4,0)
+"                                            \n\t" // x20 is address of c(0,1)
+"                                            \n\t" // x6  is address of c(4,1)
+" ld1d {z0.d}, p0/z, [x2, z4.d]              \n\t" // Load c( 0:3,0 ) into z0
+" ld1d {z1.d}, p0/z, [x5, z4.d]              \n\t" // Load c( 4:7,0 ) into z1
+" ld1d {z2.d}, p0/z, [x20, z4.d]             \n\t" // Load c( 0:3,1 ) into z2
+" ld1d {z3.d}, p0/z, [x6 , z4.d]             \n\t" // Load c( 4:7,1 ) into z3
+"                                            \n\t"
+" fmul z0.d, z0.d, z7.d                      \n\t" // Scale by beta
+" fmul z1.d, z1.d, z7.d                      \n\t" // Scale by beta
+" fmul z2.d, z2.d, z7.d                      \n\t" // Scale by beta
+" fmul z3.d, z3.d, z7.d                      \n\t" // Scale by beta
+"                                            \n\t"
+" .DBETAZEROGENSTOREDS1:                     \n\t"
+"                                            \n\t"
+" fmla z0.d, z16.d, z6.d[0]                  \n\t" // Scale by alpha
+" fmla z1.d, z17.d, z6.d[0]                  \n\t" // Scale by alpha
+" fmla z2.d, z18.d, z6.d[0]                  \n\t" // Scale by alpha
+" fmla z3.d, z19.d, z6.d[0]                  \n\t" // Scale by alpha
+"                                            \n\t"
+" st1d {z0.d}, p0, [x2 , z4.d]               \n\t" // Store c( 0:3,0 ) <- z0
+" st1d {z1.d}, p0, [x5 , z4.d]               \n\t" // Store c( 4:7,0 ) <- z1
+" st1d {z2.d}, p0, [x20, z4.d]               \n\t" // Store c( 0:3,1 ) <- z2
+" st1d {z3.d}, p0, [x6 , z4.d]               \n\t" // Store c( 4:7,1 ) <- z3
+"                                            \n\t"
+"                                            \n\t"
+"                                            \n\t"
+" dup  z8.d, #0                              \n\t" // C column 2, 3
+" dup  z9.d, #0                              \n\t"
+" dup  z10.d, #0                             \n\t"
+" dup  z11.d, #0                             \n\t"
+"                                            \n\t"
+" fcmp d7,#0.0                               \n\t"
+" beq .DBETAZEROGENSTOREDS2                  \n\t" // Taking care of the beta==0 case.
+"                                            \n\t"
+"                                            \n\t" // x21 is address of c(0,2)
+"                                            \n\t" // x7  is address of c(4,2)
+"                                            \n\t" // x22 is address of c(0,3)
+"                                            \n\t" // x8  is address of c(4,3)
+" ld1d {z8.d},  p0/z, [x21, z4.d]            \n\t" // Load c( 0:3,2 ) into z8
+" ld1d {z9.d},  p0/z, [x7 , z4.d]            \n\t" // Load c( 4:7,2 ) into z9
+" ld1d {z10.d}, p0/z, [x22, z4.d]            \n\t" // Load c( 0:3,3 ) into z10
+" ld1d {z11.d}, p0/z, [x8 , z4.d]            \n\t" // Load c( 4:7,3 ) into z11
+"                                            \n\t"
+" fmul z8.d,  z8.d,  z7.d                    \n\t" // Scale by beta
+" fmul z9.d,  z9.d,  z7.d                    \n\t" // Scale by beta
+" fmul z10.d, z10.d, z7.d                    \n\t" // Scale by beta
+" fmul z11.d, z11.d, z7.d                    \n\t" // Scale by beta
+"                                            \n\t"
+" .DBETAZEROGENSTOREDS2:                     \n\t"
+"                                            \n\t"
+" fmla z8.d,  z20.d, z6.d[0]                 \n\t" // Scale by alpha
+" fmla z9.d,  z21.d, z6.d[0]                 \n\t" // Scale by alpha
+" fmla z10.d, z22.d, z6.d[0]                 \n\t" // Scale by alpha
+" fmla z11.d, z23.d, z6.d[0]                 \n\t" // Scale by alpha
+"                                            \n\t"
+" st1d {z8.d},  p0, [x21, z4.d]              \n\t" // Store c( 0:3,2 ) <- z8
+" st1d {z9.d},  p0, [x7 , z4.d]              \n\t" // Store c( 4:7,2 ) <- z9
+" st1d {z10.d}, p0, [x22, z4.d]              \n\t" // Store c( 0:3,3 ) <- z10
+" st1d {z11.d}, p0, [x8 , z4.d]              \n\t" // Store c( 4:7,3 ) <- z11
+"                                            \n\t"
+" dup  z0.d, #0                              \n\t" // C column 4, 5
+" dup  z1.d, #0                              \n\t"
+" dup  z2.d, #0                              \n\t"
+" dup  z3.d, #0                              \n\t"
+"                                            \n\t"
+" fcmp d7,#0.0                               \n\t"
+" beq .DBETAZEROGENSTOREDS3                  \n\t" // Taking care of the beta==0 case.
+"                                            \n\t"
+"                                            \n\t" // x23 is address of c(0,4)
+"                                            \n\t" // x16 is address of c(4,4)
+"                                            \n\t" // x24 is address of c(0,5)
+"                                            \n\t" // x17 is address of c(4,5)
+" ld1d {z0.d}, p0/z, [x23, z4.d]             \n\t" // Load c( 0:3,4 ) into z0
+" ld1d {z1.d}, p0/z, [x16, z4.d]             \n\t" // Load c( 4:7,4 ) into z1
+" ld1d {z2.d}, p0/z, [x24, z4.d]             \n\t" // Load c( 0:3,5 ) into z2
+" ld1d {z3.d}, p0/z, [x17, z4.d]             \n\t" // Load c( 4:7,5 ) into z3
+"                                            \n\t"
+" fmul z0.d, z0.d, z7.d                      \n\t" // Scale by beta
+" fmul z1.d, z1.d, z7.d                      \n\t" // Scale by beta
+" fmul z2.d, z2.d, z7.d                      \n\t" // Scale by beta
+" fmul z3.d, z3.d, z7.d                      \n\t" // Scale by beta
+"                                            \n\t"
+" .DBETAZEROGENSTOREDS3:                     \n\t"
+"                                            \n\t"
+" fmla z0.d, z24.d, z6.d[0]                  \n\t" // Scale by alpha
+" fmla z1.d, z25.d, z6.d[0]                  \n\t" // Scale by alpha
+" fmla z2.d, z26.d, z6.d[0]                  \n\t" // Scale by alpha
+" fmla z3.d, z27.d, z6.d[0]                  \n\t" // Scale by alpha
+"                                            \n\t"
+" st1d {z0.d}, p0, [x23, z4.d]               \n\t" // Store c( 0:3,4 ) <- z0
+" st1d {z1.d}, p0, [x16, z4.d]               \n\t" // Store c( 4:7,4 ) <- z1
+" st1d {z2.d}, p0, [x24, z4.d]               \n\t" // Store c( 0:3,5 ) <- z2
+" st1d {z3.d}, p0, [x17, z4.d]               \n\t" // Store c( 4:7,5 ) <- z3
+"                                            \n\t"
+" dup  z8.d, #0                              \n\t" // C column 6, 7
+" dup  z9.d, #0                              \n\t"
+" dup  z10.d, #0                             \n\t"
+" dup  z11.d, #0                             \n\t"
+"                                            \n\t"
+" fcmp d7,#0.0                               \n\t"
+" beq .DBETAZEROGENSTOREDS4                  \n\t" // Taking care of the beta==0 case.
+"                                            \n\t"
+"                                            \n\t" // x25 is address of c(0,6)
+"                                            \n\t" // x18 is address of c(4,6)
+"                                            \n\t" // x26 is address of c(0,7)
+"                                            \n\t" // x19 is address of c(4,7)
+" ld1d {z8.d},  p0/z, [x25, z4.d]            \n\t" // Load c( 0:3,6 ) into z8
+" ld1d {z9.d},  p0/z, [x18, z4.d]            \n\t" // Load c( 4:7,6 ) into z9
+" ld1d {z10.d}, p0/z, [x26, z4.d]            \n\t" // Load c( 0:3,7 ) into z10
+" ld1d {z11.d}, p0/z, [x19, z4.d]            \n\t" // Load c( 4:7,7 ) into z11
+"                                            \n\t"
+" fmul z8.d,  z8.d,  z7.d                    \n\t" // Scale by beta
+" fmul z9.d,  z9.d,  z7.d                    \n\t" // Scale by beta
+" fmul z10.d, z10.d, z7.d                    \n\t" // Scale by beta
+" fmul z11.d, z11.d, z7.d                    \n\t" // Scale by beta
+"                                            \n\t"
+" .DBETAZEROGENSTOREDS4:                     \n\t"
+"                                            \n\t"
+" fmla z8.d,  z28.d, z6.d[0]                 \n\t" // Scale by alpha
+" fmla z9.d,  z29.d, z6.d[0]                 \n\t" // Scale by alpha
+" fmla z10.d, z30.d, z6.d[0]                 \n\t" // Scale by alpha
+" fmla z11.d, z31.d, z6.d[0]                 \n\t" // Scale by alpha
+"                                            \n\t"
+" st1d {z8.d},  p0, [x25, z4.d]              \n\t" // Store c( 0:3,6 ) <- z8
+" st1d {z9.d},  p0, [x18, z4.d]              \n\t" // Store c( 4:7,6 ) <- z9
+" st1d {z10.d}, p0, [x26, z4.d]              \n\t" // Store c( 0:3,7 ) <- z10
+" st1d {z11.d}, p0, [x19, z4.d]              \n\t" // Store c( 4:7,7 ) <- z11
+"                                            \n\t"
+" .DEND:                                     \n\t" // Done!
+"                                            \n\t"
+:// output operands (none)
+:// input operands
+ [aaddr]  "m" (a),      // 0
+ [baddr]  "m" (b),      // 1
+ [caddr]  "m" (c),      // 2
+ [k_iter] "m" (k_iter), // 3
+ [k_left] "m" (k_left), // 4
+ [alpha]  "m" (alpha),  // 5
+ [beta]   "m" (beta),   // 6
+ [rs_c]   "m" (rs_c),   // 7
+ [cs_c]   "m" (cs_c),   // 8
+ [a_next] "m" (a_next), // 9
+ [b_next] "m" (b_next)  // 10
+:// Register clobber list
+ "x0","x1","x2","x3",
+ "x4","x5","x6",
+ "x7","x8","x9",
+ "x10","x11","x12","x13","x14","x15","x16","x17","x18","x19",
+ "x20","x21","x22","x23","x24","x25","x26",
+ "x27",       
+ "v0","v1","v2",
+ "v3","v4","v5",
+ "v6","v7","v8",
+ "v9","v10","v11",
+ "v12","v13","v14",
+ "v15","v16","v17","v18","v19",
+ "v20","v21","v22","v23",
+ "v24","v25","v26","v27",
+ "v28","v29","v30","v31"
+);
+
+}
--- a/kernels/armsve/bli_kernels_armsve.h
+++ b/kernels/armsve/bli_kernels_armsve.h
@@ -32,12 +32,6 @@

 */

-#ifndef BLIS_KERNEL_H
-#define BLIS_KERNEL_H
-
-
-
-
-
-#endif
+GEMM_UKR_PROT( double,   d, gemm_armsve256_asm_8x8 )

+PACKM_KER_PROT( double,   d, packm_armsve256_asm_8xk )
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -108,13 +108,6 @@ __asm__ volatile
 " add x25,x24,x10                            \n\t" //Load address Column 10 of C
 " add x26,x25,x10                            \n\t" //Load address Column 11 of C
 "                                            \n\t"
-" ldr q0, [x0]                               \n\t"
-" ldr q1, [x0, #16]                          \n\t" // Load a
-"                                            \n\t"
-" ldr q2, [x1]                               \n\t" // Load b
-" ldr q3, [x1, #16]                          \n\t"
-" ldr q4, [x1, #32]                          \n\t"
-"                                            \n\t"
 " prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
 " prfm pldl1keep,[x16]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x17]                       \n\t" // Prefetch c.
@@ -164,8 +157,15 @@ __asm__ volatile
 " cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
 " beq .SCONSIDERKLEFT                        \n\t"
 "                                            \n\t"
-"add x0, x0, #32                             \n\t" //update address of A
-"add x1, x1, #48                             \n\t" //update address of B
+" ldr q0, [x0]                               \n\t"
+" ldr q1, [x0, #16]                          \n\t" // Load a
+"                                            \n\t"
+" ldr q2, [x1]                               \n\t" // Load b
+" ldr q3, [x1, #16]                          \n\t"
+" ldr q4, [x1, #32]                          \n\t"
+"                                            \n\t"
+" add x0, x0, #32                            \n\t" //update address of A
+" add x1, x1, #48                            \n\t" //update address of B
 "                                            \n\t"
 " cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one. 
 " beq .SLASTITER                             \n\t" // (as loop is do-while-like).
@@ -1166,15 +1166,6 @@ __asm__ volatile
 " prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
 "                                            \n\t"
-" ldr q0, [x0]                               \n\t"
-" ldr q1, [x0, #16]                          \n\t" // Load a
-" ldr q2, [x0, #32]                          \n\t"
-"                                            \n\t"
-" ldr q3, [x1]                               \n\t" // Load b
-" ldr q4, [x1, #16]                          \n\t"
-" ldr q5, [x1, #32]                          \n\t"
-" ldr q6, [x1, #48]                          \n\t"
-"                                            \n\t"
 " dup  v8.2d, xzr                            \n\t" // Vector for accummulating column 0
 " prfm    PLDL1KEEP, [x1, #256]              \n\t" 
 " dup  v9.2d, xzr                            \n\t" // Vector for accummulating column 0
@@ -1214,8 +1205,17 @@ __asm__ volatile
 " cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
 " beq .DCONSIDERKLEFT                        \n\t"
 "                                            \n\t"
-"add x0, x0, #48                             \n\t" //update address of A
-"add x1, x1, #64                             \n\t" //update address of B
+" ldr q0, [x0]                               \n\t" // Load a
+" ldr q1, [x0, #16]                          \n\t"
+" ldr q2, [x0, #32]                          \n\t"
+"                                            \n\t"
+" ldr q3, [x1]                               \n\t" // Load b
+" ldr q4, [x1, #16]                          \n\t"
+" ldr q5, [x1, #32]                          \n\t"
+" ldr q6, [x1, #48]                          \n\t"
+"                                            \n\t"
+" add x0, x0, #48                            \n\t" //update address of A
+" add x1, x1, #64                            \n\t" //update address of B
 "                                            \n\t"
 " cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one. 
 " beq .DLASTITER                             \n\t" // (as loop is do-while-like).
--- a/kernels/zen/1/bli_amaxv_zen_int.c
+++ b/kernels/zen/1/bli_amaxv_zen_int.c
@@ -65,6 +65,38 @@ typedef union
 	double  d[2];
 }v2dd_t;

+// CMP256( dt, v1, v2 ): 256-bit per-lane mask (dt is the s/d intrinsic suffix), set where either:
+// - v1 > v2, or
+// - v1 is NaN and v2 is not.
+// Assumes idx(v1) > idx(v2); the mask is clear where v1 == v2 or where both are NaN.
+// "OQ" comparisons are false if either operand is NaN. Expression macro: the caller adds the ';'.
+#define CMP256( dt, v1, v2 ) \
+	_mm256_or_p##dt( _mm256_cmp_p##dt( v1, v2, _CMP_GT_OQ ),                        /* v1 > v2  ||     */ \
+	                 _mm256_andnot_p##dt( _mm256_cmp_p##dt( v2, v2, _CMP_UNORD_Q ), /* ( !isnan(v2) && */ \
+	                                      _mm256_cmp_p##dt( v1, v1, _CMP_UNORD_Q )  /*    isnan(v1) )  */ \
+	                                    ) \
+	               )
+
+// CMP128( dt, v1, v2, i1, i2 ): 128-bit per-lane mask (dt is the s/d intrinsic suffix), set where:
+// - v1 > v2, or
+// - v1 is NaN and v2 is not, or
+// - v1 == v2 (or both are NaN) and i1 < i2; i1/i2 go through the same FP compare intrinsic.
+// "OQ" comparisons are false if either operand is NaN. Expression macro: the caller adds the ';'.
+#define CMP128( dt, v1, v2, i1, i2 ) \
+	_mm_or_p##dt( _mm_or_p##dt( _mm_cmp_p##dt( v1, v2, _CMP_GT_OQ ),                      /* ( v1 > v2 ||           */ \
+	                            _mm_andnot_p##dt( _mm_cmp_p##dt( v2, v2, _CMP_UNORD_Q ),  /*   ( !isnan(v2) &&      */ \
+	                                              _mm_cmp_p##dt( v1, v1, _CMP_UNORD_Q )   /*      isnan(v1) ) ) ||  */ \
+	                                            ) \
+	                          ), \
+	              _mm_and_p##dt( _mm_or_p##dt( _mm_cmp_p##dt( v1, v2, _CMP_EQ_OQ ),                  /* ( ( v1 == v2 ||        */ \
+	                                           _mm_and_p##dt( _mm_cmp_p##dt( v1, v1, _CMP_UNORD_Q ), /*     ( isnan(v1) &&     */ \
+	                                                          _mm_cmp_p##dt( v2, v2, _CMP_UNORD_Q )  /*       isnan(v2) ) ) && */ \
+	                                                        ) \
+	                                         ), \
+	                             _mm_cmp_p##dt( i1, i2, _CMP_LT_OQ )                                 /*   i1 < i2 )            */ \
+	                           ) \
+	            )
+
 // -----------------------------------------------------------------------------

 void bli_samaxv_zen_int
@@ -122,8 +154,8 @@ void bli_samaxv_zen_int
 			   the previous largest, save it and its index. If NaN is
 			   encountered, then treat it the same as if it were a valid
 			   value that was smaller than any previously seen. This
-			   behavior mimics that of LAPACK's ?lange(). */
-			if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
+			   behavior mimics that of LAPACK's i?amax(). */
+			if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
 			{
 				abs_chi1_max = abs_chi1;
 				i_max_l      = i;
@@ -157,7 +189,7 @@ void bli_samaxv_zen_int
 			// Get the absolute value of the vector element.
 			x_vec.v      = _mm256_andnot_ps( sign_mask.v, x_vec.v );

-			mask_vec.v   = _mm256_cmp_ps( x_vec.v, max_vec.v, _CMP_GT_OS );
+			mask_vec.v   = CMP256( s, x_vec.v, max_vec.v );

 			max_vec.v    = _mm256_blendv_ps( max_vec.v, x_vec.v, mask_vec.v );
 			maxInx_vec.v = _mm256_blendv_ps( maxInx_vec.v, idx_vec.v, mask_vec.v );
@@ -166,33 +198,34 @@ void bli_samaxv_zen_int
 			x         += num_vec_elements;
 		}

-		max_vec_lo.v  = _mm256_extractf128_ps( max_vec.v, 0 );
-		max_vec_hi.v  = _mm256_extractf128_ps( max_vec.v, 1 );
-		mask_vec_lo.v = _mm_cmp_ps( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
-
-		max_vec_lo.v  = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
-
+		max_vec_lo.v    = _mm256_extractf128_ps( max_vec.v, 0 );
+		max_vec_hi.v    = _mm256_extractf128_ps( max_vec.v, 1 );
 		maxInx_vec_lo.v = _mm256_extractf128_ps( maxInx_vec.v, 0 );
 		maxInx_vec_hi.v = _mm256_extractf128_ps( maxInx_vec.v, 1 );
-		maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
-
-		max_vec_hi.v    = _mm_permute_ps( max_vec_lo.v, 14 );
-		maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 );
-		mask_vec_lo.v   = _mm_cmp_ps( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
+		
+		mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );

 		max_vec_lo.v    = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
 		maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );

-		if ( max_vec_lo.f[0] > max_vec_lo.f[1] )
-		{
-			abs_chi1_max = max_vec_lo.f[0];
-			i_max_l      = maxInx_vec_lo.f[0];
-		}
-		else
-		{
-			abs_chi1_max = max_vec_lo.f[1];
-			i_max_l      = maxInx_vec_lo.f[1];
-		}
+		max_vec_hi.v    = _mm_permute_ps( max_vec_lo.v, 14 );
+		maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 );
+		
+		mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
+
+		max_vec_lo.v    = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
+		maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
+
+		max_vec_hi.v    = _mm_permute_ps( max_vec_lo.v, 1 );
+		maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 1 );
+		
+		mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
+
+		max_vec_lo.v    = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
+		maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
+
+		abs_chi1_max = max_vec_lo.f[0];
+		i_max_l      = maxInx_vec_lo.f[0];

 		for ( i = n - n_left; i < n; i++ )
 		{
@@ -208,8 +241,8 @@ void bli_samaxv_zen_int
 			   the previous largest, save it and its index. If NaN is
 			   encountered, then treat it the same as if it were a valid
 			   value that was smaller than any previously seen. This
-			   behavior mimics that of LAPACK's ?lange(). */
-			if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
+			   behavior mimics that of LAPACK's i?amax(). */
+			if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
 			{
 				abs_chi1_max = abs_chi1;
 				i_max_l      = i;
@@ -286,8 +319,8 @@ void bli_damaxv_zen_int
 			   the previous largest, save it and its index. If NaN is
 			   encountered, then treat it the same as if it were a valid
 			   value that was smaller than any previously seen. This
-			   behavior mimics that of LAPACK's ?lange(). */
-			if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
+			   behavior mimics that of LAPACK's i?amax(). */
+			if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
 			{
 				abs_chi1_max = abs_chi1;
 				i_max_l      = i;
@@ -321,7 +354,7 @@ void bli_damaxv_zen_int
 			// Get the absolute value of the vector element.
 			x_vec.v      = _mm256_andnot_pd( sign_mask.v, x_vec.v );

-			mask_vec.v   = _mm256_cmp_pd( x_vec.v, max_vec.v, _CMP_GT_OS );
+			mask_vec.v   = CMP256( d, x_vec.v, max_vec.v );

 			max_vec.v    = _mm256_blendv_pd( max_vec.v, x_vec.v, mask_vec.v );
 			maxInx_vec.v = _mm256_blendv_pd( maxInx_vec.v, idx_vec.v, mask_vec.v );
@@ -330,26 +363,26 @@ void bli_damaxv_zen_int
 			x         += num_vec_elements;
 		}

-		max_vec_lo.v  = _mm256_extractf128_pd( max_vec.v, 0 );
-		max_vec_hi.v  = _mm256_extractf128_pd( max_vec.v, 1 );
-		mask_vec_lo.v = _mm_cmp_pd( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
-
-		max_vec_lo.v  = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
-
+		max_vec_lo.v    = _mm256_extractf128_pd( max_vec.v, 0 );
+		max_vec_hi.v    = _mm256_extractf128_pd( max_vec.v, 1 );
 		maxInx_vec_lo.v = _mm256_extractf128_pd( maxInx_vec.v, 0 );
 		maxInx_vec_hi.v = _mm256_extractf128_pd( maxInx_vec.v, 1 );
+		
+		mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
+
+		max_vec_lo.v    = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
+		maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
+		
+		max_vec_hi.v    = _mm_permute_pd( max_vec_lo.v, 1 );
+		maxInx_vec_hi.v = _mm_permute_pd( maxInx_vec_lo.v, 1 );
+		
+		mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
+
+		max_vec_lo.v    = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
 		maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );

-		if ( max_vec_lo.d[0] > max_vec_lo.d[1] )
-		{
-			abs_chi1_max = max_vec_lo.d[0];
-			i_max_l      = maxInx_vec_lo.d[0];
-		}
-		else
-		{
-			abs_chi1_max = max_vec_lo.d[1];
-			i_max_l      = maxInx_vec_lo.d[1];
-		}
+		abs_chi1_max = max_vec_lo.d[0];
+		i_max_l      = maxInx_vec_lo.d[0];

 		for ( i = n - n_left; i < n; i++ )
 		{
@@ -363,10 +396,9 @@ void bli_damaxv_zen_int

 			/* If the absolute value of the current element exceeds that of
 			   the previous largest, save it and its index. If NaN is
-			   encountered, then treat it the same as if it were a valid
-			   value that was smaller than any previously seen. This
-			   behavior mimics that of LAPACK's ?lange(). */
-			if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
+			   encountered, return the index of the first NaN. This
+			   behavior mimics that of LAPACK's i?amax(). */
+			if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
 			{
 				abs_chi1_max = abs_chi1;
 				i_max_l      = i;
--- a/ref_kernels/1/bli_amaxv_ref.c
+++ b/ref_kernels/1/bli_amaxv_ref.c
@@ -97,7 +97,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			   encountered, then treat it the same as if it were a valid
 			   value that was smaller than any previously seen. This
 			   behavior mimics that of LAPACK's ?lange(). */ \
-			if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
+			if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \
 			{ \
 				abs_chi1_max = abs_chi1; \
 				i_max_l      = i; \
@@ -129,7 +129,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			   encountered, then treat it the same as if it were a valid
 			   value that was smaller than any previously seen. This
 			   behavior mimics that of LAPACK's ?lange(). */ \
-			if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
+			if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \
 			{ \
 				abs_chi1_max = abs_chi1; \
 				i_max_l      = i; \
--- a/sandbox/ref99/packm/blx_l3_packm.c
+++ b/sandbox/ref99/packm/blx_l3_packm.c
@@ -51,7 +51,7 @@ void blx_l3_packm
 	siz_t     size_needed;

 	// FGVZ: Not sure why we need this barrier, but we do.
-	bli_thread_obarrier( thread );
+	bli_thread_barrier( thread );

 	// Every thread initializes x_pack and determines the size of memory
 	// block needed (which gets embedded into the otherwise "blank" mem_t
@@ -102,7 +102,7 @@ void blx_l3_packm

 		// Broadcast the address of the chief thread's local mem_t entry to
 		// all threads.
-		local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
+		local_mem_p = bli_thread_broadcast( thread, &local_mem_s );

 		// Save the contents of the chief thread's local mem_t entry to the
 		// mem_t field in this thread's control tree node.
@@ -142,7 +142,7 @@ void blx_l3_packm

 			// Broadcast the address of the chief thread's local mem_t entry to
 			// all threads.
-			local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
+			local_mem_p = bli_thread_broadcast( thread, &local_mem_s );

 			// Save the chief thread's local mem_t entry to the mem_t field in
 			// this thread's control tree node.
@@ -155,7 +155,7 @@ void blx_l3_packm
 			// will already have the cached values in their local control
 			// trees' mem_t entries, currently pointed to by cntl_mem_p.

-			bli_thread_obarrier( thread );
+			bli_thread_barrier( thread );
 		}
 	}

@@ -178,6 +178,6 @@ void blx_l3_packm
 	);

 	// Barrier so that packing is done before computation.
-	bli_thread_obarrier( thread );
+	bli_thread_barrier( thread );
 }

--- a/sandbox/ref99/vars/blx_gemm_blk_var3.c
+++ b/sandbox/ref99/vars/blx_gemm_blk_var3.c
@@ -73,7 +73,7 @@ void blx_gemm_blk_var3
 		  bli_thrinfo_sub_node( thread )
 		);

-		bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
+		bli_thread_barrier( bli_thrinfo_sub_node( thread ) );

 		// This variant executes multiple rank-k updates. Therefore, if the
 		// internal beta scalar on matrix C is non-zero, we must use it
--- a/test/1m4m/runme.sh
+++ b/test/1m4m/runme.sh
@@ -7,18 +7,22 @@ delay=0.1

 #sys="blis"
 #sys="stampede2"
-sys="lonestar5"
+#sys="lonestar5"
 #sys="ul252"
-#sys="ul264"
+sys="ul264"

 # Bind threads to processors.
 #export OMP_PROC_BIND=true
 #export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23"
 #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103"

+# Optional command prefix used to launch each executable. It is empty by
+# default; the ul264 section has a (commented-out) numactl setting.
+runcmd=""
+
 if [ ${sys} = "blis" ]; then

-	export GOMP_CPU_AFFINITY="0 1 2 3"
+	export GOMP_CPU_AFFINITY="0-3"

 	threads="jc1ic1jr1_2400
 	         jc2ic3jr2_6000
@@ -35,7 +39,7 @@ elif [ ${sys} = "stampede2" ]; then

 elif [ ${sys} = "lonestar5" ]; then

-	export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23"
+	export GOMP_CPU_AFFINITY="0-23"

 	# A hack to use libiomp5 with gcc.
 	#export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64"
@@ -45,12 +49,11 @@ elif [ ${sys} = "lonestar5" ]; then
 	#         jc4ic3jr2_9600"
 	threads="jc1ic1jr1_2400
 	         jc4ic3jr2_7200"
-	threads="jc4ic3jr2_7200"

 elif [ ${sys} = "ul252" ]; then

 	export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
-	export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51"
+	export GOMP_CPU_AFFINITY="0-51"

 	threads="jc1ic1jr1_2400
 	         jc2ic13jr1_6000
@@ -59,12 +62,14 @@ elif [ ${sys} = "ul252" ]; then
 elif [ ${sys} = "ul264" ]; then

 	export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
-	export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63"
+	export GOMP_CPU_AFFINITY="0-63"

+	#threads="jc1ic1jr1_2400"
 	threads="jc1ic1jr1_2400
-	         jc1ic8jr4_6000
-	         jc2ic8jr4_8000"
+	         jc1ic8jr4_4800
+	         jc2ic8jr4_7200"

+	#runcmd="numactl -i all"
 fi

 # Datatypes to test.
@@ -75,34 +80,11 @@ test_dts="s d c z"
 test_ops="gemm"

 # Implementations to test.
-#impls="blis"
-#impls="other"
-#impls="eigen"
-impls="all"
-
-if [ "${impls}" = "blis" ]; then
-
-	test_impls="asm_blis"
-
-elif [ "${impls}" = "eigen" ]; then
-
-	test_impls="eigen"
-
-elif [ "${impls}" = "other" ]; then
-
-	test_impls="openblas vendor"
-
-elif [ "${impls}" = "eigen" ]; then
-
-	test_impls="eigen"
-
-else
-
-	test_impls="openblas vendor asm_blis 4m1a_blis 1m_blis"
-	#test_impls="openblas"
-	#test_impls="asm_blis 4m1a_blis 1m_blis"
-	#test_impls="asm_blis 1m_blis"
-fi
+#test_impls="openblas vendor asm_blis 1m_blis 4m1a_blis"
+#test_impls="asm_blis 1m_blis 4m1a_blis"
+#test_impls="asm_blis"
+#test_impls="4m1a_blis"
+test_impls="asm_blis 4m1a_blis 1m_blis"

 # Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can
 # restore the value.
@@ -181,7 +163,9 @@ for th in ${threads}; do

 					# Set the threading parameters based on the implementation
 					# that we are preparing to run.
-					if   [ "${im}" = "asm_blis" ]; then
+					if   [ "${im}" = "asm_blis"  ] || \
+					     [ "${im}" = "1m_blis"   ] || \
+					     [ "${im}" = "4m1a_blis" ]; then
 						unset  OMP_NUM_THREADS
 						export BLIS_JC_NT=${jc_nt}
 						export BLIS_PC_NT=${pc_nt}
@@ -228,10 +212,12 @@ for th in ${threads}; do
 				out_file="${out_root}_${suf}_${dt}${op}_${im}.m"

 				#echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}"
-				echo "Running ./${exec_name} > ${out_file}"
+				echo "Running: ${runcmd} ./${exec_name} > ${out_file}"

 				# Run executable.
-				./${exec_name} > ${out_file}
+				# ${runcmd} is deliberately unquoted: it word-splits into a command
+				# prefix (e.g. numactl -i all) and vanishes when empty; no eval needed.
+				${runcmd} ./${exec_name} > "${out_file}"

 				sleep ${delay}

--- a/Show More
+++ b/Show More