Merge branch 'master' into dev

This commit is contained in:
Field G. Van Zee
2021-06-13 19:44:14 -05:00
81 changed files with 8915 additions and 79 deletions

View File

@@ -1,40 +1,50 @@
language: c
sudo: required
dist: trusty
dist: focal
matrix:
include:
# full testsuite (all tests except for mixed datatype)
- os: linux
compiler: gcc
env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto"
env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto" \
PACKAGES="gcc-8 binutils"
# mixed-datatype testsuite (gemm_nn only)
- os: linux
compiler: gcc
env: OOT=0 TEST=MD SDE=0 THR="none" CONF="auto"
env: OOT=0 TEST=MD SDE=0 THR="none" CONF="auto" \
PACKAGES="gcc-8 binutils"
# salt testsuite (fast set of operations+parameters)
- os: linux
compiler: gcc
env: OOT=0 TEST=SALT SDE=0 THR="none" CONF="auto"
env: OOT=0 TEST=SALT SDE=0 THR="none" CONF="auto" \
PACKAGES="gcc-8 binutils"
# test x86_64 ukrs with SDE
- os: linux
compiler: gcc
env: OOT=0 TEST=0 SDE=1 THR="none" CONF="x86_64"
env: OOT=0 TEST=0 SDE=1 THR="none" CONF="x86_64" \
PACKAGES="gcc-8 binutils"
# openmp build
- os: linux
compiler: gcc
env: OOT=0 TEST=0 SDE=0 THR="openmp" CONF="auto"
env: OOT=0 TEST=0 SDE=0 THR="openmp" CONF="auto" \
PACKAGES="gcc-8 binutils"
# pthreads build
- os: linux
compiler: gcc
env: OOT=0 TEST=0 SDE=0 THR="pthreads" CONF="auto"
env: OOT=0 TEST=0 SDE=0 THR="pthreads" CONF="auto" \
PACKAGES="gcc-8 binutils"
# out-of-tree build
- os: linux
compiler: gcc
env: OOT=1 TEST=0 SDE=0 THR="none" CONF="auto"
env: OOT=1 TEST=0 SDE=0 THR="none" CONF="auto" \
PACKAGES="gcc-8 binutils"
# clang build
- os: linux
compiler: clang
env: OOT=0 TEST=0 SDE=0 THR="none" CONF="auto"
# There seems to be some difficulty installing 2 Clang toolchains of different versions.
# Use the TravisCI default.
# PACKAGES="clang-8 binutils"
# macOS with system compiler (clang)
- os: osx
compiler: clang
@@ -43,29 +53,23 @@ matrix:
- os: linux
compiler: arm-linux-gnueabihf-gcc
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa15" \
PACKAGES="gcc-arm-linux-gnueabihf qemu-system-arm qemu-user" \
PACKAGES="gcc-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-system-arm qemu-user" \
TESTSUITE_WRAPPER="qemu-arm -cpu cortex-a15 -L /usr/arm-linux-gnueabihf/"
# cortexa57 build and fast testsuite (qemu)
- os: linux
compiler: aarch64-linux-gnu-gcc
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa57" \
PACKAGES="gcc-aarch64-linux-gnu qemu-system-arm qemu-user" \
PACKAGES="gcc-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
TESTSUITE_WRAPPER="qemu-aarch64 -L /usr/aarch64-linux-gnu/"
# armsve build and fast testsuite (qemu)
- os: linux
compiler: aarch64-linux-gnu-gcc-10
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="armsve" \
PACKAGES="gcc-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
TESTSUITE_WRAPPER="qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/"
install:
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/as; fi
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/as /usr/bin/as; fi
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/ld; fi
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/ld /usr/bin/ld; fi
- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-6"; fi
- if [ -n "$PACKAGES" ]; then sudo apt-get install -y $PACKAGES; fi
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-6
- binutils-2.26
- clang
- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-8"; fi
- if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi
script:
- export DIST_PATH=.
- pwd
@@ -76,5 +80,7 @@ script:
- ls -l
- $CC --version
- make -j 2
# Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed on real chip (A64fx).
- if [ "$CONF" = "armsve" ]; then sed -i 's/.*\<gemmt\>.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi
- if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi
- if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi

View File

@@ -104,6 +104,7 @@ but many others have contributed code and feedback, including
Costas Yamin @cosstas
Chenhan Yu @ChenhanYu (The University of Texas at Austin)
Roman Yurchak @rth (Symerio)
Stefano Zampini @stefanozampini
M. Zhou @cdluminate
BLIS's development was partially funded by grants from industry

View File

@@ -461,7 +461,7 @@ endif
flat-header: check-env $(BLIS_H_FLAT)
$(BLIS_H_FLAT): $(FRAME_H99_FILES)
$(BLIS_H_FLAT): $(ALL_H99_FILES)
ifeq ($(ENABLE_VERBOSE),yes)
$(FLATTEN_H) -c -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
else

View File

@@ -13,6 +13,7 @@ Contents
* **[Key Features](#key-features)**
* **[How to Download BLIS](#how-to-download-blis)**
* **[Getting Started](#getting-started)**
* **[Performance](#performance)**
* **[Documentation](#documentation)**
* **[External Packages](#external-packages)**
* **[Discussion](#discussion)**
@@ -393,6 +394,24 @@ If/when you have time, we *strongly* encourage you to read the detailed
walkthrough of the build system found in our [Build System](docs/BuildSystem.md)
guide.
Performance
-----------
We provide graphs that report performance of several implementations across a
range of hardware types, multithreading configurations, problem sizes,
operations, and datatypes. These pages also document most of the details needed
to reproduce these experiments.
* **[Performance](docs/Performance.md).** This document reports empirically
measured performance of a representative set of level-3 operations on a variety
of hardware architectures, as implemented within BLIS and other BLAS libraries
for all four of the standard floating-point datatypes.
* **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports
empirically measured performance of `gemm` on select hardware architectures
within BLIS and other BLAS libraries when performing matrix problems where one
or two dimensions is exceedingly small.
Documentation
-------------

View File

@@ -202,12 +202,6 @@ get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)"
files-that-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),$(f),)))
files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),,$(f))))
# Define a function that removes duplicate words from a list.
# NOTE: This function was obtained via [1]; thanks bobbogo for this
# concise definition.
# [1] https://stackoverflow.com/questions/16144115/makefile-remove-duplicate-words-without-sorting
rm-dupls = $(if $1,$(firstword $1) $(call rm-dupls,$(filter-out $(firstword $1),$1)))
#
# --- Include makefile configuration file --------------------------------------

View File

@@ -0,0 +1,117 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Forschungszentrum Juelich
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// A64FX: set up cache sizes
//
// Reference: A64FX (TM) specification Fujitsu HPC Extension
// Link: https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_Specification_HPC_Extension_v1_EN.pdf
//
// 63:15 | 14:12 | 11 | 10:08 | 07 | 06:04 | 03 | 02:00 |
// RES0 | l1_sec3_max | RES0 | l1_sec2_max | RES0 | l1_sec1_max | RES0 | l1_sec0_max |
//
// the bits set number of maximum sectors from 0-7
// 000 - 0
// 001 - 1
// 010 - 2
// 011 - 3
// 100 - 4
// 101 - 5
// 110 - 6
// 111 - 7
//
// For L1 we want to maximize the number of sectors for B
// Configuration 1: 1 sector for C (sector 3)
// 1 sector for A (sector 1)
// 6 sectors for B (sector 2)
// 0 sectors for the rest (sector 0)
//
// 16b bitfield conf. 1: 0b0 001 0 110 0 001 0 000
//
// Configuration 2: 1 sector for C (sector 3)
// 1 sector for A (sector 1)
// 5 sectors for B (sector 2)
// 1 sectors for the rest (sector 0)
//
// 16b bitfield conf. 2: 0b0 001 0 101 0 001 0 001
//
// accessing the control register:
//
// MRS <Xt>, S3_3_C11_C8_2
// MSR S3_3_C11_C8_2, <Xt>
//
// TODO: First tests showed no change in performance, a deeper investigation
// is necessary
// Write the 16-bit L1 sector-cache maximum-capacity bitfield (layout
// documented above) to system register s3_3_c11_c8_2.
// NOTE(review): assumes EL0 access to this IMPLEMENTATION DEFINED register
// is permitted by the OS -- confirm on the target system (see the Fugaku
// caveat at the use site).
#define A64FX_SETUP_SECTOR_CACHE_SIZES(config_bitfield)\
{\
uint64_t sector_cache_config = config_bitfield;\
__asm__ volatile(\
"msr s3_3_c11_c8_2,%[sector_cache_config]"\
:\
: [sector_cache_config] "r" (sector_cache_config)\
:\
);\
}
// Same as above, but writes the L2 sector-cache configuration via
// system register s3_3_c15_c8_2.
#define A64FX_SETUP_SECTOR_CACHE_SIZES_L2(config_bitfield)\
{\
uint64_t sector_cache_config = config_bitfield;\
__asm__ volatile(\
"msr s3_3_c15_c8_2,%[sector_cache_config]"\
:\
: [sector_cache_config] "r" (sector_cache_config)\
:\
);\
}
// Assembly-text helper: tag the address in register `areg` with sector id
// `tag` by shifting the tag into bits 63:56 (the address top byte) and
// OR-ing it in; `sparereg` is used as scratch.
#define A64FX_SET_CACHE_SECTOR(areg, tag, sparereg)\
" mov "#sparereg", "#tag" \n\t"\
" lsl "#sparereg", "#sparereg", 56 \n\t"\
" orr "#areg", "#areg", "#sparereg" \n\t"
// Read the current L1 sector-cache configuration register back into the
// given uint64_t lvalue (inverse of A64FX_SETUP_SECTOR_CACHE_SIZES).
#define A64FX_READ_SECTOR_CACHE_SIZES(output_uint64)\
__asm__ volatile(\
"mrs %["#output_uint64"],s3_3_c11_c8_2"\
: [output_uint64] "=r" (output_uint64)\
: \
:\
);
// Pack four 3-bit L1 per-sector maxima (sec0..sec3) into the bitfield
// layout documented above: bits 2:0, 6:4, 10:8, 14:12.
#define A64FX_SCC(sec0,sec1,sec2,sec3)\
(uint64_t)((sec0 & 0x7LU) | ((sec1 & 0x7LU) << 4) | ((sec2 & 0x7LU) << 8) | ((sec3 & 0x7LU) << 12))
// Pack the two 5-bit L2 values (parameter names suggest they cover
// sector pairs 0/2 and 1/3) into bits 4:0 and 12:8.
#define A64FX_SCC_L2(sec02,sec13)\
(uint64_t)((sec02 & 0x1FLU) | ((sec13 & 0x1FLU) << 8))

View File

@@ -0,0 +1,151 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "bli_a64fx_sector_cache.h"
// Initialize a BLIS context for the Fujitsu A64FX (SVE-512) using fixed
// blocksizes and the vector-length-agnostic SVE gemm microkernels.
void bli_cntx_init_a64fx( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
// NOTE: thresh[] is referenced only inside the "#if 0" sup section
// below; it is unused in the active build.
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_a64fx_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
cntx
);
// Set SVE-512 packing routine.
bli_cntx_set_packm_kers
(
3,
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// MR is two SVE-512 vectors (32 floats / 16 doubles) and NR is 10,
// matching the 2vx10 microkernels registered above.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
// The sup (small/unpacked) path below is disabled pending tuning.
#if 0
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
4,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// NOTE: the sup kernel is 10x2v -- the transpose of the native 2vx10 --
// hence MR = 10 and NR = 16 here.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
#endif
// Set A64FX cache sector sizes for each PE/CMG
// SC Fugaku might disable users' setting cache sizes.
// NOTE(review): the MSR is issued inside an OpenMP parallel region,
// presumably so every thread configures its own core's register;
// this relies on threads being pinned to distinct PEs -- confirm.
#if !defined(CACHE_SECTOR_SIZE_READONLY)
#pragma omp parallel
{
A64FX_SETUP_SECTOR_CACHE_SIZES(A64FX_SCC(0,1,3,0))
A64FX_SETUP_SECTOR_CACHE_SIZES_L2(A64FX_SCC_L2(9,28))
}
#endif
}

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// NOTE(review): include guards are commented out -- presumably this header
// is included exactly once via the flattened blis.h; confirm before
// re-enabling.
// -- MEMORY ALLOCATION --------------------------------------------------------
// Alignment (in bytes) for SIMD-related allocations.
#define BLIS_SIMD_ALIGN_SIZE 256
#define BLIS_SIMD_NUM_REGISTERS 32
//#endif

82
config/a64fx/make_defs.mk Normal file
View File

@@ -0,0 +1,82 @@
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
THIS_CONFIG := a64fx
#CONFIGS_INCL += $(THIS_CONFIG)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
# -D_A64FX presumably guards A64FX-specific code paths in the framework
# sources -- verify against its consumers.
CPPROCFLAGS := -D_GNU_SOURCE -D_A64FX
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
# -march=armv8-a+sve enables SVE code generation, which this
# configuration's kernels require.
COPTFLAGS := -O3 -ftree-vectorize -march=armv8-a+sve
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
CKVECFLAGS :=
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
# Reference kernels are built with relaxed FP semantics and FMA
# contraction where the compiler vendor is known to support the flags.
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif
endif
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -0,0 +1,92 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Forschungszentrum Juelich
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Query the vector length of the SVE implementation at runtime.
//
// NOTE: `incb x0` increments x0 by the number of *bytes* in one SVE
// vector (x0 starts at zero), so despite the "_bits" suffix this
// routine returns the vector length in bytes; callers divide the
// result by sizeof(element) to obtain the number of lanes.
dim_t bli_vl_bits_armsve(void)
{
// Fix: removed a stray "\" line continuation after the opening brace,
// left over from a macro-ized version of this function.
uint64_t vl = 0;
// "volatile" keeps the query from being elided or reordered; x0 is
// declared as clobbered scratch.
__asm__ volatile (
" mov x0, xzr \n\t"
" incb x0 \n\t"
" mov %[vl], x0 \n\t"
: [vl] "=r" (vl)
:
: "x0"
);
return vl;
}
// Generate void bli_?_blksz_armsve( m_r_, n_r_, k_c_, m_c_, n_c_ ) for
// datatype character `ch` with element size `S_Data` in bytes.
//
// Blocksizes are derived analytically from the cache geometry (in the
// spirit of the analytical blocking model): for each level Lx,
// W_Lx = associativity (ways), N_Lx = number of sets, C_Lx = line size
// in bytes. Defaults come from bli_family_armsve.h and may be
// overridden at runtime via the BLIS_SVE_{W,N,C}_L{1,2,3} environment
// variables.
//
// m_r is two SVE vectors of elements and n_r = 10, matching the 2vx10
// gemm microkernels; k_c is sized against L1, m_c (packed A block)
// against L2, and n_c (packed B block) against L3, each rounded down to
// a multiple of the corresponding register blocksize.
// NOTE(review): bli_vl_bits_armsve() appears to return the VL in
// *bytes* (see `incb` in its implementation), so vl below is the
// element count per vector.
#define EXPANDMAC_BLKSZ_ARMSVE(ch, S_Data) \
void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \
dim_t *k_c_, dim_t *m_c_, dim_t *n_c_) \
{ \
dim_t W_L1 = bli_env_get_var("BLIS_SVE_W_L1", W_L1_SVE_DEFAULT); \
dim_t N_L1 = bli_env_get_var("BLIS_SVE_N_L1", N_L1_SVE_DEFAULT); \
dim_t C_L1 = bli_env_get_var("BLIS_SVE_C_L1", C_L1_SVE_DEFAULT); \
dim_t W_L2 = bli_env_get_var("BLIS_SVE_W_L2", W_L2_SVE_DEFAULT); \
dim_t N_L2 = bli_env_get_var("BLIS_SVE_N_L2", N_L2_SVE_DEFAULT); \
dim_t C_L2 = bli_env_get_var("BLIS_SVE_C_L2", C_L2_SVE_DEFAULT); \
dim_t W_L3 = bli_env_get_var("BLIS_SVE_W_L3", W_L3_SVE_DEFAULT); \
dim_t N_L3 = bli_env_get_var("BLIS_SVE_N_L3", N_L3_SVE_DEFAULT); \
dim_t C_L3 = bli_env_get_var("BLIS_SVE_C_L3", C_L3_SVE_DEFAULT); \
\
dim_t vl_b = bli_vl_bits_armsve(); \
dim_t vl = vl_b / S_Data; \
dim_t m_r = 2 * vl; \
dim_t n_r = 10; \
\
dim_t k_c = (dim_t)( floor((W_L1 - 1.0)/(1.0 + (double)n_r/m_r)) * N_L1 * C_L1 ) \
/ (n_r * S_Data); \
\
dim_t C_Ac = W_L2 - 1 - ceil( (2.0 * k_c * n_r * S_Data)/(C_L2 * N_L2) ); \
dim_t m_c = C_Ac * (N_L2 * C_L2)/(k_c * S_Data); \
m_c -= m_c % m_r; \
\
dim_t C_Bc = W_L3 - 1 - ceil( (2.0 * k_c * m_c * S_Data)/(C_L3 * N_L3) ); \
dim_t n_c = C_Bc * (N_L3 * C_L3)/(k_c * S_Data); \
n_c -= n_c % n_r; \
\
*m_r_ = m_r; \
*n_r_ = n_r; \
*k_c_ = k_c; \
*m_c_ = m_c; \
*n_c_ = n_c; \
}
// Instantiate for single (4-byte) and double (8-byte) real.
EXPANDMAC_BLKSZ_ARMSVE( s, 4 )
EXPANDMAC_BLKSZ_ARMSVE( d, 8 )

View File

@@ -0,0 +1,42 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Forschungszentrum Juelich
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Runtime query of the SVE vector length. NOTE(review): the
// implementation uses `incb`, which counts *bytes*, so this appears to
// return bytes despite the "_bits" name -- confirm before relying on it.
dim_t bli_vl_bits_armsve(void);
// Analytically-derived level-3 blocksizes for single/double real,
// computed from the cache geometry (BLIS_SVE_* environment variables
// override the compiled-in defaults).
void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);

View File

@@ -0,0 +1,157 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "bli_armsve_config_utils.h"
// Initialize a BLIS context for generic Arm SVE hardware. Blocksizes are
// computed at runtime from the actual vector length and cache geometry
// (see bli_armsve_config_utils.h), unlike the fixed-size a64fx config.
void bli_cntx_init_armsve( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
#if 0
blksz_t thresh[ BLIS_NUM_THRESH ];
#endif
// Set default kernel blocksizes and functions.
bli_cntx_init_armsve_ref( cntx );
// -------------------------------------------------------------------------
// Block size.
// Query runtime-derived blocksizes for single and double precision.
dim_t m_r_s, n_r_s, k_c_s, m_c_s, n_c_s;
dim_t m_r_d, n_r_d, k_c_d, m_c_d, n_c_d;
bli_s_blksz_armsve(&m_r_s, &n_r_s, &k_c_s, &m_c_s, &n_c_s);
bli_d_blksz_armsve(&m_r_d, &n_r_d, &k_c_d, &m_c_d, &n_c_d);
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
2,
// These are vector-length agnostic kernels. Yet knowing mr is required at runtime.
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
cntx
);
// Set VL-specific packing routines if applicable.
// m_r_d == 16 corresponds to SVE-512; m_r_d == 8 to SVE-256. Other
// vector lengths fall back to the reference packing kernels.
if (m_r_d==16)
bli_cntx_set_packm_kers
(
3,
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
cntx
);
else if (m_r_d==8)
bli_cntx_set_packm_kers
(
1,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_asm_8xk,
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
// The sup (small/unpacked) path below is disabled pending tuning.
#if 0
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
4,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// NOTE: the sup kernel is 10x2v -- the transpose of the native 2vx10 --
// so sup MR/NR deliberately take the native n_r_d/m_r_d values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
#endif
}

View File

@@ -0,0 +1,56 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE(review): the include guard is intentionally commented out in this
// family header as provided; confirm whether multiple inclusion is possible
// before relying on it.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// Alignment, in bytes, used for SIMD-related memory allocations.
#define BLIS_SIMD_ALIGN_SIZE 256
// Number of vector registers assumed available to kernels.
#define BLIS_SIMD_NUM_REGISTERS 32
// SVE-specific configs.
// Cache geometry defaults consumed by the SVE configuration. The values
// below are consistent with N_* = number of sets, W_* = associativity
// (ways), and C_* = line size in bytes (e.g. L1: 64 sets x 4 ways x 256 B
// = 64 KiB; L2: 2048 x 16 x 256 B = 8 MiB) -- presumably; TODO confirm
// against the code that reads these macros (not visible in this file).
#define N_L1_SVE_DEFAULT 64
#define W_L1_SVE_DEFAULT 4
#define C_L1_SVE_DEFAULT 256
#define N_L2_SVE_DEFAULT 2048
#define W_L2_SVE_DEFAULT 16
#define C_L2_SVE_DEFAULT 256
#define N_L3_SVE_DEFAULT 8192
#define W_L3_SVE_DEFAULT 16
#define C_L3_SVE_DEFAULT 256
//#endif

View File

@@ -0,0 +1,82 @@
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
THIS_CONFIG    := armsve
#CONFIGS_INCL += $(THIS_CONFIG)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS := -D_GNU_SOURCE
CMISCFLAGS  :=
CPICFLAGS   :=
CWARNFLAGS  :=
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS   := -g
endif
ifeq ($(DEBUG_TYPE),noopt)
# NOTE: -march=armv8-a+sve must remain enabled even with optimization
# disabled; otherwise kernels written with SVE intrinsics/assembly fail
# to compile under DEBUG_TYPE=noopt.
COPTFLAGS   := -O0 -march=armv8-a+sve
else
COPTFLAGS   := -O3 -ftree-vectorize -march=armv8-a+sve
endif
# Flags specific to optimized kernels.
CKOPTFLAGS  := $(CKOPTFLAGS)$(COPTFLAGS)
CKVECFLAGS  :=
# Flags specific to reference kernels.
CROPTFLAGS  := $(CKOPTFLAGS)
# gcc and clang both accept the same relaxed floating-point flags for
# reference kernels; any other compiler vendor gets no extra flags.
ifneq (,$(filter gcc clang,$(CC_VENDOR)))
CRVECFLAGS  := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS  := $(CKVECFLAGS)
endif
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -32,6 +32,8 @@ piledriver: piledriver
bulldozer: bulldozer
# ARM architectures.
armsve: armsve/armsve
a64fx: a64fx/armsve
thunderx2: thunderx2/armv8a
cortexa57: cortexa57/armv8a
cortexa53: cortexa53/armv8a

5
configure vendored
View File

@@ -2373,6 +2373,11 @@ main()
fi
echo "${script_name}: using '${found_cc}' C compiler."
# Also check the compiler to see if we are (cross-)compiling for Windows
if ${found_cc} -dM -E - < /dev/null 2> /dev/null | grep -q _WIN32; then
is_win=yes
fi
# -- Find a C++ compiler ---------------------------------------------------

View File

@@ -154,7 +154,7 @@ Originally, BLIS did indeed require the application to explicitly setup (initial
Yes! BLIS supports multithreading (via OpenMP or POSIX threads) for all of its level-3 operations. For more information on enabling and controlling multithreading, please see the [Multithreading](Multithreading.md) guide.
BLIS is also thread-safe so that you can call BLIS from threads within a multithreaded library or application. BLIS derives is thread-safety via unconditional use of features present in POSIX threads (pthreads). These pthreads features are employed for thread-safety regardless of whether BLIS is configured for OpenMP multithreading, pthreads multithreading, or single-threaded execution.
BLIS is also thread-safe so that you can call BLIS from threads within a multithreaded library or application. BLIS derives its thread-safety via unconditional use of features present in POSIX threads (pthreads). These pthreads features are employed for thread-safety regardless of whether BLIS is configured for OpenMP multithreading, pthreads multithreading, or single-threaded execution.
### Does BLIS support NUMA environments?

View File

@@ -21,6 +21,9 @@
* **[Zen2](Performance.md#zen2)**
* **[Experiment details](Performance.md#zen2-experiment-details)**
* **[Results](Performance.md#zen2-results)**
* **[A64fx](Performance.md#a64fx)**
* **[Experiment details](Performance.md#a64fx-experiment-details)**
* **[Results](Performance.md#a64fx-results)**
* **[Feedback](Performance.md#feedback)**
# Introduction
@@ -526,6 +529,78 @@ The `runthese.m` file will contain example invocations of the function.
---
## A64fx
### A64fx experiment details
* Location: RIKEN Center for Computational Science in Kobe, Japan
* These test results were gathered on the Fugaku supercomputer under project "量子物質の創発と機能のための基礎科学 ―「富岳」と最先端実験の密連携による革新的強相関電子科学" (hp200132) (Basic Science for Emergence and Functionality in Quantum Matter: Innovative Strongly-Correlated Electron Science by Integration of "Fugaku" and Frontier Experiments)
* Processor model: Fujitsu A64fx
* Core topology: one socket, 4 NUMA groups per socket, 13 cores per group (one reserved for the OS), 48 cores total
* SMT status: Unknown
* Max clock rate: 2.2GHz (single- and multicore, observed)
* Max vector register length: 512 bits (SVE)
* Max FMA vector IPC: 2
* Peak performance:
* single-core: 70.4 GFLOPS (double-precision), 140.8 GFLOPS (single-precision)
* multicore: 70.4 GFLOPS/core (double-precision), 140.8 GFLOPS/core (single-precision)
* Operating system: RHEL 8.3
* Page size: 256 bytes
* Compiler: gcc 10.1.0
* Results gathered: 2 April 2021; BLIS and SSL2 updated on 20 May 2021
* Implementations tested:
* BLIS 61584de (post-0.8.1)
* configured with:
* `../configure -t none CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (single-threaded)
* `../configure -t openmp CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (multithreaded)
* sub-configuration exercised: `a64fx`
* Single-threaded (1 core) execution requested via no change in environment variables
* Multithreaded (12 core) execution requested via `export BLIS_JC_NT=1 BLIS_IC_NT=1 BLIS_JR_NT=12`
* Multithreaded (48 core) execution requested via `export BLIS_JC_NT=1 BLIS_IC_NT=4 BLIS_JR_NT=12`
* Eigen 3.3.9
* Obtained via the [Eigen GitLab homepage](https://gitlab.com/libeigen/eigen)
* configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
* installed headers via `cmake . -DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install`
* The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12`
* Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48`
* **NOTE**: This version of Eigen does not provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm`, and therefore those curves are omitted from the multithreaded graphs.
* ARMPL (20.1.0 for A64fx)
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12`
* Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48`
* **NOTE**: While this version of ARMPL does provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, and `trsm` (with the exception of `dtrsm`), these implementations yield very low performance, and their long run times led us to skip collecting these data altogether.
* Fujitsu SSL2 (Fujitsu toolchain 1.2.31)
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1 NPARALLEL=1`
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12 NPARALLEL=12`
* Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48 NPARALLEL=48`
* Affinity:
* Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="12-23 24-35 36-47 48-59"`.
* All executables were run through `numactl --interleave=all` (multithreaded only).
* Frequency throttling: No change made. No frequency lowering observed.
* Comments:
* Special thanks to Stepan Nassyr and RuQing G. Xu for their work in developing and optimizing A64fx support. Also, thanks to RuQing G. Xu for collecting the data that appear in these graphs.
### A64fx results
#### pdf
* [A64fx single-threaded](graphs/large/l3_perf_a64fx_nt1.pdf)
* [A64fx multithreaded (12 cores)](graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf)
* [A64fx multithreaded (48 cores)](graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf)
#### png (inline)
* **A64fx single-threaded**
![single-threaded](graphs/large/l3_perf_a64fx_nt1.png)
* **A64fx multithreaded (12 cores)**
![multithreaded (12 cores)](graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png)
* **A64fx multithreaded (48 cores)**
![multithreaded (48 cores)](graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png)
---
# Feedback
Please let us know what you think of these performance results! Similarly, if you have any questions or concerns, or are interested in reproducing these performance experiments on your own hardware, we invite you to [open an issue](https://github.com/flame/blis/issues) and start a conversation with BLIS developers.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 250 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 260 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 250 KiB

View File

@@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \
} \
else /* if ( will_pack == TRUE ) */ \
{ \
/* NOTE: This is "rounding up" of the last upanel is actually optional
/* NOTE: This "rounding up" of the last upanel is actually optional
for the rrc/crc cases, but absolutely necessary for the other cases
since we NEED that last micropanel to have the same ldim (cs_p) as
the other micropanels. Why? So that millikernels can use the same

View File

@@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \
} \
else /* if ( will_pack == TRUE ) */ \
{ \
/* NOTE: This is "rounding up" of the last upanel is actually optional
/* NOTE: This "rounding up" of the last upanel is actually optional
for the rrc/crc cases, but absolutely necessary for the other cases
since we NEED that last micropanel to have the same ldim (cs_p) as
the other micropanels. Why? So that millikernels can use the same
@@ -280,15 +280,15 @@ void PASTEMAC(ch,opname) \
} \
else \
{ \
/* All other stor3_t ids: pack A to column-stored row-panels. */ \
/* All other stor3_t ids: pack B to row-stored column-panels. */ \
*rs_p = nr; \
*cs_p = 1; \
\
*pd_p = nr; \
*ps_p = k * nr; \
\
/* Set the schema to "packed row panels" to indicate packing to
conventional column-stored row panels. */ \
/* Set the schema to "packed column panels" to indicate packing to
conventional row-stored column panels. */ \
*schema = BLIS_PACKED_COL_PANELS; \
} \
\

View File

@@ -173,6 +173,12 @@ void bli_arch_set_id( void )
#endif
// ARM microarchitectures.
#ifdef BLIS_FAMILY_ARMSVE
id = BLIS_ARCH_ARMSVE;
#endif
#ifdef BLIS_FAMILY_A64FX
id = BLIS_ARCH_A64FX;
#endif
#ifdef BLIS_FAMILY_THUNDERX2
id = BLIS_ARCH_THUNDERX2;
#endif
@@ -242,6 +248,8 @@ static char* config_name[ BLIS_NUM_ARCHS ] =
"thunderx2",
"cortexa57",
"cortexa53",
"armsve",
"a64fx",
"cortexa15",
"cortexa9",

View File

@@ -76,7 +76,7 @@ arch_t bli_cpuid_query_id( void )
printf( "vendor = %s\n", vendor==1 ? "AMD": "INTEL" );
printf("family = %x\n", family );
printf( "model = %x\n", model );
printf( "features = %x\n", features );
#endif
@@ -455,6 +455,14 @@ arch_t bli_cpuid_query_id( void )
{
// Check for each ARMv8 configuration that is enabled, check for that
// microarchitecture. We check from most recent to most dated.
#ifdef BLIS_CONFIG_ARMSVE
if ( bli_cpuid_is_armsve( model, part, features ) )
return BLIS_ARCH_ARMSVE;
#endif
#ifdef BLIS_CONFIG_A64FX
if ( bli_cpuid_is_a64fx( model, part, features ) )
return BLIS_ARCH_A64FX;
#endif
#ifdef BLIS_CONFIG_THUNDERX2
if ( bli_cpuid_is_thunderx2( model, part, features ) )
return BLIS_ARCH_THUNDERX2;
@@ -537,6 +545,36 @@ bool bli_cpuid_is_cortexa53
return TRUE;
}
// Returns TRUE if the CPU described by the given identification fields can
// run the 'armsve' subconfiguration.
// NOTE: Only the feature bits are inspected; 'model' and 'part' are unused
// but kept for signature consistency with the prototype in bli_cpuid.h
// (which names the parameters model/part/features) and with the other ARM
// predicates such as bli_cpuid_is_thunderx2().
bool bli_cpuid_is_armsve
     (
       uint32_t model,
       uint32_t part,
       uint32_t features
     )
{
	// The armsve subconfig requires only that the CPU support SVE.
	const uint32_t expected = FEATURE_SVE;

	if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;

	return TRUE;
}
// Returns TRUE if the CPU described by the given identification fields can
// run the 'a64fx' subconfiguration.
// NOTE: Like bli_cpuid_is_armsve(), this currently checks only for SVE
// support; 'model' and 'part' are unused but kept for signature consistency
// with the prototype in bli_cpuid.h (model/part/features). It does NOT
// verify that the part number is actually an A64fx -- TODO confirm whether
// a stricter check is intended.
bool bli_cpuid_is_a64fx
     (
       uint32_t model,
       uint32_t part,
       uint32_t features
     )
{
	// The a64fx subconfig requires that the CPU support SVE.
	const uint32_t expected = FEATURE_SVE;

	if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;

	return TRUE;
}
bool bli_cpuid_is_cortexa15
(
uint32_t family,
@@ -1032,6 +1070,10 @@ uint32_t bli_cpuid_query
strstr( feat_str, "asimd" ) != NULL )
*features |= FEATURE_NEON;
// Parse the feature string to check for SVE features.
if ( strstr( feat_str, "sve" ) != NULL )
*features |= FEATURE_SVE;
//printf( "bli_cpuid_query(): features var: %u\n", *features );
// Parse the processor string to uncover the model.

View File

@@ -72,6 +72,8 @@ bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );
@@ -175,7 +177,8 @@ enum
};
enum
{
FEATURE_NEON = 0x1
FEATURE_NEON = 0x01,
FEATURE_SVE = 0x02
};
#endif

View File

@@ -144,6 +144,16 @@ void bli_gks_init( void )
bli_cntx_init_cortexa53_ref,
bli_cntx_init_cortexa53_ind );
#endif
#ifdef BLIS_CONFIG_ARMSVE
bli_gks_register_cntx( BLIS_ARCH_ARMSVE, bli_cntx_init_armsve,
bli_cntx_init_armsve_ref,
bli_cntx_init_armsve_ind );
#endif
#ifdef BLIS_CONFIG_A64FX
bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx,
bli_cntx_init_a64fx_ref,
bli_cntx_init_a64fx_ind );
#endif
#ifdef BLIS_CONFIG_CORTEXA15
bli_gks_register_cntx( BLIS_ARCH_CORTEXA15, bli_cntx_init_cortexa15,
bli_cntx_init_cortexa15_ref,

View File

@@ -1,12 +1,14 @@
/* dlamch.f -- translated by f2c (version 19991025).
You must link the resulting object file with the libraries:
-lf2c -lm (in that order)
*/
#include "blis.h"
#include <float.h>
#include <fenv.h>
#include <ctype.h>
#ifdef __cplusplus
extern "C" {
#endif
#include "blis.h"
#ifdef BLIS_ENABLE_LEGACY_LAMCH
double bli_pow_di( bla_double* a, bla_integer* n );
@@ -1027,6 +1029,59 @@ L10:
} /* bli_dlamc5_ */
#ifdef __cplusplus
#else
/* Returns double-precision machine parameters, selected by *cmach
   (case-insensitive; cmach_len is the Fortran hidden string length and
   is unused):

      = 'E' or 'e',   DLAMCH := eps   (relative machine precision)
      = 'S' or 's',   DLAMCH := sfmin (safe minimum: 1/sfmin does not overflow)
      = 'B' or 'b',   DLAMCH := base  (base of the machine)
      = 'P' or 'p',   DLAMCH := eps*base
      = 'N' or 'n',   DLAMCH := t     (number of base digits in the mantissa)
      = 'R' or 'r',   DLAMCH := rnd   (1.0 when rounding occurs in addition)
      = 'M' or 'm',   DLAMCH := emin  (minimum exponent before gradual underflow)
      = 'U' or 'u',   DLAMCH := rmin  (underflow threshold: base**(emin-1))
      = 'L' or 'l',   DLAMCH := emax  (largest exponent before overflow)
      = 'O' or 'o',   DLAMCH := rmax  (overflow threshold: (base**emax)*(1-eps))

   Any other selector returns 0.0, matching LAPACK's permissive behavior. */
bla_double bli_dlamch(bla_character *cmach, ftnlen cmach_len)
{
	double safe_min = DBL_MIN;
	double small    = 1.0 / DBL_MAX;

	/* If 1/rmax rounds up to (or above) rmin, nudge sfmin upward so that
	   1/sfmin cannot overflow. */
	if ( small >= safe_min )
		safe_min = small * ( 1.0 + DBL_EPSILON );

	switch ( toupper( *cmach ) )
	{
		case 'E': return DBL_EPSILON;
		case 'S': return safe_min;
		case 'B': return FLT_RADIX;
		case 'P': return FLT_RADIX*DBL_EPSILON;
		case 'N': return DBL_MANT_DIG;
		/* FLT_ROUNDS encodes round-to-nearest as the value 1. FE_TONEAREST
		   is an unrelated, implementation-defined <fenv.h> constant (0 on
		   glibc/x86), so comparing FLT_ROUNDS against it was incorrect. */
		case 'R': return FLT_ROUNDS == 1 ? 1.0 : 0.0;
		case 'M': return DBL_MIN_EXP;
		case 'U': return DBL_MIN;
		case 'L': return DBL_MAX_EXP;
		case 'O': return DBL_MAX;
	}
	return 0.0;
}
#endif
#ifdef __cplusplus
}
#endif

View File

@@ -1,12 +1,14 @@
/* slamch.f -- translated by f2c (version 19991025).
You must link the resulting object file with the libraries:
-lf2c -lm (in that order)
*/
#include "blis.h"
#include <float.h>
#include <fenv.h>
#include <ctype.h>
#ifdef __cplusplus
extern "C" {
#endif
#include "blis.h"
#ifdef BLIS_ENABLE_LEGACY_LAMCH
double bli_pow_ri( bla_real* a, bla_integer* n );
@@ -1022,6 +1024,59 @@ L10:
} /* bli_slamc5_ */
#ifdef __cplusplus
#else
/* Returns single-precision machine parameters, selected by *cmach
   (case-insensitive; cmach_len is the Fortran hidden string length and
   is unused):

      = 'E' or 'e',   SLAMCH := eps   (relative machine precision)
      = 'S' or 's',   SLAMCH := sfmin (safe minimum: 1/sfmin does not overflow)
      = 'B' or 'b',   SLAMCH := base  (base of the machine)
      = 'P' or 'p',   SLAMCH := eps*base
      = 'N' or 'n',   SLAMCH := t     (number of base digits in the mantissa)
      = 'R' or 'r',   SLAMCH := rnd   (1.0 when rounding occurs in addition)
      = 'M' or 'm',   SLAMCH := emin  (minimum exponent before gradual underflow)
      = 'U' or 'u',   SLAMCH := rmin  (underflow threshold: base**(emin-1))
      = 'L' or 'l',   SLAMCH := emax  (largest exponent before overflow)
      = 'O' or 'o',   SLAMCH := rmax  (overflow threshold: (base**emax)*(1-eps))

   Any other selector returns 0.0f, matching LAPACK's permissive behavior. */
bla_real bli_slamch(bla_character *cmach, ftnlen cmach_len)
{
	float safe_min = FLT_MIN;
	float small    = 1.0f / FLT_MAX;

	/* If 1/rmax rounds up to (or above) rmin, nudge sfmin upward so that
	   1/sfmin cannot overflow. */
	if ( small >= safe_min )
		safe_min = small * ( 1.0f + FLT_EPSILON );

	switch ( toupper( *cmach ) )
	{
		case 'E': return FLT_EPSILON;
		case 'S': return safe_min;
		case 'B': return FLT_RADIX;
		case 'P': return FLT_RADIX*FLT_EPSILON;
		case 'N': return FLT_MANT_DIG;
		/* FLT_ROUNDS encodes round-to-nearest as the value 1. FE_TONEAREST
		   is an unrelated, implementation-defined <fenv.h> constant (0 on
		   glibc/x86), so comparing FLT_ROUNDS against it was incorrect. */
		case 'R': return FLT_ROUNDS == 1 ? 1.0f : 0.0f;
		case 'M': return FLT_MIN_EXP;
		case 'U': return FLT_MIN;
		case 'L': return FLT_MAX_EXP;
		case 'O': return FLT_MAX;
	}
	return 0.0f;
}
#endif
#ifdef __cplusplus
}
#endif

View File

@@ -83,6 +83,12 @@ CNTX_INIT_PROTS( bulldozer )
// -- ARM architectures --
#ifdef BLIS_CONFIG_ARMSVE
CNTX_INIT_PROTS( armsve )
#endif
#ifdef BLIS_CONFIG_A64FX
CNTX_INIT_PROTS( a64fx )
#endif
#ifdef BLIS_CONFIG_THUNDERX2
CNTX_INIT_PROTS( thunderx2 )
#endif
@@ -183,6 +189,12 @@ CNTX_INIT_PROTS( generic )
// -- ARM architectures --
#ifdef BLIS_FAMILY_ARMSVE
#include "bli_family_armsve.h"
#endif
#ifdef BLIS_FAMILY_A64FX
#include "bli_family_a64fx.h"
#endif
#ifdef BLIS_FAMILY_THUNDERX2
#include "bli_family_thunderx2.h"
#endif

View File

@@ -128,6 +128,20 @@ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
// -- One-operand macro (with custom prefix) --
#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
PASTECH2(prefix,s,op), \
PASTECH2(prefix,c,op), \
PASTECH2(prefix,d,op), \
PASTECH2(prefix,z,op) \
}
// -- Two-operand macros --

View File

@@ -1190,7 +1190,7 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b )
// -- Initialization-related macros --
// Finish the initialization started by the matrix-specific static initializer
// (e.g. BLIS_OBJECT_PREINITIALIZER)
// (e.g. BLIS_OBJECT_INITIALIZER)
// NOTE: This is intended only for use in the BLAS compatibility API and typed
// BLIS API.
@@ -1223,7 +1223,7 @@ BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t
}
// Finish the initialization started by the 1x1-specific static initializer
// (e.g. BLIS_OBJECT_PREINITIALIZER_1X1)
// (e.g. BLIS_OBJECT_INITIALIZER_1X1)
// NOTE: This is intended only for use in the BLAS compatibility API and typed
// BLIS API.

View File

@@ -1008,6 +1008,8 @@ typedef enum
BLIS_ARCH_BULLDOZER,
// ARM
BLIS_ARCH_ARMSVE,
BLIS_ARCH_A64FX,
BLIS_ARCH_THUNDERX2,
BLIS_ARCH_CORTEXA57,
BLIS_ARCH_CORTEXA53,
@@ -1032,7 +1034,7 @@ typedef enum
// NOTE: This value must be updated to reflect the number of enum values
// listed above for arch_t!
#define BLIS_NUM_ARCHS 22
//#define BLIS_NUM_ARCHS 25
//

View File

@@ -885,6 +885,8 @@
#define VADDSUBPD(_0, _1, _2) INSTR_(vaddsubpd, _0, _1, _2)
#define VHADDPD(_0, _1, _2) INSTR_(vhaddpd, _0, _1, _2)
#define VHADDPS(_0, _1, _2) INSTR_(vhaddps, _0, _1, _2)
#define VHSUBPD(_0, _1, _2) INSTR_(vhsubpd, _0, _1, _2)
#define VHSUBPS(_0, _1, _2) INSTR_(vhsubps, _0, _1, _2)
#define VADDPS(_0, _1, _2) INSTR_(vaddps, _0, _1, _2)
#define VADDPD(_0, _1, _2) INSTR_(vaddpd, _0, _1, _2)
#define VSUBPS(_0, _1, _2) INSTR_(vsubps, _0, _1, _2)
@@ -1015,6 +1017,8 @@
#define vaddsubpd(_0, _1, _2) VADDSUBPD(_0, _1, _2)
#define vhaddpd(_0, _1, _2) VHADDPD(_0, _1, _2)
#define vhaddps(_0, _1, _2) VHADDPS(_0, _1, _2)
#define vhsubpd(_0, _1, _2) VHSUBPD(_0, _1, _2)
#define vhsubps(_0, _1, _2) VHSUBPS(_0, _1, _2)
#define vaddps(_0, _1, _2) VADDPS(_0, _1, _2)
#define vaddpd(_0, _1, _2) VADDPD(_0, _1, _2)
#define vsubps(_0, _1, _2) VSUBPS(_0, _1, _2)

View File

@@ -0,0 +1,45 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// In-register transpose of an 8x2 double-precision tile held in two
// 512-bit SVE vector registers (assumes a 512-bit vector length, i.e.
// 8 doubles per register). DST6SRC0/DST7SRC1 serve as both the two
// inputs and the last two outputs. The trn1/trn2 pair interleaves the
// even/odd elements of the two sources; each "compact" then gathers the
// elements selected by a complement predicate (P2C/P4C/P6C -- presumably
// the complements of first-2/first-4/first-6-lane predicates, as set up
// by the d8x8 PREPARE macro; TODO confirm) into the low lanes of the
// corresponding destination register.
// NOTE(review): the PT (all-true) argument is accepted but not referenced
// in this macro body.
#define SVE512_IN_REG_TRANSPOSE_d8x2(DST0,DST1,DST2,DST3,DST4,DST5,DST6SRC0,DST7SRC1,PT,P2C,P4C,P6C) \
"trn1 " #DST0".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \
"trn2 " #DST1".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \
"compact " #DST2".d, " #P2C", " #DST0".d \n\t" \
"compact " #DST3".d, " #P2C", " #DST1".d \n\t" \
"compact " #DST4".d, " #P4C", " #DST0".d \n\t" \
"compact " #DST5".d, " #P4C", " #DST1".d \n\t" \
"compact " #DST6SRC0".d, " #P6C", " #DST0".d \n\t" \
"compact " #DST7SRC1".d, " #P6C", " #DST1".d \n\t"

View File

@@ -0,0 +1,97 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Sets up the predicate registers used by SVE512_IN_REG_TRANSPOSE_d8x8
// (and _d8x2). Using XTMP as a scratch GPR it builds, over .d lanes:
//   PT          = all-true;
//   P4, P6      = first-4 and first-6 lanes true (via whilelo);
//   P2C/P4C/P6C = complements of the first-2/first-4/first-6 predicates;
//   PTFTF       = (P6 XOR P4) OR P2C, i.e. an alternating true/false/
//                 true/false pattern of 2-lane groups (per the inline
//                 "o o | o" sketch) used by the sel steps of the
//                 transpose.
// NOTE(review): the final body line ends with a continuation backslash,
// so the macro definition extends onto the following source line; a
// separating blank line must remain between this macro and the next
// #define.
#define SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(XTMP,PT,P2C,P4C,P6C,PTFTF,P4,P6) \
"ptrue " #PT".d \n\t" \
"mov " #XTMP", #2 \n\t" \
"whilelo " #P2C".d, xzr, " #XTMP" \n\t" \
"mov " #XTMP", #4 \n\t" \
"whilelo " #P4".d, xzr, " #XTMP" \n\t" \
"mov " #XTMP", #6 \n\t" \
"whilelo " #P6".d, xzr, " #XTMP" \n\t" \
\
"eor " #PTFTF".b, " #PT"/z, " #P6".b, " #P4".b \n\t" /***** o o | o */ \
"orr " #PTFTF".b, " #PT"/z, " #PTFTF".b, " #P2C".b \n\t" /* | o | o */ \
\
"not " #P2C".b, " #PT"/z, " #P2C".b \n\t" \
"not " #P4C".b, " #PT"/z, " #P4".b \n\t" \
"not " #P6C".b, " #PT"/z, " #P6".b \n\t" \
// In-register transpose of an 8x8 double-precision tile held in eight
// 512-bit SVE vector registers (SRC0-7 in, DST0-7 out; the SRC registers
// are clobbered as scratch). Predicates must be prepared beforehand by
// SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE. The transpose proceeds in three
// rounds of progressively wider element exchange:
//   1) trn1/trn2: interleave adjacent row pairs (1-element granularity);
//   2) compact/ext + sel with PTFTF: exchange 2-element groups between
//      register pairs two apart;
//   3) compact/ext + sel with P4/P4C: exchange 4-element halves between
//      registers four apart.
// "compact"/"ext" shift source elements into the lane positions the
// following "sel" merges from -- presumably chosen over zip/uzp for
// A64fx port/latency reasons; TODO confirm.
#define SVE512_IN_REG_TRANSPOSE_d8x8(DST0,DST1,DST2,DST3,DST4,DST5,DST6,DST7,SRC0,SRC1,SRC2,SRC3,SRC4,SRC5,SRC6,SRC7,PT,P2C,P4C,P6C,PTFTF,P4,P6) \
"trn1 " #DST0".d, " #SRC0".d, " #SRC1".d \n\t" \
"trn2 " #DST1".d, " #SRC0".d, " #SRC1".d \n\t" \
"trn1 " #DST2".d, " #SRC2".d, " #SRC3".d \n\t" \
"trn2 " #DST3".d, " #SRC2".d, " #SRC3".d \n\t" \
"trn1 " #DST4".d, " #SRC4".d, " #SRC5".d \n\t" \
"trn2 " #DST5".d, " #SRC4".d, " #SRC5".d \n\t" \
"trn1 " #DST6".d, " #SRC6".d, " #SRC7".d \n\t" \
"trn2 " #DST7".d, " #SRC6".d, " #SRC7".d \n\t" \
\
"compact " #SRC0".d, " #P2C", " #DST0".d \n\t" \
"compact " #SRC2".d, " #P2C", " #DST1".d \n\t" \
"ext " #SRC1".b, " #SRC1".b, " #DST2".b, #48 \n\t" \
"ext " #SRC3".b, " #SRC3".b, " #DST3".b, #48 \n\t" \
"compact " #SRC4".d, " #P2C", " #DST4".d \n\t" \
"compact " #SRC6".d, " #P2C", " #DST5".d \n\t" \
"ext " #SRC5".b, " #SRC5".b, " #DST6".b, #48 \n\t" \
"ext " #SRC7".b, " #SRC7".b, " #DST7".b, #48 \n\t" \
\
"sel " #DST0".d, " #PTFTF", " #DST0".d, " #SRC1".d \n\t" \
"sel " #DST2".d, " #PTFTF", " #SRC0".d, " #DST2".d \n\t" \
"sel " #DST1".d, " #PTFTF", " #DST1".d, " #SRC3".d \n\t" \
"sel " #DST3".d, " #PTFTF", " #SRC2".d, " #DST3".d \n\t" \
"sel " #DST4".d, " #PTFTF", " #DST4".d, " #SRC5".d \n\t" \
"sel " #DST6".d, " #PTFTF", " #SRC4".d, " #DST6".d \n\t" \
"sel " #DST5".d, " #PTFTF", " #DST5".d, " #SRC7".d \n\t" \
"sel " #DST7".d, " #PTFTF", " #SRC6".d, " #DST7".d \n\t" \
\
"compact " #SRC0".d, " #P4C", " #DST0".d \n\t" \
"compact " #SRC1".d, " #P4C", " #DST1".d \n\t" \
"compact " #SRC2".d, " #P4C", " #DST2".d \n\t" \
"compact " #SRC3".d, " #P4C", " #DST3".d \n\t" \
"ext " #SRC4".b, " #SRC4".b, " #DST4".b, #32 \n\t" \
"ext " #SRC5".b, " #SRC5".b, " #DST5".b, #32 \n\t" \
"ext " #SRC6".b, " #SRC6".b, " #DST6".b, #32 \n\t" \
"ext " #SRC7".b, " #SRC7".b, " #DST7".b, #32 \n\t" \
\
"sel " #DST0".d, " #P4", " #DST0".d, " #SRC4".d \n\t" \
"sel " #DST1".d, " #P4", " #DST1".d, " #SRC5".d \n\t" \
"sel " #DST2".d, " #P4", " #DST2".d, " #SRC6".d \n\t" \
"sel " #DST3".d, " #P4", " #DST3".d, " #SRC7".d \n\t" \
"sel " #DST4".d, " #P4", " #SRC0".d, " #DST4".d \n\t" \
"sel " #DST5".d, " #P4", " #SRC1".d, " #DST5".d \n\t" \
"sel " #DST6".d, " #P4", " #SRC2".d, " #DST6".d \n\t" \
"sel " #DST7".d, " #P4", " #SRC3".d, " #DST7".d \n\t"

View File

@@ -52,15 +52,12 @@ void bli_dpackm_armsve256_asm_8xk
dim_t cdim_,
dim_t n_,
dim_t n_max_,
void* restrict kappa_,
void* restrict a_, inc_t inca_, inc_t lda_,
void* restrict p_, inc_t ldp_,
double* restrict kappa,
double* restrict a, inc_t inca_, inc_t lda_,
double* restrict p, inc_t ldp_,
cntx_t* restrict cntx
)
{
double* a = ( double* )a_;
double* p = ( double* )p_;
double* kappa = ( double* )kappa_;
const int64_t cdim = cdim_;
const int64_t mnr = 8;
const int64_t n = n_;

View File

@@ -0,0 +1,365 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "armsve512_asm_transpose_d8x8.h"
#include "armsve512_asm_transpose_d8x2.h"
// assumption:
// SVE vector length = 512 bits.
// Pack a 10-row micro-panel of double-precision A into P for SVE-512.
//
// Fast path (inline asm) requires: cdim == 10, kappa == 1.0, and A not in
// general stride (inca == 1 or lda == 1).  Otherwise falls back to
// bli_dscal2m_ex plus explicit zero-padding of edge regions.
//
// Each packed column is stored as one 512-bit vector (8 doubles) plus one
// 128-bit q-register (2 doubles), i.e. 10 doubles per column of P.
void bli_dpackm_armsve512_asm_10xk
(
// conja:  conjugation spec (no-op for real domain; forwarded to fallback).
conj_t conja,
// schema: pack schema; on A64FX used to pick a sector-cache tag for p.
pack_t schema,
// cdim_:  number of valid rows to pack (<= 10).
dim_t cdim_,
// n_:     number of columns to pack.
dim_t n_,
// n_max_: padded column dimension; columns [n_, n_max_) are zero-filled.
dim_t n_max_,
double* restrict kappa,
double* restrict a, inc_t inca_, inc_t lda_,
double* restrict p, inc_t ldp_,
cntx_t* restrict cntx
)
{
const int64_t cdim = cdim_;
const int64_t mnr = 10;
const int64_t n = n_;
const int64_t n_max = n_max_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;
// General stride: neither rows nor columns contiguous; asm path unsupported.
const bool gs = inca != 1 && lda != 1;
const bool unitk = bli_deq1( *kappa );
#ifdef _A64FX
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
{
// A twisted way to infer whether A or B is being packed.
// NOTE(review): tags bits 56+ of the pointer, presumably for A64FX
// sector-cache control — the tag is assumed to be ignored by address
// translation; confirm against A64FX HPC extensions.
if ( schema == bli_cntx_schema_a_block(cntx) )
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
if ( schema == bli_cntx_schema_b_panel(cntx) )
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
}
#endif
// Fast path: full 10-row panel, unit kappa, no general stride.
if ( cdim == mnr && !gs && unitk )
{
// Main loop packs 8 columns per iteration; leftover handles n % 8.
uint64_t n_mker = n / 8;
uint64_t n_left = n % 8;
__asm__ volatile (
"mov x0, %[a] \n\t"
"mov x1, %[p] \n\t"
"mov x2, %[ldp] \n\t"
"mov x3, %[lda] \n\t"
"mov x4, %[inca] \n\t"
"cmp x4, #1 \n\t"
// Skips by sizeof(double).
"mov x8, #8 \n\t"
"madd x2, x2, x8, xzr \n\t"
"madd x3, x3, x8, xzr \n\t"
"madd x4, x4, x8, xzr \n\t"
// Loop constants.
"mov x8, %[n_mker] \n\t"
"mov x9, %[n_left] \n\t"
"ptrue p0.d \n\t"
// inca != 1: branch to the row-stored (transposing) path.
"b.ne .AROWSTOR \n\t"
// A stored in columns.
" .ACOLSTOR: \n\t"
// Prefetch distance.
"mov x17, #8 \n\t"
"madd x17, x17, x3, xzr \n\t"
#ifdef _A64FX
// Disable hardware prefetch for A.
"mov x16, 0x6 \n\t"
"lsl x16, x16, #60 \n\t"
"orr x0, x0, x16 \n\t"
#endif
// Column-stored main loop: copy 8 columns of 10 doubles each
// (ld1d for rows 0-7, ldr q for rows 8-9), with software prefetch.
" .ACOLSTORMKER: \n\t"
"cmp x8, xzr \n\t"
"b.eq .ACOLSTORMKEREND \n\t"
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ldr q1, [x0, #64] \n\t"
"ld1d z2.d, p0/z, [x5] \n\t"
"ldr q3, [x5, #64] \n\t"
"ld1d z4.d, p0/z, [x6] \n\t"
"ldr q5, [x6, #64] \n\t"
"ld1d z6.d, p0/z, [x7] \n\t"
"ldr q7, [x7, #64] \n\t"
"add x18, x17, x0 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x5 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x6 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x7 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x0, x7, x3 \n\t"
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
"ld1d z8.d, p0/z, [x0] \n\t"
"ldr q9, [x0, #64] \n\t"
"ld1d z10.d, p0/z, [x5] \n\t"
"ldr q11, [x5, #64] \n\t"
"ld1d z12.d, p0/z, [x6] \n\t"
"ldr q13, [x6, #64] \n\t"
"ld1d z14.d, p0/z, [x7] \n\t"
"ldr q15, [x7, #64] \n\t"
"add x18, x17, x0 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x5 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x6 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x7 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
// Plain storage
"add x10, x1, x2 \n\t"
"add x11, x10, x2 \n\t"
"add x12, x11, x2 \n\t"
"add x13, x12, x2 \n\t"
"add x14, x13, x2 \n\t"
"add x15, x14, x2 \n\t"
"add x16, x15, x2 \n\t"
"st1d z0.d, p0, [x1] \n\t"
"str q1, [x1, #64] \n\t"
"st1d z2.d, p0, [x10] \n\t"
"str q3, [x10, #64] \n\t"
"st1d z4.d, p0, [x11] \n\t"
"str q5, [x11, #64] \n\t"
"st1d z6.d, p0, [x12] \n\t"
"str q7, [x12, #64] \n\t"
"st1d z8.d, p0, [x13] \n\t"
"str q9, [x13, #64] \n\t"
"st1d z10.d, p0, [x14] \n\t"
"str q11, [x14, #64] \n\t"
"st1d z12.d, p0, [x15] \n\t"
"str q13, [x15, #64] \n\t"
"st1d z14.d, p0, [x16] \n\t"
"str q15, [x16, #64] \n\t"
"add x1, x16, x2 \n\t"
// Realign and store.
// (Dead alternative: re-pack the 10-double columns densely into
// five whole z-vectors via EXT; kept for reference.)
// "ext z1.b, z1.b, z1.b, #16 \n\t"
// "ext z1.b, z1.b, z2.b, #48 \n\t"
// "ext z2.b, z2.b, z3.b, #16 \n\t"
// "ext z2.b, z2.b, z4.b, #32 \n\t"
// "ext z4.b, z4.b, z5.b, #16 \n\t"
// "ext z4.b, z4.b, z6.b, #16 \n\t"
// "ext z6.b, z6.b, z7.b, #16 \n\t"
// "ext z9.b, z9.b, z9.b, #16 \n\t"
// "ext z9.b, z9.b, z10.b, #48 \n\t"
// "ext z10.b, z10.b, z11.b, #16 \n\t"
// "ext z10.b, z10.b, z12.b, #32 \n\t"
// "ext z12.b, z12.b, z13.b, #16 \n\t"
// "ext z12.b, z12.b, z14.b, #16 \n\t"
// "ext z14.b, z14.b, z15.b, #16 \n\t"
// "st1d z0.d, p0, [x1] \n\t"
// "st1d z1.d, p0, [x1, #1, mul vl] \n\t"
// "st1d z2.d, p0, [x1, #2, mul vl] \n\t"
// "st1d z4.d, p0, [x1, #3, mul vl] \n\t"
// "st1d z6.d, p0, [x1, #4, mul vl] \n\t"
// "add x1, x1, #320 \n\t"
// "st1d z8.d, p0, [x1] \n\t"
// "st1d z9.d, p0, [x1, #1, mul vl] \n\t"
// "st1d z10.d, p0, [x1, #2, mul vl] \n\t"
// "st1d z12.d, p0, [x1, #3, mul vl] \n\t"
// "st1d z14.d, p0, [x1, #4, mul vl] \n\t"
// "add x1, x1, #320 \n\t"
"add x0, x7, x3 \n\t"
"sub x8, x8, #1 \n\t"
"b .ACOLSTORMKER \n\t"
" .ACOLSTORMKEREND: \n\t"
// Column-stored leftover loop: one column per iteration.
" .ACOLSTORLEFT: \n\t"
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ldr q1, [x0, #64] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"str q1, [x1, #64] \n\t"
"add x0, x0, x3 \n\t"
"add x1, x1, x2 \n\t"
"sub x9, x9, #1 \n\t"
"b .ACOLSTORLEFT \n\t"
// A stored in rows.
" .AROWSTOR: \n\t"
// Prepare predicates for in-reg transpose.
SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
// Row-stored main loop: load 10 rows x 8 columns, transpose in
// registers (8x8 block + 8x2 block), then store 8 packed columns.
" .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
"cmp x8, xzr \n\t"
"b.eq .AROWSTORMKEREND \n\t"
"add x10, x0, x4 \n\t"
"add x11, x10, x4 \n\t"
"add x12, x11, x4 \n\t"
"add x13, x12, x4 \n\t"
"add x14, x13, x4 \n\t"
"add x15, x14, x4 \n\t"
"add x16, x15, x4 \n\t"
"add x17, x16, x4 \n\t"
"add x18, x17, x4 \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ld1d z1.d, p0/z, [x10] \n\t"
"ld1d z2.d, p0/z, [x11] \n\t"
"ld1d z3.d, p0/z, [x12] \n\t"
"ld1d z4.d, p0/z, [x13] \n\t"
"ld1d z5.d, p0/z, [x14] \n\t"
"ld1d z6.d, p0/z, [x15] \n\t"
"ld1d z7.d, p0/z, [x16] \n\t"
"ld1d z22.d, p0/z, [x17] \n\t"
"ld1d z23.d, p0/z, [x18] \n\t"
// Transpose first 8 rows.
SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
// Transpose last 2 rows.
SVE512_IN_REG_TRANSPOSE_d8x2(z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3)
// Plain storage.
"add x10, x1, x2 \n\t"
"add x11, x10, x2 \n\t"
"add x12, x11, x2 \n\t"
"add x13, x12, x2 \n\t"
"add x14, x13, x2 \n\t"
"add x15, x14, x2 \n\t"
"add x16, x15, x2 \n\t"
"st1d z8.d, p0, [x1] \n\t"
"str q16, [x1, #64] \n\t"
"st1d z9.d, p0, [x10] \n\t"
"str q17, [x10, #64] \n\t"
"st1d z10.d, p0, [x11] \n\t"
"str q18, [x11, #64] \n\t"
"st1d z11.d, p0, [x12] \n\t"
"str q19, [x12, #64] \n\t"
"st1d z12.d, p0, [x13] \n\t"
"str q20, [x13, #64] \n\t"
"st1d z13.d, p0, [x14] \n\t"
"str q21, [x14, #64] \n\t"
"st1d z14.d, p0, [x15] \n\t"
"str q22, [x15, #64] \n\t"
"st1d z15.d, p0, [x16] \n\t"
"str q23, [x16, #64] \n\t"
"add x1, x16, x2 \n\t"
"add x0, x0, #64 \n\t"
"sub x8, x8, #1 \n\t"
"b .AROWSTORMKER \n\t"
" .AROWSTORMKEREND: \n\t"
// Row-stored leftover: gather rows 0-7 of one column with an index
// vector; rows 8-9 loaded as scalars and merged into a q-register.
"mov x4, %[inca] \n\t" // Restore unshifted inca.
"index z30.d, xzr, x4 \n\t" // Generate index.
"lsl x4, x4, #3 \n\t" // Shift again.
"lsl x5, x4, #3 \n\t" // Virtual column vl.
" .AROWSTORLEFT: \n\t"
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
"add x6, x0, x5 \n\t"
"add x7, x6, x4 \n\t"
"ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
"ldr d1, [x6] \n\t"
"ldr d2, [x7] \n\t"
"trn1 v1.2d, v1.2d, v2.2d \n\t"
"st1d z0.d, p0, [x1] \n\t"
"str q1, [x1, #64] \n\t"
"add x1, x1, x2 \n\t"
"add x0, x0, #8 \n\t"
"sub x9, x9, #1 \n\t"
"b .AROWSTORLEFT \n\t"
" .UNITKDONE: \n\t"
"mov x0, #0 \n\t"
:
: [a] "r" (a),
[p] "r" (p),
[lda] "r" (lda),
[ldp] "r" (ldp),
[inca] "r" (inca),
[n_mker] "r" (n_mker),
[n_left] "r" (n_left)
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x8", "x9", "x10","x11","x12","x13","x14","x15",
"x16","x17","x18",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10","z11","z12","z13","z14","z15",
"z16","z17","z18","z19","z20","z21","z22","z23",
// "z24","z25","z26","z27","z28","z29",
"z30","z31",
"p0", "p1", "p2", "p3", "p4", // "p5",
"p6", "p7", "p8"
);
}
else // if ( cdim < mnr )
{
// Slow/general path: scaled copy handles general stride, kappa != 1,
// and partial panels (cdim < 10).
bli_dscal2m_ex
(
0,
BLIS_NONUNIT_DIAG,
BLIS_DENSE,
( trans_t )conja,
cdim,
n,
kappa,
a, inca, lda,
p, 1, ldp,
cntx,
NULL
);
// if ( cdim < mnr )
{
// Zero-fill rows [cdim, mnr) across the full padded width.
const dim_t i = cdim;
const dim_t m_edge = mnr - i;
const dim_t n_edge = n_max;
double* restrict p_edge = p + (i )*1;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}
if ( n < n_max )
{
// Zero-fill columns [n, n_max) of the packed panel.
const dim_t j = n;
const dim_t m_edge = mnr;
const dim_t n_edge = n_max - j;
double* restrict p_edge = p + (j )*ldp;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}

View File

@@ -0,0 +1,359 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Linaro Limited
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include <stdio.h>
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#else
#error "No Arm SVE intrinsics support in compiler"
#endif // __ARM_FEATURE_SVE
// assumption:
// SVE vector length = 512 bits.
// TODO:
// 2-rows -> 3 vectors packing and use predicator only in odd num of rows to be packed.
// prefetching is needed.
// Pack a 12-row micro-panel of double-precision A into P using SVE-512
// ACLE intrinsics.
//
// With a 512-bit vector (8 doubles), a 12-double column occupies 1.5
// vectors; when ldp == mnr two adjacent columns (24 doubles) are packed
// as exactly 3 full vectors per iteration.  Four code paths cover the
// cross product of { kappa == 1, kappa != 1 } x { inca == 1 (contiguous,
// packA style), inca != 1 (gather, packB style) }.  Partial panels
// (cdim < 12) fall back to bli_dscal2m_ex with zero-padding.
void bli_dpackm_armsve512_asm_12xk
(
// conja:  conjugation spec (no-op for real domain; forwarded to fallback).
conj_t conja,
// schema: pack schema (unused on this path).
pack_t schema,
// cdim_:  number of valid rows to pack (<= 12).
dim_t cdim_,
// n_:     number of columns to pack.
dim_t n_,
// n_max_: padded column dimension; columns [n_, n_max_) are zero-filled.
dim_t n_max_,
double* restrict kappa,
double* restrict a, inc_t inca_, inc_t lda_,
double* restrict p, inc_t ldp_,
cntx_t* restrict cntx
)
{
const int64_t cdim = cdim_;
const int64_t mnr = 12;
const int64_t n = n_;
const int64_t n_max = n_max_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;
// Cursors into A: current column, its row-8 offset, and +/-4-row offsets
// used by the two-columns-into-three-vectors scheme.
double* restrict alpha1 = a;
double* restrict alpha1_8 = alpha1 + 8 * inca;
double* restrict alpha1_p4 = alpha1 + 4 * inca;
double* restrict alpha1_m4 = alpha1 - 4 * inca;
double* restrict pi1 = p;
// Predicates: all 8 lanes / lanes 0-3 / lanes 4-7.
const svbool_t all_active = svptrue_b64();
const svbool_t first_half_active = svwhilelt_b64(0, 4);
const svbool_t last_half_active = svnot_z(all_active, first_half_active);
svfloat64_t z_a0;
svfloat64_t z_a8;
svfloat64_t z_a8_lh;
svfloat64_t z_a16;
svuint64_t z_index;
// creating index for gather/scatter
// with each element as: 0, 1*inca, 2*inca, 3*inca
z_index = svindex_u64( 0, inca * sizeof( double ) );
if ( cdim == mnr )
{
if ( bli_deq1( *kappa ) )
{
if ( inca == 1 ) // continuous memory. packA style
{
dim_t k = n;
// 2 pack into 3 case.
if ( ldp == mnr )
{
for ( ; k > 1; k -= 2 )
{
// load 12 continuous elements from *a
z_a0 = svld1_f64( all_active, alpha1 );
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
// forward address - 0 to 1
alpha1 += lda;
alpha1_p4 = alpha1 + 4 * inca;
alpha1_m4 = alpha1 - 4 * inca;
// load 12 continuous elements from *a, filling last half of z8.
// (Zeroing predicates make the two halves disjoint, so the
// add below merges them without mixing data.)
z_a8_lh = svld1_f64( last_half_active, alpha1_m4 );
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
z_a16 = svld1_f64( all_active, alpha1_p4 );
// store packed data into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
// forward address - 1 to 0
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += 2 * ldp;
}
}
// line-by-line packing case.
for ( ; k != 0; --k )
{
// load 12 continuous elements from *a
z_a0 = svld1_f64( all_active, alpha1 );
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
// store them into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += ldp;
}
}
else // gather/scatter load/store. packB style
{
dim_t k = n;
// Same two-columns-into-three-vectors scheme, but rows of A are
// strided so loads use the gather index built above.
if ( ldp == mnr )
{
for ( ; k > 1; k -= 2 )
{
// gather load from *a
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
// forward address - 0 to 1
alpha1 += lda;
alpha1_p4 = alpha1 + 4 * inca;
alpha1_m4 = alpha1 - 4 * inca;
// gather load from *a, filling last half of z8.
z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index );
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index );
// store packed data into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
// forward address - 1 to 0
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += 2 * ldp;
}
}
for ( ; k != 0; --k )
{
// gather load from *a
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
// contiguous store into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += ldp;
}
}
}
else // *kappa != 1.0
{
// load kappa into vector
svfloat64_t z_kappa;
z_kappa = svdup_f64( *kappa );
if ( inca == 1 ) // continuous memory. packA style
{
dim_t k = n;
if ( ldp == mnr )
{
for ( ; k > 1; k -= 2 )
{
// load 12 continuous elements from *a
z_a0 = svld1_f64( all_active, alpha1 );
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
// forward address - 0 to 1
alpha1 += lda;
alpha1_p4 = alpha1 + 4 * inca;
alpha1_m4 = alpha1 - 4 * inca;
// load 12 continuous elements from *a, filling last half of z8.
z_a8_lh = svld1_f64( last_half_active, alpha1_m4 );
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
z_a16 = svld1_f64( all_active, alpha1_p4 );
// multiply by *kappa
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 );
// store packed data into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
// forward address - 1 to 0
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += 2 * ldp;
}
}
for ( ; k != 0; --k )
{
// load 12 continuous elements from *a
z_a0 = svld1_f64( all_active, alpha1 );
z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
// multiply by *kappa
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
// store them into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += ldp;
}
}
else // gather/scatter load/store. packB style
{
dim_t k = n;
if ( ldp == mnr )
{
for ( ; k > 1; k -= 2 )
{
// gather load from *a
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
// forward address - 0 to 1
alpha1 += lda;
alpha1_p4 = alpha1 + 4 * inca;
alpha1_m4 = alpha1 - 4 * inca;
// gather load from *a, filling last half of z8.
z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index );
z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index );
// multiply by *kappa
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 );
// store packed data into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( all_active, pi1, 1, z_a8 );
svst1_vnum_f64( all_active, pi1, 2, z_a16 );
// forward address - 1 to 0
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += 2 * ldp;
}
}
for ( ; k != 0; --k )
{
// gather load from *a
z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
// multiply by *kappa
z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
// contiguous store into *p
svst1_f64( all_active, pi1, z_a0 );
svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
alpha1 += lda;
alpha1_8 = alpha1 + 8 * inca;
pi1 += ldp;
}
}
} // end of if ( *kappa == 1.0 )
}
else // if ( cdim < mnr )
{
// Partial panel: scaled copy via the generic kernel, then zero-pad.
bli_dscal2m_ex
(
0,
BLIS_NONUNIT_DIAG,
BLIS_DENSE,
( trans_t )conja,
cdim,
n,
kappa,
a, inca, lda,
p, 1, ldp,
cntx,
NULL
);
// if ( cdim < mnr )
{
// Zero-fill rows [cdim, mnr) across the full padded width.
const dim_t i = cdim;
const dim_t m_edge = mnr - i;
const dim_t n_edge = n_max;
double* restrict p_edge = p + (i )*1;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}
if ( n < n_max )
{
// Zero-fill columns [n, n_max) of the packed panel.
const dim_t j = n;
const dim_t m_edge = mnr;
const dim_t n_edge = n_max - j;
double* restrict p_edge = p + (j )*ldp;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}

View File

@@ -0,0 +1,363 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "armsve512_asm_transpose_d8x8.h"
// assumption:
// SVE vector length = 512 bits.
// Pack a 16-row micro-panel of double-precision A into P for SVE-512.
//
// Fast path (inline asm) requires: cdim == 16, kappa == 1.0, and A not in
// general stride (inca == 1 or lda == 1).  Each packed column is exactly
// two 512-bit vectors.  Otherwise falls back to bli_dscal2m_ex plus
// explicit zero-padding of edge regions.
//
// FIX: the asm clobber list previously omitted z16-z31 and p8, although
// the row-stored path loads z16-z23, writes z24-z31 via the in-register
// transpose, writes z30 in the leftover loop, and the transpose-prepare
// macro writes p8.  Without these clobbers the compiler may assume those
// registers are preserved across the asm, corrupting caller state.
void bli_dpackm_armsve512_asm_16xk
(
// conja:  conjugation spec (no-op for real domain; forwarded to fallback).
conj_t conja,
// schema: pack schema; on A64FX used to pick a sector-cache tag for p.
pack_t schema,
// cdim_:  number of valid rows to pack (<= 16).
dim_t cdim_,
// n_:     number of columns to pack.
dim_t n_,
// n_max_: padded column dimension; columns [n_, n_max_) are zero-filled.
dim_t n_max_,
double* restrict kappa,
double* restrict a, inc_t inca_, inc_t lda_,
double* restrict p, inc_t ldp_,
cntx_t* restrict cntx
)
{
const int64_t cdim = cdim_;
const int64_t mnr = 16;
const int64_t n = n_;
const int64_t n_max = n_max_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;
// General stride: neither rows nor columns contiguous; asm path unsupported.
const bool gs = inca != 1 && lda != 1;
const bool unitk = bli_deq1( *kappa );
#ifdef _A64FX
if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
{
// A twisted way to infer whether A or B is being packed.
// NOTE(review): tags bits 56+ of the pointer, presumably for A64FX
// sector-cache control — confirm against A64FX HPC extensions.
if ( schema == bli_cntx_schema_a_block(cntx) )
p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
if ( schema == bli_cntx_schema_b_panel(cntx) )
p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
}
#endif
// Fast path: full 16-row panel, unit kappa, no general stride.
if ( cdim == mnr && !gs && unitk )
{
// Main loop packs 8 columns per iteration; leftover handles n % 8.
uint64_t n_mker = n / 8;
uint64_t n_left = n % 8;
__asm__ volatile (
"mov x0, %[a] \n\t"
"mov x1, %[p] \n\t"
"mov x2, %[ldp] \n\t"
"mov x3, %[lda] \n\t"
"mov x4, %[inca] \n\t"
"cmp x4, #1 \n\t"
// Skips by sizeof(double).
"mov x8, #8 \n\t"
"madd x2, x2, x8, xzr \n\t"
"madd x3, x3, x8, xzr \n\t"
"madd x4, x4, x8, xzr \n\t"
// "mov x8, 0x8 \n\t" // Control#0 for A address.
// "mov x8, 0x24 \n\t" // Higher 6bit for Control#0:
// "lsl x8, x8, #58 \n\t" // Valid|Strong|Strong|Alloc|Load|Strong
// "orr x8, x8, x3 \n\t" // Stride.
// "msr S3_3_C11_C6_0, x8 \n\t" // Write system register.
// Loop constants.
"mov x8, %[n_mker] \n\t"
"mov x9, %[n_left] \n\t"
"ptrue p0.d \n\t"
// inca != 1: branch to the row-stored (transposing) path.
"b.ne .AROWSTOR \n\t"
// A stored in columns.
" .ACOLSTOR: \n\t"
// Prefetch distance.
"mov x17, #8 \n\t"
"madd x17, x17, x3, xzr \n\t"
#ifdef _A64FX
"mov x16, 0x6 \n\t" // Disable hardware prefetch for A.
"lsl x16, x16, #60 \n\t"
"orr x0, x0, x16 \n\t"
#endif
// "add x5, x0, x3 \n\t"
// "add x6, x5, x3 \n\t"
// "add x7, x6, x3 \n\t"
// "prfm PLDL1STRM, [x0] \n\t"
// "prfm PLDL1STRM, [x5] \n\t"
// "prfm PLDL1STRM, [x6] \n\t"
// "prfm PLDL1STRM, [x7] \n\t"
// "add x18, x7, x3 \n\t"
// "add x5, x18, x3 \n\t"
// "add x6, x5, x3 \n\t"
// "add x7, x6, x3 \n\t"
// "prfm PLDL1STRM, [x18] \n\t"
// "prfm PLDL1STRM, [x5] \n\t"
// "prfm PLDL1STRM, [x6] \n\t"
// "prfm PLDL1STRM, [x7] \n\t"
// Column-stored main loop: copy 8 columns of 16 doubles each
// (two ld1d per column) with software prefetch.
" .ACOLSTORMKER: \n\t"
"cmp x8, xzr \n\t"
"b.eq .ACOLSTORMKEREND \n\t"
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
"add x10, x1, x2 \n\t"
"add x11, x10, x2 \n\t"
"add x12, x11, x2 \n\t"
"add x13, x12, x2 \n\t"
"add x14, x13, x2 \n\t"
"add x15, x14, x2 \n\t"
"add x16, x15, x2 \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
"ld1d z2.d, p0/z, [x5] \n\t"
"ld1d z3.d, p0/z, [x5, #1, mul vl] \n\t"
"ld1d z4.d, p0/z, [x6] \n\t"
"ld1d z5.d, p0/z, [x6, #1, mul vl] \n\t"
"ld1d z6.d, p0/z, [x7] \n\t"
"ld1d z7.d, p0/z, [x7, #1, mul vl] \n\t"
"add x18, x17, x0 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x5 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x6 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x7 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x0, x7, x3 \n\t"
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
"ld1d z8.d, p0/z, [x0] \n\t"
"ld1d z9.d, p0/z, [x0, #1, mul vl] \n\t"
"ld1d z10.d, p0/z, [x5] \n\t"
"ld1d z11.d, p0/z, [x5, #1, mul vl] \n\t"
"ld1d z12.d, p0/z, [x6] \n\t"
"ld1d z13.d, p0/z, [x6, #1, mul vl] \n\t"
"ld1d z14.d, p0/z, [x7] \n\t"
"ld1d z15.d, p0/z, [x7, #1, mul vl] \n\t"
"add x18, x17, x0 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x5 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x6 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"add x18, x17, x7 \n\t"
"prfm PLDL1STRM, [x18] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
"st1d z2.d, p0, [x10] \n\t"
"st1d z3.d, p0, [x10, #1, mul vl] \n\t"
"st1d z4.d, p0, [x11] \n\t"
"st1d z5.d, p0, [x11, #1, mul vl] \n\t"
"st1d z6.d, p0, [x12] \n\t"
"st1d z7.d, p0, [x12, #1, mul vl] \n\t"
"st1d z8.d, p0, [x13] \n\t"
"st1d z9.d, p0, [x13, #1, mul vl] \n\t"
"st1d z10.d, p0, [x14] \n\t"
"st1d z11.d, p0, [x14, #1, mul vl] \n\t"
"st1d z12.d, p0, [x15] \n\t"
"st1d z13.d, p0, [x15, #1, mul vl] \n\t"
"st1d z14.d, p0, [x16] \n\t"
"st1d z15.d, p0, [x16, #1, mul vl] \n\t"
"add x0, x7, x3 \n\t"
"add x1, x16, x2 \n\t"
"sub x8, x8, #1 \n\t"
"b .ACOLSTORMKER \n\t"
" .ACOLSTORMKEREND: \n\t"
// Column-stored leftover loop: one column per iteration.
" .ACOLSTORLEFT: \n\t"
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
"add x0, x0, x3 \n\t"
"add x1, x1, x2 \n\t"
"sub x9, x9, #1 \n\t"
"b .ACOLSTORLEFT \n\t"
// A stored in rows.
" .AROWSTOR: \n\t"
// Prepare predicates for in-reg transpose.
SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
// Row-stored main loop: load 16 rows x 8 columns, transpose two 8x8
// blocks in registers, then store 8 packed columns.
" .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
"cmp x8, xzr \n\t"
"b.eq .AROWSTORMKEREND \n\t"
"add x10, x0, x4 \n\t"
"add x11, x10, x4 \n\t"
"add x12, x11, x4 \n\t"
"add x13, x12, x4 \n\t"
"add x14, x13, x4 \n\t"
"add x15, x14, x4 \n\t"
"add x16, x15, x4 \n\t"
"ld1d z0.d, p0/z, [x0] \n\t"
"ld1d z1.d, p0/z, [x10] \n\t"
"ld1d z2.d, p0/z, [x11] \n\t"
"ld1d z3.d, p0/z, [x12] \n\t"
"ld1d z4.d, p0/z, [x13] \n\t"
"ld1d z5.d, p0/z, [x14] \n\t"
"ld1d z6.d, p0/z, [x15] \n\t"
"ld1d z7.d, p0/z, [x16] \n\t"
"add x5, x16, x4 \n\t"
"add x10, x5, x4 \n\t"
"add x11, x10, x4 \n\t"
"add x12, x11, x4 \n\t"
"add x13, x12, x4 \n\t"
"add x14, x13, x4 \n\t"
"add x15, x14, x4 \n\t"
"add x16, x15, x4 \n\t"
"ld1d z16.d, p0/z, [x5] \n\t"
"ld1d z17.d, p0/z, [x10] \n\t"
"ld1d z18.d, p0/z, [x11] \n\t"
"ld1d z19.d, p0/z, [x12] \n\t"
"ld1d z20.d, p0/z, [x13] \n\t"
"ld1d z21.d, p0/z, [x14] \n\t"
"ld1d z22.d, p0/z, [x15] \n\t"
"ld1d z23.d, p0/z, [x16] \n\t"
// Transpose first 8 rows.
SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
// Transpose last 8 rows.
SVE512_IN_REG_TRANSPOSE_d8x8(z24,z25,z26,z27,z28,z29,z30,z31,z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3,p8,p4,p6)
"add x10, x1, x2 \n\t"
"add x11, x10, x2 \n\t"
"add x12, x11, x2 \n\t"
"add x13, x12, x2 \n\t"
"add x14, x13, x2 \n\t"
"add x15, x14, x2 \n\t"
"add x16, x15, x2 \n\t"
"st1d z8.d, p0, [x1] \n\t"
"st1d z24.d, p0, [x1, #1, mul vl] \n\t"
"st1d z9.d, p0, [x10] \n\t"
"st1d z25.d, p0, [x10, #1, mul vl] \n\t"
"st1d z10.d, p0, [x11] \n\t"
"st1d z26.d, p0, [x11, #1, mul vl] \n\t"
"st1d z11.d, p0, [x12] \n\t"
"st1d z27.d, p0, [x12, #1, mul vl] \n\t"
"st1d z12.d, p0, [x13] \n\t"
"st1d z28.d, p0, [x13, #1, mul vl] \n\t"
"st1d z13.d, p0, [x14] \n\t"
"st1d z29.d, p0, [x14, #1, mul vl] \n\t"
"st1d z14.d, p0, [x15] \n\t"
"st1d z30.d, p0, [x15, #1, mul vl] \n\t"
"st1d z15.d, p0, [x16] \n\t"
"st1d z31.d, p0, [x16, #1, mul vl] \n\t"
"add x0, x0, #64 \n\t"
"add x1, x16, x2 \n\t"
"sub x8, x8, #1 \n\t"
"b .AROWSTORMKER \n\t"
" .AROWSTORMKEREND: \n\t"
// Row-stored leftover: gather rows 0-7 and rows 8-15 of one column
// with an index vector.
"mov x4, %[inca] \n\t" // Restore unshifted inca.
"index z30.d, xzr, x4 \n\t" // Generate index.
"lsl x4, x4, #3 \n\t" // Shift again.
"lsl x5, x4, #3 \n\t" // Virtual column vl.
" .AROWSTORLEFT: \n\t"
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
"add x6, x0, x5 \n\t"
"ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
"ld1d z1.d, p0/z, [x6, z30.d, lsl #3] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
"add x1, x1, x2 \n\t"
"add x0, x0, #8 \n\t"
"sub x9, x9, #1 \n\t"
"b .AROWSTORLEFT \n\t"
" .UNITKDONE: \n\t"
"mov x0, #0 \n\t"
:
: [a] "r" (a),
[p] "r" (p),
[lda] "r" (lda),
[ldp] "r" (ldp),
[inca] "r" (inca),
[n_mker] "r" (n_mker),
[n_left] "r" (n_left)
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x8", "x9", "x10","x11","x12","x13","x14","x15",
"x16","x17","x18",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10","z11","z12","z13","z14","z15",
// z16-z31 ARE written (row-stored loads, 8x8 transpose outputs, and
// the leftover-path index in z30); p8 is written by the transpose
// prepare macro.  They must be declared clobbered, matching the
// sibling 10xk kernel.
"z16","z17","z18","z19","z20","z21","z22","z23",
"z24","z25","z26","z27","z28","z29","z30","z31",
"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
"p8"
);
}
else // if ( cdim < mnr )
{
// Slow/general path: scaled copy handles general stride, kappa != 1,
// and partial panels (cdim < 16).
bli_dscal2m_ex
(
0,
BLIS_NONUNIT_DIAG,
BLIS_DENSE,
( trans_t )conja,
cdim,
n,
kappa,
a, inca, lda,
p, 1, ldp,
cntx,
NULL
);
// if ( cdim < mnr )
{
// Zero-fill rows [cdim, mnr) across the full padded width.
const dim_t i = cdim;
const dim_t m_edge = mnr - i;
const dim_t n_edge = n_max;
double* restrict p_edge = p + (i )*1;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}
if ( n < n_max )
{
// Zero-fill columns [n, n_max) of the packed panel.
const dim_t j = n;
const dim_t m_edge = mnr;
const dim_t n_edge = n_max - j;
double* restrict p_edge = p + (j )*ldp;
bli_dset0s_mxn
(
m_edge,
n_edge,
p_edge, 1, ldp
);
}
}

View File

@@ -0,0 +1,191 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BADDR,8) \
GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BADDR,9) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
GEMM_FMLA2_LD1R(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BADDR,0) \
GEMM_FMLA2_LD1R(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BADDR,1) \
GEMM_FMLA2_LD1R(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BADDR,2) \
GEMM_FMLA2_LD1R(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BADDR,3) \
GEMM_FMLA2_LD1R(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BADDR,4) \
GEMM_FMLA2_LD1R(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BADDR,5) \
\
GEMM_FMLA2_LD1R(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BADDR,6) \
GEMM_FMLA2_LD1R(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BADDR,7)
// Second through forth microkernels are the first one with B vectors rotated.
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BRSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BRSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BRSBIT)
// NOTE:
// The microkernel (PLAIN_1-4 as a whole) satisfies on entry/exit
// (sth. akin to loop-invariant):
// - BV[0-7] holds B[0:7, 4*k_cur]
// - B's address stops at B[0, 4*k_cur+1]
// Final loop inside K=4 microkernels.
#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BADDR,8) \
GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BADDR,9) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
// K=4 MKer loop with B memory scattered.
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
" mov "#BELMADDR", "#BADDR" \n\t" \
GEMM_FMLA2_LD1R_G_ELMFWD(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
\
GEMM_FMLA2_LD1R_G_ELMFWD(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
GEMM_FMLA2_LD1R_G_ELMFWD(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT)
// Iterations 2-4 of the generic-strided K=4 chain: identical loop body,
// but the BV argument list is rotated by 2, 4 and 6 positions respectively
// to match the register-residence pattern left behind by the previous
// iteration (see the exit-state note on ..._G_1 above).
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
 GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BELMADDR,BRSBIT,BCSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
 GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BELMADDR,BRSBIT,BCSBIT)
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
 GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BELMADDR,BRSBIT,BCSBIT)
// Residual (final) member of the generic-strided K=4 chain. Columns 0-1
// consume the resident BV6/BV7 values while those two registers are
// refilled with the current row's last two elements (used immediately by
// columns 8-9); the row base is then advanced and the remaining columns
// are finished with plain FMLAs - no elements of any later row are read.
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
 GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
 GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
" mov "#BELMADDR", "#BADDR" \n\t" \
 GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
 GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
 GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
 GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
 GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
 GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
 GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
 GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
// Zero all 20 accumulator vectors (10 C columns x 2 vector halves).
#define CLEAR_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19) \
 CLEAR_COL4(Z00,Z01,Z02,Z03) \
 CLEAR_COL4(Z04,Z05,Z06,Z07) \
 CLEAR_COL4(Z08,Z09,Z10,Z11) \
 CLEAR_COL4(Z12,Z13,Z14,Z15) \
 CLEAR_COL4(Z16,Z17,Z18,Z19)
// Multiply all 20 accumulator vectors by the broadcast factor ZFACTOR
// (used to apply alpha before the beta*C update).
#define SCALE_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19,ZFACTOR) \
 SCALE_COL4(Z00,Z01,Z02,Z03,ZFACTOR) \
 SCALE_COL4(Z04,Z05,Z06,Z07,ZFACTOR) \
 SCALE_COL4(Z08,Z09,Z10,Z11,ZFACTOR) \
 SCALE_COL4(Z12,Z13,Z14,Z15,ZFACTOR) \
 SCALE_COL4(Z16,Z17,Z18,Z19,ZFACTOR)
// Fused multiply-add of five C column pairs into the accumulators:
// Z := Z * ZSCALE + C per column pair (see GEMM_CCOL_FMAD).
#define GEMM_C_FMAD_UKER(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE) \
 GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
 GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
 GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
 GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
 GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE)
// Load five contiguous C columns (two vector halves each), advancing
// CADDR by the column stride CCS after each column.
#define GEMM_C_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
// Store five contiguous C columns, advancing CADDR by CCS after each.
#define GEMM_C_STORE_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
// Pipelined variant: after each column's FMAD consumes its C registers,
// those same registers are immediately refilled with the next batch of C
// data from CADDR, overlapping load latency with compute.
#define GEMM_C_FMAD_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,CADDR,CCS) \
 GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C0FH,C0LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C1FH,C1LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C2FH,C2LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C3FH,C3LH,PFH,PLH,CADDR,CCS) \
 GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
 GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C4FH,C4LH,PFH,PLH,CADDR,CCS)
// Generic-storage counterparts of the *_UKER_C macros above: C columns
// are accessed with gather loads / scatter stores through the index
// vector ZIDX (element offsets scaled per the OFFS mode), using CTEMP as
// scratch for the second vector half's base (CADDR + CVSKIP).
#define GEMM_C_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_GATHER_LOAD_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_GATHER_LOAD_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_GATHER_LOAD_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_GATHER_LOAD_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_GATHER_LOAD_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
// Scatter-store five C columns, advancing CADDR by CCS after each.
#define GEMM_C_STORE_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_SCATTER_STORE_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_SCATTER_STORE_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_SCATTER_STORE_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_SCATTER_STORE_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_SCATTER_STORE_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
// Pipelined FMAD + gather-reload, mirroring GEMM_C_FMAD_LOAD_UKER_C.
#define GEMM_C_FMAD_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,ZIDX,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
 GEMM_CCOL_GATHER_LOAD_FWD(C0FH,C0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
 GEMM_CCOL_GATHER_LOAD_FWD(C1FH,C1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
 GEMM_CCOL_GATHER_LOAD_FWD(C2FH,C2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
 GEMM_CCOL_GATHER_LOAD_FWD(C3FH,C3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
 GEMM_CCOL_GATHER_LOAD_FWD(C4FH,C4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)

View File

@@ -0,0 +1,123 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Zero two SVE vector registers (element type given by DT, which the
// including header defines as "d"/"s"/"h").
#define CLEAR_COL2(Z0,Z1) \
" dup "#Z0"."DT", #0 \n\t" \
" dup "#Z1"."DT", #0 \n\t"
// Zero four SVE vector registers.
#define CLEAR_COL4(Z0,Z1,Z2,Z3) \
 CLEAR_COL2(Z0,Z1) \
 CLEAR_COL2(Z2,Z3)
// Multiply two SVE vector registers elementwise by the broadcast factor
// ZFACTOR (unpredicated fmul; element type given by DT).
#define SCALE_COL2(Z0,Z1,ZFACTOR) \
" fmul "#Z0"."DT", "#Z0"."DT", "#ZFACTOR"."DT" \n\t" \
" fmul "#Z1"."DT", "#Z1"."DT", "#ZFACTOR"."DT" \n\t"
/* NOTE: the trailing line-continuation backslash formerly ending the last
   fmul line above has been removed - it would splice the next #define
   into SCALE_COL2's replacement list, leaving SCALE_COL4 undefined. */
// Multiply four SVE vector registers by ZFACTOR.
#define SCALE_COL4(Z0,Z1,Z2,Z3,ZFACTOR) \
 SCALE_COL2(Z0,Z1,ZFACTOR) \
 SCALE_COL2(Z2,Z3,ZFACTOR)
// Prefetch or not.
// Token-paste dispatch: callers append a mode token (noprfm/prfm) to the
// macro name; the noprfm variant expands to nothing.
#define PREFETCH_CONTIGUOUS_noprfm(LV,PROP,ADDR,SHIFT)
#define PREFETCH_CONTIGUOUS_prfm(LV,PROP,ADDR,SHIFT) \
" prfm PLD"#LV""#PROP", ["#ADDR", "#SHIFT"] \n\t"
// Accumulate one A column (two vector halves) times the broadcast B
// element BV into one C column pair, under predicate PT.
#define GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
" fmla "#CCOLFH"."DT", "#PT"/m, "#ACOLFH"."DT", "#BV"."DT" \n\t" /* A Row 0 :VL */ \
" fmla "#CCOLLH"."DT", "#PT"/m, "#ACOLLH"."DT", "#BV"."DT" \n\t" /* A Row VL:2VL */
// FMLA pair, then refill BV by broadcast-loading the B element at
// byte offset NSHIFT*SZ from BADDR (contiguous-B case).
#define GEMM_FMLA2_LD1R(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BADDR,NSHIFT) \
 GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
" "LD1R" "#BV"."DT", "#PT"/z, ["#BADDR", #"#NSHIFT"*"SZ"]\n\t"
// FMLA pair, then refill BV from the element pointer BELMADDR and step
// that pointer by the B column stride BCSBIT (generic-stride case).
#define GEMM_FMLA2_LD1R_G_ELMFWD(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BELMADDR,BCSBIT) \
 GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
" "LD1R" "#BV"."DT", "#PT"/z, ["#BELMADDR"] \n\t" /* Load B */ \
" add "#BELMADDR", "#BELMADDR", "#BCSBIT" \n\t" /* Forward B element */
// Load one A column as two consecutive vectors: first half at AADDR,
// second half one vector-length further ("#1, mul vl").
#define GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
" "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR"] \n\t" \
" "LD1" "#ZLH"."DT", "#PLH"/z, ["#AADDR", #1, mul vl]\n\t"
// Gather-load one A column: first half via index vector ZIDX based at
// AADDR, second half based at AADDR + AVSKIP (ATEMP used as scratch).
#define GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
" "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
" add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
" "LD1" "#ZLH"."DT", "#PLH"/z, ["#ATEMP", "#ZIDX"."DT", "OFFS"]\n\t"
// Prefetch or not.
// Gather-prefetch counterparts, selected by pasting noprfm/prfm.
#define GEMM_ACOL_GATHER_noprfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
#define GEMM_ACOL_GATHER_prfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
" "PRFG" PLD"#LV""#PROP", "#PFH", ["#AADDR", "#ZIDX"."DT", "OFFS"] \n\t" \
" add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
" "PRFG" PLD"#LV""#PROP", "#PLH", ["#ATEMP", "#ZIDX"."DT", "OFFS"] \n\t"
// Advance A to the next column and load it; optionally L1-prefetch the
// column A4KS bytes ahead of the ORIGINAL address (PREFMODE selects
// prfm/noprfm). A4KS presumably spans 4 k-iterations - named after it.
#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(ZFH,ZLH,PFH,PLH,AADDR,A4KS,ACS,ATEMP,PREFMODE) \
" add "#ATEMP", "#AADDR", "#A4KS" \n\t" \
" add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \
 GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
 PREFETCH_CONTIGUOUS_ ##PREFMODE(L1,STRM,ATEMP,0)
// Generic-stride analogue: gather-prefetch at AADDR+A4KS into L1 and at
// AADDR+APS into L2 (each independently enabled), then advance AADDR by
// the column stride ACS and gather-load the new column.
#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,A4KS,APS,ACS,AVSKIP,ATEMP,PREFMODEL1,PREFMODEL2) \
" add "#ATEMP", "#AADDR", "#A4KS" \n\t" \
 GEMM_ACOL_GATHER_ ##PREFMODEL1(L1,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
" add "#ATEMP", "#AADDR", "#APS" \n\t" \
 GEMM_ACOL_GATHER_ ##PREFMODEL2(L2,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
" add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \
 GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
// Load one contiguous C column (two halves) and advance CADDR by the
// column stride CCS.
#define GEMM_CCOL_CONTIGUOUS_LOAD_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
 GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,CADDR) \
" add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (load) to next column. */
// Store one contiguous C column (two halves) and advance CADDR by CCS.
#define GEMM_CCOL_CONTIGUOUS_STORE_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR"] \n\t" \
" "ST1" "#ZLH"."DT", "#PLH", ["#CADDR", #1, mul vl] \n\t" \
" add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (store) to next column. */
// Scale-and-accumulate one C column pair: Z := Z * ZSCALE + C (fmad,
// merging-predicated on PFH/PLH).
#define GEMM_CCOL_FMAD(ZFH,ZLH,PFH,PLH,CFH,CLH,ZSCALE) \
" fmad "#ZFH"."DT", "#PFH"/m, "#ZSCALE"."DT", "#CFH"."DT" \n\t" \
" fmad "#ZLH"."DT", "#PLH"/m, "#ZSCALE"."DT", "#CLH"."DT" \n\t"
// Gather-load one C column via index vector ZIDX, then advance CADDR.
#define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
 GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CVSKIP,CTEMP) \
" add "#CADDR", "#CADDR", "#CCS" \n\t"
// Scatter-store one C column: first half based at CADDR, second half at
// CADDR + CVSKIP (via CTEMP), then advance CADDR by CCS.
#define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
" "ST1" "#ZLH"."DT", "#PLH", ["#CTEMP", "#ZIDX"."DT", "OFFS"]\n\t" \
" add "#CADDR", "#CADDR", "#CCS" \n\t"

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Specify to use double precision.
// Mnemonic fragments that specialize the shared armsve_asm_macros.h
// templates for 64-bit (double) elements.
#define DT "d"             // SVE element-size suffix.
#define LD1 "ld1d"         // Contiguous/gather load mnemonic.
#define ST1 "st1d"         // Contiguous/scatter store mnemonic.
#define LD1R "ld1rd"       // Load-and-broadcast mnemonic.
#define PRFG "prfd"        // Gather-prefetch mnemonic.
#define SZ "8"             // sizeof(double), for immediate offsets.
#define OFFS "lsl #3"      // Index-to-byte scaling in gather/scatter.
// Include macros.
#include "armsve_asm_macros.h"

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Specify to use half precision.
// Mnemonic fragments that specialize the shared armsve_asm_macros.h
// templates for 16-bit (half) elements.
#define DT "h"             // SVE element-size suffix.
#define LD1 "ld1h"         // Contiguous load mnemonic.
#define ST1 "st1h"         // Contiguous store mnemonic.
#define LD1R "ld1rh"       // Load-and-broadcast mnemonic.
#define PRFG "prfh"        // Prefetch mnemonic.
#define SZ "2"             // sizeof(half), for immediate offsets.
// OFFS is deliberately left undefined: the gather/scatter (OFFS-using)
// macros are unavailable for half precision.
// #define OFFS UNSUPPORTED
// Include macros.
#include "armsve_asm_macros.h"

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Specify to use single precision.
// Mnemonic fragments that specialize the shared armsve_asm_macros.h
// templates for 32-bit (float) elements.
#define DT "s"             // SVE element-size suffix.
#define LD1 "ld1w"         // Contiguous/gather load mnemonic.
#define ST1 "st1w"         // Contiguous/scatter store mnemonic.
#define LD1R "ld1rw"       // Load-and-broadcast mnemonic.
#define PRFG "prfw"        // Gather-prefetch mnemonic.
#define SZ "4"             // sizeof(float), for immediate offsets.
#define OFFS "uxtw #2"     // Index-to-byte scaling in gather/scatter.
// Include macros.
#include "armsve_asm_macros.h"

View File

@@ -0,0 +1,318 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Forschungszentrum Juelich
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Double-precision composite instructions.
#include "armsve_asm_macros_double.h"
// 2vx10 microkernels.
#include "armsve_asm_2vx10.h"
// Double-precision GEMM microkernel for Arm SVE with a (2*VL) x 10 register
// tile, using unindexed (base + immediate offset) loads for B.
// Computes C := beta*C + alpha * A*B over k0 rank-1 updates, where A is a
// packed (2*VL)-row panel and B a packed 10-column panel. The k-loop is
// unrolled by 4 (k_mker iterations) with a scalar cleanup loop (k_left).
// C may be stored contiguously in columns (rs_c == 1) or with generic
// strides, selected at run time (WRITE_MEM_C vs. WRITE_MEM_G paths).
//
// Fix vs. previous revision: the inline asm writes predicate p0 (ptrue)
// and reads/writes the A/B/C panels through pointers, so "p0" and
// "memory" are now declared in the clobber list; without them the
// compiler is free to cache C data or a predicate value across the asm.
void bli_dgemm_armsve_asm_2vx10_unindexed
     (
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
  void* a_next = bli_auxinfo_next_a( data );
  void* b_next = bli_auxinfo_next_b( data );

  // Typecast local copies of integers in case dim_t and inc_t are a
  // different size than is expected by load instructions.
  uint64_t k_mker = k0 / 4; // Number of unrolled-by-4 k-iterations.
  uint64_t k_left = k0 % 4; // Leftover k-iterations (0-3).
  uint64_t rs_c   = rs_c0;
  uint64_t cs_c   = cs_c0;

  // Register map inside the asm block:
  //   x0/x1  - A/B panel pointers;  x2 - cs_a, x3 - rs_b (both in bytes);
  //   x5     - C pointer; x6 - rs_c (elements); x7 - cs_c (bytes);
  //   x4/x8  - loop counters, later alpha/beta bit patterns;
  //   z0-z19 - C accumulators (10 columns x 2 halves);
  //   z20-z29 - B broadcasts / C staging; z28-z31 - A columns, alpha/beta.
  __asm__ volatile (
" ldr x0, %[a] \n\t"
" ldr x1, %[b] \n\t"
" mov x2, xzr \n\t"
" incd x2, ALL, MUL #2 \n\t" // Column-skip of A (2*VL doubles).
" mov x3, #10 \n\t" // Row-skip of B.
" \n\t"
" ldr x5, %[c] \n\t"
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
#ifdef _A64FX
// Set A64FX sector-cache tag bits (bits 56-57 of the address).
" mov x8, 0x3 \n\t" // Tag C address.
" lsl x8, x8, #56 \n\t"
" orr x5, x5, x8 \n\t"
" mov x8, 0x2 \n\t" // Tag B address.
" lsl x8, x8, #56 \n\t"
" orr x1, x1, x8 \n\t"
" mov x8, 0x1 \n\t" // Tag A address.
" lsl x8, x8, #56 \n\t"
" orr x0, x0, x8 \n\t"
#endif
" \n\t"
" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
" madd x2, x8, x2, xzr \n\t" // cs_a
" madd x3, x8, x3, xzr \n\t" // rs_b
" madd x7, x8, x7, xzr \n\t" // cs_c
" ptrue p0.d \n\t"
" \n\t"
" ldr x4, %[k_mker] \n\t" // Number of loops.
" ldr x8, %[k_left] \n\t"
" \n\t"
// Preload the first 8 B elements and the first A column, so the
// microkernel chain always multiplies with register-resident values.
" LOAD_ABC: \n\t"
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
" b.eq END_CCOL_PRFM \n\t"
" ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rd z21.d, p0/z, [x1, 8] \n\t"
" ld1rd z22.d, p0/z, [x1, 16] \n\t"
" ld1rd z23.d, p0/z, [x1, 24] \n\t"
" ld1rd z24.d, p0/z, [x1, 32] \n\t"
" ld1rd z25.d, p0/z, [x1, 40] \n\t"
" ld1rd z26.d, p0/z, [x1, 48] \n\t"
" ld1rd z27.d, p0/z, [x1, 56] \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
" \n\t"
// Prefetch all 10 C columns, but only when C columns are contiguous.
" CCOL_PRFM: \n\t"
" cmp x6, #1 \n\t"
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
" mov x16, x5 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" END_CCOL_PRFM: \n\t"
" \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
" \n\t"
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
" b.eq K_LEFT_LOOP \n\t"
" \n\t"
// Main loop: 4 rank-1 updates per iteration, double-buffering A columns
// between z28/z29 and z30/z31.
" K_MKER_LOOP: \n\t"
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" b K_MKER_LOOP \n\t"
" \n\t"
// Last unrolled iteration issues no further preloads.
" FIN_MKER_LOOP: \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
" \n\t"
// Scalar cleanup loop: one rank-1 update per iteration.
" K_LEFT_LOOP: \n\t"
" cmp x8, #0 \n\t" // End of execution.
" b.eq WRITE_MEM_PREP \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
" ld1rd z20.d, p0/z, [x1] \n\t" // Load all 10 elements of current B row.
" ld1rd z21.d, p0/z, [x1, 8] \n\t"
" ld1rd z22.d, p0/z, [x1, 16] \n\t"
" ld1rd z23.d, p0/z, [x1, 24] \n\t"
" ld1rd z24.d, p0/z, [x1, 32] \n\t"
" ld1rd z25.d, p0/z, [x1, 40] \n\t"
" ld1rd z26.d, p0/z, [x1, 48] \n\t"
" ld1rd z27.d, p0/z, [x1, 56] \n\t"
" ld1rd z28.d, p0/z, [x1, 64] \n\t"
" ld1rd z29.d, p0/z, [x1, 72] \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add x0, x0, x2 \n\t" // Forward A.
" add x1, x1, x3 \n\t" // Forward B.
" sub x8, x8, #1 \n\t"
" b K_LEFT_LOOP \n\t" // Next column / row.
" \n\t"
" WRITE_MEM_PREP: \n\t"
" \n\t"
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
" ldr x8, %[beta] \n\t"
" ldr x4, [x4] \n\t" // Load alpha & beta (value, as raw bits).
" ldr x8, [x8] \n\t"
" dup z30.d, x4 \n\t" // Broadcast alpha & beta into vectors.
" dup z31.d, x8 \n\t"
" fmov d28, #1.0 \n\t" // Prepare FP 1.0 bit pattern for the
" fmov x16, d28 \n\t" //  alpha==1.0 bitwise comparison below.
" \n\t"
" PREFETCH_ABNEXT: \n\t"
" ldr x0, %[a_next] \n\t"
" ldr x1, %[b_next] \n\t"
#ifdef _A64FX
" mov x8, 0x2 \n\t" // Tag B address.
" lsl x8, x8, #56 \n\t"
" orr x1, x1, x8 \n\t"
" mov x8, 0x1 \n\t" // Tag A address.
" lsl x8, x8, #56 \n\t"
" orr x0, x0, x8 \n\t"
#endif
" prfm PLDL1STRM, [x0] \n\t"
" prfm PLDL1STRM, [x0, 256*1] \n\t"
// " prfm PLDL2KEEP, [x0, 256*2] \n\t"
// " prfm PLDL2KEEP, [x0, 256*3] \n\t"
// " prfm PLDL2KEEP, [x0, 256*4] \n\t"
// " prfm PLDL2KEEP, [x0, 256*5] \n\t"
// " prfm PLDL2KEEP, [x0, 256*6] \n\t"
// " prfm PLDL2KEEP, [x0, 256*7] \n\t"
// " prfm PLDL2KEEP, [x0, 256*8] \n\t"
// " prfm PLDL2KEEP, [x0, 256*9] \n\t"
// " prfm PLDL2KEEP, [x0, 256*10] \n\t"
// " prfm PLDL2KEEP, [x0, 256*11] \n\t"
// " prfm PLDL2KEEP, [x0, 256*12] \n\t"
// " prfm PLDL2KEEP, [x0, 256*13] \n\t"
// " prfm PLDL2KEEP, [x0, 256*14] \n\t"
// " prfm PLDL2KEEP, [x0, 256*15] \n\t"
" prfm PLDL1STRM, [x1] \n\t"
" prfm PLDL1STRM, [x1, 256*1] \n\t"
// " prfm PLDL2KEEP, [x1, 256*2] \n\t"
// " prfm PLDL2KEEP, [x1, 256*3] \n\t"
// " prfm PLDL2KEEP, [x1, 256*4] \n\t"
// " prfm PLDL2KEEP, [x1, 256*5] \n\t"
// " prfm PLDL2KEEP, [x1, 256*6] \n\t"
// " prfm PLDL2KEEP, [x1, 256*7] \n\t"
// " prfm PLDL2KEEP, [x1, 256*8] \n\t"
// " prfm PLDL2KEEP, [x1, 256*9] \n\t"
" \n\t"
" mov x9, x5 \n\t" // C address for loading.
" \n\t" // C address for storing is x5 itself.
" cmp x6, #1 \n\t" // Preload first half of C for contiguous case.
" b.ne WRITE_MEM \n\t"
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
" \n\t"
" WRITE_MEM: \n\t"
" \n\t"
" cmp x16, x4 \n\t" // Skip alpha-scaling when alpha == 1.0 (bitwise).
" b.eq UNIT_ALPHA \n\t"
" \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
" \n\t"
" UNIT_ALPHA: \n\t"
" cmp x6, #1 \n\t"
" b.ne WRITE_MEM_G \n\t"
" \n\t"
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-29].
// First half of C is already loaded in this case.
GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7)
" \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
" b END_WRITE_MEM \n\t"
" \n\t"
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov x8, xzr \n\t"
" incb x8 \n\t"
" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip.
" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
" \n\t"
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
" \n\t"
" END_WRITE_MEM: \n\t"
" b END_EXEC \n\t"
" \n\t"
" END_ERROR: \n\t" // NOTE(review): unreachable - nothing branches here.
" mov x0, #1 \n\t" // Return error.
" END_EXEC: \n\t"
" mov x0, #0 \n\t" // Return normal.
:
: [a] "m" (a),
  [b] "m" (b),
  [c] "m" (c),
  [rs_c] "m" (rs_c),
  [cs_c] "m" (cs_c),
  [k_mker] "m" (k_mker),
  [k_left] "m" (k_left),
  [alpha] "m" (alpha),
  [beta] "m" (beta),
  [a_next] "m" (a_next),
  [b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
  "x9","x16",
  "z0","z1","z2","z3","z4","z5","z6","z7",
  "z8","z9","z10","z11","z12","z13","z14","z15",
  "z16","z17","z18","z19",
  "z20","z21","z22","z23",
  "z24","z25","z26","z27",
  "z28","z29","z30","z31",
  "p0",     // Written by ptrue inside the asm.
  "memory"  // A/B/C panels are read/written through pointer operands.
 );
}

View File

@@ -0,0 +1,307 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Copyright (C) 2019, Forschungszentrum Juelich
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Single-precision composite instructions.
#include "armsve_asm_macros_single.h"
// 2vx10 microkernels.
#include "armsve_asm_2vx10.h"
// Single-precision (fp32) GEMM microkernel for Arm SVE, "unindexed" 2vx10
// variant: computes C := beta*C + alpha * A*B on a (2*VL) x 10 microtile,
// where VL is the SVE vector length in 32-bit lanes (A's column skip below
// is two vectors, B's row skip is the fixed 10). The k dimension is
// unrolled by 4 (k_mker) with a one-column remainder loop (k_left).
//
// Register roles inside the asm (see also the inline comments):
//   x0/x1   current A / B addresses        x2/x3  cs_a / rs_b in bytes
//   x4/x8   k_mker / k_left counters, later alpha / beta bit patterns
//   x5      C address                      x6/x7  rs_c (elements) / cs_c (bytes)
//   z0-z19  C accumulators                 z20-z31 A/B operands and scratch
void bli_sgemm_armsve_asm_2vx10_unindexed
(
dim_t k0,
float* restrict alpha,
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c0, inc_t cs_c0,
auxinfo_t* restrict data,
cntx_t* restrict cntx
)
{
// Addresses of the next micropanels of A and B, used only for L2 prefetch.
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
uint64_t k_mker = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile (
" ldr x0, %[a] \n\t"
" ldr x1, %[b] \n\t"
" mov x2, xzr \n\t"
" incw x2, ALL, MUL #2 \n\t" // Column-skip of A.
" mov x3, #10 \n\t" // Row-skip of B.
" \n\t"
" ldr x5, %[c] \n\t"
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
// On A64FX, set sector-cache tag bits (top byte) of the A/B/C addresses.
#ifdef _A64FX
" mov x8, 0x3 \n\t" // Tag C address.
" lsl x8, x8, #56 \n\t"
" orr x5, x5, x8 \n\t"
" mov x8, 0x2 \n\t" // Tag B address.
" lsl x8, x8, #56 \n\t"
" orr x1, x1, x8 \n\t"
" mov x8, 0x1 \n\t" // Tag A address.
" lsl x8, x8, #56 \n\t"
" orr x0, x0, x8 \n\t"
#endif
" \n\t"
" mov x8, #4 \n\t" // Multiply some address skips by sizeof(float).
" madd x2, x8, x2, xzr \n\t" // cs_a
" madd x3, x8, x3, xzr \n\t" // rs_b
" madd x7, x8, x7, xzr \n\t" // cs_c
" ptrue p0.s \n\t"
" \n\t"
" ldr x4, %[k_mker] \n\t" // Number of loops.
" ldr x8, %[k_left] \n\t"
" \n\t"
// Preload the first B row (8 of its 10 scalars) and the first A column
// so the unrolled k-loop can start with operands already in registers.
" LOAD_ABC: \n\t"
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
" b.eq END_CCOL_PRFM \n\t"
" ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rw z21.s, p0/z, [x1, 4] \n\t"
" ld1rw z22.s, p0/z, [x1, 8] \n\t"
" ld1rw z23.s, p0/z, [x1, 12] \n\t"
" ld1rw z24.s, p0/z, [x1, 16] \n\t"
" ld1rw z25.s, p0/z, [x1, 20] \n\t"
" ld1rw z26.s, p0/z, [x1, 24] \n\t"
" ld1rw z27.s, p0/z, [x1, 28] \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
" \n\t"
// Prefetch all 10 columns of C, but only when C is column-stored
// (rs_c == 1); the generic-stride path gains nothing from it.
" CCOL_PRFM: \n\t"
" cmp x6, #1 \n\t"
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
" mov x16, x5 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" END_CCOL_PRFM: \n\t"
" \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
" \n\t"
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
" b.eq K_LEFT_LOOP \n\t"
" \n\t"
// Main k-loop, unrolled by 4. A-column loads double-buffer between
// z28/z29 and z30/z31 so each FMLA block overlaps the next load.
" K_MKER_LOOP: \n\t"
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" b K_MKER_LOOP \n\t"
" \n\t"
" FIN_MKER_LOOP: \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
" \n\t"
// Remainder loop: one rank-1 update per iteration, full 10-element B row.
" K_LEFT_LOOP: \n\t"
" cmp x8, #0 \n\t" // End of execution.
" b.eq WRITE_MEM_PREP \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
" ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rw z21.s, p0/z, [x1, 4] \n\t"
" ld1rw z22.s, p0/z, [x1, 8] \n\t"
" ld1rw z23.s, p0/z, [x1, 12] \n\t"
" ld1rw z24.s, p0/z, [x1, 16] \n\t"
" ld1rw z25.s, p0/z, [x1, 20] \n\t"
" ld1rw z26.s, p0/z, [x1, 24] \n\t"
" ld1rw z27.s, p0/z, [x1, 28] \n\t"
" ld1rw z28.s, p0/z, [x1, 32] \n\t"
" ld1rw z29.s, p0/z, [x1, 36] \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add x0, x0, x2 \n\t" // Forward A.
" add x1, x1, x3 \n\t" // Forward B.
" sub x8, x8, #1 \n\t"
" b K_LEFT_LOOP \n\t" // Next column / row.
" \n\t"
// Load alpha/beta: bit patterns into w4/w8 (for the alpha==1 test and
// beta restore) and broadcast vectors into z30/z31 (for scaling).
" WRITE_MEM_PREP: \n\t"
" \n\t"
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
" ldr x8, %[beta] \n\t"
" ldr w4, [x4] \n\t" // Load alpha & beta (value).
" ldr w8, [x8] \n\t"
" dup z30.s, w4 \n\t" // Broadcast alpha & beta into vectors.
" dup z31.s, w8 \n\t"
" \n\t"
" PREFETCH_ABNEXT: \n\t"
" ldr x0, %[a_next] \n\t"
" ldr x1, %[b_next] \n\t"
" prfm PLDL2KEEP, [x0] \n\t"
" prfm PLDL2KEEP, [x0, 256*1] \n\t"
" prfm PLDL2KEEP, [x0, 256*2] \n\t"
" prfm PLDL2KEEP, [x0, 256*3] \n\t"
" prfm PLDL2KEEP, [x0, 256*4] \n\t"
" prfm PLDL2KEEP, [x0, 256*5] \n\t"
" prfm PLDL2KEEP, [x0, 256*6] \n\t"
" prfm PLDL2KEEP, [x0, 256*7] \n\t"
" prfm PLDL2KEEP, [x0, 256*8] \n\t"
" prfm PLDL2KEEP, [x0, 256*9] \n\t"
" prfm PLDL2KEEP, [x0, 256*10] \n\t"
" prfm PLDL2KEEP, [x0, 256*11] \n\t"
" prfm PLDL2KEEP, [x0, 256*12] \n\t"
" prfm PLDL2KEEP, [x0, 256*13] \n\t"
" prfm PLDL2KEEP, [x0, 256*14] \n\t"
" prfm PLDL2KEEP, [x0, 256*15] \n\t"
" prfm PLDL2KEEP, [x1] \n\t"
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
" \n\t"
// Skip the alpha scaling when alpha's bit pattern equals 1.0f.
" WRITE_MEM: \n\t"
" \n\t"
" fmov s28, #1.0 \n\t"
" fmov w16, s28 \n\t"
" cmp w16, w4 \n\t"
" b.eq UNIT_ALPHA \n\t"
" \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
" \n\t"
" UNIT_ALPHA: \n\t"
" mov x9, x5 \n\t" // C address for loading.
" \n\t" // C address for storing is x5 itself.
" cmp x6, #1 \n\t"
" b.ne WRITE_MEM_G \n\t"
" \n\t"
// Contiguous (column-stored C) write-back: load C, fuse beta*C + acc,
// store, 5 columns at a time.
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-29].
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7)
" \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
" b END_WRITE_MEM \n\t"
" \n\t"
// Generic-stride write-back: gather/scatter with z30 as the index vector.
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov x8, xzr \n\t"
" incb x8 \n\t"
" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip.
" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
" \n\t"
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
" \n\t"
" END_WRITE_MEM: \n\t"
" b END_EXEC \n\t"
" \n\t"
" END_ERROR: \n\t"
" mov x0, #1 \n\t" // Return error.
" END_EXEC: \n\t"
" mov x0, #0 \n\t" // Return normal.
:
: [a] "m" (a),
[b] "m" (b),
[c] "m" (c),
[rs_c] "m" (rs_c),
[cs_c] "m" (cs_c),
[k_mker] "m" (k_mker),
[k_left] "m" (k_left),
[alpha] "m" (alpha),
[beta] "m" (beta),
[a_next] "m" (a_next),
[b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
"x9","x16",
"z0","z1","z2","z3","z4","z5","z6","z7",
"z8","z9","z10","z11","z12","z13","z14","z15",
"z16","z17","z18","z19",
"z20","z21","z22","z23",
"z24","z25","z26","z27",
"z28","z29","z30","z31"
);
}

View File

@@ -0,0 +1,343 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Copyright (C) 2019, Forschungszentrum Juelich
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Half-precision composite instructions.
#include "armsve_asm_macros_half.h"
// 2vx10 microkernels.
#include "armsve_asm_2vx10.h"
// Gather-load / scatter-store instruction for half-precision
// needs being defined separately.
//
// Rationale: SVE gathers/scatters address 32-bit (.s) lanes, so the
// generic-stride C path emulates a 16-bit gather with LD1H into .s lanes
// (each halfword zero-extended), a second gather offset by CRS2, a REVH
// that moves the second stream's halfwords into the upper half of each
// lane, and an FADD that merges the two disjoint halfword streams into
// one .h vector. x28 is used as an address scratch register.
// NOTE(review): callers must pass CRS2 = row skip covering two C rows —
// confirm against the WRITE_MEM_G sites below.
#undef GEMM_CCOL_GATHER_LOAD_FWD
#undef GEMM_CCOL_SCATTER_STORE_FWD
// Load one C column into the half-vector pair ZFH/ZLH (ZIDX2 = lane index
// vector, CVSKIP = skip to the second half-vector), then advance CADDR by
// the column skip CCS.
#define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \
" add x28, "#CADDR", "#CRS2" \n\t" \
" ld1h z31.s, "#PT"/z, ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \
" ld1h "#ZFH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1] \n\t" \
" revh "#ZFH".s, "#PT"/m, "#ZFH".s \n\t" \
" fadd "#ZFH".h, "#ZFH".h, z31.h \n\t" \
" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
" add x28, "#CTEMP", "#CRS2" \n\t" \
" ld1h z31.s, "#PT"/z, ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \
" ld1h "#ZLH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1] \n\t" \
" revh "#ZLH".s, "#PT"/m, "#ZLH".s \n\t" \
" fadd "#ZLH".h, "#ZLH".h, z31.h \n\t" \
" add "#CADDR", "#CADDR", "#CCS" \n\t"
// Inverse of the above: scatter the even halfwords of ZFH/ZLH, REVH to
// expose the odd halfwords, scatter those at CRS2 offset, then advance
// CADDR by CCS. Note: REVH destroys ZFH/ZLH in the process.
#define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \
" add x28, "#CADDR", "#CRS2" \n\t" \
" st1h "#ZFH".s, "#PT", ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \
" revh "#ZFH".s, "#PT"/m, "#ZFH".s \n\t" \
" st1h "#ZFH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1] \n\t" \
" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
" add x28, "#CTEMP", "#CRS2" \n\t" \
" st1h "#ZLH".s, "#PT", ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \
" revh "#ZLH".s, "#PT"/m, "#ZLH".s \n\t" \
" st1h "#ZLH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1] \n\t" \
" add "#CADDR", "#CADDR", "#CCS" \n\t"
// Half-precision (fp16) GEMM microkernel for Arm SVE, "unindexed" 2vx10
// variant: computes C := beta*C + alpha * A*B on a (2*VL) x 10 microtile,
// where VL is the SVE vector length in 16-bit lanes. Structure mirrors the
// fp32 kernel: k unrolled by 4 (k_mker) plus a remainder loop (k_left),
// contiguous or gather/scatter write-back depending on rs_c.
//
// BUG FIX (WRITE_MEM_PREP): the original code read the alpha/beta bit
// patterns with "fmov w4, h28" / "fmov w8, h29", but h28/h29 still held
// stale A/B panel data at that point — alpha and beta were broadcast into
// z30/z31 by the two ld1rh instructions just above. The stale w4 defeated
// the alpha==1 fast-path test, and the stale w8 corrupted the beta vector
// restores ("dup z31.h, w8") in the generic-stride path. Fixed to read
// h30/h31, the lanes that actually contain alpha and beta.
void bli_shgemm_armsve_asm_2vx10_unindexed
(
dim_t k0,
void* restrict alpha,
void* restrict a,
void* restrict b,
void* restrict beta,
void* restrict c, inc_t rs_c0, inc_t cs_c0,
auxinfo_t* restrict data,
cntx_t* restrict cntx
)
{
// Addresses of the next micropanels of A and B, used only for L2 prefetch.
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
uint64_t k_mker = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile (
" ldr x0, %[a] \n\t"
" ldr x1, %[b] \n\t"
" mov x2, xzr \n\t"
" inch x2, ALL, MUL #2 \n\t" // Column-skip of A.
" mov x3, #10 \n\t" // Row-skip of B.
" \n\t"
" ldr x5, %[c] \n\t"
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
// On A64FX, set sector-cache tag bits (top byte) of the A/B/C addresses.
#ifdef _A64FX
" mov x8, 0x3 \n\t" // Tag C address.
" lsl x8, x8, #56 \n\t"
" orr x5, x5, x8 \n\t"
" mov x8, 0x2 \n\t" // Tag B address.
" lsl x8, x8, #56 \n\t"
" orr x1, x1, x8 \n\t"
" mov x8, 0x1 \n\t" // Tag A address.
" lsl x8, x8, #56 \n\t"
" orr x0, x0, x8 \n\t"
#endif
" \n\t"
" mov x8, #2 \n\t" // Multiply some address skips by sizeof(float16_t).
" madd x2, x8, x2, xzr \n\t" // cs_a
" madd x3, x8, x3, xzr \n\t" // rs_b
" madd x7, x8, x7, xzr \n\t" // cs_c
" ptrue p0.b \n\t"
" \n\t"
" ldr x4, %[k_mker] \n\t" // Number of loops.
" ldr x8, %[k_left] \n\t"
" \n\t"
// Preload the first B row (8 of its 10 scalars) and the first A column.
" LOAD_ABC: \n\t"
" cmp x4, #0 \n\t" // Don't preload if no microkernel there.
" b.eq END_CCOL_PRFM \n\t"
" ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rh z21.h, p0/z, [x1, 2] \n\t"
" ld1rh z22.h, p0/z, [x1, 4] \n\t"
" ld1rh z23.h, p0/z, [x1, 6] \n\t"
" ld1rh z24.h, p0/z, [x1, 8] \n\t"
" ld1rh z25.h, p0/z, [x1, 10] \n\t"
" ld1rh z26.h, p0/z, [x1, 12] \n\t"
" ld1rh z27.h, p0/z, [x1, 14] \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
" \n\t"
// Prefetch the 10 C columns only when C is column-stored (rs_c == 1).
" CCOL_PRFM: \n\t"
" cmp x6, #1 \n\t"
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
" mov x16, x5 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" add x16, x16, x7 \n\t"
" prfm PLDL1STRM, [x16] \n\t"
" END_CCOL_PRFM: \n\t"
" \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
" \n\t"
" cmp x4, #0 \n\t" // If no 4-microkernel can be applied
" b.eq K_LEFT_LOOP \n\t"
" \n\t"
// Main k-loop, unrolled by 4, double-buffering A columns in z28/29, z30/31.
" K_MKER_LOOP: \n\t"
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" \n\t"
" subs x4, x4, #1 \n\t" // Decrease counter before final replica.
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
" \n\t"
" add x0, x0, x2 \n\t" // Forward A's address to the next column.
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" b K_MKER_LOOP \n\t"
" \n\t"
" FIN_MKER_LOOP: \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
" add x0, x0, x2 \n\t" // Forward A to fill the blank.
" \n\t"
// Remainder loop: one rank-1 update per iteration, full 10-element B row.
" K_LEFT_LOOP: \n\t"
" cmp x8, #0 \n\t" // End of execution.
" b.eq WRITE_MEM_PREP \n\t"
" \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
" ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row.
" ld1rh z21.h, p0/z, [x1, 2] \n\t"
" ld1rh z22.h, p0/z, [x1, 4] \n\t"
" ld1rh z23.h, p0/z, [x1, 6] \n\t"
" ld1rh z24.h, p0/z, [x1, 8] \n\t"
" ld1rh z25.h, p0/z, [x1, 10] \n\t"
" ld1rh z26.h, p0/z, [x1, 12] \n\t"
" ld1rh z27.h, p0/z, [x1, 14] \n\t"
" ld1rh z28.h, p0/z, [x1, 16] \n\t"
" ld1rh z29.h, p0/z, [x1, 18] \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add x0, x0, x2 \n\t" // Forward A.
" add x1, x1, x3 \n\t" // Forward B.
" sub x8, x8, #1 \n\t"
" b K_LEFT_LOOP \n\t" // Next column / row.
" \n\t"
// Load alpha/beta: broadcast vectors into z30/z31, and their halfword
// bit patterns into w4/w8 for the alpha==1 test and beta restore below.
" WRITE_MEM_PREP: \n\t"
" \n\t"
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
" ldr x8, %[beta] \n\t"
" ld1rh z30.h, p0/z, [x4] \n\t" // Load alpha & beta into vectors.
" ld1rh z31.h, p0/z, [x8] \n\t"
" fmov w4, h30 \n\t" // Copy alpha & beta to GP registers.
" fmov w8, h31 \n\t" // (h30/h31 = lane 0 of the broadcasts above.)
" \n\t"
" PREFETCH_ABNEXT: \n\t"
" ldr x0, %[a_next] \n\t"
" ldr x1, %[b_next] \n\t"
" prfm PLDL2KEEP, [x0] \n\t"
" prfm PLDL2KEEP, [x0, 256*1] \n\t"
" prfm PLDL2KEEP, [x0, 256*2] \n\t"
" prfm PLDL2KEEP, [x0, 256*3] \n\t"
" prfm PLDL2KEEP, [x0, 256*4] \n\t"
" prfm PLDL2KEEP, [x0, 256*5] \n\t"
" prfm PLDL2KEEP, [x0, 256*6] \n\t"
" prfm PLDL2KEEP, [x0, 256*7] \n\t"
" prfm PLDL2KEEP, [x0, 256*8] \n\t"
" prfm PLDL2KEEP, [x0, 256*9] \n\t"
" prfm PLDL2KEEP, [x0, 256*10] \n\t"
" prfm PLDL2KEEP, [x0, 256*11] \n\t"
" prfm PLDL2KEEP, [x0, 256*12] \n\t"
" prfm PLDL2KEEP, [x0, 256*13] \n\t"
" prfm PLDL2KEEP, [x0, 256*14] \n\t"
" prfm PLDL2KEEP, [x0, 256*15] \n\t"
" prfm PLDL2KEEP, [x1] \n\t"
" prfm PLDL2KEEP, [x1, 256*1] \n\t"
" prfm PLDL2KEEP, [x1, 256*2] \n\t"
" prfm PLDL2KEEP, [x1, 256*3] \n\t"
" prfm PLDL2KEEP, [x1, 256*4] \n\t"
" prfm PLDL2KEEP, [x1, 256*5] \n\t"
" prfm PLDL2KEEP, [x1, 256*6] \n\t"
" prfm PLDL2KEEP, [x1, 256*7] \n\t"
" prfm PLDL2KEEP, [x1, 256*8] \n\t"
" prfm PLDL2KEEP, [x1, 256*9] \n\t"
" \n\t"
// Skip the alpha scaling when alpha's bit pattern equals fp16 1.0.
" WRITE_MEM: \n\t"
" \n\t"
" fmov h28, #1.0 \n\t"
" fmov w16, h28 \n\t"
" cmp w16, w4 \n\t"
" b.eq UNIT_ALPHA \n\t"
" \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
" \n\t"
" UNIT_ALPHA: \n\t"
" mov x9, x5 \n\t" // C address for loading.
" \n\t" // C address for storing is x5 itself.
" cmp x6, #1 \n\t"
" b.ne WRITE_MEM_G \n\t"
" \n\t"
// Contiguous (column-stored C) write-back path.
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-29].
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7)
" \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
" b END_WRITE_MEM \n\t"
" \n\t"
// Generic-stride write-back using the halfword gather/scatter emulation
// macros defined above; z31 (beta) is clobbered by them and re-broadcast
// from w8 after each load/store group.
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov x10, xzr \n\t"
" incb x10 \n\t"
" madd x10, x10, x6, xzr \n\t" // C-column's logical 1-vector skip.
" mov x28, #2 \n\t"
" madd x6, x28, x6, xzr \n\t" // Double index skip for half-precision case.
" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x9,x7,x10,x16)
" dup z31.h, w8 \n\t" // Restore beta destroyed by loading.
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x9,x7,x10,x16)
" \n\t"
" dup z31.h, w8 \n\t" // Restore beta destroyed by loading.
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x5,x7,x10,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x5,x7,x10,x16)
" \n\t"
" END_WRITE_MEM: \n\t"
" b END_EXEC \n\t"
" \n\t"
" END_ERROR: \n\t"
" mov x0, #1 \n\t" // Return error.
" END_EXEC: \n\t"
" mov x0, #0 \n\t" // Return normal.
:
: [a] "m" (a),
[b] "m" (b),
[c] "m" (c),
[rs_c] "m" (rs_c),
[cs_c] "m" (cs_c),
[k_mker] "m" (k_mker),
[k_left] "m" (k_left),
[alpha] "m" (alpha),
[beta] "m" (beta),
[a_next] "m" (a_next),
[b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
"x9","x16","x10","x28",
"z0","z1","z2","z3","z4","z5","z6","z7",
"z8","z9","z10","z11","z12","z13","z14","z15",
"z16","z17","z18","z19",
"z20","z21","z22","z23",
"z24","z25","z26","z27",
"z28","z29","z30","z31"
);
}

View File

@@ -0,0 +1,450 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Separate instantiation for ArmSVE reference kernels.
// Temporary workaround. Will be removed after upstream has switched to a better way
// of exposing gemmsup interface.
//
// -- Row storage case ---------------------------------------------------------
//
// Reference gemmsup kernel template, row-traversal order: for each element
// c(i,j) (rows outermost), accumulate the dot product ab = a(i,:)*b(:,j)
// over k, then apply c(i,j) := alpha*ab + beta*c(i,j) with fast paths for
// beta == 1 (axpys) and beta == 0 (scal2s, avoids reading garbage C).
// The four branches differ only in where conjugation is applied during
// the accumulation (dots / axpyjs / dotjs), with the conja&&conjb case
// conjugating the finished dot product instead.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
) \
{ \
/* NOTE: This microkernel can actually handle arbitrarily large
values of m, n, and k. */ \
\
if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
PASTEMAC(ch,conjs)( ab ); \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
}
// Instantiate the row-traversal kernel for the basic datatypes
// (bli_?gemmsup_r_armsve_ref2).
INSERT_GENTFUNC_BASIC2( gemmsup_r, _armsve, _ref2 )
//
// -- Column storage case ------------------------------------------------------
//
// Reference gemmsup kernel template, column-traversal order: identical
// arithmetic to the row-storage template above, but iterates columns in
// the outer loop so that column-stored C is written in storage order.
// Per element: ab = a(i,:)*b(:,j) accumulated over k (with conjugation per
// branch), then c(i,j) := alpha*ab + beta*c(i,j) with beta == 1 / beta == 0
// fast paths.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
) \
{ \
/* NOTE: This microkernel can actually handle arbitrarily large
values of m, n, and k. */ \
\
if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
PASTEMAC(ch,conjs)( ab ); \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
}
// Instantiate the column-traversal kernel for the basic datatypes
// (bli_?gemmsup_c_armsve_ref2).
INSERT_GENTFUNC_BASIC2( gemmsup_c, _armsve, _ref2 )

View File

@@ -0,0 +1,528 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include <assert.h>
// Double-precision composite instructions.
#include "../armsve_asm_macros_double.h"
// 2vx10 microkernels.
#include "../armsve_asm_2vx10.h"
// Prototype reference kernel.
GEMMSUP_KER_PROT( double, d, gemmsup_c_armsve_ref2 )
// 2vx10 double-precision GEMMSUP millikernel for the c*c (column-stored)
// case on Arm SVE.  A must be column-contiguous (rs_a0 == 1).  Handles the
// n-dimension edge (n0 % 10) by delegating to the reference kernel, then
// drives an inline-assembly millikernel over panels of 10 columns of B/C
// and 2-vector-wide column strips of A/C.
//
// Fixes vs. previous revision:
//  * The one-shot diagnostic printed "rv called." (copy-pasted from the rv
//    variant); it now correctly identifies this cv kernel.
//  * The "Prefetch 3/4 of A" sequence computed x10+x4 three times, issuing
//    three prefetches for the SAME column; the adds now accumulate so the
//    next three columns are prefetched as the comment intends.  This is a
//    hint-only change — results are unaffected.
void __attribute__ ((noinline,optimize(0))) bli_dgemmsup_cv_armsve_2vx10_unindexed
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
  static int called = 0;
  if ( !called )
  {
    // One-shot debug breadcrumb identifying which sup kernel was selected.
    fprintf(stderr, "cv called.\n");
    called = 1;
  }
  // c*c requires A to be stored in columns.
  assert( rs_a0 == 1 );

  dim_t n0_mker = n0 / 10;
  dim_t n0_left = n0 % 10;

  if ( n0_left )
  {
    // Delegate the rightmost (n0 % 10)-column edge of B/C to the reference
    // kernel before running the assembly path on full 10-column panels.
    // A[:, ::]
    // B[::, n0_mker*10:n0]
    // C[: , n0_mker*10:n0]
    double *ai = a;
    double *bi = b + n0_mker * 10 * cs_b0;
    double *ci = c + n0_mker * 10 * cs_c0;
    bli_dgemmsup_c_armsve_ref2
    (
      conja, conjb,
      m0, n0_left, k0,
      alpha,
      ai, rs_a0, cs_a0,
      bi, rs_b0, cs_b0,
      beta,
      ci, rs_c0, cs_c0,
      data,
      cntx
    );
  }
  // Return if it's a pure edge case.
  if ( !n0_mker )
    return;

  // Determine VL: vlen2 = number of doubles held by TWO SVE vectors.
  uint64_t vlen2;
  __asm__ (
    " mov  x0, xzr          \n\t"
    " incd x0, ALL, MUL #2  \n\t"
    " mov  %[vlen2], x0     \n\t"
    : [vlen2] "=r" (vlen2)
    :
    : "x0"
  );

  uint64_t rs_c   = rs_c0;
  uint64_t cs_c   = cs_c0;
  // uint64_t rs_a = 1;
  uint64_t cs_a   = cs_a0;
  uint64_t rs_b   = rs_b0;
  uint64_t cs_b   = cs_b0;
  uint64_t k_mker = k0 / 4;
  uint64_t k_left = k0 % 4;
  uint64_t n_mker = n0_mker;

  dim_t m0_mker = m0 / vlen2;
  dim_t m0_left = m0 % vlen2;
  if ( m0_left )
  {
    // Edge case on A side can be handled with one more (predicated) loop.
    m0_mker++;
  } else
    m0_left = vlen2;
  // uint64_t ps_a = bli_auxinfo_ps_a( data );
  uint64_t ps_b = bli_auxinfo_ps_b( data );

  for ( dim_t im0_mker = 0; im0_mker < m0_mker; ++im0_mker )
  {
    uint64_t m_curr = vlen2;
    if ( im0_mker == m0_mker - 1 )
    {
      // Last m-loop. Maybe unnecessary.
      m_curr = m0_left;
    }
    double *ai = a + im0_mker * vlen2 * rs_a0;
    double *bi = b;
    double *ci = c + im0_mker * vlen2 * rs_c0;

    void* a_next = bli_auxinfo_next_a( data );
    void* b_next = bli_auxinfo_next_b( data );
    __asm__ volatile (
" ldr             x0, %[bi]                       \n\t"
" ldr             x1, %[rs_b]                     \n\t" // Row-skip of B.
" ldr             x2, %[cs_b]                     \n\t" // Column-skip of B (element skip of B[l, :]).
" ldr             x3, %[ps_b]                     \n\t" // Panel-skip (10*k) of B.
" ldr             x4, %[cs_a]                     \n\t" // Column-Skip of A.
"                                                 \n\t" // Element skip of A[:, l] is guaranteed to be 1.
" ldr             x5, %[ci]                       \n\t"
" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
" ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
#ifdef _A64FX
"                                                 \n\t" // A64FX: tag addresses with sector-cache hints in the
"                                                 \n\t" // top byte — presumably steering the L1/L2 sectors;
"                                                 \n\t" // TODO confirm against the A64FX microarch manual.
" mov             x16, 0x1                        \n\t" // Tag C address.
" lsl             x16, x16, #56                   \n\t"
" orr             x5, x5, x16                     \n\t"
" mov             x16, 0x2                        \n\t" // Tag B address.
" lsl             x16, x16, #56                   \n\t"
" orr             x0, x0, x16                     \n\t"
#endif
"                                                 \n\t"
" mov             x8, #8                          \n\t" // Multiply some address skips by sizeof(double).
" madd            x1, x8, x1, xzr                 \n\t" // rs_b
" madd            x2, x8, x2, xzr                 \n\t" // cs_b
" madd            x3, x8, x3, xzr                 \n\t" // ps_b
" madd            x4, x8, x4, xzr                 \n\t" // cs_a
" madd            x7, x8, x7, xzr                 \n\t" // cs_c
" mov             x8, #4                          \n\t"
" madd            x15, x8, x4, xzr                \n\t" // Logical K=4 microkernel skip for A.
"                                                 \n\t"
#ifdef _A64FX
" mov             x16, 0x20                       \n\t" // Higher 6bit for Control#2:
" lsl             x16, x16, #58                   \n\t" // Valid|Strong|Strong|NoAlloc|Load|Strong
" orr             x16, x16, x4                    \n\t" // Stride.
" msr             S3_3_C11_C6_2, x16              \n\t" // Write system register.
#endif
"                                                 \n\t"
" ldr             x8, %[m_curr]                   \n\t" // Size of first dimension.
" mov             x9, xzr                         \n\t"
" incd            x9                              \n\t"
" ptrue           p0.d                            \n\t"
" whilelo         p1.d, xzr, x8                   \n\t" // p1/p2 predicate the low/high A vector of the strip.
" whilelo         p2.d, x9, x8                    \n\t"
"                                                 \n\t"
" ldr             x8, %[n_mker]                   \n\t" // Number of N-loops.
"                                                 \n\t"
" ldr             x20, %[ai]                      \n\t" // Parameters to be reloaded
" ldr             x21, %[k_mker]                  \n\t" //  within each millikernel loop.
" ldr             x22, %[k_left]                  \n\t"
" ldr             x23, %[alpha]                   \n\t"
" ldr             x24, %[beta]                    \n\t"
" ldr             x25, %[a_next]                  \n\t"
" ldr             x26, %[b_next]                  \n\t"
" ldr             x23, [x23]                      \n\t" // Directly load alpha and beta.
" ldr             x24, [x24]                      \n\t"
"                                                 \n\t"
" MILLIKER_MLOOP:                                 \n\t"
"                                                 \n\t"
" mov             x11, x0                         \n\t" // B's address.
// " ldr          x10, %[ai]                      \n\t" // A's address.
" mov             x10, x20                        \n\t"
// " ldr          x12, %[k_mker]                  \n\t"
" mov             x12, x21                        \n\t"
// " ldr          x13, %[k_left]                  \n\t"
" mov             x13, x22                        \n\t"
#ifdef _A64FX
" mov             x16, 0x3                        \n\t" // Tag A address.
" lsl             x16, x16, #56                   \n\t"
" orr             x10, x10, x16                   \n\t"
" mov             x16, 0xa                        \n\t" // Control#2 for A address.
" lsl             x16, x16, #60                   \n\t"
" orr             x10, x10, x16                   \n\t"
#endif
"                                                 \n\t"
" cmp             x12, #0                         \n\t" // Don't preload if no microkernel there.
" b.eq            END_CCOL_PRFM                   \n\t"
"                                                 \n\t"
" mov             x14, x11                        \n\t"
" ld1rd           z20.d, p0/z, [x14]              \n\t" // Load 8/10 of first B row.
" add             x14, x14, x2                    \n\t"
" ld1rd           z21.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z22.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z23.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z24.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z25.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z26.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z27.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" prfm            PLDL1KEEP, [x14]                \n\t" // And prefetch the 2/10 left.
" add             x14, x14, x2                    \n\t"
" prfm            PLDL1KEEP, [x14]                \n\t"
" sub             x14, x14, x2                    \n\t" // Restore x14 to load edge.
"                                                 \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p1,p2,x10)
" add             x16, x10, x4                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t" // Prefetch 3/4 of A.
" add             x16, x16, x4                    \n\t" // Accumulate: next column (was a repeated x10+x4).
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x4                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
"                                                 \n\t"
" CCOL_PRFM:                                      \n\t"
" cmp             x6, #1                          \n\t"
" b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
" mov             x16, x5                         \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" add             x16, x16, x7                    \n\t"
" prfm            PLDL1STRM, [x16]                \n\t"
" END_CCOL_PRFM:                                  \n\t"
"                                                 \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
"                                                 \n\t"
" cmp             x12, #0                         \n\t" // If no 4-microkernel can be applied
" b.eq            K_LEFT_LOOP                     \n\t"
"                                                 \n\t"
" K_MKER_LOOP:                                    \n\t"
"                                                 \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_G_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
"                                                 \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_G_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
"                                                 \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_G_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
"                                                 \n\t"
" subs            x12, x12, #1                    \n\t" // Decrease counter before final replica.
" b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
"                                                 \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_G_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
" b               K_MKER_LOOP                     \n\t"
"                                                 \n\t"
" FIN_MKER_LOOP:                                  \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
" add             x10, x10, x4                    \n\t" // Forward A to fill the blank.
"                                                 \n\t"
" K_LEFT_LOOP:                                    \n\t"
" cmp             x13, #0                         \n\t" // End of execution.
" b.eq            WRITE_MEM_PREP                  \n\t"
"                                                 \n\t"
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p1,p2,x10)
" mov             x14, x11                        \n\t"
" ld1rd           z20.d, p0/z, [x14]              \n\t" // Load 10/10 B.
" add             x14, x14, x2                    \n\t"
" ld1rd           z21.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z22.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z23.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z24.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z25.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z26.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z27.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z28.d, p0/z, [x14]              \n\t"
" add             x14, x14, x2                    \n\t"
" ld1rd           z29.d, p0/z, [x14]              \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add             x10, x10, x4                    \n\t" // Forward A.
" add             x11, x11, x1                    \n\t" // Forward B.
" sub             x13, x13, #1                    \n\t"
" b               K_LEFT_LOOP                     \n\t" // Next column / row.
"                                                 \n\t"
" WRITE_MEM_PREP:                                 \n\t"
"                                                 \n\t"
// " ldr          x10, %[ai]                      \n\t"
" mov             x10, x20                        \n\t"
" add             x11, x0, x3                     \n\t"
" dup             z30.d, x23                      \n\t" // Broadcast alpha & beta into vectors.
" dup             z31.d, x24                      \n\t"
"                                                 \n\t"
" cmp             x8, #1                          \n\t"
" b.eq            PREFETCH_ABNEXT                 \n\t"
" prfm            PLDL1STRM, [x10]                \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" add             x11, x11, x2                    \n\t"
" prfm            PLDL1KEEP, [x11]                \n\t"
" b               WRITE_MEM                       \n\t"
"                                                 \n\t"
" PREFETCH_ABNEXT:                                \n\t"
// " ldr          x1, %[a_next]                   \n\t" // Final Millikernel loop, x1 and x2 not needed.
" mov             x1, x25                         \n\t"
// " ldr          x2, %[b_next]                   \n\t"
" mov             x2, x26                         \n\t"
" prfm            PLDL2KEEP, [x1]                 \n\t"
" prfm            PLDL2KEEP, [x1, 256*1]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*2]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*3]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*4]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*5]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*6]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*7]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*8]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*9]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*10]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*11]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*12]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*13]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*14]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*15]         \n\t"
" prfm            PLDL2KEEP, [x2]                 \n\t"
" prfm            PLDL2KEEP, [x2, 256*1]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*2]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*3]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*4]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*5]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*6]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*7]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*8]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*9]          \n\t"
"                                                 \n\t"
" WRITE_MEM:                                      \n\t"
"                                                 \n\t"
" fmov            d28, #1.0                       \n\t" // Compare alpha's bit pattern against 1.0's
" fmov            x16, d28                        \n\t" //  to skip the scaling when alpha == 1.
" cmp             x16, x23                        \n\t"
" b.eq            UNIT_ALPHA                      \n\t"
"                                                 \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
"                                                 \n\t"
" UNIT_ALPHA:                                     \n\t"
" mov             x9, x5                          \n\t" // C address for loading.
"                                                 \n\t" // C address for storing is x5 itself.
" cmp             x6, #1                          \n\t"
" b.ne            WRITE_MEM_G                     \n\t"
"                                                 \n\t"
" WRITE_MEM_C:                                    \n\t" // Available scratch: Z[20-30].
"                                                 \n\t" // Here used scratch: Z[20-29].
" mov             x13, xzr                        \n\t" // C-column's physical 1-vector skip.
" incb            x13                             \n\t"
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
"                                                 \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x5,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x5,x7)
" b               END_WRITE_MEM                   \n\t"
"                                                 \n\t"
" WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
"                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov             x12, xzr                        \n\t"
" incb            x12                             \n\t"
" madd            x13, x12, x6, xzr               \n\t" // C-column's logical 1-vector skip.
" index           z30.d, xzr, x6                  \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
"                                                 \n\t"
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x5,x7,x13,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x5,x7,x13,x16)
"                                                 \n\t"
" END_WRITE_MEM:                                  \n\t"
" subs            x8, x8, #1                      \n\t"
" b.eq            END_EXEC                        \n\t"
"                                                 \n\t" // Address of C already forwarded to next column.
" add             x0, x0, x3                      \n\t" // Forward B's base address to the next logic panel.
" b               MILLIKER_MLOOP                  \n\t"
"                                                 \n\t"
" END_ERROR:                                      \n\t"
" mov             x0, #1                          \n\t" // Return error.
" END_EXEC:                                       \n\t"
" mov             x0, #0                          \n\t" // Return normal.
:
: [bi]     "m" (bi),
  [rs_b]   "m" (rs_b),
  [cs_b]   "m" (cs_b),
  [ps_b]   "m" (ps_b),
  [cs_a]   "m" (cs_a),
  [ci]     "m" (ci),
  [rs_c]   "m" (rs_c),
  [cs_c]   "m" (cs_c),
  [m_curr] "m" (m_curr),
  [n_mker] "m" (n_mker),
  [ai]     "m" (ai),
  [k_mker] "m" (k_mker),
  [k_left] "m" (k_left),
  [alpha]  "m" (alpha),
  [beta]   "m" (beta),
  [a_next] "m" (a_next),
  [b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
  "x9","x10","x11","x12","x13","x14","x15","x16","x17",
  "x20","x21","x22","x23","x24","x25","x26",
  "z0","z1","z2","z3","z4","z5","z6","z7",
  "z8","z9","z10","z11","z12","z13","z14","z15",
  "z16","z17","z18","z19",
  "z20","z21","z22","z23",
  "z24","z25","z26","z27",
  "z28","z29","z30","z31"
    );
  }
}
// 10x2v row-major sup kernel, realized as a transposition of the 2vx10
// column-major kernel: computing C^T = B^T * A^T with all operand roles,
// conjugations, and row/column strides swapped yields the same C.
void bli_dgemmsup_rv_armsve_10x2v_unindexed
     (
       conj_t              conjat,
       conj_t              conjbt,
       dim_t               m0t,
       dim_t               n0t,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict at, inc_t rs_at0, inc_t cs_at0,
       double*    restrict bt, inc_t rs_bt0, inc_t cs_bt0,
       double*    restrict beta,
       double*    restrict ct, inc_t rs_ct0, inc_t cs_ct0,
       auxinfo_t* restrict datat,
       cntx_t*    restrict cntx
     )
{
  // Build a local auxinfo whose A/B roles mirror the caller's, since the
  // transposed invocation below presents B as "A" and A as "B".
  auxinfo_t data_tr;
  bli_auxinfo_set_ps_a( bli_auxinfo_ps_b( datat ), &data_tr );
  bli_auxinfo_set_ps_b( bli_auxinfo_ps_a( datat ), &data_tr );
  bli_auxinfo_set_next_a( bli_auxinfo_next_b( datat ), &data_tr );
  bli_auxinfo_set_next_b( bli_auxinfo_next_a( datat ), &data_tr );

  // Dispatch to the cv kernel with operands exchanged and strides
  // transposed (rs <-> cs) on every matrix.
  bli_dgemmsup_cv_armsve_2vx10_unindexed
  (
    conjbt, conjat,
    n0t, m0t, k0,
    alpha,
    bt, cs_bt0, rs_bt0,
    at, cs_at0, rs_at0,
    beta,
    ct, cs_ct0, rs_ct0,
    &data_tr,
    cntx
  );
}

View File

@@ -0,0 +1,412 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include <assert.h>
// Double-precision composite instructions.
#include "../armsve_asm_macros_double.h"
// 2vx10 microkernels.
#include "../armsve_asm_2vx10.h"
// Prototype reference kernel.
GEMMSUP_KER_PROT( double, d, gemmsup_r_armsve_ref2 )
// 2vx10 double-precision GEMMSUP millikernel for the r*r (row-stored)
// case on Arm SVE.  B must be row-contiguous (cs_b0 == 1); A columns are
// gathered via an SVE index vector since rs_a is arbitrary.  The
// n-dimension edge (n0 % 10) is delegated to the reference kernel; full
// 10-column panels are handled by the inline-assembly millikernel, which
// loops over the M dimension in strips of two SVE vectors of rows.
// NOTE(review): the asm relies on exact register/predicate sequencing
// (p1/p2 are overwritten only on the final M iteration; z30/z31 double as
// alpha/beta and as scratch), so only comments are added here.
void __attribute__ ((optimize(0))) bli_dgemmsup_rv_armsve_2vx10_unindexed
     (
       conj_t           conja,
       conj_t           conjb,
       dim_t            m0,
       dim_t            n0,
       dim_t            k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
  // One-shot debug breadcrumb identifying which sup kernel was selected.
  static int called = 0;
  if ( !called )
  {
    fprintf(stderr, "rv called.\n");
    called = 1;
  }
  // r*r requires B to be stored in rows.
  assert(cs_b0 == 1);

  dim_t n0_mker = n0 / 10;
  dim_t n0_left = n0 % 10;

  if ( n0_left )
  {
    // Delegate the rightmost (n0 % 10)-column edge of B/C to the
    // reference kernel before running the assembly path.
    // A[:, ::]
    // B[::, n0_mker*10:n0]
    // C[: , n0_mker*10:n0]
    double *ai = a;
    double *bi = b + n0_mker * 10 * cs_b0;
    double *ci = c + n0_mker * 10 * cs_c0;
    bli_dgemmsup_r_armsve_ref2
    (
      conja, conjb,
      m0, n0_left, k0,
      alpha,
      ai, rs_a0, cs_a0,
      bi, rs_b0, cs_b0,
      beta,
      ci, rs_c0, cs_c0,
      data,
      cntx
    );
  }
  // Return if it's a pure edge case.
  if ( !n0_mker )
    return;

  // Determine VL: vlen2 = number of doubles held by TWO SVE vectors.
  uint64_t vlen2;
  __asm__ (
    " mov  x0, xzr          \n\t"
    " incd x0, ALL, MUL #2  \n\t"
    " mov  %[vlen2], x0     \n\t"
    : [vlen2] "=r" (vlen2)
    :
    : "x0"
  );

  uint64_t rs_c   = rs_c0;
  uint64_t cs_c   = cs_c0;
  uint64_t rs_a   = rs_a0;
  uint64_t cs_a   = cs_a0;
  uint64_t rs_b   = rs_b0;
  // uint64_t cs_b = 1;
  uint64_t k_mker = k0 / 4;
  uint64_t k_left = k0 % 4;
  // Number of M strips; a partial strip is folded in as one extra
  // (predicated) iteration, with m_left holding its true height.
  uint64_t m_mker = m0 / vlen2;
  uint64_t m_left = m0 % vlen2;
  if ( m_left )
  {
    // Edge case on A side can be handled with one more (predicated) loop.
    m_mker++;
  } else
    m_left = vlen2;
  uint64_t ps_a = bli_auxinfo_ps_a( data );
  // uint64_t ps_b = bli_auxinfo_ps_b( data );

  for ( dim_t in0_mker = 0; in0_mker < n0_mker; ++in0_mker )
  {
    double *ai = a;
    double *bi = b + in0_mker * 10 * cs_b0;
    double *ci = c + in0_mker * 10 * cs_c0;

    void* a_next = bli_auxinfo_next_a( data );
    void* b_next = bli_auxinfo_next_b( data );
    __asm__ volatile (
" ldr             x0, %[ai]                       \n\t"
" ldr             x1, %[rs_a]                     \n\t" // Row-skip of A (element skip of A[:, l]).
" ldr             x2, %[cs_a]                     \n\t" // Column-skip of A.
" ldr             x3, %[ps_a]                     \n\t" // Panel-skip (vlen2*k) of A.
" ldr             x4, %[rs_b]                     \n\t" // Row-Skip of B.
"                                                 \n\t" // Element skip of B[l, :] is guaranteed to be 1.
" ldr             x5, %[ci]                       \n\t"
" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
" ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
#ifdef _A64FX
"                                                 \n\t" // A64FX: top-byte sector-cache tags —
"                                                 \n\t" // presumably cache-steering hints; TODO confirm.
" mov             x16, 0x1                        \n\t" // Tag C address.
" lsl             x16, x16, #56                   \n\t"
" orr             x5, x5, x16                     \n\t"
" mov             x16, 0x2                        \n\t" // Tag A address.
" lsl             x16, x16, #56                   \n\t"
" orr             x0, x0, x16                     \n\t"
#endif
"                                                 \n\t"
" mov             x8, #8                          \n\t" // Multiply some address skips by sizeof(double).
" madd            x2, x8, x2, xzr                 \n\t" // cs_a
" madd            x3, x8, x3, xzr                 \n\t" // ps_a
" madd            x4, x8, x4, xzr                 \n\t" // rs_b
" madd            x7, x8, x7, xzr                 \n\t" // cs_c
" mov             x8, xzr                         \n\t"
" incb            x8                              \n\t"
" madd            x14, x8, x1, xzr                \n\t" // A-column's logical 1-vector skip.
" mov             x8, #4                          \n\t"
" madd            x15, x8, x2, xzr                \n\t" // Logical K=4 microkernel skip for A.
// " mov          x8, #4                          \n\t"
// " madd         x17, x8, x4, xzr                \n\t" // Logical K=4 microkernel skip for B.
"                                                 \n\t"
" ldr             x8, %[m_mker]                   \n\t" // Number of M-loops.
"                                                 \n\t" // All-true predicates; p1/p2 are narrowed only
"                                                 \n\t" //  on the final (partial) M iteration below.
" ptrue           p0.d                            \n\t"
" ptrue           p1.d                            \n\t"
" ptrue           p2.d                            \n\t"
"                                                 \n\t"
" MILLIKER_MLOOP:                                 \n\t"
"                                                 \n\t"
" cmp             x8, #1                          \n\t"
" b.ne            UKER_BEGIN                      \n\t"
"                                                 \n\t"
" ldr             x10, %[m_left]                  \n\t" // Final (incomplete) millikernel loop.
" mov             x11, xzr                        \n\t"
" incd            x11                             \n\t"
" whilelo         p1.d, xzr, x10                  \n\t" // Overwrite p1/p2.
" whilelo         p2.d, x11, x10                  \n\t"
"                                                 \n\t"
" UKER_BEGIN:                                     \n\t"
" mov             x10, x0                         \n\t" // A's address.
" ldr             x11, %[bi]                      \n\t" // B's address.
" ldr             x12, %[k_mker]                  \n\t"
" ldr             x13, %[k_left]                  \n\t"
#ifdef _A64FX
" mov             x16, 0x3                        \n\t" // Tag B address.
" lsl             x16, x16, #56                   \n\t"
" orr             x11, x11, x16                   \n\t"
#endif
"                                                 \n\t"
" mov             x16, x11                        \n\t" // Prefetch first kernel of B.
" prfm            PLDL1KEEP, [x16]                \n\t"
" add             x16, x16, x4                    \n\t"
" prfm            PLDL1KEEP, [x16]                \n\t"
" add             x16, x16, x4                    \n\t"
" prfm            PLDL1KEEP, [x16]                \n\t"
" add             x16, x16, x4                    \n\t"
" prfm            PLDL1KEEP, [x16]                \n\t"
"                                                 \n\t"
"                                                 \n\t" // B rows are contiguous: broadcast-load 8 of the
"                                                 \n\t" //  10 elements with immediate offsets.
" ld1rd           z20.d, p0/z, [x11]              \n\t" // (Partial) first B row.
" ld1rd           z21.d, p0/z, [x11, #8]          \n\t"
" ld1rd           z22.d, p0/z, [x11, #16]         \n\t"
" ld1rd           z23.d, p0/z, [x11, #24]         \n\t"
" ld1rd           z24.d, p0/z, [x11, #32]         \n\t"
" ld1rd           z25.d, p0/z, [x11, #40]         \n\t"
" ld1rd           z26.d, p0/z, [x11, #48]         \n\t"
" ld1rd           z27.d, p0/z, [x11, #56]         \n\t"
"                                                 \n\t"
" index           z29.d, xzr, x1                  \n\t" // First A column.
"                                                 \n\t" // Skips passed to index is not multiplied by 8.
GEMM_ACOL_GATHER_LOAD(z28,z29,z29,p1,p2,x10,x14,x16)
"                                                 \n\t"
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
"                                                 \n\t"
" cmp             x12, #0                         \n\t" // If no 4-microkernel can be applied
" b.eq            K_LEFT_LOOP                     \n\t"
"                                                 \n\t"
" K_MKER_LOOP:                                    \n\t" // Unroll the 4-loop.
"                                                 \n\t"
" index           z31.d, xzr, x1                  \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
"                                                 \n\t"
" index           z29.d, xzr, x1                  \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
"                                                 \n\t"
" index           z31.d, xzr, x1                  \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
"                                                 \n\t"
" subs            x12, x12, #1                    \n\t" // Decrease counter before final replica.
" b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
"                                                 \n\t"
" index           z29.d, xzr, x1                  \n\t"
GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
" b               K_MKER_LOOP                     \n\t"
"                                                 \n\t"
" FIN_MKER_LOOP:                                  \n\t"
GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
" add             x10, x10, x2                    \n\t" // Forward A to fill the blank.
"                                                 \n\t"
" K_LEFT_LOOP:                                    \n\t"
" cmp             x13, #0                         \n\t"
" b.eq            WRITE_MEM_PREP                  \n\t"
"                                                 \n\t"
" index           z31.d, xzr, x1                  \n\t"
GEMM_ACOL_GATHER_LOAD(z30,z31,z31,p1,p2,x10,x14,x16)
" ld1rd           z20.d, p0/z, [x11]              \n\t"
" ld1rd           z21.d, p0/z, [x11, #8]          \n\t"
" ld1rd           z22.d, p0/z, [x11, #16]         \n\t"
" ld1rd           z23.d, p0/z, [x11, #24]         \n\t"
" ld1rd           z24.d, p0/z, [x11, #32]         \n\t"
" ld1rd           z25.d, p0/z, [x11, #40]         \n\t"
" ld1rd           z26.d, p0/z, [x11, #48]         \n\t"
" ld1rd           z27.d, p0/z, [x11, #56]         \n\t"
" ld1rd           z28.d, p0/z, [x11, #64]         \n\t"
" ld1rd           z29.d, p0/z, [x11, #72]         \n\t"
GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
" add             x10, x10, x2                    \n\t" // Forward A.
" add             x11, x11, x4                    \n\t" // Forward B.
" sub             x13, x13, #1                    \n\t"
" b               K_LEFT_LOOP                     \n\t" // Next column / row.
"                                                 \n\t"
" WRITE_MEM_PREP:                                 \n\t"
"                                                 \n\t"
" ldr             x11, %[bi]                      \n\t"
" ldr             x12, %[alpha]                   \n\t" // Load alpha & beta.
" ldr             x13, %[beta]                    \n\t"
" ld1rd           z30.d, p0/z, [x12]              \n\t"
" ld1rd           z31.d, p0/z, [x13]              \n\t"
" ldr             x12, [x12]                      \n\t" // Alpha's raw bits, for the ==1.0 test below.
"                                                 \n\t"
" cmp             x8, #1                          \n\t"
" b.eq            PREFETCH_ABNEXT                 \n\t"
" prfm            PLDL2STRM, [x11]                \n\t"
" b               WRITE_MEM                       \n\t"
"                                                 \n\t"
" PREFETCH_ABNEXT:                                \n\t"
" ldr             x1, %[a_next]                   \n\t" // Final Millikernel loop, x1 and x2 not needed.
" ldr             x2, %[b_next]                   \n\t"
" prfm            PLDL2KEEP, [x1]                 \n\t"
" prfm            PLDL2KEEP, [x1, 256*1]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*2]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*3]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*4]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*5]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*6]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*7]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*8]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*9]          \n\t"
" prfm            PLDL2KEEP, [x1, 256*10]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*11]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*12]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*13]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*14]         \n\t"
" prfm            PLDL2KEEP, [x1, 256*15]         \n\t"
" prfm            PLDL2KEEP, [x2]                 \n\t"
" prfm            PLDL2KEEP, [x2, 256*1]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*2]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*3]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*4]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*5]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*6]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*7]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*8]          \n\t"
" prfm            PLDL2KEEP, [x2, 256*9]          \n\t"
"                                                 \n\t"
" WRITE_MEM:                                      \n\t"
"                                                 \n\t"
" fmov            d28, #1.0                       \n\t" // Bit-compare alpha against 1.0 to skip scaling.
" fmov            x16, d28                        \n\t"
" cmp             x16, x12                        \n\t"
" b.eq            UNIT_ALPHA                      \n\t"
"                                                 \n\t"
SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
"                                                 \n\t"
" UNIT_ALPHA:                                     \n\t"
" mov             x9, x5                          \n\t" // C address for loading.
" mov             x10, x5                         \n\t" // C address for storing.
" cmp             x6, #1                          \n\t"
" b.ne            WRITE_MEM_G                     \n\t"
"                                                 \n\t"
" WRITE_MEM_C:                                    \n\t" // Available scratch: Z[20-30].
"                                                 \n\t" // Here used scratch: Z[20-29].
" mov             x13, xzr                        \n\t" // C-column's physical 1-vector skip.
" incb            x13                             \n\t"
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
"                                                 \n\t"
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x10,x7)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x10,x7)
" b               END_WRITE_MEM                   \n\t"
"                                                 \n\t"
" WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
"                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
" mov             x12, xzr                        \n\t"
" incb            x12                             \n\t"
" madd            x13, x12, x6, xzr               \n\t" // C-column's logical 1-vector skip.
" index           z30.d, xzr, x6                  \n\t" // Skips passed to index is not multiplied by 8.
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
"                                                 \n\t"
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x10,x7,x13,x16)
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x10,x7,x13,x16)
"                                                 \n\t"
" END_WRITE_MEM:                                  \n\t"
" subs            x8, x8, #1                      \n\t"
" b.eq            END_EXEC                        \n\t"
"                                                 \n\t"
" add             x0, x0, x3                      \n\t" // Forward A's base address to the next logic panel.
"                                                 \n\t" // Forward C by TWO 1-vector row skips (x13), i.e.
"                                                 \n\t" //  one full 2-vector strip — matches vlen2 above.
" add             x5, x5, x13                     \n\t" // Forward C's base address to the next logic panel.
" add             x5, x5, x13                     \n\t"
" b               MILLIKER_MLOOP                  \n\t"
"                                                 \n\t"
" END_ERROR:                                      \n\t"
" mov             x0, #1                          \n\t" // Return error.
" END_EXEC:                                       \n\t"
" mov             x0, #0                          \n\t" // Return normal.
:
: [ai]     "m" (ai),
  [rs_a]   "m" (rs_a),
  [cs_a]   "m" (cs_a),
  [ps_a]   "m" (ps_a),
  [rs_b]   "m" (rs_b),
  [ci]     "m" (ci),
  [rs_c]   "m" (rs_c),
  [cs_c]   "m" (cs_c),
  [m_mker] "m" (m_mker),
  [m_left] "m" (m_left),
  [bi]     "m" (bi),
  [k_mker] "m" (k_mker),
  [k_left] "m" (k_left),
  [alpha]  "m" (alpha),
  [beta]   "m" (beta),
  [a_next] "m" (a_next),
  [b_next] "m" (b_next)
: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
  "x9","x10","x11","x12","x13","x14","x15","x16",//"x17",
  "z0","z1","z2","z3","z4","z5","z6","z7",
  "z8","z9","z10","z11","z12","z13","z14","z15",
  "z16","z17","z18","z19",
  "z20","z21","z22","z23",
  "z24","z25","z26","z27",
  "z28","z29","z30","z31"
    );
  }
}

View File

@@ -33,5 +33,13 @@
*/
// Level-3 microkernel prototypes for the Arm SVE (and fixed-256/512-bit SVE)
// kernel set.
GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 )
GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed )
GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed )
// GEMMSUP (skinny-matrix) kernels: row-stored, column-stored, and the
// 10x2v transposing wrapper around the cv kernel.
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed )
GEMMSUP_KER_PROT( double, d, gemmsup_cv_armsve_2vx10_unindexed )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed )
// Packing kernels for the fixed-vector-length variants.
PACKM_KER_PROT( double, d, packm_armsve256_asm_8xk )
PACKM_KER_PROT( double, d, packm_armsve512_asm_16xk )
PACKM_KER_PROT( double, d, packm_armsve512_asm_12xk )
PACKM_KER_PROT( double, d, packm_armsve512_asm_10xk )

View File

@@ -125,7 +125,7 @@ void bli_cpackm_haswell_asm_3xk
mov(var(kappa), rcx) // load address of kappa
vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate
vbroadcastss(mem(rcx, 8), ymm11) // load kappa_i and duplicate
vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate
// now branch on kappa == 1.0

View File

@@ -125,7 +125,7 @@ void bli_cpackm_haswell_asm_8xk
mov(var(kappa), rcx) // load address of kappa
vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate
vbroadcastss(mem(rcx, 8), ymm11) // load kappa_i and duplicate
vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate
// now branch on kappa == 1.0

View File

@@ -0,0 +1,88 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Given the current architecture of BLIS sandboxes, bli_gemmnat() is the
// entry point to any sandbox implementation.
// NOTE: This function is implemented identically to the function that it
// overrides in frame/ind/oapi/bli_l3_nat_oapi.c. This means that we are
// forgoing the option of customizing the implementations that underlie
// bli_gemm() and bli_?gemm(). Any new code defined in this sandbox
// directory, however, will be included in the BLIS library.
#include "blis.h"

// Define a macro that expands into the definition of bli_gemmnat(), the
// object-based native-gemm entry point that this sandbox overrides. The
// macro parameters select the operation name and induced-method suffix
// used by the PASTEMAC name-pasting macros below.
#undef  GENFRONT
#define GENFRONT( opname, cname, imeth ) \
\
void PASTEMAC(opname,imeth) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     ) \
{ \
\
	/* A switch to easily toggle whether we use the sandbox implementation
	   of bls_gemm() as the implementation for bli_gemm(). (This allows for
	   easy testing of bls_gemm() via the testsuite.) */ \
	if ( 1 ) \
	{ \
		bls_gemm_ex( alpha, a, b, beta, c, cntx, rntm ); \
		return; \
	} \
\
	/* NOTE: The code below is unreachable while the switch above is
	   hard-coded to 1; it preserves the conventional implementation from
	   frame/ind/oapi/bli_l3_nat_oapi.c. */ \
	bli_init_once(); \
\
	/* Obtain a valid (native) context from the gks if necessary. */ \
	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
	/* Initialize a local runtime with global settings if necessary. Note
	   that in the case that a runtime is passed in, we make a local copy. */ \
	rntm_t rntm_l; \
	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
\
	/* Invoke the operation's front end. */ \
	PASTEMAC(opname,_front) \
	( \
	  alpha, a, b, beta, c, cntx, rntm, NULL \
	); \
}

// Instantiate the macro above, defining bli_gemmnat().
GENFRONT( gemm, gemm, nat )

View File

@@ -0,0 +1,56 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SANDBOX_H
#define BLIS_SANDBOX_H

// NOTE: This header is the only header required to be present in the sandbox
// implementation directory.

// This header should contain (or #include) any definitions that must be
// folded into blis.h. Typically, it will remain empty since any header
// definitions specific to the sandbox implementation will not need to be
// made available to applications (or the framework) during compilation.

// Pull in the sandbox's local headers: the gemm-like operation's APIs and
// variants, the packing facilities for matrices A and B, and the threading
// decorator.
#include "bls_gemm.h"
#include "bls_gemm_var.h"
#include "bls_l3_packm_a.h"
#include "bls_l3_packm_b.h"
#include "bls_l3_packm_var.h"
#include "bls_l3_decor.h"

#endif

304
sandbox/gemmlike/bls_gemm.c Normal file
View File

@@ -0,0 +1,304 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// -- Define the gemm-like operation's object API ------------------------------
//
void bls_gemm
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c
     )
{
	// Basic (non-expert) interface. Defer to the expert interface, passing
	// NULL for the context and runtime so that suitable defaults are
	// acquired within bls_gemm_ex().
	bls_gemm_ex( alpha, a, b, beta, c, NULL, NULL );
}
// Expert interface for the sandbox's gemm-like operation. This function
// performs the combined roles of bli_gemmnat() (context/runtime acquisition)
// and bli_gemm_front() (parameter checking, early returns, and inducing
// transpositions) before spawning threads into bls_gemm_int().
void bls_gemm_ex
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
	bli_init_once();

	// -- bli_gemmnat() --------------------------------------------------------

	// Obtain a valid (native) context from the gks if necessary.
	// NOTE: This must be done before calling the _check() function, since
	// that function assumes the context pointer is valid.
	if ( cntx == NULL ) cntx = bli_gks_query_cntx();

	// Initialize a local runtime with global settings if necessary. Note
	// that in the case that a runtime is passed in, we make a local copy.
	rntm_t rntm_l;
	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
	else                { rntm_l = *rntm;                       rntm = &rntm_l; }

	// -- bli_gemm_front() -----------------------------------------------------

	obj_t a_local;
	obj_t b_local;
	obj_t c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
	{
		bli_gemm_check( alpha, a, b, beta, c, cntx );
	}

	// If C has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( c ) )
	{
		return;
	}

	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
	// and return early.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
	     bli_obj_has_zero_dim( a ) ||
	     bli_obj_has_zero_dim( b ) )
	{
		bli_scalm( beta, c );
		return;
	}

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );
	bli_obj_alias_to( c, &c_local );

	// Induce a transposition of A if it has its transposition property set.
	// Then clear the transposition bit in the object.
	if ( bli_obj_has_trans( &a_local ) )
	{
		bli_obj_induce_trans( &a_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
	}

	// Induce a transposition of B if it has its transposition property set.
	// Then clear the transposition bit in the object.
	if ( bli_obj_has_trans( &b_local ) )
	{
		bli_obj_induce_trans( &b_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &b_local );
	}

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
	{
		bli_obj_swap( &a_local, &b_local );

		bli_obj_induce_trans( &a_local );
		bli_obj_induce_trans( &b_local );
		bli_obj_induce_trans( &c_local );

		// NOTE: This is probably not needed within the sandbox.
		// We must also swap the pack schemas, which were set by bli_gemm_md()
		// or the inlined code above.
		//bli_obj_swap_pack_schemas( &a_local, &b_local );
	}

	// Parse and interpret the contents of the rntm_t object to properly
	// set the ways of parallelism for each loop, and then make any
	// additional modifications necessary for the current operation.
	bli_rntm_set_ways_for_op
	(
	  BLIS_GEMM,
	  BLIS_LEFT, // ignored for gemm/hemm/symm
	  bli_obj_length( &c_local ),
	  bli_obj_width( &c_local ),
	  bli_obj_width( &a_local ),
	  rntm
	);

	// Spawn threads (if applicable), where bls_gemm_int() is the thread entry
	// point function for each thread. This also begins the process of creating
	// the thrinfo_t tree, which contains thread communicators.
	bls_l3_thread_decorator
	(
	  bls_gemm_int,
	  BLIS_GEMM, // operation family id
	  alpha,
	  &a_local,
	  &b_local,
	  beta,
	  &c_local,
	  cntx,
	  rntm
	);
}
//
// -- Define the gemm-like operation's thread entry point ----------------------
//
// Thread entry point for the gemm-like operation. Each thread spawned by
// bls_l3_thread_decorator() (see bls_gemm_ex()) begins execution here.
void bls_gemm_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
	// In this function, we choose the gemm implementation that is executed
	// on each thread. The choice is made at compile time via the #if below.

#if 1

	// Call the block-panel algorithm that calls the kernel directly, which
	// exposes edge-case handling.
	bls_gemm_bp_var1
	(
	  alpha,
	  a,
	  b,
	  beta,
	  c,
	  cntx,
	  rntm,
	  thread
	);

#else

	// Call the block-panel algorithm that calls the kernel indirectly via a
	// wrapper function, which hides edge-case handling.
	bls_gemm_bp_var2
	(
	  alpha,
	  a,
	  b,
	  beta,
	  c,
	  cntx,
	  rntm,
	  thread
	);

#endif
}
//
// -- Define the gemm-like operation's typed API -------------------------------
//
// Define a macro that expands into one typed (BLAS-like) wrapper function,
// bls_?gemm(), per datatype. Each wrapper packages its scalar/matrix
// arguments into bufferless obj_t objects and calls the object API above.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c \
     ) \
{ \
	bli_init_once(); \
\
	/* Determine the datatype (e.g. BLIS_FLOAT, BLIS_DOUBLE, etc.) based on
	   the macro parameter 'ch' (e.g. s, d, etc). */ \
	const num_t dt = PASTEMAC(ch,type); \
\
	obj_t alphao, ao, bo, betao, co; \
\
	dim_t m_a, n_a; \
	dim_t m_b, n_b; \
\
	/* Adjust the dimensions of matrices A and B according to the transa and
	   transb parameters. */ \
	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
	bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \
\
	/* Create bufferless scalar objects and attach the provided scalar pointers
	   to those scalar objects. */ \
	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
	bli_obj_create_1x1_with_attached_buffer( dt, beta,  &betao ); \
\
	/* Create bufferless matrix objects and attach the provided matrix pointers
	   to those matrix objects. */ \
	bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
	bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
	bli_obj_create_with_attached_buffer( dt, m,   n,   c, rs_c, cs_c, &co ); \
\
	/* Set the transposition/conjugation properties of the objects for matrices
	   A and B. */ \
	bli_obj_set_conjtrans( transa, &ao ); \
	bli_obj_set_conjtrans( transb, &bo ); \
\
	/* Call the object interface. */ \
	PASTECH(bls_,opname) \
	( \
	  &alphao, \
	  &ao, \
	  &bo, \
	  &betao, \
	  &co \
	); \
}

// Instantiate the macro above for each of the four standard datatypes,
// defining bls_sgemm(), bls_dgemm(), bls_cgemm(), and bls_zgemm().
//INSERT_GENTFUNC_BASIC0( gemm )
GENTFUNC( float,    s, gemm )
GENTFUNC( double,   d, gemm )
GENTFUNC( scomplex, c, gemm )
GENTFUNC( dcomplex, z, gemm )

101
sandbox/gemmlike/bls_gemm.h Normal file
View File

@@ -0,0 +1,101 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// -- Prototype the gemm-like operation's object API ---------------------------
//

// Basic (non-expert) interface; the context and runtime are obtained
// internally.
void bls_gemm
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c
     );

// Expert interface; the caller may supply a context and/or runtime, or pass
// NULL for either to request defaults.
void bls_gemm_ex
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     );

//
// -- Prototype the gemm-like operation's thread entry point -------------------
//

// Executed by each thread spawned on behalf of bls_gemm_ex().
void bls_gemm_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

//
// -- Prototype the gemm-like operation's typed API ----------------------------
//

// Define a macro that expands into one typed prototype, bls_?gemm(), per
// datatype character (s, d, c, z).
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c \
     );

//INSERT_GENTPROT_BASIC0( gemm )
GENTPROT( float,    s, gemm )
GENTPROT( double,   d, gemm )
GENTPROT( scomplex, c, gemm )
GENTPROT( dcomplex, z, gemm )

View File

@@ -0,0 +1,521 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"

// Use the name gemm_fp for the function pointer type defined below.
#define FUNCPTR_T gemm_fp

// The signature shared by the typed (per-datatype) variant functions defined
// later in this file; the object-based wrapper selects one of them at
// runtime based on the datatype of C.
typedef void (*FUNCPTR_T)
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m,
       dim_t               n,
       dim_t               k,
       void*      restrict alpha,
       void*      restrict a, inc_t rs_a, inc_t cs_a,
       void*      restrict b, inc_t rs_b, inc_t cs_b,
       void*      restrict beta,
       void*      restrict c, inc_t rs_c, inc_t cs_c,
       cntx_t*    restrict cntx,
       rntm_t*    restrict rntm,
       thrinfo_t* restrict thread
     );
//
// -- gemm-like block-panel algorithm (object interface) -----------------------
//
// Define a function pointer array named ftypes whose elements hold the
// addresses of the typed functions defined below, bls_?gemm_bp_var1().
static FUNCPTR_T GENARRAY_PREF(ftypes,bls_,gemm_bp_var1);

void bls_gemm_bp_var1
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
	// Query the storage datatype of C and the conjugation statuses of A
	// and B.
	const num_t  dt    = bli_obj_dt( c );
	const conj_t conja = bli_obj_conj_status( a );
	const conj_t conjb = bli_obj_conj_status( b );

	// Query the operation dimensions: C is m x n, and A contributes the
	// k dimension.
	const dim_t  m = bli_obj_length( c );
	const dim_t  n = bli_obj_width( c );
	const dim_t  k = bli_obj_width( a );

	// Query the buffer address of each matrix operand.
	void* restrict buf_a = bli_obj_buffer_at_off( a );
	void* restrict buf_b = bli_obj_buffer_at_off( b );
	void* restrict buf_c = bli_obj_buffer_at_off( c );

	// Query the row and column strides of each matrix operand.
	const inc_t rs_a = bli_obj_row_stride( a );
	const inc_t cs_a = bli_obj_col_stride( a );
	const inc_t rs_b = bli_obj_row_stride( b );
	const inc_t cs_b = bli_obj_col_stride( b );
	const inc_t rs_c = bli_obj_row_stride( c );
	const inc_t cs_c = bli_obj_col_stride( c );

	// Query the buffer addresses of the scalar objects alpha and beta.
	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt, beta );

	// Select the typed variant function that corresponds to the datatype
	// of C, then invoke it with the values gathered above.
	FUNCPTR_T fp = ftypes[ dt ];

	fp
	(
	  conja, conjb,
	  m, n, k,
	  buf_alpha,
	  buf_a, rs_a, cs_a,
	  buf_b, rs_b, cs_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
//
// -- gemm-like block-panel algorithm (typed interface) ------------------------
//
// Define a macro that expands into one typed implementation of the
// block-panel gemm algorithm per datatype (see the GENTFUNC invocations at
// the bottom of this file). This variant calls the microkernel directly and
// therefore handles m/n edge cases itself via a temporary microtile buffer.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       void*      restrict alpha, \
       void*      restrict a, inc_t rs_a, inc_t cs_a, \
       void*      restrict b, inc_t rs_b, inc_t cs_b, \
       void*      restrict beta, \
       void*      restrict c, inc_t rs_c, inc_t cs_c, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       thrinfo_t* restrict thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Query the context for various blocksizes. */ \
	const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
	const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
	const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
	const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
	const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
	/* Query the context for the microkernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	   gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are chosen according to the microkernel's storage
	   preference: if the microkernel prefers contiguous columns, ct will
	   be column-stored, and row-stored otherwise. */ \
	ctype       ct[ BLIS_STACK_BUF_MAX_SIZE \
	                / sizeof( ctype ) ] \
	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool  col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct    = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct    = ( col_pref ? MR : 1 ); \
\
	/* Compute partitioning step values for each matrix of each loop. */ \
	const inc_t jcstep_c = cs_c; \
	const inc_t jcstep_b = cs_b; \
\
	const inc_t pcstep_a = cs_a; \
	const inc_t pcstep_b = rs_b; \
\
	const inc_t icstep_c = rs_c; \
	const inc_t icstep_a = rs_a; \
\
	const inc_t jrstep_c = cs_c * NR; \
\
	const inc_t irstep_c = rs_c * MR; \
\
	ctype* restrict a_00       = a; \
	ctype* restrict b_00       = b; \
	ctype* restrict c_00       = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
\
	/* Make local copies of the scalars to prevent any unnecessary sharing of
	   cache lines between the cores' caches. */ \
	ctype           alpha_local = *alpha_cast; \
	ctype           beta_local  = *beta_cast; \
	ctype           one_local   = *PASTEMAC(ch,1); \
	ctype           zero_local  = *PASTEMAC(ch,0); \
\
	auxinfo_t       aux; \
\
	/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
	   needed for the matrix we will be packing (if any), but we do it
	   unconditionally to be safe. */ \
	mem_t mem_a = BLIS_MEM_INITIALIZER; \
	mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
	/* Define an array of bszid_t ids, which will act as our substitute for
	   the cntl_t tree. */ \
	bszid_t bszids[8] = { BLIS_NC,      /* 5th loop */ \
	                      BLIS_KC,      /* 4th loop */ \
	                      BLIS_NO_PART, /* pack B */ \
	                      BLIS_MC,      /* 3rd loop */ \
	                      BLIS_NO_PART, /* pack A */ \
	                      BLIS_NR,      /* 2nd loop */ \
	                      BLIS_MR,      /* 1st loop */ \
	                      BLIS_KR };    /* microkernel loop */ \
\
	bszid_t* restrict bszids_jc = &bszids[0]; \
	bszid_t* restrict bszids_pc = &bszids[1]; \
	/*bszid_t* restrict bszids_pb = &bszids[2];*/ \
	bszid_t* restrict bszids_ic = &bszids[3]; \
	/*bszid_t* restrict bszids_pa = &bszids[4];*/ \
	bszid_t* restrict bszids_jr = &bszids[5]; \
	/*bszid_t* restrict bszids_ir = &bszids[6];*/ \
\
	thrinfo_t* restrict thread_jc = NULL; \
	thrinfo_t* restrict thread_pc = NULL; \
	thrinfo_t* restrict thread_pb = NULL; \
	thrinfo_t* restrict thread_ic = NULL; \
	thrinfo_t* restrict thread_pa = NULL; \
	thrinfo_t* restrict thread_jr = NULL; \
	thrinfo_t* restrict thread_ir = NULL; \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \
\
	/* Identify the current thrinfo_t node and then grow the tree. */ \
	thread_jc = thread; \
	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
\
	/* Compute the JC loop thread range for the current thread. */ \
	dim_t jc_start, jc_end; \
	bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
	const dim_t n_local = jc_end - jc_start; \
\
	/* Compute number of primary and leftover components of the JC loop. */ \
	/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
	const dim_t jc_left =   n_local % NC; \
\
	/* Loop over the n dimension (NC rows/columns at a time). */ \
	for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
	{ \
		/* Calculate the thread's current JC block dimension. */ \
		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
\
		ctype* restrict b_jc = b_00 + jj * jcstep_b; \
		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
		/* Identify the current thrinfo_t node and then grow the tree. */ \
		thread_pc = bli_thrinfo_sub_node( thread_jc ); \
		bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
\
		/* Compute the PC loop thread range for the current thread. Note
		   that the PC loop is not parallelized: every thread traverses
		   the entire k dimension. */ \
		const dim_t pc_start = 0, pc_end = k; \
		const dim_t k_local = k; \
\
		/* Compute number of primary and leftover components of the PC loop. */ \
		/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
		const dim_t pc_left =   k_local % KC; \
\
		/* Loop over the k dimension (KC rows/columns at a time). */ \
		for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
		{ \
			/* Calculate the thread's current PC block dimension. */ \
			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
\
			ctype* restrict a_pc = a_00 + pp * pcstep_a; \
			ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
			/* Only apply beta to the first iteration of the pc loop. */ \
			ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
\
			ctype* b_use; \
			inc_t  rs_b_use, cs_b_use, ps_b_use; \
\
			/* Identify the current thrinfo_t node. Note that the thrinfo_t
			   node will have already been created by a previous call to
			   bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
			   cause the tree to grow by two (e.g. to the next bszid that is
			   a normal bszid_t value). */ \
			thread_pb = bli_thrinfo_sub_node( thread_pc ); \
			/*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
\
			/* Determine the packing buffer and related parameters for matrix
			   B. Then call the packm implementation. */ \
			PASTECH2(bls_,ch,packm_b) \
			( \
			  conjb, \
			  KC,     NC, \
			  kc_cur, nc_cur, NR, \
			  &one_local, \
			  b_pc,   rs_b,      cs_b, \
			  &b_use, &rs_b_use, &cs_b_use, \
			                     &ps_b_use, \
			  cntx, \
			  rntm, \
			  &mem_b, \
			  thread_pb  \
			); \
\
			/* Alias b_use so that it's clear this is our current block of
			   matrix B. */ \
			ctype* restrict b_pc_use = b_use; \
\
			/* Identify the current thrinfo_t node and then grow the tree. */ \
			thread_ic = bli_thrinfo_sub_node( thread_pb ); \
			bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
\
			/* Compute the IC loop thread range for the current thread. */ \
			dim_t ic_start, ic_end; \
			bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
			const dim_t m_local = ic_end - ic_start; \
\
			/* Compute number of primary and leftover components of the IC loop. */ \
			/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
			const dim_t ic_left =   m_local % MC; \
\
			/* Loop over the m dimension (MC rows at a time). */ \
			for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
			{ \
				/* Calculate the thread's current IC block dimension. */ \
				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
\
				ctype* restrict a_ic = a_pc + ii * icstep_a; \
				ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
				ctype* a_use; \
				inc_t  rs_a_use, cs_a_use, ps_a_use; \
\
				/* Identify the current thrinfo_t node. Note that the thrinfo_t
				   node will have already been created by a previous call to
				   bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
				   cause the tree to grow by two (e.g. to the next bszid that is
				   a normal bszid_t value). */ \
				thread_pa = bli_thrinfo_sub_node( thread_ic ); \
				/*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
\
				/* Determine the packing buffer and related parameters for matrix
				   A. Then call the packm implementation. */ \
				PASTECH2(bls_,ch,packm_a) \
				( \
				  conja, \
				  MC,     KC, \
				  mc_cur, kc_cur, MR, \
				  &one_local, \
				  a_ic,   rs_a,      cs_a, \
				  &a_use, &rs_a_use, &cs_a_use, \
				                     &ps_a_use, \
				  cntx, \
				  rntm, \
				  &mem_a, \
				  thread_pa  \
				); \
\
				/* Alias a_use so that it's clear this is our current block of
				   matrix A. */ \
				ctype* restrict a_ic_use = a_use; \
\
				/* Identify the current thrinfo_t node and then grow the tree. */ \
				thread_jr = bli_thrinfo_sub_node( thread_pa ); \
				bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
\
				/* Query the number of threads and thread ids for the JR loop.
				   NOTE: These values are only needed when computing the next
				   micropanel of B. */ \
				const dim_t jr_nt  = bli_thread_n_way( thread_jr ); \
				const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
\
				/* Compute number of primary and leftover components of the JR loop. */ \
				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
				dim_t jr_left =   nc_cur % NR; \
\
				/* Compute the JR loop thread range for the current thread. */ \
				dim_t jr_start, jr_end; \
				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
				/* Loop over the n dimension (NR columns at a time). */ \
				for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
				{ \
					const dim_t nr_cur \
					= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
					ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
					ctype* restrict c_jr = c_ic     + j * jrstep_c; \
\
					/* Assume for now that our next panel of B to be the current panel
					   of B. */ \
					ctype* restrict b2 = b_jr; \
\
					/* Identify the current thrinfo_t node. */ \
					thread_ir = bli_thrinfo_sub_node( thread_jr ); \
\
					/* Query the number of threads and thread ids for the IR loop.
					   NOTE: These values are only needed when computing the next
					   micropanel of A. */ \
					const dim_t ir_nt  = bli_thread_n_way( thread_ir ); \
					const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
\
					/* Compute number of primary and leftover components of the IR loop. */ \
					dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
					dim_t ir_left =   mc_cur % MR; \
\
					/* Compute the IR loop thread range for the current thread. */ \
					dim_t ir_start, ir_end; \
					bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
\
					/* Loop over the m dimension (MR rows at a time). */ \
					for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
					{ \
						const dim_t mr_cur \
						= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
\
						ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
						ctype* restrict c_ir = c_jr     + i * irstep_c; \
\
						ctype* restrict a2; \
\
						/* Compute the addresses of the next micropanels of A and B. */ \
						a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
						if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
						{ \
							a2 = a_ic_use; \
							b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
							if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
								b2 = b_pc_use; \
						} \
\
						/* Save the addresses of next micropanels of A and B to the
						   auxinfo_t object. */ \
						bli_auxinfo_set_next_a( a2, &aux ); \
						bli_auxinfo_set_next_b( b2, &aux ); \
\
						/* Handle interior and edge cases separately. */ \
						if ( mr_cur == MR && nr_cur == NR ) \
						{ \
							/* Invoke the gemm microkernel. */ \
							gemm_ukr \
							( \
							  kc_cur, \
							  &alpha_local, \
							  a_ir, \
							  b_jr, \
							  beta_use, \
							  c_ir, rs_c, cs_c, \
							  &aux, \
							  cntx  \
							); \
						} \
						else \
						{ \
							/* Invoke the gemm microkernel on the temporary
							   microtile, then accumulate only the valid
							   mr_cur x nr_cur region into C below. */ \
							gemm_ukr \
							( \
							  kc_cur, \
							  &alpha_local, \
							  a_ir, \
							  b_jr, \
							  &zero_local, \
							  ct, rs_ct, cs_ct, \
							  &aux, \
							  cntx  \
							); \
\
							/* Scale the bottom edge of C and add the result from above. */ \
							PASTEMAC(ch,xpbys_mxn) \
							( \
							  mr_cur, \
							  nr_cur, \
							  ct,   rs_ct, cs_ct, \
							  beta_use, \
							  c_ir, rs_c,  cs_c \
							); \
						} \
					} \
				} \
			} \
\
			/* This barrier is needed to prevent threads from starting to pack
			   the next row panel of B before the current row panel is fully
			   computed upon. */ \
			bli_thread_barrier( thread_pb ); \
		} \
	} \
\
	/* Release any memory that was acquired for packing matrices A and B. */ \
	PASTECH2(bls_,ch,packm_finalize_mem_a) \
	( \
	  rntm, \
	  &mem_a, \
	  thread_pa  \
	); \
	PASTECH2(bls_,ch,packm_finalize_mem_b) \
	( \
	  rntm, \
	  &mem_b, \
	  thread_pb  \
	); \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: c        ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
*/ \
}

// Instantiate the macro above for each of the four standard datatypes.
//INSERT_GENTFUNC_BASIC0( gemm_bp_var1 )
GENTFUNC( float,    s, gemm_bp_var1 )
GENTFUNC( double,   d, gemm_bp_var1 )
GENTFUNC( scomplex, c, gemm_bp_var1 )
GENTFUNC( dcomplex, z, gemm_bp_var1 )

View File

@@ -0,0 +1,596 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"

// Use the name gemm_fp for the function pointer type defined below.
#define FUNCPTR_T gemm_fp

// The signature shared by the typed (per-datatype) variant functions defined
// later in this file; the object-based wrapper selects one of them at
// runtime based on the datatype of C.
typedef void (*FUNCPTR_T)
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m,
       dim_t               n,
       dim_t               k,
       void*      restrict alpha,
       void*      restrict a, inc_t rs_a, inc_t cs_a,
       void*      restrict b, inc_t rs_b, inc_t cs_b,
       void*      restrict beta,
       void*      restrict c, inc_t rs_c, inc_t cs_c,
       cntx_t*    restrict cntx,
       rntm_t*    restrict rntm,
       thrinfo_t* restrict thread
     );
//
// -- gemm-like block-panel algorithm (object interface) -----------------------
//
// Define a function pointer array named ftypes whose elements hold the
// addresses of the typed functions defined below, bls_?gemm_bp_var2().
static FUNCPTR_T GENARRAY_PREF(ftypes,bls_,gemm_bp_var2);

void bls_gemm_bp_var2
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
	// Gather the operand properties needed by the typed variant: the
	// datatype of C, the conjugation statuses of A and B, the problem
	// dimensions, and each operand's buffer address and strides.
	const num_t  dt    = bli_obj_dt( c );

	const conj_t conja = bli_obj_conj_status( a );
	const conj_t conjb = bli_obj_conj_status( b );

	const dim_t  m     = bli_obj_length( c );
	const dim_t  n     = bli_obj_width( c );
	const dim_t  k     = bli_obj_width( a );

	void* restrict buf_a     = bli_obj_buffer_at_off( a );
	void* restrict buf_b     = bli_obj_buffer_at_off( b );
	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt, beta );

	const inc_t rs_a = bli_obj_row_stride( a );
	const inc_t cs_a = bli_obj_col_stride( a );
	const inc_t rs_b = bli_obj_row_stride( b );
	const inc_t cs_b = bli_obj_col_stride( b );
	const inc_t rs_c = bli_obj_row_stride( c );
	const inc_t cs_c = bli_obj_col_stride( c );

	// Dispatch directly through the function pointer array, selecting the
	// typed variant that corresponds to the datatype of C.
	ftypes[ dt ]
	(
	  conja, conjb,
	  m, n, k,
	  buf_alpha,
	  buf_a, rs_a, cs_a,
	  buf_b, rs_b, cs_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
//
// -- gemm-like block-panel algorithm (typed interface) ------------------------
//
// The GENTFUNC macro below generates the typed variant bls_?gemm_bp_var2()
// for each datatype (s, d, c, z; see the instantiations after the macro).
// The variant implements a five-loop blocked matrix multiplication: the
// JC (NC), PC (KC), and IC (MC) cache-blocking loops -- with B packed to
// row panels inside the PC loop and A packed inside the IC loop -- followed
// by the JR (NR) and IR (MR) register-blocking loops around the microkernel
// wrapper bls_?gemm_kernel(). NOTE: the PC loop is not parallelized here;
// every thread uses pc_start = 0, pc_end = k.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for various blocksizes. */ \
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
/* Query the context for the microkernel address and cast it to its
function pointer type. */ \
/*
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
*/ \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
/*
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
*/ \
\
/* Compute partitioning step values for each matrix of each loop. */ \
const inc_t jcstep_c = cs_c; \
const inc_t jcstep_b = cs_b; \
\
const inc_t pcstep_a = cs_a; \
const inc_t pcstep_b = rs_b; \
\
const inc_t icstep_c = rs_c; \
const inc_t icstep_a = rs_a; \
\
const inc_t jrstep_c = cs_c * NR; \
\
const inc_t irstep_c = rs_c * MR; \
\
ctype* restrict a_00 = a; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
/* Make local copies of the scalars to prevent any unnecessary sharing of
cache lines between the cores' caches. */ \
ctype alpha_local = *alpha_cast; \
ctype beta_local = *beta_cast; \
ctype one_local = *PASTEMAC(ch,1); \
/*ctype zero_local = *PASTEMAC(ch,0);*/ \
\
auxinfo_t aux; \
\
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
needed for the matrix we will be packing (if any), but we do it
unconditionally to be safe. */ \
mem_t mem_a = BLIS_MEM_INITIALIZER; \
mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
/* Define an array of bszid_t ids, which will act as our substitute for
the cntl_t tree. */ \
bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \
BLIS_KC, /* 4th loop */ \
BLIS_NO_PART, /* pack B */ \
BLIS_MC, /* 3rd loop */ \
BLIS_NO_PART, /* pack A */ \
BLIS_NR, /* 2nd loop */ \
BLIS_MR, /* 1st loop */ \
BLIS_KR }; /* microkernel loop */ \
\
bszid_t* restrict bszids_jc = &bszids[0]; \
bszid_t* restrict bszids_pc = &bszids[1]; \
/*bszid_t* restrict bszids_pb = &bszids[2];*/ \
bszid_t* restrict bszids_ic = &bszids[3]; \
/*bszid_t* restrict bszids_pa = &bszids[4];*/ \
bszid_t* restrict bszids_jr = &bszids[5]; \
/*bszid_t* restrict bszids_ir = &bszids[6];*/ \
\
thrinfo_t* restrict thread_jc = NULL; \
thrinfo_t* restrict thread_pc = NULL; \
thrinfo_t* restrict thread_pb = NULL; \
thrinfo_t* restrict thread_ic = NULL; \
thrinfo_t* restrict thread_pa = NULL; \
thrinfo_t* restrict thread_jr = NULL; \
thrinfo_t* restrict thread_ir = NULL; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_jc = thread; \
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
\
/* Compute the JC loop thread range for the current thread. */ \
dim_t jc_start, jc_end; \
bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
const dim_t n_local = jc_end - jc_start; \
\
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
const dim_t jc_left = n_local % NC; \
\
/* Loop over the n dimension (NC rows/columns at a time). */ \
for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
{ \
/* Calculate the thread's current JC block dimension. */ \
const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
\
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_pc = bli_thrinfo_sub_node( thread_jc ); \
bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
\
/* Compute the PC loop thread range for the current thread. */ \
const dim_t pc_start = 0, pc_end = k; \
const dim_t k_local = k; \
\
/* Compute number of primary and leftover components of the PC loop. */ \
/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
const dim_t pc_left = k_local % KC; \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
{ \
/* Calculate the thread's current PC block dimension. */ \
const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
\
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop. */ \
ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
\
ctype* b_use; \
inc_t rs_b_use, cs_b_use, ps_b_use; \
\
/* Identify the current thrinfo_t node. Note that the thrinfo_t
node will have already been created by a previous call to
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
cause the tree to grow by two (e.g. to the next bszid that is
a normal bszid_t value). */ \
thread_pb = bli_thrinfo_sub_node( thread_pc ); \
/*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
\
/* Determine the packing buffer and related parameters for matrix
B. Then call the packm implementation. */ \
PASTECH2(bls_,ch,packm_b) \
( \
conjb, \
KC, NC, \
kc_cur, nc_cur, NR, \
&one_local, \
b_pc, rs_b, cs_b, \
&b_use, &rs_b_use, &cs_b_use, \
&ps_b_use, \
cntx, \
rntm, \
&mem_b, \
thread_pb \
); \
\
/* Alias b_use so that it's clear this is our current block of
matrix B. */ \
ctype* restrict b_pc_use = b_use; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_ic = bli_thrinfo_sub_node( thread_pb ); \
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
\
/* Compute the IC loop thread range for the current thread. */ \
dim_t ic_start, ic_end; \
bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
const dim_t m_local = ic_end - ic_start; \
\
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
const dim_t ic_left = m_local % MC; \
\
/* Loop over the m dimension (MC rows at a time). */ \
for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
{ \
/* Calculate the thread's current IC block dimension. */ \
const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
\
ctype* restrict a_ic = a_pc + ii * icstep_a; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
ctype* a_use; \
inc_t rs_a_use, cs_a_use, ps_a_use; \
\
/* Identify the current thrinfo_t node. Note that the thrinfo_t
node will have already been created by a previous call to
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
cause the tree to grow by two (e.g. to the next bszid that is
a normal bszid_t value). */ \
thread_pa = bli_thrinfo_sub_node( thread_ic ); \
/*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
\
/* Determine the packing buffer and related parameters for matrix
A. Then call the packm implementation. */ \
PASTECH2(bls_,ch,packm_a) \
( \
conja, \
MC, KC, \
mc_cur, kc_cur, MR, \
&one_local, \
a_ic, rs_a, cs_a, \
&a_use, &rs_a_use, &cs_a_use, \
&ps_a_use, \
cntx, \
rntm, \
&mem_a, \
thread_pa \
); \
\
/* Alias a_use so that it's clear this is our current block of
matrix A. */ \
ctype* restrict a_ic_use = a_use; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_jr = bli_thrinfo_sub_node( thread_pa ); \
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
\
/* Query the number of threads and thread ids for the JR loop.
NOTE: These values are only needed when computing the next
micropanel of B. */ \
const dim_t jr_nt = bli_thread_n_way( thread_jr ); \
const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
\
/* Compute number of primary and leftover components of the JR loop. */ \
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
dim_t jr_left = nc_cur % NR; \
\
/* Compute the JR loop thread range for the current thread. */ \
dim_t jr_start, jr_end; \
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
{ \
const dim_t nr_cur \
= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/* Assume for now that our next panel of B to be the current panel
of B. */ \
ctype* restrict b2 = b_jr; \
\
/* Identify the current thrinfo_t node. */ \
thread_ir = bli_thrinfo_sub_node( thread_jr ); \
\
/* Query the number of threads and thread ids for the IR loop.
NOTE: These values are only needed when computing the next
micropanel of A. */ \
const dim_t ir_nt = bli_thread_n_way( thread_ir ); \
const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
\
/* Compute number of primary and leftover components of the IR loop. */ \
dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
dim_t ir_left = mc_cur % MR; \
\
/* Compute the IR loop thread range for the current thread. */ \
dim_t ir_start, ir_end; \
bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
{ \
const dim_t mr_cur \
= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
\
ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
ctype* restrict c_ir = c_jr + i * irstep_c; \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next micropanels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_ic_use; \
b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_pc_use; \
} \
\
/* Save the addresses of next micropanels of A and B to the
auxinfo_t object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Call a wrapper to the kernel (which handles edge cases). */ \
PASTECH2(bls_,ch,gemm_kernel) \
( \
MR, \
NR, \
mr_cur, \
nr_cur, \
kc_cur, \
&alpha_local, \
a_ir, rs_a_use, cs_a_use, \
b_jr, rs_b_use, cs_b_use, \
beta_use, \
c_ir, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
\
/* This barrier is needed to prevent threads from starting to pack
the next row panel of B before the current row panel is fully
computed upon. */ \
bli_thread_barrier( thread_pb ); \
} \
} \
\
/* Release any memory that was acquired for packing matrices A and B. */ \
PASTECH2(bls_,ch,packm_finalize_mem_a) \
( \
rntm, \
&mem_a, \
thread_pa \
); \
PASTECH2(bls_,ch,packm_finalize_mem_b) \
( \
rntm, \
&mem_b, \
thread_pb \
); \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var2: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var2: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
*/ \
}
//INSERT_GENTFUNC_BASIC0( gemm_bp_var2 )
GENTFUNC( float, s, gemm_bp_var2 )
GENTFUNC( double, d, gemm_bp_var2 )
GENTFUNC( scomplex, c, gemm_bp_var2 )
GENTFUNC( dcomplex, z, gemm_bp_var2 )
//
// -- gemm-like microkernel wrapper --------------------------------------------
//
// The GENTFUNC macro below generates bls_?gemm_kernel(), a wrapper around
// the context's native gemm microkernel that handles edge cases: full
// MR x NR microtiles are computed directly into C, while partial (edge)
// microtiles are computed into a temporary buffer and then accumulated
// into the mr_cur x nr_cur corner of C.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
( \
const dim_t MR, \
const dim_t NR, \
dim_t mr_cur, \
dim_t nr_cur, \
dim_t kc_cur, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict aux, \
cntx_t* restrict cntx \
) \
{ \
/* Infer the datatype from the ctype. */ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for the microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
/* Handle interior and edge cases separately. */ \
if ( mr_cur == MR && nr_cur == NR ) \
{ \
/* Interior case: a full MR x NR microtile. Invoke the gemm
microkernel directly on C, applying beta as usual. */ \
gemm_ukr \
( \
kc_cur, \
alpha, \
a, \
b, \
beta, \
c, rs_c, cs_c, \
aux, \
cntx \
); \
} \
else \
{ \
ctype zero = *PASTEMAC(ch,0); \
\
/* Clear the temporary C buffer in case it has any infs or NaNs.
NOTE: This clear is performed only here, in the edge-case branch
(the only place ct is read), so that interior microtiles do not
pay for it; previously it executed unconditionally. Ideally this
initialization would be done statically, since this wrapper
executes many times and the overhead of touching the temporary
microtile adds up. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \
\
/* Edge case: compute the full microtile into ct with beta = 0. */ \
gemm_ukr \
( \
kc_cur, \
alpha, \
a, \
b, \
&zero, \
ct, rs_ct, cs_ct, \
aux, \
cntx \
); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn) \
( \
mr_cur, \
nr_cur, \
ct, rs_ct, cs_ct, \
beta, \
c, rs_c, cs_c \
); \
} \
}
//INSERT_GENTFUNC_BASIC0( gemm_kernel )
GENTFUNC( float, s, gemm_kernel )
GENTFUNC( double, d, gemm_kernel )
GENTFUNC( scomplex, c, gemm_kernel )
GENTFUNC( dcomplex, z, gemm_kernel )

View File

@@ -0,0 +1,124 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype the object-based variant interfaces.
//
// GENPROT emits one object-based (obj_t-oriented) prototype per operation
// name; the signatures mirror the definitions of bls_gemm_bp_var?().
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTECH(bls_,opname) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
);
GENPROT( gemm_bp_var1 )
GENPROT( gemm_bp_var2 )
//
// Prototype the typed variant interfaces.
//
// GENTPROT emits one type-erased (void*-buffer) prototype per datatype
// character (s, d, c, z) per variant name.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
);
//INSERT_GENTPROT_BASIC0( gemm_bp_var1 )
GENTPROT( float, s, gemm_bp_var1 )
GENTPROT( double, d, gemm_bp_var1 )
GENTPROT( scomplex, c, gemm_bp_var1 )
GENTPROT( dcomplex, z, gemm_bp_var1 )
//INSERT_GENTPROT_BASIC0( gemm_bp_var2 )
GENTPROT( float, s, gemm_bp_var2 )
GENTPROT( double, d, gemm_bp_var2 )
GENTPROT( scomplex, c, gemm_bp_var2 )
GENTPROT( dcomplex, z, gemm_bp_var2 )
//
// Prototype the typed kernel interfaces.
//
// These match the microkernel wrapper bls_?gemm_kernel(), which handles
// both interior (full MR x NR) and edge-case microtiles.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
( \
const dim_t MR, \
const dim_t NR, \
dim_t mr_cur, \
dim_t nr_cur, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict aux, \
cntx_t* restrict cntx \
);
//INSERT_GENTPROT_BASIC0( gemm_kernel )
GENTPROT( float, s, gemm_kernel )
GENTPROT( double, d, gemm_kernel )
GENTPROT( scomplex, c, gemm_kernel )
GENTPROT( dcomplex, z, gemm_kernel )

View File

@@ -0,0 +1,328 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// packm_init_mem_a: ensure that the given mem_t refers to a pool block
// large enough to hold an m x k packed block of A (with m rounded up to a
// multiple of mr). The chief thread acquires (or re-acquires) the block
// from the memory broker; all other threads receive a copy of the chief's
// mem_t via broadcast.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
dim_t m, \
dim_t k, \
dim_t mr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
{ \
/* Set the pack buffer type so that we are obtaining memory blocks from
the pool dedicated to blocks of A. */ \
const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; \
\
/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
we NEED that last micropanel to have the same ldim (cs_p) as the other
micropanels. Why? Because the microkernel assumes that the register (MR,
NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
const dim_t k_pack = k; \
\
/* Barrier to make sure all threads are caught up and ready to begin the
packm stage. */ \
bli_thread_barrier( thread ); \
\
/* Compute the size of the memory block needed. */ \
siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
\
/* Check the mem_t entry provided by the caller. If it is unallocated,
then we need to acquire a block from the memory broker. */ \
if ( bli_mem_is_unalloc( mem ) ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* Acquire directly to the chief thread's mem_t that was passed in.
It needs to be that mem_t struct, and not a local (temporary)
mem_t, since there is no barrier until after packing is finished,
which could allow a race condition whereby the chief thread exits
the current function before the other threads have a chance to
copy from it. (A barrier would fix that race condition, but then
again, I prefer to keep barriers to a minimum.) */ \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t to all
threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else /* if ( bli_mem_is_alloc( mem ) ) */ \
{ \
/* If the mem_t entry provided by the caller does NOT contain a NULL
buffer, then a block has already been acquired from the memory
broker and cached by the caller. */ \
\
/* As a sanity check, we should make sure that the mem_t object isn't
associated with a block that is too small compared to the size of
the packed matrix buffer that is needed, according to the value
computed above. */ \
siz_t mem_size = bli_mem_size( mem ); \
\
if ( mem_size < size_needed ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* The chief thread releases the existing block associated
with the mem_t, and then re-acquires a new block, saving
the associated mem_t to its passed-in mem_t. (See comment
above for why the acquisition needs to be directly to
the chief thread's passed-in mem_t and not a local
(temporary) mem_t.) */ \
bli_membrk_release \
( \
rntm, \
mem \
); \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else \
{ \
/* If the mem_t entry is already allocated and sufficiently large,
then we use it as-is. No action is needed. */ \
} \
} \
}
//INSERT_GENTFUNC_BASIC0( packm_init_mem_a )
GENTFUNC( float, s, packm_init_mem_a )
GENTFUNC( double, d, packm_init_mem_a )
GENTFUNC( scomplex, c, packm_init_mem_a )
GENTFUNC( dcomplex, z, packm_init_mem_a )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
{ \
	/* Only the chief thread of the outermost group releases the pack
	   buffer; non-chief threads (and a NULL thread) do nothing. The
	   short-circuit guards the ochief query against a NULL thread. */ \
	if ( thread != NULL && bli_thread_am_ochief( thread ) ) \
	{ \
		/* Release the block back to the memory broker, but only if the
		   mem_t is actually allocated (it should be). */ \
		if ( bli_mem_is_alloc( mem ) ) \
		{ \
			bli_membrk_release( rntm, mem ); \
		} \
	} \
}
//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a )
GENTFUNC( float, s, packm_finalize_mem_a )
GENTFUNC( double, d, packm_finalize_mem_a )
GENTFUNC( scomplex, c, packm_finalize_mem_a )
GENTFUNC( dcomplex, z, packm_finalize_mem_a )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
pack_t* restrict schema, \
dim_t m, \
dim_t k, \
dim_t mr, \
dim_t* restrict m_max, \
dim_t* restrict k_max, \
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
dim_t* restrict pd_p, inc_t* restrict ps_p, \
mem_t* restrict mem \
) \
{ \
	/* Round m up to a whole number of mr-sized micropanels. This padding
	   of the final (partial) micropanel is required so that it has the
	   same leading dimension (cs_p) as the others, since the microkernel
	   assumes the register (MR, NR) and storage (PACKMR, PACKNR)
	   blocksizes never change. */ \
	*m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
	*k_max = k; \
\
	/* A is packed to column-stored row micropanels: unit row stride,
	   column stride (and panel dimension) equal to mr, with each
	   micropanel occupying mr * k elements. */ \
	*rs_p = 1; \
	*cs_p = mr; \
	*pd_p = mr; \
	*ps_p = mr * k; \
\
	/* Tag the packed matrix as conventional column-stored row panels. */ \
	*schema = BLIS_PACKED_ROW_PANELS; \
\
	/* Hand back the address of the buffer underlying the mem_t entry
	   that was acquired from the memory pool. */ \
	*p = bli_mem_buffer( mem ); \
}
//INSERT_GENTFUNC_BASIC0( packm_init_a )
GENTFUNC( float, s, packm_init_a )
GENTFUNC( double, d, packm_init_a )
GENTFUNC( scomplex, c, packm_init_a )
GENTFUNC( dcomplex, z, packm_init_a )
//
// Define BLAS-like interfaces to the variant chooser.
//
// packm_a: top-level driver that (1) prepares (or reuses) a pack buffer
// sized for an m_alloc x k_alloc block, (2) initializes the packed-matrix
// parameters for the current m x k block of A, (3) packs A into MR x k
// column-stored micropanels via packm_var1 (defined elsewhere), and
// (4) barriers so computation begins only after packing completes.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
conj_t conj, \
dim_t m_alloc, \
dim_t k_alloc, \
dim_t m, \
dim_t k, \
dim_t mr, \
ctype* restrict kappa, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
{ \
pack_t schema; \
dim_t m_max; \
dim_t k_max; \
dim_t pd_p; \
\
/* Prepare the packing destination buffer. */ \
PASTECH2(bls_,ch,packm_init_mem_a) \
( \
m_alloc, k_alloc, mr, \
cntx, \
rntm, \
mem, \
thread \
); \
\
/* Determine the packing buffer and related parameters for matrix A. */ \
PASTECH2(bls_,ch,packm_init_a) \
( \
&schema, \
m, k, mr, \
&m_max, &k_max, \
p, rs_p, cs_p, \
&pd_p, ps_p, \
mem \
); \
\
/* Pack matrix A to the destination buffer chosen above. Here, the packed
matrix is stored to column-stored MR x k micropanels. */ \
PASTECH2(bls_,ch,packm_var1) \
( \
conj, \
schema, \
m, \
k, \
m_max, \
k_max, \
kappa, \
a, rs_a, cs_a, \
*p, *rs_p, *cs_p, \
pd_p, *ps_p, \
cntx, \
thread \
); \
\
/* Barrier so that packing is done before computation. */ \
bli_thread_barrier( thread ); \
}
//INSERT_GENTFUNC_BASIC0( packm_a )
GENTFUNC( float, s, packm_a )
GENTFUNC( double, d, packm_a )
GENTFUNC( scomplex, c, packm_a )
GENTFUNC( dcomplex, z, packm_a )

View File

@@ -0,0 +1,122 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototypes for the packm helper functions for matrix A: pool-buffer
// acquisition (packm_init_mem_a), release (packm_finalize_mem_a), packed
// parameter initialization (packm_init_a), and the packing driver
// (packm_a). These mirror the GENTFUNC definitions in the corresponding
// source file.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
dim_t m, \
dim_t k, \
dim_t mr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
//INSERT_GENTPROT_BASIC0( packm_init_mem_a )
GENTPROT( float, s, packm_init_mem_a )
GENTPROT( double, d, packm_init_mem_a )
GENTPROT( scomplex, c, packm_init_mem_a )
GENTPROT( dcomplex, z, packm_init_mem_a )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
//INSERT_GENTPROT_BASIC0( packm_finalize_mem_a )
GENTPROT( float, s, packm_finalize_mem_a )
GENTPROT( double, d, packm_finalize_mem_a )
GENTPROT( scomplex, c, packm_finalize_mem_a )
GENTPROT( dcomplex, z, packm_finalize_mem_a )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
pack_t* restrict schema, \
dim_t m, \
dim_t k, \
dim_t mr, \
dim_t* restrict m_max, \
dim_t* restrict k_max, \
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
dim_t* restrict pd_p, inc_t* restrict ps_p, \
mem_t* restrict mem \
); \
//INSERT_GENTPROT_BASIC0( packm_init_a )
GENTPROT( float, s, packm_init_a )
GENTPROT( double, d, packm_init_a )
GENTPROT( scomplex, c, packm_init_a )
GENTPROT( dcomplex, z, packm_init_a )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
conj_t conj, \
dim_t m_alloc, \
dim_t k_alloc, \
dim_t m, \
dim_t k, \
dim_t mr, \
ctype* restrict kappa, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
//INSERT_GENTPROT_BASIC0( packm_a )
GENTPROT( float, s, packm_a )
GENTPROT( double, d, packm_a )
GENTPROT( scomplex, c, packm_a )
GENTPROT( dcomplex, z, packm_a )

View File

@@ -0,0 +1,328 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define bls_?packm_init_mem_b(): prepare the mem_t entry that will hold the
// packed copy of B, acquiring (or, if the cached block is too small,
// re-acquiring) a block from the memory broker as needed.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
dim_t k, \
dim_t n, \
dim_t nr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
{ \
/* Set the pack buffer type so that we are obtaining memory blocks from
the pool dedicated to panels of B. */ \
const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_B_PANEL; \
\
/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
we NEED that last micropanel to have the same ldim (cs_p) as the other
micropanels. Why? Because the microkernel assumes that the register (MR,
NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
const dim_t k_pack = k; \
const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
\
/* Barrier to make sure all threads are caught up and ready to begin the
packm stage. */ \
bli_thread_barrier( thread ); \
\
/* Compute the size of the memory block needed. */ \
siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
\
/* Check the mem_t entry provided by the caller. If it is unallocated,
then we need to acquire a block from the memory broker. */ \
if ( bli_mem_is_unalloc( mem ) ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* Acquire directly to the chief thread's mem_t that was passed in.
It needs to be that mem_t struct, and not a local (temporary)
mem_t, since there is no barrier until after packing is finished,
which could allow a race condition whereby the chief thread exits
the current function before the other threads have a chance to
copy from it. (A barrier would fix that race condition, but then
again, I prefer to keep barriers to a minimum.) */ \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t to all
threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else /* if ( bli_mem_is_alloc( mem ) ) */ \
{ \
/* If the mem_t entry provided by the caller does NOT contain a NULL
buffer, then a block has already been acquired from the memory
broker and cached by the caller. */ \
\
/* As a sanity check, we should make sure that the mem_t object isn't
associated with a block that is too small compared to the size of
the packed matrix buffer that is needed, according to the value
computed above. */ \
siz_t mem_size = bli_mem_size( mem ); \
\
if ( mem_size < size_needed ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* The chief thread releases the existing block associated
with the mem_t, and then re-acquires a new block, saving
the associated mem_t to its passed-in mem_t. (See comment
above for why the acquisition needs to be directly to
the chief thread's passed-in mem_t and not a local
(temporary) mem_t.) */ \
bli_membrk_release \
( \
rntm, \
mem \
); \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else \
{ \
/* If the mem_t entry is already allocated and sufficiently large,
then we use it as-is. No action is needed. */ \
} \
} \
}
//INSERT_GENTFUNC_BASIC0( packm_init_mem_b )
GENTFUNC( float, s, packm_init_mem_b )
GENTFUNC( double, d, packm_init_mem_b )
GENTFUNC( scomplex, c, packm_init_mem_b )
GENTFUNC( dcomplex, z, packm_init_mem_b )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
/* bls_?packm_finalize_mem_b(): return the packing buffer for B to the
   memory broker. Only the chief thread of the outer communicator performs
   the release; every other thread (and a NULL thread) does nothing. */ \
void PASTECH2(bls_,ch,opname) \
     ( \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread \
     ) \
{ \
	/* Nothing to do unless we were given a thread info node and we are the
	   chief thread of its communicator. */ \
	if ( thread != NULL && bli_thread_am_ochief( thread ) ) \
	{ \
		/* Release the mem_t entry back to the broker, but only if it is
		   currently allocated, which it normally should be. */ \
		if ( bli_mem_is_alloc( mem ) ) \
		{ \
			bli_membrk_release( rntm, mem ); \
		} \
	} \
}
//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_b )
GENTFUNC( float, s, packm_finalize_mem_b )
GENTFUNC( double, d, packm_finalize_mem_b )
GENTFUNC( scomplex, c, packm_finalize_mem_b )
GENTFUNC( dcomplex, z, packm_finalize_mem_b )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
/* bls_?packm_init_b(): compute the dimensions, strides, schema, and buffer
   address used when packing matrix B to row-stored column micropanels. */ \
void PASTECH2(bls_,ch,opname) \
     ( \
       pack_t* restrict schema, \
       dim_t k, \
       dim_t n, \
       dim_t nr, \
       dim_t* restrict k_max, \
       dim_t* restrict n_max, \
       ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
       dim_t* restrict pd_p, inc_t* restrict ps_p, \
       mem_t* restrict mem \
     ) \
{ \
	/* NOTE: The packed n dimension MUST be rounded up to a whole multiple
	   of NR so that the last micropanel has the same ldim (cs_p) as the
	   other micropanels; the microkernel assumes that the register (MR,
	   NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
	const dim_t n_panels = n / nr + ( n % nr ? 1 : 0 ); \
	\
	*k_max = k; \
	*n_max = n_panels * nr; \
	\
	/* B is packed to row-stored column panels: unit column stride, row
	   stride (and panel dimension) equal to NR, panel stride of k * NR. */ \
	*cs_p = 1; \
	*rs_p = nr; \
	*pd_p = nr; \
	*ps_p = k * nr; \
	\
	/* The "packed column panels" schema indicates packing to conventional
	   row-stored column panels. */ \
	*schema = BLIS_PACKED_COL_PANELS; \
	\
	/* Hand back the address of the memory associated with the mem_t entry
	   that was previously acquired from the memory pool. */ \
	*p = bli_mem_buffer( mem ); \
}
//INSERT_GENTFUNC_BASIC0( packm_init_b )
GENTFUNC( float, s, packm_init_b )
GENTFUNC( double, d, packm_init_b )
GENTFUNC( scomplex, c, packm_init_b )
GENTFUNC( dcomplex, z, packm_init_b )
//
// Define BLAS-like interfaces to the variant chooser.
//
// bls_?packm_b() prepares the packing buffer for matrix B (acquiring a
// block from the memory broker if necessary), initializes the packing
// parameters, packs B into the buffer via packm_var1, and then executes a
// barrier so that packing is complete before computation proceeds.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
conj_t conj, \
dim_t k_alloc, \
dim_t n_alloc, \
dim_t k, \
dim_t n, \
dim_t nr, \
ctype* restrict kappa, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
{ \
pack_t schema; \
dim_t k_max; \
dim_t n_max; \
dim_t pd_p; \
\
/* Prepare the packing destination buffer. */ \
PASTECH2(bls_,ch,packm_init_mem_b) \
( \
k_alloc, n_alloc, nr, \
cntx, \
rntm, \
mem, \
thread \
); \
\
/* Determine the packing buffer and related parameters for matrix B. */ \
PASTECH2(bls_,ch,packm_init_b) \
( \
&schema, \
k, n, nr, \
&k_max, &n_max, \
p, rs_p, cs_p, \
&pd_p, ps_p, \
mem \
); \
\
/* Pack matrix B to the destination buffer chosen above. Here, the packed
matrix is stored to row-stored k x NR micropanels. */ \
PASTECH2(bls_,ch,packm_var1) \
( \
conj, \
schema, \
k, \
n, \
k_max, \
n_max, \
kappa, \
b, rs_b, cs_b, \
*p, *rs_p, *cs_p, \
pd_p, *ps_p, \
cntx, \
thread \
); \
\
/* Barrier so that packing is done before computation. */ \
bli_thread_barrier( thread ); \
}
//INSERT_GENTFUNC_BASIC0( packm_b )
GENTFUNC( float, s, packm_b )
GENTFUNC( double, d, packm_b )
GENTFUNC( scomplex, c, packm_b )
GENTFUNC( dcomplex, z, packm_b )

View File

@@ -0,0 +1,122 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype bls_?packm_init_mem_b(): acquire/prepare the mem_t block that
// holds the packed copy of B. One prototype per datatype (s, d, c, z).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
dim_t k, \
dim_t n, \
dim_t nr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
//INSERT_GENTPROT_BASIC0( packm_init_mem_b )
GENTPROT( float, s, packm_init_mem_b )
GENTPROT( double, d, packm_init_mem_b )
GENTPROT( scomplex, c, packm_init_mem_b )
GENTPROT( dcomplex, z, packm_init_mem_b )
//
// Prototype bls_?packm_finalize_mem_b(): release the pack buffer for B back
// to the memory broker. One prototype per datatype (s, d, c, z).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
//INSERT_GENTPROT_BASIC0( packm_finalize_mem_b )
GENTPROT( float, s, packm_finalize_mem_b )
GENTPROT( double, d, packm_finalize_mem_b )
GENTPROT( scomplex, c, packm_finalize_mem_b )
GENTPROT( dcomplex, z, packm_finalize_mem_b )
//
// Prototype bls_?packm_init_b(): initialize the packing dimensions, strides,
// schema, and buffer address for B. One prototype per datatype (s, d, c, z).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
pack_t* restrict schema, \
dim_t k, \
dim_t n, \
dim_t nr, \
dim_t* restrict k_max, \
dim_t* restrict n_max, \
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
dim_t* restrict pd_p, inc_t* restrict ps_p, \
mem_t* restrict mem \
); \
//INSERT_GENTPROT_BASIC0( packm_init_b )
GENTPROT( float, s, packm_init_b )
GENTPROT( double, d, packm_init_b )
GENTPROT( scomplex, c, packm_init_b )
GENTPROT( dcomplex, z, packm_init_b )
//
// Prototype bls_?packm_b(): the packm variant chooser for matrix B. One
// prototype per datatype (s, d, c, z).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bls_,ch,opname) \
( \
conj_t conj, \
dim_t k_alloc, \
dim_t n_alloc, \
dim_t k, \
dim_t n, \
dim_t nr, \
ctype* restrict kappa, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
//INSERT_GENTPROT_BASIC0( packm_b )
GENTPROT( float, s, packm_b )
GENTPROT( double, d, packm_b )
GENTPROT( scomplex, c, packm_b )
GENTPROT( dcomplex, z, packm_b )

View File

@@ -0,0 +1,198 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-like interfaces to the variants.
//
// bls_?packm_var1(): pack a matrix to row-stored column panels or
// column-stored row panels (chosen by the schema), with the micropanel
// iterations partitioned among the threads of the current packm
// thrinfo_t node.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
( \
trans_t transc, \
pack_t schema, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
cntx_t* restrict cntx, \
thrinfo_t* restrict thread \
) \
{ \
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
\
dim_t iter_dim; \
dim_t n_iter; \
dim_t it, ic; \
dim_t ic0; \
doff_t ic_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
inc_t vs_c; \
inc_t ldc; \
inc_t ldp; \
conj_t conjc; \
\
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
bool row_stored = bli_is_col_packed( schema ); \
/*bool col_stored = bli_is_row_packed( schema );*/ \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
vs_c = cs_c; \
ldc = rs_c; \
ldp = rs_p; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
vs_c = rs_c; \
ldc = cs_c; \
ldp = cs_p; \
} \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
{ \
ic0 = 0; \
ic_inc = panel_dim_max; \
} \
\
ctype* restrict p_begin = p_cast; \
\
/* Query the number of threads and thread ids from the current thread's
packm thrinfo_t node. */ \
const dim_t nt = bli_thread_n_way( thread ); \
const dim_t tid = bli_thread_work_id( thread ); \
\
/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
( void )nt; \
( void )tid; \
\
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
will depend on whether slab or round-robin partitioning was requested
at configure-time. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every logical micropanel in the source matrix. */ \
for ( ic = ic0, it = 0; it < n_iter; \
ic += ic_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
ctype* restrict c_begin = c_cast + (ic )*vs_c; \
\
ctype* restrict c_use = c_begin; \
ctype* restrict p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
/* The definition of bli_packm_my_iter() will depend on whether slab
or round-robin partitioning was requested at configure-time. (The
default is slab.) */ \
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
{ \
PASTEMAC(ch,packm_cxk) \
( \
conjc, \
schema, \
panel_dim_i, \
panel_dim_max, \
panel_len_i, \
panel_len_max_i, \
kappa_cast, \
c_use, vs_c, ldc, \
p_use, ldp, \
cntx \
); \
} \
\
/*
if ( !row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
else \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
*/ \
\
p_begin += ps_p; \
} \
}
//INSERT_GENTFUNC_BASIC0( packm_var1 )
GENTFUNC( float, s, packm_var1 )
GENTFUNC( double, d, packm_var1 )
GENTFUNC( scomplex, c, packm_var1 )
GENTFUNC( dcomplex, z, packm_var1 )

View File

@@ -0,0 +1,63 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-like interfaces to the variants.
//
// One prototype of bls_?packm_var1() is emitted per floating-point
// datatype (s, d, c, z).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTECH2(bls_,ch,varname) \
( \
trans_t transc, \
pack_t schema, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
cntx_t* restrict cntx, \
thrinfo_t* restrict thread \
);
//INSERT_GENTPROT_BASIC0( packm_var1 )
GENTPROT( float, s, packm_var1 )
GENTPROT( double, d, packm_var1 )
GENTPROT( scomplex, c, packm_var1 )
GENTPROT( dcomplex, z, packm_var1 )

View File

@@ -0,0 +1,73 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SBX_L3_DECOR_H
#define BLIS_SBX_L3_DECOR_H
// -- sup definitions ----------------------------------------------------------
// Level-3 sup internal function type. This is the signature of the
// operation implementation that the thread decorator executes on each
// thread.
typedef void (*l3sbxint_t)
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Level-3 sup thread decorator prototype. The definition comes from one of
// the headers included below, selected at compile time by the threading
// configuration (single, OpenMP, or pthreads).
void bls_l3_thread_decorator
(
l3sbxint_t func,
opid_t family,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);
// Include definitions specific to the method of multithreading.
#include "bls_l3_decor_single.h"
#include "bls_l3_decor_openmp.h"
#include "bls_l3_decor_pthreads.h"
#endif

View File

@@ -0,0 +1,138 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_OPENMP
// Define a dummy thread entry function, which is needed in the pthreads
// version, so that when building Windows DLLs (with OpenMP enabled or with
// no multithreading) we don't risk having an unresolved symbol. The dummy
// ignores its argument and always returns NULL.
void* bls_l3_thread_entry( void* data_void )
{
	// Explicitly discard the unused argument to avoid unused-parameter
	// warnings (matching the `( void )family;` idiom used in the pthreads
	// entry function).
	( void )data_void;

	return NULL;
}
//#define PRINT_THRINFO

// OpenMP-based definition of the level-3 sup thread decorator: spawns
// n_threads OpenMP threads, gives each a thread-local rntm_t and a root
// thrinfo_t node, and invokes func on every thread.
void bls_l3_thread_decorator
     (
       l3sbxint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     )
{
	// Query the total number of threads from the rntm_t object.
	const dim_t n_threads = bli_rntm_num_threads( rntm );

	// The operation family is not used by this implementation; discard it
	// explicitly to avoid unused-parameter warnings. (The pthreads-based
	// thread entry function uses the same idiom.)
	( void )family;

	// NOTE: The sba was initialized in bli_init().

	// Check out an array_t from the small block allocator. This is done
	// with an internal lock to ensure only one application thread accesses
	// the sba at a time. bli_sba_checkout_array() will also automatically
	// resize the array_t, if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Access the pool_t* for thread 0 and embed it into the rntm. We do
	// this up-front only so that we have the rntm_t.sba_pool field
	// initialized and ready for the global communicator creation below.
	bli_sba_rntm_set_pool( 0, array, rntm );

	// Set the packing block allocator field of the rntm. This will be
	// inherited by all of the child threads when they make local copies of
	// the rntm below.
	bli_membrk_rntm_set_membrk( rntm );

	// Allocate a global communicator for the root thrinfo_t structures.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );

	_Pragma( "omp parallel num_threads(n_threads)" )
	{
		// Create a thread-local copy of the master thread's rntm_t. This is
		// necessary since we want each thread to be able to track its own
		// small block pool_t as it executes down the function stack.
		rntm_t           rntm_l = *rntm;
		rntm_t* restrict rntm_p = &rntm_l;

		// Query the thread's id from OpenMP.
		const dim_t tid = omp_get_thread_num();

		// Check for a somewhat obscure OpenMP thread-mismatch issue.
		// NOTE: This calls the same function used for the conventional/large
		// code path.
		bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );

		// Use the thread id to access the appropriate pool_t* within the
		// array_t, and use it to set the sba_pool field within the rntm_t.
		// If the pool_t* element within the array_t is NULL, it will first
		// be allocated/initialized.
		bli_sba_rntm_set_pool( tid, array, rntm_p );

		thrinfo_t* thread = NULL;

		// Create the root node of the thread's thrinfo_t structure.
		bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );

		// Execute the requested operation on this thread.
		func
		(
		  alpha,
		  a,
		  b,
		  beta,
		  c,
		  cntx,
		  rntm_p,
		  thread
		);

		// Free the current thread's thrinfo_t structure.
		bli_l3_sup_thrinfo_free( rntm_p, thread );
	}

	// We shouldn't free the global communicator since it was already freed
	// by the global communicator's chief thread in bli_l3_thrinfo_free()
	// (called from the thread entry function).

	// Check the array_t back into the small block allocator. Similar to the
	// check-out, this is done using a lock embedded within the sba to ensure
	// mutual exclusion.
	bli_sba_checkin_array( array );
}
#endif

View File

@@ -0,0 +1,44 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SBX_L3_DECOR_OPENMP_H
#define BLIS_SBX_L3_DECOR_OPENMP_H
// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP
#endif
#endif

View File

@@ -0,0 +1,213 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_PTHREADS
// A data structure to assist in passing operands to additional threads.
// The chief thread fills one instance per thread; each thread's entry
// function then unpacks its instance.
typedef struct thread_data
{
l3sbxint_t func; // the level-3 sup implementation each thread executes
opid_t family; // operation family (currently unused by the entry function)
obj_t* alpha; // operands and scalars of the operation
obj_t* a;
obj_t* b;
obj_t* beta;
obj_t* c;
cntx_t* cntx; // context
rntm_t* rntm; // the chief thread's runtime object (copied locally per thread)
dim_t tid; // this thread's id
thrcomm_t* gl_comm; // global communicator for the root thrinfo_t nodes
array_t* array; // array_t checked out from the small block allocator
} thread_data_t;
// Entry point function for additional threads.
void* bls_l3_thread_entry( void* data_void )
{
// Unpack the operands and metadata prepared for this thread by the chief
// thread in bls_l3_thread_decorator().
thread_data_t* data = data_void;
l3sbxint_t func = data->func;
opid_t family = data->family;
obj_t* alpha = data->alpha;
obj_t* a = data->a;
obj_t* b = data->b;
obj_t* beta = data->beta;
obj_t* c = data->c;
cntx_t* cntx = data->cntx;
rntm_t* rntm = data->rntm;
dim_t tid = data->tid;
array_t* array = data->array;
thrcomm_t* gl_comm = data->gl_comm;
// The operation family is currently unused; discard it explicitly to
// avoid compiler warnings.
( void )family;
// Create a thread-local copy of the master thread's rntm_t. This is
// necessary since we want each thread to be able to track its own
// small block pool_t as it executes down the function stack.
rntm_t rntm_l = *rntm;
rntm_t* restrict rntm_p = &rntm_l;
// Use the thread id to access the appropriate pool_t* within the
// array_t, and use it to set the sba_pool field within the rntm_t.
// If the pool_t* element within the array_t is NULL, it will first
// be allocated/initialized.
bli_sba_rntm_set_pool( tid, array, rntm_p );
thrinfo_t* thread = NULL;
// Create the root node of the current thread's thrinfo_t structure.
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
// Execute the requested operation on this thread.
func
(
alpha,
a,
b,
beta,
c,
cntx,
rntm_p,
thread
);
// Free the current thread's thrinfo_t structure.
bli_l3_sup_thrinfo_free( rntm_p, thread );
return NULL;
}
// Parallelize execution of a level-3 sandbox operation using POSIX threads.
// Thread 0 is the calling (chief) thread; n_threads-1 additional pthreads
// are spawned, and all of them funnel into bls_l3_thread_entry().
// The interface mirrors the framework's bli_l3_sup_thread_decorator().
void bls_l3_thread_decorator
     (
       l3sbxint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     )
{
	// Query the total number of threads from the rntm_t.
	const dim_t n_threads = bli_rntm_num_threads( rntm );

	// NOTE: The sba was initialized in bli_init().

	// Check out an array_t from the small block allocator. This is done
	// with an internal lock to ensure only one application thread accesses
	// the sba at a time. bli_sba_checkout_array() will also automatically
	// resize the array_t, if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Access the pool_t* for thread 0 and embed it into the rntm. We do
	// this up-front only so that we have the rntm_t.sba_pool field
	// initialized and ready for the global communicator creation below.
	bli_sba_rntm_set_pool( 0, array, rntm );

	// Set the packing block allocator field of the rntm. This will be
	// inherited by all of the child threads when they make local copies of
	// the rntm below.
	bli_membrk_rntm_set_membrk( rntm );

	// Allocate a global communicator for the root thrinfo_t structures.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );

	// Allocate an array of pthread objects and auxiliary data structs to pass
	// to the thread entry functions.

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bls_l3_thread_decorator().pth: " );
	#endif
	bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bls_l3_thread_decorator().pth: " );
	#endif
	thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );

	// NOTE: We must iterate backwards so that the chief thread (thread id 0)
	// can spawn all other threads before proceeding with its own computation.
	for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
	{
		// Set up the thread data for the current thread id (including
		// thread 0, which uses its slot directly below).
		datas[tid].func    = func;
		datas[tid].family  = family;
		datas[tid].alpha   = alpha;
		datas[tid].a       = a;
		datas[tid].b       = b;
		datas[tid].beta    = beta;
		datas[tid].c       = c;
		datas[tid].cntx    = cntx;
		datas[tid].rntm    = rntm;
		datas[tid].tid     = tid;
		datas[tid].gl_comm = gl_comm;
		datas[tid].array   = array;

		// Spawn additional threads for ids greater than 0; the chief thread
		// (id 0) calls the entry function directly.
		if ( tid != 0 )
			bli_pthread_create( &pthreads[tid], NULL, &bls_l3_thread_entry, &datas[tid] );
		else
			bls_l3_thread_entry( ( void* )(&datas[0]) );
	}

	// We shouldn't free the global communicator since it was already freed
	// by the global communicator's chief thread in bli_l3_sup_thrinfo_free()
	// (called from the thread entry function).

	// Thread 0 waits for additional threads to finish.
	for ( dim_t tid = 1; tid < n_threads; tid++ )
	{
		bli_pthread_join( pthreads[tid], NULL );
	}

	// Check the array_t back into the small block allocator. Similar to the
	// check-out, this is done using a lock embedded within the sba to ensure
	// mutual exclusion.
	bli_sba_checkin_array( array );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bls_l3_thread_decorator().pth: " );
	#endif
	bli_free_intl( pthreads );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bls_l3_thread_decorator().pth: " );
	#endif
	bli_free_intl( datas );
}
#endif

View File

@@ -0,0 +1,47 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SBX_L3_DECOR_PTHREADS_H
#define BLIS_SBX_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype. The void* argument is expected to point to
// a thread_data_t populated by bls_l3_thread_decorator().
void* bls_l3_thread_entry( void* data_void );

#endif

#endif

View File

@@ -0,0 +1,141 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifndef BLIS_ENABLE_MULTITHREADING
#define SKIP_THRINFO_TREE
// Single-threaded implementation of the level-3 sandbox thread decorator.
// Compiled only when multithreading is disabled; executes func directly on
// the calling thread with a global single-threaded thrinfo_t object.
void bls_l3_thread_decorator
     (
       l3sbxint_t func,
       opid_t     family,
       //pack_t     schema_a,
       //pack_t     schema_b,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     )
{
	// For sequential execution, we use only one thread.
	const dim_t n_threads = 1;

	// NOTE: The sba was initialized in bli_init().

	// Check out an array_t from the small block allocator. This is done
	// with an internal lock to ensure only one application thread accesses
	// the sba at a time. bli_sba_checkout_array() will also automatically
	// resize the array_t, if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Access the pool_t* for thread 0 and embed it into the rntm.
	bli_sba_rntm_set_pool( 0, array, rntm );

	// Set the packing block allocator field of the rntm.
	bli_membrk_rntm_set_membrk( rntm );

#ifndef SKIP_THRINFO_TREE
	// Allocate a global communicator for the root thrinfo_t structures.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
#endif

	{
		// NOTE: We don't need to create another copy of the rntm_t since
		// it was already copied in one of the high-level oapi functions.
		rntm_t* restrict rntm_p = rntm;

		// There is only one thread id (for the chief thread).
		const dim_t tid = 0;

		// Use the thread id to access the appropriate pool_t* within the
		// array_t, and use it to set the sba_pool field within the rntm_t.
		// If the pool_t* element within the array_t is NULL, it will first
		// be allocated/initialized.
		// NOTE: This is commented out because, in the single-threaded case,
		// this is redundant since it's already been done above.
		//bli_sba_rntm_set_pool( tid, array, rntm_p );

#ifndef SKIP_THRINFO_TREE
		thrinfo_t* thread = NULL;

		// Create the root node of the thread's thrinfo_t structure.
		bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
#else
		// This optimization allows us to use one of the global thrinfo_t
		// objects for single-threaded execution rather than grow one from
		// scratch. The key is that bli_thrinfo_sup_grow(), which is called
		// from within the variants, will immediately return if it detects
		// that the thrinfo_t* passed into it is either
		// &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED.
		thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED;

		( void )tid;
#endif

		// Hand control to the internal back-end implementation.
		func
		(
		  alpha,
		  a,
		  b,
		  beta,
		  c,
		  cntx,
		  rntm_p,
		  thread
		);

#ifndef SKIP_THRINFO_TREE
		// Free the current thread's thrinfo_t structure.
		bli_l3_sup_thrinfo_free( rntm_p, thread );
#endif
	}

	// We shouldn't free the global communicator since it was already freed
	// by the global communicator's chief thread in bli_l3_sup_thrinfo_free()
	// (called above).

	// Check the array_t back into the small block allocator. Similar to the
	// check-out, this is done using a lock embedded within the sba to ensure
	// mutual exclusion.
	bli_sba_checkin_array( array );
}
#endif

View File

@@ -0,0 +1,44 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SBX_L3_DECOR_SINGLE_H
#define BLIS_SBX_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (No additional declarations are currently needed by the single-threaded
// implementation of bls_l3_thread_decorator().)
#ifndef BLIS_ENABLE_MULTITHREADING
#endif

#endif

View File

@@ -32,7 +32,14 @@
*/
// This file is needed for the BLIS build system.
// Given the current architecture of BLIS sandboxes, bli_gemmnat() is the
// entry point to any sandbox implementation.
// NOTE: This function is implemented identically to the function that it
// overrides in frame/ind/oapi/bli_l3_nat_oapi.c. This means that we are
// forgoing the option of customizing the implementations that underlie
// bli_gemm() and bli_?gemm(). Any new code defined in this sandbox
// directory, however, will be included in the BLIS library.
#include "blis.h"

View File

@@ -6,7 +6,7 @@ function r_val = plot_panel_4x5 ...
thr_str, ...
dirpath, ...
arch_str, ...
vend_str ...
vend_leg_str ...
)
impl = 'octave';
@@ -25,11 +25,13 @@ else
position = [100 100 1864 1540];
papersize = [15.6 19.4];
%leg_pos_st = [1.15 8.70 2.1 1.2 ]; % (dgemm)
leg_pos_st = [1.60 8.80 2.1 1.2 ]; % (dgemm)
%leg_pos_st = [1.60 8.80 2.1 1.2 ]; % (dgemm)
leg_pos_st = [15.90 13.60 2.1 1.2 ]; % (strsm)
%leg_pos_mt = [12.20 13.60 2.1 1.2 ]; % (strmm)
%leg_pos_mt = [5.30 12.60 2.1 1.2 ]; % (ssymm)
%leg_pos_mt = [8.50 13.62 2.1 1.2 ]; % (ssyrk)
leg_pos_mt = [5.30 5.10 2.1 1.2 ]; % (chemm)
%leg_pos_mt = [5.30 5.10 2.1 1.2 ]; % (chemm)
leg_pos_mt = [15.90 13.60 2.1 1.2 ]; % (strsm)
sp_margins = [ 0.068 0.051 ];
end
@@ -59,7 +61,7 @@ eige_str = 'eigen';
% Create filename "templates" for the files that contain the performance
% results.
filetemp = '%s/output_%s_%s_%s.m'
filetemp = '%s/output_%s_%s_%s.m';
filetemp_blis = sprintf( filetemp, '%s', '%s', '%s', blis_str );
filetemp_open = sprintf( filetemp, '%s', '%s', '%s', open_str );
filetemp_vend = sprintf( filetemp, '%s', '%s', '%s', vend_str );
@@ -102,7 +104,7 @@ for opi = 1:n_opnames
data_blis, ...
data_open, ...
data_eige, ...
data_vend, vend_str, ...
data_vend, vend_leg_str, ...
nth, ...
4, 5, ...
cfreq, ...

View File

@@ -24,7 +24,6 @@ plot_panel_4x5(2.60,16,64, '1s','../results/zen2/20200929/jc4ic4jr4','zen2','MKL
plot_panel_4x5(2.60,16,128,'2s','../results/zen2/20200929/jc8ic4jr4','zen2','MKL'); close all; clear all;
% a64fx
plot_panel_4x5(2.20,32,1, 'st','../results/a64fx/20210316/st', 'a64fx','Fujitsu SSL2'); close all; clear all;
plot_panel_4x5(2.20,32,12,'1s','../results/a64fx/20210316/jc1ic4jr3', 'a64fx','Fujitsu SSL2'); close all; clear all;
plot_panel_4x5(2.20,32,48,'2s','../results/a64fx/20210316/jc1ic4jr12','a64fx','Fujitsu SSL2'); close all; clear all;
plot_panel_4x5(2.20,32,1, 'st','../results/a64fx/20210520/st', 'a64fx','Fujitsu SSL2'); close all; clear all;
plot_panel_4x5(2.20,32,12,'1s','../results/a64fx/20210520/jc1ic1jr12','a64fx','Fujitsu SSL2'); close all; clear all;
plot_panel_4x5(2.20,32,48,'2s','../results/a64fx/20210520/jc1ic4jr12','a64fx','Fujitsu SSL2'); close all; clear all;

View File

@@ -254,18 +254,17 @@ void libblis_test_gemm_experiment
bli_setsc( 0.9, 1.0, &beta );
}
#if 0
//bli_setm( &BLIS_ONE, &a );
bli_setsc( 1.0, 0.0, &alpha );
bli_setsc( 1.0, 0.0, &beta );
#endif
// Randomize A, B, and C, and save C.
libblis_test_mobj_randomize( params, TRUE, &a );
libblis_test_mobj_randomize( params, TRUE, &b );
libblis_test_mobj_randomize( params, TRUE, &c );
bli_copym( &c, &c_save );
//bli_setm( &BLIS_ONE, &a );
//bli_setsc( 1.0, 0.0, &alpha );
//bli_setsc( 0.0, 0.0, &beta );
//bli_setm( &BLIS_ONE, &a );
//bli_setsc( 1.0, 0.0, &alpha );
//bli_setsc( 0.0, 0.0, &beta );
// Apply the parameters.
bli_obj_set_conjtrans( transa, &a );
@@ -456,11 +455,13 @@ bli_printm( "c", c, "%5.2f", "" );
// bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR )
//bli_printm( "c before", c, "%6.3f", "" );
bli_gemm( alpha, a, b, beta, c );
//bls_gemm( alpha, a, b, beta, c );
#if 0
if ( bli_obj_length( c ) == 12 &&
bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR )
bli_printm( "c after", c, "%6.3f", "" );
#endif
//bli_printm( "c after", c, "%5.2f", "" );
break;
default: