Added 512b SVE-based a64fx subconfig + SVE kernels.

Details: - Added 512-bit specific 'a64fx' subconfiguration that uses empirically tuned block size by Stepan Nassyr. This subconfig also sets the sector cache size and enables memory-tagging code in SVE gemm kernels. This subconfig utilizes (16, k) and (10, k) DPACKM kernels. - Added a vector-length agnostic 'armsve' subconfiguration that computes blocksizes according to the analytical model. This part is ported from Stepan Nassyr's repository. - Implemented vector-length-agnostic [d/s/sh] gemm kernels for Arm SVE at size (2*VL, 10). These kernels use unindexed FMLA instructions because indexed FMLA takes 2 FMA units in many implementations. PS: There are indexed-FLMA kernels in Stepan Nassyr's repository. - Implemented 512-bit SVE dpackm kernels with in-register transpose support for sizes (16, k) and (10, k). - Extended 256-bit SVE dpackm kernels by Linaro Ltd. to 512-bit for size (12, k). This dpackm kernel is not currently used by any subconfiguration. - Implemented several experimental dgemmsup kernels which would improve performance in a few cases. However, those dgemmsup kernels generally underperform hence they are not currently used in any subconfig. - Note: This commit squashes several commits submitted by RuQing Xu via PR #424.
2026-06-29 10:47:16 +00:00 · 2021-05-19 23:52:29 +09:00
parent 5d46dbee4a
commit 61584deddf
34 changed files with 4957 additions and 9 deletions
--- a/config/a64fx/bli_a64fx_sector_cache.h
+++ b/config/a64fx/bli_a64fx_sector_cache.h
@@ -0,0 +1,117 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2019, Forschunszentrum Juelich
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+    // A64FX: set up cache sizes
+    //
+    // Reference: A64FX (TM) specification Fujitsu HPC Extension
+    // Link:      https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_Specification_HPC_Extension_v1_EN.pdf
+    //
+    // 63:15 |    14:12    |  11  |    10:08    |  07  |    06:04    |  03  |    02:00    |
+    // RES0  | l1_sec3_max | RES0 | l1_sec2_max | RES0 | l1_sec1_max | RES0 | l1_sec0_max |
+    //
+    // the bits set number of maximum sectors from 0-7
+    // 000 - 0
+    // 001 - 1
+    // 010 - 2
+    // 011 - 3
+    // 100 - 4
+    // 101 - 5
+    // 110 - 6
+    // 111 - 7
+    //
+    // For L1 we want to maximize the number of sectors for B
+    // Configuration 1: 1 sector for  C (sector 3)
+    //                  1 sector for  A (sector 1)
+    //                  6 sectors for B (sector 2)
+    //                  0 sectors for the rest (sector 0)
+    // 
+    // 16b bitfield conf. 1: 0b0 001 0 110 0 001 0 000
+    //
+    // Configuration 2: 1 sector for  C (sector 3)
+    //                  1 sector for  A (sector 1)
+    //                  5 sectors for B (sector 2)
+    //                  1 sectors for the rest (sector 0)
+    // 
+    // 16b bitfield conf. 2: 0b0 001 0 101 0 001 0 001
+    //
+    // accessing the control register:
+    //
+    // MRS <Xt>, S3_3_C11_C8_2
+    // MSR S3_3_C11_C8_2, <Xt>
+    //
+    // TODO: First tests showed no change in performance, a deeper investigation
+    //       is necessary
+#define A64FX_SETUP_SECTOR_CACHE_SIZES(config_bitfield)\
+{\
+    uint64_t sector_cache_config = config_bitfield;\
+    __asm__ volatile(\
+            "msr s3_3_c11_c8_2,%[sector_cache_config]"\
+            :\
+            : [sector_cache_config] "r" (sector_cache_config)\
+            :\
+            );\
+}
+
+#define A64FX_SETUP_SECTOR_CACHE_SIZES_L2(config_bitfield)\
+{\
+    uint64_t sector_cache_config = config_bitfield;\
+    __asm__ volatile(\
+            "msr s3_3_c15_c8_2,%[sector_cache_config]"\
+            :\
+            : [sector_cache_config] "r" (sector_cache_config)\
+            :\
+            );\
+}
+
+
+#define A64FX_SET_CACHE_SECTOR(areg, tag, sparereg)\
+" mov "#sparereg", "#tag"      \n\t"\
+" lsl "#sparereg", "#sparereg", 56  \n\t"\
+" orr "#areg", "#areg", "#sparereg"   \n\t"
+
+#define A64FX_READ_SECTOR_CACHE_SIZES(output_uint64)\
+__asm__ volatile(\
+        "mrs %["#output_uint64"],s3_3_c11_c8_2"\
+        : [output_uint64] "=r" (output_uint64)\
+        : \
+        :\
+        );
+
+#define A64FX_SCC(sec0,sec1,sec2,sec3)\
+    (uint64_t)((sec0 & 0x7LU) | ((sec1 & 0x7LU) << 4) | ((sec2 & 0x7LU) << 8) | ((sec3 & 0x7LU) << 12))
+
+#define A64FX_SCC_L2(sec02,sec13)\
+    (uint64_t)((sec02 & 0x1FLU) | ((sec13 & 0x1FLU) << 8))
+
--- a/config/a64fx/bli_cntx_init_a64fx.c
+++ b/config/a64fx/bli_cntx_init_a64fx.c
@@ -0,0 +1,151 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "bli_a64fx_sector_cache.h"
+
+void bli_cntx_init_a64fx( cntx_t* cntx )
+{
+	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+	blksz_t thresh[ BLIS_NUM_THRESH ];
+
+	// Set default kernel blocksizes and functions.
+	bli_cntx_init_a64fx_ref( cntx );
+
+	// -------------------------------------------------------------------------
+
+	// Update the context with optimized native gemm micro-kernels and
+	// their storage preferences.
+	bli_cntx_set_l3_nat_ukrs
+	(
+	  2,
+	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
+	  cntx
+	);
+
+	// Set SVE-512 packing routine.
+	bli_cntx_set_packm_kers
+	(
+	  3,
+	  BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
+	  BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
+	  BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
+	  cntx
+	);
+
+	// Initialize level-3 blocksize objects with architecture-specific values.
+	//                                           s      d      c      z
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    32,    16,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    10,    10,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   256,   128,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],  2048,  2048,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880,    -1,    -1 );
+
+	// Update the context with the current architecture's register and cache
+	// blocksizes (and multiples) for native execution.
+	bli_cntx_set_blkszs
+	(
+	  BLIS_NAT, 5,
+	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
+	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
+	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
+	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
+	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+	  cntx
+	);
+
+#if 0
+	// Initialize sup thresholds with architecture-appropriate values.
+	//                                          s     d     c     z
+	bli_blksz_init_easy( &thresh[ BLIS_MT ],   -1,   65,   -1,   -1 );
+	bli_blksz_init_easy( &thresh[ BLIS_NT ],   -1,   65,   -1,   -1 );
+	bli_blksz_init_easy( &thresh[ BLIS_KT ],   -1,   65,   -1,   -1 );
+
+	// Initialize the context with the sup thresholds.
+	bli_cntx_set_l3_sup_thresh
+	(
+	  3,
+	  BLIS_MT, &thresh[ BLIS_MT ],
+	  BLIS_NT, &thresh[ BLIS_NT ],
+	  BLIS_KT, &thresh[ BLIS_KT ],
+	  cntx
+	);
+
+	// Update the context with optimized small/unpacked gemm kernels.
+	bli_cntx_set_l3_sup_kers
+	(
+	  4,
+	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
+	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
+	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
+	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
+	  cntx
+	);
+
+	// Initialize level-3 sup blocksize objects with architecture-specific
+	// values.
+	//                                           s      d      c      z
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1,    10,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1,    16,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,   120,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,   256,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,  4080,    -1,    -1 );
+
+	// Update the context with the current architecture's register and cache
+	// blocksizes for small/unpacked level-3 problems.
+	bli_cntx_set_l3_sup_blkszs
+	(
+	  5,
+	  BLIS_NC, &blkszs[ BLIS_NC ],
+	  BLIS_KC, &blkszs[ BLIS_KC ],
+	  BLIS_MC, &blkszs[ BLIS_MC ],
+	  BLIS_NR, &blkszs[ BLIS_NR ],
+	  BLIS_MR, &blkszs[ BLIS_MR ],
+	  cntx
+	);
+#endif
+
+	// Set A64FX cache sector sizes for each PE/CMG
+	// SC Fugaku might disable users' setting cache sizes.
+#if !defined(CACHE_SECTOR_SIZE_READONLY)
+#pragma omp parallel
+	{
+	  A64FX_SETUP_SECTOR_CACHE_SIZES(A64FX_SCC(0,1,3,0))
+	  A64FX_SETUP_SECTOR_CACHE_SIZES_L2(A64FX_SCC_L2(9,28))
+	}
+#endif
+
+}
+
--- a/config/a64fx/bli_family_a64fx.h
+++ b/config/a64fx/bli_family_a64fx.h
@@ -0,0 +1,46 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_FAMILY_H
+//#define BLIS_FAMILY_H
+
+
+// -- MEMORY ALLOCATION --------------------------------------------------------
+
+#define BLIS_SIMD_ALIGN_SIZE    256
+#define BLIS_SIMD_NUM_REGISTERS 32
+
+
+//#endif
+
--- a/config/a64fx/make_defs.mk
+++ b/config/a64fx/make_defs.mk
@@ -0,0 +1,82 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG    := a64fx
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    := -D_GNU_SOURCE -D_A64FX
+CMISCFLAGS     :=
+CPICFLAGS      :=
+CWARNFLAGS     :=
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O3 -ftree-vectorize -march=armv8-a+sve
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS     := $(COPTFLAGS)
+CKVECFLAGS     :=
+
+# Flags specific to reference kernels.
+CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
--- a/config/armsve/bli_armsve_config_utils.c
+++ b/config/armsve/bli_armsve_config_utils.c
@@ -0,0 +1,92 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2019, Forschunszentrum Juelich
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "blis.h"
+
+dim_t bli_vl_bits_armsve(void)
+{ \
+    uint64_t vl = 0;
+    __asm__ (
+      " mov  x0, xzr   \n\t"
+      " incb x0        \n\t"
+      " mov  %[vl], x0 \n\t"
+    : [vl] "=r" (vl)
+    : 
+    : "x0"
+     );
+    return vl;
+}
+
+
+#define EXPANDMAC_BLKSZ_ARMSVE(ch, S_Data) \
+void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \
+                                  dim_t *k_c_, dim_t *m_c_, dim_t *n_c_) \
+{ \
+    dim_t W_L1 = bli_env_get_var("BLIS_SVE_W_L1", W_L1_SVE_DEFAULT); \
+    dim_t N_L1 = bli_env_get_var("BLIS_SVE_N_L1", N_L1_SVE_DEFAULT); \
+    dim_t C_L1 = bli_env_get_var("BLIS_SVE_C_L1", C_L1_SVE_DEFAULT); \
+    dim_t W_L2 = bli_env_get_var("BLIS_SVE_W_L2", W_L2_SVE_DEFAULT); \
+    dim_t N_L2 = bli_env_get_var("BLIS_SVE_N_L2", N_L2_SVE_DEFAULT); \
+    dim_t C_L2 = bli_env_get_var("BLIS_SVE_C_L2", C_L2_SVE_DEFAULT); \
+    dim_t W_L3 = bli_env_get_var("BLIS_SVE_W_L3", W_L3_SVE_DEFAULT); \
+    dim_t N_L3 = bli_env_get_var("BLIS_SVE_N_L3", N_L3_SVE_DEFAULT); \
+    dim_t C_L3 = bli_env_get_var("BLIS_SVE_C_L3", C_L3_SVE_DEFAULT); \
+\
+    dim_t vl_b = bli_vl_bits_armsve(); \
+    dim_t vl = vl_b / S_Data; \
+    dim_t m_r = 2 * vl; \
+    dim_t n_r = 10; \
+\
+    dim_t k_c = (dim_t)( floor((W_L1 - 1.0)/(1.0 + (double)n_r/m_r)) * N_L1 * C_L1 ) \
+        / (n_r * S_Data); \
+\
+    dim_t C_Ac = W_L2 - 1 - ceil( (2.0 * k_c * n_r * S_Data)/(C_L2 * N_L2) ); \
+    dim_t m_c = C_Ac * (N_L2 * C_L2)/(k_c * S_Data); \
+    m_c -= m_c % m_r; \
+\
+    dim_t C_Bc = W_L3 - 1 - ceil( (2.0 * k_c * m_c * S_Data)/(C_L3 * N_L3) ); \
+    dim_t n_c = C_Bc * (N_L3 * C_L3)/(k_c * S_Data); \
+    n_c -= n_c % n_r; \
+\
+    *m_r_ = m_r; \
+    *n_r_ = n_r; \
+    *k_c_ = k_c; \
+    *m_c_ = m_c; \
+    *n_c_ = n_c; \
+}
+
+EXPANDMAC_BLKSZ_ARMSVE( s, 4 )
+EXPANDMAC_BLKSZ_ARMSVE( d, 8 )
+
--- a/config/armsve/bli_armsve_config_utils.h
+++ b/config/armsve/bli_armsve_config_utils.h
@@ -0,0 +1,42 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2019, Forschunszentrum Juelich
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "blis.h"
+
+dim_t bli_vl_bits_armsve(void);
+
+void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
+void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
+
--- a/config/armsve/bli_cntx_init_armsve.c
+++ b/config/armsve/bli_cntx_init_armsve.c
@@ -0,0 +1,157 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "bli_armsve_config_utils.h"
+
+void bli_cntx_init_armsve( cntx_t* cntx )
+{
+	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+#if 0
+	blksz_t thresh[ BLIS_NUM_THRESH ];
+#endif
+
+	// Set default kernel blocksizes and functions.
+	bli_cntx_init_armsve_ref( cntx );
+
+	// -------------------------------------------------------------------------
+
+	// Block size.
+	dim_t m_r_s, n_r_s, k_c_s, m_c_s, n_c_s;
+	dim_t m_r_d, n_r_d, k_c_d, m_c_d, n_c_d;
+	bli_s_blksz_armsve(&m_r_s, &n_r_s, &k_c_s, &m_c_s, &n_c_s);
+	bli_d_blksz_armsve(&m_r_d, &n_r_d, &k_c_d, &m_c_d, &n_c_d);
+
+	// Update the context with optimized native gemm micro-kernels and
+	// their storage preferences.
+	bli_cntx_set_l3_nat_ukrs
+	(
+	  2,
+	  // These are vector-length agnostic kernels. Yet knowing mr is required at runtime.
+	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
+	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
+	  cntx
+	);
+
+	// Set VL-specific packing routines if applicable.
+	if (m_r_d==16)
+	  bli_cntx_set_packm_kers
+	  (
+		3,
+		BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
+		BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
+		BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
+		cntx
+	  );
+	else if (m_r_d==8)
+	  bli_cntx_set_packm_kers
+	  (
+		1,
+		BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_asm_8xk,
+		cntx
+	  );
+
+	// Initialize level-3 blocksize objects with architecture-specific values.
+	//                                           s      d      c      z
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d,    -1,    -1 );
+
+	// Update the context with the current architecture's register and cache
+	// blocksizes (and multiples) for native execution.
+	bli_cntx_set_blkszs
+	(
+	  BLIS_NAT, 5,
+	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
+	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
+	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
+	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
+	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+	  cntx
+	);
+
+#if 0
+	// Initialize sup thresholds with architecture-appropriate values.
+	//                                          s     d     c     z
+	bli_blksz_init_easy( &thresh[ BLIS_MT ],   -1,  101,   -1,   -1 );
+	bli_blksz_init_easy( &thresh[ BLIS_NT ],   -1,  101,   -1,   -1 );
+	bli_blksz_init_easy( &thresh[ BLIS_KT ],   -1,  101,   -1,   -1 );
+
+	// Initialize the context with the sup thresholds.
+	bli_cntx_set_l3_sup_thresh
+	(
+	  3,
+	  BLIS_MT, &thresh[ BLIS_MT ],
+	  BLIS_NT, &thresh[ BLIS_NT ],
+	  BLIS_KT, &thresh[ BLIS_KT ],
+	  cntx
+	);
+
+	// Update the context with optimized small/unpacked gemm kernels.
+	bli_cntx_set_l3_sup_kers
+	(
+	  4,
+	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
+	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
+	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
+	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
+	  cntx
+	);
+
+	// Initialize level-3 sup blocksize objects with architecture-specific
+	// values.
+	//                                           s      d      c      z
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1, n_r_d,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1, m_r_d,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,   120,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,   256,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,  2048,    -1,    -1 );
+
+	// Update the context with the current architecture's register and cache
+	// blocksizes for small/unpacked level-3 problems.
+	bli_cntx_set_l3_sup_blkszs
+	(
+	  5,
+	  BLIS_NC, &blkszs[ BLIS_NC ],
+	  BLIS_KC, &blkszs[ BLIS_KC ],
+	  BLIS_MC, &blkszs[ BLIS_MC ],
+	  BLIS_NR, &blkszs[ BLIS_NR ],
+	  BLIS_MR, &blkszs[ BLIS_MR ],
+	  cntx
+	);
+#endif
+}
+
--- a/config/armsve/bli_family_armsve.h
+++ b/config/armsve/bli_family_armsve.h
@@ -0,0 +1,56 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_FAMILY_H
+//#define BLIS_FAMILY_H
+
+
+// -- MEMORY ALLOCATION --------------------------------------------------------
+
+#define BLIS_SIMD_ALIGN_SIZE    256
+#define BLIS_SIMD_NUM_REGISTERS 32
+
+// SVE-specific configs.
+#define N_L1_SVE_DEFAULT 64
+#define W_L1_SVE_DEFAULT 4
+#define C_L1_SVE_DEFAULT 256
+#define N_L2_SVE_DEFAULT 2048
+#define W_L2_SVE_DEFAULT 16
+#define C_L2_SVE_DEFAULT 256
+#define N_L3_SVE_DEFAULT 8192
+#define W_L3_SVE_DEFAULT 16
+#define C_L3_SVE_DEFAULT 256
+
+//#endif
+
--- a/config/armsve/make_defs.mk
+++ b/config/armsve/make_defs.mk
@@ -0,0 +1,82 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG    := armsve
+#CONFIGS_INCL   += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS    := -D_GNU_SOURCE
+CMISCFLAGS     :=
+CPICFLAGS      :=
+CWARNFLAGS     :=
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O3 -ftree-vectorize -march=armv8-a+sve
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS     := $(COPTFLAGS)
+CKVECFLAGS     :=
+
+# Flags specific to reference kernels.
+CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
--- a/2
+++ b/2
@@ -32,6 +32,8 @@ piledriver:  piledriver
 bulldozer:   bulldozer

 # ARM architectures.
+armsve:      armsve/armsve
+a64fx:       a64fx/armsve
 thunderx2:   thunderx2/armv8a
 cortexa57:   cortexa57/armv8a
 cortexa53:   cortexa53/armv8a
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -173,6 +173,12 @@ void bli_arch_set_id( void )
 		#endif

 		// ARM microarchitectures.
+		#ifdef BLIS_FAMILY_ARMSVE
+		id = BLIS_ARCH_ARMSVE;
+		#endif
+		#ifdef BLIS_FAMILY_A64FX
+		id = BLIS_ARCH_A64FX;
+		#endif
 		#ifdef BLIS_FAMILY_THUNDERX2
 		id = BLIS_ARCH_THUNDERX2;
 		#endif
@@ -242,6 +248,8 @@ static char* config_name[ BLIS_NUM_ARCHS ] =
    "thunderx2",
    "cortexa57",
    "cortexa53",
+    "armsve",
+    "a64fx",
    "cortexa15",
    "cortexa9",

--- a/frame/base/bli_cpuid.c
+++ b/frame/base/bli_cpuid.c
@@ -76,7 +76,7 @@ arch_t bli_cpuid_query_id( void )
 	printf( "vendor   = %s\n", vendor==1 ? "AMD": "INTEL" );
 	printf("family    = %x\n", family );
 	printf( "model    = %x\n", model );
-	
+
 	printf( "features = %x\n", features );
 #endif

@@ -455,6 +455,14 @@ arch_t bli_cpuid_query_id( void )
 		{
 			// Check for each ARMv8 configuration that is enabled, check for that
 			// microarchitecture. We check from most recent to most dated.
+#ifdef BLIS_CONFIG_ARMSVE
+			if ( bli_cpuid_is_armsve( model, part, features ) )
+				return BLIS_ARCH_ARMSVE;
+#endif
+#ifdef BLIS_CONFIG_A64FX
+			if ( bli_cpuid_is_a64fx( model, part, features ) )
+				return BLIS_ARCH_A64FX;
+#endif
 #ifdef BLIS_CONFIG_THUNDERX2
 			if ( bli_cpuid_is_thunderx2( model, part, features ) )
 				return BLIS_ARCH_THUNDERX2;
@@ -537,6 +545,36 @@ bool bli_cpuid_is_cortexa53
 	return TRUE;
 }

+bool bli_cpuid_is_armsve
+     (
+       uint32_t family,
+       uint32_t model,
+       uint32_t features
+     )
+{
+	// Check for expected CPU features.
+	const uint32_t expected = FEATURE_SVE;
+
+	if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
+
+	return TRUE;
+}
+
+bool bli_cpuid_is_a64fx
+     (
+       uint32_t family,
+       uint32_t model,
+       uint32_t features
+     )
+{
+	// Check for expected CPU features.
+	const uint32_t expected = FEATURE_SVE;
+
+	if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
+
+	return TRUE;
+}
+
 bool bli_cpuid_is_cortexa15
     (
       uint32_t family,
@@ -1032,6 +1070,10 @@ uint32_t bli_cpuid_query
 	     strstr( feat_str, "asimd" ) != NULL )
 		*features |= FEATURE_NEON;

+	// Parse the feature string to check for SVE features.
+	if ( strstr( feat_str, "sve" ) != NULL )
+		*features |= FEATURE_SVE;
+
 	//printf( "bli_cpuid_query(): features var: %u\n", *features );

 	// Parse the processor string to uncover the model.
--- a/frame/base/bli_cpuid.h
+++ b/frame/base/bli_cpuid.h
@@ -72,6 +72,8 @@ bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features
 bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
 bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
 bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
+bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
+bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
 bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
 bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );

@@ -175,7 +177,8 @@ enum
 };
 enum
 {
-	FEATURE_NEON = 0x1
+	FEATURE_NEON = 0x01,
+	FEATURE_SVE  = 0x02
 };

 #endif
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -144,6 +144,16 @@ void bli_gks_init( void )
 		                                              bli_cntx_init_cortexa53_ref,
 		                                              bli_cntx_init_cortexa53_ind );
 #endif
+#ifdef BLIS_CONFIG_ARMSVE
+		bli_gks_register_cntx( BLIS_ARCH_ARMSVE,      bli_cntx_init_armsve,
+		                                              bli_cntx_init_armsve_ref,
+		                                              bli_cntx_init_armsve_ind );
+#endif
+#ifdef BLIS_CONFIG_A64FX
+		bli_gks_register_cntx( BLIS_ARCH_A64FX,       bli_cntx_init_a64fx,
+		                                              bli_cntx_init_a64fx_ref,
+		                                              bli_cntx_init_a64fx_ind );
+#endif
 #ifdef BLIS_CONFIG_CORTEXA15
 		bli_gks_register_cntx( BLIS_ARCH_CORTEXA15,   bli_cntx_init_cortexa15,
 		                                              bli_cntx_init_cortexa15_ref,
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -83,6 +83,12 @@ CNTX_INIT_PROTS( bulldozer )

 // -- ARM architectures --

+#ifdef BLIS_CONFIG_ARMSVE
+CNTX_INIT_PROTS( armsve )
+#endif
+#ifdef BLIS_CONFIG_A64FX
+CNTX_INIT_PROTS( a64fx )
+#endif
 #ifdef BLIS_CONFIG_THUNDERX2
 CNTX_INIT_PROTS( thunderx2 )
 #endif
@@ -183,6 +189,12 @@ CNTX_INIT_PROTS( generic )

 // -- ARM architectures --

+#ifdef BLIS_FAMILY_ARMSVE
+#include "bli_family_armsve.h"
+#endif
+#ifdef BLIS_FAMILY_A64FX
+#include "bli_family_a64fx.h"
+#endif
 #ifdef BLIS_FAMILY_THUNDERX2
 #include "bli_family_thunderx2.h"
 #endif
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -1005,6 +1005,8 @@ typedef enum
 	BLIS_ARCH_BULLDOZER,

 	// ARM
+	BLIS_ARCH_ARMSVE,
+	BLIS_ARCH_A64FX,
 	BLIS_ARCH_THUNDERX2,
 	BLIS_ARCH_CORTEXA57,
 	BLIS_ARCH_CORTEXA53,
@@ -1029,7 +1031,7 @@ typedef enum

 // NOTE: This value must be updated to reflect the number of enum values
 // listed above for arch_t!
-#define BLIS_NUM_ARCHS 22
+//#define BLIS_NUM_ARCHS 25


 //
--- a/kernels/armsve/1m/armsve512_asm_transpose_d8x2.h
+++ b/kernels/armsve/1m/armsve512_asm_transpose_d8x2.h
@@ -0,0 +1,45 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#define SVE512_IN_REG_TRANSPOSE_d8x2(DST0,DST1,DST2,DST3,DST4,DST5,DST6SRC0,DST7SRC1,PT,P2C,P4C,P6C) \
+  "trn1    " #DST0".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \
+  "trn2    " #DST1".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \
+  "compact " #DST2".d, " #P2C", " #DST0".d \n\t" \
+  "compact " #DST3".d, " #P2C", " #DST1".d \n\t" \
+  "compact " #DST4".d, " #P4C", " #DST0".d \n\t" \
+  "compact " #DST5".d, " #P4C", " #DST1".d \n\t" \
+  "compact " #DST6SRC0".d, " #P6C", " #DST0".d \n\t" \
+  "compact " #DST7SRC1".d, " #P6C", " #DST1".d \n\t"
+
--- a/kernels/armsve/1m/armsve512_asm_transpose_d8x8.h
+++ b/kernels/armsve/1m/armsve512_asm_transpose_d8x8.h
@@ -0,0 +1,97 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#define SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(XTMP,PT,P2C,P4C,P6C,PTFTF,P4,P6) \
+  "ptrue   " #PT".d \n\t" \
+  "mov     " #XTMP", #2 \n\t" \
+  "whilelo " #P2C".d, xzr, " #XTMP" \n\t" \
+  "mov     " #XTMP", #4 \n\t" \
+  "whilelo " #P4".d, xzr, " #XTMP" \n\t" \
+  "mov     " #XTMP", #6 \n\t" \
+  "whilelo " #P6".d, xzr, " #XTMP" \n\t" \
+ \
+  "eor     " #PTFTF".b, " #PT"/z, " #P6".b, " #P4".b \n\t" /***** o o | o */ \
+  "orr     " #PTFTF".b, " #PT"/z, " #PTFTF".b, " #P2C".b \n\t" /* | o | o */ \
+ \
+  "not     " #P2C".b, " #PT"/z, " #P2C".b \n\t" \
+  "not     " #P4C".b, " #PT"/z, " #P4".b \n\t" \
+  "not     " #P6C".b, " #PT"/z, " #P6".b \n\t" \
+
+#define SVE512_IN_REG_TRANSPOSE_d8x8(DST0,DST1,DST2,DST3,DST4,DST5,DST6,DST7,SRC0,SRC1,SRC2,SRC3,SRC4,SRC5,SRC6,SRC7,PT,P2C,P4C,P6C,PTFTF,P4,P6) \
+  "trn1    " #DST0".d, " #SRC0".d, " #SRC1".d \n\t" \
+  "trn2    " #DST1".d, " #SRC0".d, " #SRC1".d \n\t" \
+  "trn1    " #DST2".d, " #SRC2".d, " #SRC3".d \n\t" \
+  "trn2    " #DST3".d, " #SRC2".d, " #SRC3".d \n\t" \
+  "trn1    " #DST4".d, " #SRC4".d, " #SRC5".d \n\t" \
+  "trn2    " #DST5".d, " #SRC4".d, " #SRC5".d \n\t" \
+  "trn1    " #DST6".d, " #SRC6".d, " #SRC7".d \n\t" \
+  "trn2    " #DST7".d, " #SRC6".d, " #SRC7".d \n\t" \
+ \
+  "compact " #SRC0".d, " #P2C", " #DST0".d \n\t" \
+  "compact " #SRC2".d, " #P2C", " #DST1".d \n\t" \
+  "ext     " #SRC1".b, " #SRC1".b, " #DST2".b, #48 \n\t" \
+  "ext     " #SRC3".b, " #SRC3".b, " #DST3".b, #48 \n\t" \
+  "compact " #SRC4".d, " #P2C", " #DST4".d \n\t" \
+  "compact " #SRC6".d, " #P2C", " #DST5".d \n\t" \
+  "ext     " #SRC5".b, " #SRC5".b, " #DST6".b, #48 \n\t" \
+  "ext     " #SRC7".b, " #SRC7".b, " #DST7".b, #48 \n\t" \
+ \
+  "sel     " #DST0".d, " #PTFTF", " #DST0".d, " #SRC1".d \n\t" \
+  "sel     " #DST2".d, " #PTFTF", " #SRC0".d, " #DST2".d \n\t" \
+  "sel     " #DST1".d, " #PTFTF", " #DST1".d, " #SRC3".d \n\t" \
+  "sel     " #DST3".d, " #PTFTF", " #SRC2".d, " #DST3".d \n\t" \
+  "sel     " #DST4".d, " #PTFTF", " #DST4".d, " #SRC5".d \n\t" \
+  "sel     " #DST6".d, " #PTFTF", " #SRC4".d, " #DST6".d \n\t" \
+  "sel     " #DST5".d, " #PTFTF", " #DST5".d, " #SRC7".d \n\t" \
+  "sel     " #DST7".d, " #PTFTF", " #SRC6".d, " #DST7".d \n\t" \
+ \
+  "compact " #SRC0".d, " #P4C", " #DST0".d \n\t" \
+  "compact " #SRC1".d, " #P4C", " #DST1".d \n\t" \
+  "compact " #SRC2".d, " #P4C", " #DST2".d \n\t" \
+  "compact " #SRC3".d, " #P4C", " #DST3".d \n\t" \
+  "ext     " #SRC4".b, " #SRC4".b, " #DST4".b, #32 \n\t" \
+  "ext     " #SRC5".b, " #SRC5".b, " #DST5".b, #32 \n\t" \
+  "ext     " #SRC6".b, " #SRC6".b, " #DST6".b, #32 \n\t" \
+  "ext     " #SRC7".b, " #SRC7".b, " #DST7".b, #32 \n\t" \
+ \
+  "sel     " #DST0".d, " #P4", " #DST0".d, " #SRC4".d \n\t" \
+  "sel     " #DST1".d, " #P4", " #DST1".d, " #SRC5".d \n\t" \
+  "sel     " #DST2".d, " #P4", " #DST2".d, " #SRC6".d \n\t" \
+  "sel     " #DST3".d, " #P4", " #DST3".d, " #SRC7".d \n\t" \
+  "sel     " #DST4".d, " #P4", " #SRC0".d, " #DST4".d \n\t" \
+  "sel     " #DST5".d, " #P4", " #SRC1".d, " #DST5".d \n\t" \
+  "sel     " #DST6".d, " #P4", " #SRC2".d, " #DST6".d \n\t" \
+  "sel     " #DST7".d, " #P4", " #SRC3".d, " #DST7".d \n\t"
+
--- a/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c
@@ -52,15 +52,12 @@ void bli_dpackm_armsve256_asm_8xk
       dim_t            cdim_,
       dim_t            n_,
       dim_t            n_max_,
-       void*   restrict kappa_,
-       void*   restrict a_, inc_t inca_, inc_t lda_,
-       void*   restrict p_,              inc_t ldp_,
+       double* restrict kappa,
+       double* restrict a, inc_t inca_, inc_t lda_,
+       double* restrict p,              inc_t ldp_,
       cntx_t* restrict cntx
     )
 {
-    double*       a     = ( double* )a_;
-    double*       p     = ( double* )p_;
-    double*       kappa = ( double* )kappa_;
    const int64_t cdim  = cdim_;
    const int64_t mnr   = 8;
    const int64_t n     = n_;
--- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
@@ -0,0 +1,365 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "armsve512_asm_transpose_d8x8.h"
+#include "armsve512_asm_transpose_d8x2.h"
+
+// assumption:
+//   SVE vector length = 512 bits.
+
+void bli_dpackm_armsve512_asm_10xk
+     (
+       conj_t           conja,
+       pack_t           schema,
+       dim_t            cdim_,
+       dim_t            n_,
+       dim_t            n_max_,
+       double* restrict kappa,
+       double* restrict a, inc_t inca_, inc_t lda_,
+       double* restrict p,              inc_t ldp_,
+       cntx_t* restrict cntx
+     )
+{
+    const int64_t cdim  = cdim_;
+    const int64_t mnr   = 10;
+    const int64_t n     = n_;
+    const int64_t n_max = n_max_;
+    const int64_t inca  = inca_;
+    const int64_t lda   = lda_;
+    const int64_t ldp   = ldp_;
+    const bool    gs    = inca != 1 && lda != 1;
+    const bool    unitk = bli_deq1( *kappa );
+
+#ifdef _A64FX
+    if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
+    {
+        // A twisted way to infer whether A or B is being packed.
+        if ( schema == bli_cntx_schema_a_block(cntx) )
+            p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
+        if ( schema == bli_cntx_schema_b_panel(cntx) )
+            p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
+    }
+#endif
+
+    if ( cdim == mnr && !gs && unitk )
+    {
+        uint64_t n_mker = n / 8;
+        uint64_t n_left = n % 8;
+        __asm__ volatile (
+            "mov  x0, %[a] \n\t"
+            "mov  x1, %[p] \n\t"
+            "mov  x2, %[ldp] \n\t"
+            "mov  x3, %[lda] \n\t"
+            "mov  x4, %[inca] \n\t"
+            "cmp  x4, #1 \n\t"
+            // Skips by sizeof(double).
+            "mov  x8, #8 \n\t"
+            "madd x2, x2, x8, xzr \n\t"
+            "madd x3, x3, x8, xzr \n\t"
+            "madd x4, x4, x8, xzr \n\t"
+            // Loop constants.
+            "mov  x8, %[n_mker] \n\t"
+            "mov  x9, %[n_left] \n\t"
+            "ptrue p0.d \n\t"
+            "b.ne .AROWSTOR \n\t"
+            // A stored in columns.
+            " .ACOLSTOR: \n\t"
+            // Prefetch distance.
+            "mov  x17, #8 \n\t"
+            "madd x17, x17, x3, xzr \n\t"
+#ifdef _A64FX
+            // Disable hardware prefetch for A.
+            "mov  x16, 0x6 \n\t"
+            "lsl  x16, x16, #60 \n\t"
+            "orr  x0, x0, x16 \n\t"
+#endif
+            " .ACOLSTORMKER: \n\t"
+            "cmp  x8, xzr \n\t"
+            "b.eq .ACOLSTORMKEREND \n\t"
+            "add  x5, x0, x3 \n\t"
+            "add  x6, x5, x3 \n\t"
+            "add  x7, x6, x3 \n\t"
+            "ld1d z0.d, p0/z, [x0] \n\t"
+            "ldr  q1, [x0, #64] \n\t"
+            "ld1d z2.d, p0/z, [x5] \n\t"
+            "ldr  q3, [x5, #64] \n\t"
+            "ld1d z4.d, p0/z, [x6] \n\t"
+            "ldr  q5, [x6, #64] \n\t"
+            "ld1d z6.d, p0/z, [x7] \n\t"
+            "ldr  q7, [x7, #64] \n\t"
+            "add  x18, x17, x0 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x5 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x6 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x7 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x0, x7, x3 \n\t"
+            "add  x5, x0, x3 \n\t"
+            "add  x6, x5, x3 \n\t"
+            "add  x7, x6, x3 \n\t"
+            "ld1d z8.d, p0/z, [x0] \n\t"
+            "ldr  q9, [x0, #64] \n\t"
+            "ld1d z10.d, p0/z, [x5] \n\t"
+            "ldr  q11, [x5, #64] \n\t"
+            "ld1d z12.d, p0/z, [x6] \n\t"
+            "ldr  q13, [x6, #64] \n\t"
+            "ld1d z14.d, p0/z, [x7] \n\t"
+            "ldr  q15, [x7, #64] \n\t"
+            "add  x18, x17, x0 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x5 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x6 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x7 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            // Plain storage
+            "add  x10, x1, x2 \n\t"
+            "add  x11, x10, x2 \n\t"
+            "add  x12, x11, x2 \n\t"
+            "add  x13, x12, x2 \n\t"
+            "add  x14, x13, x2 \n\t"
+            "add  x15, x14, x2 \n\t"
+            "add  x16, x15, x2 \n\t"
+            "st1d z0.d, p0, [x1] \n\t"
+            "str  q1, [x1, #64] \n\t"
+            "st1d z2.d, p0, [x10] \n\t"
+            "str  q3, [x10, #64] \n\t"
+            "st1d z4.d, p0, [x11] \n\t"
+            "str  q5, [x11, #64] \n\t"
+            "st1d z6.d, p0, [x12] \n\t"
+            "str  q7, [x12, #64] \n\t"
+            "st1d z8.d, p0, [x13] \n\t"
+            "str  q9, [x13, #64] \n\t"
+            "st1d z10.d, p0, [x14] \n\t"
+            "str  q11, [x14, #64] \n\t"
+            "st1d z12.d, p0, [x15] \n\t"
+            "str  q13, [x15, #64] \n\t"
+            "st1d z14.d, p0, [x16] \n\t"
+            "str  q15, [x16, #64] \n\t"
+            "add  x1, x16, x2 \n\t"
+            // Realign and store.
+            // "ext  z1.b, z1.b, z1.b, #16 \n\t"
+            // "ext  z1.b, z1.b, z2.b, #48 \n\t"
+            // "ext  z2.b, z2.b, z3.b, #16 \n\t"
+            // "ext  z2.b, z2.b, z4.b, #32 \n\t"
+            // "ext  z4.b, z4.b, z5.b, #16 \n\t"
+            // "ext  z4.b, z4.b, z6.b, #16 \n\t"
+            // "ext  z6.b, z6.b, z7.b, #16 \n\t"
+            // "ext  z9.b, z9.b, z9.b, #16 \n\t"
+            // "ext  z9.b, z9.b, z10.b, #48 \n\t"
+            // "ext  z10.b, z10.b, z11.b, #16 \n\t"
+            // "ext  z10.b, z10.b, z12.b, #32 \n\t"
+            // "ext  z12.b, z12.b, z13.b, #16 \n\t"
+            // "ext  z12.b, z12.b, z14.b, #16 \n\t"
+            // "ext  z14.b, z14.b, z15.b, #16 \n\t"
+            // "st1d z0.d, p0, [x1] \n\t"
+            // "st1d z1.d, p0, [x1, #1, mul vl] \n\t"
+            // "st1d z2.d, p0, [x1, #2, mul vl] \n\t"
+            // "st1d z4.d, p0, [x1, #3, mul vl] \n\t"
+            // "st1d z6.d, p0, [x1, #4, mul vl] \n\t"
+            // "add  x1, x1, #320 \n\t"
+            // "st1d z8.d, p0, [x1] \n\t"
+            // "st1d z9.d, p0, [x1, #1, mul vl] \n\t"
+            // "st1d z10.d, p0, [x1, #2, mul vl] \n\t"
+            // "st1d z12.d, p0, [x1, #3, mul vl] \n\t"
+            // "st1d z14.d, p0, [x1, #4, mul vl] \n\t"
+            // "add  x1, x1, #320 \n\t"
+            "add  x0, x7, x3 \n\t"
+            "sub  x8, x8, #1 \n\t"
+            "b    .ACOLSTORMKER \n\t"
+            " .ACOLSTORMKEREND: \n\t"
+            " .ACOLSTORLEFT: \n\t"
+            "cmp  x9, xzr \n\t"
+            "b.eq .UNITKDONE \n\t"
+            "ld1d z0.d, p0/z, [x0] \n\t"
+            "ldr  q1, [x0, #64] \n\t"
+            "st1d z0.d, p0, [x1] \n\t"
+            "str  q1, [x1, #64] \n\t"
+            "add  x0, x0, x3 \n\t"
+            "add  x1, x1, x2 \n\t"
+            "sub  x9, x9, #1 \n\t"
+            "b    .ACOLSTORLEFT \n\t"
+            // A stored in rows.
+            " .AROWSTOR: \n\t"
+            // Prepare predicates for in-reg transpose.
+            SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
+            " .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
+            "cmp  x8, xzr \n\t"
+            "b.eq .AROWSTORMKEREND \n\t"
+            "add  x10, x0, x4 \n\t"
+            "add  x11, x10, x4 \n\t"
+            "add  x12, x11, x4 \n\t"
+            "add  x13, x12, x4 \n\t"
+            "add  x14, x13, x4 \n\t"
+            "add  x15, x14, x4 \n\t"
+            "add  x16, x15, x4 \n\t"
+            "add  x17, x16, x4 \n\t"
+            "add  x18, x17, x4 \n\t"
+            "ld1d z0.d, p0/z, [x0] \n\t"
+            "ld1d z1.d, p0/z, [x10] \n\t"
+            "ld1d z2.d, p0/z, [x11] \n\t"
+            "ld1d z3.d, p0/z, [x12] \n\t"
+            "ld1d z4.d, p0/z, [x13] \n\t"
+            "ld1d z5.d, p0/z, [x14] \n\t"
+            "ld1d z6.d, p0/z, [x15] \n\t"
+            "ld1d z7.d, p0/z, [x16] \n\t"
+            "ld1d z22.d, p0/z, [x17] \n\t"
+            "ld1d z23.d, p0/z, [x18] \n\t"
+            // Transpose first 8 rows.
+            SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
+            // Transpose last 2 rows.
+            SVE512_IN_REG_TRANSPOSE_d8x2(z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3)
+            // Plain storage.
+            "add  x10, x1, x2 \n\t"
+            "add  x11, x10, x2 \n\t"
+            "add  x12, x11, x2 \n\t"
+            "add  x13, x12, x2 \n\t"
+            "add  x14, x13, x2 \n\t"
+            "add  x15, x14, x2 \n\t"
+            "add  x16, x15, x2 \n\t"
+            "st1d z8.d, p0, [x1] \n\t"
+            "str  q16, [x1, #64] \n\t"
+            "st1d z9.d, p0, [x10] \n\t"
+            "str  q17, [x10, #64] \n\t"
+            "st1d z10.d, p0, [x11] \n\t"
+            "str  q18, [x11, #64] \n\t"
+            "st1d z11.d, p0, [x12] \n\t"
+            "str  q19, [x12, #64] \n\t"
+            "st1d z12.d, p0, [x13] \n\t"
+            "str  q20, [x13, #64] \n\t"
+            "st1d z13.d, p0, [x14] \n\t"
+            "str  q21, [x14, #64] \n\t"
+            "st1d z14.d, p0, [x15] \n\t"
+            "str  q22, [x15, #64] \n\t"
+            "st1d z15.d, p0, [x16] \n\t"
+            "str  q23, [x16, #64] \n\t"
+            "add  x1, x16, x2 \n\t"
+            "add  x0, x0, #64 \n\t"
+            "sub  x8, x8, #1 \n\t"
+            "b    .AROWSTORMKER \n\t"
+            " .AROWSTORMKEREND: \n\t"
+            "mov  x4, %[inca] \n\t" // Restore unshifted inca.
+            "index z30.d, xzr, x4 \n\t" // Generate index.
+            "lsl  x4, x4, #3 \n\t" // Shift again.
+            "lsl  x5, x4, #3 \n\t" // Virtual column vl.
+            " .AROWSTORLEFT: \n\t"
+            "cmp  x9, xzr \n\t"
+            "b.eq .UNITKDONE \n\t"
+            "add  x6, x0, x5 \n\t"
+            "add  x7, x6, x4 \n\t"
+            "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
+            "ldr  d1, [x6] \n\t"
+            "ldr  d2, [x7] \n\t"
+            "trn1 v1.2d, v1.2d, v2.2d \n\t"
+            "st1d z0.d, p0, [x1] \n\t"
+            "str  q1, [x1, #64] \n\t"
+            "add  x1, x1, x2 \n\t"
+            "add  x0, x0, #8 \n\t"
+            "sub  x9, x9, #1 \n\t"
+            "b    .AROWSTORLEFT \n\t"
+            " .UNITKDONE: \n\t"
+            "mov  x0, #0 \n\t"
+            :
+            : [a]      "r" (a),
+              [p]      "r" (p),
+              [lda]    "r" (lda),
+              [ldp]    "r" (ldp),
+              [inca]   "r" (inca),
+              [n_mker] "r" (n_mker),
+              [n_left] "r" (n_left)
+            : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+              "x8", "x9", "x10","x11","x12","x13","x14","x15",
+              "x16","x17","x18",
+              "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
+              "z8", "z9", "z10","z11","z12","z13","z14","z15",
+              "z16","z17","z18","z19","z20","z21","z22","z23",
+              // "z24","z25","z26","z27","z28","z29",
+              "z30","z31",
+              "p0", "p1", "p2", "p3", "p4", // "p5",
+              "p6", "p7", "p8"
+            );
+    }
+    else // if ( cdim < mnr )
+    {
+        bli_dscal2m_ex
+        (
+          0,
+          BLIS_NONUNIT_DIAG,
+          BLIS_DENSE,
+          ( trans_t )conja,
+          cdim,
+          n,
+          kappa,
+          a, inca, lda,
+          p, 1,    ldp,
+          cntx,
+          NULL
+        );
+
+        // if ( cdim < mnr )
+        {
+            const dim_t      i      = cdim;
+            const dim_t      m_edge = mnr - i;
+            const dim_t      n_edge = n_max;
+            double* restrict p_edge = p + (i  )*1;
+
+            bli_dset0s_mxn
+            (
+              m_edge,
+              n_edge,
+              p_edge, 1, ldp
+            );
+        }
+    }
+
+    if ( n < n_max )
+    {
+        const dim_t      j      = n;
+        const dim_t      m_edge = mnr;
+        const dim_t      n_edge = n_max - j;
+        double* restrict p_edge = p + (j  )*ldp;
+
+        bli_dset0s_mxn
+        (
+          m_edge,
+          n_edge,
+          p_edge, 1, ldp
+        );
+    }
+}
--- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_12xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_12xk.c
@@ -0,0 +1,359 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Linaro Limited
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <stdio.h>
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#else
+#error "No Arm SVE intrinsics support in compiler"
+#endif // __ARM_FEATURE_SVE
+
+// assumption:
+//   SVE vector length = 512 bits.
+// TODO:
+//   2-rows -> 3 vectors packing and use predicator only in odd num of rows to be packed.
+//   prefetching is needed.
+
+void bli_dpackm_armsve512_asm_12xk
+     (
+       conj_t           conja,
+       pack_t           schema,
+       dim_t            cdim_,
+       dim_t            n_,
+       dim_t            n_max_,
+       double* restrict kappa,
+       double* restrict a, inc_t inca_, inc_t lda_,
+       double* restrict p,              inc_t ldp_,
+       cntx_t* restrict cntx
+     )
+{
+    const int64_t cdim  = cdim_;
+    const int64_t mnr   = 12;
+    const int64_t n     = n_;
+    const int64_t n_max = n_max_;
+    const int64_t inca  = inca_;
+    const int64_t lda   = lda_;
+    const int64_t ldp   = ldp_;
+
+    double* restrict alpha1     = a;
+    double* restrict alpha1_8   = alpha1 + 8 * inca;
+    double* restrict alpha1_p4  = alpha1 + 4 * inca;
+    double* restrict alpha1_m4  = alpha1 - 4 * inca;
+    double* restrict pi1        = p;
+    const   svbool_t all_active = svptrue_b64();
+    const   svbool_t first_half_active = svwhilelt_b64(0, 4);
+    const   svbool_t last_half_active  = svnot_z(all_active, first_half_active);
+    svfloat64_t      z_a0;
+    svfloat64_t      z_a8;
+    svfloat64_t      z_a8_lh;
+    svfloat64_t      z_a16;
+    svuint64_t       z_index;
+
+    // creating index for gather/scatter
+    //   with each element as: 0, 1*inca, 2*inca, 3*inca
+    z_index = svindex_u64( 0, inca * sizeof( double ) );
+
+    if ( cdim == mnr )
+    {
+        if ( bli_deq1( *kappa ) )
+        {
+            if ( inca == 1 )  // continous memory. packA style
+            {
+                dim_t k = n;
+                // 2 pack into 3 case.
+                if ( ldp == mnr )
+                {
+                    for ( ; k > 1; k -= 2 )
+                    {
+                        // load 12 continuous elments from *a
+                        z_a0 = svld1_f64( all_active, alpha1 );
+                        z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
+
+                        // forward address - 0 to 1
+                        alpha1   += lda;
+                        alpha1_p4 = alpha1 + 4 * inca;
+                        alpha1_m4 = alpha1 - 4 * inca;
+
+                        // load 12 continuous elments from *a, filling last half of z8.
+                        z_a8_lh = svld1_f64( last_half_active, alpha1_m4 );
+                        z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
+                        z_a16 = svld1_f64( all_active, alpha1_p4 );
+
+                        // stored packed data into *p
+                        svst1_f64( all_active, pi1, z_a0 );
+                        svst1_vnum_f64( all_active, pi1, 1, z_a8 );
+                        svst1_vnum_f64( all_active, pi1, 2, z_a16 );
+
+                        // forward address - 1 to 0
+                        alpha1   += lda;
+                        alpha1_8  = alpha1 + 8 * inca;
+                        pi1      += 2 * ldp;
+                    }
+                }
+                // line-by-line packing case.
+                for ( ; k != 0; --k )
+                {
+                    // load 12 continuous elments from *a
+                    z_a0 = svld1_f64( all_active, alpha1 );
+                    z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
+
+                    // store them into *p
+                    svst1_f64( all_active, pi1, z_a0 );
+                    svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
+
+                    alpha1   += lda;
+                    alpha1_8  = alpha1 + 8 * inca;
+                    pi1      += ldp;
+                }
+            }
+            else  // gather/scatter load/store. packB style
+            {
+                dim_t k = n;
+                if ( ldp == mnr )
+                {
+                    for ( ; k > 1; k -= 2 )
+                    {
+                        // gather load from *a
+                        z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
+                        z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
+
+                        // forward address - 0 to 1
+                        alpha1   += lda;
+                        alpha1_p4 = alpha1 + 4 * inca;
+                        alpha1_m4 = alpha1 - 4 * inca;
+
+                        // gather load from *a, filling last half of z8.
+                        z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index );
+                        z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
+                        z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index );
+
+                        // stored packed data into *p
+                        svst1_f64( all_active, pi1, z_a0 );
+                        svst1_vnum_f64( all_active, pi1, 1, z_a8 );
+                        svst1_vnum_f64( all_active, pi1, 2, z_a16 );
+
+                        // forward address - 1 to 0
+                        alpha1   += lda;
+                        alpha1_8  = alpha1 + 8 * inca;
+                        pi1      += 2 * ldp;
+                    }
+                }
+                for ( ; k != 0; --k )
+                {
+                    // gather load from *a
+                    z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
+                    z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
+
+                    // scatter store into *p
+                    svst1_f64( all_active, pi1, z_a0 );
+                    svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
+
+                    alpha1   += lda;
+                    alpha1_8  = alpha1 + 8 * inca;
+                    pi1      += ldp;
+                }
+            }
+        }
+        else  // *kappa != 1.0
+        {
+            // load kappa into vector
+            svfloat64_t z_kappa;
+
+            z_kappa = svdup_f64( *kappa );
+
+            if ( inca == 1 )  // continous memory. packA style
+            {
+                dim_t k = n;
+                if ( ldp == mnr )
+                {
+                    for ( ; k > 1; k -= 2 )
+                    {
+                        // load 12 continuous elments from *a
+                        z_a0 = svld1_f64( all_active, alpha1 );
+                        z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
+
+                        // forward address - 0 to 1
+                        alpha1   += lda;
+                        alpha1_p4 = alpha1 + 4 * inca;
+                        alpha1_m4 = alpha1 - 4 * inca;
+
+                        // load 12 continuous elments from *a, filling last half of z8.
+                        z_a8_lh = svld1_f64( last_half_active, alpha1_m4 );
+                        z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
+                        z_a16 = svld1_f64( all_active, alpha1_p4 );
+
+                        // multiply by *kappa
+                        z_a0  = svmul_lane_f64( z_a0, z_kappa, 0 );
+                        z_a8  = svmul_lane_f64( z_a8, z_kappa, 0 );
+                        z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 );
+
+                        // stored packed data into *p
+                        svst1_f64( all_active, pi1, z_a0 );
+                        svst1_vnum_f64( all_active, pi1, 1, z_a8 );
+                        svst1_vnum_f64( all_active, pi1, 2, z_a16 );
+
+                        // forward address - 1 to 0
+                        alpha1   += lda;
+                        alpha1_8  = alpha1 + 8 * inca;
+                        pi1      += 2 * ldp;
+                    }
+                }
+                for ( ; k != 0; --k )
+                {
+                    // load 12 continuous elments from *a
+                    z_a0 = svld1_f64( all_active, alpha1 );
+                    z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 );
+
+                    // multiply by *kappa
+                    z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
+                    z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
+
+                    // store them into *p
+                    svst1_f64( all_active, pi1, z_a0 );
+                    svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
+
+                    alpha1   += lda;
+                    alpha1_8  = alpha1 + 8 * inca;
+                    pi1      += ldp;
+                }
+            }
+            else  // gather/scatter load/store. packB style
+            {
+                dim_t k = n;
+                if ( ldp == mnr )
+                {
+                    for ( ; k > 1; k -= 2 )
+                    {
+                        // gather load from *a
+                        z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
+                        z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
+
+                        // forward address - 0 to 1
+                        alpha1   += lda;
+                        alpha1_p4 = alpha1 + 4 * inca;
+                        alpha1_m4 = alpha1 - 4 * inca;
+
+                        // gather load from *a, filling last half of z8.
+                        z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index );
+                        z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh );
+                        z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index );
+
+                        // multiply by *kappa
+                        z_a0  = svmul_lane_f64( z_a0, z_kappa, 0 );
+                        z_a8  = svmul_lane_f64( z_a8, z_kappa, 0 );
+                        z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 );
+
+                        // stored packed data into *p
+                        svst1_f64( all_active, pi1, z_a0 );
+                        svst1_vnum_f64( all_active, pi1, 1, z_a8 );
+                        svst1_vnum_f64( all_active, pi1, 2, z_a16 );
+
+                        // forward address - 1 to 0
+                        alpha1   += lda;
+                        alpha1_8  = alpha1 + 8 * inca;
+                        pi1      += 2 * ldp;
+                    }
+                }
+                for ( ; k != 0; --k )
+                {
+                    // gather load from *a
+                    z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index );
+                    z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index );
+
+                    // multiply by *kappa
+                    z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 );
+                    z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 );
+
+                    // scatter store into *p
+                    svst1_f64( all_active, pi1, z_a0 );
+                    svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );
+
+                    alpha1   += lda;
+                    alpha1_8  = alpha1 + 8 * inca;
+                    pi1      += ldp;
+                }
+            }
+        } // end of if ( *kappa == 1.0 )
+    }
+    else // if ( cdim < mnr )
+    {
+        bli_dscal2m_ex
+        (
+          0,
+          BLIS_NONUNIT_DIAG,
+          BLIS_DENSE,
+          ( trans_t )conja,
+          cdim,
+          n,
+          kappa,
+          a, inca, lda,
+          p, 1,    ldp,
+          cntx,
+          NULL
+        );
+
+        // if ( cdim < mnr )
+        {
+            const dim_t      i      = cdim;
+            const dim_t      m_edge = mnr - i;
+            const dim_t      n_edge = n_max;
+            double* restrict p_edge = p + (i  )*1;
+
+            bli_dset0s_mxn
+            (
+              m_edge,
+              n_edge,
+              p_edge, 1, ldp
+            );
+        }
+    }
+
+    if ( n < n_max )
+    {
+        const dim_t      j      = n;
+        const dim_t      m_edge = mnr;
+        const dim_t      n_edge = n_max - j;
+        double* restrict p_edge = p + (j  )*ldp;
+
+        bli_dset0s_mxn
+        (
+          m_edge,
+          n_edge,
+          p_edge, 1, ldp
+        );
+    }
+}
--- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
+++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
@@ -0,0 +1,363 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "armsve512_asm_transpose_d8x8.h"
+
+// assumption:
+//   SVE vector length = 512 bits.
+
+void bli_dpackm_armsve512_asm_16xk
+     (
+       conj_t           conja,
+       pack_t           schema,
+       dim_t            cdim_,
+       dim_t            n_,
+       dim_t            n_max_,
+       double* restrict kappa,
+       double* restrict a, inc_t inca_, inc_t lda_,
+       double* restrict p,              inc_t ldp_,
+       cntx_t* restrict cntx
+     )
+{
+    const int64_t cdim  = cdim_;
+    const int64_t mnr   = 16;
+    const int64_t n     = n_;
+    const int64_t n_max = n_max_;
+    const int64_t inca  = inca_;
+    const int64_t lda   = lda_;
+    const int64_t ldp   = ldp_;
+    const bool    gs    = inca != 1 && lda != 1;
+    const bool    unitk = bli_deq1( *kappa );
+
+#ifdef _A64FX
+    if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) )
+    {
+        // A twisted way to infer whether A or B is being packed.
+        if ( schema == bli_cntx_schema_a_block(cntx) )
+            p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p;
+        if ( schema == bli_cntx_schema_b_panel(cntx) )
+            p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p;
+    }
+#endif
+
+    if ( cdim == mnr && !gs && unitk )
+    {
+        uint64_t n_mker = n / 8;
+        uint64_t n_left = n % 8;
+        __asm__ volatile (
+            "mov  x0, %[a] \n\t"
+            "mov  x1, %[p] \n\t"
+            "mov  x2, %[ldp] \n\t"
+            "mov  x3, %[lda] \n\t"
+            "mov  x4, %[inca] \n\t"
+            "cmp  x4, #1 \n\t"
+            // Skips by sizeof(double).
+            "mov  x8, #8 \n\t"
+            "madd x2, x2, x8, xzr \n\t"
+            "madd x3, x3, x8, xzr \n\t"
+            "madd x4, x4, x8, xzr \n\t"
+
+            // "mov  x8, 0x8 \n\t" // Control#0 for A address.
+            // "mov  x8, 0x24 \n\t" // Higher 6bit for Control#0:
+            // "lsl  x8, x8, #58 \n\t" // Valid|Strong|Strong|Alloc|Load|Strong
+            // "orr  x8, x8, x3 \n\t" // Stride.
+            // "msr  S3_3_C11_C6_0, x8 \n\t" // Write system register.
+
+            // Loop constants.
+            "mov  x8, %[n_mker] \n\t"
+            "mov  x9, %[n_left] \n\t"
+            "ptrue p0.d \n\t"
+            "b.ne .AROWSTOR \n\t"
+            // A stored in columns.
+            " .ACOLSTOR: \n\t"
+            // Prefetch distance.
+            "mov  x17, #8 \n\t"
+            "madd x17, x17, x3, xzr \n\t"
+#ifdef _A64FX
+            "mov  x16, 0x6 \n\t" // Disable hardware prefetch for A.
+            "lsl  x16, x16, #60 \n\t"
+            "orr  x0, x0, x16 \n\t"
+#endif
+            // "add  x5, x0, x3 \n\t"
+            // "add  x6, x5, x3 \n\t"
+            // "add  x7, x6, x3 \n\t"
+            // "prfm PLDL1STRM, [x0] \n\t"
+            // "prfm PLDL1STRM, [x5] \n\t"
+            // "prfm PLDL1STRM, [x6] \n\t"
+            // "prfm PLDL1STRM, [x7] \n\t"
+            // "add  x18, x7, x3 \n\t"
+            // "add  x5, x18, x3 \n\t"
+            // "add  x6, x5, x3 \n\t"
+            // "add  x7, x6, x3 \n\t"
+            // "prfm PLDL1STRM, [x18] \n\t"
+            // "prfm PLDL1STRM, [x5] \n\t"
+            // "prfm PLDL1STRM, [x6] \n\t"
+            // "prfm PLDL1STRM, [x7] \n\t"
+            " .ACOLSTORMKER: \n\t"
+            "cmp  x8, xzr \n\t"
+            "b.eq .ACOLSTORMKEREND \n\t"
+            "add  x5, x0, x3 \n\t"
+            "add  x6, x5, x3 \n\t"
+            "add  x7, x6, x3 \n\t"
+            "add  x10, x1, x2 \n\t"
+            "add  x11, x10, x2 \n\t"
+            "add  x12, x11, x2 \n\t"
+            "add  x13, x12, x2 \n\t"
+            "add  x14, x13, x2 \n\t"
+            "add  x15, x14, x2 \n\t"
+            "add  x16, x15, x2 \n\t"
+            "ld1d z0.d, p0/z, [x0] \n\t"
+            "ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
+            "ld1d z2.d, p0/z, [x5] \n\t"
+            "ld1d z3.d, p0/z, [x5, #1, mul vl] \n\t"
+            "ld1d z4.d, p0/z, [x6] \n\t"
+            "ld1d z5.d, p0/z, [x6, #1, mul vl] \n\t"
+            "ld1d z6.d, p0/z, [x7] \n\t"
+            "ld1d z7.d, p0/z, [x7, #1, mul vl] \n\t"
+            "add  x18, x17, x0 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x5 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x6 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x7 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x0, x7, x3 \n\t"
+            "add  x5, x0, x3 \n\t"
+            "add  x6, x5, x3 \n\t"
+            "add  x7, x6, x3 \n\t"
+            "ld1d z8.d, p0/z, [x0] \n\t"
+            "ld1d z9.d, p0/z, [x0, #1, mul vl] \n\t"
+            "ld1d z10.d, p0/z, [x5] \n\t"
+            "ld1d z11.d, p0/z, [x5, #1, mul vl] \n\t"
+            "ld1d z12.d, p0/z, [x6] \n\t"
+            "ld1d z13.d, p0/z, [x6, #1, mul vl] \n\t"
+            "ld1d z14.d, p0/z, [x7] \n\t"
+            "ld1d z15.d, p0/z, [x7, #1, mul vl] \n\t"
+            "add  x18, x17, x0 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x5 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x6 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "add  x18, x17, x7 \n\t"
+            "prfm PLDL1STRM, [x18] \n\t"
+            "st1d z0.d, p0, [x1] \n\t"
+            "st1d z1.d, p0, [x1, #1, mul vl] \n\t"
+            "st1d z2.d, p0, [x10] \n\t"
+            "st1d z3.d, p0, [x10, #1, mul vl] \n\t"
+            "st1d z4.d, p0, [x11] \n\t"
+            "st1d z5.d, p0, [x11, #1, mul vl] \n\t"
+            "st1d z6.d, p0, [x12] \n\t"
+            "st1d z7.d, p0, [x12, #1, mul vl] \n\t"
+            "st1d z8.d, p0, [x13] \n\t"
+            "st1d z9.d, p0, [x13, #1, mul vl] \n\t"
+            "st1d z10.d, p0, [x14] \n\t"
+            "st1d z11.d, p0, [x14, #1, mul vl] \n\t"
+            "st1d z12.d, p0, [x15] \n\t"
+            "st1d z13.d, p0, [x15, #1, mul vl] \n\t"
+            "st1d z14.d, p0, [x16] \n\t"
+            "st1d z15.d, p0, [x16, #1, mul vl] \n\t"
+            "add  x0, x7, x3 \n\t"
+            "add  x1, x16, x2 \n\t"
+            "sub  x8, x8, #1 \n\t"
+            "b    .ACOLSTORMKER \n\t"
+            " .ACOLSTORMKEREND: \n\t"
+            " .ACOLSTORLEFT: \n\t"
+            "cmp  x9, xzr \n\t"
+            "b.eq .UNITKDONE \n\t"
+            "ld1d z0.d, p0/z, [x0] \n\t"
+            "ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
+            "st1d z0.d, p0, [x1] \n\t"
+            "st1d z1.d, p0, [x1, #1, mul vl] \n\t"
+            "add  x0, x0, x3 \n\t"
+            "add  x1, x1, x2 \n\t"
+            "sub  x9, x9, #1 \n\t"
+            "b    .ACOLSTORLEFT \n\t"
+            // A stored in rows.
+            " .AROWSTOR: \n\t"
+            // Prepare predicates for in-reg transpose.
+            SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
+            " .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
+            "cmp  x8, xzr \n\t"
+            "b.eq .AROWSTORMKEREND \n\t"
+            "add  x10, x0, x4 \n\t"
+            "add  x11, x10, x4 \n\t"
+            "add  x12, x11, x4 \n\t"
+            "add  x13, x12, x4 \n\t"
+            "add  x14, x13, x4 \n\t"
+            "add  x15, x14, x4 \n\t"
+            "add  x16, x15, x4 \n\t"
+            "ld1d z0.d, p0/z, [x0] \n\t"
+            "ld1d z1.d, p0/z, [x10] \n\t"
+            "ld1d z2.d, p0/z, [x11] \n\t"
+            "ld1d z3.d, p0/z, [x12] \n\t"
+            "ld1d z4.d, p0/z, [x13] \n\t"
+            "ld1d z5.d, p0/z, [x14] \n\t"
+            "ld1d z6.d, p0/z, [x15] \n\t"
+            "ld1d z7.d, p0/z, [x16] \n\t"
+            "add  x5, x16, x4 \n\t"
+            "add  x10, x5, x4 \n\t"
+            "add  x11, x10, x4 \n\t"
+            "add  x12, x11, x4 \n\t"
+            "add  x13, x12, x4 \n\t"
+            "add  x14, x13, x4 \n\t"
+            "add  x15, x14, x4 \n\t"
+            "add  x16, x15, x4 \n\t"
+            "ld1d z16.d, p0/z, [x5] \n\t"
+            "ld1d z17.d, p0/z, [x10] \n\t"
+            "ld1d z18.d, p0/z, [x11] \n\t"
+            "ld1d z19.d, p0/z, [x12] \n\t"
+            "ld1d z20.d, p0/z, [x13] \n\t"
+            "ld1d z21.d, p0/z, [x14] \n\t"
+            "ld1d z22.d, p0/z, [x15] \n\t"
+            "ld1d z23.d, p0/z, [x16] \n\t"
+            // Transpose first 8 rows.
+            SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6)
+            // Transpose last 8 rows.
+            SVE512_IN_REG_TRANSPOSE_d8x8(z24,z25,z26,z27,z28,z29,z30,z31,z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3,p8,p4,p6)
+            "add  x10, x1, x2 \n\t"
+            "add  x11, x10, x2 \n\t"
+            "add  x12, x11, x2 \n\t"
+            "add  x13, x12, x2 \n\t"
+            "add  x14, x13, x2 \n\t"
+            "add  x15, x14, x2 \n\t"
+            "add  x16, x15, x2 \n\t"
+            "st1d z8.d, p0, [x1] \n\t"
+            "st1d z24.d, p0, [x1, #1, mul vl] \n\t"
+            "st1d z9.d, p0, [x10] \n\t"
+            "st1d z25.d, p0, [x10, #1, mul vl] \n\t"
+            "st1d z10.d, p0, [x11] \n\t"
+            "st1d z26.d, p0, [x11, #1, mul vl] \n\t"
+            "st1d z11.d, p0, [x12] \n\t"
+            "st1d z27.d, p0, [x12, #1, mul vl] \n\t"
+            "st1d z12.d, p0, [x13] \n\t"
+            "st1d z28.d, p0, [x13, #1, mul vl] \n\t"
+            "st1d z13.d, p0, [x14] \n\t"
+            "st1d z29.d, p0, [x14, #1, mul vl] \n\t"
+            "st1d z14.d, p0, [x15] \n\t"
+            "st1d z30.d, p0, [x15, #1, mul vl] \n\t"
+            "st1d z15.d, p0, [x16] \n\t"
+            "st1d z31.d, p0, [x16, #1, mul vl] \n\t"
+            "add  x0, x0, #64 \n\t"
+            "add  x1, x16, x2 \n\t"
+            "sub  x8, x8, #1 \n\t"
+            "b    .AROWSTORMKER \n\t"
+            " .AROWSTORMKEREND: \n\t"
+            "mov  x4, %[inca] \n\t" // Restore unshifted inca.
+            "index z30.d, xzr, x4 \n\t" // Generate index.
+            "lsl  x4, x4, #3 \n\t" // Shift again.
+            "lsl  x5, x4, #3 \n\t" // Virtual column vl.
+            " .AROWSTORLEFT: \n\t"
+            "cmp  x9, xzr \n\t"
+            "b.eq .UNITKDONE \n\t"
+            "add  x6, x0, x5 \n\t"
+            "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
+            "ld1d z1.d, p0/z, [x6, z30.d, lsl #3] \n\t"
+            "st1d z0.d, p0, [x1] \n\t"
+            "st1d z1.d, p0, [x1, #1, mul vl] \n\t"
+            "add  x1, x1, x2 \n\t"
+            "add  x0, x0, #8 \n\t"
+            "sub  x9, x9, #1 \n\t"
+            "b    .AROWSTORLEFT \n\t"
+            " .UNITKDONE: \n\t"
+            "mov  x0, #0 \n\t"
+            :
+            : [a]      "r" (a),
+              [p]      "r" (p),
+              [lda]    "r" (lda),
+              [ldp]    "r" (ldp),
+              [inca]   "r" (inca),
+              [n_mker] "r" (n_mker),
+              [n_left] "r" (n_left)
+            : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+              "x8", "x9", "x10","x11","x12","x13","x14","x15",
+              "x16","x17","x18",
+              "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
+              "z8", "z9", "z10","z11","z12","z13","z14","z15",
+              // "z16","z17","z18","z19","z20","z21","z22","z23",
+              // "z24","z25","z26","z27","z28","z29","z30","z31",
+              "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7"
+            );
+    }
+    else // if ( cdim < mnr )
+    {
+        bli_dscal2m_ex
+        (
+          0,
+          BLIS_NONUNIT_DIAG,
+          BLIS_DENSE,
+          ( trans_t )conja,
+          cdim,
+          n,
+          kappa,
+          a, inca, lda,
+          p, 1,    ldp,
+          cntx,
+          NULL
+        );
+
+        // if ( cdim < mnr )
+        {
+            const dim_t      i      = cdim;
+            const dim_t      m_edge = mnr - i;
+            const dim_t      n_edge = n_max;
+            double* restrict p_edge = p + (i  )*1;
+
+            bli_dset0s_mxn
+            (
+              m_edge,
+              n_edge,
+              p_edge, 1, ldp
+            );
+        }
+    }
+
+    if ( n < n_max )
+    {
+        const dim_t      j      = n;
+        const dim_t      m_edge = mnr;
+        const dim_t      n_edge = n_max - j;
+        double* restrict p_edge = p + (j  )*ldp;
+
+        bli_dset0s_mxn
+        (
+          m_edge,
+          n_edge,
+          p_edge, 1, ldp
+        );
+    }
+}
--- a/kernels/armsve/3/armsve_asm_2vx10.h
+++ b/kernels/armsve/3/armsve_asm_2vx10.h
@@ -0,0 +1,191 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+#define GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
+  GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BADDR,8) \
+  GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BADDR,9) \
+" add             "#BADDR", "#BRSBIT", "#BADDR"   \n\t" /* B address forward */ \
+  GEMM_FMLA2_LD1R(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BADDR,0) \
+  GEMM_FMLA2_LD1R(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BADDR,1) \
+  GEMM_FMLA2_LD1R(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BADDR,2) \
+  GEMM_FMLA2_LD1R(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BADDR,3) \
+  GEMM_FMLA2_LD1R(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BADDR,4) \
+  GEMM_FMLA2_LD1R(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BADDR,5) \
+  \
+  GEMM_FMLA2_LD1R(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BADDR,6) \
+  GEMM_FMLA2_LD1R(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BADDR,7)
+
+// Second through forth microkernels are the first one with B vectors rotated.
+#define GEMM_2VX10_MKER_LOOP_PLAIN_C_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
+  GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BRSBIT)
+
+#define GEMM_2VX10_MKER_LOOP_PLAIN_C_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
+  GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BRSBIT)
+
+#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
+  GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BRSBIT)
+// NOTE:
+//  The microkernel (PLAIN_1-4 as a whole) satisfies on entry/exit
+//  (sth. akin to loop-invariant):
+//   - BV[0-7] holds B[0:7, 4*k_cur]
+//   - B's address stops at B[0, 4*k_cur+1]
+
+// Final loop inside K=4 microkernels.
+#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
+  GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BADDR,8) \
+  GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BADDR,9) \
+" add             "#BADDR", "#BRSBIT", "#BADDR"   \n\t" /* B address forward */ \
+  GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
+  GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
+  GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
+  GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
+  GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
+  GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
+  GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
+  GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
+
+// K=4 MKer loop with B memory scattered.
+#define GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT) \
+" add             "#BADDR", "#BRSBIT", "#BADDR"   \n\t" /* B address forward */ \
+" mov             "#BELMADDR", "#BADDR"           \n\t" \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
+  \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT)
+
+#define GEMM_2VX10_MKER_LOOP_PLAIN_G_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
+  GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BELMADDR,BRSBIT,BCSBIT)
+
+#define GEMM_2VX10_MKER_LOOP_PLAIN_G_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
+  GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BELMADDR,BRSBIT,BCSBIT)
+
+#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
+  GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BELMADDR,BRSBIT,BCSBIT)
+
+#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
+" add             "#BADDR", "#BRSBIT", "#BADDR"   \n\t" /* B address forward */ \
+" mov             "#BELMADDR", "#BADDR"           \n\t" \
+  GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
+  GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
+  GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
+  GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
+  GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
+  GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
+  GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
+  GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
+
+
+#define CLEAR_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19) \
+  CLEAR_COL4(Z00,Z01,Z02,Z03) \
+  CLEAR_COL4(Z04,Z05,Z06,Z07) \
+  CLEAR_COL4(Z08,Z09,Z10,Z11) \
+  CLEAR_COL4(Z12,Z13,Z14,Z15) \
+  CLEAR_COL4(Z16,Z17,Z18,Z19)
+
+#define SCALE_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19,ZFACTOR) \
+  SCALE_COL4(Z00,Z01,Z02,Z03,ZFACTOR) \
+  SCALE_COL4(Z04,Z05,Z06,Z07,ZFACTOR) \
+  SCALE_COL4(Z08,Z09,Z10,Z11,ZFACTOR) \
+  SCALE_COL4(Z12,Z13,Z14,Z15,ZFACTOR) \
+  SCALE_COL4(Z16,Z17,Z18,Z19,ZFACTOR)
+
+#define GEMM_C_FMAD_UKER(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE) \
+  GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
+  GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
+  GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
+  GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
+  GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE)
+
+#define GEMM_C_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
+
+#define GEMM_C_STORE_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
+
+#define GEMM_C_FMAD_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,CADDR,CCS) \
+  GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C0FH,C0LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C1FH,C1LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C2FH,C2LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C3FH,C3LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C4FH,C4LH,PFH,PLH,CADDR,CCS)
+
+#define GEMM_C_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_GATHER_LOAD_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_GATHER_LOAD_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_GATHER_LOAD_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_GATHER_LOAD_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_GATHER_LOAD_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
+
+#define GEMM_C_STORE_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_SCATTER_STORE_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_SCATTER_STORE_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_SCATTER_STORE_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_SCATTER_STORE_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_SCATTER_STORE_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
+
+#define GEMM_C_FMAD_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,ZIDX,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
+  GEMM_CCOL_GATHER_LOAD_FWD(C0FH,C0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
+  GEMM_CCOL_GATHER_LOAD_FWD(C1FH,C1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
+  GEMM_CCOL_GATHER_LOAD_FWD(C2FH,C2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
+  GEMM_CCOL_GATHER_LOAD_FWD(C3FH,C3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
+  GEMM_CCOL_GATHER_LOAD_FWD(C4FH,C4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
+
--- a/kernels/armsve/3/armsve_asm_macros.h
+++ b/kernels/armsve/3/armsve_asm_macros.h
@@ -0,0 +1,123 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+#define CLEAR_COL2(Z0,Z1) \
+" dup  "#Z0"."DT", #0 \n\t" \
+" dup  "#Z1"."DT", #0 \n\t"
+
+#define CLEAR_COL4(Z0,Z1,Z2,Z3) \
+  CLEAR_COL2(Z0,Z1) \
+  CLEAR_COL2(Z2,Z3)
+
+#define SCALE_COL2(Z0,Z1,ZFACTOR) \
+" fmul  "#Z0"."DT", "#Z0"."DT", "#ZFACTOR"."DT" \n\t" \
+" fmul  "#Z1"."DT", "#Z1"."DT", "#ZFACTOR"."DT" \n\t" \
+
+#define SCALE_COL4(Z0,Z1,Z2,Z3,ZFACTOR) \
+  SCALE_COL2(Z0,Z1,ZFACTOR) \
+  SCALE_COL2(Z2,Z3,ZFACTOR)
+
+// Prefetch or not.
+#define PREFETCH_CONTIGUOUS_noprfm(LV,PROP,ADDR,SHIFT)
+#define PREFETCH_CONTIGUOUS_prfm(LV,PROP,ADDR,SHIFT) \
+" prfm  PLD"#LV""#PROP", ["#ADDR", "#SHIFT"] \n\t"
+
+#define GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
+" fmla  "#CCOLFH"."DT", "#PT"/m, "#ACOLFH"."DT", "#BV"."DT" \n\t" /* A Row 0 :VL */ \
+" fmla  "#CCOLLH"."DT", "#PT"/m, "#ACOLLH"."DT", "#BV"."DT" \n\t" /* A Row VL:2VL */
+
+#define GEMM_FMLA2_LD1R(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BADDR,NSHIFT) \
+  GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
+" "LD1R"  "#BV"."DT", "#PT"/z, ["#BADDR", #"#NSHIFT"*"SZ"]\n\t"
+
+#define GEMM_FMLA2_LD1R_G_ELMFWD(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BELMADDR,BCSBIT) \
+  GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
+" "LD1R"  "#BV"."DT", "#PT"/z, ["#BELMADDR"] \n\t" /* Load B */ \
+" add     "#BELMADDR", "#BELMADDR", "#BCSBIT" \n\t" /* Forward B element */
+
+#define GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
+" "LD1"  "#ZFH"."DT", "#PFH"/z, ["#AADDR"]            \n\t" \
+" "LD1"  "#ZLH"."DT", "#PLH"/z, ["#AADDR", #1, mul vl]\n\t"
+
+#define GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
+" "LD1"  "#ZFH"."DT", "#PFH"/z, ["#AADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
+" add    "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
+" "LD1"  "#ZLH"."DT", "#PLH"/z, ["#ATEMP", "#ZIDX"."DT", "OFFS"]\n\t"
+
+// Prefetch or not.
+#define GEMM_ACOL_GATHER_noprfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
+#define GEMM_ACOL_GATHER_prfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
+" "PRFG" PLD"#LV""#PROP", "#PFH", ["#AADDR", "#ZIDX"."DT", "OFFS"] \n\t" \
+" add    "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
+" "PRFG" PLD"#LV""#PROP", "#PLH", ["#ATEMP", "#ZIDX"."DT", "OFFS"] \n\t"
+
+#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(ZFH,ZLH,PFH,PLH,AADDR,A4KS,ACS,ATEMP,PREFMODE) \
+" add  "#ATEMP", "#AADDR", "#A4KS" \n\t" \
+" add  "#AADDR", "#AADDR", "#ACS"  \n\t" /* Forward A's address to the next column. */ \
+  GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
+  PREFETCH_CONTIGUOUS_ ##PREFMODE(L1,STRM,ATEMP,0)
+
+#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,A4KS,APS,ACS,AVSKIP,ATEMP,PREFMODEL1,PREFMODEL2) \
+" add  "#ATEMP", "#AADDR", "#A4KS" \n\t" \
+  GEMM_ACOL_GATHER_ ##PREFMODEL1(L1,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
+" add  "#ATEMP", "#AADDR", "#APS"  \n\t" \
+  GEMM_ACOL_GATHER_ ##PREFMODEL2(L2,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
+" add  "#AADDR", "#AADDR", "#ACS"  \n\t" /* Forward A's address to the next column. */ \
+  GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
+
+#define GEMM_CCOL_CONTIGUOUS_LOAD_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
+  GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,CADDR) \
+" add  "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (load) to next column. */
+
+#define GEMM_CCOL_CONTIGUOUS_STORE_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
+" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR"]             \n\t" \
+" "ST1" "#ZLH"."DT", "#PLH", ["#CADDR", #1, mul vl] \n\t" \
+" add    "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (store) to next column. */
+
+#define GEMM_CCOL_FMAD(ZFH,ZLH,PFH,PLH,CFH,CLH,ZSCALE) \
+" fmad  "#ZFH"."DT", "#PFH"/m, "#ZSCALE"."DT", "#CFH"."DT" \n\t" \
+" fmad  "#ZLH"."DT", "#PLH"/m, "#ZSCALE"."DT", "#CLH"."DT" \n\t"
+
+#define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CVSKIP,CTEMP) \
+" add  "#CADDR", "#CADDR", "#CCS"      \n\t"
+
+#define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
+" add   "#CTEMP", "#CADDR", "#CVSKIP"   \n\t" \
+" "ST1" "#ZLH"."DT", "#PLH", ["#CTEMP", "#ZIDX"."DT", "OFFS"]\n\t" \
+" add   "#CADDR", "#CADDR", "#CCS"      \n\t"
+
+
--- a/kernels/armsve/3/armsve_asm_macros_double.h
+++ b/kernels/armsve/3/armsve_asm_macros_double.h
@@ -0,0 +1,46 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+// Specify to use double precision.
+#define DT    "d"
+#define LD1   "ld1d"
+#define ST1   "st1d"
+#define LD1R  "ld1rd"
+#define PRFG  "prfd"
+#define SZ    "8"
+#define OFFS  "lsl #3"
+// Include macros.
+#include "armsve_asm_macros.h"
+
--- a/kernels/armsve/3/armsve_asm_macros_half.h
+++ b/kernels/armsve/3/armsve_asm_macros_half.h
@@ -0,0 +1,46 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+// Specify to use half precision.
+#define DT    "h"
+#define LD1   "ld1h"
+#define ST1   "st1h"
+#define LD1R  "ld1rh"
+#define PRFG  "prfh"
+#define SZ    "2"
+// #define OFFS UNSUPPORTED
+// Include macros.
+#include "armsve_asm_macros.h"
+
--- a/kernels/armsve/3/armsve_asm_macros_single.h
+++ b/kernels/armsve/3/armsve_asm_macros_single.h
@@ -0,0 +1,46 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+// Specify to use single precision.
+#define DT    "s"
+#define LD1   "ld1w"
+#define ST1   "st1w"
+#define LD1R  "ld1rw"
+#define PRFG  "prfw"
+#define SZ    "4"
+#define OFFS  "uxtw #2"
+// Include macros.
+#include "armsve_asm_macros.h"
+
--- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c
@@ -0,0 +1,318 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2019, Forschunszentrum Juelich
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+#include "blis.h"
+
+// Double-precision composite instructions.
+#include "armsve_asm_macros_double.h"
+
+// 2vx10 microkernels.
+#include "armsve_asm_2vx10.h"
+
+void bli_dgemm_armsve_asm_2vx10_unindexed
+     (
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a,
+       double*    restrict b,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+  void* a_next = bli_auxinfo_next_a( data );
+  void* b_next = bli_auxinfo_next_b( data );
+
+  // Typecast local copies of integers in case dim_t and inc_t are a
+  // different size than is expected by load instructions.
+  uint64_t k_mker = k0 / 4;
+  uint64_t k_left = k0 % 4;
+  uint64_t rs_c   = rs_c0;
+  uint64_t cs_c   = cs_c0;
+
+  __asm__ volatile (
+" ldr             x0, %[a]                        \n\t"
+" ldr             x1, %[b]                        \n\t"
+" mov             x2, xzr                         \n\t"
+" incd            x2, ALL, MUL #2                 \n\t" // Column-skip of A.
+" mov             x3, #10                         \n\t" // Row-skip of B.
+"                                                 \n\t"
+" ldr             x5, %[c]                        \n\t"
+" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
+" ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
+#ifdef _A64FX
+" mov             x8, 0x3                         \n\t" // Tag C address.
+" lsl             x8, x8, #56                     \n\t"
+" orr             x5, x5, x8                      \n\t"
+" mov             x8, 0x2                         \n\t" // Tag B address.
+" lsl             x8, x8, #56                     \n\t"
+" orr             x1, x1, x8                      \n\t"
+" mov             x8, 0x1                         \n\t" // Tag A address.
+" lsl             x8, x8, #56                     \n\t"
+" orr             x0, x0, x8                      \n\t"
+#endif
+"                                                 \n\t"
+" mov             x8, #8                          \n\t" // Multiply some address skips by sizeof(double).
+" madd            x2, x8, x2, xzr                 \n\t" // cs_a
+" madd            x3, x8, x3, xzr                 \n\t" // rs_b
+" madd            x7, x8, x7, xzr                 \n\t" // cs_c
+" ptrue           p0.d                            \n\t"
+"                                                 \n\t"
+" ldr             x4, %[k_mker]                   \n\t" // Number of loops.
+" ldr             x8, %[k_left]                   \n\t"
+"                                                 \n\t"
+" LOAD_ABC:                                       \n\t"
+" cmp             x4, #0                          \n\t" // Don't preload if no microkernel there.
+" b.eq            END_CCOL_PRFM                   \n\t"
+
+" ld1rd           z20.d, p0/z, [x1]               \n\t" // Load 8/10 of first B row.
+" ld1rd           z21.d, p0/z, [x1, 8]            \n\t"
+" ld1rd           z22.d, p0/z, [x1, 16]           \n\t"
+" ld1rd           z23.d, p0/z, [x1, 24]           \n\t"
+" ld1rd           z24.d, p0/z, [x1, 32]           \n\t"
+" ld1rd           z25.d, p0/z, [x1, 40]           \n\t"
+" ld1rd           z26.d, p0/z, [x1, 48]           \n\t"
+" ld1rd           z27.d, p0/z, [x1, 56]           \n\t"
+"                                                 \n\t"
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+"                                                 \n\t"
+" CCOL_PRFM:                                      \n\t"
+" cmp             x6, #1                          \n\t"
+" b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
+" mov             x16, x5                         \n\t"
+" prfm            PLDL1KEEP, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1KEEP, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1KEEP, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1KEEP, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1KEEP, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1KEEP, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1KEEP, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1KEEP, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1KEEP, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1KEEP, [x16]                \n\t"
+" END_CCOL_PRFM:                                  \n\t"
+"                                                 \n\t"
+CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
+"                                                 \n\t"
+" cmp             x4, #0                          \n\t" // If no 4-microkernel can be applied
+" b.eq            K_LEFT_LOOP                     \n\t"
+"                                                 \n\t"
+" K_MKER_LOOP:                                    \n\t"
+"                                                 \n\t"
+" add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+"                                                 \n\t"
+" add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+"                                                 \n\t"
+" add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+"                                                 \n\t"
+" subs            x4, x4, #1                      \n\t" // Decrease counter before final replica.
+" b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
+"                                                 \n\t"
+" add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+" b               K_MKER_LOOP                     \n\t"
+"                                                 \n\t"
+" FIN_MKER_LOOP:                                  \n\t"
+GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+" add             x0, x0, x2                      \n\t" // Forward A to fill the blank.
+"                                                 \n\t"
+" K_LEFT_LOOP:                                    \n\t"
+" cmp             x8, #0                          \n\t" // End of execution.
+" b.eq            WRITE_MEM_PREP                  \n\t"
+"                                                 \n\t"
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+" ld1rd           z20.d, p0/z, [x1]               \n\t" // Load 8/10 of first B row.
+" ld1rd           z21.d, p0/z, [x1, 8]            \n\t"
+" ld1rd           z22.d, p0/z, [x1, 16]           \n\t"
+" ld1rd           z23.d, p0/z, [x1, 24]           \n\t"
+" ld1rd           z24.d, p0/z, [x1, 32]           \n\t"
+" ld1rd           z25.d, p0/z, [x1, 40]           \n\t"
+" ld1rd           z26.d, p0/z, [x1, 48]           \n\t"
+" ld1rd           z27.d, p0/z, [x1, 56]           \n\t"
+" ld1rd           z28.d, p0/z, [x1, 64]           \n\t"
+" ld1rd           z29.d, p0/z, [x1, 72]           \n\t"
+GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
+GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
+GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
+GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
+GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
+GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
+GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
+GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
+GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
+GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
+" add             x0, x0, x2                      \n\t" // Forward A.
+" add             x1, x1, x3                      \n\t" // Forward B.
+" sub             x8, x8, #1                      \n\t"
+" b               K_LEFT_LOOP                     \n\t" // Next column / row.
+"                                                 \n\t"
+" WRITE_MEM_PREP:                                 \n\t"
+"                                                 \n\t"
+" ldr             x4, %[alpha]                    \n\t" // Load alpha & beta (address).
+" ldr             x8, %[beta]                     \n\t"
+" ldr             x4, [x4]                        \n\t" // Load alpha & beta (value).
+" ldr             x8, [x8]                        \n\t"
+" dup             z30.d, x4                       \n\t" // Broadcast alpha & beta into vectors.
+" dup             z31.d, x8                       \n\t"
+" fmov            d28, #1.0                       \n\t" // Prepare FP 1.0.
+" fmov            x16, d28                        \n\t"
+"                                                 \n\t"
+" PREFETCH_ABNEXT:                                \n\t"
+" ldr             x0, %[a_next]                   \n\t"
+" ldr             x1, %[b_next]                   \n\t"
+#ifdef _A64FX
+" mov             x8, 0x2                         \n\t" // Tag B address.
+" lsl             x8, x8, #56                     \n\t"
+" orr             x1, x1, x8                      \n\t"
+" mov             x8, 0x1                         \n\t" // Tag A address.
+" lsl             x8, x8, #56                     \n\t"
+" orr             x0, x0, x8                      \n\t"
+#endif
+" prfm            PLDL1STRM, [x0]                 \n\t"
+" prfm            PLDL1STRM, [x0, 256*1]          \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*2]          \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*3]          \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*4]          \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*5]          \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*6]          \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*7]          \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*8]          \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*9]          \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*10]         \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*11]         \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*12]         \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*13]         \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*14]         \n\t"
+// " prfm            PLDL2KEEP, [x0, 256*15]         \n\t"
+" prfm            PLDL1STRM, [x1]                 \n\t"
+" prfm            PLDL1STRM, [x1, 256*1]          \n\t"
+// " prfm            PLDL2KEEP, [x1, 256*2]          \n\t"
+// " prfm            PLDL2KEEP, [x1, 256*3]          \n\t"
+// " prfm            PLDL2KEEP, [x1, 256*4]          \n\t"
+// " prfm            PLDL2KEEP, [x1, 256*5]          \n\t"
+// " prfm            PLDL2KEEP, [x1, 256*6]          \n\t"
+// " prfm            PLDL2KEEP, [x1, 256*7]          \n\t"
+// " prfm            PLDL2KEEP, [x1, 256*8]          \n\t"
+// " prfm            PLDL2KEEP, [x1, 256*9]          \n\t"
+"                                                 \n\t"
+" mov             x9, x5                          \n\t" // C address for loading.
+"                                                 \n\t" // C address for storing is x5 itself.
+" cmp             x6, #1                          \n\t" // Preload first half of C for contiguous case.
+" b.ne            WRITE_MEM                       \n\t"
+GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
+"                                                 \n\t"
+" WRITE_MEM:                                      \n\t"
+"                                                 \n\t"
+" cmp             x16, x4                         \n\t"
+" b.eq            UNIT_ALPHA                      \n\t"
+"                                                 \n\t"
+SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
+"                                                 \n\t"
+" UNIT_ALPHA:                                     \n\t"
+" cmp             x6, #1                          \n\t"
+" b.ne            WRITE_MEM_G                     \n\t"
+"                                                 \n\t"
+" WRITE_MEM_C:                                    \n\t" // Available scratch: Z[20-30].
+"                                                 \n\t" // Here used scratch: Z[20-29].
+// First half of C is already loaded in this case.
+GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7)
+"                                                 \n\t"
+GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
+" b               END_WRITE_MEM                   \n\t"
+"                                                 \n\t"
+" WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
+"                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
+" mov             x8, xzr                         \n\t"
+" incb            x8                              \n\t"
+" madd            x8, x8, x6, xzr                 \n\t" // C-column's logical 1-vector skip.
+" index           z30.d, xzr, x6                  \n\t" // Skips passed to index is not multiplied by 8.
+GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
+GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
+GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
+"                                                 \n\t"
+GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
+"                                                 \n\t"
+" END_WRITE_MEM:                                  \n\t"
+" b               END_EXEC                        \n\t"
+"                                                 \n\t"
+" END_ERROR:                                      \n\t"
+" mov             x0, #1                          \n\t" // Return error.
+" END_EXEC:                                       \n\t"
+" mov             x0, #0                          \n\t" // Return normal.
+:
+: [a]      "m" (a),
+  [b]      "m" (b),
+  [c]      "m" (c),
+  [rs_c]   "m" (rs_c),
+  [cs_c]   "m" (cs_c),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha]  "m" (alpha),
+  [beta]   "m" (beta),
+  [a_next] "m" (a_next),
+  [b_next] "m" (b_next)
+: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
+  "x9","x16",
+  "z0","z1","z2","z3","z4","z5","z6","z7",
+  "z8","z9","z10","z11","z12","z13","z14","z15",
+  "z16","z17","z18","z19",
+  "z20","z21","z22","z23",
+  "z24","z25","z26","z27",
+  "z28","z29","z30","z31"
+   );
+}
+
--- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c
@@ -0,0 +1,307 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, The University of Tokyo
+   Copyright (C) 2019, Forschunszentrum Juelich
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+#include "blis.h"
+
+// Single-precision composite instructions.
+#include "armsve_asm_macros_single.h"
+
+// 2vx10 microkernels.
+#include "armsve_asm_2vx10.h"
+
+void bli_sgemm_armsve_asm_2vx10_unindexed
+     (
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,
+       float*     restrict b,
+       float*     restrict beta,
+       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+  void* a_next = bli_auxinfo_next_a( data );
+  void* b_next = bli_auxinfo_next_b( data );
+
+  // Typecast local copies of integers in case dim_t and inc_t are a
+  // different size than is expected by load instructions.
+  uint64_t k_mker = k0 / 4;
+  uint64_t k_left = k0 % 4;
+  uint64_t rs_c   = rs_c0;
+  uint64_t cs_c   = cs_c0;
+
+  __asm__ volatile (
+" ldr             x0, %[a]                        \n\t"
+" ldr             x1, %[b]                        \n\t"
+" mov             x2, xzr                         \n\t"
+" incw            x2, ALL, MUL #2                 \n\t" // Column-skip of A.
+" mov             x3, #10                         \n\t" // Row-skip of B.
+"                                                 \n\t"
+" ldr             x5, %[c]                        \n\t"
+" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
+" ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
+#ifdef _A64FX
+" mov             x8, 0x3                         \n\t" // Tag C address.
+" lsl             x8, x8, #56                     \n\t"
+" orr             x5, x5, x8                      \n\t"
+" mov             x8, 0x2                         \n\t" // Tag B address.
+" lsl             x8, x8, #56                     \n\t"
+" orr             x1, x1, x8                      \n\t"
+" mov             x8, 0x1                         \n\t" // Tag A address.
+" lsl             x8, x8, #56                     \n\t"
+" orr             x0, x0, x8                      \n\t"
+#endif
+"                                                 \n\t"
+" mov             x8, #4                          \n\t" // Multiply some address skips by sizeof(float).
+" madd            x2, x8, x2, xzr                 \n\t" // cs_a
+" madd            x3, x8, x3, xzr                 \n\t" // rs_b
+" madd            x7, x8, x7, xzr                 \n\t" // cs_c
+" ptrue           p0.s                            \n\t"
+"                                                 \n\t"
+" ldr             x4, %[k_mker]                   \n\t" // Number of loops.
+" ldr             x8, %[k_left]                   \n\t"
+"                                                 \n\t"
+" LOAD_ABC:                                       \n\t"
+" cmp             x4, #0                          \n\t" // Don't preload if no microkernel there.
+" b.eq            END_CCOL_PRFM                   \n\t"
+
+" ld1rw           z20.s, p0/z, [x1]               \n\t" // Load 8/10 of first B row.
+" ld1rw           z21.s, p0/z, [x1, 4]            \n\t"
+" ld1rw           z22.s, p0/z, [x1, 8]            \n\t"
+" ld1rw           z23.s, p0/z, [x1, 12]           \n\t"
+" ld1rw           z24.s, p0/z, [x1, 16]           \n\t"
+" ld1rw           z25.s, p0/z, [x1, 20]           \n\t"
+" ld1rw           z26.s, p0/z, [x1, 24]           \n\t"
+" ld1rw           z27.s, p0/z, [x1, 28]           \n\t"
+"                                                 \n\t"
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+"                                                 \n\t"
+" CCOL_PRFM:                                      \n\t"
+" cmp             x6, #1                          \n\t"
+" b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
+" mov             x16, x5                         \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" END_CCOL_PRFM:                                  \n\t"
+"                                                 \n\t"
+CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
+"                                                 \n\t"
+" cmp             x4, #0                          \n\t" // If no 4-microkernel can be applied
+" b.eq            K_LEFT_LOOP                     \n\t"
+"                                                 \n\t"
+" K_MKER_LOOP:                                    \n\t"
+"                                                 \n\t"
+" add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+"                                                 \n\t"
+" add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+"                                                 \n\t"
+" add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+"                                                 \n\t"
+" subs            x4, x4, #1                      \n\t" // Decrease counter before final replica.
+" b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
+"                                                 \n\t"
+" add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+" b               K_MKER_LOOP                     \n\t"
+"                                                 \n\t"
+" FIN_MKER_LOOP:                                  \n\t"
+GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+" add             x0, x0, x2                      \n\t" // Forward A to fill the blank.
+"                                                 \n\t"
+" K_LEFT_LOOP:                                    \n\t"
+" cmp             x8, #0                          \n\t" // End of execution.
+" b.eq            WRITE_MEM_PREP                  \n\t"
+"                                                 \n\t"
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+" ld1rw           z20.s, p0/z, [x1]               \n\t" // Load 8/10 of first B row.
+" ld1rw           z21.s, p0/z, [x1, 4]            \n\t"
+" ld1rw           z22.s, p0/z, [x1, 8]            \n\t"
+" ld1rw           z23.s, p0/z, [x1, 12]           \n\t"
+" ld1rw           z24.s, p0/z, [x1, 16]           \n\t"
+" ld1rw           z25.s, p0/z, [x1, 20]           \n\t"
+" ld1rw           z26.s, p0/z, [x1, 24]           \n\t"
+" ld1rw           z27.s, p0/z, [x1, 28]           \n\t"
+" ld1rw           z28.s, p0/z, [x1, 32]           \n\t"
+" ld1rw           z29.s, p0/z, [x1, 36]           \n\t"
+GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
+GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
+GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
+GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
+GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
+GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
+GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
+GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
+GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
+GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
+" add             x0, x0, x2                      \n\t" // Forward A.
+" add             x1, x1, x3                      \n\t" // Forward B.
+" sub             x8, x8, #1                      \n\t"
+" b               K_LEFT_LOOP                     \n\t" // Next column / row.
+"                                                 \n\t"
+" WRITE_MEM_PREP:                                 \n\t"
+"                                                 \n\t"
+" ldr             x4, %[alpha]                    \n\t" // Load alpha & beta (address).
+" ldr             x8, %[beta]                     \n\t"
+" ldr             w4, [x4]                        \n\t" // Load alpha & beta (value).
+" ldr             w8, [x8]                        \n\t"
+" dup             z30.s, w4                       \n\t" // Broadcast alpha & beta into vectors.
+" dup             z31.s, w8                       \n\t"
+"                                                 \n\t"
+" PREFETCH_ABNEXT:                                \n\t"
+" ldr             x0, %[a_next]                   \n\t"
+" ldr             x1, %[b_next]                   \n\t"
+" prfm            PLDL2KEEP, [x0]                 \n\t"
+" prfm            PLDL2KEEP, [x0, 256*1]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*2]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*3]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*4]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*5]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*6]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*7]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*8]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*9]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*10]         \n\t"
+" prfm            PLDL2KEEP, [x0, 256*11]         \n\t"
+" prfm            PLDL2KEEP, [x0, 256*12]         \n\t"
+" prfm            PLDL2KEEP, [x0, 256*13]         \n\t"
+" prfm            PLDL2KEEP, [x0, 256*14]         \n\t"
+" prfm            PLDL2KEEP, [x0, 256*15]         \n\t"
+" prfm            PLDL2KEEP, [x1]                 \n\t"
+" prfm            PLDL2KEEP, [x1, 256*1]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*2]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*3]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*4]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*5]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*6]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*7]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*8]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*9]          \n\t"
+"                                                 \n\t"
+" WRITE_MEM:                                      \n\t"
+"                                                 \n\t"
+" fmov            s28, #1.0                       \n\t"
+" fmov            w16, s28                        \n\t"
+" cmp             w16, w4                         \n\t"
+" b.eq            UNIT_ALPHA                      \n\t"
+"                                                 \n\t"
+SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
+"                                                 \n\t"
+" UNIT_ALPHA:                                     \n\t"
+" mov             x9, x5                          \n\t" // C address for loading.
+"                                                 \n\t" // C address for storing is x5 itself.
+" cmp             x6, #1                          \n\t"
+" b.ne            WRITE_MEM_G                     \n\t"
+"                                                 \n\t"
+" WRITE_MEM_C:                                    \n\t" // Available scratch: Z[20-30].
+"                                                 \n\t" // Here used scratch: Z[20-29].
+GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
+GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
+GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7)
+"                                                 \n\t"
+GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
+" b               END_WRITE_MEM                   \n\t"
+"                                                 \n\t"
+" WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
+"                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
+" mov             x8, xzr                         \n\t"
+" incb            x8                              \n\t"
+" madd            x8, x8, x6, xzr                 \n\t" // C-column's logical 1-vector skip.
+" index           z30.s, wzr, w6                  \n\t" // Skips passed to index is not multiplied by 8.
+GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
+GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
+GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
+"                                                 \n\t"
+GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
+"                                                 \n\t"
+" END_WRITE_MEM:                                  \n\t"
+" b               END_EXEC                        \n\t"
+"                                                 \n\t"
+" END_ERROR:                                      \n\t"
+" mov             x0, #1                          \n\t" // Return error.
+" END_EXEC:                                       \n\t"
+" mov             x0, #0                          \n\t" // Return normal.
+:
+: [a]      "m" (a),
+  [b]      "m" (b),
+  [c]      "m" (c),
+  [rs_c]   "m" (rs_c),
+  [cs_c]   "m" (cs_c),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha]  "m" (alpha),
+  [beta]   "m" (beta),
+  [a_next] "m" (a_next),
+  [b_next] "m" (b_next)
+: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
+  "x9","x16",
+  "z0","z1","z2","z3","z4","z5","z6","z7",
+  "z8","z9","z10","z11","z12","z13","z14","z15",
+  "z16","z17","z18","z19",
+  "z20","z21","z22","z23",
+  "z24","z25","z26","z27",
+  "z28","z29","z30","z31"
+   );
+}
+
--- a/kernels/armsve/3/bli_gemm_armsve_asm_sh2vx10_unindexed.c
+++ b/kernels/armsve/3/bli_gemm_armsve_asm_sh2vx10_unindexed.c
@@ -0,0 +1,343 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, The University of Tokyo
+   Copyright (C) 2019, Forschunszentrum Juelich
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+#include "blis.h"
+
+// Half-precision composite instructions.
+#include "armsve_asm_macros_half.h"
+
+// 2vx10 microkernels.
+#include "armsve_asm_2vx10.h"
+
+// Gather-load / scatter-store instruction for half-precision
+//  needs being defined separately.
+#undef GEMM_CCOL_GATHER_LOAD_FWD
+#undef GEMM_CCOL_SCATTER_STORE_FWD
+
+#define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \
+" add   x28, "#CADDR", "#CRS2"        \n\t" \
+" ld1h  z31.s, "#PT"/z, ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \
+" ld1h  "#ZFH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1]   \n\t" \
+" revh  "#ZFH".s, "#PT"/m, "#ZFH".s                     \n\t" \
+" fadd  "#ZFH".h, "#ZFH".h, z31.h     \n\t" \
+" add   "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
+" add   x28, "#CTEMP", "#CRS2"        \n\t" \
+" ld1h  z31.s, "#PT"/z, ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \
+" ld1h  "#ZLH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1]   \n\t" \
+" revh  "#ZLH".s, "#PT"/m, "#ZLH".s                     \n\t" \
+" fadd  "#ZLH".h, "#ZLH".h, z31.h     \n\t" \
+" add  "#CADDR", "#CADDR", "#CCS"     \n\t"
+
+#define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \
+" add   x28, "#CADDR", "#CRS2"        \n\t" \
+" st1h  "#ZFH".s, "#PT", ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \
+" revh  "#ZFH".s, "#PT"/m, "#ZFH".s   \n\t" \
+" st1h  "#ZFH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1]      \n\t" \
+" add   "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
+" add   x28, "#CTEMP", "#CRS2"        \n\t" \
+" st1h  "#ZLH".s, "#PT", ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \
+" revh  "#ZLH".s, "#PT"/m, "#ZLH".s   \n\t" \
+" st1h  "#ZLH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1]      \n\t" \
+" add   "#CADDR", "#CADDR", "#CCS"    \n\t"
+
+
+void bli_shgemm_armsve_asm_2vx10_unindexed
+     (
+       dim_t               k0,
+       void*      restrict alpha,
+       void*      restrict a,
+       void*      restrict b,
+       void*      restrict beta,
+       void*      restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+  void* a_next = bli_auxinfo_next_a( data );
+  void* b_next = bli_auxinfo_next_b( data );
+
+  // Typecast local copies of integers in case dim_t and inc_t are a
+  // different size than is expected by load instructions.
+  uint64_t k_mker = k0 / 4;
+  uint64_t k_left = k0 % 4;
+  uint64_t rs_c   = rs_c0;
+  uint64_t cs_c   = cs_c0;
+
+  __asm__ volatile (
+" ldr             x0, %[a]                        \n\t"
+" ldr             x1, %[b]                        \n\t"
+" mov             x2, xzr                         \n\t"
+" inch            x2, ALL, MUL #2                 \n\t" // Column-skip of A.
+" mov             x3, #10                         \n\t" // Row-skip of B.
+"                                                 \n\t"
+" ldr             x5, %[c]                        \n\t"
+" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
+" ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
+#ifdef _A64FX
+" mov             x8, 0x3                         \n\t" // Tag C address.
+" lsl             x8, x8, #56                     \n\t"
+" orr             x5, x5, x8                      \n\t"
+" mov             x8, 0x2                         \n\t" // Tag B address.
+" lsl             x8, x8, #56                     \n\t"
+" orr             x1, x1, x8                      \n\t"
+" mov             x8, 0x1                         \n\t" // Tag A address.
+" lsl             x8, x8, #56                     \n\t"
+" orr             x0, x0, x8                      \n\t"
+#endif
+"                                                 \n\t"
+" mov             x8, #2                          \n\t" // Multiply some address skips by sizeof(float16_t).
+" madd            x2, x8, x2, xzr                 \n\t" // cs_a
+" madd            x3, x8, x3, xzr                 \n\t" // rs_b
+" madd            x7, x8, x7, xzr                 \n\t" // cs_c
+" ptrue           p0.b                            \n\t"
+"                                                 \n\t"
+" ldr             x4, %[k_mker]                   \n\t" // Number of loops.
+" ldr             x8, %[k_left]                   \n\t"
+"                                                 \n\t"
+" LOAD_ABC:                                       \n\t"
+" cmp             x4, #0                          \n\t" // Don't preload if no microkernel there.
+" b.eq            END_CCOL_PRFM                   \n\t"
+
+" ld1rh           z20.h, p0/z, [x1]               \n\t" // Load 8/10 of first B row.
+" ld1rh           z21.h, p0/z, [x1, 2]            \n\t"
+" ld1rh           z22.h, p0/z, [x1, 4]            \n\t"
+" ld1rh           z23.h, p0/z, [x1, 6]            \n\t"
+" ld1rh           z24.h, p0/z, [x1, 8]            \n\t"
+" ld1rh           z25.h, p0/z, [x1, 10]           \n\t"
+" ld1rh           z26.h, p0/z, [x1, 12]           \n\t"
+" ld1rh           z27.h, p0/z, [x1, 14]           \n\t"
+"                                                 \n\t"
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+"                                                 \n\t"
+" CCOL_PRFM:                                      \n\t"
+" cmp             x6, #1                          \n\t"
+" b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
+" mov             x16, x5                         \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" END_CCOL_PRFM:                                  \n\t"
+"                                                 \n\t"
+CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
+"                                                 \n\t"
+" cmp             x4, #0                          \n\t" // If no 4-microkernel can be applied
+" b.eq            K_LEFT_LOOP                     \n\t"
+"                                                 \n\t"
+" K_MKER_LOOP:                                    \n\t"
+"                                                 \n\t"
+" add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+"                                                 \n\t"
+" add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+"                                                 \n\t"
+" add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+"                                                 \n\t"
+" subs            x4, x4, #1                      \n\t" // Decrease counter before final replica.
+" b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
+"                                                 \n\t"
+" add             x0, x0, x2                      \n\t" // Forward A's address to the next column.
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+" b               K_MKER_LOOP                     \n\t"
+"                                                 \n\t"
+" FIN_MKER_LOOP:                                  \n\t"
+GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3)
+" add             x0, x0, x2                      \n\t" // Forward A to fill the blank.
+"                                                 \n\t"
+" K_LEFT_LOOP:                                    \n\t"
+" cmp             x8, #0                          \n\t" // End of execution.
+" b.eq            WRITE_MEM_PREP                  \n\t"
+"                                                 \n\t"
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0)
+" ld1rh           z20.h, p0/z, [x1]               \n\t" // Load 8/10 of first B row.
+" ld1rh           z21.h, p0/z, [x1, 2]            \n\t"
+" ld1rh           z22.h, p0/z, [x1, 4]            \n\t"
+" ld1rh           z23.h, p0/z, [x1, 6]            \n\t"
+" ld1rh           z24.h, p0/z, [x1, 8]            \n\t"
+" ld1rh           z25.h, p0/z, [x1, 10]           \n\t"
+" ld1rh           z26.h, p0/z, [x1, 12]           \n\t"
+" ld1rh           z27.h, p0/z, [x1, 14]           \n\t"
+" ld1rh           z28.h, p0/z, [x1, 16]           \n\t"
+" ld1rh           z29.h, p0/z, [x1, 18]           \n\t"
+GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
+GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
+GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
+GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
+GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
+GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
+GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
+GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
+GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
+GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
+" add             x0, x0, x2                      \n\t" // Forward A.
+" add             x1, x1, x3                      \n\t" // Forward B.
+" sub             x8, x8, #1                      \n\t"
+" b               K_LEFT_LOOP                     \n\t" // Next column / row.
+"                                                 \n\t"
+" WRITE_MEM_PREP:                                 \n\t"
+"                                                 \n\t"
+" ldr             x4, %[alpha]                    \n\t" // Load alpha & beta (address).
+" ldr             x8, %[beta]                     \n\t"
+" ld1rh           z30.h, p0/z, [x4]               \n\t" // Load alpha & beta into vectors.
+" ld1rh           z31.h, p0/z, [x8]               \n\t"
+" fmov            w4, h28                         \n\t" // Copy alpha & beta to GP registers.
+" fmov            w8, h29                         \n\t"
+"                                                 \n\t"
+" PREFETCH_ABNEXT:                                \n\t"
+" ldr             x0, %[a_next]                   \n\t"
+" ldr             x1, %[b_next]                   \n\t"
+" prfm            PLDL2KEEP, [x0]                 \n\t"
+" prfm            PLDL2KEEP, [x0, 256*1]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*2]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*3]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*4]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*5]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*6]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*7]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*8]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*9]          \n\t"
+" prfm            PLDL2KEEP, [x0, 256*10]         \n\t"
+" prfm            PLDL2KEEP, [x0, 256*11]         \n\t"
+" prfm            PLDL2KEEP, [x0, 256*12]         \n\t"
+" prfm            PLDL2KEEP, [x0, 256*13]         \n\t"
+" prfm            PLDL2KEEP, [x0, 256*14]         \n\t"
+" prfm            PLDL2KEEP, [x0, 256*15]         \n\t"
+" prfm            PLDL2KEEP, [x1]                 \n\t"
+" prfm            PLDL2KEEP, [x1, 256*1]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*2]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*3]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*4]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*5]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*6]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*7]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*8]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*9]          \n\t"
+"                                                 \n\t"
+" WRITE_MEM:                                      \n\t"
+"                                                 \n\t"
+" fmov            h28, #1.0                       \n\t"
+" fmov            w16, h28                        \n\t"
+" cmp             w16, w4                         \n\t"
+" b.eq            UNIT_ALPHA                      \n\t"
+"                                                 \n\t"
+SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
+"                                                 \n\t"
+" UNIT_ALPHA:                                     \n\t"
+" mov             x9, x5                          \n\t" // C address for loading.
+"                                                 \n\t" // C address for storing is x5 itself.
+" cmp             x6, #1                          \n\t"
+" b.ne            WRITE_MEM_G                     \n\t"
+"                                                 \n\t"
+" WRITE_MEM_C:                                    \n\t" // Available scratch: Z[20-30].
+"                                                 \n\t" // Here used scratch: Z[20-29].
+GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
+GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
+GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7)
+"                                                 \n\t"
+GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
+" b               END_WRITE_MEM                   \n\t"
+"                                                 \n\t"
+" WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
+"                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
+" mov             x10, xzr                        \n\t"
+" incb            x10                             \n\t"
+" madd            x10, x10, x6, xzr               \n\t" // C-column's logical 1-vector skip.
+" mov             x28, #2                         \n\t"
+" madd            x6, x28, x6, xzr                \n\t" // Double index skip for half-precision case.
+" index           z30.s, wzr, w6                  \n\t" // Skips passed to index is not multiplied by 8.
+GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x9,x7,x10,x16)
+" dup             z31.h, w8                       \n\t" // Restore beta destroyed by loading.
+GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
+GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x9,x7,x10,x16)
+"                                                 \n\t"
+" dup             z31.h, w8                       \n\t" // Restore beta destroyed by loading.
+GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x5,x7,x10,x16)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x5,x7,x10,x16)
+"                                                 \n\t"
+" END_WRITE_MEM:                                  \n\t"
+" b               END_EXEC                        \n\t"
+"                                                 \n\t"
+" END_ERROR:                                      \n\t"
+" mov             x0, #1                          \n\t" // Return error.
+" END_EXEC:                                       \n\t"
+" mov             x0, #0                          \n\t" // Return normal.
+:
+: [a]      "m" (a),
+  [b]      "m" (b),
+  [c]      "m" (c),
+  [rs_c]   "m" (rs_c),
+  [cs_c]   "m" (cs_c),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha]  "m" (alpha),
+  [beta]   "m" (beta),
+  [a_next] "m" (a_next),
+  [b_next] "m" (b_next)
+: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
+  "x9","x16","x10","x28",
+  "z0","z1","z2","z3","z4","z5","z6","z7",
+  "z8","z9","z10","z11","z12","z13","z14","z15",
+  "z16","z17","z18","z19",
+  "z20","z21","z22","z23",
+  "z24","z25","z26","z27",
+  "z28","z29","z30","z31"
+   );
+}
+
--- a/kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c
+++ b/kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c
@@ -0,0 +1,450 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+// Separate instantiation for ArmSVE reference kernels.
+// Temporary workaround. Will be removed after upstream has switched to a better way
+//  of exposing gemmsup interface.
+
+//
+// -- Row storage case ---------------------------------------------------------
+//
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname, arch, suf ) \
+\
+void PASTEMAC3(ch,opname,arch,suf) \
+     ( \
+       conj_t              conja, \
+       conj_t              conjb, \
+       dim_t               m, \
+       dim_t               n, \
+       dim_t               k, \
+       ctype*     restrict alpha, \
+       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
+       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
+       ctype*     restrict beta, \
+       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
+       auxinfo_t* restrict data, \
+       cntx_t*    restrict cntx  \
+     ) \
+{ \
+	/* NOTE: This microkernel can actually handle arbitrarily large
+       values of m, n, and k. */ \
+\
+	if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
+	{ \
+		/* Traverse c by rows. */ \
+		for ( dim_t i = 0; i < m; ++i ) \
+		{ \
+			ctype* restrict ci = &c[ i*rs_c ]; \
+			ctype* restrict ai = &a[ i*rs_a ]; \
+\
+			for ( dim_t j = 0; j < n; ++j ) \
+			{ \
+				ctype* restrict cij = &ci[ j*cs_c ]; \
+				ctype* restrict bj  = &b [ j*cs_b ]; \
+				ctype           ab; \
+\
+				PASTEMAC(ch,set0s)( ab ); \
+\
+				/* Perform a dot product to update the (i,j) element of c. */ \
+				for ( dim_t l = 0; l < k; ++l ) \
+				{ \
+					ctype* restrict aij = &ai[ l*cs_a ]; \
+					ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
+				} \
+\
+				/* If beta is one, add ab into c. If beta is zero, overwrite c
+				   with the result in ab. Otherwise, scale by beta and accumulate
+				   ab to c. */ \
+				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+				} \
+				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+				} \
+				else \
+				{ \
+					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+				} \
+			} \
+		} \
+	} \
+	else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
+	{ \
+		/* Traverse c by rows. */ \
+		for ( dim_t i = 0; i < m; ++i ) \
+		{ \
+			ctype* restrict ci = &c[ i*rs_c ]; \
+			ctype* restrict ai = &a[ i*rs_a ]; \
+\
+			for ( dim_t j = 0; j < n; ++j ) \
+			{ \
+				ctype* restrict cij = &ci[ j*cs_c ]; \
+				ctype* restrict bj  = &b [ j*cs_b ]; \
+				ctype           ab; \
+\
+				PASTEMAC(ch,set0s)( ab ); \
+\
+				/* Perform a dot product to update the (i,j) element of c. */ \
+				for ( dim_t l = 0; l < k; ++l ) \
+				{ \
+					ctype* restrict aij = &ai[ l*cs_a ]; \
+					ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+					PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
+				} \
+\
+				/* If beta is one, add ab into c. If beta is zero, overwrite c
+				   with the result in ab. Otherwise, scale by beta and accumulate
+				   ab to c. */ \
+				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+				} \
+				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+				} \
+				else \
+				{ \
+					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+				} \
+			} \
+		} \
+	} \
+	else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
+	{ \
+		/* Traverse c by rows. */ \
+		for ( dim_t i = 0; i < m; ++i ) \
+		{ \
+			ctype* restrict ci = &c[ i*rs_c ]; \
+			ctype* restrict ai = &a[ i*rs_a ]; \
+\
+			for ( dim_t j = 0; j < n; ++j ) \
+			{ \
+				ctype* restrict cij = &ci[ j*cs_c ]; \
+				ctype* restrict bj  = &b [ j*cs_b ]; \
+				ctype           ab; \
+\
+				PASTEMAC(ch,set0s)( ab ); \
+\
+				/* Perform a dot product to update the (i,j) element of c. */ \
+				for ( dim_t l = 0; l < k; ++l ) \
+				{ \
+					ctype* restrict aij = &ai[ l*cs_a ]; \
+					ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+					PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
+				} \
+\
+				/* If beta is one, add ab into c. If beta is zero, overwrite c
+				   with the result in ab. Otherwise, scale by beta and accumulate
+				   ab to c. */ \
+				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+				} \
+				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+				} \
+				else \
+				{ \
+					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+				} \
+			} \
+		} \
+	} \
+	else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
+	{ \
+		/* Traverse c by rows. */ \
+		for ( dim_t i = 0; i < m; ++i ) \
+		{ \
+			ctype* restrict ci = &c[ i*rs_c ]; \
+			ctype* restrict ai = &a[ i*rs_a ]; \
+\
+			for ( dim_t j = 0; j < n; ++j ) \
+			{ \
+				ctype* restrict cij = &ci[ j*cs_c ]; \
+				ctype* restrict bj  = &b [ j*cs_b ]; \
+				ctype           ab; \
+\
+				PASTEMAC(ch,set0s)( ab ); \
+\
+				/* Perform a dot product to update the (i,j) element of c. */ \
+				for ( dim_t l = 0; l < k; ++l ) \
+				{ \
+					ctype* restrict aij = &ai[ l*cs_a ]; \
+					ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
+				} \
+\
+				/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
+				PASTEMAC(ch,conjs)( ab ); \
+\
+				/* If beta is one, add ab into c. If beta is zero, overwrite c
+				   with the result in ab. Otherwise, scale by beta and accumulate
+				   ab to c. */ \
+				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+				} \
+				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+				} \
+				else \
+				{ \
+					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+				} \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC2( gemmsup_r, _armsve, _ref2 )
+
+//
+// -- Column storage case ------------------------------------------------------
+//
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname, arch, suf ) \
+\
+void PASTEMAC3(ch,opname,arch,suf) \
+     ( \
+       conj_t              conja, \
+       conj_t              conjb, \
+       dim_t               m, \
+       dim_t               n, \
+       dim_t               k, \
+       ctype*     restrict alpha, \
+       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
+       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
+       ctype*     restrict beta, \
+       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
+       auxinfo_t* restrict data, \
+       cntx_t*    restrict cntx  \
+     ) \
+{ \
+	/* NOTE: This microkernel can actually handle arbitrarily large
+       values of m, n, and k. */ \
+\
+	if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
+	{ \
+		/* Traverse c by columns. */ \
+		for ( dim_t j = 0; j < n; ++j ) \
+		{ \
+			ctype* restrict cj = &c[ j*cs_c ]; \
+			ctype* restrict bj = &b[ j*cs_b ]; \
+\
+			for ( dim_t i = 0; i < m; ++i ) \
+			{ \
+				ctype* restrict cij = &cj[ i*rs_c ]; \
+				ctype* restrict ai  = &a [ i*rs_a ]; \
+				ctype           ab; \
+\
+				PASTEMAC(ch,set0s)( ab ); \
+\
+				/* Perform a dot product to update the (i,j) element of c. */ \
+				for ( dim_t l = 0; l < k; ++l ) \
+				{ \
+					ctype* restrict aij = &ai[ l*cs_a ]; \
+					ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
+				} \
+\
+				/* If beta is one, add ab into c. If beta is zero, overwrite c
+				   with the result in ab. Otherwise, scale by beta and accumulate
+				   ab to c. */ \
+				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+				} \
+				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+				} \
+				else \
+				{ \
+					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+				} \
+			} \
+		} \
+	} \
+	else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
+	{ \
+		/* Traverse c by columns. */ \
+		for ( dim_t j = 0; j < n; ++j ) \
+		{ \
+			ctype* restrict cj = &c[ j*cs_c ]; \
+			ctype* restrict bj = &b[ j*cs_b ]; \
+\
+			for ( dim_t i = 0; i < m; ++i ) \
+			{ \
+				ctype* restrict cij = &cj[ i*rs_c ]; \
+				ctype* restrict ai  = &a [ i*rs_a ]; \
+				ctype           ab; \
+\
+				PASTEMAC(ch,set0s)( ab ); \
+\
+				/* Perform a dot product to update the (i,j) element of c. */ \
+				for ( dim_t l = 0; l < k; ++l ) \
+				{ \
+					ctype* restrict aij = &ai[ l*cs_a ]; \
+					ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+					PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
+				} \
+\
+				/* If beta is one, add ab into c. If beta is zero, overwrite c
+				   with the result in ab. Otherwise, scale by beta and accumulate
+				   ab to c. */ \
+				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+				} \
+				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+				} \
+				else \
+				{ \
+					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+				} \
+			} \
+		} \
+	} \
+	else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
+	{ \
+		/* Traverse c by columns. */ \
+		for ( dim_t j = 0; j < n; ++j ) \
+		{ \
+			ctype* restrict cj = &c[ j*cs_c ]; \
+			ctype* restrict bj = &b[ j*cs_b ]; \
+\
+			for ( dim_t i = 0; i < m; ++i ) \
+			{ \
+				ctype* restrict cij = &cj[ i*rs_c ]; \
+				ctype* restrict ai  = &a [ i*rs_a ]; \
+				ctype           ab; \
+\
+				PASTEMAC(ch,set0s)( ab ); \
+\
+				/* Perform a dot product to update the (i,j) element of c. */ \
+				for ( dim_t l = 0; l < k; ++l ) \
+				{ \
+					ctype* restrict aij = &ai[ l*cs_a ]; \
+					ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+					PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
+				} \
+\
+				/* If beta is one, add ab into c. If beta is zero, overwrite c
+				   with the result in ab. Otherwise, scale by beta and accumulate
+				   ab to c. */ \
+				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+				} \
+				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+				} \
+				else \
+				{ \
+					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+				} \
+			} \
+		} \
+	} \
+	else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
+	{ \
+		/* Traverse c by columns. */ \
+		for ( dim_t j = 0; j < n; ++j ) \
+		{ \
+			ctype* restrict cj = &c[ j*cs_c ]; \
+			ctype* restrict bj = &b[ j*cs_b ]; \
+\
+			for ( dim_t i = 0; i < m; ++i ) \
+			{ \
+				ctype* restrict cij = &cj[ i*rs_c ]; \
+				ctype* restrict ai  = &a [ i*rs_a ]; \
+				ctype           ab; \
+\
+				PASTEMAC(ch,set0s)( ab ); \
+\
+				/* Perform a dot product to update the (i,j) element of c. */ \
+				for ( dim_t l = 0; l < k; ++l ) \
+				{ \
+					ctype* restrict aij = &ai[ l*cs_a ]; \
+					ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+					PASTEMAC(ch,dots)( *aij, *bij, ab ); \
+				} \
+\
+				/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
+				PASTEMAC(ch,conjs)( ab ); \
+\
+				/* If beta is one, add ab into c. If beta is zero, overwrite c
+				   with the result in ab. Otherwise, scale by beta and accumulate
+				   ab to c. */ \
+				if ( PASTEMAC(ch,eq1)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+				} \
+				else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+				{ \
+					PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+				} \
+				else \
+				{ \
+					PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+				} \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC2( gemmsup_c, _armsve, _ref2 )
+
--- a/kernels/armsve/3/sup/bli_gemmsup_cv_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/sup/bli_gemmsup_cv_armsve_asm_d2vx10_unindexed.c
@@ -0,0 +1,528 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+#include "blis.h"
+#include <assert.h>
+
+// Double-precision composite instructions.
+#include "../armsve_asm_macros_double.h"
+
+// 2vx10 microkernels.
+#include "../armsve_asm_2vx10.h"
+
+// Prototype reference kernel.
+GEMMSUP_KER_PROT( double,   d, gemmsup_c_armsve_ref2 )
+
+void __attribute__ ((noinline,optimize(0))) bli_dgemmsup_cv_armsve_2vx10_unindexed
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+  static int called = 0;
+  if ( !called )
+  {
+    fprintf(stderr, "rv called.\n");
+    called = 1;
+  }
+  // c*c requires A to be stored in columns.
+  assert( rs_a0 == 1 );
+
+  dim_t n0_mker = n0 / 10;
+  dim_t n0_left = n0 % 10;
+
+  if ( n0_left )
+  {
+    // A[:, ::]
+    // B[::, n0_mker*10:n0]
+    // C[: , n0_mker*10:n0]
+    double *ai = a;
+    double *bi = b + n0_mker * 10 * cs_b0;
+    double *ci = c + n0_mker * 10 * cs_c0;
+    bli_dgemmsup_c_armsve_ref2
+    (
+      conja, conjb,
+      m0, n0_left, k0,
+      alpha,
+      ai, rs_a0, cs_a0,
+      bi, rs_b0, cs_b0,
+      beta,
+      ci, rs_c0, cs_c0,
+      data,
+      cntx
+    );
+  }
+  // Return if it's a pure edge case.
+  if ( !n0_mker )
+    return;
+
+  // Determine VL.
+  uint64_t vlen2;
+  __asm__ (
+    " mov  x0, xzr          \n\t"
+    " incd x0, ALL, MUL #2  \n\t"
+    " mov  %[vlen2], x0     \n\t"
+  : [vlen2] "=r" (vlen2)
+  :
+  : "x0"
+   );
+
+  uint64_t rs_c   = rs_c0;
+  uint64_t cs_c   = cs_c0;
+  // uint64_t rs_a   = 1;
+  uint64_t cs_a   = cs_a0;
+  uint64_t rs_b   = rs_b0;
+  uint64_t cs_b   = cs_b0;
+
+  uint64_t k_mker = k0 / 4;
+  uint64_t k_left = k0 % 4;
+  uint64_t n_mker = n0_mker;
+
+  dim_t m0_mker = m0 / vlen2;
+  dim_t m0_left = m0 % vlen2;
+  if ( m0_left )
+  {
+    // Edge case on A side can be handled with one more (predicated) loop.
+    m0_mker++;
+  } else
+    m0_left = vlen2;
+  // uint64_t ps_a = bli_auxinfo_ps_a( data );
+  uint64_t ps_b = bli_auxinfo_ps_b( data );
+
+  for ( dim_t im0_mker = 0; im0_mker < m0_mker; ++im0_mker )
+  {
+    uint64_t m_curr = vlen2;
+    if ( im0_mker == m0_mker - 1 )
+    {
+      // Last m-loop. Maybe unnecessary.
+      m_curr = m0_left;
+    }
+    double *ai = a + im0_mker * vlen2 * rs_a0;
+    double *bi = b;
+    double *ci = c + im0_mker * vlen2 * rs_c0;
+
+    void* a_next = bli_auxinfo_next_a( data );
+    void* b_next = bli_auxinfo_next_b( data );
+
+    __asm__ volatile (
+" ldr             x0, %[bi]                       \n\t"
+" ldr             x1, %[rs_b]                     \n\t" // Row-skip of B.
+" ldr             x2, %[cs_b]                     \n\t" // Column-skip of B (element skip of B[l, :]).
+" ldr             x3, %[ps_b]                     \n\t" // Panel-skip (10*k) of B.
+" ldr             x4, %[cs_a]                     \n\t" // Column-Skip of A.
+"                                                 \n\t" // Element skip of A[:, l] is guaranteed to be 1.
+" ldr             x5, %[ci]                       \n\t"
+" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
+" ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
+#ifdef _A64FX
+" mov             x16, 0x1                        \n\t" // Tag C address.
+" lsl             x16, x16, #56                   \n\t"
+" orr             x5, x5, x16                     \n\t"
+" mov             x16, 0x2                        \n\t" // Tag B address.
+" lsl             x16, x16, #56                   \n\t"
+" orr             x0, x0, x16                     \n\t"
+#endif
+"                                                 \n\t"
+" mov             x8, #8                          \n\t" // Multiply some address skips by sizeof(double).
+" madd            x1, x8, x1, xzr                 \n\t" // rs_b
+" madd            x2, x8, x2, xzr                 \n\t" // cs_b
+" madd            x3, x8, x3, xzr                 \n\t" // ps_b
+" madd            x4, x8, x4, xzr                 \n\t" // cs_a
+" madd            x7, x8, x7, xzr                 \n\t" // cs_c
+" mov             x8, #4                          \n\t"
+" madd            x15, x8, x4, xzr                \n\t" // Logical K=4 microkernel skip for A.
+"                                                 \n\t"
+#ifdef _A64FX
+" mov             x16, 0x20                       \n\t" // Higher 6bit for Control#2:
+" lsl             x16, x16, #58                   \n\t" // Valid|Strong|Strong|NoAlloc|Load|Strong
+" orr             x16, x16, x4                    \n\t" // Stride.
+" msr             S3_3_C11_C6_2, x16              \n\t" // Write system register.
+#endif
+"                                                 \n\t"
+" ldr             x8, %[m_curr]                   \n\t" // Size of first dimension.
+" mov             x9, xzr                         \n\t"
+" incd            x9                              \n\t"
+" ptrue           p0.d                            \n\t"
+" whilelo         p1.d, xzr, x8                   \n\t"
+" whilelo         p2.d, x9, x8                    \n\t"
+"                                                 \n\t"
+" ldr             x8, %[n_mker]                   \n\t" // Number of N-loops.
+"                                                 \n\t"
+" ldr             x20, %[ai]                      \n\t" // Parameters to be reloaded
+" ldr             x21, %[k_mker]                  \n\t" //  within each millikernel loop.
+" ldr             x22, %[k_left]                  \n\t"
+" ldr             x23, %[alpha]                   \n\t"
+" ldr             x24, %[beta]                    \n\t"
+" ldr             x25, %[a_next]                  \n\t"
+" ldr             x26, %[b_next]                  \n\t"
+" ldr             x23, [x23]                      \n\t" // Directly load alpha and beta.
+" ldr             x24, [x24]                      \n\t"
+"                                                 \n\t"
+" MILLIKER_MLOOP:                                 \n\t"
+"                                                 \n\t"
+" mov             x11, x0                         \n\t" // B's address.
+// " ldr             x10, %[ai]                      \n\t" // A's address.
+" mov             x10, x20                        \n\t"
+// " ldr             x12, %[k_mker]                  \n\t"
+" mov             x12, x21                        \n\t"
+// " ldr             x13, %[k_left]                  \n\t"
+" mov             x13, x22                        \n\t"
+#ifdef _A64FX
+" mov             x16, 0x3                        \n\t" // Tag A address.
+" lsl             x16, x16, #56                   \n\t"
+" orr             x10, x10, x16                   \n\t"
+" mov             x16, 0xa                        \n\t" // Control#2 for A address.
+" lsl             x16, x16, #60                   \n\t"
+" orr             x10, x10, x16                   \n\t"
+#endif
+"                                                 \n\t"
+" cmp             x12, #0                         \n\t" // Don't preload if no microkernel there.
+" b.eq            END_CCOL_PRFM                   \n\t"
+"                                                 \n\t"
+" mov             x14, x11                        \n\t"
+" ld1rd           z20.d, p0/z, [x14]              \n\t" // Load 8/10 of first B row.
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z21.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z22.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z23.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z24.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z25.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z26.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z27.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" prfm            PLDL1KEEP, [x14]                \n\t" // And prefetch the 2/10 left.
+" add             x14, x14, x2                    \n\t"
+" prfm            PLDL1KEEP, [x14]                \n\t"
+" sub             x14, x14, x2                    \n\t" // Restore x14 to load edge.
+"                                                 \n\t"
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p1,p2,x10)
+" add             x16, x10, x4                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t" // Prefetch 3/4 of A.
+" add             x16, x10, x4                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x10, x4                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+"                                                 \n\t"
+" CCOL_PRFM:                                      \n\t"
+" cmp             x6, #1                          \n\t"
+" b.ne            END_CCOL_PRFM                   \n\t" // Do not prefetch for generic C storage.
+" mov             x16, x5                         \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" add             x16, x16, x7                    \n\t"
+" prfm            PLDL1STRM, [x16]                \n\t"
+" END_CCOL_PRFM:                                  \n\t"
+"                                                 \n\t"
+CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
+"                                                 \n\t"
+" cmp             x12, #0                         \n\t" // If no 4-microkernel can be applied
+" b.eq            K_LEFT_LOOP                     \n\t"
+"                                                 \n\t"
+" K_MKER_LOOP:                                    \n\t"
+"                                                 \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_G_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
+"                                                 \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_G_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
+"                                                 \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_G_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
+"                                                 \n\t"
+" subs            x12, x12, #1                    \n\t" // Decrease counter before final replica.
+" b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
+"                                                 \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_G_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
+" b               K_MKER_LOOP                     \n\t"
+"                                                 \n\t"
+" FIN_MKER_LOOP:                                  \n\t"
+GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
+" add             x10, x10, x4                    \n\t" // Forward A to fill the blank.
+"                                                 \n\t"
+" K_LEFT_LOOP:                                    \n\t"
+" cmp             x13, #0                         \n\t" // End of execution.
+" b.eq            WRITE_MEM_PREP                  \n\t"
+"                                                 \n\t"
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p1,p2,x10)
+" mov             x14, x11                        \n\t"
+" ld1rd           z20.d, p0/z, [x14]              \n\t" // Load 10/10 B.
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z21.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z22.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z23.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z24.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z25.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z26.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z27.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z28.d, p0/z, [x14]              \n\t"
+" add             x14, x14, x2                    \n\t"
+" ld1rd           z29.d, p0/z, [x14]              \n\t"
+GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
+GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
+GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
+GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
+GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
+GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
+GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
+GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
+GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
+GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
+" add             x10, x10, x4                    \n\t" // Forward A.
+" add             x11, x11, x1                    \n\t" // Forward B.
+" sub             x13, x13, #1                    \n\t"
+" b               K_LEFT_LOOP                     \n\t" // Next column / row.
+"                                                 \n\t"
+" WRITE_MEM_PREP:                                 \n\t"
+"                                                 \n\t"
+// " ldr             x10, %[ai]                      \n\t"
+" mov             x10, x20                        \n\t"
+" add             x11, x0, x3                     \n\t"
+" dup             z30.d, x23                      \n\t" // Broadcast alpha & beta into vectors.
+" dup             z31.d, x24                      \n\t"
+"                                                 \n\t"
+" cmp             x8, #1                          \n\t"
+" b.eq            PREFETCH_ABNEXT                 \n\t"
+" prfm            PLDL1STRM, [x10]                \n\t"
+" prfm            PLDL1KEEP, [x11]                \n\t"
+" add             x11, x11, x2                    \n\t"
+" prfm            PLDL1KEEP, [x11]                \n\t"
+" add             x11, x11, x2                    \n\t"
+" prfm            PLDL1KEEP, [x11]                \n\t"
+" add             x11, x11, x2                    \n\t"
+" prfm            PLDL1KEEP, [x11]                \n\t"
+" add             x11, x11, x2                    \n\t"
+" prfm            PLDL1KEEP, [x11]                \n\t"
+" add             x11, x11, x2                    \n\t"
+" prfm            PLDL1KEEP, [x11]                \n\t"
+" add             x11, x11, x2                    \n\t"
+" prfm            PLDL1KEEP, [x11]                \n\t"
+" add             x11, x11, x2                    \n\t"
+" prfm            PLDL1KEEP, [x11]                \n\t"
+" add             x11, x11, x2                    \n\t"
+" prfm            PLDL1KEEP, [x11]                \n\t"
+" add             x11, x11, x2                    \n\t"
+" prfm            PLDL1KEEP, [x11]                \n\t"
+" b               WRITE_MEM                       \n\t"
+"                                                 \n\t"
+" PREFETCH_ABNEXT:                                \n\t"
+// " ldr             x1, %[a_next]                   \n\t" // Final Millikernel loop, x1 and x2 not needed.
+" mov             x1, x25                         \n\t"
+// " ldr             x2, %[b_next]                   \n\t"
+" mov             x2, x26                         \n\t"
+" prfm            PLDL2KEEP, [x1]                 \n\t"
+" prfm            PLDL2KEEP, [x1, 256*1]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*2]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*3]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*4]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*5]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*6]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*7]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*8]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*9]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*10]         \n\t"
+" prfm            PLDL2KEEP, [x1, 256*11]         \n\t"
+" prfm            PLDL2KEEP, [x1, 256*12]         \n\t"
+" prfm            PLDL2KEEP, [x1, 256*13]         \n\t"
+" prfm            PLDL2KEEP, [x1, 256*14]         \n\t"
+" prfm            PLDL2KEEP, [x1, 256*15]         \n\t"
+" prfm            PLDL2KEEP, [x2]                 \n\t"
+" prfm            PLDL2KEEP, [x2, 256*1]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*2]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*3]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*4]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*5]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*6]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*7]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*8]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*9]          \n\t"
+"                                                 \n\t"
+" WRITE_MEM:                                      \n\t"
+"                                                 \n\t"
+" fmov            d28, #1.0                       \n\t"
+" fmov            x16, d28                        \n\t"
+" cmp             x16, x23                        \n\t"
+" b.eq            UNIT_ALPHA                      \n\t"
+"                                                 \n\t"
+SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
+"                                                 \n\t"
+" UNIT_ALPHA:                                     \n\t"
+" mov             x9, x5                          \n\t" // C address for loading.
+"                                                 \n\t" // C address for storing is x5 itself.
+" cmp             x6, #1                          \n\t"
+" b.ne            WRITE_MEM_G                     \n\t"
+"                                                 \n\t"
+" WRITE_MEM_C:                                    \n\t" // Available scratch: Z[20-30].
+"                                                 \n\t" // Here used scratch: Z[20-29].
+" mov             x13, xzr                        \n\t" // C-column's physical 1-vector skip.
+" incb            x13                             \n\t"
+GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
+GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
+GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
+"                                                 \n\t"
+GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x5,x7)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x5,x7)
+" b               END_WRITE_MEM                   \n\t"
+"                                                 \n\t"
+" WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
+"                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
+" mov             x12, xzr                        \n\t"
+" incb            x12                             \n\t"
+" madd            x13, x12, x6, xzr               \n\t" // C-column's logical 1-vector skip.
+" index           z30.d, xzr, x6                  \n\t" // Skips passed to index is not multiplied by 8.
+GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
+GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
+GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
+"                                                 \n\t"
+GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x5,x7,x13,x16)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x5,x7,x13,x16)
+"                                                 \n\t"
+" END_WRITE_MEM:                                  \n\t"
+" subs            x8, x8, #1                      \n\t"
+" b.eq            END_EXEC                        \n\t"
+"                                                 \n\t" // Address of C already forwarded to next column.
+" add             x0, x0, x3                      \n\t" // Forward B's base address to the next logic panel.
+" b               MILLIKER_MLOOP                  \n\t"
+"                                                 \n\t"
+" END_ERROR:                                      \n\t"
+" mov             x0, #1                          \n\t" // Return error.
+" END_EXEC:                                       \n\t"
+" mov             x0, #0                          \n\t" // Return normal.
+:
+: [bi]     "m" (bi),
+  [rs_b]   "m" (rs_b),
+  [cs_b]   "m" (cs_b),
+  [ps_b]   "m" (ps_b),
+  [cs_a]   "m" (cs_a),
+  [ci]     "m" (ci),
+  [rs_c]   "m" (rs_c),
+  [cs_c]   "m" (cs_c),
+  [m_curr] "m" (m_curr),
+  [n_mker] "m" (n_mker),
+  [ai]     "m" (ai),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha]  "m" (alpha),
+  [beta]   "m" (beta),
+  [a_next] "m" (a_next),
+  [b_next] "m" (b_next)
+: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
+  "x9","x10","x11","x12","x13","x14","x15","x16","x17",
+  "x20","x21","x22","x23","x24","x25","x26",
+  "z0","z1","z2","z3","z4","z5","z6","z7",
+  "z8","z9","z10","z11","z12","z13","z14","z15",
+  "z16","z17","z18","z19",
+  "z20","z21","z22","z23",
+  "z24","z25","z26","z27",
+  "z28","z29","z30","z31"
+     );
+  }
+}
+
+void bli_dgemmsup_rv_armsve_10x2v_unindexed
+     (
+       conj_t              conjat,
+       conj_t              conjbt,
+       dim_t               m0t,
+       dim_t               n0t,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict at, inc_t rs_at0, inc_t cs_at0,
+       double*    restrict bt, inc_t rs_bt0, inc_t cs_bt0,
+       double*    restrict beta,
+       double*    restrict ct, inc_t rs_ct0, inc_t cs_ct0,
+       auxinfo_t* restrict datat,
+       cntx_t*    restrict cntx
+     )
+{
+  auxinfo_t data;
+  bli_auxinfo_set_next_a( bli_auxinfo_next_b( datat ), &data );
+  bli_auxinfo_set_next_b( bli_auxinfo_next_a( datat ), &data );
+  bli_auxinfo_set_ps_a( bli_auxinfo_ps_b( datat ), &data );
+  bli_auxinfo_set_ps_b( bli_auxinfo_ps_a( datat ), &data );
+  bli_dgemmsup_cv_armsve_2vx10_unindexed
+  (
+    conjbt, conjat,
+    n0t, m0t, k0,
+    alpha,
+    bt, cs_bt0, rs_bt0,
+    at, cs_at0, rs_at0,
+    beta,
+    ct, cs_ct0, rs_ct0,
+    &data,
+    cntx
+  );
+}
+
--- a/kernels/armsve/3/sup/bli_gemmsup_rv_armsve_asm_d2vx10_unindexed.c
+++ b/kernels/armsve/3/sup/bli_gemmsup_rv_armsve_asm_d2vx10_unindexed.c
@@ -0,0 +1,412 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+#include "blis.h"
+#include <assert.h>
+
+// Double-precision composite instructions.
+#include "../armsve_asm_macros_double.h"
+
+// 2vx10 microkernels.
+#include "../armsve_asm_2vx10.h"
+
+// Prototype reference kernel.
+GEMMSUP_KER_PROT( double,   d, gemmsup_r_armsve_ref2 )
+
+void __attribute__ ((optimize(0))) bli_dgemmsup_rv_armsve_2vx10_unindexed
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+  static int called = 0;
+  if ( !called )
+  {
+    fprintf(stderr, "rv called.\n");
+    called = 1;
+  }
+  // r*r requires B to be stored in rows.
+  assert(cs_b0 == 1);
+
+  dim_t n0_mker = n0 / 10;
+  dim_t n0_left = n0 % 10;
+
+  if ( n0_left )
+  {
+    // A[:, ::]
+    // B[::, n0_mker*10:n0]
+    // C[: , n0_mker*10:n0]
+    double *ai = a;
+    double *bi = b + n0_mker * 10 * cs_b0;
+    double *ci = c + n0_mker * 10 * cs_c0;
+    bli_dgemmsup_r_armsve_ref2
+    (
+      conja, conjb,
+      m0, n0_left, k0,
+      alpha,
+      ai, rs_a0, cs_a0,
+      bi, rs_b0, cs_b0,
+      beta,
+      ci, rs_c0, cs_c0,
+      data,
+      cntx
+    );
+  }
+  // Return if it's a pure edge case.
+  if ( !n0_mker )
+    return;
+
+  // Determine VL.
+  uint64_t vlen2;
+  __asm__ (
+    " mov  x0, xzr          \n\t"
+    " incd x0, ALL, MUL #2  \n\t"
+    " mov  %[vlen2], x0     \n\t"
+  : [vlen2] "=r" (vlen2)
+  :
+  : "x0"
+   );
+
+  uint64_t rs_c   = rs_c0;
+  uint64_t cs_c   = cs_c0;
+  uint64_t rs_a   = rs_a0;
+  uint64_t cs_a   = cs_a0;
+  uint64_t rs_b   = rs_b0;
+  // uint64_t cs_b   = 1;
+
+  uint64_t k_mker = k0 / 4;
+  uint64_t k_left = k0 % 4;
+  uint64_t m_mker = m0 / vlen2;
+  uint64_t m_left = m0 % vlen2;
+  if ( m_left )
+  {
+    // Edge case on A side can be handled with one more (predicated) loop.
+    m_mker++;
+  } else
+    m_left = vlen2;
+  uint64_t ps_a = bli_auxinfo_ps_a( data );
+  // uint64_t ps_b = bli_auxinfo_ps_b( data );
+
+  for ( dim_t in0_mker = 0; in0_mker < n0_mker; ++in0_mker )
+  {
+    double *ai = a;
+    double *bi = b + in0_mker * 10 * cs_b0;
+    double *ci = c + in0_mker * 10 * cs_c0;
+
+    void* a_next = bli_auxinfo_next_a( data );
+    void* b_next = bli_auxinfo_next_b( data );
+
+    __asm__ volatile (
+" ldr             x0, %[ai]                       \n\t"
+" ldr             x1, %[rs_a]                     \n\t" // Row-skip of A (element skip of A[:, l]).
+" ldr             x2, %[cs_a]                     \n\t" // Column-skip of A.
+" ldr             x3, %[ps_a]                     \n\t" // Panel-skip (vlen2*k) of A.
+" ldr             x4, %[rs_b]                     \n\t" // Row-Skip of B.
+"                                                 \n\t" // Element skip of B[l, :] is guaranteed to be 1.
+" ldr             x5, %[ci]                       \n\t"
+" ldr             x6, %[rs_c]                     \n\t" // Row-skip of C.
+" ldr             x7, %[cs_c]                     \n\t" // Column-skip of C.
+#ifdef _A64FX
+" mov             x16, 0x1                        \n\t" // Tag C address.
+" lsl             x16, x16, #56                   \n\t"
+" orr             x5, x5, x16                     \n\t"
+" mov             x16, 0x2                        \n\t" // Tag A address.
+" lsl             x16, x16, #56                   \n\t"
+" orr             x0, x0, x16                     \n\t"
+#endif
+"                                                 \n\t"
+" mov             x8, #8                          \n\t" // Multiply some address skips by sizeof(double).
+" madd            x2, x8, x2, xzr                 \n\t" // cs_a
+" madd            x3, x8, x3, xzr                 \n\t" // ps_a
+" madd            x4, x8, x4, xzr                 \n\t" // rs_b
+" madd            x7, x8, x7, xzr                 \n\t" // cs_c
+" mov             x8, xzr                         \n\t"
+" incb            x8                              \n\t"
+" madd            x14, x8, x1, xzr                \n\t" // A-column's logical 1-vector skip.
+" mov             x8, #4                          \n\t"
+" madd            x15, x8, x2, xzr                \n\t" // Logical K=4 microkernel skip for A.
+// " mov             x8, #4                          \n\t"
+// " madd            x17, x8, x4, xzr                \n\t" // Logical K=4 microkernel skip for B.
+"                                                 \n\t"
+" ldr             x8, %[m_mker]                   \n\t" // Number of M-loops.
+" ptrue           p0.d                            \n\t"
+" ptrue           p1.d                            \n\t"
+" ptrue           p2.d                            \n\t"
+"                                                 \n\t"
+" MILLIKER_MLOOP:                                 \n\t"
+"                                                 \n\t"
+" cmp             x8, #1                          \n\t"
+" b.ne            UKER_BEGIN                      \n\t"
+"                                                 \n\t"
+" ldr             x10, %[m_left]                  \n\t" // Final (incomplete) millikernel loop.
+" mov             x11, xzr                        \n\t"
+" incd            x11                             \n\t"
+" whilelo         p1.d, xzr, x10                  \n\t" // Overwrite p1/p2.
+" whilelo         p2.d, x11, x10                  \n\t"
+"                                                 \n\t"
+" UKER_BEGIN:                                     \n\t"
+" mov             x10, x0                         \n\t" // A's address.
+" ldr             x11, %[bi]                      \n\t" // B's address.
+" ldr             x12, %[k_mker]                  \n\t"
+" ldr             x13, %[k_left]                  \n\t"
+#ifdef _A64FX
+" mov             x16, 0x3                        \n\t" // Tag B address.
+" lsl             x16, x16, #56                   \n\t"
+" orr             x11, x11, x16                   \n\t"
+#endif
+"                                                 \n\t"
+" mov             x16, x11                        \n\t" // Prefetch first kernel of B.
+" prfm            PLDL1KEEP, [x16]                \n\t"
+" add             x16, x16, x4                    \n\t"
+" prfm            PLDL1KEEP, [x16]                \n\t"
+" add             x16, x16, x4                    \n\t"
+" prfm            PLDL1KEEP, [x16]                \n\t"
+" add             x16, x16, x4                    \n\t"
+" prfm            PLDL1KEEP, [x16]                \n\t"
+"                                                 \n\t"
+" ld1rd           z20.d, p0/z, [x11]              \n\t" // (Partial) first B row.
+" ld1rd           z21.d, p0/z, [x11, #8]          \n\t"
+" ld1rd           z22.d, p0/z, [x11, #16]         \n\t"
+" ld1rd           z23.d, p0/z, [x11, #24]         \n\t"
+" ld1rd           z24.d, p0/z, [x11, #32]         \n\t"
+" ld1rd           z25.d, p0/z, [x11, #40]         \n\t"
+" ld1rd           z26.d, p0/z, [x11, #48]         \n\t"
+" ld1rd           z27.d, p0/z, [x11, #56]         \n\t"
+"                                                 \n\t"
+" index           z29.d, xzr, x1                  \n\t" // First A column.
+"                                                 \n\t" // Skips passed to index is not multiplied by 8.
+GEMM_ACOL_GATHER_LOAD(z28,z29,z29,p1,p2,x10,x14,x16)
+"                                                 \n\t"
+CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
+"                                                 \n\t"
+" cmp             x12, #0                         \n\t" // If no 4-microkernel can be applied
+" b.eq            K_LEFT_LOOP                     \n\t"
+"                                                 \n\t"
+" K_MKER_LOOP:                                    \n\t" // Unroll the 4-loop.
+"                                                 \n\t"
+" index           z31.d, xzr, x1                  \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
+"                                                 \n\t"
+" index           z29.d, xzr, x1                  \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
+"                                                 \n\t"
+" index           z31.d, xzr, x1                  \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
+"                                                 \n\t"
+" subs            x12, x12, #1                    \n\t" // Decrease counter before final replica.
+" b.eq            FIN_MKER_LOOP                   \n\t" // Branch early to avoid reading excess mem.
+"                                                 \n\t"
+" index           z29.d, xzr, x1                  \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
+" b               K_MKER_LOOP                     \n\t"
+"                                                 \n\t"
+" FIN_MKER_LOOP:                                  \n\t"
+GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
+" add             x10, x10, x2                    \n\t" // Forward A to fill the blank.
+"                                                 \n\t"
+" K_LEFT_LOOP:                                    \n\t"
+" cmp             x13, #0                         \n\t"
+" b.eq            WRITE_MEM_PREP                  \n\t"
+"                                                 \n\t"
+" index           z31.d, xzr, x1                  \n\t"
+GEMM_ACOL_GATHER_LOAD(z30,z31,z31,p1,p2,x10,x14,x16)
+" ld1rd           z20.d, p0/z, [x11]              \n\t"
+" ld1rd           z21.d, p0/z, [x11, #8]          \n\t"
+" ld1rd           z22.d, p0/z, [x11, #16]         \n\t"
+" ld1rd           z23.d, p0/z, [x11, #24]         \n\t"
+" ld1rd           z24.d, p0/z, [x11, #32]         \n\t"
+" ld1rd           z25.d, p0/z, [x11, #40]         \n\t"
+" ld1rd           z26.d, p0/z, [x11, #48]         \n\t"
+" ld1rd           z27.d, p0/z, [x11, #56]         \n\t"
+" ld1rd           z28.d, p0/z, [x11, #64]         \n\t"
+" ld1rd           z29.d, p0/z, [x11, #72]         \n\t"
+GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
+GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
+GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
+GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
+GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
+GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
+GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
+GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
+GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
+GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
+" add             x10, x10, x2                    \n\t" // Forward A.
+" add             x11, x11, x4                    \n\t" // Forward B.
+" sub             x13, x13, #1                    \n\t"
+" b               K_LEFT_LOOP                     \n\t" // Next column / row.
+"                                                 \n\t"
+" WRITE_MEM_PREP:                                 \n\t"
+"                                                 \n\t"
+" ldr             x11, %[bi]                      \n\t"
+" ldr             x12, %[alpha]                   \n\t" // Load alpha & beta.
+" ldr             x13, %[beta]                    \n\t"
+" ld1rd           z30.d, p0/z, [x12]              \n\t"
+" ld1rd           z31.d, p0/z, [x13]              \n\t"
+" ldr             x12, [x12]                      \n\t"
+"                                                 \n\t"
+" cmp             x8, #1                          \n\t"
+" b.eq            PREFETCH_ABNEXT                 \n\t"
+" prfm            PLDL2STRM, [x11]                \n\t"
+" b               WRITE_MEM                       \n\t"
+"                                                 \n\t"
+" PREFETCH_ABNEXT:                                \n\t"
+" ldr             x1, %[a_next]                   \n\t" // Final Millikernel loop, x1 and x2 not needed.
+" ldr             x2, %[b_next]                   \n\t"
+" prfm            PLDL2KEEP, [x1]                 \n\t"
+" prfm            PLDL2KEEP, [x1, 256*1]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*2]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*3]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*4]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*5]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*6]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*7]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*8]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*9]          \n\t"
+" prfm            PLDL2KEEP, [x1, 256*10]         \n\t"
+" prfm            PLDL2KEEP, [x1, 256*11]         \n\t"
+" prfm            PLDL2KEEP, [x1, 256*12]         \n\t"
+" prfm            PLDL2KEEP, [x1, 256*13]         \n\t"
+" prfm            PLDL2KEEP, [x1, 256*14]         \n\t"
+" prfm            PLDL2KEEP, [x1, 256*15]         \n\t"
+" prfm            PLDL2KEEP, [x2]                 \n\t"
+" prfm            PLDL2KEEP, [x2, 256*1]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*2]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*3]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*4]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*5]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*6]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*7]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*8]          \n\t"
+" prfm            PLDL2KEEP, [x2, 256*9]          \n\t"
+"                                                 \n\t"
+" WRITE_MEM:                                      \n\t"
+"                                                 \n\t"
+" fmov            d28, #1.0                       \n\t"
+" fmov            x16, d28                        \n\t"
+" cmp             x16, x12                        \n\t"
+" b.eq            UNIT_ALPHA                      \n\t"
+"                                                 \n\t"
+SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
+"                                                 \n\t"
+" UNIT_ALPHA:                                     \n\t"
+" mov             x9, x5                          \n\t" // C address for loading.
+" mov             x10, x5                         \n\t" // C address for storing.
+" cmp             x6, #1                          \n\t"
+" b.ne            WRITE_MEM_G                     \n\t"
+"                                                 \n\t"
+" WRITE_MEM_C:                                    \n\t" // Available scratch: Z[20-30].
+"                                                 \n\t" // Here used scratch: Z[20-29].
+" mov             x13, xzr                        \n\t" // C-column's physical 1-vector skip.
+" incb            x13                             \n\t"
+GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
+GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
+GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
+"                                                 \n\t"
+GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x10,x7)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x10,x7)
+" b               END_WRITE_MEM                   \n\t"
+"                                                 \n\t"
+" WRITE_MEM_G:                                    \n\t" // Available scratch: Z[20-30].
+"                                                 \n\t" // Here used scratch: Z[20-30] - Z30 as index.
+" mov             x12, xzr                        \n\t"
+" incb            x12                             \n\t"
+" madd            x13, x12, x6, xzr               \n\t" // C-column's logical 1-vector skip.
+" index           z30.d, xzr, x6                  \n\t" // Skips passed to index is not multiplied by 8.
+GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
+GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
+GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
+"                                                 \n\t"
+GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x10,x7,x13,x16)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x10,x7,x13,x16)
+"                                                 \n\t"
+" END_WRITE_MEM:                                  \n\t"
+" subs            x8, x8, #1                      \n\t"
+" b.eq            END_EXEC                        \n\t"
+"                                                 \n\t"
+" add             x0, x0, x3                      \n\t" // Forward A's base address to the next logic panel.
+" add             x5, x5, x13                     \n\t" // Forward C's base address to the next logic panel.
+" add             x5, x5, x13                     \n\t"
+" b               MILLIKER_MLOOP                  \n\t"
+"                                                 \n\t"
+" END_ERROR:                                      \n\t"
+" mov             x0, #1                          \n\t" // Return error.
+" END_EXEC:                                       \n\t"
+" mov             x0, #0                          \n\t" // Return normal.
+:
+: [ai]     "m" (ai),
+  [rs_a]   "m" (rs_a),
+  [cs_a]   "m" (cs_a),
+  [ps_a]   "m" (ps_a),
+  [rs_b]   "m" (rs_b),
+  [ci]     "m" (ci),
+  [rs_c]   "m" (rs_c),
+  [cs_c]   "m" (cs_c),
+  [m_mker] "m" (m_mker),
+  [m_left] "m" (m_left),
+  [bi]     "m" (bi),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha]  "m" (alpha),
+  [beta]   "m" (beta),
+  [a_next] "m" (a_next),
+  [b_next] "m" (b_next)
+: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
+  "x9","x10","x11","x12","x13","x14","x15","x16",//"x17",
+  "z0","z1","z2","z3","z4","z5","z6","z7",
+  "z8","z9","z10","z11","z12","z13","z14","z15",
+  "z16","z17","z18","z19",
+  "z20","z21","z22","z23",
+  "z24","z25","z26","z27",
+  "z28","z29","z30","z31"
+     );
+  }
+}
+
--- a/kernels/armsve/bli_kernels_armsve.h
+++ b/kernels/armsve/bli_kernels_armsve.h
@@ -33,5 +33,13 @@
 */

 GEMM_UKR_PROT( double,   d, gemm_armsve256_asm_8x8 )
+GEMM_UKR_PROT( double,   d, gemm_armsve_asm_2vx10_unindexed )
+GEMM_UKR_PROT( float,    s, gemm_armsve_asm_2vx10_unindexed )
+GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armsve_2vx10_unindexed )
+GEMMSUP_KER_PROT( double,   d, gemmsup_cv_armsve_2vx10_unindexed )
+GEMMSUP_KER_PROT( double,   d, gemmsup_rv_armsve_10x2v_unindexed )

 PACKM_KER_PROT( double,   d, packm_armsve256_asm_8xk )
+PACKM_KER_PROT( double,   d, packm_armsve512_asm_16xk )
+PACKM_KER_PROT( double,   d, packm_armsve512_asm_12xk )
+PACKM_KER_PROT( double,   d, packm_armsve512_asm_10xk )