From 7118f87e18b4941423472afc00215c1d1f2a1fcd Mon Sep 17 00:00:00 2001
From: Tony Kelman <tony@kelman.net>
Date: Sat, 14 Jun 2014 06:53:20 -0700
Subject: [PATCH 01/19] Use _aligned_malloc instead of posix_memalign on
 Windows

---
 frame/base/bli_malloc.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/frame/base/bli_malloc.c b/frame/base/bli_malloc.c
index 3f3e8e804..1b9104b9d 100644
--- a/frame/base/bli_malloc.c
+++ b/frame/base/bli_malloc.c
@@ -43,6 +43,9 @@ void* bli_malloc( siz_t size )
 
 #if BLIS_HEAP_ADDR_ALIGN_SIZE == 1
 	p = malloc( ( size_t )size );
+#elif defined(_WIN32)
+	p = _aligned_malloc( ( size_t )size,
+	                     ( size_t )BLIS_HEAP_ADDR_ALIGN_SIZE );
 #else
 	r_val = posix_memalign( &p,
 	                        ( size_t )BLIS_HEAP_ADDR_ALIGN_SIZE,
@@ -58,6 +61,10 @@ void* bli_malloc( siz_t size )
 
 void bli_free( void* p )
 {
+#if BLIS_HEAP_ADDR_ALIGN_SIZE == 1 || !defined(_WIN32)
 	free( p );
+#else
+	_aligned_free( p );
+#endif
 }
 

From f064711a5e6fb3852c17c7520909b09dc27665f2 Mon Sep 17 00:00:00 2001
From: Marat Dukhan <maratek@gmail.com>
Date: Sun, 15 Jun 2014 06:27:37 -0400
Subject: [PATCH 02/19] SGEMM and DGEMM kernels for PNaCl

---
 .gitignore                          |   2 +
 config/pnacl/bli_config.h           | 165 ++++++++++++++++
 config/pnacl/bli_kernel.h           | 232 ++++++++++++++++++++++
 config/pnacl/kernels                |   1 +
 config/pnacl/make_defs.mk           | 117 +++++++++++
 kernels/nacl/pnacl/3/bli_gemm_opt.c | 294 ++++++++++++++++++++++++++++
 testsuite/Makefile                  |  79 +++++++-
 7 files changed, 889 insertions(+), 1 deletion(-)
 create mode 100644 config/pnacl/bli_config.h
 create mode 100644 config/pnacl/bli_kernel.h
 create mode 120000 config/pnacl/kernels
 create mode 100644 config/pnacl/make_defs.mk
 create mode 100644 kernels/nacl/pnacl/3/bli_gemm_opt.c

diff --git a/.gitignore b/.gitignore
index 94936ab08..8accdfa50 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,8 @@
 *.a
 # test executables
 *.x
+*.pexe
+*.nexe
 
 # -- build system files --
 
diff --git a/config/pnacl/bli_config.h b/config/pnacl/bli_config.h
new file mode 100644
index 000000000..f6be2e573
--- /dev/null
+++ b/config/pnacl/bli_config.h
@@ -0,0 +1,165 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_CONFIG_H
+#define BLIS_CONFIG_H
+
+
+// -- OPERATING SYSTEM ---------------------------------------------------------
+
+
+
+// -- INTEGER PROPERTIES -------------------------------------------------------
+
+// The bit size of the integer type used to track values such as dimensions,
+// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
+// integers while 64 results in 64-bit integers. Any other value results in use
+// of the C99 type "long int". Note that this ONLY affects integers used
+// internally within BLIS as well as those exposed in the native BLAS-like BLIS
+// interface.
+#define BLIS_INT_TYPE_SIZE               32
+
+
+
+// -- FLOATING-POINT PROPERTIES ------------------------------------------------
+
+// Define the number of floating-point types supported, and the size of the
+// largest type.
+#define BLIS_NUM_FP_TYPES                4
+#define BLIS_MAX_TYPE_SIZE               sizeof(dcomplex)
+
+// Enable use of built-in C99 "float complex" and "double complex" types and
+// associated overloaded operations and functions? Disabling results in
+// scomplex and dcomplex being defined in terms of simple structs.
+//#define BLIS_ENABLE_C99_COMPLEX
+
+
+
+// -- MULTITHREADING -----------------------------------------------------------
+
+// The maximum number of BLIS threads that will run concurrently.
+#define BLIS_MAX_NUM_THREADS             1
+
+
+
+// -- MEMORY ALLOCATION --------------------------------------------------------
+
+// -- Contiguous (static) memory allocator --
+
+// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
+// contiguous memory pools.
+#define BLIS_NUM_MC_X_KC_BLOCKS          BLIS_MAX_NUM_THREADS
+#define BLIS_NUM_KC_X_NC_BLOCKS          BLIS_MAX_NUM_THREADS
+#define BLIS_NUM_MC_X_NC_BLOCKS          0
+
+// The maximum preload byte offset is used to pad the end of the contiguous
+// memory pools so that the micro-kernel, when computing with the end of the
+// last block, can exceed the bounds of the usable portion of the memory
+// region without causing a segmentation fault.
+#define BLIS_MAX_PRELOAD_BYTE_OFFSET     128
+
+// -- Memory alignment --
+
+// It is sometimes useful to define the various memory alignments in terms
+// of some other characteristics of the system, such as the cache line size
+// and the page size.
+#define BLIS_CACHE_LINE_SIZE             64
+#define BLIS_PAGE_SIZE                   4096
+
+// Alignment size needed by the instruction set for aligned SIMD/vector
+// instructions.
+#define BLIS_SIMD_ALIGN_SIZE             16
+
+// Alignment size used to align local stack buffers within macro-kernel
+// functions.
+#define BLIS_STACK_BUF_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
+
+// Alignment size used when allocating memory dynamically from the operating
+// system (eg: posix_memalign()). To disable heap alignment and just use
+// malloc() instead, set this to 1.
+#define BLIS_HEAP_ADDR_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
+
+// Alignment size used when sizing leading dimensions of dynamically
+// allocated memory.
+#define BLIS_HEAP_STRIDE_ALIGN_SIZE      BLIS_CACHE_LINE_SIZE
+
+// Alignment size used when allocating entire blocks of contiguous memory
+// from the contiguous memory allocator.
+#define BLIS_CONTIG_ADDR_ALIGN_SIZE      BLIS_PAGE_SIZE
+
+
+
+// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
+
+// Basic (homogeneous) datatype support always enabled.
+
+// Enable mixed domain operations?
+//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+
+// Enable extra mixed precision operations?
+//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+
+
+
+// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
+
+// Stay initialized after auto-initialization, unless and until the user
+// explicitly calls bli_finalize().
+#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
+
+
+
+// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
+
+// Enable the BLAS compatibility layer?
+#define BLIS_ENABLE_BLAS2BLIS
+
+// The bit size of the integer type used to track values such as dimensions and
+// leading dimensions (ie: column strides) within the BLAS compatibility layer.
+// A value of 32 results in the compatibility layer using 32-bit signed integers
+// while 64 results in 64-bit integers. Any other value results in use of the
+// C99 type "long int". Note that this ONLY affects integers used within the
+// BLAS compatibility layer.
+#define BLIS_BLAS2BLIS_INT_TYPE_SIZE     32
+
+// Fortran-77 name-mangling macros.
+#define PASTEF770(name)                        name ## _
+#define PASTEF77(ch1,name)       ch1        ## name ## _
+#define PASTEF772(ch1,ch2,name)  ch1 ## ch2 ## name ## _
+
+
+
+
+#endif
+
diff --git a/config/pnacl/bli_kernel.h b/config/pnacl/bli_kernel.h
new file mode 100644
index 000000000..d1d29185c
--- /dev/null
+++ b/config/pnacl/bli_kernel.h
@@ -0,0 +1,232 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_KERNEL_H
+#define BLIS_KERNEL_H
+
+/*
+ * SIMD-enabled (SP only) PNaCl shipped in Chrome 36 and it is not backward-compatible.
+ * Therefore, if compilation targets an older Chrome release, we use scalar kernels.
+ * The target Chrome version is indicated by PPAPI_MACRO defined in the header below.
+ */
+#include <ppapi/c/pp_macros.h>
+
+// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
+
+// -- Cache blocksizes --
+
+//
+// Constraints:
+//
+// (1) MC must be a multiple of:
+//     (a) MR (for zero-padding purposes)
+//     (b) NR (for zero-padding purposes when MR and NR are "swapped")
+// (2) NC must be a multiple of
+//     (a) NR (for zero-padding purposes)
+//     (b) MR (for zero-padding purposes when MR and NR are "swapped")
+// (3) KC must be a multiple of
+//     (a) MR and
+//     (b) NR (for triangular operations such as trmm and trsm).
+//
+
+#if PPAPI_RELEASE >= 36
+#define BLIS_DEFAULT_MC_S              256
+#define BLIS_DEFAULT_KC_S              256
+#define BLIS_DEFAULT_NC_S              8192
+#else
+#define BLIS_DEFAULT_MC_S              252
+#define BLIS_DEFAULT_KC_S              264
+#define BLIS_DEFAULT_NC_S              8196
+#endif
+
+#define BLIS_DEFAULT_MC_D              1080
+#define BLIS_DEFAULT_KC_D              120
+#define BLIS_DEFAULT_NC_D              8400
+
+#define BLIS_DEFAULT_MC_C              128
+#define BLIS_DEFAULT_KC_C              256
+#define BLIS_DEFAULT_NC_C              4096
+
+#define BLIS_DEFAULT_MC_Z              64
+#define BLIS_DEFAULT_KC_Z              256
+#define BLIS_DEFAULT_NC_Z              2048
+
+// -- Register blocksizes --
+
+#if PPAPI_RELEASE >= 36
+#define BLIS_DEFAULT_MR_S              8
+#define BLIS_DEFAULT_NR_S              4
+#else
+#define BLIS_DEFAULT_MR_S              4
+#define BLIS_DEFAULT_NR_S              3 
+#endif
+
+/* So far, PNaCl does not support double-precision SIMD */
+#define BLIS_DEFAULT_MR_D              4
+#define BLIS_DEFAULT_NR_D              3
+
+#define BLIS_DEFAULT_MR_C              4
+#define BLIS_DEFAULT_NR_C              4
+
+#define BLIS_DEFAULT_MR_Z              4
+#define BLIS_DEFAULT_NR_Z              4
+
+// NOTE: If the micro-kernel, which is typically unrolled to a factor
+// of f, handles leftover edge cases (ie: when k % f > 0) then these
+// register blocksizes in the k dimension can be defined to 1.
+
+//#define BLIS_DEFAULT_KR_S              1
+//#define BLIS_DEFAULT_KR_D              1
+//#define BLIS_DEFAULT_KR_C              1
+//#define BLIS_DEFAULT_KR_Z              1
+
+// -- Cache blocksize extensions (for optimizing edge cases) --
+
+// NOTE: These cache blocksize "extensions" have the same constraints as
+// the corresponding default blocksizes above. When these values are
+// non-zero, blocksizes used at edge cases are extended (enlarged) if
+// such an extension would encompass the remaining portion of the
+// matrix dimension.
+
+//#define BLIS_EXTEND_MC_S               0 //(BLIS_DEFAULT_MC_S/4)
+//#define BLIS_EXTEND_KC_S               0 //(BLIS_DEFAULT_KC_S/4)
+//#define BLIS_EXTEND_NC_S               0 //(BLIS_DEFAULT_NC_S/4)
+
+//#define BLIS_EXTEND_MC_D               0 //(BLIS_DEFAULT_MC_D/4)
+//#define BLIS_EXTEND_KC_D               0 //(BLIS_DEFAULT_KC_D/4)
+//#define BLIS_EXTEND_NC_D               0 //(BLIS_DEFAULT_NC_D/4)
+
+//#define BLIS_EXTEND_MC_C               0 //(BLIS_DEFAULT_MC_C/4)
+//#define BLIS_EXTEND_KC_C               0 //(BLIS_DEFAULT_KC_C/4)
+//#define BLIS_EXTEND_NC_C               0 //(BLIS_DEFAULT_NC_C/4)
+
+//#define BLIS_EXTEND_MC_Z               0 //(BLIS_DEFAULT_MC_Z/4)
+//#define BLIS_EXTEND_KC_Z               0 //(BLIS_DEFAULT_KC_Z/4)
+//#define BLIS_EXTEND_NC_Z               0 //(BLIS_DEFAULT_NC_Z/4)
+
+// -- Register blocksize extensions (for packed micro-panels) --
+
+// NOTE: These register blocksize "extensions" determine whether the
+// leading dimensions used within the packed micro-panels are equal to
+// or greater than their corresponding register blocksizes above.
+
+//#define BLIS_EXTEND_MR_S               0
+//#define BLIS_EXTEND_NR_S               0
+
+//#define BLIS_EXTEND_MR_D               0
+//#define BLIS_EXTEND_NR_D               0
+
+//#define BLIS_EXTEND_MR_C               0
+//#define BLIS_EXTEND_NR_C               0
+
+//#define BLIS_EXTEND_MR_Z               0
+//#define BLIS_EXTEND_NR_Z               0
+
+
+
+// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
+
+
+
+
+// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
+
+
+
+
+// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
+
+// -- gemm --
+
+#if PPAPI_RELEASE >= 36
+#define BLIS_SGEMM_UKERNEL         bli_sgemm_opt_8x4
+#endif
+
+// -- trsm-related --
+
+
+
+
+// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
+
+// -- packm --
+
+// -- unpackm --
+
+
+
+
+// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
+
+// -- axpy2v --
+
+// -- dotaxpyv --
+
+// -- axpyf --
+
+// -- dotxf --
+
+// -- dotxaxpyf --
+
+
+
+
+// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
+
+// -- addv --
+
+// -- axpyv --
+
+// -- copyv --
+
+// -- dotv --
+
+// -- dotxv --
+
+// -- invertv --
+
+// -- scal2v --
+
+// -- scalv --
+
+// -- setv --
+
+// -- subv --
+
+// -- swapv --
+
+
+
+#endif
+
diff --git a/config/pnacl/kernels b/config/pnacl/kernels
new file mode 120000
index 000000000..4fa74dbb3
--- /dev/null
+++ b/config/pnacl/kernels
@@ -0,0 +1 @@
+../../kernels/nacl/pnacl
\ No newline at end of file
diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk
new file mode 100644
index 000000000..540531203
--- /dev/null
+++ b/config/pnacl/make_defs.mk
@@ -0,0 +1,117 @@
+#!/bin/bash
+#
+#  BLIS    
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name of The University of Texas nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+# Only include this block of code once.
+ifndef MAKE_DEFS_MK_INCLUDED
+MAKE_DEFS_MK_INCLUDED := yes
+
+
+
+#
+# --- Build definitions --------------------------------------------------------
+#
+
+# Variables corresponding to other configure-time options.
+BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
+BLIS_ENABLE_STATIC_BUILD        := yes
+BLIS_ENABLE_DYNAMIC_BUILD       := no
+
+
+
+#
+# --- Utility program definitions ----------------------------------------------
+#
+
+SH         := /bin/sh
+MV         := mv
+MKDIR      := mkdir -p
+RM_F       := rm -f
+RM_RF      := rm -rf
+SYMLINK    := ln -sf
+FIND       := find
+GREP       := grep
+XARGS      := xargs
+RANLIB     := pnacl-ranlib
+INSTALL    := install -c
+
+# Used to refresh CHANGELOG.
+GIT        := git
+GIT_LOG    := $(GIT) log --decorate
+
+
+
+#
+# --- Development tools definitions --------------------------------------------
+#
+
+# --- Determine the C compiler and related flags ---
+CC             := pnacl-clang
+# Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
+# NOTE: This is needed to enable posix_memalign().
+CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
+CMISCFLAGS     := -std=c99 -I$(NACL_SDK_ROOT)/include
+CDBGFLAGS      := -g
+CWARNFLAGS     := -Wall
+COPTFLAGS      := -O3
+CKOPTFLAGS     := $(COPTFLAGS) -ffast-math
+CVECFLAGS      :=
+
+# Aggregate all of the flags into multiple groups: one for standard
+# compilation, and one for each of the supported "special" compilation
+# modes.
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+
+# --- Determine the archiver and related flags ---
+AR             := ar
+ARFLAGS        := cru
+
+# --- Determine the linker and related flags ---
+LINKER         := $(CC)
+LDFLAGS        := -lm
+
+# --- Determine the finalizer and related flags ---
+FINALIZER      := pnacl-finalize
+FINFLAGS        :=
+
+# --- Determine the translator and related flags ---
+TRANSLATOR     := pnacl-translate
+TRNSFLAGS      := -O3
+TRNSAMD64FLAGS := -arch x86-64
+TRNSX86FLAGS   := -arch i686
+TRNSARMFLAGS   := -arch armv7
+
+# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
+endif
diff --git a/kernels/nacl/pnacl/3/bli_gemm_opt.c b/kernels/nacl/pnacl/3/bli_gemm_opt.c
new file mode 100644
index 000000000..8261cf0dd
--- /dev/null
+++ b/kernels/nacl/pnacl/3/bli_gemm_opt.c
@@ -0,0 +1,294 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2012, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#if PPAPI_RELEASE >= 36
+typedef float v4sf __attribute__ ((vector_size(16)));
+
+inline v4sf v4sf_splat(float x) {
+	return (v4sf) { x, x, x, x };
+}
+
+inline v4sf v4sf_load(const float* a) {
+	return *((const v4sf*)a);
+}
+
+inline void v4sf_store(float* a, v4sf x) {
+	*((v4sf*)a) = x;
+}
+
+inline v4sf v4sf_zero() {
+	return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
+}
+#endif
+
+#if PPAPI_RELEASE >= 36 /* 8x4 blocks, SIMD enabled */
+void bli_sgemm_opt_8x4(
+	dim_t           k,
+	float *restrict alpha,
+	float *restrict a,
+	float *restrict b,
+	float *restrict beta,
+	float *restrict c,
+	inc_t           rs_c,
+	inc_t           cs_c,
+	auxinfo_t*      data)
+{
+	// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
+	v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero();
+	v4sf abv0b = v4sf_zero(), abv1b = v4sf_zero(), abv2b = v4sf_zero(), abv3b = v4sf_zero();
+	for (dim_t i = 0; i < k; i += 1) {
+		const v4sf avt = v4sf_load(a);
+		const v4sf avb = v4sf_load(a+4);
+
+		const v4sf bv_xxxx = v4sf_splat(b[0]);
+		abv0t += avt * bv_xxxx;
+		abv0b += avb * bv_xxxx;
+
+		const v4sf bv_yyyy = v4sf_splat(b[1]);
+		abv1t += avt * bv_yyyy;
+		abv1b += avb * bv_yyyy;
+
+		const v4sf bv_zzzz = v4sf_splat(b[2]);
+		abv2t += avt * bv_zzzz;
+		abv2b += avb * bv_zzzz;
+
+		const v4sf bv_wwww = v4sf_splat(b[3]);
+		abv3t += avt * bv_wwww;
+		abv3b += avb * bv_wwww;
+
+		a += 8;
+		b += 4;
+	}
+
+	const v4sf alphav = v4sf_splat(*alpha);
+	abv0t *= alphav;
+	abv0b *= alphav;
+	abv1t *= alphav;
+	abv1b *= alphav;
+	abv2t *= alphav;
+	abv2b *= alphav;
+	abv3t *= alphav;
+	abv3b *= alphav;
+
+	if (rs_c == 1) {
+		v4sf cv0t = v4sf_load(&c[0*rs_c + 0*cs_c]);
+		v4sf cv1t = v4sf_load(&c[0*rs_c + 1*cs_c]); 
+		v4sf cv2t = v4sf_load(&c[0*rs_c + 2*cs_c]); 
+		v4sf cv3t = v4sf_load(&c[0*rs_c + 3*cs_c]); 
+		v4sf cv0b = v4sf_load(&c[4*rs_c + 0*cs_c]);
+		v4sf cv1b = v4sf_load(&c[4*rs_c + 1*cs_c]); 
+		v4sf cv2b = v4sf_load(&c[4*rs_c + 2*cs_c]); 
+		v4sf cv3b = v4sf_load(&c[4*rs_c + 3*cs_c]); 
+
+		const v4sf betav = v4sf_splat(*beta);
+		cv0t = cv0t * betav + abv0t;
+		cv1t = cv1t * betav + abv1t;
+		cv2t = cv2t * betav + abv2t;
+		cv3t = cv3t * betav + abv3t;
+		cv0b = cv0b * betav + abv0b;
+		cv1b = cv1b * betav + abv1b;
+		cv2b = cv2b * betav + abv2b;
+		cv3b = cv3b * betav + abv3b;
+
+		v4sf_store(&c[0*rs_c + 0*cs_c], cv0t);
+		v4sf_store(&c[0*rs_c + 1*cs_c], cv1t); 
+		v4sf_store(&c[0*rs_c + 2*cs_c], cv2t); 
+		v4sf_store(&c[0*rs_c + 3*cs_c], cv3t); 
+		v4sf_store(&c[4*rs_c + 0*cs_c], cv0b);
+		v4sf_store(&c[4*rs_c + 1*cs_c], cv1b); 
+		v4sf_store(&c[4*rs_c + 2*cs_c], cv2b); 
+		v4sf_store(&c[4*rs_c + 3*cs_c], cv3b); 
+	} else {
+		// Load columns 0, 1, 2, 3 (top part)
+		v4sf cv0t = (v4sf){ c[0*rs_c + 0*cs_c], c[1*rs_c + 0*cs_c], c[2*rs_c + 0*cs_c], c[3*rs_c + 0*cs_c] };
+		v4sf cv1t = (v4sf){ c[0*rs_c + 1*cs_c], c[1*rs_c + 1*cs_c], c[2*rs_c + 1*cs_c], c[3*rs_c + 1*cs_c] };
+		v4sf cv2t = (v4sf){ c[0*rs_c + 2*cs_c], c[1*rs_c + 2*cs_c], c[2*rs_c + 2*cs_c], c[3*rs_c + 2*cs_c] };
+		v4sf cv3t = (v4sf){ c[0*rs_c + 3*cs_c], c[1*rs_c + 3*cs_c], c[2*rs_c + 3*cs_c], c[3*rs_c + 3*cs_c] };
+		// Load columns 0, 1, 2, 3 (bottom part)
+		v4sf cv0b = (v4sf){ c[4*rs_c + 0*cs_c], c[5*rs_c + 0*cs_c], c[6*rs_c + 0*cs_c], c[7*rs_c + 0*cs_c] };
+		v4sf cv1b = (v4sf){ c[4*rs_c + 1*cs_c], c[5*rs_c + 1*cs_c], c[6*rs_c + 1*cs_c], c[7*rs_c + 1*cs_c] };
+		v4sf cv2b = (v4sf){ c[4*rs_c + 2*cs_c], c[5*rs_c + 2*cs_c], c[6*rs_c + 2*cs_c], c[7*rs_c + 2*cs_c] };
+		v4sf cv3b = (v4sf){ c[4*rs_c + 3*cs_c], c[5*rs_c + 3*cs_c], c[6*rs_c + 3*cs_c], c[7*rs_c + 3*cs_c] };
+
+		const v4sf betav = v4sf_splat(*beta);
+		cv0t = cv0t * betav + abv0t;
+		cv1t = cv1t * betav + abv1t;
+		cv2t = cv2t * betav + abv2t;
+		cv3t = cv3t * betav + abv3t;
+		cv0b = cv0b * betav + abv0b;
+		cv1b = cv1b * betav + abv1b;
+		cv2b = cv2b * betav + abv2b;
+		cv3b = cv3b * betav + abv3b;
+
+		// Store column 0
+		c[0*rs_c + 0*cs_c] = cv0t[0];
+		c[1*rs_c + 0*cs_c] = cv0t[1];
+		c[2*rs_c + 0*cs_c] = cv0t[2];
+		c[3*rs_c + 0*cs_c] = cv0t[3];
+		c[4*rs_c + 0*cs_c] = cv0b[0];
+		c[5*rs_c + 0*cs_c] = cv0b[1];
+		c[6*rs_c + 0*cs_c] = cv0b[2];
+		c[7*rs_c + 0*cs_c] = cv0b[3];
+
+		// Store column 1
+		c[0*rs_c + 1*cs_c] = cv1t[0];
+		c[1*rs_c + 1*cs_c] = cv1t[1];
+		c[2*rs_c + 1*cs_c] = cv1t[2];
+		c[3*rs_c + 1*cs_c] = cv1t[3];
+		c[4*rs_c + 1*cs_c] = cv1b[0];
+		c[5*rs_c + 1*cs_c] = cv1b[1];
+		c[6*rs_c + 1*cs_c] = cv1b[2];
+		c[7*rs_c + 1*cs_c] = cv1b[3];
+
+		// Store column 2
+		c[0*rs_c + 2*cs_c] = cv2t[0];
+		c[1*rs_c + 2*cs_c] = cv2t[1];
+		c[2*rs_c + 2*cs_c] = cv2t[2];
+		c[3*rs_c + 2*cs_c] = cv2t[3];
+		c[4*rs_c + 2*cs_c] = cv2b[0];
+		c[5*rs_c + 2*cs_c] = cv2b[1];
+		c[6*rs_c + 2*cs_c] = cv2b[2];
+		c[7*rs_c + 2*cs_c] = cv2b[3];
+
+		// Store column 3
+		c[0*rs_c + 3*cs_c] = cv3t[0];
+		c[1*rs_c + 3*cs_c] = cv3t[1];
+		c[2*rs_c + 3*cs_c] = cv3t[2];
+		c[3*rs_c + 3*cs_c] = cv3t[3];
+		c[4*rs_c + 3*cs_c] = cv3b[0];
+		c[5*rs_c + 3*cs_c] = cv3b[1];
+		c[6*rs_c + 3*cs_c] = cv3b[2];
+		c[7*rs_c + 3*cs_c] = cv3b[3];
+	}
+}
+#else /* PPAPI_RELEASE < 36 case: 4x4 blocks, no SIMD */
+void bli_sgemm_opt_4x4(
+	dim_t           k,
+	float *restrict alpha,
+	float *restrict a,
+	float *restrict b,
+	float *restrict beta,
+	float *restrict c,
+	inc_t           rs_c,
+	inc_t           cs_c,
+	auxinfo_t*      data)
+{
+	/* Just call the reference implementation. */
+	BLIS_SGEMM_UKERNEL_REF(
+		k,
+		alpha,
+		a,
+		b,
+		beta,
+		c,
+		rs_c,
+		cs_c,
+		data);
+}
+#endif
+
+void bli_dgemm_opt_4x4(
+	dim_t            k,
+	double *restrict alpha,
+	double *restrict a,
+	double *restrict b,
+	double *restrict beta,
+	double *restrict c,
+	inc_t            rs_c,
+	inc_t            cs_c,
+	auxinfo_t*       data)
+{
+	/* Just call the reference implementation. */
+	BLIS_DGEMM_UKERNEL_REF(
+		k,
+		alpha,
+		a,
+		b,
+		beta,
+		c,
+		rs_c,
+		cs_c,
+		data);
+}
+
+void bli_cgemm_opt_4x4(
+	dim_t              k,
+	scomplex *restrict alpha,
+	scomplex *restrict a,
+	scomplex *restrict b,
+	scomplex *restrict beta,
+	scomplex *restrict c,
+	inc_t              rs_c,
+	inc_t              cs_c,
+	auxinfo_t*         data)
+{
+	/* Just call the reference implementation. */
+	BLIS_CGEMM_UKERNEL_REF(
+		k,
+		alpha,
+		a,
+		b,
+		beta,
+		c,
+		rs_c,
+		cs_c,
+		data);
+}
+
+void bli_zgemm_opt_4x4(
+	dim_t              k,
+	dcomplex *restrict alpha,
+	dcomplex *restrict a,
+	dcomplex *restrict b,
+	dcomplex *restrict beta,
+	dcomplex *restrict c,
+	inc_t              rs_c,
+	inc_t              cs_c,
+	auxinfo_t*         data)
+{
+	/* Just call the reference implementation. */
+	BLIS_ZGEMM_UKERNEL_REF(
+		k,
+		alpha,
+		a,
+		b,
+		beta,
+		c,
+		rs_c,
+		cs_c,
+		data);
+}
+
diff --git a/testsuite/Makefile b/testsuite/Makefile
index abd4c8354..850543bb6 100644
--- a/testsuite/Makefile
+++ b/testsuite/Makefile
@@ -45,7 +45,8 @@
 #
 
 .PHONY: all bin clean \
-        check-env check-env-mk check-env-fragments check-env-make-defs
+        check-env check-env-mk check-env-fragments check-env-make-defs \
+	run run-amd64 run-x86 run-arm
 
 
 
@@ -241,8 +242,21 @@ TEST_OBJS      := $(patsubst $(TEST_SRC_PATH)/%.c, \
                              $(TEST_OBJ_PATH)/%.o, \
                              $(wildcard $(TEST_SRC_PATH)/*.c))
 
+ifeq ($(CONFIG_NAME),pnacl)
+# Linked executable
+TEST_BIN       := test_libblis.unstable.pexe
+# Finalized executable
+TEST_BIN_PNACL := test_libblis.pexe
+# Translated executable for x86-64
+TEST_BIN_AMD64 := test_libblis.x86-64.nexe
+# Translated executable for x86
+TEST_BIN_X86   := test_libblis.x86.nexe
+# Translated executable for ARM
+TEST_BIN_ARM   := test_libblis.arm.nexe
+else
 # Binary executable name.
 TEST_BIN       := test_libblis.x
+endif
 
 # Add installed and local header paths to CFLAGS
 CFLAGS         += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)
@@ -257,7 +271,11 @@ CFLAGS         += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)
 
 all: check-env bin
 
+ifeq ($(CONFIG_NAME),pnacl)
+bin: check-env $(TEST_BIN) $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM)
+else
 bin: check-env $(TEST_BIN)
+endif
 
 
 # --- Environment check rules ---
@@ -301,9 +319,68 @@ else
 	@$(LINKER) $(TEST_OBJS) $(BLIS_LIB) $(LDFLAGS) -o $@
 endif
 
+ifeq ($(CONFIG_NAME),pnacl)
+
+# Finalize PNaCl executable (i.e. convert from LLVM bitcode to PNaCl bitcode)
+$(TEST_BIN_PNACL): $(TEST_BIN)
+ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
+	$(FINALIZER) $(FINFLAGS) -o $@ $(TEST_BIN)
+else
+	@echo "Finalizing $@"
+	@$(FINALIZER) $(FINFLAGS) -o $@ $(TEST_BIN)
+endif
+
+# Translate PNaCl executable to x86-64 NaCl executable
+$(TEST_BIN_AMD64): $(TEST_BIN_PNACL)
+ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
+	$(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@
+else
+	@echo "Translating $< -> $@"
+	@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@
+endif
+
+
+# Translate PNaCl executable to x86 NaCl executable
+$(TEST_BIN_X86): $(TEST_BIN_PNACL)
+ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
+	$(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@
+else
+	@echo "Translating $< -> $@"
+	@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@
+endif
+
+# Translate PNaCl executable to ARMv7 NaCl executable
+$(TEST_BIN_ARM): $(TEST_BIN_PNACL)
+ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
+	$(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@
+else
+	@echo "Translating $< -> $@"
+	@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@
+endif
+
+endif
+
+# -- Test run rules --
+
+ifeq ($(CONFIG_NAME),pnacl)
+run-amd64: $(TEST_BIN_AMD64)
+	$(NACL_SDK_ROOT)/tools/sel_ldr_x86_64 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_64.nexe -- $(TEST_BIN_AMD64)
+run-x86: $(TEST_BIN_X86)
+	$(NACL_SDK_ROOT)/tools/sel_ldr_x86_32 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_32.nexe -- $(TEST_BIN_X86)
+run-arm: $(TEST_BIN_ARM)
+	$(NACL_SDK_ROOT)/tools/sel_ldr_arm -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_arm.nexe -- $(TEST_BIN_ARM)
+else
+run: $(TEST_BIN)
+	./$(TEST_BIN)
+endif
 
 # -- Clean rules --
 
+ifeq ($(CONFIG_NAME),pnacl)
+clean:
+	- $(RM_F) $(TEST_OBJS) $(TEST_BIN) $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM)
+else
 clean:
 	- $(RM_F) $(TEST_OBJS) $(TEST_BIN)
+endif
 

From 6de2d472d98baa215264a776f3d5291780a6a085 Mon Sep 17 00:00:00 2001
From: Marat Dukhan <maratek@gmail.com>
Date: Sun, 15 Jun 2014 08:44:31 -0400
Subject: [PATCH 03/19] CGEMM and ZGEMM kernels for PNaCl

---
 config/pnacl/bli_kernel.h           |  23 +++-
 kernels/nacl/pnacl/3/bli_gemm_opt.c | 200 +++++++++++++++++++++++++++-
 2 files changed, 214 insertions(+), 9 deletions(-)

diff --git a/config/pnacl/bli_kernel.h b/config/pnacl/bli_kernel.h
index d1d29185c..678fddd33 100644
--- a/config/pnacl/bli_kernel.h
+++ b/config/pnacl/bli_kernel.h
@@ -74,13 +74,19 @@
 #define BLIS_DEFAULT_KC_D              120
 #define BLIS_DEFAULT_NC_D              8400
 
+#if PPAPI_RELEASE >= 36
 #define BLIS_DEFAULT_MC_C              128
 #define BLIS_DEFAULT_KC_C              256
 #define BLIS_DEFAULT_NC_C              4096
+#else
+#define BLIS_DEFAULT_MC_C              120
+#define BLIS_DEFAULT_KC_C              264
+#define BLIS_DEFAULT_NC_C              4092
+#endif
 
-#define BLIS_DEFAULT_MC_Z              64
-#define BLIS_DEFAULT_KC_Z              256
-#define BLIS_DEFAULT_NC_Z              2048
+#define BLIS_DEFAULT_MC_Z              60
+#define BLIS_DEFAULT_KC_Z              264
+#define BLIS_DEFAULT_NC_Z              2040
 
 // -- Register blocksizes --
 
@@ -92,15 +98,19 @@
 #define BLIS_DEFAULT_NR_S              3 
 #endif
 
-/* So far, PNaCl does not support double-precision SIMD */
 #define BLIS_DEFAULT_MR_D              4
 #define BLIS_DEFAULT_NR_D              3
 
+#if PPAPI_RELEASE >= 36
 #define BLIS_DEFAULT_MR_C              4
 #define BLIS_DEFAULT_NR_C              4
+#else
+#define BLIS_DEFAULT_MR_C              2
+#define BLIS_DEFAULT_NR_C              3
+#endif
 
-#define BLIS_DEFAULT_MR_Z              4
-#define BLIS_DEFAULT_NR_Z              4
+#define BLIS_DEFAULT_MR_Z              2
+#define BLIS_DEFAULT_NR_Z              3
 
 // NOTE: If the micro-kernel, which is typically unrolled to a factor
 // of f, handles leftover edge cases (ie: when k % f > 0) then these
@@ -171,6 +181,7 @@
 
 #if PPAPI_RELEASE >= 36
 #define BLIS_SGEMM_UKERNEL         bli_sgemm_opt_8x4
+#define BLIS_CGEMM_UKERNEL         bli_cgemm_opt_4x4
 #endif
 
 // -- trsm-related --
diff --git a/kernels/nacl/pnacl/3/bli_gemm_opt.c b/kernels/nacl/pnacl/3/bli_gemm_opt.c
index 8261cf0dd..36f719a14 100644
--- a/kernels/nacl/pnacl/3/bli_gemm_opt.c
+++ b/kernels/nacl/pnacl/3/bli_gemm_opt.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2012, The University of Texas
+   Copyright (C) 2014, The University of Texas
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -45,16 +45,24 @@ inline v4sf v4sf_load(const float* a) {
 	return *((const v4sf*)a);
 }
 
+inline v4sf v4sf_cload(const scomplex* a) {
+	return *((const v4sf*)a);
+}
+
 inline void v4sf_store(float* a, v4sf x) {
 	*((v4sf*)a) = x;
 }
 
+inline void v4sf_cstore(scomplex* a, v4sf x) {
+	*((v4sf*)a) = x;
+}
+
 inline v4sf v4sf_zero() {
 	return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
 }
 #endif
 
-#if PPAPI_RELEASE >= 36 /* 8x4 blocks, SIMD enabled */
+#if PPAPI_RELEASE >= 36
 void bli_sgemm_opt_8x4(
 	dim_t           k,
 	float *restrict alpha,
@@ -194,7 +202,7 @@ void bli_sgemm_opt_8x4(
 		c[7*rs_c + 3*cs_c] = cv3b[3];
 	}
 }
-#else /* PPAPI_RELEASE < 36 case: 4x4 blocks, no SIMD */
+#else
 void bli_sgemm_opt_4x4(
 	dim_t           k,
 	float *restrict alpha,
@@ -244,6 +252,191 @@ void bli_dgemm_opt_4x4(
 		data);
 }
 
+
+#if PPAPI_RELEASE >= 36
+void bli_cgemm_opt_4x4(
+	dim_t              k,
+	scomplex *restrict alpha,
+	scomplex *restrict a,
+	scomplex *restrict b,
+	scomplex *restrict beta,
+	scomplex *restrict c,
+	inc_t              rs_c,
+	inc_t              cs_c,
+	auxinfo_t*         data)
+{
+	// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
+	v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero();
+	v4sf abv0i = v4sf_zero(), abv1i = v4sf_zero(), abv2i = v4sf_zero(), abv3i = v4sf_zero();
+	for (dim_t i = 0; i < k; i += 1) {
+		const v4sf avt = v4sf_cload(a);
+		const v4sf avb = v4sf_cload(a+2);
+		const v4sf avr = __builtin_shufflevector(avt, avb, 0, 2, 4, 6);
+		const v4sf avi = __builtin_shufflevector(avt, avb, 1, 3, 5, 7);
+
+		const v4sf bv0r = v4sf_splat(b[0].real);
+		const v4sf bv0i = v4sf_splat(b[0].imag);
+		abv0r += avr * bv0r - avi * bv0i;
+		abv0i += avr * bv0i + avi * bv0r;
+
+
+		const v4sf bv1r = v4sf_splat(b[1].real);
+		const v4sf bv1i = v4sf_splat(b[1].imag);
+		abv1r += avr * bv1r - avi * bv1i;
+		abv1i += avr * bv1i + avi * bv1r;
+
+		const v4sf bv2r = v4sf_splat(b[2].real);
+		const v4sf bv2i = v4sf_splat(b[2].imag);
+		abv2r += avr * bv2r - avi * bv2i;
+		abv2i += avr * bv2i + avi * bv2r;
+
+		const v4sf bv3r = v4sf_splat(b[3].real);
+		const v4sf bv3i = v4sf_splat(b[3].imag);
+		abv3r += avr * bv3r - avi * bv3i;
+		abv3i += avr * bv3i + avi * bv3r;
+
+		a += 4;
+		b += 4;
+	}
+
+	const v4sf alphavr = v4sf_splat(alpha->real);
+	const v4sf alphavi = v4sf_splat(alpha->imag);
+	v4sf temp;
+
+	temp  = abv0r * alphavr - abv0i * alphavi;
+	abv0i = abv0r * alphavi + abv0i * alphavr;
+	abv0r = temp;
+
+	temp  = abv1r * alphavr - abv1i * alphavi;
+	abv1i = abv1r * alphavi + abv1i * alphavr;
+	abv1r = temp;
+
+	temp  = abv2r * alphavr - abv2i * alphavi;
+	abv2i = abv2r * alphavi + abv2i * alphavr;
+	abv2r = temp;
+
+	temp  = abv3r * alphavr - abv3i * alphavi;
+	abv3i = abv3r * alphavi + abv3i * alphavr;
+	abv3r = temp;
+
+	if (rs_c == 1) {
+		const v4sf cv0t = v4sf_cload(&c[0*rs_c + 0*cs_c]);
+		const v4sf cv1t = v4sf_cload(&c[0*rs_c + 1*cs_c]);
+		const v4sf cv2t = v4sf_cload(&c[0*rs_c + 2*cs_c]);
+		const v4sf cv3t = v4sf_cload(&c[0*rs_c + 3*cs_c]);
+		const v4sf cv0b = v4sf_cload(&c[2*rs_c + 0*cs_c]);
+		const v4sf cv1b = v4sf_cload(&c[2*rs_c + 1*cs_c]);
+		const v4sf cv2b = v4sf_cload(&c[2*rs_c + 2*cs_c]);
+		const v4sf cv3b = v4sf_cload(&c[2*rs_c + 3*cs_c]);
+
+		v4sf cv0r = __builtin_shufflevector(cv0t, cv0b, 0, 2, 4, 6);
+		v4sf cv0i = __builtin_shufflevector(cv0t, cv0b, 1, 3, 5, 7);
+		v4sf cv1r = __builtin_shufflevector(cv1t, cv1b, 0, 2, 4, 6);
+		v4sf cv1i = __builtin_shufflevector(cv1t, cv1b, 1, 3, 5, 7);
+		v4sf cv2r = __builtin_shufflevector(cv2t, cv2b, 0, 2, 4, 6);
+		v4sf cv2i = __builtin_shufflevector(cv2t, cv2b, 1, 3, 5, 7);
+		v4sf cv3r = __builtin_shufflevector(cv3t, cv3b, 0, 2, 4, 6);
+		v4sf cv3i = __builtin_shufflevector(cv3t, cv3b, 1, 3, 5, 7);
+
+		const v4sf betavr = v4sf_splat(beta->real);
+		const v4sf betavi = v4sf_splat(beta->imag);
+		
+		temp = abv0r + cv0r * betavr - cv0i * betavi;
+		cv0i = abv0i + cv0r * betavi + cv0i * betavr;
+		cv0r = temp;
+
+		temp = abv1r + cv1r * betavr - cv1i * betavi;
+		cv1i = abv1i + cv1r * betavi + cv1i * betavr;
+		cv1r = temp;
+
+		temp = abv2r + cv2r * betavr - cv2i * betavi;
+		cv2i = abv2i + cv2r * betavi + cv2i * betavr;
+		cv2r = temp;
+
+		temp = abv3r + cv3r * betavr - cv3i * betavi;
+		cv3i = abv3i + cv3r * betavi + cv3i * betavr;
+		cv3r = temp;
+
+		v4sf_cstore(&c[0*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 0, 4, 1, 5));
+		v4sf_cstore(&c[2*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 2, 6, 3, 7));
+		v4sf_cstore(&c[0*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 0, 4, 1, 5));
+		v4sf_cstore(&c[2*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 2, 6, 3, 7));
+		v4sf_cstore(&c[0*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 0, 4, 1, 5));
+		v4sf_cstore(&c[2*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 2, 6, 3, 7));
+		v4sf_cstore(&c[0*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 0, 4, 1, 5));
+		v4sf_cstore(&c[2*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 2, 6, 3, 7));
+	} else {
+		// Load columns 0, 1, 2, 3 (real part)
+		v4sf cv0r = (v4sf){ c[0*rs_c + 0*cs_c].real, c[1*rs_c + 0*cs_c].real, c[2*rs_c + 0*cs_c].real, c[3*rs_c + 0*cs_c].real };
+		v4sf cv1r = (v4sf){ c[0*rs_c + 1*cs_c].real, c[1*rs_c + 1*cs_c].real, c[2*rs_c + 1*cs_c].real, c[3*rs_c + 1*cs_c].real };
+		v4sf cv2r = (v4sf){ c[0*rs_c + 2*cs_c].real, c[1*rs_c + 2*cs_c].real, c[2*rs_c + 2*cs_c].real, c[3*rs_c + 2*cs_c].real };
+		v4sf cv3r = (v4sf){ c[0*rs_c + 3*cs_c].real, c[1*rs_c + 3*cs_c].real, c[2*rs_c + 3*cs_c].real, c[3*rs_c + 3*cs_c].real };
+		// Load columns 0, 1, 2, 3 (imaginary part)
+		v4sf cv0i = (v4sf){ c[0*rs_c + 0*cs_c].imag, c[1*rs_c + 0*cs_c].imag, c[2*rs_c + 0*cs_c].imag, c[3*rs_c + 0*cs_c].imag };
+		v4sf cv1i = (v4sf){ c[0*rs_c + 1*cs_c].imag, c[1*rs_c + 1*cs_c].imag, c[2*rs_c + 1*cs_c].imag, c[3*rs_c + 1*cs_c].imag };
+		v4sf cv2i = (v4sf){ c[0*rs_c + 2*cs_c].imag, c[1*rs_c + 2*cs_c].imag, c[2*rs_c + 2*cs_c].imag, c[3*rs_c + 2*cs_c].imag };
+		v4sf cv3i = (v4sf){ c[0*rs_c + 3*cs_c].imag, c[1*rs_c + 3*cs_c].imag, c[2*rs_c + 3*cs_c].imag, c[3*rs_c + 3*cs_c].imag };
+
+		const v4sf betavr = v4sf_splat(beta->real);
+		const v4sf betavi = v4sf_splat(beta->imag);
+		temp = abv0r + cv0r * betavr - cv0i * betavi;
+		cv0i = abv0i + cv0r * betavi + cv0i * betavr;
+		cv0r = temp;
+
+		temp = abv1r + cv1r * betavr - cv1i * betavi;
+		cv1i = abv1i + cv1r * betavi + cv1i * betavr;
+		cv1r = temp;
+
+		temp = abv2r + cv2r * betavr - cv2i * betavi;
+		cv2i = abv2i + cv2r * betavi + cv2i * betavr;
+		cv2r = temp;
+
+		temp = abv3r + cv3r * betavr - cv3i * betavi;
+		cv3i = abv3i + cv3r * betavi + cv3i * betavr;
+		cv3r = temp;
+
+		// Store column 0
+		c[0*rs_c + 0*cs_c].real = cv0r[0];
+		c[0*rs_c + 0*cs_c].imag = cv0i[0];
+		c[1*rs_c + 0*cs_c].real = cv0r[1];
+		c[1*rs_c + 0*cs_c].imag = cv0i[1];
+		c[2*rs_c + 0*cs_c].real = cv0r[2];
+		c[2*rs_c + 0*cs_c].imag = cv0i[2];
+		c[3*rs_c + 0*cs_c].real = cv0r[3];
+		c[3*rs_c + 0*cs_c].imag = cv0i[3];
+
+		// Store column 1
+		c[0*rs_c + 1*cs_c].real = cv1r[0];
+		c[0*rs_c + 1*cs_c].imag = cv1i[0];
+		c[1*rs_c + 1*cs_c].real = cv1r[1];
+		c[1*rs_c + 1*cs_c].imag = cv1i[1];
+		c[2*rs_c + 1*cs_c].real = cv1r[2];
+		c[2*rs_c + 1*cs_c].imag = cv1i[2];
+		c[3*rs_c + 1*cs_c].real = cv1r[3];
+		c[3*rs_c + 1*cs_c].imag = cv1i[3];
+
+		// Store column 2
+		c[0*rs_c + 2*cs_c].real = cv2r[0];
+		c[0*rs_c + 2*cs_c].imag = cv2i[0];
+		c[1*rs_c + 2*cs_c].real = cv2r[1];
+		c[1*rs_c + 2*cs_c].imag = cv2i[1];
+		c[2*rs_c + 2*cs_c].real = cv2r[2];
+		c[2*rs_c + 2*cs_c].imag = cv2i[2];
+		c[3*rs_c + 2*cs_c].real = cv2r[3];
+		c[3*rs_c + 2*cs_c].imag = cv2i[3];
+
+		// Store column 3
+		c[0*rs_c + 3*cs_c].real = cv3r[0];
+		c[0*rs_c + 3*cs_c].imag = cv3i[0];
+		c[1*rs_c + 3*cs_c].real = cv3r[1];
+		c[1*rs_c + 3*cs_c].imag = cv3i[1];
+		c[2*rs_c + 3*cs_c].real = cv3r[2];
+		c[2*rs_c + 3*cs_c].imag = cv3i[2];
+		c[3*rs_c + 3*cs_c].real = cv3r[3];
+		c[3*rs_c + 3*cs_c].imag = cv3i[3];
+	}
+}
+#else
 void bli_cgemm_opt_4x4(
 	dim_t              k,
 	scomplex *restrict alpha,
@@ -267,6 +460,7 @@ void bli_cgemm_opt_4x4(
 		cs_c,
 		data);
 }
+#endif
 
 void bli_zgemm_opt_4x4(
 	dim_t              k,

From b2ffb4de8b6872cb23537ad282e557d11dcd9c8b Mon Sep 17 00:00:00 2001
From: Marat Dukhan <maratek@gmail.com>
Date: Sun, 15 Jun 2014 18:41:30 -0400
Subject: [PATCH 04/19] Reformatted PNaCl GEMM kernels

---
 kernels/nacl/pnacl/3/bli_gemm_opt.c | 110 ++++++++++++++--------------
 1 file changed, 55 insertions(+), 55 deletions(-)

diff --git a/kernels/nacl/pnacl/3/bli_gemm_opt.c b/kernels/nacl/pnacl/3/bli_gemm_opt.c
index 36f719a14..2dca6ae45 100644
--- a/kernels/nacl/pnacl/3/bli_gemm_opt.c
+++ b/kernels/nacl/pnacl/3/bli_gemm_opt.c
@@ -64,15 +64,15 @@ inline v4sf v4sf_zero() {
 
 #if PPAPI_RELEASE >= 36
 void bli_sgemm_opt_8x4(
-	dim_t           k,
-	float *restrict alpha,
-	float *restrict a,
-	float *restrict b,
-	float *restrict beta,
-	float *restrict c,
-	inc_t           rs_c,
-	inc_t           cs_c,
-	auxinfo_t*      data)
+	dim_t      k,
+	float      alpha[restrict static 1],
+	float      a[restrict static 8*k],
+	float      b[restrict static k*4],
+	float      beta[restrict static 1],
+	float      c[restrict static 8*4],
+	inc_t      rs_c,
+	inc_t      cs_c,
+	auxinfo_t* data)
 {
 	// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
 	v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero();
@@ -204,15 +204,15 @@ void bli_sgemm_opt_8x4(
 }
 #else
 void bli_sgemm_opt_4x4(
-	dim_t           k,
-	float *restrict alpha,
-	float *restrict a,
-	float *restrict b,
-	float *restrict beta,
-	float *restrict c,
-	inc_t           rs_c,
-	inc_t           cs_c,
-	auxinfo_t*      data)
+	dim_t      k,
+	float      alpha[restrict static 1],
+	float      a[restrict static 4*k],
+	float      b[restrict static k*4],
+	float      beta[restrict static 1],
+	float      c[restrict static 4*4],
+	inc_t      rs_c,
+	inc_t      cs_c,
+	auxinfo_t* data)
 {
 	/* Just call the reference implementation. */
 	BLIS_SGEMM_UKERNEL_REF(
@@ -229,15 +229,15 @@ void bli_sgemm_opt_4x4(
 #endif
 
 void bli_dgemm_opt_4x4(
-	dim_t            k,
-	double *restrict alpha,
-	double *restrict a,
-	double *restrict b,
-	double *restrict beta,
-	double *restrict c,
-	inc_t            rs_c,
-	inc_t            cs_c,
-	auxinfo_t*       data)
+	dim_t      k,
+	double     alpha[restrict static 1],
+	double     a[restrict static 4*k],
+	double     b[restrict static k*4],
+	double     beta[restrict static 1],
+	double     c[restrict static 4*4],
+	inc_t      rs_c,
+	inc_t      cs_c,
+	auxinfo_t* data)
 {
 	/* Just call the reference implementation. */
 	BLIS_DGEMM_UKERNEL_REF(
@@ -255,15 +255,15 @@ void bli_dgemm_opt_4x4(
 
 #if PPAPI_RELEASE >= 36
 void bli_cgemm_opt_4x4(
-	dim_t              k,
-	scomplex *restrict alpha,
-	scomplex *restrict a,
-	scomplex *restrict b,
-	scomplex *restrict beta,
-	scomplex *restrict c,
-	inc_t              rs_c,
-	inc_t              cs_c,
-	auxinfo_t*         data)
+	dim_t      k,
+	scomplex   alpha[restrict static 1],
+	scomplex   a[restrict static 4*k],
+	scomplex   b[restrict static k*4],
+	scomplex   beta[restrict static 1],
+	scomplex   c[restrict static 4*4],
+	inc_t      rs_c,
+	inc_t      cs_c,
+	auxinfo_t* data)
 {
 	// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
 	v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero();
@@ -279,7 +279,6 @@ void bli_cgemm_opt_4x4(
 		abv0r += avr * bv0r - avi * bv0i;
 		abv0i += avr * bv0i + avi * bv0r;
 
-
 		const v4sf bv1r = v4sf_splat(b[1].real);
 		const v4sf bv1i = v4sf_splat(b[1].imag);
 		abv1r += avr * bv1r - avi * bv1i;
@@ -379,6 +378,7 @@ void bli_cgemm_opt_4x4(
 
 		const v4sf betavr = v4sf_splat(beta->real);
 		const v4sf betavi = v4sf_splat(beta->imag);
+
 		temp = abv0r + cv0r * betavr - cv0i * betavi;
 		cv0i = abv0i + cv0r * betavi + cv0i * betavr;
 		cv0r = temp;
@@ -438,15 +438,15 @@ void bli_cgemm_opt_4x4(
 }
 #else
 void bli_cgemm_opt_4x4(
-	dim_t              k,
-	scomplex *restrict alpha,
-	scomplex *restrict a,
-	scomplex *restrict b,
-	scomplex *restrict beta,
-	scomplex *restrict c,
-	inc_t              rs_c,
-	inc_t              cs_c,
-	auxinfo_t*         data)
+	dim_t      k,
+	scomplex   alpha[restrict static 1],
+	scomplex   a[restrict static 4*k],
+	scomplex   b[restrict static k*4],
+	scomplex   beta[restrict static 1],
+	scomplex   c[restrict static 4*4],
+	inc_t      rs_c,
+	inc_t      cs_c,
+	auxinfo_t* data)
 {
 	/* Just call the reference implementation. */
 	BLIS_CGEMM_UKERNEL_REF(
@@ -463,15 +463,15 @@ void bli_cgemm_opt_4x4(
 #endif
 
 void bli_zgemm_opt_4x4(
-	dim_t              k,
-	dcomplex *restrict alpha,
-	dcomplex *restrict a,
-	dcomplex *restrict b,
-	dcomplex *restrict beta,
-	dcomplex *restrict c,
-	inc_t              rs_c,
-	inc_t              cs_c,
-	auxinfo_t*         data)
+	dim_t      k,
+	dcomplex   alpha[restrict static 1],
+	dcomplex   a[restrict static 4*k],
+	dcomplex   b[restrict static k*4],
+	dcomplex   beta[restrict static 1],
+	dcomplex   c[restrict static 4*4],
+	inc_t      rs_c,
+	inc_t      cs_c,
+	auxinfo_t* data)
 {
 	/* Just call the reference implementation. */
 	BLIS_ZGEMM_UKERNEL_REF(

From 6f8462eb0ec278b89731e73ef583386a3371d095 Mon Sep 17 00:00:00 2001
From: Marat Dukhan <maratek@gmail.com>
Date: Wed, 18 Jun 2014 03:08:46 -0700
Subject: [PATCH 05/19] Fix inconsistent VERBOSE macro in Makefile

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index c6636932a..ca1d50fd5 100644
--- a/Makefile
+++ b/Makefile
@@ -469,7 +469,7 @@ endif
 blis-lib: check-env $(MK_LIBS)
 
 $(MK_ALL_BLIS_LIB): $(MK_ALL_BLIS_OBJS)
-ifeq ($(FLA_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
+ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
 	$(AR) $(ARFLAGS) $@ $?
 	$(RANLIB) $@
 else

From 68a02976e3c3638f0a9821342e269a1743e3ace3 Mon Sep 17 00:00:00 2001
From: Marat Dukhan <maratek@gmail.com>
Date: Wed, 18 Jun 2014 03:10:25 -0700
Subject: [PATCH 06/19] Compile pnacl configuration in GNU11 mode to avoid
 warning about non-standard features

---
 config/pnacl/make_defs.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk
index 540531203..b709441dc 100644
--- a/config/pnacl/make_defs.mk
+++ b/config/pnacl/make_defs.mk
@@ -80,7 +80,7 @@ CC             := pnacl-clang
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
-CMISCFLAGS     := -std=c99 -I$(NACL_SDK_ROOT)/include
+CMISCFLAGS     := -std=gnu11 -I$(NACL_SDK_ROOT)/include
 CDBGFLAGS      := -g
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -O3

From 031deb2a5c718d569bde842590a791b812f4cf1d Mon Sep 17 00:00:00 2001
From: Marat Dukhan <maratek@gmail.com>
Date: Wed, 18 Jun 2014 03:11:34 -0700
Subject: [PATCH 07/19] PNaCl configuration: use pnacl-ar instead or ar (fixes
 build issue on Mac)

---
 config/pnacl/make_defs.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk
index b709441dc..1a2dd8a18 100644
--- a/config/pnacl/make_defs.mk
+++ b/config/pnacl/make_defs.mk
@@ -95,7 +95,7 @@ CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCF
 CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
-AR             := ar
+AR             := pnacl-ar
 ARFLAGS        := cru
 
 # --- Determine the linker and related flags ---

From 4b8e71aab80182873a2e138eb07902b8d8fd5480 Mon Sep 17 00:00:00 2001
From: Marat Dukhan <maratek@gmail.com>
Date: Thu, 19 Jun 2014 00:43:25 -0700
Subject: [PATCH 08/19] Use AR rcs flags for PNaCl target to avoid warning

---
 config/pnacl/make_defs.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk
index 1a2dd8a18..1c4566012 100644
--- a/config/pnacl/make_defs.mk
+++ b/config/pnacl/make_defs.mk
@@ -96,7 +96,7 @@ CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCF
 
 # --- Determine the archiver and related flags ---
 AR             := pnacl-ar
-ARFLAGS        := cru
+ARFLAGS        := rcs
 
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)

From 491be4f91ed725522f5cc7184053857c6c376ada Mon Sep 17 00:00:00 2001
From: Marat Dukhan <maratek@gmail.com>
Date: Thu, 19 Jun 2014 00:45:44 -0700
Subject: [PATCH 09/19] Optimized dot product kernels for PNaCl

---
 config/pnacl/bli_kernel.h           |   3 +
 kernels/nacl/pnacl/1/bli_dotv_opt.c | 617 ++++++++++++++++++++++++++++
 2 files changed, 620 insertions(+)
 create mode 100644 kernels/nacl/pnacl/1/bli_dotv_opt.c

diff --git a/config/pnacl/bli_kernel.h b/config/pnacl/bli_kernel.h
index 678fddd33..728f08f12 100644
--- a/config/pnacl/bli_kernel.h
+++ b/config/pnacl/bli_kernel.h
@@ -222,6 +222,9 @@
 // -- copyv --
 
 // -- dotv --
+#if PPAPI_RELEASE >= 36
+#define BLIS_SDOTV_KERNEL          bli_sdotv_opt
+#endif
 
 // -- dotxv --
 
diff --git a/kernels/nacl/pnacl/1/bli_dotv_opt.c b/kernels/nacl/pnacl/1/bli_dotv_opt.c
new file mode 100644
index 000000000..14d203bd6
--- /dev/null
+++ b/kernels/nacl/pnacl/1/bli_dotv_opt.c
@@ -0,0 +1,617 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#if PPAPI_RELEASE >= 36
+typedef float v4sf __attribute__ ((vector_size(16)));
+
+inline v4sf v4sf_splat(float x) {
+        return (v4sf) { x, x, x, x };
+}
+
+inline v4sf v4sf_load(const float* a) {
+        return *((const v4sf*)a);
+}
+
+inline v4sf v4sf_cload(const scomplex* a) {
+        return *((const v4sf*)a);
+}
+
+inline void v4sf_store(float* a, v4sf x) {
+        *((v4sf*)a) = x;
+}
+
+inline void v4sf_cstore(scomplex* a, v4sf x) {
+        *((v4sf*)a) = x;
+}
+
+inline v4sf v4sf_zero() {
+        return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
+}
+#endif
+
+void bli_sdotv_opt(
+	conj_t conjx,
+	conj_t conjy,
+	dim_t  n,
+	float  x[restrict static n],
+	inc_t  incx,
+	float  y[restrict static n],
+	inc_t  incy,
+	float  rho[restrict static 1])
+{
+#if PPAPI_RELEASE >= 36
+	// If the vector lengths are zero, set rho to zero and return.
+	if (bli_zero_dim1(n)) {
+		*rho = 0.0f;
+		return;
+	}
+	
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
+	if (bli_has_nonunit_inc2(incx, incy)) {
+		float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f, sum5 = 0.0f;
+		while (n >= 6) {
+			sum0 += (*x) * (*y);
+			x += incx;
+			y += incy;
+
+			sum1 += (*x) * (*y);
+			x += incx;
+			y += incy;
+
+			sum2 += (*x) * (*y);
+			x += incx;
+			y += incy;
+
+			sum3 += (*x) * (*y);
+			x += incx;
+			y += incy;
+
+			sum4 += (*x) * (*y);
+			x += incx;
+			y += incy;
+
+			sum5 += (*x) * (*y);
+			x += incx;
+			y += incy;
+
+			n -= 6;
+		}
+		float sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5);
+		while (n--) {
+			sum += (*x) * (*y);
+			x += incx;
+			y += incy;
+		}
+		*rho = sum;
+	} else {
+		v4sf vsum0 = v4sf_zero(), vsum1 = v4sf_zero(), vsum2 = v4sf_zero();
+		v4sf vsum3 = v4sf_zero(), vsum4 = v4sf_zero(), vsum5 = v4sf_zero();
+		while (n >= 24) {
+			vsum0 += v4sf_load(x) * v4sf_load(y);
+			vsum1 += v4sf_load(x+4) * v4sf_load(y+4);
+			vsum2 += v4sf_load(x+8) * v4sf_load(y+8);
+			vsum3 += v4sf_load(x+12) * v4sf_load(y+12);
+			vsum4 += v4sf_load(x+16) * v4sf_load(y+16);
+			vsum5 += v4sf_load(x+20) * v4sf_load(y+20);
+
+			x += 24;
+			y += 24;
+			n -= 24;
+		}
+		v4sf vsum = (vsum0 + vsum1 + vsum2) + (vsum3 + vsum4 + vsum5);
+		while (n >= 4) {
+			vsum += v4sf_load(x) * v4sf_load(y);
+
+			x += 4;
+			y += 4;
+			n -= 4;
+		}
+		float sum = (vsum[0] + vsum[1]) + (vsum[2] + vsum[3]);
+		while (n--) {
+			sum += (*x++) * (*y++);
+		}
+		*rho = sum;
+	}
+#else
+	float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f, sum5 = 0.0f;
+	while (n >= 6) {
+		sum0 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum1 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum2 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum3 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum4 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum5 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		n -= 6;
+	}
+	float sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5);
+	while (n--) {
+		sum += (*x) * (*y);
+		x += incx;
+		y += incy;
+	}
+	*rho = sum;
+#endif
+}
+
+void bli_ddotv_opt_var1(
+	conj_t conjx,
+	conj_t conjy,
+	dim_t  n,
+	double x[restrict static n],
+	inc_t  incx,
+	double y[restrict static n],
+	inc_t  incy,
+	double rho[restrict static 1])
+{
+	double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0;
+	while (n >= 6) {
+		sum0 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum1 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum2 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum3 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum4 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum5 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		n -= 6;
+	}
+	double sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5);
+	while (n--) {
+		sum += (*x) * (*y);
+		x += incx;
+		y += incy;
+	}
+	*rho = sum;
+}
+
+void bli_cdotv_opt_var1(
+	conj_t   conjx,
+	conj_t   conjy,
+	dim_t    n,
+	scomplex x[restrict static n],
+	inc_t    incx,
+	scomplex y[restrict static n],
+	inc_t    incy,
+	scomplex rho[restrict static 1])
+{
+	if (bli_is_conj(conjy)) {
+		bli_toggle_conj(conjx);
+	}
+
+	if (bli_zero_dim1(n)) {
+		rho->real = 0.0f;
+		rho->imag = 0.0f;
+		return;
+	}
+
+	float sumr;
+	float sumi;
+#if PPAPI_RELEASE >= 36
+	if (bli_is_noconj(conjx)) {
+		if (bli_has_nonunit_inc2(incx, incy)) {
+			float sum0r = 0.0f, sum1r = 0.0f;
+			float sum0i = 0.0f, sum1i = 0.0f;
+			while (n >= 2) {
+				const float x0r = x->real;
+				const float x0i = x->imag;
+				const float y0r = y->real;
+				const float y0i = y->imag;
+
+				sum0r += x0r * y0r - x0i * y0i;
+				sum0i += x0r * y0i + x0i * y0r;
+
+				x += incx;
+				y += incy;
+
+				const float x1r = x->real;
+				const float x1i = x->imag;
+				const float y1r = y->real;
+				const float y1i = y->imag;
+
+				sum1r += x1r * y1r - x1i * y1i;
+				sum1i += x1r * y1i + x1i * y1r;
+
+				x += incx;
+				y += incy;
+
+				n -= 2;
+			}
+			sumr = sum0r + sum1r;
+			sumi = sum0i + sum1i;
+		} else {
+			v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero();
+			v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero();
+			while (n >= 8) {
+				const v4sf xv0t = v4sf_cload(x);
+				const v4sf xv0b = v4sf_cload(x+2);
+				const v4sf yv0t = v4sf_cload(y);
+				const v4sf yv0b = v4sf_cload(y+2);
+
+				const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6);
+				const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7);
+				const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6);
+				const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7);
+
+				sumv0r += xv0r * yv0r - xv0i * yv0i;
+				sumv0i += xv0r * yv0i + xv0i * yv0r;
+
+				const v4sf xv1t = v4sf_cload(x+4);
+				const v4sf xv1b = v4sf_cload(x+6);
+				const v4sf yv1t = v4sf_cload(y+4);
+				const v4sf yv1b = v4sf_cload(y+6);
+
+				const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6);
+				const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7);
+				const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6);
+				const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7);
+
+				sumv1r += xv1r * yv1r - xv1i * yv1i;
+				sumv1i += xv1r * yv1i + xv1i * yv1r;
+
+				x += 8;
+				y += 8;
+
+				n -= 8;
+			}
+			const v4sf sumvr = sumv0r + sumv1r;
+			const v4sf sumvi = sumv0i + sumv1i;
+			sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]);
+			sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]);
+		}
+		while (n--) {
+			const float xr = x->real;
+			const float xi = x->imag;
+			const float yr = y->real;
+			const float yi = y->imag;
+
+			sumr += xr * yr - xi * yi;
+			sumi += xr * yi + xi * yr;
+
+			x += incx;
+			y += incy;
+		}
+	} else {
+		if (bli_has_nonunit_inc2(incx, incy)) {
+			float sum0r = 0.0f, sum1r = 0.0f;
+			float sum0i = 0.0f, sum1i = 0.0f;
+			while (n >= 2) {
+				const float x0r = x->real;
+				const float x0i = x->imag;
+				const float y0r = y->real;
+				const float y0i = y->imag;
+
+				sum0r += x0r * y0r + x0i * y0i;
+				sum0i += x0r * y0i - x0i * y0r;
+
+				x += incx;
+				y += incy;
+
+				const float x1r = x->real;
+				const float x1i = x->imag;
+				const float y1r = y->real;
+				const float y1i = y->imag;
+
+				sum1r += x1r * y1r + x1i * y1i;
+				sum1i += x1r * y1i - x1i * y1r;
+
+				x += incx;
+				y += incy;
+
+				n -= 2;
+			}
+			sumr = sum0r + sum1r;
+			sumi = sum0i + sum1i;
+		} else {
+			v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero();
+			v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero();
+			while (n >= 8) {
+				const v4sf xv0t = v4sf_cload(x);
+				const v4sf xv0b = v4sf_cload(x+2);
+				const v4sf yv0t = v4sf_cload(y);
+				const v4sf yv0b = v4sf_cload(y+2);
+
+				const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6);
+				const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7);
+				const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6);
+				const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7);
+
+				sumv0r += xv0r * yv0r + xv0i * yv0i;
+				sumv0i += xv0r * yv0i - xv0i * yv0r;
+
+				const v4sf xv1t = v4sf_cload(x+4);
+				const v4sf xv1b = v4sf_cload(x+6);
+				const v4sf yv1t = v4sf_cload(y+4);
+				const v4sf yv1b = v4sf_cload(y+6);
+
+				const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6);
+				const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7);
+				const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6);
+				const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7);
+
+				sumv1r += xv1r * yv1r + xv1i * yv1i;
+				sumv1i += xv1r * yv1i - xv1i * yv1r;
+
+				x += 8;
+				y += 8;
+
+				n -= 8;
+			}
+			const v4sf sumvr = sumv0r + sumv1r;
+			const v4sf sumvi = sumv0i + sumv1i;
+			sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]);
+			sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]);
+		}
+		while (n--) {
+			const float xr = x->real;
+			const float xi = x->imag;
+			const float yr = y->real;
+			const float yi = y->imag;
+
+			sumr += xr * yr + xi * yi;
+			sumi += xr * yi - xi * yr;
+
+			x += incx;
+			y += incy;
+		}
+	}
+#else
+	if (bli_is_noconj(conjx)) {
+		float sum0r = 0.0f, sum1r = 0.0f;
+		float sum0i = 0.0f, sum1i = 0.0f;
+		while (n >= 2) {
+			const float x0r = x->real;
+			const float x0i = x->imag;
+			const float y0r = y->real;
+			const float y0i = y->imag;
+
+			sum0r += x0r * y0r - x0i * y0i;
+			sum0i += x0r * y0i + x0i * y0r;
+
+			x += incx;
+			y += incy;
+
+			const float x1r = x->real;
+			const float x1i = x->imag;
+			const float y1r = y->real;
+			const float y1i = y->imag;
+
+			sum1r += x1r * y1r - x1i * y1i;
+			sum1i += x1r * y1i + x1i * y1r;
+
+			x += incx;
+			y += incy;
+
+			n -= 2;
+		}
+		sumr = sum0r + sum1r;
+		sumi = sum0i + sum1i;
+		if (n != 0) {
+			const float xr = x->real;
+			const float xi = x->imag;
+			const float yr = y->real;
+			const float yi = y->imag;
+
+			sumr += xr * yr - xi * yi;
+			sumi += xr * yi + xi * yr;
+		}
+	} else {
+		float sum0r = 0.0f, sum1r = 0.0f;
+		float sum0i = 0.0f, sum1i = 0.0f;
+		while (n >= 2) {
+			const float x0r = x->real;
+			const float x0i = x->imag;
+			const float y0r = y->real;
+			const float y0i = y->imag;
+
+			sum0r += x0r * y0r + x0i * y0i;
+			sum0i += x0r * y0i - x0i * y0r;
+
+			x += incx;
+			y += incy;
+
+			const float x1r = x->real;
+			const float x1i = x->imag;
+			const float y1r = y->real;
+			const float y1i = y->imag;
+
+			sum1r += x1r * y1r + x1i * y1i;
+			sum1i += x1r * y1i - x1i * y1r;
+
+			x += incx;
+			y += incy;
+
+			n -= 2;
+		}
+		sumr = sum0r + sum1r;
+		sumi = sum0i + sum1i;
+		if (n != 0) {
+			const float xr = x->real;
+			const float xi = x->imag;
+			const float yr = y->real;
+			const float yi = y->imag;
+
+			sumr += xr * yr + xi * yi;
+			sumi += xr * yi - xi * yr;
+		}
+	}
+#endif
+
+	rho->real = sumr;
+	rho->imag = bli_is_conj(conjy) ? -sumi : sumi;
+}
+
+
+
+void bli_zdotv_opt_var1(
+	conj_t   conjx,
+	conj_t   conjy,
+	dim_t    n,
+	dcomplex x[restrict static n],
+	inc_t    incx,
+	dcomplex y[restrict static n],
+	inc_t    incy,
+	dcomplex rho[restrict static 1])
+{
+	if (bli_is_conj(conjy)) {
+		bli_toggle_conj(conjx);
+	}
+
+	if (bli_zero_dim1(n)) {
+		rho->real = 0.0;
+		rho->imag = 0.0;
+		return;
+	}
+
+	double sumr;
+	double sumi;
+	if (bli_is_noconj(conjx)) {
+		double sum0r = 0.0, sum1r = 0.0;
+		double sum0i = 0.0, sum1i = 0.0;
+		while (n >= 2) {
+			const double x0r = x->real;
+			const double x0i = x->imag;
+			const double y0r = y->real;
+			const double y0i = y->imag;
+
+			sum0r += x0r * y0r - x0i * y0i;
+			sum0i += x0r * y0i + x0i * y0r;
+
+			x += incx;
+			y += incy;
+
+			const double x1r = x->real;
+			const double x1i = x->imag;
+			const double y1r = y->real;
+			const double y1i = y->imag;
+
+			sum1r += x1r * y1r - x1i * y1i;
+			sum1i += x1r * y1i + x1i * y1r;
+
+			x += incx;
+			y += incy;
+
+			n -= 2;
+		}
+		sumr = sum0r + sum1r;
+		sumi = sum0i + sum1i;
+		if (n != 0) {
+			const double xr = x->real;
+			const double xi = x->imag;
+			const double yr = y->real;
+			const double yi = y->imag;
+
+			sumr += xr * yr - xi * yi;
+			sumi += xr * yi + xi * yr;
+		}
+	} else {
+		double sum0r = 0.0, sum1r = 0.0;
+		double sum0i = 0.0, sum1i = 0.0;
+		while (n >= 2) {
+			const double x0r = x->real;
+			const double x0i = x->imag;
+			const double y0r = y->real;
+			const double y0i = y->imag;
+
+			sum0r += x0r * y0r + x0i * y0i;
+			sum0i += x0r * y0i - x0i * y0r;
+
+			x += incx;
+			y += incy;
+
+			const double x1r = x->real;
+			const double x1i = x->imag;
+			const double y1r = y->real;
+			const double y1i = y->imag;
+
+			sum1r += x1r * y1r + x1i * y1i;
+			sum1i += x1r * y1i - x1i * y1r;
+
+			x += incx;
+			y += incy;
+
+			n -= 2;
+		}
+		sumr = sum0r + sum1r;
+		sumi = sum0i + sum1i;
+		if (n != 0) {
+			const double xr = x->real;
+			const double xi = x->imag;
+			const double yr = y->real;
+			const double yi = y->imag;
+
+			sumr += xr * yr + xi * yi;
+			sumi += xr * yi - xi * yr;
+		}
+	}
+
+	rho->real = sumr;
+	rho->imag = bli_is_conj(conjy) ? -sumi : sumi;
+}

From 020a831bc5f61744cb8354886aa679b99b1285f6 Mon Sep 17 00:00:00 2001
From: Marat Dukhan <maratek@gmail.com>
Date: Thu, 19 Jun 2014 00:58:26 -0700
Subject: [PATCH 10/19] Code clean-up in PNaCl port

---
 config/pnacl/bli_kernel.h           |   9 +-
 kernels/nacl/pnacl/1/bli_dotv_opt.c |   7 +-
 kernels/nacl/pnacl/3/bli_gemm_opt.c | 786 ++++++++++++----------------
 3 files changed, 351 insertions(+), 451 deletions(-)

diff --git a/config/pnacl/bli_kernel.h b/config/pnacl/bli_kernel.h
index 728f08f12..d3ad1e721 100644
--- a/config/pnacl/bli_kernel.h
+++ b/config/pnacl/bli_kernel.h
@@ -180,8 +180,8 @@
 // -- gemm --
 
 #if PPAPI_RELEASE >= 36
-#define BLIS_SGEMM_UKERNEL         bli_sgemm_opt_8x4
-#define BLIS_CGEMM_UKERNEL         bli_cgemm_opt_4x4
+#define BLIS_SGEMM_UKERNEL         bli_sgemm_opt
+#define BLIS_CGEMM_UKERNEL         bli_cgemm_opt
 #endif
 
 // -- trsm-related --
@@ -222,9 +222,10 @@
 // -- copyv --
 
 // -- dotv --
-#if PPAPI_RELEASE >= 36
 #define BLIS_SDOTV_KERNEL          bli_sdotv_opt
-#endif
+#define BLIS_DDOTV_KERNEL          bli_ddotv_opt
+#define BLIS_CDOTV_KERNEL          bli_cdotv_opt
+#define BLIS_ZDOTV_KERNEL          bli_zdotv_opt
 
 // -- dotxv --
 
diff --git a/kernels/nacl/pnacl/1/bli_dotv_opt.c b/kernels/nacl/pnacl/1/bli_dotv_opt.c
index 14d203bd6..9d6408245 100644
--- a/kernels/nacl/pnacl/1/bli_dotv_opt.c
+++ b/kernels/nacl/pnacl/1/bli_dotv_opt.c
@@ -185,7 +185,7 @@ void bli_sdotv_opt(
 #endif
 }
 
-void bli_ddotv_opt_var1(
+void bli_ddotv_opt(
 	conj_t conjx,
 	conj_t conjy,
 	dim_t  n,
@@ -232,7 +232,7 @@ void bli_ddotv_opt_var1(
 	*rho = sum;
 }
 
-void bli_cdotv_opt_var1(
+void bli_cdotv_opt(
 	conj_t   conjx,
 	conj_t   conjy,
 	dim_t    n,
@@ -510,7 +510,7 @@ void bli_cdotv_opt_var1(
 
 
 
-void bli_zdotv_opt_var1(
+void bli_zdotv_opt(
 	conj_t   conjx,
 	conj_t   conjy,
 	dim_t    n,
@@ -615,3 +615,4 @@ void bli_zdotv_opt_var1(
 	rho->real = sumr;
 	rho->imag = bli_is_conj(conjy) ? -sumi : sumi;
 }
+
diff --git a/kernels/nacl/pnacl/3/bli_gemm_opt.c b/kernels/nacl/pnacl/3/bli_gemm_opt.c
index 2dca6ae45..2c37397c7 100644
--- a/kernels/nacl/pnacl/3/bli_gemm_opt.c
+++ b/kernels/nacl/pnacl/3/bli_gemm_opt.c
@@ -35,454 +35,352 @@
 #include "blis.h"
 
 #if PPAPI_RELEASE >= 36
-typedef float v4sf __attribute__ ((vector_size(16)));
+	typedef float v4sf __attribute__ ((vector_size(16)));
 
-inline v4sf v4sf_splat(float x) {
-	return (v4sf) { x, x, x, x };
-}
+	inline v4sf v4sf_splat(float x) {
+		return (v4sf) { x, x, x, x };
+	}
 
-inline v4sf v4sf_load(const float* a) {
-	return *((const v4sf*)a);
-}
+	inline v4sf v4sf_load(const float* a) {
+		return *((const v4sf*)a);
+	}
 
-inline v4sf v4sf_cload(const scomplex* a) {
-	return *((const v4sf*)a);
-}
+	inline v4sf v4sf_cload(const scomplex* a) {
+		return *((const v4sf*)a);
+	}
 
-inline void v4sf_store(float* a, v4sf x) {
-	*((v4sf*)a) = x;
-}
+	inline void v4sf_store(float* a, v4sf x) {
+		*((v4sf*)a) = x;
+	}
 
-inline void v4sf_cstore(scomplex* a, v4sf x) {
-	*((v4sf*)a) = x;
-}
+	inline void v4sf_cstore(scomplex* a, v4sf x) {
+		*((v4sf*)a) = x;
+	}
 
-inline v4sf v4sf_zero() {
-	return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
-}
+	inline v4sf v4sf_zero() {
+		return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
+	}
+
+	void bli_sgemm_opt(
+		dim_t      k,
+		float      alpha[restrict static 1],
+		float      a[restrict static 8*k],
+		float      b[restrict static k*4],
+		float      beta[restrict static 1],
+		float      c[restrict static 8*4],
+		inc_t      rs_c,
+		inc_t      cs_c,
+		auxinfo_t* data)
+	{
+		// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
+		v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero();
+		v4sf abv0b = v4sf_zero(), abv1b = v4sf_zero(), abv2b = v4sf_zero(), abv3b = v4sf_zero();
+		for (dim_t i = 0; i < k; i += 1) {
+			const v4sf avt = v4sf_load(a);
+			const v4sf avb = v4sf_load(a+4);
+
+			const v4sf bv_xxxx = v4sf_splat(b[0]);
+			abv0t += avt * bv_xxxx;
+			abv0b += avb * bv_xxxx;
+
+			const v4sf bv_yyyy = v4sf_splat(b[1]);
+			abv1t += avt * bv_yyyy;
+			abv1b += avb * bv_yyyy;
+
+			const v4sf bv_zzzz = v4sf_splat(b[2]);
+			abv2t += avt * bv_zzzz;
+			abv2b += avb * bv_zzzz;
+
+			const v4sf bv_wwww = v4sf_splat(b[3]);
+			abv3t += avt * bv_wwww;
+			abv3b += avb * bv_wwww;
+
+			a += 8;
+			b += 4;
+		}
+
+		const v4sf alphav = v4sf_splat(*alpha);
+		abv0t *= alphav;
+		abv0b *= alphav;
+		abv1t *= alphav;
+		abv1b *= alphav;
+		abv2t *= alphav;
+		abv2b *= alphav;
+		abv3t *= alphav;
+		abv3b *= alphav;
+
+		if (rs_c == 1) {
+			v4sf cv0t = v4sf_load(&c[0*rs_c + 0*cs_c]);
+			v4sf cv1t = v4sf_load(&c[0*rs_c + 1*cs_c]); 
+			v4sf cv2t = v4sf_load(&c[0*rs_c + 2*cs_c]); 
+			v4sf cv3t = v4sf_load(&c[0*rs_c + 3*cs_c]); 
+			v4sf cv0b = v4sf_load(&c[4*rs_c + 0*cs_c]);
+			v4sf cv1b = v4sf_load(&c[4*rs_c + 1*cs_c]); 
+			v4sf cv2b = v4sf_load(&c[4*rs_c + 2*cs_c]); 
+			v4sf cv3b = v4sf_load(&c[4*rs_c + 3*cs_c]); 
+
+			const v4sf betav = v4sf_splat(*beta);
+			cv0t = cv0t * betav + abv0t;
+			cv1t = cv1t * betav + abv1t;
+			cv2t = cv2t * betav + abv2t;
+			cv3t = cv3t * betav + abv3t;
+			cv0b = cv0b * betav + abv0b;
+			cv1b = cv1b * betav + abv1b;
+			cv2b = cv2b * betav + abv2b;
+			cv3b = cv3b * betav + abv3b;
+
+			v4sf_store(&c[0*rs_c + 0*cs_c], cv0t);
+			v4sf_store(&c[0*rs_c + 1*cs_c], cv1t); 
+			v4sf_store(&c[0*rs_c + 2*cs_c], cv2t); 
+			v4sf_store(&c[0*rs_c + 3*cs_c], cv3t); 
+			v4sf_store(&c[4*rs_c + 0*cs_c], cv0b);
+			v4sf_store(&c[4*rs_c + 1*cs_c], cv1b); 
+			v4sf_store(&c[4*rs_c + 2*cs_c], cv2b); 
+			v4sf_store(&c[4*rs_c + 3*cs_c], cv3b); 
+		} else {
+			// Load columns 0, 1, 2, 3 (top part)
+			v4sf cv0t = (v4sf){ c[0*rs_c + 0*cs_c], c[1*rs_c + 0*cs_c], c[2*rs_c + 0*cs_c], c[3*rs_c + 0*cs_c] };
+			v4sf cv1t = (v4sf){ c[0*rs_c + 1*cs_c], c[1*rs_c + 1*cs_c], c[2*rs_c + 1*cs_c], c[3*rs_c + 1*cs_c] };
+			v4sf cv2t = (v4sf){ c[0*rs_c + 2*cs_c], c[1*rs_c + 2*cs_c], c[2*rs_c + 2*cs_c], c[3*rs_c + 2*cs_c] };
+			v4sf cv3t = (v4sf){ c[0*rs_c + 3*cs_c], c[1*rs_c + 3*cs_c], c[2*rs_c + 3*cs_c], c[3*rs_c + 3*cs_c] };
+			// Load columns 0, 1, 2, 3 (bottom part)
+			v4sf cv0b = (v4sf){ c[4*rs_c + 0*cs_c], c[5*rs_c + 0*cs_c], c[6*rs_c + 0*cs_c], c[7*rs_c + 0*cs_c] };
+			v4sf cv1b = (v4sf){ c[4*rs_c + 1*cs_c], c[5*rs_c + 1*cs_c], c[6*rs_c + 1*cs_c], c[7*rs_c + 1*cs_c] };
+			v4sf cv2b = (v4sf){ c[4*rs_c + 2*cs_c], c[5*rs_c + 2*cs_c], c[6*rs_c + 2*cs_c], c[7*rs_c + 2*cs_c] };
+			v4sf cv3b = (v4sf){ c[4*rs_c + 3*cs_c], c[5*rs_c + 3*cs_c], c[6*rs_c + 3*cs_c], c[7*rs_c + 3*cs_c] };
+
+			const v4sf betav = v4sf_splat(*beta);
+			cv0t = cv0t * betav + abv0t;
+			cv1t = cv1t * betav + abv1t;
+			cv2t = cv2t * betav + abv2t;
+			cv3t = cv3t * betav + abv3t;
+			cv0b = cv0b * betav + abv0b;
+			cv1b = cv1b * betav + abv1b;
+			cv2b = cv2b * betav + abv2b;
+			cv3b = cv3b * betav + abv3b;
+
+			// Store column 0
+			c[0*rs_c + 0*cs_c] = cv0t[0];
+			c[1*rs_c + 0*cs_c] = cv0t[1];
+			c[2*rs_c + 0*cs_c] = cv0t[2];
+			c[3*rs_c + 0*cs_c] = cv0t[3];
+			c[4*rs_c + 0*cs_c] = cv0b[0];
+			c[5*rs_c + 0*cs_c] = cv0b[1];
+			c[6*rs_c + 0*cs_c] = cv0b[2];
+			c[7*rs_c + 0*cs_c] = cv0b[3];
+
+			// Store column 1
+			c[0*rs_c + 1*cs_c] = cv1t[0];
+			c[1*rs_c + 1*cs_c] = cv1t[1];
+			c[2*rs_c + 1*cs_c] = cv1t[2];
+			c[3*rs_c + 1*cs_c] = cv1t[3];
+			c[4*rs_c + 1*cs_c] = cv1b[0];
+			c[5*rs_c + 1*cs_c] = cv1b[1];
+			c[6*rs_c + 1*cs_c] = cv1b[2];
+			c[7*rs_c + 1*cs_c] = cv1b[3];
+
+			// Store column 2
+			c[0*rs_c + 2*cs_c] = cv2t[0];
+			c[1*rs_c + 2*cs_c] = cv2t[1];
+			c[2*rs_c + 2*cs_c] = cv2t[2];
+			c[3*rs_c + 2*cs_c] = cv2t[3];
+			c[4*rs_c + 2*cs_c] = cv2b[0];
+			c[5*rs_c + 2*cs_c] = cv2b[1];
+			c[6*rs_c + 2*cs_c] = cv2b[2];
+			c[7*rs_c + 2*cs_c] = cv2b[3];
+
+			// Store column 3
+			c[0*rs_c + 3*cs_c] = cv3t[0];
+			c[1*rs_c + 3*cs_c] = cv3t[1];
+			c[2*rs_c + 3*cs_c] = cv3t[2];
+			c[3*rs_c + 3*cs_c] = cv3t[3];
+			c[4*rs_c + 3*cs_c] = cv3b[0];
+			c[5*rs_c + 3*cs_c] = cv3b[1];
+			c[6*rs_c + 3*cs_c] = cv3b[2];
+			c[7*rs_c + 3*cs_c] = cv3b[3];
+		}
+	}
+
+	void bli_cgemm_opt(
+		dim_t      k,
+		scomplex   alpha[restrict static 1],
+		scomplex   a[restrict static 4*k],
+		scomplex   b[restrict static k*4],
+		scomplex   beta[restrict static 1],
+		scomplex   c[restrict static 4*4],
+		inc_t      rs_c,
+		inc_t      cs_c,
+		auxinfo_t* data)
+	{
+		// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
+		v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero();
+		v4sf abv0i = v4sf_zero(), abv1i = v4sf_zero(), abv2i = v4sf_zero(), abv3i = v4sf_zero();
+		for (dim_t i = 0; i < k; i += 1) {
+			const v4sf avt = v4sf_cload(a);
+			const v4sf avb = v4sf_cload(a+2);
+			const v4sf avr = __builtin_shufflevector(avt, avb, 0, 2, 4, 6);
+			const v4sf avi = __builtin_shufflevector(avt, avb, 1, 3, 5, 7);
+
+			const v4sf bv0r = v4sf_splat(b[0].real);
+			const v4sf bv0i = v4sf_splat(b[0].imag);
+			abv0r += avr * bv0r - avi * bv0i;
+			abv0i += avr * bv0i + avi * bv0r;
+
+			const v4sf bv1r = v4sf_splat(b[1].real);
+			const v4sf bv1i = v4sf_splat(b[1].imag);
+			abv1r += avr * bv1r - avi * bv1i;
+			abv1i += avr * bv1i + avi * bv1r;
+
+			const v4sf bv2r = v4sf_splat(b[2].real);
+			const v4sf bv2i = v4sf_splat(b[2].imag);
+			abv2r += avr * bv2r - avi * bv2i;
+			abv2i += avr * bv2i + avi * bv2r;
+
+			const v4sf bv3r = v4sf_splat(b[3].real);
+			const v4sf bv3i = v4sf_splat(b[3].imag);
+			abv3r += avr * bv3r - avi * bv3i;
+			abv3i += avr * bv3i + avi * bv3r;
+
+			a += 4;
+			b += 4;
+		}
+
+		const v4sf alphavr = v4sf_splat(alpha->real);
+		const v4sf alphavi = v4sf_splat(alpha->imag);
+		v4sf temp;
+
+		temp  = abv0r * alphavr - abv0i * alphavi;
+		abv0i = abv0r * alphavi + abv0i * alphavr;
+		abv0r = temp;
+
+		temp  = abv1r * alphavr - abv1i * alphavi;
+		abv1i = abv1r * alphavi + abv1i * alphavr;
+		abv1r = temp;
+
+		temp  = abv2r * alphavr - abv2i * alphavi;
+		abv2i = abv2r * alphavi + abv2i * alphavr;
+		abv2r = temp;
+
+		temp  = abv3r * alphavr - abv3i * alphavi;
+		abv3i = abv3r * alphavi + abv3i * alphavr;
+		abv3r = temp;
+
+		if (rs_c == 1) {
+			const v4sf cv0t = v4sf_cload(&c[0*rs_c + 0*cs_c]);
+			const v4sf cv1t = v4sf_cload(&c[0*rs_c + 1*cs_c]);
+			const v4sf cv2t = v4sf_cload(&c[0*rs_c + 2*cs_c]);
+			const v4sf cv3t = v4sf_cload(&c[0*rs_c + 3*cs_c]);
+			const v4sf cv0b = v4sf_cload(&c[2*rs_c + 0*cs_c]);
+			const v4sf cv1b = v4sf_cload(&c[2*rs_c + 1*cs_c]);
+			const v4sf cv2b = v4sf_cload(&c[2*rs_c + 2*cs_c]);
+			const v4sf cv3b = v4sf_cload(&c[2*rs_c + 3*cs_c]);
+
+			v4sf cv0r = __builtin_shufflevector(cv0t, cv0b, 0, 2, 4, 6);
+			v4sf cv0i = __builtin_shufflevector(cv0t, cv0b, 1, 3, 5, 7);
+			v4sf cv1r = __builtin_shufflevector(cv1t, cv1b, 0, 2, 4, 6);
+			v4sf cv1i = __builtin_shufflevector(cv1t, cv1b, 1, 3, 5, 7);
+			v4sf cv2r = __builtin_shufflevector(cv2t, cv2b, 0, 2, 4, 6);
+			v4sf cv2i = __builtin_shufflevector(cv2t, cv2b, 1, 3, 5, 7);
+			v4sf cv3r = __builtin_shufflevector(cv3t, cv3b, 0, 2, 4, 6);
+			v4sf cv3i = __builtin_shufflevector(cv3t, cv3b, 1, 3, 5, 7);
+
+			const v4sf betavr = v4sf_splat(beta->real);
+			const v4sf betavi = v4sf_splat(beta->imag);
+			
+			temp = abv0r + cv0r * betavr - cv0i * betavi;
+			cv0i = abv0i + cv0r * betavi + cv0i * betavr;
+			cv0r = temp;
+
+			temp = abv1r + cv1r * betavr - cv1i * betavi;
+			cv1i = abv1i + cv1r * betavi + cv1i * betavr;
+			cv1r = temp;
+
+			temp = abv2r + cv2r * betavr - cv2i * betavi;
+			cv2i = abv2i + cv2r * betavi + cv2i * betavr;
+			cv2r = temp;
+
+			temp = abv3r + cv3r * betavr - cv3i * betavi;
+			cv3i = abv3i + cv3r * betavi + cv3i * betavr;
+			cv3r = temp;
+
+			v4sf_cstore(&c[0*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 0, 4, 1, 5));
+			v4sf_cstore(&c[2*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 2, 6, 3, 7));
+			v4sf_cstore(&c[0*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 0, 4, 1, 5));
+			v4sf_cstore(&c[2*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 2, 6, 3, 7));
+			v4sf_cstore(&c[0*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 0, 4, 1, 5));
+			v4sf_cstore(&c[2*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 2, 6, 3, 7));
+			v4sf_cstore(&c[0*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 0, 4, 1, 5));
+			v4sf_cstore(&c[2*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 2, 6, 3, 7));
+		} else {
+			// Load columns 0, 1, 2, 3 (real part)
+			v4sf cv0r = (v4sf){ c[0*rs_c + 0*cs_c].real, c[1*rs_c + 0*cs_c].real, c[2*rs_c + 0*cs_c].real, c[3*rs_c + 0*cs_c].real };
+			v4sf cv1r = (v4sf){ c[0*rs_c + 1*cs_c].real, c[1*rs_c + 1*cs_c].real, c[2*rs_c + 1*cs_c].real, c[3*rs_c + 1*cs_c].real };
+			v4sf cv2r = (v4sf){ c[0*rs_c + 2*cs_c].real, c[1*rs_c + 2*cs_c].real, c[2*rs_c + 2*cs_c].real, c[3*rs_c + 2*cs_c].real };
+			v4sf cv3r = (v4sf){ c[0*rs_c + 3*cs_c].real, c[1*rs_c + 3*cs_c].real, c[2*rs_c + 3*cs_c].real, c[3*rs_c + 3*cs_c].real };
+			// Load columns 0, 1, 2, 3 (imaginary part)
+			v4sf cv0i = (v4sf){ c[0*rs_c + 0*cs_c].imag, c[1*rs_c + 0*cs_c].imag, c[2*rs_c + 0*cs_c].imag, c[3*rs_c + 0*cs_c].imag };
+			v4sf cv1i = (v4sf){ c[0*rs_c + 1*cs_c].imag, c[1*rs_c + 1*cs_c].imag, c[2*rs_c + 1*cs_c].imag, c[3*rs_c + 1*cs_c].imag };
+			v4sf cv2i = (v4sf){ c[0*rs_c + 2*cs_c].imag, c[1*rs_c + 2*cs_c].imag, c[2*rs_c + 2*cs_c].imag, c[3*rs_c + 2*cs_c].imag };
+			v4sf cv3i = (v4sf){ c[0*rs_c + 3*cs_c].imag, c[1*rs_c + 3*cs_c].imag, c[2*rs_c + 3*cs_c].imag, c[3*rs_c + 3*cs_c].imag };
+
+			const v4sf betavr = v4sf_splat(beta->real);
+			const v4sf betavi = v4sf_splat(beta->imag);
+
+			temp = abv0r + cv0r * betavr - cv0i * betavi;
+			cv0i = abv0i + cv0r * betavi + cv0i * betavr;
+			cv0r = temp;
+
+			temp = abv1r + cv1r * betavr - cv1i * betavi;
+			cv1i = abv1i + cv1r * betavi + cv1i * betavr;
+			cv1r = temp;
+
+			temp = abv2r + cv2r * betavr - cv2i * betavi;
+			cv2i = abv2i + cv2r * betavi + cv2i * betavr;
+			cv2r = temp;
+
+			temp = abv3r + cv3r * betavr - cv3i * betavi;
+			cv3i = abv3i + cv3r * betavi + cv3i * betavr;
+			cv3r = temp;
+
+			// Store column 0
+			c[0*rs_c + 0*cs_c].real = cv0r[0];
+			c[0*rs_c + 0*cs_c].imag = cv0i[0];
+			c[1*rs_c + 0*cs_c].real = cv0r[1];
+			c[1*rs_c + 0*cs_c].imag = cv0i[1];
+			c[2*rs_c + 0*cs_c].real = cv0r[2];
+			c[2*rs_c + 0*cs_c].imag = cv0i[2];
+			c[3*rs_c + 0*cs_c].real = cv0r[3];
+			c[3*rs_c + 0*cs_c].imag = cv0i[3];
+
+			// Store column 1
+			c[0*rs_c + 1*cs_c].real = cv1r[0];
+			c[0*rs_c + 1*cs_c].imag = cv1i[0];
+			c[1*rs_c + 1*cs_c].real = cv1r[1];
+			c[1*rs_c + 1*cs_c].imag = cv1i[1];
+			c[2*rs_c + 1*cs_c].real = cv1r[2];
+			c[2*rs_c + 1*cs_c].imag = cv1i[2];
+			c[3*rs_c + 1*cs_c].real = cv1r[3];
+			c[3*rs_c + 1*cs_c].imag = cv1i[3];
+
+			// Store column 2
+			c[0*rs_c + 2*cs_c].real = cv2r[0];
+			c[0*rs_c + 2*cs_c].imag = cv2i[0];
+			c[1*rs_c + 2*cs_c].real = cv2r[1];
+			c[1*rs_c + 2*cs_c].imag = cv2i[1];
+			c[2*rs_c + 2*cs_c].real = cv2r[2];
+			c[2*rs_c + 2*cs_c].imag = cv2i[2];
+			c[3*rs_c + 2*cs_c].real = cv2r[3];
+			c[3*rs_c + 2*cs_c].imag = cv2i[3];
+
+			// Store column 3
+			c[0*rs_c + 3*cs_c].real = cv3r[0];
+			c[0*rs_c + 3*cs_c].imag = cv3i[0];
+			c[1*rs_c + 3*cs_c].real = cv3r[1];
+			c[1*rs_c + 3*cs_c].imag = cv3i[1];
+			c[2*rs_c + 3*cs_c].real = cv3r[2];
+			c[2*rs_c + 3*cs_c].imag = cv3i[2];
+			c[3*rs_c + 3*cs_c].real = cv3r[3];
+			c[3*rs_c + 3*cs_c].imag = cv3i[3];
+		}
+	}
 #endif
-
-#if PPAPI_RELEASE >= 36
-void bli_sgemm_opt_8x4(
-	dim_t      k,
-	float      alpha[restrict static 1],
-	float      a[restrict static 8*k],
-	float      b[restrict static k*4],
-	float      beta[restrict static 1],
-	float      c[restrict static 8*4],
-	inc_t      rs_c,
-	inc_t      cs_c,
-	auxinfo_t* data)
-{
-	// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
-	v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero();
-	v4sf abv0b = v4sf_zero(), abv1b = v4sf_zero(), abv2b = v4sf_zero(), abv3b = v4sf_zero();
-	for (dim_t i = 0; i < k; i += 1) {
-		const v4sf avt = v4sf_load(a);
-		const v4sf avb = v4sf_load(a+4);
-
-		const v4sf bv_xxxx = v4sf_splat(b[0]);
-		abv0t += avt * bv_xxxx;
-		abv0b += avb * bv_xxxx;
-
-		const v4sf bv_yyyy = v4sf_splat(b[1]);
-		abv1t += avt * bv_yyyy;
-		abv1b += avb * bv_yyyy;
-
-		const v4sf bv_zzzz = v4sf_splat(b[2]);
-		abv2t += avt * bv_zzzz;
-		abv2b += avb * bv_zzzz;
-
-		const v4sf bv_wwww = v4sf_splat(b[3]);
-		abv3t += avt * bv_wwww;
-		abv3b += avb * bv_wwww;
-
-		a += 8;
-		b += 4;
-	}
-
-	const v4sf alphav = v4sf_splat(*alpha);
-	abv0t *= alphav;
-	abv0b *= alphav;
-	abv1t *= alphav;
-	abv1b *= alphav;
-	abv2t *= alphav;
-	abv2b *= alphav;
-	abv3t *= alphav;
-	abv3b *= alphav;
-
-	if (rs_c == 1) {
-		v4sf cv0t = v4sf_load(&c[0*rs_c + 0*cs_c]);
-		v4sf cv1t = v4sf_load(&c[0*rs_c + 1*cs_c]); 
-		v4sf cv2t = v4sf_load(&c[0*rs_c + 2*cs_c]); 
-		v4sf cv3t = v4sf_load(&c[0*rs_c + 3*cs_c]); 
-		v4sf cv0b = v4sf_load(&c[4*rs_c + 0*cs_c]);
-		v4sf cv1b = v4sf_load(&c[4*rs_c + 1*cs_c]); 
-		v4sf cv2b = v4sf_load(&c[4*rs_c + 2*cs_c]); 
-		v4sf cv3b = v4sf_load(&c[4*rs_c + 3*cs_c]); 
-
-		const v4sf betav = v4sf_splat(*beta);
-		cv0t = cv0t * betav + abv0t;
-		cv1t = cv1t * betav + abv1t;
-		cv2t = cv2t * betav + abv2t;
-		cv3t = cv3t * betav + abv3t;
-		cv0b = cv0b * betav + abv0b;
-		cv1b = cv1b * betav + abv1b;
-		cv2b = cv2b * betav + abv2b;
-		cv3b = cv3b * betav + abv3b;
-
-		v4sf_store(&c[0*rs_c + 0*cs_c], cv0t);
-		v4sf_store(&c[0*rs_c + 1*cs_c], cv1t); 
-		v4sf_store(&c[0*rs_c + 2*cs_c], cv2t); 
-		v4sf_store(&c[0*rs_c + 3*cs_c], cv3t); 
-		v4sf_store(&c[4*rs_c + 0*cs_c], cv0b);
-		v4sf_store(&c[4*rs_c + 1*cs_c], cv1b); 
-		v4sf_store(&c[4*rs_c + 2*cs_c], cv2b); 
-		v4sf_store(&c[4*rs_c + 3*cs_c], cv3b); 
-	} else {
-		// Load columns 0, 1, 2, 3 (top part)
-		v4sf cv0t = (v4sf){ c[0*rs_c + 0*cs_c], c[1*rs_c + 0*cs_c], c[2*rs_c + 0*cs_c], c[3*rs_c + 0*cs_c] };
-		v4sf cv1t = (v4sf){ c[0*rs_c + 1*cs_c], c[1*rs_c + 1*cs_c], c[2*rs_c + 1*cs_c], c[3*rs_c + 1*cs_c] };
-		v4sf cv2t = (v4sf){ c[0*rs_c + 2*cs_c], c[1*rs_c + 2*cs_c], c[2*rs_c + 2*cs_c], c[3*rs_c + 2*cs_c] };
-		v4sf cv3t = (v4sf){ c[0*rs_c + 3*cs_c], c[1*rs_c + 3*cs_c], c[2*rs_c + 3*cs_c], c[3*rs_c + 3*cs_c] };
-		// Load columns 0, 1, 2, 3 (bottom part)
-		v4sf cv0b = (v4sf){ c[4*rs_c + 0*cs_c], c[5*rs_c + 0*cs_c], c[6*rs_c + 0*cs_c], c[7*rs_c + 0*cs_c] };
-		v4sf cv1b = (v4sf){ c[4*rs_c + 1*cs_c], c[5*rs_c + 1*cs_c], c[6*rs_c + 1*cs_c], c[7*rs_c + 1*cs_c] };
-		v4sf cv2b = (v4sf){ c[4*rs_c + 2*cs_c], c[5*rs_c + 2*cs_c], c[6*rs_c + 2*cs_c], c[7*rs_c + 2*cs_c] };
-		v4sf cv3b = (v4sf){ c[4*rs_c + 3*cs_c], c[5*rs_c + 3*cs_c], c[6*rs_c + 3*cs_c], c[7*rs_c + 3*cs_c] };
-
-		const v4sf betav = v4sf_splat(*beta);
-		cv0t = cv0t * betav + abv0t;
-		cv1t = cv1t * betav + abv1t;
-		cv2t = cv2t * betav + abv2t;
-		cv3t = cv3t * betav + abv3t;
-		cv0b = cv0b * betav + abv0b;
-		cv1b = cv1b * betav + abv1b;
-		cv2b = cv2b * betav + abv2b;
-		cv3b = cv3b * betav + abv3b;
-
-		// Store column 0
-		c[0*rs_c + 0*cs_c] = cv0t[0];
-		c[1*rs_c + 0*cs_c] = cv0t[1];
-		c[2*rs_c + 0*cs_c] = cv0t[2];
-		c[3*rs_c + 0*cs_c] = cv0t[3];
-		c[4*rs_c + 0*cs_c] = cv0b[0];
-		c[5*rs_c + 0*cs_c] = cv0b[1];
-		c[6*rs_c + 0*cs_c] = cv0b[2];
-		c[7*rs_c + 0*cs_c] = cv0b[3];
-
-		// Store column 1
-		c[0*rs_c + 1*cs_c] = cv1t[0];
-		c[1*rs_c + 1*cs_c] = cv1t[1];
-		c[2*rs_c + 1*cs_c] = cv1t[2];
-		c[3*rs_c + 1*cs_c] = cv1t[3];
-		c[4*rs_c + 1*cs_c] = cv1b[0];
-		c[5*rs_c + 1*cs_c] = cv1b[1];
-		c[6*rs_c + 1*cs_c] = cv1b[2];
-		c[7*rs_c + 1*cs_c] = cv1b[3];
-
-		// Store column 2
-		c[0*rs_c + 2*cs_c] = cv2t[0];
-		c[1*rs_c + 2*cs_c] = cv2t[1];
-		c[2*rs_c + 2*cs_c] = cv2t[2];
-		c[3*rs_c + 2*cs_c] = cv2t[3];
-		c[4*rs_c + 2*cs_c] = cv2b[0];
-		c[5*rs_c + 2*cs_c] = cv2b[1];
-		c[6*rs_c + 2*cs_c] = cv2b[2];
-		c[7*rs_c + 2*cs_c] = cv2b[3];
-
-		// Store column 3
-		c[0*rs_c + 3*cs_c] = cv3t[0];
-		c[1*rs_c + 3*cs_c] = cv3t[1];
-		c[2*rs_c + 3*cs_c] = cv3t[2];
-		c[3*rs_c + 3*cs_c] = cv3t[3];
-		c[4*rs_c + 3*cs_c] = cv3b[0];
-		c[5*rs_c + 3*cs_c] = cv3b[1];
-		c[6*rs_c + 3*cs_c] = cv3b[2];
-		c[7*rs_c + 3*cs_c] = cv3b[3];
-	}
-}
-#else
-void bli_sgemm_opt_4x4(
-	dim_t      k,
-	float      alpha[restrict static 1],
-	float      a[restrict static 4*k],
-	float      b[restrict static k*4],
-	float      beta[restrict static 1],
-	float      c[restrict static 4*4],
-	inc_t      rs_c,
-	inc_t      cs_c,
-	auxinfo_t* data)
-{
-	/* Just call the reference implementation. */
-	BLIS_SGEMM_UKERNEL_REF(
-		k,
-		alpha,
-		a,
-		b,
-		beta,
-		c,
-		rs_c,
-		cs_c,
-		data);
-}
-#endif
-
-void bli_dgemm_opt_4x4(
-	dim_t      k,
-	double     alpha[restrict static 1],
-	double     a[restrict static 4*k],
-	double     b[restrict static k*4],
-	double     beta[restrict static 1],
-	double     c[restrict static 4*4],
-	inc_t      rs_c,
-	inc_t      cs_c,
-	auxinfo_t* data)
-{
-	/* Just call the reference implementation. */
-	BLIS_DGEMM_UKERNEL_REF(
-		k,
-		alpha,
-		a,
-		b,
-		beta,
-		c,
-		rs_c,
-		cs_c,
-		data);
-}
-
-
-#if PPAPI_RELEASE >= 36
-void bli_cgemm_opt_4x4(
-	dim_t      k,
-	scomplex   alpha[restrict static 1],
-	scomplex   a[restrict static 4*k],
-	scomplex   b[restrict static k*4],
-	scomplex   beta[restrict static 1],
-	scomplex   c[restrict static 4*4],
-	inc_t      rs_c,
-	inc_t      cs_c,
-	auxinfo_t* data)
-{
-	// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
-	v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero();
-	v4sf abv0i = v4sf_zero(), abv1i = v4sf_zero(), abv2i = v4sf_zero(), abv3i = v4sf_zero();
-	for (dim_t i = 0; i < k; i += 1) {
-		const v4sf avt = v4sf_cload(a);
-		const v4sf avb = v4sf_cload(a+2);
-		const v4sf avr = __builtin_shufflevector(avt, avb, 0, 2, 4, 6);
-		const v4sf avi = __builtin_shufflevector(avt, avb, 1, 3, 5, 7);
-
-		const v4sf bv0r = v4sf_splat(b[0].real);
-		const v4sf bv0i = v4sf_splat(b[0].imag);
-		abv0r += avr * bv0r - avi * bv0i;
-		abv0i += avr * bv0i + avi * bv0r;
-
-		const v4sf bv1r = v4sf_splat(b[1].real);
-		const v4sf bv1i = v4sf_splat(b[1].imag);
-		abv1r += avr * bv1r - avi * bv1i;
-		abv1i += avr * bv1i + avi * bv1r;
-
-		const v4sf bv2r = v4sf_splat(b[2].real);
-		const v4sf bv2i = v4sf_splat(b[2].imag);
-		abv2r += avr * bv2r - avi * bv2i;
-		abv2i += avr * bv2i + avi * bv2r;
-
-		const v4sf bv3r = v4sf_splat(b[3].real);
-		const v4sf bv3i = v4sf_splat(b[3].imag);
-		abv3r += avr * bv3r - avi * bv3i;
-		abv3i += avr * bv3i + avi * bv3r;
-
-		a += 4;
-		b += 4;
-	}
-
-	const v4sf alphavr = v4sf_splat(alpha->real);
-	const v4sf alphavi = v4sf_splat(alpha->imag);
-	v4sf temp;
-
-	temp  = abv0r * alphavr - abv0i * alphavi;
-	abv0i = abv0r * alphavi + abv0i * alphavr;
-	abv0r = temp;
-
-	temp  = abv1r * alphavr - abv1i * alphavi;
-	abv1i = abv1r * alphavi + abv1i * alphavr;
-	abv1r = temp;
-
-	temp  = abv2r * alphavr - abv2i * alphavi;
-	abv2i = abv2r * alphavi + abv2i * alphavr;
-	abv2r = temp;
-
-	temp  = abv3r * alphavr - abv3i * alphavi;
-	abv3i = abv3r * alphavi + abv3i * alphavr;
-	abv3r = temp;
-
-	if (rs_c == 1) {
-		const v4sf cv0t = v4sf_cload(&c[0*rs_c + 0*cs_c]);
-		const v4sf cv1t = v4sf_cload(&c[0*rs_c + 1*cs_c]);
-		const v4sf cv2t = v4sf_cload(&c[0*rs_c + 2*cs_c]);
-		const v4sf cv3t = v4sf_cload(&c[0*rs_c + 3*cs_c]);
-		const v4sf cv0b = v4sf_cload(&c[2*rs_c + 0*cs_c]);
-		const v4sf cv1b = v4sf_cload(&c[2*rs_c + 1*cs_c]);
-		const v4sf cv2b = v4sf_cload(&c[2*rs_c + 2*cs_c]);
-		const v4sf cv3b = v4sf_cload(&c[2*rs_c + 3*cs_c]);
-
-		v4sf cv0r = __builtin_shufflevector(cv0t, cv0b, 0, 2, 4, 6);
-		v4sf cv0i = __builtin_shufflevector(cv0t, cv0b, 1, 3, 5, 7);
-		v4sf cv1r = __builtin_shufflevector(cv1t, cv1b, 0, 2, 4, 6);
-		v4sf cv1i = __builtin_shufflevector(cv1t, cv1b, 1, 3, 5, 7);
-		v4sf cv2r = __builtin_shufflevector(cv2t, cv2b, 0, 2, 4, 6);
-		v4sf cv2i = __builtin_shufflevector(cv2t, cv2b, 1, 3, 5, 7);
-		v4sf cv3r = __builtin_shufflevector(cv3t, cv3b, 0, 2, 4, 6);
-		v4sf cv3i = __builtin_shufflevector(cv3t, cv3b, 1, 3, 5, 7);
-
-		const v4sf betavr = v4sf_splat(beta->real);
-		const v4sf betavi = v4sf_splat(beta->imag);
-		
-		temp = abv0r + cv0r * betavr - cv0i * betavi;
-		cv0i = abv0i + cv0r * betavi + cv0i * betavr;
-		cv0r = temp;
-
-		temp = abv1r + cv1r * betavr - cv1i * betavi;
-		cv1i = abv1i + cv1r * betavi + cv1i * betavr;
-		cv1r = temp;
-
-		temp = abv2r + cv2r * betavr - cv2i * betavi;
-		cv2i = abv2i + cv2r * betavi + cv2i * betavr;
-		cv2r = temp;
-
-		temp = abv3r + cv3r * betavr - cv3i * betavi;
-		cv3i = abv3i + cv3r * betavi + cv3i * betavr;
-		cv3r = temp;
-
-		v4sf_cstore(&c[0*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 0, 4, 1, 5));
-		v4sf_cstore(&c[2*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 2, 6, 3, 7));
-		v4sf_cstore(&c[0*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 0, 4, 1, 5));
-		v4sf_cstore(&c[2*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 2, 6, 3, 7));
-		v4sf_cstore(&c[0*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 0, 4, 1, 5));
-		v4sf_cstore(&c[2*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 2, 6, 3, 7));
-		v4sf_cstore(&c[0*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 0, 4, 1, 5));
-		v4sf_cstore(&c[2*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 2, 6, 3, 7));
-	} else {
-		// Load columns 0, 1, 2, 3 (real part)
-		v4sf cv0r = (v4sf){ c[0*rs_c + 0*cs_c].real, c[1*rs_c + 0*cs_c].real, c[2*rs_c + 0*cs_c].real, c[3*rs_c + 0*cs_c].real };
-		v4sf cv1r = (v4sf){ c[0*rs_c + 1*cs_c].real, c[1*rs_c + 1*cs_c].real, c[2*rs_c + 1*cs_c].real, c[3*rs_c + 1*cs_c].real };
-		v4sf cv2r = (v4sf){ c[0*rs_c + 2*cs_c].real, c[1*rs_c + 2*cs_c].real, c[2*rs_c + 2*cs_c].real, c[3*rs_c + 2*cs_c].real };
-		v4sf cv3r = (v4sf){ c[0*rs_c + 3*cs_c].real, c[1*rs_c + 3*cs_c].real, c[2*rs_c + 3*cs_c].real, c[3*rs_c + 3*cs_c].real };
-		// Load columns 0, 1, 2, 3 (imaginary part)
-		v4sf cv0i = (v4sf){ c[0*rs_c + 0*cs_c].imag, c[1*rs_c + 0*cs_c].imag, c[2*rs_c + 0*cs_c].imag, c[3*rs_c + 0*cs_c].imag };
-		v4sf cv1i = (v4sf){ c[0*rs_c + 1*cs_c].imag, c[1*rs_c + 1*cs_c].imag, c[2*rs_c + 1*cs_c].imag, c[3*rs_c + 1*cs_c].imag };
-		v4sf cv2i = (v4sf){ c[0*rs_c + 2*cs_c].imag, c[1*rs_c + 2*cs_c].imag, c[2*rs_c + 2*cs_c].imag, c[3*rs_c + 2*cs_c].imag };
-		v4sf cv3i = (v4sf){ c[0*rs_c + 3*cs_c].imag, c[1*rs_c + 3*cs_c].imag, c[2*rs_c + 3*cs_c].imag, c[3*rs_c + 3*cs_c].imag };
-
-		const v4sf betavr = v4sf_splat(beta->real);
-		const v4sf betavi = v4sf_splat(beta->imag);
-
-		temp = abv0r + cv0r * betavr - cv0i * betavi;
-		cv0i = abv0i + cv0r * betavi + cv0i * betavr;
-		cv0r = temp;
-
-		temp = abv1r + cv1r * betavr - cv1i * betavi;
-		cv1i = abv1i + cv1r * betavi + cv1i * betavr;
-		cv1r = temp;
-
-		temp = abv2r + cv2r * betavr - cv2i * betavi;
-		cv2i = abv2i + cv2r * betavi + cv2i * betavr;
-		cv2r = temp;
-
-		temp = abv3r + cv3r * betavr - cv3i * betavi;
-		cv3i = abv3i + cv3r * betavi + cv3i * betavr;
-		cv3r = temp;
-
-		// Store column 0
-		c[0*rs_c + 0*cs_c].real = cv0r[0];
-		c[0*rs_c + 0*cs_c].imag = cv0i[0];
-		c[1*rs_c + 0*cs_c].real = cv0r[1];
-		c[1*rs_c + 0*cs_c].imag = cv0i[1];
-		c[2*rs_c + 0*cs_c].real = cv0r[2];
-		c[2*rs_c + 0*cs_c].imag = cv0i[2];
-		c[3*rs_c + 0*cs_c].real = cv0r[3];
-		c[3*rs_c + 0*cs_c].imag = cv0i[3];
-
-		// Store column 1
-		c[0*rs_c + 1*cs_c].real = cv1r[0];
-		c[0*rs_c + 1*cs_c].imag = cv1i[0];
-		c[1*rs_c + 1*cs_c].real = cv1r[1];
-		c[1*rs_c + 1*cs_c].imag = cv1i[1];
-		c[2*rs_c + 1*cs_c].real = cv1r[2];
-		c[2*rs_c + 1*cs_c].imag = cv1i[2];
-		c[3*rs_c + 1*cs_c].real = cv1r[3];
-		c[3*rs_c + 1*cs_c].imag = cv1i[3];
-
-		// Store column 2
-		c[0*rs_c + 2*cs_c].real = cv2r[0];
-		c[0*rs_c + 2*cs_c].imag = cv2i[0];
-		c[1*rs_c + 2*cs_c].real = cv2r[1];
-		c[1*rs_c + 2*cs_c].imag = cv2i[1];
-		c[2*rs_c + 2*cs_c].real = cv2r[2];
-		c[2*rs_c + 2*cs_c].imag = cv2i[2];
-		c[3*rs_c + 2*cs_c].real = cv2r[3];
-		c[3*rs_c + 2*cs_c].imag = cv2i[3];
-
-		// Store column 3
-		c[0*rs_c + 3*cs_c].real = cv3r[0];
-		c[0*rs_c + 3*cs_c].imag = cv3i[0];
-		c[1*rs_c + 3*cs_c].real = cv3r[1];
-		c[1*rs_c + 3*cs_c].imag = cv3i[1];
-		c[2*rs_c + 3*cs_c].real = cv3r[2];
-		c[2*rs_c + 3*cs_c].imag = cv3i[2];
-		c[3*rs_c + 3*cs_c].real = cv3r[3];
-		c[3*rs_c + 3*cs_c].imag = cv3i[3];
-	}
-}
-#else
-void bli_cgemm_opt_4x4(
-	dim_t      k,
-	scomplex   alpha[restrict static 1],
-	scomplex   a[restrict static 4*k],
-	scomplex   b[restrict static k*4],
-	scomplex   beta[restrict static 1],
-	scomplex   c[restrict static 4*4],
-	inc_t      rs_c,
-	inc_t      cs_c,
-	auxinfo_t* data)
-{
-	/* Just call the reference implementation. */
-	BLIS_CGEMM_UKERNEL_REF(
-		k,
-		alpha,
-		a,
-		b,
-		beta,
-		c,
-		rs_c,
-		cs_c,
-		data);
-}
-#endif
-
-void bli_zgemm_opt_4x4(
-	dim_t      k,
-	dcomplex   alpha[restrict static 1],
-	dcomplex   a[restrict static 4*k],
-	dcomplex   b[restrict static k*4],
-	dcomplex   beta[restrict static 1],
-	dcomplex   c[restrict static 4*4],
-	inc_t      rs_c,
-	inc_t      cs_c,
-	auxinfo_t* data)
-{
-	/* Just call the reference implementation. */
-	BLIS_ZGEMM_UKERNEL_REF(
-		k,
-		alpha,
-		a,
-		b,
-		beta,
-		c,
-		rs_c,
-		cs_c,
-		data);
-}
-

From b693b0cddcfb41450e3c09a3ab97acb44c1ccdec Mon Sep 17 00:00:00 2001
From: Marat Dukhan <maratek@gmail.com>
Date: Sun, 22 Jun 2014 13:44:25 -0700
Subject: [PATCH 11/19] [SC]AXPY kernels for PNaCl

---
 config/pnacl/bli_kernel.h            |   4 +
 kernels/nacl/pnacl/1/bli_axpyv_opt.c | 203 +++++++++++++++++++++++++++
 2 files changed, 207 insertions(+)
 create mode 100644 kernels/nacl/pnacl/1/bli_axpyv_opt.c

diff --git a/config/pnacl/bli_kernel.h b/config/pnacl/bli_kernel.h
index d3ad1e721..439d9d715 100644
--- a/config/pnacl/bli_kernel.h
+++ b/config/pnacl/bli_kernel.h
@@ -218,6 +218,10 @@
 // -- addv --
 
 // -- axpyv --
+#if PPAPI_RELEASE >= 36
+#define BLIS_SAXPYV_KERNEL         bli_saxpyv_opt
+#define BLIS_CAXPYV_KERNEL         bli_caxpyv_opt
+#endif
 
 // -- copyv --
 
diff --git a/kernels/nacl/pnacl/1/bli_axpyv_opt.c b/kernels/nacl/pnacl/1/bli_axpyv_opt.c
new file mode 100644
index 000000000..c26a34b98
--- /dev/null
+++ b/kernels/nacl/pnacl/1/bli_axpyv_opt.c
@@ -0,0 +1,203 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#if PPAPI_RELEASE >= 36
+typedef float v4sf __attribute__ ((vector_size(16)));
+
+inline v4sf v4sf_splat(float x) {
+        return (v4sf) { x, x, x, x };
+}
+
+inline v4sf v4sf_load(const float* a) {
+        return *((const v4sf*)a);
+}
+
+inline v4sf v4sf_cload(const scomplex* a) {
+        return *((const v4sf*)a);
+}
+
+inline void v4sf_store(float* a, v4sf x) {
+        *((v4sf*)a) = x;
+}
+
+inline void v4sf_cstore(scomplex* a, v4sf x) {
+        *((v4sf*)a) = x;
+}
+
+inline v4sf v4sf_zero() {
+        return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
+}
+#endif
+
+
+void bli_saxpyv_opt(
+	conj_t conjx,
+	dim_t  n,
+	float  alpha[restrict static 1],
+	float  x[restrict static n],
+	inc_t  incx,
+	float  y[restrict static n],
+	inc_t  incy)
+{
+	if (bli_zero_dim1(n)) {
+		return;
+	}
+
+	if (bli_seq0(*alpha)) {
+		return;
+	}
+
+#if PPAPI_RELEASE >= 36
+	if (!bli_has_nonunit_inc2(incx, incy)) {
+		const v4sf alphav = v4sf_splat(*alpha);
+		while (n >= 4) {
+			const v4sf xv = v4sf_load(x);
+			v4sf yv = v4sf_load(y);
+			yv += xv * alphav;
+			v4sf_store(y, yv);
+
+			x += 4;
+			y += 4;
+			n -= 4;
+		}
+		const float alphac = *alpha;
+		while (n--) {
+			(*y++) += (*x++) * alphac;
+		}
+	}
+#endif
+	/* Just call the reference implementation. */
+	BLIS_SAXPYV_KERNEL_REF(
+		conjx,
+		n,
+		alpha,
+		x,
+		incx,
+		y,
+		incy);
+}
+
+
+void bli_caxpyv_opt(
+	conj_t   conjx,
+	dim_t    n,
+	scomplex alpha[restrict static 1],
+	scomplex x[restrict static n],
+	inc_t    incx,
+	scomplex y[restrict static n],
+	inc_t    incy)
+{
+	if (bli_zero_dim1(n)) {
+		return;
+	}
+
+	if (bli_ceq0(*alpha)) {
+		return;
+	}
+
+#if PPAPI_RELEASE >= 36
+	if (!bli_has_nonunit_inc2(incx, incy)) {
+		if (bli_is_noconj(conjx)) {
+			const v4sf alphav0 = v4sf_splat(alpha->real);
+			const v4sf alphav1 = (v4sf) { -alpha->imag, alpha->imag, -alpha->imag, alpha->imag };
+			while (n >= 2) {
+				const v4sf xv0 = v4sf_cload(x);
+				v4sf yv = v4sf_cload(y);
+				const v4sf xv1 = __builtin_shufflevector(xv0, xv0, 1, 0, 3, 2);
+				yv += xv0 * alphav0 + xv1 * alphav1;
+				v4sf_cstore(y, yv);
+
+				x += 2;
+				y += 2;
+				n -= 2;
+			}
+			const float alphar = alpha->real;
+			const float alphai = alpha->imag;
+			while (n--) {
+				const float xr = x->real;
+				const float xi = x->imag;
+				const float yr = y->real;
+				const float yi = y->imag;
+
+				y->real = yr + xr * alphar - xi * alphai;
+				y->imag = yi + xr * alphai + xi * alphar; 
+
+				x += 1;
+				y += 1;
+			}
+		} else {
+			const v4sf alphav0 = (v4sf) { alpha->real, -alpha->real, alpha->real, -alpha->real };
+			const v4sf alphav1 = v4sf_splat(alpha->imag);
+			while (n >= 2) {
+				const v4sf xv0 = v4sf_cload(x);
+				v4sf yv = v4sf_cload(y);
+				const v4sf xv1 = __builtin_shufflevector(xv0, xv0, 1, 0, 3, 2);
+				yv += xv0 * alphav0 + xv1 * alphav1;
+				v4sf_cstore(y, yv);
+
+				x += 2;
+				y += 2;
+				n -= 2;
+			}
+			const float alphar = alpha->real;
+			const float alphai = alpha->imag;
+			while (n--) {
+				const float xr = x->real;
+				const float xi = x->imag;
+				const float yr = y->real;
+				const float yi = y->imag;
+
+				y->real = yr + xr * alphar + xi * alphai;
+				y->imag = yi + xr * alphai - xi * alphar; 
+
+				x += 1;
+				y += 1;
+			}
+		}
+	}
+#endif
+
+
+	/* Just call the reference implementation. */
+	BLIS_CAXPYV_KERNEL_REF(
+		conjx,
+		n,
+		alpha,
+		x,
+		incx,
+		y,
+		incy);
+}

From 2cb9a5501a3cbeb6692cf68e896087ba73b6af69 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 23 Jun 2014 10:42:29 -0500
Subject: [PATCH 12/19] Removed 'version' from .gitignore file.

---
 .gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 8accdfa50..69e5f00be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,7 +25,6 @@
 # -- build system files --
 
 config.mk
-version
 
 # -- makefile fragments --
 

From ebb33965981dcb2b0bdee5fc7fdf6c959420f311 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 23 Jun 2014 11:22:50 -0500
Subject: [PATCH 13/19] Added 'version' file.

---
 version | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 version

diff --git a/version b/version
new file mode 100644
index 000000000..b1e80bb24
--- /dev/null
+++ b/version
@@ -0,0 +1 @@
+0.1.3

From 09d9a3bf6763932d9f571085b2cfd1b8631eccba Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 23 Jun 2014 13:43:26 -0500
Subject: [PATCH 14/19] Reverting version file to test new version script.

Details:
- Changed version file contents to 0.1.2 so that I can test out a new
  version file bumping script.
---
 version | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version b/version
index b1e80bb24..d917d3e26 100644
--- a/version
+++ b/version
@@ -1 +1 @@
-0.1.3
+0.1.2

From 036cc634918463b1caa0fd89c9a211f2f5639af7 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 23 Jun 2014 13:48:17 -0500
Subject: [PATCH 15/19] Version file update (0.1.3)

---
 version | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version b/version
index d917d3e26..b1e80bb24 100644
--- a/version
+++ b/version
@@ -1 +1 @@
-0.1.2
+0.1.3

From 9ef1f1e21d083697fc730e48d7d9169c201f3da2 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 23 Jun 2014 13:48:17 -0500
Subject: [PATCH 16/19] CHANGELOG update (0.1.3)

---
 CHANGELOG | 135 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 134 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG b/CHANGELOG
index eb33b31ef..9c18f47ae 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,137 @@
-commit 00f232f8ed1f7c41619b12ebf779ebe2c3b2d3cd (HEAD, tag: 0.1.2, origin/master, master)
+commit 036cc634918463b1caa0fd89c9a211f2f5639af7 (HEAD, tag: 0.1.3, master)
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Jun 23 13:48:17 2014 -0500
+
+    Version file update (0.1.3)
+
+commit 09d9a3bf6763932d9f571085b2cfd1b8631eccba
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Jun 23 13:43:26 2014 -0500
+
+    Reverting version file to test new version script.
+    
+    Details:
+    - Changed version file contents to 0.1.2 so that I can test out a new
+      version file bumping script.
+
+commit ebb33965981dcb2b0bdee5fc7fdf6c959420f311
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Jun 23 11:22:50 2014 -0500
+
+    Added 'version' file.
+
+commit 2cb9a5501a3cbeb6692cf68e896087ba73b6af69
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Jun 23 10:42:29 2014 -0500
+
+    Removed 'version' from .gitignore file.
+
+commit b40dcefc5ee31f67aa3990e2e9d2ef8ed1386a25 (origin/master)
+Merge: 7101a8e b693b0c
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Mon Jun 23 10:39:05 2014 -0500
+
+    Merge pull request #11 from Maratyszcza/stable
+    
+    [sc]axpy kernels for PNaCl
+
+commit b693b0cddcfb41450e3c09a3ab97acb44c1ccdec
+Author: Marat Dukhan <maratek@gmail.com>
+Date:   Sun Jun 22 13:44:25 2014 -0700
+
+    [SC]AXPY kernels for PNaCl
+
+commit 7101a8eec0327d6c3a7eb36eb4b0fd45c1c6d162
+Merge: ad48dca 020a831
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Jun 19 21:46:50 2014 -0500
+
+    Merge pull request #10 from Maratyszcza/stable
+    
+    Portable Native Client port
+
+commit 020a831bc5f61744cb8354886aa679b99b1285f6
+Author: Marat Dukhan <maratek@gmail.com>
+Date:   Thu Jun 19 00:58:26 2014 -0700
+
+    Code clean-up in PNaCl port
+
+commit 491be4f91ed725522f5cc7184053857c6c376ada
+Author: Marat Dukhan <maratek@gmail.com>
+Date:   Thu Jun 19 00:45:44 2014 -0700
+
+    Optimized dot product kernels for PNaCl
+
+commit 4b8e71aab80182873a2e138eb07902b8d8fd5480
+Author: Marat Dukhan <maratek@gmail.com>
+Date:   Thu Jun 19 00:43:25 2014 -0700
+
+    Use AR rcs flags for PNaCl target to avoid warning
+
+commit 031deb2a5c718d569bde842590a791b812f4cf1d
+Author: Marat Dukhan <maratek@gmail.com>
+Date:   Wed Jun 18 03:11:34 2014 -0700
+
+    PNaCl configuration: use pnacl-ar instead or ar (fixes build issue on Mac)
+
+commit 68a02976e3c3638f0a9821342e269a1743e3ace3
+Author: Marat Dukhan <maratek@gmail.com>
+Date:   Wed Jun 18 03:10:25 2014 -0700
+
+    Compile pnacl configuration in GNU11 mode to avoid warning about non-standard features
+
+commit 6f8462eb0ec278b89731e73ef583386a3371d095
+Author: Marat Dukhan <maratek@gmail.com>
+Date:   Wed Jun 18 03:08:46 2014 -0700
+
+    Fix inconsistent VERBOSE macro in Makefile
+
+commit b2ffb4de8b6872cb23537ad282e557d11dcd9c8b
+Author: Marat Dukhan <maratek@gmail.com>
+Date:   Sun Jun 15 18:41:30 2014 -0400
+
+    Reformatted PNaCl GEMM kernels
+
+commit 6de2d472d98baa215264a776f3d5291780a6a085
+Author: Marat Dukhan <maratek@gmail.com>
+Date:   Sun Jun 15 08:44:31 2014 -0400
+
+    CGEMM and ZGEMM kernels for PNaCl
+
+commit f064711a5e6fb3852c17c7520909b09dc27665f2
+Author: Marat Dukhan <maratek@gmail.com>
+Date:   Sun Jun 15 06:27:37 2014 -0400
+
+    SGEMM and DGEMM kernels for PNaCl
+
+commit ad48dca22913a363899f0bef45553898718eebb1
+Merge: ee2b679 7118f87
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Sat Jun 14 15:10:13 2014 -0500
+
+    Merge pull request #9 from tkelman/memalign_windows
+    
+    Use _aligned_malloc instead of posix_memalign on Windows
+
+commit 7118f87e18b4941423472afc00215c1d1f2a1fcd
+Author: Tony Kelman <tony@kelman.net>
+Date:   Sat Jun 14 06:53:20 2014 -0700
+
+    Use _aligned_malloc instead of posix_memalign on Windows
+
+commit ee2b679281ca45fb40b2198e293bc3bc3d446632
+Author: Tyler Smith <tms@cs.utexas.edu>
+Date:   Fri Jun 6 12:41:55 2014 -0500
+
+    Only include omp.h if BLIS_ENABLE_OPENMP is set
+
+commit 19c05dfaac43c627f86e897c8c00f1f9440754aa
+Author: Field G. Van Zee <field@cs.utexas.edu>
+Date:   Thu Jun 5 10:54:16 2014 -0500
+
+    CHANGELOG update (for 0.1.2).
+
+commit 00f232f8ed1f7c41619b12ebf779ebe2c3b2d3cd (tag: 0.1.2)
 Author: Tyler Smith <tms@cs.utexas.edu>
 Date:   Mon Jun 2 13:40:57 2014 -0500
 

From b80df0f2cffb015da02e70a82b8512da9891ab67 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 23 Jun 2014 13:52:39 -0500
Subject: [PATCH 17/19] Added bump-version.sh script to 'build' directory.

Details:
- Added a bash script, bump-version.sh, to aid in incrementing the BLIS
  version string.
---
 build/bump-version.sh | 201 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100755 build/bump-version.sh

diff --git a/build/bump-version.sh b/build/bump-version.sh
new file mode 100755
index 000000000..4b0fece2f
--- /dev/null
+++ b/build/bump-version.sh
@@ -0,0 +1,201 @@
+#!/bin/bash
+#
+#  BLIS    
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name of The University of Texas nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+#
+# bump-version.sh
+#
+# Field G. Van Zee
+#
+
+
+print_usage()
+{
+	#local script_name
+	
+	# Get the script name
+	#script_name=${0##*/}
+	
+	# Echo usage info
+	echo " "
+	echo " "$script_name
+	echo " "
+	echo " Field G. Van Zee"
+	echo " "
+	echo " Performs a series of actions needed when incrementing (bumping) the"
+	echo " BLIS version number."
+	echo " "
+	echo " Usage:"
+	echo "   ${script_name} [options] new_vers"
+	echo " "
+	echo " Arguments:"
+	echo " "
+	echo "   new_vers     The new version string."
+	echo " "
+	echo " Options:"
+	echo " "
+	echo "   -d           dry-run"
+	echo "                  Go through all the motions, but don't actually make any"
+	echo "                  changes to files or perform any git commits."
+	echo "   -f VERSFILE  version file name"
+	echo "                  Update VERSFILE with new version string instead of default"
+	echo "                  'version' file."
+	
+	# Exit with non-zero exit status
+	exit 1
+}
+
+
+main()
+{
+	# -- BEGIN GLOBAL VARIABLE DECLARATIONS --
+
+	# The name of the script, stripped of any preceeding path.
+	script_name=${0##*/}
+
+	# The name of the CHANGELOG file.
+	changelog_file='CHANGELOG'
+
+	# The name of the default version file.
+	version_file_def='version'
+
+	# The name of the specified version file.
+	version_file=''
+
+	# Strings used during version query.
+	git_commit_str=''
+	new_version_str=''
+
+	# The script name to use instead of the $0 when outputting messages.
+	output_name=''
+
+	# The git directory.
+	gitdir='.git'
+	
+	# Whether we are performing a dry run or not.
+	dry_run_flag=""	
+
+	# -- END GLOBAL VARIABLE DECLARATIONS --
+
+
+	# Process our command line options.
+	while getopts ":dhf:" opt; do
+		case $opt in
+			d  ) dry_run_flag="1" ;;
+			f  ) version_file=$OPTARG ;;
+			h  ) print_usage ;;
+			\? ) print_usage
+		esac
+	done
+	shift $(($OPTIND - 1))
+
+
+	# If a version file name was not given, set version_file to the default
+	# value.
+	if [ -n "${version_file}" ]; then
+
+		echo "${script_name}: version file specified: '${version_file}'."
+	else
+
+		echo "${script_name}: no version file specified; defaulting to '${version_file_def}'."
+		version_file="${version_file_def}"
+	fi
+
+
+	# Check the number of arguments after command line option processing.
+	if [ $# = "1" ]; then
+
+		new_version_str=$1
+		echo "${script_name}: preparing to bump to version '${new_version_str}'."
+
+	else
+		print_usage
+	fi
+
+
+	# Check if the .git dir exists; if it does not, we do nothing.
+	if [ -d "${gitdir}" ]; then
+
+		echo "${script_name}: found '${gitdir}' directory; assuming git clone."
+
+		#echo "${script_name}: executing: git describe --always."
+		git_commit_str=$(git describe --always)
+		echo "${script_name}: starting commit: ${git_commit_str}."
+
+		echo "${script_name}: updating version file '${version_file}'."
+		if [ -z "$dry_run_flag" ]; then
+			echo "${new_version_str}" > ${version_file}
+		fi
+
+		echo "${script_name}: executing: git commit -m \"Version file update (${new_version_str})\" ${version_file}."
+		if [ -z "$dry_run_flag" ]; then
+			git commit -m "Version file update (${new_version_str})" ${version_file}
+		fi
+
+		#echo "${script_name}: executing: git describe --always."
+		git_commit_str=$(git describe --always)
+		echo "${script_name}: commit to be tagged: ${git_commit_str}."
+
+		echo "${script_name}: executing: git tag ${new_version_str} ${git_commit_str}."
+		if [ -z "$dry_run_flag" ]; then
+			git tag ${new_version_str} ${git_commit_str}
+		fi
+
+		echo "${script_name}: updating ${changelog_file}."
+		if [ -z "$dry_run_flag" ]; then
+			make changelog
+		fi
+
+		echo "${script_name}: executing: git commit -m \"CHANGELOG update (${new_version_str})\" ${changelog_file}."
+		if [ -z "$dry_run_flag" ]; then
+			git commit -m "CHANGELOG update (${new_version_str})" ${changelog_file}
+		fi
+
+		#echo "${script_name}: executing: git describe --always."
+		git_commit_str=$(git describe --always)
+		echo "${script_name}: latest commit: ${git_commit_str}."
+
+	else
+
+		echo "${script_name}: could not find '${gitdir}' directory; bailing out."
+
+	fi
+
+
+	# Exit peacefully.
+	return 0
+}
+
+
+# The script's main entry point, passing all parameters given.
+main "$@"

From e8ef69692831db07ddbe9485a5e504ac3f03e496 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Wed, 2 Jul 2014 14:59:27 -0500
Subject: [PATCH 18/19] Added shared library support to build system.

Details:
- Modified top-level Makefile to support building shared (dynamic)
  libraries.
- Updated most configurations' make_defs.mk files to include necessary
  compiler/linker flags needed by top-level Makefile.
- Note that by default, all configurations presently do NOT build
  shared libraries. To enable, one must change the value of
  BLIS_ENABLE_DYNAMIC_BUILD to 'yes'.
---
 Makefile                        | 72 ++++++++++++++++++++++++++++-----
 config/armv7a/make_defs.mk      |  8 ++--
 config/bgq/make_defs.mk         |  7 ++--
 config/cortex-a15/make_defs.mk  |  8 ++--
 config/cortex-a9/make_defs.mk   |  8 ++--
 config/dunnington/make_defs.mk  |  8 ++--
 config/loongson3a/make_defs.mk  |  8 ++--
 config/mic/make_defs.mk         |  8 ++--
 config/piledriver/make_defs.mk  |  8 ++--
 config/pnacl/make_defs.mk       |  8 ++--
 config/power7/make_defs.mk      | 11 ++---
 config/reference/make_defs.mk   | 10 +++--
 config/sandybridge/make_defs.mk |  8 ++--
 config/template/make_defs.mk    |  8 ++--
 14 files changed, 129 insertions(+), 51 deletions(-)

diff --git a/Makefile b/Makefile
index ca1d50fd5..8c49a2d6c 100644
--- a/Makefile
+++ b/Makefile
@@ -174,7 +174,7 @@ VERS_CONF              := $(VERSION)-$(CONFIG_NAME)
 # Note: These names will be modified later to include the configuration and
 # version strings.
 BLIS_LIB_NAME      := $(BLIS_LIB_BASE_NAME).a
-#BLIS_DLL_NAME      := $(BLIS_LIB_BASE_NAME).so
+BLIS_DLL_NAME      := $(BLIS_LIB_BASE_NAME).so
 
 # --- BLIS framework source and object variable names ---
 
@@ -196,8 +196,9 @@ MK_CONFIG_OBJS         :=
 MK_CONFIG_NOOPT_OBJS   :=
 MK_CONFIG_KERNELS_OBJS :=
 
-# Append the base library path to the library name.
+# Append the base library path to the library names.
 MK_ALL_BLIS_LIB        := $(BASE_LIB_PATH)/$(BLIS_LIB_NAME)
+MK_ALL_BLIS_DLL        := $(BASE_LIB_PATH)/$(BLIS_DLL_NAME)
 
 # --- Define install target names for static libraries ---
 
@@ -209,6 +210,16 @@ MK_BLIS_LIB_INST_W_VERS_CONF := $(patsubst $(BASE_LIB_PATH)/%.a, \
                                            $(INSTALL_PREFIX)/lib/%-$(VERS_CONF).a, \
                                            $(MK_BLIS_LIB))
 
+# --- Define install target names for shared libraries ---
+
+MK_BLIS_DLL                  := $(MK_ALL_BLIS_DLL)
+MK_BLIS_DLL_INST             := $(patsubst $(BASE_LIB_PATH)/%.so, \
+                                           $(INSTALL_PREFIX)/lib/%.so, \
+                                           $(MK_BLIS_DLL))
+MK_BLIS_DLL_INST_W_VERS_CONF := $(patsubst $(BASE_LIB_PATH)/%.so, \
+                                           $(INSTALL_PREFIX)/lib/%-$(VERS_CONF).so, \
+                                           $(MK_BLIS_DLL))
+
 # --- Determine which libraries to build ---
 
 MK_LIBS                           :=
@@ -221,6 +232,12 @@ MK_LIBS_INST                      += $(MK_BLIS_LIB_INST)
 MK_LIBS_INST_W_VERS_CONF          += $(MK_BLIS_LIB_INST_W_VERS_CONF)
 endif
 
+ifeq ($(BLIS_ENABLE_DYNAMIC_BUILD),yes)
+MK_LIBS                           += $(MK_BLIS_DLL)
+MK_LIBS_INST                      += $(MK_BLIS_DLL_INST)
+MK_LIBS_INST_W_VERS_CONF          += $(MK_BLIS_DLL_INST_W_VERS_CONF)
+endif
+
 # Strip leading, internal, and trailing whitespace.
 MK_LIBS_INST                      := $(strip $(MK_LIBS_INST))
 MK_LIBS_INST_W_VERS_CONF          := $(strip $(MK_LIBS_INST_W_VERS_CONF))
@@ -385,12 +402,12 @@ TESTSUITE_BIN           := $(TESTSUITE_NAME).x
 # --- Uninstall definitions ----------------------------------------------------
 #
 
-# This shell command grabs all files named "libblis-*.a" in the installation
-# directory and then filters out the name of the library archive for the
-# current version/configuration. We consider this remaining set of libraries
-# to be "old" and eligible for removal upon running of the uninstall-old
-# target.
-UNINSTALL_LIBS   := $(shell $(FIND) $(INSTALL_PREFIX)/lib/ -name "$(BLIS_LIB_BASE_NAME)-*.a" 2> /dev/null | $(GREP) -v "$(BLIS_LIB_BASE_NAME)-$(VERS_CONF).a" | $(GREP) -v $(BLIS_LIB_NAME))
+# This shell command grabs all files named "libblis-*.a" or "libblis-*.so" in
+# the installation directory and then filters out the name of the library
+# archive for the current version/configuration. We consider this remaining set
+# of libraries to be "old" and eligible for removal upon running of the
+# uninstall-old target.
+UNINSTALL_LIBS   := $(shell $(FIND) $(INSTALL_PREFIX)/lib/ -name "$(BLIS_LIB_BASE_NAME)-*.[a|so]" 2> /dev/null | $(GREP) -v "$(BLIS_LIB_BASE_NAME)-$(VERS_CONF).[a|so]" | $(GREP) -v $(BLIS_LIB_NAME))
 
 
 
@@ -464,10 +481,13 @@ ifeq ($(MAKE_DEFS_MK_PRESENT),no)
 endif
 
 
-# --- Static library archiver rules ---
+# --- All-purpose library rule (static and shared) ---
 
 blis-lib: check-env $(MK_LIBS)
 
+
+# --- Static library archiver rules ---
+
 $(MK_ALL_BLIS_LIB): $(MK_ALL_BLIS_OBJS)
 ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
 	$(AR) $(ARFLAGS) $@ $?
@@ -479,6 +499,17 @@ else
 endif
 
 
+# --- Dynamic library linker rules ---
+
+$(MK_ALL_BLIS_DLL): $(MK_ALL_BLIS_OBJS)
+ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
+	$(LINKER) $(SOFLAGS) $(LDFLAGS) -o $@ $?
+else 
+	@echo "Dynamically linking $@"
+	@$(LINKER) $(SOFLAGS) $(LDFLAGS) -o $@ $?
+endif
+
+
 # --- Test suite rules ---
 
 testsuite: testsuite-run
@@ -540,6 +571,16 @@ else
 	@$(INSTALL) -m 0644 $< $@
 endif
 
+$(INSTALL_PREFIX)/lib/%-$(VERS_CONF).so: $(BASE_LIB_PATH)/%.so $(CONFIG_MK_FILE)
+ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
+	$(INSTALL) -m 0755 -d $(@D)
+	$(INSTALL) -m 0644 $< $@
+else
+	@echo "Installing $(@F) into $(INSTALL_PREFIX)/lib/"
+	@$(INSTALL) -m 0755 -d $(@D)
+	@$(INSTALL) -m 0644 $< $@
+endif
+
 
 # --- Install-symlinks rules ---
 
@@ -555,6 +596,16 @@ else
 	@$(MV) $(@F) $(INSTALL_PREFIX)/lib/
 endif
 
+$(INSTALL_PREFIX)/lib/%.so: $(INSTALL_PREFIX)/lib/%-$(VERS_CONF).so
+ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
+	$(SYMLINK) $(<F) $(@F)
+	$(MV) $(@F) $(INSTALL_PREFIX)/lib/
+else
+	@echo "Installing symlink $(@F) into $(INSTALL_PREFIX)/lib/"
+	@$(SYMLINK) $(<F) $(@F)
+	@$(MV) $(@F) $(INSTALL_PREFIX)/lib/
+endif
+
 
 # --- Query current configuration ---
 
@@ -569,6 +620,7 @@ ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
 	- $(FIND) $(BASE_OBJ_CONFIG_PATH) -name "*.o" | $(XARGS) $(RM_F)
 	- $(FIND) $(BASE_OBJ_FRAME_PATH) -name "*.o" | $(XARGS) $(RM_F)
 	- $(FIND) $(BASE_LIB_PATH) -name "*.a" | $(XARGS) $(RM_F)
+	- $(FIND) $(BASE_LIB_PATH) -name "*.so" | $(XARGS) $(RM_F)
 else
 	@echo "Removing .o files from $(BASE_OBJ_CONFIG_PATH)."
 	@- $(FIND) $(BASE_OBJ_CONFIG_PATH) -name "*.o" | $(XARGS) $(RM_F)
@@ -576,6 +628,8 @@ else
 	@- $(FIND) $(BASE_OBJ_FRAME_PATH) -name "*.o" | $(XARGS) $(RM_F)
 	@echo "Removing .a files from $(BASE_LIB_PATH)."
 	@- $(FIND) $(BASE_LIB_PATH) -name "*.a" | $(XARGS) $(RM_F)
+	@echo "Removing .so files from $(BASE_LIB_PATH)."
+	@- $(FIND) $(BASE_LIB_PATH) -name "*.so" | $(XARGS) $(RM_F)
 endif
 
 cleantest: check-env
diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk
index 0b13d31e1..ecb2bc6b1 100644
--- a/config/armv7a/make_defs.mk
+++ b/config/armv7a/make_defs.mk
@@ -81,6 +81,7 @@ CC             := gcc
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS     := -std=c99 -O3 -mfloat-abi=hard -mfpu=vfpv3 -marm -march=armv7-a #-g
+CPICFLAGS      := -fPIC
 CDBGFLAGS      := #-g
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -marm -march=armv7-a -mfpu=vfpv3 -O3 -mfloat-abi=hard #-g
@@ -90,9 +91,9 @@ CVECFLAGS      := #-msse3  # -mfpmath=sse
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
 # modes.
-CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
 AR             := ar
@@ -100,6 +101,7 @@ ARFLAGS        := cru
 
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
+SOFLAGS        := -shared
 LDFLAGS        := -lm
 
 
diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk
index f21b673d6..dfb96ca95 100644
--- a/config/bgq/make_defs.mk
+++ b/config/bgq/make_defs.mk
@@ -82,6 +82,7 @@ CC             := /bgsys/drivers/ppcfloor/comm/gcc.legacy/bin/mpixlc_r
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L \
                   -I/bgsys/drivers/ppcfloor -I/bgsys/drivers/ppcfloor/spi/include/kernel/cnk
 CMISCFLAGS     := -qthreaded -qsmp=omp -qasm=gcc -qkeyword=asm # -qreport -qsource -qlistopt -qlist
+CPICFLAGS      := 
 CDBGFLAGS      :=
 CWARNFLAGS     := -w
 COPTFLAGS      := -O3
@@ -91,9 +92,9 @@ CVECFLAGS      := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunr
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
 # modes.
-CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
 AR             := ar
diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk
index 92af82765..b06e36117 100644
--- a/config/cortex-a15/make_defs.mk
+++ b/config/cortex-a15/make_defs.mk
@@ -81,6 +81,7 @@ CC             := gcc
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS     := -std=c99 -mfloat-abi=hard -mfpu=neon 
+CPICFLAGS      := -fPIC
 CDBGFLAGS      := -g
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -march=armv7-a -mfpu=neon -O2
@@ -90,9 +91,9 @@ CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
 # modes.
-CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
 AR             := ar
@@ -100,6 +101,7 @@ ARFLAGS        := cru
 
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
+SOFLAGS        := -shared
 LDFLAGS        := -lm
 
 
diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk
index 4a0398421..f564aa643 100644
--- a/config/cortex-a9/make_defs.mk
+++ b/config/cortex-a9/make_defs.mk
@@ -81,6 +81,7 @@ CC             := gcc
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS     := -std=c99 -mfloat-abi=hard -mfpu=neon 
+CPICFLAGS      := -fPIC
 CDBGFLAGS      := -g
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -march=armv7-a -mfpu=neon -O2 -mfloat-abi=hard
@@ -90,9 +91,9 @@ CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
 # modes.
-CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
 AR             := ar
@@ -100,6 +101,7 @@ ARFLAGS        := cru
 
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
+SOFLAGS        := -shared
 LDFLAGS        := -lm
 
 
diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk
index a1f741d8e..1c0c0a67b 100644
--- a/config/dunnington/make_defs.mk
+++ b/config/dunnington/make_defs.mk
@@ -81,6 +81,7 @@ CC             := gcc
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS     := -std=c99 # -fopenmp -pg
+CPICFLAGS      := -fPIC
 CDBGFLAGS      := #-g
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -O2 -mfpmath=sse -fomit-frame-pointer
@@ -90,9 +91,9 @@ CVECFLAGS      := -msse3 -march=native
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
 # modes.
-CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
 AR             := ar
@@ -100,6 +101,7 @@ ARFLAGS        := cru
 
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
+SOFLAGS        := -shared
 LDFLAGS        := -lm
 
 
diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk
index 1a43c5b9b..6e8c4cdfa 100644
--- a/config/loongson3a/make_defs.mk
+++ b/config/loongson3a/make_defs.mk
@@ -81,6 +81,7 @@ CC             := gcc
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L -mabi=64
 CMISCFLAGS     := -std=c99 -fopenmp #-pg
+CPICFLAGS      := -fPIC
 CDBGFLAGS      := -g
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -O3  -march=loongson3a -mtune=loongson3a
@@ -90,9 +91,9 @@ CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
 # modes.
-CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
 AR             := ar
@@ -100,6 +101,7 @@ ARFLAGS        := cru
 
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
+SOFLAGS        := -shared
 LDFLAGS        := -lm
 
 
diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk
index b09bea493..68c9ad385 100644
--- a/config/mic/make_defs.mk
+++ b/config/mic/make_defs.mk
@@ -79,6 +79,7 @@ GIT_LOG    := $(GIT) log --decorate
 CC             := icc
 CPPROCFLAGS    :=
 CMISCFLAGS     := -mmic -fasm-blocks -std=c99 -openmp
+CPICFLAGS      := -fPIC
 CDBGFLAGS      :=
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -O3
@@ -88,9 +89,9 @@ CVECFLAGS      :=
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
 # modes.
-CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
 AR             := ar
@@ -98,6 +99,7 @@ ARFLAGS        := cru
 
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
+SOFLAGS        := -shared
 LDFLAGS        := -mmic -lm -openmp
 
 
diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk
index fd7320db8..edb496303 100644
--- a/config/piledriver/make_defs.mk
+++ b/config/piledriver/make_defs.mk
@@ -81,6 +81,7 @@ CC             := gcc
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS     := -std=c99 -fopenmp
+CPICFLAGS      := -fPIC
 CDBGFLAGS      := -g
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -O0 -malign-double -funroll-all-loops
@@ -90,9 +91,9 @@ CVECFLAGS      := -mavx -mfma -march=bdver2 -mfpmath=sse
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
 # modes.
-CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
 AR             := ar
@@ -100,6 +101,7 @@ ARFLAGS        := cru
 
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
+SOFLAGS        := -shared
 LDFLAGS        := -lm
 
 
diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk
index 1c4566012..a6e762984 100644
--- a/config/pnacl/make_defs.mk
+++ b/config/pnacl/make_defs.mk
@@ -81,6 +81,7 @@ CC             := pnacl-clang
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS     := -std=gnu11 -I$(NACL_SDK_ROOT)/include
+CPICFLAGS      := 
 CDBGFLAGS      := -g
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -O3
@@ -90,9 +91,9 @@ CVECFLAGS      :=
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
 # modes.
-CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
 AR             := pnacl-ar
@@ -100,6 +101,7 @@ ARFLAGS        := rcs
 
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
+SOFLAGS        := 
 LDFLAGS        := -lm
 
 # --- Determine the finalizer and related flags ---
diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk
index de3f05b59..5be05ee05 100644
--- a/config/power7/make_defs.mk
+++ b/config/power7/make_defs.mk
@@ -80,8 +80,8 @@ CC             := gcc
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
-CMISCFLAGS     := -std=c99 # -fopenmp -pg
-CMISCFLAGS     += -m64 -mcpu=power7
+CMISCFLAGS     := -std=c99 -m64 -mcpu=power7 #-fopenmp -pg
+CPICFLAGS      := -fPIC
 CDBGFLAGS      := -g
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -O3 -mtune=power7
@@ -91,9 +91,9 @@ CVECFLAGS      := -mvsx
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
 # modes.
-CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
 AR             := ar
@@ -101,6 +101,7 @@ ARFLAGS        := cru
 
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
+SOFLAGS        := -shared
 LDFLAGS        := -lm
 
 
diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk
index ab2b5a462..55abe4d6d 100644
--- a/config/reference/make_defs.mk
+++ b/config/reference/make_defs.mk
@@ -81,7 +81,8 @@ CC             := gcc
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS     := -std=c99 # -fopenmp -pg
-CDBGFLAGS      := -g
+CPICFLAGS      := -fPIC
+CDBGFLAGS      := #-g
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -O2
 CKOPTFLAGS     := $(COPTFLAGS)
@@ -90,9 +91,9 @@ CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
 # modes.
-CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
 AR             := ar
@@ -100,6 +101,7 @@ ARFLAGS        := cru
 
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
+SOFLAGS        := -shared
 LDFLAGS        := -lm
 
 
diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk
index c075bf28a..cfb208810 100644
--- a/config/sandybridge/make_defs.mk
+++ b/config/sandybridge/make_defs.mk
@@ -81,6 +81,7 @@ CC             := gcc
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS     := -std=c99 -m64 -fopenmp  # -fopenmp -pg
+CPICFLAGS      := -fPIC
 CDBGFLAGS      := #-g
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -O3 -march=native
@@ -90,9 +91,9 @@ CVECFLAGS      := -mavx -mfpmath=sse #-msse3 -march=native # -mfpmath=sse
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
 # modes.
-CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
 AR             := ar
@@ -100,6 +101,7 @@ ARFLAGS        := cru
 
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
+SOFLAGS        := -shared
 LDFLAGS        := -lm
 
 
diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk
index ab2b5a462..5e9027aa7 100644
--- a/config/template/make_defs.mk
+++ b/config/template/make_defs.mk
@@ -81,6 +81,7 @@ CC             := gcc
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS     := -std=c99 # -fopenmp -pg
+CPICFLAGS      := -fPIC
 CDBGFLAGS      := -g
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -O2
@@ -90,9 +91,9 @@ CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
 # modes.
-CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 
 # --- Determine the archiver and related flags ---
 AR             := ar
@@ -100,6 +101,7 @@ ARFLAGS        := cru
 
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
+SOFLAGS        := -shared
 LDFLAGS        := -lm
 
 

From 4702350278af31f662b458127777dd4d85a3192f Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Thu, 3 Jul 2014 11:48:23 -0500
Subject: [PATCH 19/19] Defined _ukernel_void() wrappers to micro-kernels.

Details:
- Added wrappers for micro-kernels so that users may invoke the
  micro-kernels without knowing what the function names actually are.
  This is useful when an application wishes to call the micro-kernel
  from a shared library instance of BLIS, where the application may not
  necessarily have the luxury of grabbing the micro-kernel name(s) from
  C preprocessor macros at compile-time. Also, since the wrappers use
  void* pointers, one's environment does not need to be aware of some
  BLIS types such as scomplex and dcomplex. These wrappers now join the
  level-1 and level-1f kernel wrappers, which pre-dated this commit.
- Removed the wrapper definitions and prototypes from the micro-kernel
  test suite modules, and replaced calls to them with calls to the new
  wrappers mentioned above.
---
 frame/3/gemm/bli_gemm.h             |   2 +
 frame/3/gemm/bli_gemm_ukernel.c     | 126 +++++++++++++++++++++++++
 frame/3/gemm/bli_gemm_ukernel.h     |  60 ++++++++++++
 frame/3/trsm/bli_gemmtrsm_ukernel.c | 141 ++++++++++++++++++++++++++++
 frame/3/trsm/bli_gemmtrsm_ukernel.h |  63 +++++++++++++
 frame/3/trsm/bli_trsm.h             |   3 +
 frame/3/trsm/bli_trsm_ukernel.c     | 111 ++++++++++++++++++++++
 frame/3/trsm/bli_trsm_ukernel.h     |  56 +++++++++++
 testsuite/src/test_gemm_ukr.c       |  98 +------------------
 testsuite/src/test_gemm_ukr.h       |  25 -----
 testsuite/src/test_gemmtrsm_ukr.c   | 114 +---------------------
 testsuite/src/test_gemmtrsm_ukr.h   |  27 ------
 testsuite/src/test_trsm_ukr.c       |  83 +---------------
 testsuite/src/test_trsm_ukr.h       |  20 ----
 14 files changed, 565 insertions(+), 364 deletions(-)
 create mode 100644 frame/3/gemm/bli_gemm_ukernel.c
 create mode 100644 frame/3/gemm/bli_gemm_ukernel.h
 create mode 100644 frame/3/trsm/bli_gemmtrsm_ukernel.c
 create mode 100644 frame/3/trsm/bli_gemmtrsm_ukernel.h
 create mode 100644 frame/3/trsm/bli_trsm_ukernel.c
 create mode 100644 frame/3/trsm/bli_trsm_ukernel.h

diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h
index c47985a3e..704402f58 100644
--- a/frame/3/gemm/bli_gemm.h
+++ b/frame/3/gemm/bli_gemm.h
@@ -38,6 +38,8 @@
 #include "bli_gemm_int.h"
 #include "bli_gemm_target.h"
 
+#include "bli_gemm_ukernel.h"
+
 #include "bli_gemm_blk_var1f.h"
 #include "bli_gemm_blk_var2f.h"
 #include "bli_gemm_blk_var3f.h"
diff --git a/frame/3/gemm/bli_gemm_ukernel.c b/frame/3/gemm/bli_gemm_ukernel.c
new file mode 100644
index 000000000..5418d40a4
--- /dev/null
+++ b/frame/3/gemm/bli_gemm_ukernel.c
@@ -0,0 +1,126 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemm_ukr_fp
+
+typedef void (*FUNCPTR_T)(
+                           dim_t      k,
+                           void*      alpha,
+                           void*      a,
+                           void*      b,
+                           void*      beta,
+                           void*      c, inc_t rs_c, inc_t cs_c,
+                           auxinfo_t* data
+                         );
+
+static FUNCPTR_T GENARRAY(ftypes,gemm_ukernel_void);
+
+
+void bli_gemm_ukernel( obj_t*  alpha,
+                       obj_t*  a,
+                       obj_t*  b,
+                       obj_t*  beta,
+                       obj_t*  c )
+{
+	num_t     dt        = bli_obj_datatype( *c );
+
+	dim_t     k         = bli_obj_width( *a );
+
+	void*     buf_a     = bli_obj_buffer_at_off( *a );
+
+	void*     buf_b     = bli_obj_buffer_at_off( *b );
+
+	void*     buf_c     = bli_obj_buffer_at_off( *c );
+	inc_t     rs_c      = bli_obj_row_stride( *c );
+	inc_t     cs_c      = bli_obj_col_stride( *c );
+
+	void*     buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha );
+
+	void*     buf_beta  = bli_obj_buffer_for_1x1( dt, *beta );
+
+	inc_t     ps_a      = bli_obj_panel_stride( *a );
+	inc_t     ps_b      = bli_obj_panel_stride( *b );
+
+	FUNCPTR_T f;
+
+	auxinfo_t data;
+
+
+	// Fill the auxinfo_t struct in case the micro-kernel uses it.
+	bli_auxinfo_set_next_a( buf_a, data );
+	bli_auxinfo_set_next_b( buf_b, data );
+	bli_auxinfo_set_ps_a( ps_a, data );
+	bli_auxinfo_set_ps_b( ps_b, data );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	f = ftypes[dt];
+
+	// Invoke the function.
+	f( k,
+	   buf_alpha,
+	   buf_a,
+	   buf_b,
+	   buf_beta,
+	   buf_c, rs_c, cs_c,
+	   &data );
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname, ukrname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           dim_t      k, \
+                           void*      alpha, \
+                           void*      a, \
+                           void*      b, \
+                           void*      beta, \
+                           void*      c, inc_t rs_c, inc_t cs_c, \
+                           auxinfo_t* data  \
+                         ) \
+{ \
+	PASTEMAC(ch,ukrname)( k, \
+	                      alpha, \
+	                      a, \
+	                      b, \
+	                      beta, \
+	                      c, rs_c, cs_c, \
+	                      data ); \
+}
+
+INSERT_GENTFUNC_BASIC( gemm_ukernel_void, GEMM_UKERNEL )
+
diff --git a/frame/3/gemm/bli_gemm_ukernel.h b/frame/3/gemm/bli_gemm_ukernel.h
new file mode 100644
index 000000000..a4c4e0576
--- /dev/null
+++ b/frame/3/gemm/bli_gemm_ukernel.h
@@ -0,0 +1,60 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+void bli_gemm_ukernel( obj_t*  alpha,
+                       obj_t*  a,
+                       obj_t*  b,
+                       obj_t*  beta,
+                       obj_t*  c );
+
+
+//
+// Prototype the void pointer kernel wrappers.
+//
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           dim_t      k, \
+                           void*      alpha, \
+                           void*      a, \
+                           void*      b, \
+                           void*      beta, \
+                           void*      c, inc_t rs_c, inc_t cs_c, \
+                           auxinfo_t* data  \
+                         );
+
+INSERT_GENTPROT_BASIC( gemm_ukernel_void )
+
diff --git a/frame/3/trsm/bli_gemmtrsm_ukernel.c b/frame/3/trsm/bli_gemmtrsm_ukernel.c
new file mode 100644
index 000000000..65ef744d3
--- /dev/null
+++ b/frame/3/trsm/bli_gemmtrsm_ukernel.c
@@ -0,0 +1,141 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#undef  FUNCPTR_T
+#define FUNCPTR_T gemmtrsm_ukr_fp
+
+typedef void (*FUNCPTR_T)(
+                           dim_t      k,
+                           void*      alpha,
+                           void*      a1x,
+                           void*      a11,
+                           void*      bx1,
+                           void*      b11,
+                           void*      c11, inc_t rs_c, inc_t cs_c,
+                           auxinfo_t* data
+                         );
+
+static FUNCPTR_T GENARRAY(ftypes_l,gemmtrsm_l_ukernel_void);
+static FUNCPTR_T GENARRAY(ftypes_u,gemmtrsm_u_ukernel_void);
+
+
+void bli_gemmtrsm_ukernel( obj_t*  alpha,
+                           obj_t*  a1x,
+                           obj_t*  a11,
+                           obj_t*  bx1,
+                           obj_t*  b11,
+                           obj_t*  c11 )
+{
+	dim_t     k         = bli_obj_width( *a1x );
+
+	num_t     dt        = bli_obj_datatype( *c11 );
+
+	void*     buf_a1x   = bli_obj_buffer_at_off( *a1x );
+
+	void*     buf_a11   = bli_obj_buffer_at_off( *a11 );
+
+	void*     buf_bx1   = bli_obj_buffer_at_off( *bx1 );
+
+	void*     buf_b11   = bli_obj_buffer_at_off( *b11 );
+
+	void*     buf_c11   = bli_obj_buffer_at_off( *c11 );
+	inc_t     rs_c      = bli_obj_row_stride( *c11 );
+	inc_t     cs_c      = bli_obj_col_stride( *c11 );
+
+	void*     buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha );
+
+	inc_t     ps_a      = bli_obj_panel_stride( *a1x );
+	inc_t     ps_b      = bli_obj_panel_stride( *bx1 );
+
+	FUNCPTR_T f;
+
+	auxinfo_t data;
+
+
+	// Fill the auxinfo_t struct in case the micro-kernel uses it.
+	if ( bli_obj_is_lower( *a11 ) )
+	{ bli_auxinfo_set_next_a( buf_a1x, data ); }
+	else
+	{ bli_auxinfo_set_next_a( buf_a11, data ); }
+	bli_auxinfo_set_next_b( buf_bx1, data );
+
+	bli_auxinfo_set_ps_a( ps_a, data );
+	bli_auxinfo_set_ps_b( ps_b, data );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	if ( bli_obj_is_lower( *a11 ) ) f = ftypes_l[dt];
+	else                            f = ftypes_u[dt];
+
+	// Invoke the function.
+	f( k,
+	   buf_alpha,
+	   buf_a1x,
+	   buf_a11,
+	   buf_bx1,
+	   buf_b11,
+	   buf_c11, rs_c, cs_c,
+	   &data );
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname, ukrname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           dim_t      k, \
+                           void*      alpha, \
+                           void*      a1x, \
+                           void*      a11, \
+                           void*      bx1, \
+                           void*      b11, \
+                           void*      c11, inc_t rs_c, inc_t cs_c, \
+                           auxinfo_t* data  \
+                         ) \
+{ \
+	PASTEMAC(ch,ukrname)( k, \
+	                      alpha, \
+	                      a1x, \
+	                      a11, \
+	                      bx1, \
+	                      b11, \
+	                      c11, rs_c, cs_c, \
+	                      data ); \
+}
+
+INSERT_GENTFUNC_BASIC( gemmtrsm_l_ukernel_void, GEMMTRSM_L_UKERNEL )
+INSERT_GENTFUNC_BASIC( gemmtrsm_u_ukernel_void, GEMMTRSM_U_UKERNEL )
+
diff --git a/frame/3/trsm/bli_gemmtrsm_ukernel.h b/frame/3/trsm/bli_gemmtrsm_ukernel.h
new file mode 100644
index 000000000..2cfb47bbd
--- /dev/null
+++ b/frame/3/trsm/bli_gemmtrsm_ukernel.h
@@ -0,0 +1,63 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+void bli_gemmtrsm_ukernel( obj_t*  alpha,
+                           obj_t*  a1x,
+                           obj_t*  a11,
+                           obj_t*  bx1,
+                           obj_t*  b11,
+                           obj_t*  c11 );
+
+
+//
+// Prototype the void pointer kernel wrappers.
+//
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           dim_t      k, \
+                           void*      alpha, \
+                           void*      a1x, \
+                           void*      a11, \
+                           void*      bx1, \
+                           void*      b11, \
+                           void*      c11, inc_t rs_c, inc_t cs_c, \
+                           auxinfo_t* data  \
+                         );
+
+INSERT_GENTPROT_BASIC( gemmtrsm_l_ukernel_void )
+INSERT_GENTPROT_BASIC( gemmtrsm_u_ukernel_void )
+
diff --git a/frame/3/trsm/bli_trsm.h b/frame/3/trsm/bli_trsm.h
index 5fa21cccc..d358dd66c 100644
--- a/frame/3/trsm/bli_trsm.h
+++ b/frame/3/trsm/bli_trsm.h
@@ -37,6 +37,9 @@
 #include "bli_trsm_front.h"
 #include "bli_trsm_int.h"
 
+#include "bli_gemmtrsm_ukernel.h"
+#include "bli_trsm_ukernel.h"
+
 #include "bli_trsm_blk_var1f.h"
 #include "bli_trsm_blk_var1b.h"
 
diff --git a/frame/3/trsm/bli_trsm_ukernel.c b/frame/3/trsm/bli_trsm_ukernel.c
new file mode 100644
index 000000000..fb19e0abb
--- /dev/null
+++ b/frame/3/trsm/bli_trsm_ukernel.c
@@ -0,0 +1,111 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#undef  FUNCPTR_T
+#define FUNCPTR_T trsm_ukr_fp
+
+typedef void (*FUNCPTR_T)(
+                           void*      a,
+                           void*      b,
+                           void*      c, inc_t rs_c, inc_t cs_c,
+                           auxinfo_t* data
+                         );
+
+static FUNCPTR_T GENARRAY(ftypes_l,trsm_l_ukernel_void);
+static FUNCPTR_T GENARRAY(ftypes_u,trsm_u_ukernel_void);
+
+
+void bli_trsm_ukernel( obj_t*  a,
+                       obj_t*  b,
+                       obj_t*  c )
+{
+	num_t     dt        = bli_obj_datatype( *c );
+
+	void*     buf_a     = bli_obj_buffer_at_off( *a );
+
+	void*     buf_b     = bli_obj_buffer_at_off( *b );
+
+	void*     buf_c     = bli_obj_buffer_at_off( *c );
+	inc_t     rs_c      = bli_obj_row_stride( *c );
+	inc_t     cs_c      = bli_obj_col_stride( *c );
+
+	inc_t     ps_a      = bli_obj_panel_stride( *a );
+	inc_t     ps_b      = bli_obj_panel_stride( *b );
+
+	FUNCPTR_T f;
+
+	auxinfo_t data;
+
+
+	// Fill the auxinfo_t struct in case the micro-kernel uses it.
+	bli_auxinfo_set_next_a( buf_a, data );
+	bli_auxinfo_set_next_b( buf_b, data );
+
+	bli_auxinfo_set_ps_a( ps_a, data );
+	bli_auxinfo_set_ps_b( ps_b, data );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	if ( bli_obj_is_lower( *a ) ) f = ftypes_l[dt];
+	else                          f = ftypes_u[dt];
+
+	// Invoke the function.
+	f( buf_a,
+	   buf_b,
+	   buf_c, rs_c, cs_c,
+	   &data );
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname, ukrname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           void*      a, \
+                           void*      b, \
+                           void*      c, inc_t rs_c, inc_t cs_c, \
+                           auxinfo_t* data  \
+                         ) \
+{ \
+	PASTEMAC(ch,ukrname)( a, \
+	                      b, \
+	                      c, rs_c, cs_c, \
+	                      data ); \
+}
+
+INSERT_GENTFUNC_BASIC( trsm_l_ukernel_void, TRSM_L_UKERNEL )
+INSERT_GENTFUNC_BASIC( trsm_u_ukernel_void, TRSM_U_UKERNEL )
+
diff --git a/frame/3/trsm/bli_trsm_ukernel.h b/frame/3/trsm/bli_trsm_ukernel.h
new file mode 100644
index 000000000..eaf18d1d4
--- /dev/null
+++ b/frame/3/trsm/bli_trsm_ukernel.h
@@ -0,0 +1,56 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+void bli_trsm_ukernel( obj_t*  a,
+                       obj_t*  b,
+                       obj_t*  c );
+
+
+//
+// Prototype the void pointer kernel wrappers.
+//
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           void*      a, \
+                           void*      b, \
+                           void*      c, inc_t rs_c, inc_t cs_c, \
+                           auxinfo_t* data  \
+                         );
+
+INSERT_GENTPROT_BASIC( trsm_l_ukernel_void )
+INSERT_GENTPROT_BASIC( trsm_u_ukernel_void )
+
diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c
index 3ce1b688f..c7464ce21 100644
--- a/testsuite/src/test_gemm_ukr.c
+++ b/testsuite/src/test_gemm_ukr.c
@@ -270,7 +270,7 @@ void libblis_test_gemm_ukr_impl( iface_t   iface,
 	switch ( iface )
 	{
 		case BLIS_TEST_SEQ_UKERNEL:
-		bli_gemm_ukr( alpha, a, b, beta, c );
+		bli_gemm_ukernel( alpha, a, b, beta, c );
 		break;
 
 		default:
@@ -354,99 +354,3 @@ void libblis_test_gemm_ukr_check( obj_t*  alpha,
 	bli_obj_free( &z );
 }
 
-
-
-//
-// Define object-wrapper to GEMM_UKERNEL micro-kernels.
-//
-
-#define FUNCPTR_T gemm_ukr_fp
-
-typedef void (*FUNCPTR_T)(
-                           dim_t      k,
-                           void*      alpha,
-                           void*      a,
-                           void*      b,
-                           void*      beta,
-                           void*      c, inc_t rs_c, inc_t cs_c,
-                           auxinfo_t* data
-                         );
-
-static FUNCPTR_T GENARRAY(ftypes,gemm_ukr);
-
-
-void bli_gemm_ukr( obj_t*  alpha,
-                   obj_t*  a,
-                   obj_t*  b,
-                   obj_t*  beta,
-                   obj_t*  c )
-{
-	num_t     dt        = bli_obj_datatype( *c );
-
-	dim_t     k         = bli_obj_width( *a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( *a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( *b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( *c );
-	inc_t     rs_c      = bli_obj_row_stride( *c );
-	inc_t     cs_c      = bli_obj_col_stride( *c );
-
-	void*     buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha );
-
-	void*     buf_beta  = bli_obj_buffer_for_1x1( dt, *beta );
-
-	inc_t     ps_a      = bli_obj_panel_stride( *a );
-	inc_t     ps_b      = bli_obj_panel_stride( *b );
-
-	FUNCPTR_T f;
-
-	auxinfo_t data;
-
-
-	// Fill the auxinfo_t struct in case the micro-kernel uses it.
-	bli_auxinfo_set_next_a( buf_a, data );
-	bli_auxinfo_set_next_b( buf_b, data );
-	bli_auxinfo_set_ps_a( ps_a, data );
-	bli_auxinfo_set_ps_b( ps_b, data );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt];
-
-	// Invoke the function.
-	f( k,
-	   buf_alpha,
-	   buf_a,
-	   buf_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   &data );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname, ukrname ) \
-\
-void PASTEMAC(ch,varname)( \
-                           dim_t      k, \
-                           void*      alpha, \
-                           void*      a, \
-                           void*      b, \
-                           void*      beta, \
-                           void*      c, inc_t rs_c, inc_t cs_c, \
-                           auxinfo_t* data  \
-                         ) \
-{ \
-	PASTEMAC(ch,ukrname)( k, \
-	                      alpha, \
-	                      a, \
-	                      b, \
-	                      beta, \
-	                      c, rs_c, cs_c, \
-	                      data ); \
-}
-
-INSERT_GENTFUNC_BASIC( gemm_ukr, GEMM_UKERNEL )
-
diff --git a/testsuite/src/test_gemm_ukr.h b/testsuite/src/test_gemm_ukr.h
index 002e033b6..72a563412 100644
--- a/testsuite/src/test_gemm_ukr.h
+++ b/testsuite/src/test_gemm_ukr.h
@@ -34,28 +34,3 @@
 
 void libblis_test_gemm_ukr( test_params_t* params, test_op_t* op );
 
-
-//
-// Prototype wrapper interfaces to micro-kernel.
-//
-void bli_gemm_ukr( obj_t*  alpha,
-                   obj_t*  a,
-                   obj_t*  b,
-                   obj_t*  beta,
-                   obj_t*  c );
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname)( \
-                           dim_t      k, \
-                           void*      alpha, \
-                           void*      a, \
-                           void*      b, \
-                           void*      beta, \
-                           void*      c, inc_t rs_c, inc_t cs_c, \
-                           auxinfo_t* data  \
-                         );
-
-INSERT_GENTPROT_BASIC( gemm_ukr )
-
diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c
index e2701d313..ff86640ae 100644
--- a/testsuite/src/test_gemmtrsm_ukr.c
+++ b/testsuite/src/test_gemmtrsm_ukr.c
@@ -314,7 +314,7 @@ void libblis_test_gemmtrsm_ukr_impl( iface_t   iface,
 	switch ( iface )
 	{
 		case BLIS_TEST_SEQ_UKERNEL:
-		bli_gemmtrsm_ukr( alpha, a1x, a11, bx1, b11, c11 );
+		bli_gemmtrsm_ukernel( alpha, a1x, a11, bx1, b11, c11 );
 		break;
 
 		default:
@@ -474,115 +474,3 @@ void bli_gemmtrsm_ukr_make_subparts( dim_t  k,
 	bli_obj_set_diag_offset( 0, *a11 );
 }
 
-
-
-//
-// Define object-wrapper to GEMMTRSM_L_UKERNEL, GEMMTRSM_U_UKERNEL
-// micro-kernels.
-//
-
-#undef  FUNCPTR_T
-#define FUNCPTR_T gemmtrsm_ukr_fp
-
-typedef void (*FUNCPTR_T)(
-                           dim_t      k,
-                           void*      alpha,
-                           void*      a1x,
-                           void*      a11,
-                           void*      bx1,
-                           void*      b11,
-                           void*      c11, inc_t rs_c, inc_t cs_c,
-                           auxinfo_t* data
-                         );
-
-static FUNCPTR_T GENARRAY(ftypes_l,gemmtrsm_l_ukr);
-static FUNCPTR_T GENARRAY(ftypes_u,gemmtrsm_u_ukr);
-
-
-void bli_gemmtrsm_ukr( obj_t*  alpha,
-                       obj_t*  a1x,
-                       obj_t*  a11,
-                       obj_t*  bx1,
-                       obj_t*  b11,
-                       obj_t*  c11 )
-{
-	dim_t     k         = bli_obj_width( *a1x );
-
-	num_t     dt        = bli_obj_datatype( *c11 );
-
-	void*     buf_a1x   = bli_obj_buffer_at_off( *a1x );
-
-	void*     buf_a11   = bli_obj_buffer_at_off( *a11 );
-
-	void*     buf_bx1   = bli_obj_buffer_at_off( *bx1 );
-
-	void*     buf_b11   = bli_obj_buffer_at_off( *b11 );
-
-	void*     buf_c11   = bli_obj_buffer_at_off( *c11 );
-	inc_t     rs_c      = bli_obj_row_stride( *c11 );
-	inc_t     cs_c      = bli_obj_col_stride( *c11 );
-
-	void*     buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha );
-
-	inc_t     ps_a      = bli_obj_panel_stride( *a1x );
-	inc_t     ps_b      = bli_obj_panel_stride( *bx1 );
-
-	FUNCPTR_T f;
-
-	auxinfo_t data;
-
-
-	// Fill the auxinfo_t struct in case the micro-kernel uses it.
-	if ( bli_obj_is_lower( *a11 ) )
-	{ bli_auxinfo_set_next_a( buf_a1x, data ); }
-	else
-	{ bli_auxinfo_set_next_a( buf_a11, data ); }
-	bli_auxinfo_set_next_b( buf_bx1, data );
-
-	bli_auxinfo_set_ps_a( ps_a, data );
-	bli_auxinfo_set_ps_b( ps_b, data );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	if ( bli_obj_is_lower( *a11 ) ) f = ftypes_l[dt];
-	else                            f = ftypes_u[dt];
-
-	// Invoke the function.
-	f( k,
-	   buf_alpha,
-	   buf_a1x,
-	   buf_a11,
-	   buf_bx1,
-	   buf_b11,
-	   buf_c11, rs_c, cs_c,
-	   &data );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname, ukrname ) \
-\
-void PASTEMAC(ch,varname)( \
-                           dim_t      k, \
-                           void*      alpha, \
-                           void*      a1x, \
-                           void*      a11, \
-                           void*      bx1, \
-                           void*      b11, \
-                           void*      c11, inc_t rs_c, inc_t cs_c, \
-                           auxinfo_t* data  \
-                         ) \
-{ \
-	PASTEMAC(ch,ukrname)( k, \
-	                      alpha, \
-	                      a1x, \
-	                      a11, \
-	                      bx1, \
-	                      b11, \
-	                      c11, rs_c, cs_c, \
-	                      data ); \
-}
-
-INSERT_GENTFUNC_BASIC( gemmtrsm_l_ukr, GEMMTRSM_L_UKERNEL )
-INSERT_GENTFUNC_BASIC( gemmtrsm_u_ukr, GEMMTRSM_U_UKERNEL )
-
diff --git a/testsuite/src/test_gemmtrsm_ukr.h b/testsuite/src/test_gemmtrsm_ukr.h
index e2c2c311e..fd0d4cb9e 100644
--- a/testsuite/src/test_gemmtrsm_ukr.h
+++ b/testsuite/src/test_gemmtrsm_ukr.h
@@ -34,30 +34,3 @@
 
 void libblis_test_gemmtrsm_ukr( test_params_t* params, test_op_t* op );
 
-//
-// Prototype wrapper interfaces to micro-kernel.
-//
-void bli_gemmtrsm_ukr( obj_t*  alpha,
-                       obj_t*  a1x,
-                       obj_t*  a11,
-                       obj_t*  bx1,
-                       obj_t*  b11,
-                       obj_t*  c11 );
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname)( \
-                           dim_t      k, \
-                           void*      alpha, \
-                           void*      a1x, \
-                           void*      a11, \
-                           void*      bx1, \
-                           void*      b11, \
-                           void*      c11, inc_t rs_c, inc_t cs_c, \
-                           auxinfo_t* data  \
-                         );
-
-INSERT_GENTPROT_BASIC( gemmtrsm_l_ukr )
-INSERT_GENTPROT_BASIC( gemmtrsm_u_ukr )
-
diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c
index 234bec68d..675520321 100644
--- a/testsuite/src/test_trsm_ukr.c
+++ b/testsuite/src/test_trsm_ukr.c
@@ -267,7 +267,7 @@ void libblis_test_trsm_ukr_impl( iface_t   iface,
 	switch ( iface )
 	{
 		case BLIS_TEST_SEQ_UKERNEL:
-		bli_trsm_ukr( a, b, c );
+		bli_trsm_ukernel( a, b, c );
 		break;
 
 		default:
@@ -367,84 +367,3 @@ void libblis_test_trsm_ukr_check( side_t  side,
 	bli_obj_free( &z );
 }
 
-
-
-//
-// Define object-wrapper to TRSM_L_UKERNEL, TRSM_U_UKERNEL micro-kernels.
-//
-
-#undef  FUNCPTR_T
-#define FUNCPTR_T trsm_ukr_fp
-
-typedef void (*FUNCPTR_T)(
-                           void*      a,
-                           void*      b,
-                           void*      c, inc_t rs_c, inc_t cs_c,
-                           auxinfo_t* data
-                         );
-
-static FUNCPTR_T GENARRAY(ftypes_l,trsm_l_ukr);
-static FUNCPTR_T GENARRAY(ftypes_u,trsm_u_ukr);
-
-
-void bli_trsm_ukr( obj_t*  a,
-                   obj_t*  b,
-                   obj_t*  c )
-{
-	num_t     dt        = bli_obj_datatype( *c );
-
-	void*     buf_a     = bli_obj_buffer_at_off( *a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( *b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( *c );
-	inc_t     rs_c      = bli_obj_row_stride( *c );
-	inc_t     cs_c      = bli_obj_col_stride( *c );
-
-	inc_t     ps_a      = bli_obj_panel_stride( *a );
-	inc_t     ps_b      = bli_obj_panel_stride( *b );
-
-	FUNCPTR_T f;
-
-	auxinfo_t data;
-
-
-	// Fill the auxinfo_t struct in case the micro-kernel uses it.
-	bli_auxinfo_set_next_a( buf_a, data );
-	bli_auxinfo_set_next_b( buf_b, data );
-
-	bli_auxinfo_set_ps_a( ps_a, data );
-	bli_auxinfo_set_ps_b( ps_b, data );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	if ( bli_obj_is_lower( *a ) ) f = ftypes_l[dt];
-	else                          f = ftypes_u[dt];
-
-	// Invoke the function.
-	f( buf_a,
-	   buf_b,
-	   buf_c, rs_c, cs_c,
-	   &data );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname, ukrname ) \
-\
-void PASTEMAC(ch,varname)( \
-                           void*      a, \
-                           void*      b, \
-                           void*      c, inc_t rs_c, inc_t cs_c, \
-                           auxinfo_t* data  \
-                         ) \
-{ \
-	PASTEMAC(ch,ukrname)( a, \
-	                      b, \
-	                      c, rs_c, cs_c, \
-	                      data ); \
-}
-
-INSERT_GENTFUNC_BASIC( trsm_l_ukr, TRSM_L_UKERNEL )
-INSERT_GENTFUNC_BASIC( trsm_u_ukr, TRSM_U_UKERNEL )
-
diff --git a/testsuite/src/test_trsm_ukr.h b/testsuite/src/test_trsm_ukr.h
index 83129d7a3..1057f2853 100644
--- a/testsuite/src/test_trsm_ukr.h
+++ b/testsuite/src/test_trsm_ukr.h
@@ -34,23 +34,3 @@
 
 void libblis_test_trsm_ukr( test_params_t* params, test_op_t* op );
 
-//
-// Prototype wrapper interfaces to micro-kernel.
-//
-void bli_trsm_ukr( obj_t*  a,
-                   obj_t*  b,
-                   obj_t*  c );
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname)( \
-                           void*      a, \
-                           void*      b, \
-                           void*      c, inc_t rs_c, inc_t cs_c, \
-                           auxinfo_t* data  \
-                         );
-
-INSERT_GENTPROT_BASIC( trsm_l_ukr )
-INSERT_GENTPROT_BASIC( trsm_u_ukr )
-