Merge pull request #10 from Maratyszcza/stable

Portable Native Client port
2026-05-11 09:39:59 +00:00 · 2014-06-19 21:46:50 -05:00
parent ad48dca229 020a831bc5
commit 7101a8eec0
9 changed files with 1615 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,8 @@
 *.a
 # test executables
 *.x
+*.pexe
+*.nexe

 # -- build system files --

--- a/2
+++ b/2
@@ -469,7 +469,7 @@ endif
 blis-lib: check-env $(MK_LIBS)

 $(MK_ALL_BLIS_LIB): $(MK_ALL_BLIS_OBJS)
-ifeq ($(FLA_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
+ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
 	$(AR) $(ARFLAGS) $@ $?
 	$(RANLIB) $@
 else
--- a/config/pnacl/bli_config.h
+++ b/config/pnacl/bli_config.h
@@ -0,0 +1,165 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_CONFIG_H
+#define BLIS_CONFIG_H
+
+
+// -- OPERATING SYSTEM ---------------------------------------------------------
+
+
+
+// -- INTEGER PROPERTIES -------------------------------------------------------
+
+// The bit size of the integer type used to track values such as dimensions,
+// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
+// integers while 64 results in 64-bit integers. Any other value results in use
+// of the C99 type "long int". Note that this ONLY affects integers used
+// internally within BLIS as well as those exposed in the native BLAS-like BLIS
+// interface.
+#define BLIS_INT_TYPE_SIZE               32
+
+
+
+// -- FLOATING-POINT PROPERTIES ------------------------------------------------
+
+// Define the number of floating-point types supported, and the size of the
+// largest type.
+#define BLIS_NUM_FP_TYPES                4
+#define BLIS_MAX_TYPE_SIZE               sizeof(dcomplex)
+
+// Enable use of built-in C99 "float complex" and "double complex" types and
+// associated overloaded operations and functions? Disabling results in
+// scomplex and dcomplex being defined in terms of simple structs.
+//#define BLIS_ENABLE_C99_COMPLEX
+
+
+
+// -- MULTITHREADING -----------------------------------------------------------
+
+// The maximum number of BLIS threads that will run concurrently.
+#define BLIS_MAX_NUM_THREADS             1
+
+
+
+// -- MEMORY ALLOCATION --------------------------------------------------------
+
+// -- Contiguous (static) memory allocator --
+
+// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
+// contiguous memory pools.
+#define BLIS_NUM_MC_X_KC_BLOCKS          BLIS_MAX_NUM_THREADS
+#define BLIS_NUM_KC_X_NC_BLOCKS          BLIS_MAX_NUM_THREADS
+#define BLIS_NUM_MC_X_NC_BLOCKS          0
+
+// The maximum preload byte offset is used to pad the end of the contiguous
+// memory pools so that the micro-kernel, when computing with the end of the
+// last block, can exceed the bounds of the usable portion of the memory
+// region without causing a segmentation fault.
+#define BLIS_MAX_PRELOAD_BYTE_OFFSET     128
+
+// -- Memory alignment --
+
+// It is sometimes useful to define the various memory alignments in terms
+// of some other characteristics of the system, such as the cache line size
+// and the page size.
+#define BLIS_CACHE_LINE_SIZE             64
+#define BLIS_PAGE_SIZE                   4096
+
+// Alignment size needed by the instruction set for aligned SIMD/vector
+// instructions.
+#define BLIS_SIMD_ALIGN_SIZE             16
+
+// Alignment size used to align local stack buffers within macro-kernel
+// functions.
+#define BLIS_STACK_BUF_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
+
+// Alignment size used when allocating memory dynamically from the operating
+// system (eg: posix_memalign()). To disable heap alignment and just use
+// malloc() instead, set this to 1.
+#define BLIS_HEAP_ADDR_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
+
+// Alignment size used when sizing leading dimensions of dynamically
+// allocated memory.
+#define BLIS_HEAP_STRIDE_ALIGN_SIZE      BLIS_CACHE_LINE_SIZE
+
+// Alignment size used when allocating entire blocks of contiguous memory
+// from the contiguous memory allocator.
+#define BLIS_CONTIG_ADDR_ALIGN_SIZE      BLIS_PAGE_SIZE
+
+
+
+// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
+
+// Basic (homogeneous) datatype support always enabled.
+
+// Enable mixed domain operations?
+//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+
+// Enable extra mixed precision operations?
+//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+
+
+
+// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
+
+// Stay initialized after auto-initialization, unless and until the user
+// explicitly calls bli_finalize().
+#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
+
+
+
+// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
+
+// Enable the BLAS compatibility layer?
+#define BLIS_ENABLE_BLAS2BLIS
+
+// The bit size of the integer type used to track values such as dimensions and
+// leading dimensions (ie: column strides) within the BLAS compatibility layer.
+// A value of 32 results in the compatibility layer using 32-bit signed integers
+// while 64 results in 64-bit integers. Any other value results in use of the
+// C99 type "long int". Note that this ONLY affects integers used within the
+// BLAS compatibility layer.
+#define BLIS_BLAS2BLIS_INT_TYPE_SIZE     32
+
+// Fortran-77 name-mangling macros.
+#define PASTEF770(name)                        name ## _
+#define PASTEF77(ch1,name)       ch1        ## name ## _
+#define PASTEF772(ch1,ch2,name)  ch1 ## ch2 ## name ## _
+
+
+
+
+#endif
+
--- a/config/pnacl/bli_kernel.h
+++ b/config/pnacl/bli_kernel.h
@@ -0,0 +1,247 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_KERNEL_H
+#define BLIS_KERNEL_H
+
+/*
+ * SIMD-enabled (SP only) PNaCl shipped in Chrome 36 and it is not backward-compatible.
+ * Therefore, if compilation targets an older Chrome release, we use scalar kernels.
+ * The target Chrome version is indicated by PPAPI_MACRO defined in the header below.
+ */
+#include <ppapi/c/pp_macros.h>
+
+// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
+
+// -- Cache blocksizes --
+
+//
+// Constraints:
+//
+// (1) MC must be a multiple of:
+//     (a) MR (for zero-padding purposes)
+//     (b) NR (for zero-padding purposes when MR and NR are "swapped")
+// (2) NC must be a multiple of
+//     (a) NR (for zero-padding purposes)
+//     (b) MR (for zero-padding purposes when MR and NR are "swapped")
+// (3) KC must be a multiple of
+//     (a) MR and
+//     (b) NR (for triangular operations such as trmm and trsm).
+//
+
+#if PPAPI_RELEASE >= 36
+#define BLIS_DEFAULT_MC_S              256
+#define BLIS_DEFAULT_KC_S              256
+#define BLIS_DEFAULT_NC_S              8192
+#else
+#define BLIS_DEFAULT_MC_S              252
+#define BLIS_DEFAULT_KC_S              264
+#define BLIS_DEFAULT_NC_S              8196
+#endif
+
+#define BLIS_DEFAULT_MC_D              1080
+#define BLIS_DEFAULT_KC_D              120
+#define BLIS_DEFAULT_NC_D              8400
+
+#if PPAPI_RELEASE >= 36
+#define BLIS_DEFAULT_MC_C              128
+#define BLIS_DEFAULT_KC_C              256
+#define BLIS_DEFAULT_NC_C              4096
+#else
+#define BLIS_DEFAULT_MC_C              120
+#define BLIS_DEFAULT_KC_C              264
+#define BLIS_DEFAULT_NC_C              4092
+#endif
+
+#define BLIS_DEFAULT_MC_Z              60
+#define BLIS_DEFAULT_KC_Z              264
+#define BLIS_DEFAULT_NC_Z              2040
+
+// -- Register blocksizes --
+
+#if PPAPI_RELEASE >= 36
+#define BLIS_DEFAULT_MR_S              8
+#define BLIS_DEFAULT_NR_S              4
+#else
+#define BLIS_DEFAULT_MR_S              4
+#define BLIS_DEFAULT_NR_S              3 
+#endif
+
+#define BLIS_DEFAULT_MR_D              4
+#define BLIS_DEFAULT_NR_D              3
+
+#if PPAPI_RELEASE >= 36
+#define BLIS_DEFAULT_MR_C              4
+#define BLIS_DEFAULT_NR_C              4
+#else
+#define BLIS_DEFAULT_MR_C              2
+#define BLIS_DEFAULT_NR_C              3
+#endif
+
+#define BLIS_DEFAULT_MR_Z              2
+#define BLIS_DEFAULT_NR_Z              3
+
+// NOTE: If the micro-kernel, which is typically unrolled to a factor
+// of f, handles leftover edge cases (ie: when k % f > 0) then these
+// register blocksizes in the k dimension can be defined to 1.
+
+//#define BLIS_DEFAULT_KR_S              1
+//#define BLIS_DEFAULT_KR_D              1
+//#define BLIS_DEFAULT_KR_C              1
+//#define BLIS_DEFAULT_KR_Z              1
+
+// -- Cache blocksize extensions (for optimizing edge cases) --
+
+// NOTE: These cache blocksize "extensions" have the same constraints as
+// the corresponding default blocksizes above. When these values are
+// non-zero, blocksizes used at edge cases are extended (enlarged) if
+// such an extension would encompass the remaining portion of the
+// matrix dimension.
+
+//#define BLIS_EXTEND_MC_S               0 //(BLIS_DEFAULT_MC_S/4)
+//#define BLIS_EXTEND_KC_S               0 //(BLIS_DEFAULT_KC_S/4)
+//#define BLIS_EXTEND_NC_S               0 //(BLIS_DEFAULT_NC_S/4)
+
+//#define BLIS_EXTEND_MC_D               0 //(BLIS_DEFAULT_MC_D/4)
+//#define BLIS_EXTEND_KC_D               0 //(BLIS_DEFAULT_KC_D/4)
+//#define BLIS_EXTEND_NC_D               0 //(BLIS_DEFAULT_NC_D/4)
+
+//#define BLIS_EXTEND_MC_C               0 //(BLIS_DEFAULT_MC_C/4)
+//#define BLIS_EXTEND_KC_C               0 //(BLIS_DEFAULT_KC_C/4)
+//#define BLIS_EXTEND_NC_C               0 //(BLIS_DEFAULT_NC_C/4)
+
+//#define BLIS_EXTEND_MC_Z               0 //(BLIS_DEFAULT_MC_Z/4)
+//#define BLIS_EXTEND_KC_Z               0 //(BLIS_DEFAULT_KC_Z/4)
+//#define BLIS_EXTEND_NC_Z               0 //(BLIS_DEFAULT_NC_Z/4)
+
+// -- Register blocksize extensions (for packed micro-panels) --
+
+// NOTE: These register blocksize "extensions" determine whether the
+// leading dimensions used within the packed micro-panels are equal to
+// or greater than their corresponding register blocksizes above.
+
+//#define BLIS_EXTEND_MR_S               0
+//#define BLIS_EXTEND_NR_S               0
+
+//#define BLIS_EXTEND_MR_D               0
+//#define BLIS_EXTEND_NR_D               0
+
+//#define BLIS_EXTEND_MR_C               0
+//#define BLIS_EXTEND_NR_C               0
+
+//#define BLIS_EXTEND_MR_Z               0
+//#define BLIS_EXTEND_NR_Z               0
+
+
+
+// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
+
+
+
+
+// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
+
+
+
+
+// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
+
+// -- gemm --
+
+#if PPAPI_RELEASE >= 36
+#define BLIS_SGEMM_UKERNEL         bli_sgemm_opt
+#define BLIS_CGEMM_UKERNEL         bli_cgemm_opt
+#endif
+
+// -- trsm-related --
+
+
+
+
+// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
+
+// -- packm --
+
+// -- unpackm --
+
+
+
+
+// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
+
+// -- axpy2v --
+
+// -- dotaxpyv --
+
+// -- axpyf --
+
+// -- dotxf --
+
+// -- dotxaxpyf --
+
+
+
+
+// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
+
+// -- addv --
+
+// -- axpyv --
+
+// -- copyv --
+
+// -- dotv --
+#define BLIS_SDOTV_KERNEL          bli_sdotv_opt
+#define BLIS_DDOTV_KERNEL          bli_ddotv_opt
+#define BLIS_CDOTV_KERNEL          bli_cdotv_opt
+#define BLIS_ZDOTV_KERNEL          bli_zdotv_opt
+
+// -- dotxv --
+
+// -- invertv --
+
+// -- scal2v --
+
+// -- scalv --
+
+// -- setv --
+
+// -- subv --
+
+// -- swapv --
+
+
+
+#endif
+
--- a/config/pnacl/kernels
+++ b/config/pnacl/kernels
@@ -0,0 +1 @@
+../../kernels/nacl/pnacl
--- a/config/pnacl/make_defs.mk
+++ b/config/pnacl/make_defs.mk
@@ -0,0 +1,117 @@
+#!/bin/bash
+#
+#  BLIS    
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name of The University of Texas nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+# Only include this block of code once.
+ifndef MAKE_DEFS_MK_INCLUDED
+MAKE_DEFS_MK_INCLUDED := yes
+
+
+
+#
+# --- Build definitions --------------------------------------------------------
+#
+
+# Variables corresponding to other configure-time options.
+BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
+BLIS_ENABLE_STATIC_BUILD        := yes
+BLIS_ENABLE_DYNAMIC_BUILD       := no
+
+
+
+#
+# --- Utility program definitions ----------------------------------------------
+#
+
+SH         := /bin/sh
+MV         := mv
+MKDIR      := mkdir -p
+RM_F       := rm -f
+RM_RF      := rm -rf
+SYMLINK    := ln -sf
+FIND       := find
+GREP       := grep
+XARGS      := xargs
+RANLIB     := pnacl-ranlib
+INSTALL    := install -c
+
+# Used to refresh CHANGELOG.
+GIT        := git
+GIT_LOG    := $(GIT) log --decorate
+
+
+
+#
+# --- Development tools definitions --------------------------------------------
+#
+
+# --- Determine the C compiler and related flags ---
+CC             := pnacl-clang
+# Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
+# NOTE: This is needed to enable posix_memalign().
+CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
+CMISCFLAGS     := -std=gnu11 -I$(NACL_SDK_ROOT)/include
+CDBGFLAGS      := -g
+CWARNFLAGS     := -Wall
+COPTFLAGS      := -O3
+CKOPTFLAGS     := $(COPTFLAGS) -ffast-math
+CVECFLAGS      :=
+
+# Aggregate all of the flags into multiple groups: one for standard
+# compilation, and one for each of the supported "special" compilation
+# modes.
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+
+# --- Determine the archiver and related flags ---
+AR             := pnacl-ar
+ARFLAGS        := rcs
+
+# --- Determine the linker and related flags ---
+LINKER         := $(CC)
+LDFLAGS        := -lm
+
+# --- Determine the finalizer and related flags ---
+FINALIZER      := pnacl-finalize
+FINFLAGS        :=
+
+# --- Determine the translator and related flags ---
+TRANSLATOR     := pnacl-translate
+TRNSFLAGS      := -O3
+TRNSAMD64FLAGS := -arch x86-64
+TRNSX86FLAGS   := -arch i686
+TRNSARMFLAGS   := -arch armv7
+
+# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
+endif
--- a/kernels/nacl/pnacl/1/bli_dotv_opt.c
+++ b/kernels/nacl/pnacl/1/bli_dotv_opt.c
@@ -0,0 +1,618 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#if PPAPI_RELEASE >= 36
+typedef float v4sf __attribute__ ((vector_size(16)));
+
+inline v4sf v4sf_splat(float x) {
+        return (v4sf) { x, x, x, x };
+}
+
+inline v4sf v4sf_load(const float* a) {
+        return *((const v4sf*)a);
+}
+
+inline v4sf v4sf_cload(const scomplex* a) {
+        return *((const v4sf*)a);
+}
+
+inline void v4sf_store(float* a, v4sf x) {
+        *((v4sf*)a) = x;
+}
+
+inline void v4sf_cstore(scomplex* a, v4sf x) {
+        *((v4sf*)a) = x;
+}
+
+inline v4sf v4sf_zero() {
+        return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
+}
+#endif
+
+void bli_sdotv_opt(
+	conj_t conjx,
+	conj_t conjy,
+	dim_t  n,
+	float  x[restrict static n],
+	inc_t  incx,
+	float  y[restrict static n],
+	inc_t  incy,
+	float  rho[restrict static 1])
+{
+#if PPAPI_RELEASE >= 36
+	// If the vector lengths are zero, set rho to zero and return.
+	if (bli_zero_dim1(n)) {
+		*rho = 0.0f;
+		return;
+	}
+	
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
+	if (bli_has_nonunit_inc2(incx, incy)) {
+		float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f, sum5 = 0.0f;
+		while (n >= 6) {
+			sum0 += (*x) * (*y);
+			x += incx;
+			y += incy;
+
+			sum1 += (*x) * (*y);
+			x += incx;
+			y += incy;
+
+			sum2 += (*x) * (*y);
+			x += incx;
+			y += incy;
+
+			sum3 += (*x) * (*y);
+			x += incx;
+			y += incy;
+
+			sum4 += (*x) * (*y);
+			x += incx;
+			y += incy;
+
+			sum5 += (*x) * (*y);
+			x += incx;
+			y += incy;
+
+			n -= 6;
+		}
+		float sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5);
+		while (n--) {
+			sum += (*x) * (*y);
+			x += incx;
+			y += incy;
+		}
+		*rho = sum;
+	} else {
+		v4sf vsum0 = v4sf_zero(), vsum1 = v4sf_zero(), vsum2 = v4sf_zero();
+		v4sf vsum3 = v4sf_zero(), vsum4 = v4sf_zero(), vsum5 = v4sf_zero();
+		while (n >= 24) {
+			vsum0 += v4sf_load(x) * v4sf_load(y);
+			vsum1 += v4sf_load(x+4) * v4sf_load(y+4);
+			vsum2 += v4sf_load(x+8) * v4sf_load(y+8);
+			vsum3 += v4sf_load(x+12) * v4sf_load(y+12);
+			vsum4 += v4sf_load(x+16) * v4sf_load(y+16);
+			vsum5 += v4sf_load(x+20) * v4sf_load(y+20);
+
+			x += 24;
+			y += 24;
+			n -= 24;
+		}
+		v4sf vsum = (vsum0 + vsum1 + vsum2) + (vsum3 + vsum4 + vsum5);
+		while (n >= 4) {
+			vsum += v4sf_load(x) * v4sf_load(y);
+
+			x += 4;
+			y += 4;
+			n -= 4;
+		}
+		float sum = (vsum[0] + vsum[1]) + (vsum[2] + vsum[3]);
+		while (n--) {
+			sum += (*x++) * (*y++);
+		}
+		*rho = sum;
+	}
+#else
+	float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f, sum5 = 0.0f;
+	while (n >= 6) {
+		sum0 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum1 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum2 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum3 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum4 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum5 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		n -= 6;
+	}
+	float sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5);
+	while (n--) {
+		sum += (*x) * (*y);
+		x += incx;
+		y += incy;
+	}
+	*rho = sum;
+#endif
+}
+
+void bli_ddotv_opt(
+	conj_t conjx,
+	conj_t conjy,
+	dim_t  n,
+	double x[restrict static n],
+	inc_t  incx,
+	double y[restrict static n],
+	inc_t  incy,
+	double rho[restrict static 1])
+{
+	double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0;
+	while (n >= 6) {
+		sum0 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum1 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum2 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum3 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum4 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		sum5 += (*x) * (*y);
+		x += incx;
+		y += incy;
+
+		n -= 6;
+	}
+	double sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5);
+	while (n--) {
+		sum += (*x) * (*y);
+		x += incx;
+		y += incy;
+	}
+	*rho = sum;
+}
+
+void bli_cdotv_opt(
+	conj_t   conjx,
+	conj_t   conjy,
+	dim_t    n,
+	scomplex x[restrict static n],
+	inc_t    incx,
+	scomplex y[restrict static n],
+	inc_t    incy,
+	scomplex rho[restrict static 1])
+{
+	if (bli_is_conj(conjy)) {
+		bli_toggle_conj(conjx);
+	}
+
+	if (bli_zero_dim1(n)) {
+		rho->real = 0.0f;
+		rho->imag = 0.0f;
+		return;
+	}
+
+	float sumr;
+	float sumi;
+#if PPAPI_RELEASE >= 36
+	if (bli_is_noconj(conjx)) {
+		if (bli_has_nonunit_inc2(incx, incy)) {
+			float sum0r = 0.0f, sum1r = 0.0f;
+			float sum0i = 0.0f, sum1i = 0.0f;
+			while (n >= 2) {
+				const float x0r = x->real;
+				const float x0i = x->imag;
+				const float y0r = y->real;
+				const float y0i = y->imag;
+
+				sum0r += x0r * y0r - x0i * y0i;
+				sum0i += x0r * y0i + x0i * y0r;
+
+				x += incx;
+				y += incy;
+
+				const float x1r = x->real;
+				const float x1i = x->imag;
+				const float y1r = y->real;
+				const float y1i = y->imag;
+
+				sum1r += x1r * y1r - x1i * y1i;
+				sum1i += x1r * y1i + x1i * y1r;
+
+				x += incx;
+				y += incy;
+
+				n -= 2;
+			}
+			sumr = sum0r + sum1r;
+			sumi = sum0i + sum1i;
+		} else {
+			v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero();
+			v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero();
+			while (n >= 8) {
+				const v4sf xv0t = v4sf_cload(x);
+				const v4sf xv0b = v4sf_cload(x+2);
+				const v4sf yv0t = v4sf_cload(y);
+				const v4sf yv0b = v4sf_cload(y+2);
+
+				const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6);
+				const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7);
+				const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6);
+				const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7);
+
+				sumv0r += xv0r * yv0r - xv0i * yv0i;
+				sumv0i += xv0r * yv0i + xv0i * yv0r;
+
+				const v4sf xv1t = v4sf_cload(x+4);
+				const v4sf xv1b = v4sf_cload(x+6);
+				const v4sf yv1t = v4sf_cload(y+4);
+				const v4sf yv1b = v4sf_cload(y+6);
+
+				const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6);
+				const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7);
+				const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6);
+				const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7);
+
+				sumv1r += xv1r * yv1r - xv1i * yv1i;
+				sumv1i += xv1r * yv1i + xv1i * yv1r;
+
+				x += 8;
+				y += 8;
+
+				n -= 8;
+			}
+			const v4sf sumvr = sumv0r + sumv1r;
+			const v4sf sumvi = sumv0i + sumv1i;
+			sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]);
+			sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]);
+		}
+		while (n--) {
+			const float xr = x->real;
+			const float xi = x->imag;
+			const float yr = y->real;
+			const float yi = y->imag;
+
+			sumr += xr * yr - xi * yi;
+			sumi += xr * yi + xi * yr;
+
+			x += incx;
+			y += incy;
+		}
+	} else {
+		if (bli_has_nonunit_inc2(incx, incy)) {
+			float sum0r = 0.0f, sum1r = 0.0f;
+			float sum0i = 0.0f, sum1i = 0.0f;
+			while (n >= 2) {
+				const float x0r = x->real;
+				const float x0i = x->imag;
+				const float y0r = y->real;
+				const float y0i = y->imag;
+
+				sum0r += x0r * y0r + x0i * y0i;
+				sum0i += x0r * y0i - x0i * y0r;
+
+				x += incx;
+				y += incy;
+
+				const float x1r = x->real;
+				const float x1i = x->imag;
+				const float y1r = y->real;
+				const float y1i = y->imag;
+
+				sum1r += x1r * y1r + x1i * y1i;
+				sum1i += x1r * y1i - x1i * y1r;
+
+				x += incx;
+				y += incy;
+
+				n -= 2;
+			}
+			sumr = sum0r + sum1r;
+			sumi = sum0i + sum1i;
+		} else {
+			v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero();
+			v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero();
+			while (n >= 8) {
+				const v4sf xv0t = v4sf_cload(x);
+				const v4sf xv0b = v4sf_cload(x+2);
+				const v4sf yv0t = v4sf_cload(y);
+				const v4sf yv0b = v4sf_cload(y+2);
+
+				const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6);
+				const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7);
+				const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6);
+				const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7);
+
+				sumv0r += xv0r * yv0r + xv0i * yv0i;
+				sumv0i += xv0r * yv0i - xv0i * yv0r;
+
+				const v4sf xv1t = v4sf_cload(x+4);
+				const v4sf xv1b = v4sf_cload(x+6);
+				const v4sf yv1t = v4sf_cload(y+4);
+				const v4sf yv1b = v4sf_cload(y+6);
+
+				const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6);
+				const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7);
+				const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6);
+				const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7);
+
+				sumv1r += xv1r * yv1r + xv1i * yv1i;
+				sumv1i += xv1r * yv1i - xv1i * yv1r;
+
+				x += 8;
+				y += 8;
+
+				n -= 8;
+			}
+			const v4sf sumvr = sumv0r + sumv1r;
+			const v4sf sumvi = sumv0i + sumv1i;
+			sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]);
+			sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]);
+		}
+		while (n--) {
+			const float xr = x->real;
+			const float xi = x->imag;
+			const float yr = y->real;
+			const float yi = y->imag;
+
+			sumr += xr * yr + xi * yi;
+			sumi += xr * yi - xi * yr;
+
+			x += incx;
+			y += incy;
+		}
+	}
+#else
+	if (bli_is_noconj(conjx)) {
+		float sum0r = 0.0f, sum1r = 0.0f;
+		float sum0i = 0.0f, sum1i = 0.0f;
+		while (n >= 2) {
+			const float x0r = x->real;
+			const float x0i = x->imag;
+			const float y0r = y->real;
+			const float y0i = y->imag;
+
+			sum0r += x0r * y0r - x0i * y0i;
+			sum0i += x0r * y0i + x0i * y0r;
+
+			x += incx;
+			y += incy;
+
+			const float x1r = x->real;
+			const float x1i = x->imag;
+			const float y1r = y->real;
+			const float y1i = y->imag;
+
+			sum1r += x1r * y1r - x1i * y1i;
+			sum1i += x1r * y1i + x1i * y1r;
+
+			x += incx;
+			y += incy;
+
+			n -= 2;
+		}
+		sumr = sum0r + sum1r;
+		sumi = sum0i + sum1i;
+		if (n != 0) {
+			const float xr = x->real;
+			const float xi = x->imag;
+			const float yr = y->real;
+			const float yi = y->imag;
+
+			sumr += xr * yr - xi * yi;
+			sumi += xr * yi + xi * yr;
+		}
+	} else {
+		float sum0r = 0.0f, sum1r = 0.0f;
+		float sum0i = 0.0f, sum1i = 0.0f;
+		while (n >= 2) {
+			const float x0r = x->real;
+			const float x0i = x->imag;
+			const float y0r = y->real;
+			const float y0i = y->imag;
+
+			sum0r += x0r * y0r + x0i * y0i;
+			sum0i += x0r * y0i - x0i * y0r;
+
+			x += incx;
+			y += incy;
+
+			const float x1r = x->real;
+			const float x1i = x->imag;
+			const float y1r = y->real;
+			const float y1i = y->imag;
+
+			sum1r += x1r * y1r + x1i * y1i;
+			sum1i += x1r * y1i - x1i * y1r;
+
+			x += incx;
+			y += incy;
+
+			n -= 2;
+		}
+		sumr = sum0r + sum1r;
+		sumi = sum0i + sum1i;
+		if (n != 0) {
+			const float xr = x->real;
+			const float xi = x->imag;
+			const float yr = y->real;
+			const float yi = y->imag;
+
+			sumr += xr * yr + xi * yi;
+			sumi += xr * yi - xi * yr;
+		}
+	}
+#endif
+
+	rho->real = sumr;
+	rho->imag = bli_is_conj(conjy) ? -sumi : sumi;
+}
+
+
+
+void bli_zdotv_opt(
+	conj_t   conjx,
+	conj_t   conjy,
+	dim_t    n,
+	dcomplex x[restrict static n],
+	inc_t    incx,
+	dcomplex y[restrict static n],
+	inc_t    incy,
+	dcomplex rho[restrict static 1])
+{
+	if (bli_is_conj(conjy)) {
+		bli_toggle_conj(conjx);
+	}
+
+	if (bli_zero_dim1(n)) {
+		rho->real = 0.0;
+		rho->imag = 0.0;
+		return;
+	}
+
+	double sumr;
+	double sumi;
+	if (bli_is_noconj(conjx)) {
+		double sum0r = 0.0, sum1r = 0.0;
+		double sum0i = 0.0, sum1i = 0.0;
+		while (n >= 2) {
+			const double x0r = x->real;
+			const double x0i = x->imag;
+			const double y0r = y->real;
+			const double y0i = y->imag;
+
+			sum0r += x0r * y0r - x0i * y0i;
+			sum0i += x0r * y0i + x0i * y0r;
+
+			x += incx;
+			y += incy;
+
+			const double x1r = x->real;
+			const double x1i = x->imag;
+			const double y1r = y->real;
+			const double y1i = y->imag;
+
+			sum1r += x1r * y1r - x1i * y1i;
+			sum1i += x1r * y1i + x1i * y1r;
+
+			x += incx;
+			y += incy;
+
+			n -= 2;
+		}
+		sumr = sum0r + sum1r;
+		sumi = sum0i + sum1i;
+		if (n != 0) {
+			const double xr = x->real;
+			const double xi = x->imag;
+			const double yr = y->real;
+			const double yi = y->imag;
+
+			sumr += xr * yr - xi * yi;
+			sumi += xr * yi + xi * yr;
+		}
+	} else {
+		double sum0r = 0.0, sum1r = 0.0;
+		double sum0i = 0.0, sum1i = 0.0;
+		while (n >= 2) {
+			const double x0r = x->real;
+			const double x0i = x->imag;
+			const double y0r = y->real;
+			const double y0i = y->imag;
+
+			sum0r += x0r * y0r + x0i * y0i;
+			sum0i += x0r * y0i - x0i * y0r;
+
+			x += incx;
+			y += incy;
+
+			const double x1r = x->real;
+			const double x1i = x->imag;
+			const double y1r = y->real;
+			const double y1i = y->imag;
+
+			sum1r += x1r * y1r + x1i * y1i;
+			sum1i += x1r * y1i - x1i * y1r;
+
+			x += incx;
+			y += incy;
+
+			n -= 2;
+		}
+		sumr = sum0r + sum1r;
+		sumi = sum0i + sum1i;
+		if (n != 0) {
+			const double xr = x->real;
+			const double xi = x->imag;
+			const double yr = y->real;
+			const double yi = y->imag;
+
+			sumr += xr * yr + xi * yi;
+			sumi += xr * yi - xi * yr;
+		}
+	}
+
+	rho->real = sumr;
+	rho->imag = bli_is_conj(conjy) ? -sumi : sumi;
+}
+
--- a/kernels/nacl/pnacl/3/bli_gemm_opt.c
+++ b/kernels/nacl/pnacl/3/bli_gemm_opt.c
@@ -0,0 +1,386 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#if PPAPI_RELEASE >= 36
+	typedef float v4sf __attribute__ ((vector_size(16)));
+
+	inline v4sf v4sf_splat(float x) {
+		return (v4sf) { x, x, x, x };
+	}
+
+	inline v4sf v4sf_load(const float* a) {
+		return *((const v4sf*)a);
+	}
+
+	inline v4sf v4sf_cload(const scomplex* a) {
+		return *((const v4sf*)a);
+	}
+
+	inline void v4sf_store(float* a, v4sf x) {
+		*((v4sf*)a) = x;
+	}
+
+	inline void v4sf_cstore(scomplex* a, v4sf x) {
+		*((v4sf*)a) = x;
+	}
+
+	inline v4sf v4sf_zero() {
+		return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f };
+	}
+
+	void bli_sgemm_opt(
+		dim_t      k,
+		float      alpha[restrict static 1],
+		float      a[restrict static 8*k],
+		float      b[restrict static k*4],
+		float      beta[restrict static 1],
+		float      c[restrict static 8*4],
+		inc_t      rs_c,
+		inc_t      cs_c,
+		auxinfo_t* data)
+	{
+		// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
+		v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero();
+		v4sf abv0b = v4sf_zero(), abv1b = v4sf_zero(), abv2b = v4sf_zero(), abv3b = v4sf_zero();
+		for (dim_t i = 0; i < k; i += 1) {
+			const v4sf avt = v4sf_load(a);
+			const v4sf avb = v4sf_load(a+4);
+
+			const v4sf bv_xxxx = v4sf_splat(b[0]);
+			abv0t += avt * bv_xxxx;
+			abv0b += avb * bv_xxxx;
+
+			const v4sf bv_yyyy = v4sf_splat(b[1]);
+			abv1t += avt * bv_yyyy;
+			abv1b += avb * bv_yyyy;
+
+			const v4sf bv_zzzz = v4sf_splat(b[2]);
+			abv2t += avt * bv_zzzz;
+			abv2b += avb * bv_zzzz;
+
+			const v4sf bv_wwww = v4sf_splat(b[3]);
+			abv3t += avt * bv_wwww;
+			abv3b += avb * bv_wwww;
+
+			a += 8;
+			b += 4;
+		}
+
+		const v4sf alphav = v4sf_splat(*alpha);
+		abv0t *= alphav;
+		abv0b *= alphav;
+		abv1t *= alphav;
+		abv1b *= alphav;
+		abv2t *= alphav;
+		abv2b *= alphav;
+		abv3t *= alphav;
+		abv3b *= alphav;
+
+		if (rs_c == 1) {
+			v4sf cv0t = v4sf_load(&c[0*rs_c + 0*cs_c]);
+			v4sf cv1t = v4sf_load(&c[0*rs_c + 1*cs_c]); 
+			v4sf cv2t = v4sf_load(&c[0*rs_c + 2*cs_c]); 
+			v4sf cv3t = v4sf_load(&c[0*rs_c + 3*cs_c]); 
+			v4sf cv0b = v4sf_load(&c[4*rs_c + 0*cs_c]);
+			v4sf cv1b = v4sf_load(&c[4*rs_c + 1*cs_c]); 
+			v4sf cv2b = v4sf_load(&c[4*rs_c + 2*cs_c]); 
+			v4sf cv3b = v4sf_load(&c[4*rs_c + 3*cs_c]); 
+
+			const v4sf betav = v4sf_splat(*beta);
+			cv0t = cv0t * betav + abv0t;
+			cv1t = cv1t * betav + abv1t;
+			cv2t = cv2t * betav + abv2t;
+			cv3t = cv3t * betav + abv3t;
+			cv0b = cv0b * betav + abv0b;
+			cv1b = cv1b * betav + abv1b;
+			cv2b = cv2b * betav + abv2b;
+			cv3b = cv3b * betav + abv3b;
+
+			v4sf_store(&c[0*rs_c + 0*cs_c], cv0t);
+			v4sf_store(&c[0*rs_c + 1*cs_c], cv1t); 
+			v4sf_store(&c[0*rs_c + 2*cs_c], cv2t); 
+			v4sf_store(&c[0*rs_c + 3*cs_c], cv3t); 
+			v4sf_store(&c[4*rs_c + 0*cs_c], cv0b);
+			v4sf_store(&c[4*rs_c + 1*cs_c], cv1b); 
+			v4sf_store(&c[4*rs_c + 2*cs_c], cv2b); 
+			v4sf_store(&c[4*rs_c + 3*cs_c], cv3b); 
+		} else {
+			// Load columns 0, 1, 2, 3 (top part)
+			v4sf cv0t = (v4sf){ c[0*rs_c + 0*cs_c], c[1*rs_c + 0*cs_c], c[2*rs_c + 0*cs_c], c[3*rs_c + 0*cs_c] };
+			v4sf cv1t = (v4sf){ c[0*rs_c + 1*cs_c], c[1*rs_c + 1*cs_c], c[2*rs_c + 1*cs_c], c[3*rs_c + 1*cs_c] };
+			v4sf cv2t = (v4sf){ c[0*rs_c + 2*cs_c], c[1*rs_c + 2*cs_c], c[2*rs_c + 2*cs_c], c[3*rs_c + 2*cs_c] };
+			v4sf cv3t = (v4sf){ c[0*rs_c + 3*cs_c], c[1*rs_c + 3*cs_c], c[2*rs_c + 3*cs_c], c[3*rs_c + 3*cs_c] };
+			// Load columns 0, 1, 2, 3 (bottom part)
+			v4sf cv0b = (v4sf){ c[4*rs_c + 0*cs_c], c[5*rs_c + 0*cs_c], c[6*rs_c + 0*cs_c], c[7*rs_c + 0*cs_c] };
+			v4sf cv1b = (v4sf){ c[4*rs_c + 1*cs_c], c[5*rs_c + 1*cs_c], c[6*rs_c + 1*cs_c], c[7*rs_c + 1*cs_c] };
+			v4sf cv2b = (v4sf){ c[4*rs_c + 2*cs_c], c[5*rs_c + 2*cs_c], c[6*rs_c + 2*cs_c], c[7*rs_c + 2*cs_c] };
+			v4sf cv3b = (v4sf){ c[4*rs_c + 3*cs_c], c[5*rs_c + 3*cs_c], c[6*rs_c + 3*cs_c], c[7*rs_c + 3*cs_c] };
+
+			const v4sf betav = v4sf_splat(*beta);
+			cv0t = cv0t * betav + abv0t;
+			cv1t = cv1t * betav + abv1t;
+			cv2t = cv2t * betav + abv2t;
+			cv3t = cv3t * betav + abv3t;
+			cv0b = cv0b * betav + abv0b;
+			cv1b = cv1b * betav + abv1b;
+			cv2b = cv2b * betav + abv2b;
+			cv3b = cv3b * betav + abv3b;
+
+			// Store column 0
+			c[0*rs_c + 0*cs_c] = cv0t[0];
+			c[1*rs_c + 0*cs_c] = cv0t[1];
+			c[2*rs_c + 0*cs_c] = cv0t[2];
+			c[3*rs_c + 0*cs_c] = cv0t[3];
+			c[4*rs_c + 0*cs_c] = cv0b[0];
+			c[5*rs_c + 0*cs_c] = cv0b[1];
+			c[6*rs_c + 0*cs_c] = cv0b[2];
+			c[7*rs_c + 0*cs_c] = cv0b[3];
+
+			// Store column 1
+			c[0*rs_c + 1*cs_c] = cv1t[0];
+			c[1*rs_c + 1*cs_c] = cv1t[1];
+			c[2*rs_c + 1*cs_c] = cv1t[2];
+			c[3*rs_c + 1*cs_c] = cv1t[3];
+			c[4*rs_c + 1*cs_c] = cv1b[0];
+			c[5*rs_c + 1*cs_c] = cv1b[1];
+			c[6*rs_c + 1*cs_c] = cv1b[2];
+			c[7*rs_c + 1*cs_c] = cv1b[3];
+
+			// Store column 2
+			c[0*rs_c + 2*cs_c] = cv2t[0];
+			c[1*rs_c + 2*cs_c] = cv2t[1];
+			c[2*rs_c + 2*cs_c] = cv2t[2];
+			c[3*rs_c + 2*cs_c] = cv2t[3];
+			c[4*rs_c + 2*cs_c] = cv2b[0];
+			c[5*rs_c + 2*cs_c] = cv2b[1];
+			c[6*rs_c + 2*cs_c] = cv2b[2];
+			c[7*rs_c + 2*cs_c] = cv2b[3];
+
+			// Store column 3
+			c[0*rs_c + 3*cs_c] = cv3t[0];
+			c[1*rs_c + 3*cs_c] = cv3t[1];
+			c[2*rs_c + 3*cs_c] = cv3t[2];
+			c[3*rs_c + 3*cs_c] = cv3t[3];
+			c[4*rs_c + 3*cs_c] = cv3b[0];
+			c[5*rs_c + 3*cs_c] = cv3b[1];
+			c[6*rs_c + 3*cs_c] = cv3b[2];
+			c[7*rs_c + 3*cs_c] = cv3b[3];
+		}
+	}
+
+	void bli_cgemm_opt(
+		dim_t      k,
+		scomplex   alpha[restrict static 1],
+		scomplex   a[restrict static 4*k],
+		scomplex   b[restrict static k*4],
+		scomplex   beta[restrict static 1],
+		scomplex   c[restrict static 4*4],
+		inc_t      rs_c,
+		inc_t      cs_c,
+		auxinfo_t* data)
+	{
+		// Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0)
+		v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero();
+		v4sf abv0i = v4sf_zero(), abv1i = v4sf_zero(), abv2i = v4sf_zero(), abv3i = v4sf_zero();
+		for (dim_t i = 0; i < k; i += 1) {
+			const v4sf avt = v4sf_cload(a);
+			const v4sf avb = v4sf_cload(a+2);
+			const v4sf avr = __builtin_shufflevector(avt, avb, 0, 2, 4, 6);
+			const v4sf avi = __builtin_shufflevector(avt, avb, 1, 3, 5, 7);
+
+			const v4sf bv0r = v4sf_splat(b[0].real);
+			const v4sf bv0i = v4sf_splat(b[0].imag);
+			abv0r += avr * bv0r - avi * bv0i;
+			abv0i += avr * bv0i + avi * bv0r;
+
+			const v4sf bv1r = v4sf_splat(b[1].real);
+			const v4sf bv1i = v4sf_splat(b[1].imag);
+			abv1r += avr * bv1r - avi * bv1i;
+			abv1i += avr * bv1i + avi * bv1r;
+
+			const v4sf bv2r = v4sf_splat(b[2].real);
+			const v4sf bv2i = v4sf_splat(b[2].imag);
+			abv2r += avr * bv2r - avi * bv2i;
+			abv2i += avr * bv2i + avi * bv2r;
+
+			const v4sf bv3r = v4sf_splat(b[3].real);
+			const v4sf bv3i = v4sf_splat(b[3].imag);
+			abv3r += avr * bv3r - avi * bv3i;
+			abv3i += avr * bv3i + avi * bv3r;
+
+			a += 4;
+			b += 4;
+		}
+
+		const v4sf alphavr = v4sf_splat(alpha->real);
+		const v4sf alphavi = v4sf_splat(alpha->imag);
+		v4sf temp;
+
+		temp  = abv0r * alphavr - abv0i * alphavi;
+		abv0i = abv0r * alphavi + abv0i * alphavr;
+		abv0r = temp;
+
+		temp  = abv1r * alphavr - abv1i * alphavi;
+		abv1i = abv1r * alphavi + abv1i * alphavr;
+		abv1r = temp;
+
+		temp  = abv2r * alphavr - abv2i * alphavi;
+		abv2i = abv2r * alphavi + abv2i * alphavr;
+		abv2r = temp;
+
+		temp  = abv3r * alphavr - abv3i * alphavi;
+		abv3i = abv3r * alphavi + abv3i * alphavr;
+		abv3r = temp;
+
+		if (rs_c == 1) {
+			const v4sf cv0t = v4sf_cload(&c[0*rs_c + 0*cs_c]);
+			const v4sf cv1t = v4sf_cload(&c[0*rs_c + 1*cs_c]);
+			const v4sf cv2t = v4sf_cload(&c[0*rs_c + 2*cs_c]);
+			const v4sf cv3t = v4sf_cload(&c[0*rs_c + 3*cs_c]);
+			const v4sf cv0b = v4sf_cload(&c[2*rs_c + 0*cs_c]);
+			const v4sf cv1b = v4sf_cload(&c[2*rs_c + 1*cs_c]);
+			const v4sf cv2b = v4sf_cload(&c[2*rs_c + 2*cs_c]);
+			const v4sf cv3b = v4sf_cload(&c[2*rs_c + 3*cs_c]);
+
+			v4sf cv0r = __builtin_shufflevector(cv0t, cv0b, 0, 2, 4, 6);
+			v4sf cv0i = __builtin_shufflevector(cv0t, cv0b, 1, 3, 5, 7);
+			v4sf cv1r = __builtin_shufflevector(cv1t, cv1b, 0, 2, 4, 6);
+			v4sf cv1i = __builtin_shufflevector(cv1t, cv1b, 1, 3, 5, 7);
+			v4sf cv2r = __builtin_shufflevector(cv2t, cv2b, 0, 2, 4, 6);
+			v4sf cv2i = __builtin_shufflevector(cv2t, cv2b, 1, 3, 5, 7);
+			v4sf cv3r = __builtin_shufflevector(cv3t, cv3b, 0, 2, 4, 6);
+			v4sf cv3i = __builtin_shufflevector(cv3t, cv3b, 1, 3, 5, 7);
+
+			const v4sf betavr = v4sf_splat(beta->real);
+			const v4sf betavi = v4sf_splat(beta->imag);
+			
+			temp = abv0r + cv0r * betavr - cv0i * betavi;
+			cv0i = abv0i + cv0r * betavi + cv0i * betavr;
+			cv0r = temp;
+
+			temp = abv1r + cv1r * betavr - cv1i * betavi;
+			cv1i = abv1i + cv1r * betavi + cv1i * betavr;
+			cv1r = temp;
+
+			temp = abv2r + cv2r * betavr - cv2i * betavi;
+			cv2i = abv2i + cv2r * betavi + cv2i * betavr;
+			cv2r = temp;
+
+			temp = abv3r + cv3r * betavr - cv3i * betavi;
+			cv3i = abv3i + cv3r * betavi + cv3i * betavr;
+			cv3r = temp;
+
+			v4sf_cstore(&c[0*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 0, 4, 1, 5));
+			v4sf_cstore(&c[2*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 2, 6, 3, 7));
+			v4sf_cstore(&c[0*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 0, 4, 1, 5));
+			v4sf_cstore(&c[2*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 2, 6, 3, 7));
+			v4sf_cstore(&c[0*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 0, 4, 1, 5));
+			v4sf_cstore(&c[2*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 2, 6, 3, 7));
+			v4sf_cstore(&c[0*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 0, 4, 1, 5));
+			v4sf_cstore(&c[2*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 2, 6, 3, 7));
+		} else {
+			// Load columns 0, 1, 2, 3 (real part)
+			v4sf cv0r = (v4sf){ c[0*rs_c + 0*cs_c].real, c[1*rs_c + 0*cs_c].real, c[2*rs_c + 0*cs_c].real, c[3*rs_c + 0*cs_c].real };
+			v4sf cv1r = (v4sf){ c[0*rs_c + 1*cs_c].real, c[1*rs_c + 1*cs_c].real, c[2*rs_c + 1*cs_c].real, c[3*rs_c + 1*cs_c].real };
+			v4sf cv2r = (v4sf){ c[0*rs_c + 2*cs_c].real, c[1*rs_c + 2*cs_c].real, c[2*rs_c + 2*cs_c].real, c[3*rs_c + 2*cs_c].real };
+			v4sf cv3r = (v4sf){ c[0*rs_c + 3*cs_c].real, c[1*rs_c + 3*cs_c].real, c[2*rs_c + 3*cs_c].real, c[3*rs_c + 3*cs_c].real };
+			// Load columns 0, 1, 2, 3 (imaginary part)
+			v4sf cv0i = (v4sf){ c[0*rs_c + 0*cs_c].imag, c[1*rs_c + 0*cs_c].imag, c[2*rs_c + 0*cs_c].imag, c[3*rs_c + 0*cs_c].imag };
+			v4sf cv1i = (v4sf){ c[0*rs_c + 1*cs_c].imag, c[1*rs_c + 1*cs_c].imag, c[2*rs_c + 1*cs_c].imag, c[3*rs_c + 1*cs_c].imag };
+			v4sf cv2i = (v4sf){ c[0*rs_c + 2*cs_c].imag, c[1*rs_c + 2*cs_c].imag, c[2*rs_c + 2*cs_c].imag, c[3*rs_c + 2*cs_c].imag };
+			v4sf cv3i = (v4sf){ c[0*rs_c + 3*cs_c].imag, c[1*rs_c + 3*cs_c].imag, c[2*rs_c + 3*cs_c].imag, c[3*rs_c + 3*cs_c].imag };
+
+			const v4sf betavr = v4sf_splat(beta->real);
+			const v4sf betavi = v4sf_splat(beta->imag);
+
+			temp = abv0r + cv0r * betavr - cv0i * betavi;
+			cv0i = abv0i + cv0r * betavi + cv0i * betavr;
+			cv0r = temp;
+
+			temp = abv1r + cv1r * betavr - cv1i * betavi;
+			cv1i = abv1i + cv1r * betavi + cv1i * betavr;
+			cv1r = temp;
+
+			temp = abv2r + cv2r * betavr - cv2i * betavi;
+			cv2i = abv2i + cv2r * betavi + cv2i * betavr;
+			cv2r = temp;
+
+			temp = abv3r + cv3r * betavr - cv3i * betavi;
+			cv3i = abv3i + cv3r * betavi + cv3i * betavr;
+			cv3r = temp;
+
+			// Store column 0
+			c[0*rs_c + 0*cs_c].real = cv0r[0];
+			c[0*rs_c + 0*cs_c].imag = cv0i[0];
+			c[1*rs_c + 0*cs_c].real = cv0r[1];
+			c[1*rs_c + 0*cs_c].imag = cv0i[1];
+			c[2*rs_c + 0*cs_c].real = cv0r[2];
+			c[2*rs_c + 0*cs_c].imag = cv0i[2];
+			c[3*rs_c + 0*cs_c].real = cv0r[3];
+			c[3*rs_c + 0*cs_c].imag = cv0i[3];
+
+			// Store column 1
+			c[0*rs_c + 1*cs_c].real = cv1r[0];
+			c[0*rs_c + 1*cs_c].imag = cv1i[0];
+			c[1*rs_c + 1*cs_c].real = cv1r[1];
+			c[1*rs_c + 1*cs_c].imag = cv1i[1];
+			c[2*rs_c + 1*cs_c].real = cv1r[2];
+			c[2*rs_c + 1*cs_c].imag = cv1i[2];
+			c[3*rs_c + 1*cs_c].real = cv1r[3];
+			c[3*rs_c + 1*cs_c].imag = cv1i[3];
+
+			// Store column 2
+			c[0*rs_c + 2*cs_c].real = cv2r[0];
+			c[0*rs_c + 2*cs_c].imag = cv2i[0];
+			c[1*rs_c + 2*cs_c].real = cv2r[1];
+			c[1*rs_c + 2*cs_c].imag = cv2i[1];
+			c[2*rs_c + 2*cs_c].real = cv2r[2];
+			c[2*rs_c + 2*cs_c].imag = cv2i[2];
+			c[3*rs_c + 2*cs_c].real = cv2r[3];
+			c[3*rs_c + 2*cs_c].imag = cv2i[3];
+
+			// Store column 3
+			c[0*rs_c + 3*cs_c].real = cv3r[0];
+			c[0*rs_c + 3*cs_c].imag = cv3i[0];
+			c[1*rs_c + 3*cs_c].real = cv3r[1];
+			c[1*rs_c + 3*cs_c].imag = cv3i[1];
+			c[2*rs_c + 3*cs_c].real = cv3r[2];
+			c[2*rs_c + 3*cs_c].imag = cv3i[2];
+			c[3*rs_c + 3*cs_c].real = cv3r[3];
+			c[3*rs_c + 3*cs_c].imag = cv3i[3];
+		}
+	}
+#endif
--- a/testsuite/Makefile
+++ b/testsuite/Makefile
@@ -45,7 +45,8 @@
 #

 .PHONY: all bin clean \
-        check-env check-env-mk check-env-fragments check-env-make-defs
+        check-env check-env-mk check-env-fragments check-env-make-defs \
+	run run-amd64 run-x86 run-arm



@@ -241,8 +242,21 @@ TEST_OBJS      := $(patsubst $(TEST_SRC_PATH)/%.c, \
                             $(TEST_OBJ_PATH)/%.o, \
                             $(wildcard $(TEST_SRC_PATH)/*.c))

+ifeq ($(CONFIG_NAME),pnacl)
+# Linked executable
+TEST_BIN       := test_libblis.unstable.pexe
+# Finalized executable
+TEST_BIN_PNACL := test_libblis.pexe
+# Translated executable for x86-64
+TEST_BIN_AMD64 := test_libblis.x86-64.nexe
+# Translated executable for x86
+TEST_BIN_X86   := test_libblis.x86.nexe
+# Translated executable for ARM
+TEST_BIN_ARM   := test_libblis.arm.nexe
+else
 # Binary executable name.
 TEST_BIN       := test_libblis.x
+endif

 # Add installed and local header paths to CFLAGS
 CFLAGS         += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)
@@ -257,7 +271,11 @@ CFLAGS         += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)

 all: check-env bin

+ifeq ($(CONFIG_NAME),pnacl)
+bin: check-env $(TEST_BIN) $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM)
+else
 bin: check-env $(TEST_BIN)
+endif


 # --- Environment check rules ---
@@ -301,9 +319,68 @@ else
 	@$(LINKER) $(TEST_OBJS) $(BLIS_LIB) $(LDFLAGS) -o $@
 endif

+ifeq ($(CONFIG_NAME),pnacl)
+
+# Finalize PNaCl executable (i.e. convert from LLVM bitcode to PNaCl bitcode)
+$(TEST_BIN_PNACL): $(TEST_BIN)
+ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
+	$(FINALIZER) $(FINFLAGS) -o $@ $(TEST_BIN)
+else
+	@echo "Finalizing $@"
+	@$(FINALIZER) $(FINFLAGS) -o $@ $(TEST_BIN)
+endif
+
+# Translate PNaCl executable to x86-64 NaCl executable
+$(TEST_BIN_AMD64): $(TEST_BIN_PNACL)
+ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
+	$(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@
+else
+	@echo "Translating $< -> $@"
+	@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSAMD64FLAGS) $< -o $@
+endif
+
+
+# Translate PNaCl executable to x86 NaCl executable
+$(TEST_BIN_X86): $(TEST_BIN_PNACL)
+ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
+	$(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@
+else
+	@echo "Translating $< -> $@"
+	@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSX86FLAGS) $< -o $@
+endif
+
+# Translate PNaCl executable to ARMv7 NaCl executable
+$(TEST_BIN_ARM): $(TEST_BIN_PNACL)
+ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
+	$(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@
+else
+	@echo "Translating $< -> $@"
+	@$(TRANSLATOR) $(TRNSFLAGS) $(TRNSARMFLAGS) $< -o $@
+endif
+
+endif
+
+# -- Test run rules --
+
+ifeq ($(CONFIG_NAME),pnacl)
+run-amd64: $(TEST_BIN_AMD64)
+	$(NACL_SDK_ROOT)/tools/sel_ldr_x86_64 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_64.nexe -- $(TEST_BIN_AMD64)
+run-x86: $(TEST_BIN_X86)
+	$(NACL_SDK_ROOT)/tools/sel_ldr_x86_32 -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_x86_32.nexe -- $(TEST_BIN_X86)
+run-arm: $(TEST_BIN_ARM)
+	$(NACL_SDK_ROOT)/tools/sel_ldr_arm -a -c -q -B $(NACL_SDK_ROOT)/tools/irt_core_arm.nexe -- $(TEST_BIN_ARM)
+else
+run: $(TEST_BIN)
+	./$(TEST_BIN)
+endif

 # -- Clean rules --

+ifeq ($(CONFIG_NAME),pnacl)
+clean:
+	- $(RM_F) $(TEST_OBJS) $(TEST_BIN) $(TEST_BIN_PNACL) $(TEST_BIN_AMD64) $(TEST_BIN_X86) $(TEST_BIN_ARM)
+else
 clean:
 	- $(RM_F) $(TEST_OBJS) $(TEST_BIN)
+endif