Reverted minor temp/wspace changes from b426f9e.

Details:
- Added missing license header to bli_pwr9_asm_macros_12x6.h.
- Reverted temporary changes to various files in 'test' and 'testsuite' directories.
- Moved testsuite/jobscripts into testsuite/old.
- Minor whitespace/comment changes across various files.
2026-04-20 07:38:53 +00:00 · 2019-11-04 13:57:12 -06:00
parent 4870260f6b
commit c84391314d
29 changed files with 566 additions and 987 deletions
--- a/config/power9/bli_cntx_init_power9.c
+++ b/config/power9/bli_cntx_init_power9.c
@@ -49,40 +49,40 @@ void bli_cntx_init_power9( cntx_t* cntx )
 	bli_cntx_init_power9_ref( cntx );
 	
 	// Update the context with optimized native gemm micro-kernels and
-    // their storage preferences.
-    bli_cntx_set_l3_nat_ukrs
-    (
-      1,
-      //BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemmbb_power9_ref,        FALSE,
-      BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_power9_asm_12x6,        FALSE,
-      cntx
-    );
+	// their storage preferences.
+	bli_cntx_set_l3_nat_ukrs
+	(
+	  1,
+	  //BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemmbb_power9_ref,        FALSE,
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_power9_asm_12x6,        FALSE,
+	  cntx
+	);

-    // Update the context with optimized packm kernels.
-    bli_cntx_set_packm_kers
-    (
-      1,
-      BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_6xk_bb2_power9_ref,
-      cntx
-    );
+	// Update the context with optimized packm kernels.
+	bli_cntx_set_packm_kers
+	(
+	  1,
+	  BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_6xk_bb2_power9_ref,
+	  cntx
+	);

-    bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1,    12,    -1,    -1 );
-    bli_blksz_init     ( &blkszs[ BLIS_NR ],    -1,     6,    -1,    -1,
-                                                -1,    12,    -1,    -1 );
-    bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,   576,    -1,    -1 );
-    bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,  1408,    -1,    -1 );
-    bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,  8190,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1,    12,    -1,    -1 );
+	bli_blksz_init     ( &blkszs[ BLIS_NR ],    -1,     6,    -1,    -1,
+	                                            -1,    12,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,   576,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,  1408,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,  8190,    -1,    -1 );

 	bli_cntx_set_blkszs
-    (
-      BLIS_NAT, 5,
-      // level-3
-      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
-      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
-      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
-      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
-      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-      cntx
-    );	
+	(
+	  BLIS_NAT, 5,
+	  // level-3
+	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
+	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
+	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
+	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
+	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+	  cntx
+	);
 }

--- a/2
+++ b/2
@@ -39,7 +39,7 @@ cortexa15:   cortexa15/armv7a
 cortexa9:    cortexa9/armv7a

 # IBM architectures.
-power9:	     power9
+power9:      power9
 bgq:         bgq

 # Generic architectures.
--- a/2
+++ b/2
@@ -1430,7 +1430,7 @@ check_compiler()
 			# Thus, this "blacklistcc_add" statement has been moved above.
 			#blacklistcc_add "zen"
 			blacklistcc_add "skx"
-                        # GCC-5 may support POWER9 but it is unverified.
+			# gcc 5.x may support POWER9 but it is unverified.
 			blacklistcc_add "power9"
 		fi
 	fi
--- a/frame/3/gemm/bli_gemm_front.c
+++ b/frame/3/gemm/bli_gemm_front.c
@@ -66,7 +66,6 @@ void bli_gemm_front
 #endif
 #endif

-
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
 		bli_gemm_check( alpha, a, b, beta, c, cntx );
@@ -83,7 +82,6 @@ void bli_gemm_front
 	bli_obj_alias_to( b, &b_local );
 	bli_obj_alias_to( c, &c_local );

-
 #ifdef BLIS_ENABLE_GEMM_MD
 	cntx_t cntx_local;

@@ -150,7 +148,6 @@ void bli_gemm_front
 	// contiguous columns, or if C is stored by columns and the micro-kernel
 	// prefers contiguous rows, transpose the entire operation to allow the
 	// micro-kernel to access elements of C in its preferred manner.
-
 	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
 	{
 		bli_obj_swap( &a_local, &b_local );
@@ -278,7 +275,6 @@ void bli_gemm_front
 	  cntl
 	);

-
 #ifdef BLIS_ENABLE_GEMM_MD
 #ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
 	// If we created a temporary matrix conformal to C for whatever reason,
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -167,7 +167,7 @@ void bli_gemm_ker_var2

 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_exec];	
+	f = ftypes[dt_exec];

 	// Invoke the function.
 	f( schema_a,
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -142,7 +142,7 @@ void bli_arch_set_id( void )

 	// IBM microarchitectures.
 #ifdef BLIS_FAMILY_POWER9
-  id = BLIS_ARCH_POWER9;
+	id = BLIS_ARCH_POWER9;
 #endif
 #ifdef BLIS_FAMILY_POWER7
 	id = BLIS_ARCH_POWER7;
--- a/frame/base/bli_cntx.h
+++ b/frame/base/bli_cntx.h
@@ -1,10 +1,13 @@
 /*
+
   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
+
   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2016, Hewlett Packard Enterprise Development LP
   Copyright (C) 2019, Advanced Micro Devices, Inc.
+
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
@@ -16,6 +19,7 @@
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
+
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -27,6 +31,7 @@
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 */

 #ifndef BLIS_CNTX_H
@@ -40,22 +45,28 @@ typedef struct cntx_s
 {
 	blksz_t*  blkszs;
 	bszid_t*  bmults;
+
 	func_t*   l3_vir_ukrs;
 	func_t*   l3_nat_ukrs;
 	mbool_t*  l3_nat_ukrs_prefs;
+
 	blksz_t*  l3_sup_thresh;
 	void**    l3_sup_handlers;
 	blksz_t*  l3_sup_blkszs;
 	func_t*   l3_sup_kers;
 	mbool_t*  l3_sup_kers_prefs;
+
 	func_t*   l1f_kers;
 	func_t*   l1v_kers;
+
 	func_t*   packm_kers;
 	func_t*   unpackm_kers;
+
 	ind_t     method;
 	pack_t    schema_a;
 	pack_t    schema_b;
 	pack_t    schema_c;
+
 } cntx_t;
 */

--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -99,7 +99,7 @@ CNTX_INIT_PROTS( cortexa15 )
 CNTX_INIT_PROTS( cortexa9 )
 #endif

-// -- IBM BG/Q --
+// -- IBM Power --

 #ifdef BLIS_CONFIG_POWER9
 CNTX_INIT_PROTS( power9 )
@@ -107,6 +107,9 @@ CNTX_INIT_PROTS( power9 )
 #ifdef BLIS_CONFIG_POWER7
 CNTX_INIT_PROTS( power7 )
 #endif
+
+// -- IBM BG/Q --
+
 #ifdef BLIS_CONFIG_BGQ
 CNTX_INIT_PROTS( bgq )
 #endif
@@ -193,7 +196,7 @@ CNTX_INIT_PROTS( generic )
 #include "bli_family_cortexa9.h"
 #endif

-// -- IBM BG/Q --
+// -- IBM Power --

 #ifdef BLIS_FAMILY_POWER9
 #include "bli_family_power9.h"
@@ -201,6 +204,9 @@ CNTX_INIT_PROTS( generic )
 #ifdef BLIS_FAMILY_POWER7
 #include "bli_family_power7.h"
 #endif
+
+// -- IBM BG/Q --
+
 #ifdef BLIS_FAMILY_BGQ
 #include "bli_family_bgq.h"
 #endif
@@ -266,7 +272,7 @@ CNTX_INIT_PROTS( generic )
 #include "bli_kernels_armv7a.h"
 #endif

-// -- IBM BG/Q --
+// -- IBM Power --

 #ifdef BLIS_KERNELS_POWER9
 #include "bli_kernels_power9.h"
@@ -274,6 +280,9 @@ CNTX_INIT_PROTS( generic )
 #ifdef BLIS_KERNELS_POWER7
 #include "bli_kernels_power7.h"
 #endif
+
+// -- IBM BG/Q --
+
 #ifdef BLIS_KERNELS_BGQ
 #include "bli_kernels_bgq.h"
 #endif
--- a/kernels/haswell/bli_kernels_haswell.h
+++ b/kernels/haswell/bli_kernels_haswell.h
@@ -56,7 +56,7 @@ GEMMTRSM_UKR_PROT( float,    s, gemmtrsm_u_haswell_asm_6x16 )
 GEMMTRSM_UKR_PROT( double,   d, gemmtrsm_u_haswell_asm_6x8 )


-// gemm (asm d8x6) 
+// gemm (asm d8x6)
 //GEMM_UKR_PROT( float,    s, gemm_haswell_asm_16x6 )
 //GEMM_UKR_PROT( double,   d, gemm_haswell_asm_8x6 )
 //GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 )
--- a/kernels/power9/3/bli_pwr9_asm_macros_12x6.h
+++ b/kernels/power9/3/bli_pwr9_asm_macros_12x6.h
@@ -1,3 +1,36 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2019, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/

 // MACROS for power9_asm_d12x6 

--- a/kernels/power9/bli_kernels_power9.h
+++ b/kernels/power9/bli_kernels_power9.h
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2019, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -44,4 +44,4 @@ GEMM_UKR_PROT( double,   d, gemm_power9_asm_18x4 )
 GEMM_UKR_PROT( double,   d, gemm_power9_asm_16x4 )

 // gemm (asm d4x16)
-GEMM_UKR_PROT( double,   d, gemm_power9_asm_4x16 )
+GEMM_UKR_PROT( double,   d, gemm_power9_asm_4x16 )
--- a/test/3/Makefile
+++ b/test/3/Makefile
@@ -130,9 +130,9 @@ VENDORP_LIB    := $(MKLP_LIB)
 #

 # Single core (single-threaded)
-PS_BEGIN := 100
-PS_MAX   := 1000
-PS_INC   := 100
+PS_BEGIN := 48
+PS_MAX   := 2400
+PS_INC   := 48

 # Single-socket (multithreaded)
 P1_BEGIN := 96
@@ -242,8 +242,8 @@ blis-2s:    blis-nat-2s
 blis-nat:   blis-nat-st blis-nat-1s blis-nat-2s

 # Define the datatypes, operations, and implementations.
-DTS    := d # s d c z
-OPS    := gemm # hemm herk trmm trsm
+DTS    := s d c z
+OPS    := gemm hemm herk trmm trsm
 BIMPLS := asm_blis openblas vendor
 EIMPLS := eigen

--- a/test/3/Makefile_cpy1
+++ b/test/3/Makefile_cpy1
@@ -1,464 +0,0 @@
-#!/bin/bash
-#
-#  BLIS    
-#  An object-based framework for developing high-performance BLAS-like
-#  libraries.
-#
-#  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2018, Advanced Micro Devices, Inc.
-#
-#  Redistribution and use in source and binary forms, with or without
-#  modification, are permitted provided that the following conditions are
-#  met:
-#   - Redistributions of source code must retain the above copyright
-#     notice, this list of conditions and the following disclaimer.
-#   - Redistributions in binary form must reproduce the above copyright
-#     notice, this list of conditions and the following disclaimer in the
-#     documentation and/or other materials provided with the distribution.
-#   - Neither the name(s) of the copyright holder(s) nor the names of its
-#     contributors may be used to endorse or promote products derived
-#     from this software without specific prior written permission.
-#
-#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-#
-# Makefile
-#
-# Field G. Van Zee
-#
-# Makefile for standalone BLIS test drivers.
-#
-
-#
-# --- Makefile PHONY target definitions ----------------------------------------
-#
-
-.PHONY: all \
-        clean cleanx
-
-
-
-#
-# --- Determine makefile fragment location -------------------------------------
-#
-
-# Comments:
-# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
-# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
-#   the second case because CONFIG_NAME is not yet set.
-ifneq ($(strip $(BLIS_INSTALL_PATH)),)
-LIB_PATH   := $(BLIS_INSTALL_PATH)/lib
-INC_PATH   := $(BLIS_INSTALL_PATH)/include/blis
-SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis
-else
-DIST_PATH  := ../..
-LIB_PATH    = ../../lib/$(CONFIG_NAME)
-INC_PATH    = ../../include/$(CONFIG_NAME)
-SHARE_PATH := ../..
-endif
-
-
-
-#
-# --- Include common makefile definitions --------------------------------------
-#
-
-# Include the common makefile fragment.
-include $(SHARE_PATH)/common.mk
-
-
-
-#
-# --- BLAS implementations -----------------------------------------------------
-#
-
-# BLAS library path(s). This is where the BLAS libraries reside.
-HOME_LIB_PATH  := $(HOME)/flame/lib
-#VENDOR_LIB_PATH   := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
-#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
-#VENDOR_LIB_PATH := ${MKLROOT}/lib/intel64
-#ICC_LIB_PATH   := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64
-
-# OpenBLAS
-OPENBLAS_LIB   := $(HOME_LIB_PATH)/libopenblas.a
-# OPENBLASP_LIB  := $(HOME_LIB_PATH)/libopenblasp.a
-
-# ATLAS
-#ATLAS_LIB      := $(HOME_LIB_PATH)/libf77blas.a \
-#                  $(HOME_LIB_PATH)/libatlas.a
-
-# Eigen
-EIGEN_INC      := $(HOME)/flame/eigen/include/eigen3
-EIGEN_LIB      := $(HOME_LIB_PATH)/libeigen_blas_static.a
-EIGENP_LIB     := $(EIGEN_LIB)
-
-# MKL
- MKL_LIB        := -L$(MKL_LIB_PATH) \
-                   -lmkl_intel_lp64 \
-                   -lmkl_core \
-                   -lmkl_sequential \
-                   -lpthread -lm -ldl
-#MKLP_LIB       := -L$(MKL_LIB_PATH) \
-#                  -lmkl_intel_thread \
-#                  -lmkl_core \
-#                  -lmkl_intel_ilp64 \
-#                  -L$(ICC_LIB_PATH) \
-#                  -liomp5
-# MKLP_LIB       := -L$(MKL_LIB_PATH) \
-#                   -lmkl_intel_lp64 \
-#                   -lmkl_core \
-#                   -lmkl_gnu_thread \
-#                   -lpthread -lm -ldl -fopenmp
-#                   #-L$(ICC_LIB_PATH) \
-#                   #-lgomp
-
-VENDOR_LIB     := $(MKL_LIB)
-VENDORP_LIB    := $(MKLP_LIB)
-
-
-#
-# --- Problem size definitions -------------------------------------------------
-#
-
-# Single core (single-threaded)
-PS_BEGIN := 100
-PS_MAX   := 1000
-PS_INC   := 100
-
-# Single-socket (multithreaded)
-P1_BEGIN := 120
-P1_MAX   := 6000
-P1_INC   := 120
-
-# Dual-socket (multithreaded)
-P2_BEGIN := 160
-P2_MAX   := 8000
-P2_INC   := 160
-
-
-#
-# --- General build definitions ------------------------------------------------
-#
-
-TEST_SRC_PATH  := .
-TEST_OBJ_PATH  := .
-
-# Gather all local object files.
-TEST_OBJS      := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \
-                                    $(TEST_OBJ_PATH)/%.o, \
-                                    $(wildcard $(TEST_SRC_PATH)/*.c)))
-
-# Override the value of CINCFLAGS so that the value of CFLAGS returned by
-# get-user-cflags-for() is not cluttered up with include paths needed only
-# while building BLIS.
-CINCFLAGS      := -I$(INC_PATH)
-
-# Use the "framework" CFLAGS for the configuration family.
-CFLAGS         := $(call get-user-cflags-for,$(CONFIG_NAME))
-
-# Add local header paths to CFLAGS.
-CFLAGS         += -I$(TEST_SRC_PATH)
-
-# Locate the libblis library to which we will link.
-#LIBBLIS_LINK   := $(LIB_PATH)/$(LIBBLIS_L)
-
-# Define a set of CFLAGS for use with C++ and Eigen.
-CXXFLAGS       := $(subst -std=c99,-std=c++11,$(CFLAGS))
-CXXFLAGS       += -I$(EIGEN_INC)
-
-# Create a copy of CXXFLAGS without -fopenmp in order to disable multithreading.
-CXXFLAGS_ST    := -march=native $(subst -fopenmp,,$(CXXFLAGS))
-CXXFLAGS_MT    := -march=native $(CXXFLAGS)
-
-
-# Which library?
-BLI_DEF  := -DBLIS
-BLA_DEF  := -DBLAS
-EIG_DEF  := -DEIGEN
-
-# Complex implementation type
-D3MHW    := -DIND=BLIS_3MH
-D3M1     := -DIND=BLIS_3M1
-D4MHW    := -DIND=BLIS_4MH
-D4M1B    := -DIND=BLIS_4M1B
-D4M1A    := -DIND=BLIS_4M1A
-D1M      := -DIND=BLIS_1M
-DNAT     := -DIND=BLIS_NAT
-
-# Implementation string
-#STR_3MHW := -DSTR=\"3mhw\"
-#STR_3M1  := -DSTR=\"3m1\"
-#STR_4MHW := -DSTR=\"4mhw\"
-#STR_4M1B := -DSTR=\"4m1b\"
-#STR_4M1A := -DSTR=\"4m1a\"
-#STR_1M   := -DSTR=\"1m\"
-STR_NAT  := -DSTR=\"asm_blis\"
-STR_OBL  := -DSTR=\"openblas\"
-STR_EIG  := -DSTR=\"eigen\"
-STR_VEN  := -DSTR=\"vendor\"
-
-# Single or multithreaded string
-STR_ST   := -DTHR_STR=\"st\"
-STR_1S   := -DTHR_STR=\"1s\"
-STR_2S   := -DTHR_STR=\"2s\"
-
-# Problem size specification
-PDEF_ST  := -DP_BEGIN=$(PS_BEGIN)  -DP_INC=$(PS_INC)  -DP_MAX=$(PS_MAX)
-PDEF_1S  := -DP_BEGIN=$(P1_BEGIN) -DP_INC=$(P1_INC) -DP_MAX=$(P1_MAX)
-PDEF_2S  := -DP_BEGIN=$(P2_BEGIN) -DP_INC=$(P2_INC) -DP_MAX=$(P2_MAX)
-
-
-
-#
-# --- Targets/rules ------------------------------------------------------------
-#
-
-all:        all-st all-1s all-2s
-blis:       blis-st blis-1s blis-2s
-openblas:   openblas-st openblas-1s openblas-2s
-eigen:      eigen-st eigen-1s eigen-2s
-vendor:     vendor-st vendor-1s vendor-2s
-mkl:        vendor
-armpl:      vendor
-
-all-st:     blis-st openblas-st mkl-st
-all-1s:     blis-1s openblas-1s mkl-1s
-all-2s:     blis-2s openblas-2s mkl-2s
-
-blis-st:    blis-nat-st
-blis-1s:    blis-nat-1s
-blis-2s:    blis-nat-2s
-
-#blis-ind:   blis-ind-st blis-ind-mt
-blis-nat:   blis-nat-st blis-nat-1s blis-nat-2s
-
-# Define the datatypes, operations, and implementations.
-DTS   := d #s d c z
-OPS   := gemm #hemm herk trmm trsm
-IMPLS := asm_blis openblas vendor
-
-# Define functions to construct object filenames from the datatypes and
-# operations given an implementation. We define one function for single-
-# threaded, single-socket, and dual-socket filenames.
-get-st-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(PS_MAX)_$(1)_st.o))
-get-1s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P1_MAX)_$(1)_1s.o))
-get-2s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P2_MAX)_$(1)_2s.o))
-
-# Construct object and binary names for single-threaded, single-socket, and
-# dual-socket files for BLIS, OpenBLAS, and a vendor library (e.g. MKL).
-BLIS_NAT_ST_OBJS := $(call get-st-objs,asm_blis)
-BLIS_NAT_ST_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_ST_OBJS))
-BLIS_NAT_1S_OBJS := $(call get-1s-objs,asm_blis)
-BLIS_NAT_1S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_1S_OBJS))
-BLIS_NAT_2S_OBJS := $(call get-2s-objs,asm_blis)
-BLIS_NAT_2S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_2S_OBJS))
-
-OPENBLAS_ST_OBJS := $(call get-st-objs,openblas)
-OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS))
-OPENBLAS_1S_OBJS := $(call get-1s-objs,openblas)
-OPENBLAS_1S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_1S_OBJS))
-OPENBLAS_2S_OBJS := $(call get-2s-objs,openblas)
-OPENBLAS_2S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_2S_OBJS))
-
-EIGEN_ST_OBJS    := $(call get-st-objs,eigen)
-EIGEN_ST_BINS    := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS))
-EIGEN_1S_OBJS    := $(call get-1s-objs,eigen)
-EIGEN_1S_BINS    := $(patsubst %.o,%.x,$(EIGEN_1S_OBJS))
-EIGEN_2S_OBJS    := $(call get-2s-objs,eigen)
-EIGEN_2S_BINS    := $(patsubst %.o,%.x,$(EIGEN_2S_OBJS))
-
-VENDOR_ST_OBJS   := $(call get-st-objs,vendor)
-VENDOR_ST_BINS   := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
-VENDOR_1S_OBJS   := $(call get-1s-objs,vendor)
-VENDOR_1S_BINS   := $(patsubst %.o,%.x,$(VENDOR_1S_OBJS))
-VENDOR_2S_OBJS   := $(call get-2s-objs,vendor)
-VENDOR_2S_BINS   := $(patsubst %.o,%.x,$(VENDOR_2S_OBJS))
-
-# Define some targets associated with the above object/binary files.
-blis-nat-st: $(BLIS_NAT_ST_BINS)
-blis-nat-1s: $(BLIS_NAT_1S_BINS)
-blis-nat-2s: $(BLIS_NAT_2S_BINS)
-
-openblas-st: $(OPENBLAS_ST_BINS)
-openblas-1s: $(OPENBLAS_1S_BINS)
-openblas-2s: $(OPENBLAS_2S_BINS)
-
-eigen-st: $(EIGEN_ST_BINS)
-eigen-1s: $(EIGEN_1S_BINS)
-eigen-2s: $(EIGEN_2S_BINS)
-
-vendor-st: $(VENDOR_ST_BINS)
-vendor-1s: $(VENDOR_1S_BINS)
-vendor-2s: $(VENDOR_2S_BINS)
-
-mkl-st: vendor-st
-mkl-1s: vendor-1s
-mkl-2s: vendor-2s
-
-armpl-st: vendor-st
-armpl-1s: vendor-1s
-armpl-2s: vendor-2s
-
-# Mark the object files as intermediate so that make will remove them
-# automatically after building the binaries on which they depend.
-.INTERMEDIATE: $(BLIS_NAT_ST_OBJS) $(BLIS_NAT_1S_OBJS) $(BLIS_NAT_2S_OBJS)
-.INTERMEDIATE: $(OPENBLAS_ST_OBJS) $(OPENBLAS_1S_OBJS) $(OPENBLAS_2S_OBJS)
-.INTERMEDIATE: $(EIGEN_ST_OBJS)    $(EIGEN_1S_OBJS)    $(EIGEN_2S_OBJS)
-.INTERMEDIATE: $(VENDOR_ST_OBJS)   $(VENDOR_1S_OBJS)   $(VENDOR_2S_OBJS)
-
-
-# --Object file rules --
-
-#$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
-#	$(CC) $(CFLAGS) -c $< -o $@
-
-# A function to return the datatype cpp macro def from the datatype
-# character.
-get-dt-cpp = $(strip \
-             $(if $(findstring s,$(1)),-DDT=BLIS_FLOAT    -DIS_FLOAT,\
-             $(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE   -DIS_DOUBLE,\
-             $(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\
-                                       -DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX))))
-
-# A function to return other cpp macros that help the test driver
-# identify the implementation.
-#get-bl-cpp = $(strip \
-#             $(if $(findstring     blis,$(1)),$(STR_NAT) $(BLI_DEF),\
-#             $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
-#             $(if $(findstring    eigen,$(1)),$(STR_EIG) $(EIG_DEF),\
-#                                              $(STR_VEN) $(BLA_DEF)))))
-
-get-bl-cpp = $(strip \
-             $(if $(findstring     blis,$(1)),$(STR_NAT) $(BLI_DEF),\
-             $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
-             $(if $(and $(findstring eigen,$(1)),\
-                        $(findstring  gemm,$(2))),\
-                                              $(STR_EIG) $(EIG_DEF),\
-             $(if       $(findstring eigen,$(1)),\
-                                              $(STR_EIG) $(BLA_DEF),\
-                                              $(STR_VEN) $(BLA_DEF))))))
-
-
-# Rules for BLIS and BLAS libraries.
-define make-st-rule
-test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(op).c Makefile
-	$(CC) $(CFLAGS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
-endef
-
-define make-1s-rule
-test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(op).c Makefile
-	$(CC) $(CFLAGS) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
-endef
-
-define make-2s-rule
-test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(op).c Makefile
-	$(CC) $(CFLAGS) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
-endef
-
-$(foreach dt,$(DTS), \
-$(foreach op,$(OPS), \
-$(foreach im,$(BIMPLS),$(eval $(call make-st-rule,$(dt),$(op),$(im))))))
-
-$(foreach dt,$(DTS), \
-$(foreach op,$(OPS), \
-$(foreach im,$(BIMPLS),$(eval $(call make-1s-rule,$(dt),$(op),$(im))))))
-
-$(foreach dt,$(DTS), \
-$(foreach op,$(OPS), \
-$(foreach im,$(BIMPLS),$(eval $(call make-2s-rule,$(dt),$(op),$(im))))))
-
-# Rules for Eigen.
-define make-eigst-rule
-test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(op).c Makefile
-	$(CXX) $(CXXFLAGS_ST) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
-endef
-
-define make-eig1s-rule
-test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(op).c Makefile
-	$(CXX) $(CXXFLAGS_MT) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
-endef
-
-define make-eig2s-rule
-test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(op).c Makefile
-	$(CXX) $(CXXFLAGS_MT) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
-endef
-
-$(foreach dt,$(DTS), \
-$(foreach op,$(OPS), \
-$(foreach im,$(EIMPLS),$(eval $(call make-eigst-rule,$(dt),$(op),$(im))))))
-
-$(foreach dt,$(DTS), \
-$(foreach op,$(OPS), \
-$(foreach im,$(EIMPLS),$(eval $(call make-eig1s-rule,$(dt),$(op),$(im))))))
-
-$(foreach dt,$(DTS), \
-$(foreach op,$(OPS), \
-$(foreach im,$(EIMPLS),$(eval $(call make-eig2s-rule,$(dt),$(op),$(im))))))
-
-
-# -- Executable file rules --
-
-# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
-# on the link command line in case BLIS was configured with the BLAS
-# compatibility layer. This prevents BLIS from inadvertently getting called
-# for the BLAS routines we are trying to test with.
-
-test_%_$(PS_MAX)_asm_blis_st.x: test_%_$(PS_MAX)_asm_blis_st.o $(LIBBLIS_LINK)
-	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-test_%_$(P1_MAX)_asm_blis_1s.x: test_%_$(P1_MAX)_asm_blis_1s.o $(LIBBLIS_LINK)
-	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-test_%_$(P2_MAX)_asm_blis_2s.x: test_%_$(P2_MAX)_asm_blis_2s.o $(LIBBLIS_LINK)
-	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-
-test_%_$(PS_MAX)_openblas_st.x: test_%_$(PS_MAX)_openblas_st.o $(LIBBLIS_LINK)
-	$(CC) $(strip $<   $(OPENBLAS_LIB)  $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-test_%_$(P1_MAX)_openblas_1s.x: test_%_$(P1_MAX)_openblas_1s.o $(LIBBLIS_LINK)
-	$(CC) $(strip $<   $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-test_%_$(P2_MAX)_openblas_2s.x: test_%_$(P2_MAX)_openblas_2s.o $(LIBBLIS_LINK)
-	$(CC) $(strip $<   $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-
-test_%_$(PS_MAX)_eigen_st.x:    test_%_$(PS_MAX)_eigen_st.o    $(LIBBLIS_LINK)
-	$(CXX) $(strip $<  $(EIGEN_LIB)     $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-test_%_$(P1_MAX)_eigen_1s.x:    test_%_$(P1_MAX)_eigen_1s.o    $(LIBBLIS_LINK)
-	$(CXX) $(strip $<  $(EIGENP_LIB)    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-test_%_$(P2_MAX)_eigen_2s.x:    test_%_$(P2_MAX)_eigen_2s.o    $(LIBBLIS_LINK)
-	$(CXX) $(strip $<  $(EIGENP_LIB)    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-
-test_%_$(PS_MAX)_vendor_st.x:   test_%_$(PS_MAX)_vendor_st.o   $(LIBBLIS_LINK)
-	$(CC) $(strip $<   $(VENDOR_LIB)    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-test_%_$(P1_MAX)_vendor_1s.x:   test_%_$(P1_MAX)_vendor_1s.o   $(LIBBLIS_LINK)
-	$(CC) $(strip $<   $(VENDORP_LIB)   $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-test_%_$(P2_MAX)_vendor_2s.x:   test_%_$(P2_MAX)_vendor_2s.o   $(LIBBLIS_LINK)
-	$(CC) $(strip $<   $(VENDORP_LIB)   $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
-
-
-# -- Clean rules --
-
-clean: cleanx
-
-cleanx:
-	- $(RM_F) *.o *.x
-
--- a/test/3/runme.sh
+++ b/test/3/runme.sh
@@ -65,15 +65,16 @@ elif [ ${sys} = "ul264" ]; then
 fi

 # Datatypes to test.
-test_dts="d " #s z c"
+test_dts="d s z c"

 # Operations to test.
-test_ops="gemm "#hemm herk trmm trsm"
+test_ops="gemm hemm herk trmm trsm"

 # Implementations to test.
-#impls="all"
-#impls="other"
 impls="blis"
+#impls="other"
+#impls="eigen"
+#impls="all"

 if [ "${impls}" = "blis" ]; then

--- a/test/3/test_gemm.c
+++ b/test/3/test_gemm.c
@@ -1,418 +1,418 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <unistd.h>
-#ifdef EIGEN
-  #define BLIS_DISABLE_BLAS_DEFS
-  #include "blis.h"
-  #include <Eigen/Core>
-  #include <Eigen/src/misc/blas.h>
-  using namespace Eigen;
-#else
-  #include "blis.h"
-#endif
-
-#define COL_STORAGE
-//#define ROW_STORAGE
-
-//#define PRINT
-
-int main( int argc, char** argv )
-{
-	obj_t    a, b, c;
-	obj_t    c_save;
-	obj_t    alpha, beta;
-	dim_t    m, n, k;
-	dim_t    p;
-	dim_t    p_begin, p_max, p_inc;
-	int      m_input, n_input, k_input;
-	ind_t    ind;
-	num_t    dt;
-	char     dt_ch;
-	int      r, n_repeats;
-	trans_t  transa;
-	trans_t  transb;
-	f77_char f77_transa;
-	f77_char f77_transb;
-
-	double   dtime;
-	double   dtime_save;
-	double   gflops;
-
-	//bli_init();
-
-	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
-
-	n_repeats = 3;
-
-	dt      = DT;
-
-	ind     = IND;
-
-#if 1
-	p_begin = P_BEGIN;
-	p_max   = P_MAX;
-	p_inc   = P_INC;
-
-	m_input = -1;
-	n_input = -1;
-	k_input = -1;
-#else
-	p_begin = 40;
-	p_max   = 1000;
-	p_inc   = 40;
-
-	m_input = -1;
-	n_input = -1;
-	k_input = -1;
-#endif
-
-
-	// Supress compiler warnings about unused variable 'ind'.
-	( void )ind;
-
-#if 0
-
-	cntx_t* cntx;
-
-	ind_t ind_mod = ind;
-
-	// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
-	if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
-
-	// Initialize a context for the current induced method and datatype.
-	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
-
-	// Set k to the kc blocksize for the current datatype.
-	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
-
-#elif 1
-
-	//k_input = 256;
-
-#endif
-
-	// Choose the char corresponding to the requested datatype.
-	if      ( bli_is_float( dt ) )    dt_ch = 's';
-	else if ( bli_is_double( dt ) )   dt_ch = 'd';
-	else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
-	else                              dt_ch = 'z';
-
-	transa = BLIS_NO_TRANSPOSE;
-	transb = BLIS_NO_TRANSPOSE;
-
-	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
-	bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
-
-	// Begin with initializing the last entry to zero so that
-	// matlab allocates space for the entire array once up-front.
-	for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
-
-	printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
-	printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin)/p_inc + 1,
-	        ( unsigned long )0,
-	        ( unsigned long )0,
-	        ( unsigned long )0, 0.0 );
-
-
-	//for ( p = p_begin; p <= p_max; p += p_inc )
-	for ( p = p_max; p_begin <= p; p -= p_inc )
-	{
-
-		if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
-		else               m =     ( dim_t )    m_input;
-		if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
-		else               n =     ( dim_t )    n_input;
-		if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
-		else               k =     ( dim_t )    k_input;
-
-		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
-		bli_obj_create( dt, 1, 1, 0, 0, &beta );
-
-	#ifdef COL_STORAGE
-		bli_obj_create( dt, m, k, 0, 0, &a );
-		bli_obj_create( dt, k, n, 0, 0, &b );
-		bli_obj_create( dt, m, n, 0, 0, &c );
-		bli_obj_create( dt, m, n, 0, 0, &c_save );
-	#else
-		bli_obj_create( dt, m, k, k, 1, &a );
-		bli_obj_create( dt, k, n, n, 1, &b );
-		bli_obj_create( dt, m, n, n, 1, &c );
-		bli_obj_create( dt, m, n, n, 1, &c_save );
-	#endif
-
-		bli_randm( &a );
-		bli_randm( &b );
-		bli_randm( &c );
-
-		bli_obj_set_conjtrans( transa, &a );
-		bli_obj_set_conjtrans( transb, &b );
-
-		bli_setsc(  (2.0/1.0), 0.0, &alpha );
-		bli_setsc(  (1.0/1.0), 0.0, &beta );
-
-		bli_copym( &c, &c_save );
-	
-#if 0 //def BLIS
-		bli_ind_disable_all_dt( dt );
-		bli_ind_enable_dt( ind, dt );
-#endif
-
-#ifdef EIGEN
-		double alpha_r, alpha_i;
-
-		bli_getsc( &alpha, &alpha_r, &alpha_i );
-
-		void* ap = bli_obj_buffer_at_off( &a );
-		void* bp = bli_obj_buffer_at_off( &b );
-		void* cp = bli_obj_buffer_at_off( &c );
-
-	#ifdef COL_STORAGE
-		const int os_a = bli_obj_col_stride( &a );
-		const int os_b = bli_obj_col_stride( &b );
-		const int os_c = bli_obj_col_stride( &c );
-	#else
-		const int os_a = bli_obj_row_stride( &a );
-		const int os_b = bli_obj_row_stride( &b );
-		const int os_c = bli_obj_row_stride( &c );
-	#endif
-
-		Stride<Dynamic,1> stride_a( os_a, 1 );
-		Stride<Dynamic,1> stride_b( os_b, 1 );
-		Stride<Dynamic,1> stride_c( os_c, 1 );
-
-	#ifdef COL_STORAGE
-		#if defined(IS_FLOAT)
-		typedef Matrix<float,                Dynamic, Dynamic, ColMajor> MatrixXf_;
-		#elif defined (IS_DOUBLE)
-		typedef Matrix<double,               Dynamic, Dynamic, ColMajor> MatrixXd_;
-		#elif defined (IS_SCOMPLEX)
-		typedef Matrix<std::complex<float>,  Dynamic, Dynamic, ColMajor> MatrixXcf_;
-		#elif defined (IS_DCOMPLEX)
-		typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_;
-		#endif
-	#else
-		#if defined(IS_FLOAT)
-		typedef Matrix<float,                Dynamic, Dynamic, RowMajor> MatrixXf_;
-		#elif defined (IS_DOUBLE)
-		typedef Matrix<double,               Dynamic, Dynamic, RowMajor> MatrixXd_;
-		#elif defined (IS_SCOMPLEX)
-		typedef Matrix<std::complex<float>,  Dynamic, Dynamic, RowMajor> MatrixXcf_;
-		#elif defined (IS_DCOMPLEX)
-		typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_;
-		#endif
-	#endif
-	#if defined(IS_FLOAT)
-		Map<MatrixXf_,  0, Stride<Dynamic,1> > A( ( float*  )ap, m, k, stride_a );
-		Map<MatrixXf_,  0, Stride<Dynamic,1> > B( ( float*  )bp, k, n, stride_b );
-		Map<MatrixXf_,  0, Stride<Dynamic,1> > C( ( float*  )cp, m, n, stride_c );
-	#elif defined (IS_DOUBLE)
-		Map<MatrixXd_,  0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
-		Map<MatrixXd_,  0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
-		Map<MatrixXd_,  0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
-	#elif defined (IS_SCOMPLEX)
-		Map<MatrixXcf_, 0, Stride<Dynamic,1> > A( ( std::complex<float>*  )ap, m, k, stride_a );
-		Map<MatrixXcf_, 0, Stride<Dynamic,1> > B( ( std::complex<float>*  )bp, k, n, stride_b );
-		Map<MatrixXcf_, 0, Stride<Dynamic,1> > C( ( std::complex<float>*  )cp, m, n, stride_c );
-	#elif defined (IS_DCOMPLEX)
-		Map<MatrixXcd_, 0, Stride<Dynamic,1> > A( ( std::complex<double>* )ap, m, k, stride_a );
-		Map<MatrixXcd_, 0, Stride<Dynamic,1> > B( ( std::complex<double>* )bp, k, n, stride_b );
-		Map<MatrixXcd_, 0, Stride<Dynamic,1> > C( ( std::complex<double>* )cp, m, n, stride_c );
-	#endif
-#endif
-
-		dtime_save = DBL_MAX;
-
-		for ( r = 0; r < n_repeats; ++r )
-		{
-			bli_copym( &c_save, &c );
-
-			dtime = bli_clock();
-
-#ifdef PRINT
-			bli_printm( "a", &a, "%4.1f", "" );
-			bli_printm( "b", &b, "%4.1f", "" );
-			bli_printm( "c", &c, "%4.1f", "" );
-#endif
-
-#if defined(BLIS)
-
-			bli_gemm( &alpha,
-			          &a,
-			          &b,
-			          &beta,
-			          &c );
-
-#elif defined(EIGEN)
-
-			C.noalias() += alpha_r * A * B;
-
-#else // if defined(BLAS)
-
-			if ( bli_is_float( dt ) )
-			{
-				f77_int   mm     = bli_obj_length( &c );
-				f77_int   kk     = bli_obj_width_after_trans( &a );
-				f77_int   nn     = bli_obj_width( &c );
-				f77_int   lda    = bli_obj_col_stride( &a );
-				f77_int   ldb    = bli_obj_col_stride( &b );
-				f77_int   ldc    = bli_obj_col_stride( &c );
-				float*    alphap = ( float* )bli_obj_buffer( &alpha );
-				float*    ap     = ( float* )bli_obj_buffer( &a );
-				float*    bp     = ( float* )bli_obj_buffer( &b );
-				float*    betap  = ( float* )bli_obj_buffer( &beta );
-				float*    cp     = ( float* )bli_obj_buffer( &c );
-
-				sgemm_( &f77_transa,
-						&f77_transb,
-						&mm,
-						&nn,
-						&kk,
-						alphap,
-						ap, &lda,
-						bp, &ldb,
-						betap,
-						cp, &ldc );
-			}
-			else if ( bli_is_double( dt ) )
-			{
-				f77_int   mm     = bli_obj_length( &c );
-				f77_int   kk     = bli_obj_width_after_trans( &a );
-				f77_int   nn     = bli_obj_width( &c );
-				f77_int   lda    = bli_obj_col_stride( &a );
-				f77_int   ldb    = bli_obj_col_stride( &b );
-				f77_int   ldc    = bli_obj_col_stride( &c );
-				double*   alphap = ( double* )bli_obj_buffer( &alpha );
-				double*   ap     = ( double* )bli_obj_buffer( &a );
-				double*   bp     = ( double* )bli_obj_buffer( &b );
-				double*   betap  = ( double* )bli_obj_buffer( &beta );
-				double*   cp     = ( double* )bli_obj_buffer( &c );
-
-				dgemm_( &f77_transa,
-						&f77_transb,
-						&mm,
-						&nn,
-						&kk,
-						alphap,
-						ap, &lda,
-						bp, &ldb,
-						betap,
-						cp, &ldc );
-			}
-			else if ( bli_is_scomplex( dt ) )
-			{
-				f77_int   mm     = bli_obj_length( &c );
-				f77_int   kk     = bli_obj_width_after_trans( &a );
-				f77_int   nn     = bli_obj_width( &c );
-				f77_int   lda    = bli_obj_col_stride( &a );
-				f77_int   ldb    = bli_obj_col_stride( &b );
-				f77_int   ldc    = bli_obj_col_stride( &c );
-				scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
-				scomplex* ap     = ( scomplex* )bli_obj_buffer( &a );
-				scomplex* bp     = ( scomplex* )bli_obj_buffer( &b );
-				scomplex* betap  = ( scomplex* )bli_obj_buffer( &beta );
-				scomplex* cp     = ( scomplex* )bli_obj_buffer( &c );
-
-				cgemm_( &f77_transa,
-						&f77_transb,
-						&mm,
-						&nn,
-						&kk,
-						alphap,
-						ap, &lda,
-						bp, &ldb,
-						betap,
-						cp, &ldc );
-			}
-			else if ( bli_is_dcomplex( dt ) )
-			{
-				f77_int   mm     = bli_obj_length( &c );
-				f77_int   kk     = bli_obj_width_after_trans( &a );
-				f77_int   nn     = bli_obj_width( &c );
-				f77_int   lda    = bli_obj_col_stride( &a );
-				f77_int   ldb    = bli_obj_col_stride( &b );
-				f77_int   ldc    = bli_obj_col_stride( &c );
-				dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
-				dcomplex* ap     = ( dcomplex* )bli_obj_buffer( &a );
-				dcomplex* bp     = ( dcomplex* )bli_obj_buffer( &b );
-				dcomplex* betap  = ( dcomplex* )bli_obj_buffer( &beta );
-				dcomplex* cp     = ( dcomplex* )bli_obj_buffer( &c );
-
-				zgemm_( &f77_transa,
-						&f77_transb,
-						&mm,
-						&nn,
-						&kk,
-						alphap,
-						ap, &lda,
-						bp, &ldb,
-						betap,
-						cp, &ldc );
-			}
-#endif
-
-#ifdef PRINT
-			bli_printm( "c after", &c, "%4.1f", "" );
-			exit(1);
-#endif
-
-			dtime_save = bli_clock_min_diff( dtime_save, dtime );
-		}
-
-		gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
-
-		if ( bli_is_complex( dt ) ) gflops *= 4.0;
-
-		printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
-		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin)/p_inc + 1,
-		        ( unsigned long )m,
-		        ( unsigned long )k,
-		        ( unsigned long )n, gflops );
-
-		bli_obj_free( &alpha );
-		bli_obj_free( &beta );
-
-		bli_obj_free( &a );
-		bli_obj_free( &b );
-		bli_obj_free( &c );
-		bli_obj_free( &c_save );
-	}
-
-	//bli_finalize();
-
-	return 0;
-}
-
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <unistd.h>
+#ifdef EIGEN
+  #define BLIS_DISABLE_BLAS_DEFS
+  #include "blis.h"
+  #include <Eigen/Core>
+  #include <Eigen/src/misc/blas.h>
+  using namespace Eigen;
+#else
+  #include "blis.h"
+#endif
+
+#define COL_STORAGE
+//#define ROW_STORAGE
+
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+	obj_t    a, b, c;
+	obj_t    c_save;
+	obj_t    alpha, beta;
+	dim_t    m, n, k;
+	dim_t    p;
+	dim_t    p_begin, p_max, p_inc;
+	int      m_input, n_input, k_input;
+	ind_t    ind;
+	num_t    dt;
+	char     dt_ch;
+	int      r, n_repeats;
+	trans_t  transa;
+	trans_t  transb;
+	f77_char f77_transa;
+	f77_char f77_transb;
+
+	double   dtime;
+	double   dtime_save;
+	double   gflops;
+
+	//bli_init();
+
+	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+	n_repeats = 3;
+
+	dt      = DT;
+
+	ind     = IND;
+
+#if 1
+	p_begin = P_BEGIN;
+	p_max   = P_MAX;
+	p_inc   = P_INC;
+
+	m_input = -1;
+	n_input = -1;
+	k_input = -1;
+#else
+	p_begin = 40;
+	p_max   = 1000;
+	p_inc   = 40;
+
+	m_input = -1;
+	n_input = -1;
+	k_input = -1;
+#endif
+
+
+	// Suppress compiler warnings about unused variable 'ind'.
+	( void )ind;
+
+#if 0
+
+	cntx_t* cntx;
+
+	ind_t ind_mod = ind;
+
+	// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
+	if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
+
+	// Initialize a context for the current induced method and datatype.
+	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+
+	// Set k to the kc blocksize for the current datatype.
+	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
+
+#elif 1
+
+	//k_input = 256;
+
+#endif
+
+	// Choose the char corresponding to the requested datatype.
+	if      ( bli_is_float( dt ) )    dt_ch = 's';
+	else if ( bli_is_double( dt ) )   dt_ch = 'd';
+	else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
+	else                              dt_ch = 'z';
+
+	transa = BLIS_NO_TRANSPOSE;
+	transb = BLIS_NO_TRANSPOSE;
+
+	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
+	bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
+
+	// Begin with initializing the last entry to zero so that
+	// matlab allocates space for the entire array once up-front.
+	for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
+
+	printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
+	printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
+	        ( unsigned long )0,
+	        ( unsigned long )0,
+	        ( unsigned long )0, 0.0 );
+
+
+	//for ( p = p_begin; p <= p_max; p += p_inc )
+	for ( p = p_max; p_begin <= p; p -= p_inc )
+	{
+
+		if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
+		else               m =     ( dim_t )    m_input;
+		if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
+		else               n =     ( dim_t )    n_input;
+		if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
+		else               k =     ( dim_t )    k_input;
+
+		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt, 1, 1, 0, 0, &beta );
+
+	#ifdef COL_STORAGE
+		bli_obj_create( dt, m, k, 0, 0, &a );
+		bli_obj_create( dt, k, n, 0, 0, &b );
+		bli_obj_create( dt, m, n, 0, 0, &c );
+		bli_obj_create( dt, m, n, 0, 0, &c_save );
+	#else
+		bli_obj_create( dt, m, k, k, 1, &a );
+		bli_obj_create( dt, k, n, n, 1, &b );
+		bli_obj_create( dt, m, n, n, 1, &c );
+		bli_obj_create( dt, m, n, n, 1, &c_save );
+	#endif
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+
+		bli_obj_set_conjtrans( transa, &a );
+		bli_obj_set_conjtrans( transb, &b );
+
+		bli_setsc(  (2.0/1.0), 0.0, &alpha );
+		bli_setsc(  (1.0/1.0), 0.0, &beta );
+
+		bli_copym( &c, &c_save );
+
+#if 0 //def BLIS
+		bli_ind_disable_all_dt( dt );
+		bli_ind_enable_dt( ind, dt );
+#endif
+
+#ifdef EIGEN
+		double alpha_r, alpha_i;
+
+		bli_getsc( &alpha, &alpha_r, &alpha_i );
+
+		void* ap = bli_obj_buffer_at_off( &a );
+		void* bp = bli_obj_buffer_at_off( &b );
+		void* cp = bli_obj_buffer_at_off( &c );
+
+	#ifdef COL_STORAGE
+		const int os_a = bli_obj_col_stride( &a );
+		const int os_b = bli_obj_col_stride( &b );
+		const int os_c = bli_obj_col_stride( &c );
+	#else
+		const int os_a = bli_obj_row_stride( &a );
+		const int os_b = bli_obj_row_stride( &b );
+		const int os_c = bli_obj_row_stride( &c );
+	#endif
+
+		Stride<Dynamic,1> stride_a( os_a, 1 );
+		Stride<Dynamic,1> stride_b( os_b, 1 );
+		Stride<Dynamic,1> stride_c( os_c, 1 );
+
+	#ifdef COL_STORAGE
+		#if defined(IS_FLOAT)
+		typedef Matrix<float,                Dynamic, Dynamic, ColMajor> MatrixXf_;
+		#elif defined (IS_DOUBLE)
+		typedef Matrix<double,               Dynamic, Dynamic, ColMajor> MatrixXd_;
+		#elif defined (IS_SCOMPLEX)
+		typedef Matrix<std::complex<float>,  Dynamic, Dynamic, ColMajor> MatrixXcf_;
+		#elif defined (IS_DCOMPLEX)
+		typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_;
+		#endif
+	#else
+		#if defined(IS_FLOAT)
+		typedef Matrix<float,                Dynamic, Dynamic, RowMajor> MatrixXf_;
+		#elif defined (IS_DOUBLE)
+		typedef Matrix<double,               Dynamic, Dynamic, RowMajor> MatrixXd_;
+		#elif defined (IS_SCOMPLEX)
+		typedef Matrix<std::complex<float>,  Dynamic, Dynamic, RowMajor> MatrixXcf_;
+		#elif defined (IS_DCOMPLEX)
+		typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_;
+		#endif
+	#endif
+	#if defined(IS_FLOAT)
+		Map<MatrixXf_,  0, Stride<Dynamic,1> > A( ( float*  )ap, m, k, stride_a );
+		Map<MatrixXf_,  0, Stride<Dynamic,1> > B( ( float*  )bp, k, n, stride_b );
+		Map<MatrixXf_,  0, Stride<Dynamic,1> > C( ( float*  )cp, m, n, stride_c );
+	#elif defined (IS_DOUBLE)
+		Map<MatrixXd_,  0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
+		Map<MatrixXd_,  0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
+		Map<MatrixXd_,  0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
+	#elif defined (IS_SCOMPLEX)
+		Map<MatrixXcf_, 0, Stride<Dynamic,1> > A( ( std::complex<float>*  )ap, m, k, stride_a );
+		Map<MatrixXcf_, 0, Stride<Dynamic,1> > B( ( std::complex<float>*  )bp, k, n, stride_b );
+		Map<MatrixXcf_, 0, Stride<Dynamic,1> > C( ( std::complex<float>*  )cp, m, n, stride_c );
+	#elif defined (IS_DCOMPLEX)
+		Map<MatrixXcd_, 0, Stride<Dynamic,1> > A( ( std::complex<double>* )ap, m, k, stride_a );
+		Map<MatrixXcd_, 0, Stride<Dynamic,1> > B( ( std::complex<double>* )bp, k, n, stride_b );
+		Map<MatrixXcd_, 0, Stride<Dynamic,1> > C( ( std::complex<double>* )cp, m, n, stride_c );
+	#endif
+#endif
+
+		dtime_save = DBL_MAX;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+			dtime = bli_clock();
+
+#ifdef PRINT
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_printm( "b", &b, "%4.1f", "" );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#if defined(BLIS)
+
+			bli_gemm( &alpha,
+			          &a,
+			          &b,
+			          &beta,
+			          &c );
+
+#elif defined(EIGEN)
+
+			C.noalias() += alpha_r * A * B;
+
+#else // if defined(BLAS)
+
+			if ( bli_is_float( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+				float*    alphap = ( float* )bli_obj_buffer( &alpha );
+				float*    ap     = ( float* )bli_obj_buffer( &a );
+				float*    bp     = ( float* )bli_obj_buffer( &b );
+				float*    betap  = ( float* )bli_obj_buffer( &beta );
+				float*    cp     = ( float* )bli_obj_buffer( &c );
+
+				sgemm_( &f77_transa,
+						&f77_transb,
+						&mm,
+						&nn,
+						&kk,
+						alphap,
+						ap, &lda,
+						bp, &ldb,
+						betap,
+						cp, &ldc );
+			}
+			else if ( bli_is_double( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+				double*   alphap = ( double* )bli_obj_buffer( &alpha );
+				double*   ap     = ( double* )bli_obj_buffer( &a );
+				double*   bp     = ( double* )bli_obj_buffer( &b );
+				double*   betap  = ( double* )bli_obj_buffer( &beta );
+				double*   cp     = ( double* )bli_obj_buffer( &c );
+
+				dgemm_( &f77_transa,
+						&f77_transb,
+						&mm,
+						&nn,
+						&kk,
+						alphap,
+						ap, &lda,
+						bp, &ldb,
+						betap,
+						cp, &ldc );
+			}
+			else if ( bli_is_scomplex( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+				scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
+				scomplex* ap     = ( scomplex* )bli_obj_buffer( &a );
+				scomplex* bp     = ( scomplex* )bli_obj_buffer( &b );
+				scomplex* betap  = ( scomplex* )bli_obj_buffer( &beta );
+				scomplex* cp     = ( scomplex* )bli_obj_buffer( &c );
+
+				cgemm_( &f77_transa,
+						&f77_transb,
+						&mm,
+						&nn,
+						&kk,
+						alphap,
+						ap, &lda,
+						bp, &ldb,
+						betap,
+						cp, &ldc );
+			}
+			else if ( bli_is_dcomplex( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+				dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
+				dcomplex* ap     = ( dcomplex* )bli_obj_buffer( &a );
+				dcomplex* bp     = ( dcomplex* )bli_obj_buffer( &b );
+				dcomplex* betap  = ( dcomplex* )bli_obj_buffer( &beta );
+				dcomplex* cp     = ( dcomplex* )bli_obj_buffer( &c );
+
+				zgemm_( &f77_transa,
+						&f77_transb,
+						&mm,
+						&nn,
+						&kk,
+						alphap,
+						ap, &lda,
+						bp, &ldb,
+						betap,
+						cp, &ldc );
+			}
+#endif
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%4.1f", "" );
+			exit(1);
+#endif
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
+
+		if ( bli_is_complex( dt ) ) gflops *= 4.0;
+
+		printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )k,
+		        ( unsigned long )n, gflops );
+
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	//bli_finalize();
+
+	return 0;
+}
+
--- a/test/Makefile
+++ b/test/Makefile
@@ -96,7 +96,7 @@ endif
 BLAS_LIB_PATH  := $(HOME)/flame/lib
 #MKL_LIB_PATH   := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
 #MKL_LIB_PATH   := $(HOME)/intel/mkl/lib/intel64
-#MKL_LIB_PATH   := ${MKLROOT}/lib/intel64
+MKL_LIB_PATH   := ${MKLROOT}/lib/intel64
 #ESSL_LIB_PATH  := $(HOME)/path/to/essl/changeme

 # OpenBLAS
@@ -165,21 +165,23 @@ CFLAGS         += -I$(TEST_SRC_PATH)
 #all: blis openblas atlas mkl
 all: blis openblas mkl

-blis:   test_gemm_blis.x \
-#       test_dotv_blis.x \
-#       test_axpyv_blis.x \
-#       test_gemv_blis.x \
-#       test_ger_blis.x \
-#       test_hemv_blis.x \
-#       test_her_blis.x \
-#       test_her2_blis.x \
-#       test_trmv_blis.x \
-#       test_trsv_blis.x \
-      # test_hemm_blis.x \
-      # test_herk_blis.x \
-      # test_her2k_blis.x \
-      # test_trmm_blis.x \
-      # test_trsm_blis.x
+blis: \
+       test_dotv_blis.x \
+       test_axpyv_blis.x \
+       test_gemv_blis.x \
+       test_ger_blis.x \
+       test_hemv_blis.x \
+       test_her_blis.x \
+       test_her2_blis.x \
+       test_trmv_blis.x \
+       test_trsv_blis.x \
+       \
+       test_gemm_blis.x \
+       test_hemm_blis.x \
+       test_herk_blis.x \
+       test_her2k_blis.x \
+       test_trmm_blis.x \
+       test_trsm_blis.x

 openblas: \
      test_dotv_openblas.x \
--- a/test/output_gemm_blis.m
+++ b/test/output_gemm_blis.m
--- a/test/runme.sh
+++ b/test/runme.sh
@@ -5,13 +5,12 @@ out_root="output"
 #out_root="output_square"

 # Operations to test.
-# l2_ops="gemv ger hemv her her2 trmv trsv"
-l3_ops="gemm" 
-# "hemm herk her2k trmm trsm"
-test_ops=" ${l3_ops}" 
-# "${l2_ops}"
+l2_ops="gemv ger hemv her her2 trmv trsv"
+l3_ops="gemm hemm herk her2k trmm trsm"
+test_ops="${l2_ops} ${l3_ops}"

-# Implementations to test | "openblas atlas mkl"
+# Implementations to test.
+#test_impls="openblas mkl blis"
 test_impls="blis"

 for im in ${test_impls}; do
@@ -24,7 +23,7 @@ for im in ${test_impls}; do
 		# Construct the name of the output file.
 		out_file="${out_root}_${op}_${im}.m"

-		echo " Running ${exec_name} > ${out_file} "
+		echo "Running ${exec_name} > ${out_file}"

 		# Run executable.
 		./${exec_name} > ${out_file}
--- a/testsuite/input.general
+++ b/testsuite/input.general
@@ -8,8 +8,8 @@
 #  accepted values.
 #

-3       # Number of repeats per experiment (best result is reported)
-c       # Matrix storage scheme(s) to test:
+1       # Number of repeats per experiment (best result is reported)
+rc      # Matrix storage scheme(s) to test:
        #   'c' = col-major storage; 'g' = general stride storage;
        #   'r' = row-major storage
 cj      # Vector storage scheme(s) to test:
@@ -22,14 +22,14 @@ cj      # Vector storage scheme(s) to test:
        #   '0' = real values on [-1,1];
        #   '1' = powers of 2 in narrow precision range
 32      # General stride spacing (for cases when testing general stride)
-d       # Datatype(s) to test:
+sdcz    # Datatype(s) to test:
        #   's' = single real; 'c' = single complex;
        #   'd' = double real; 'z' = double complex
 0       # Test gemm with mixed-domain operands?
 0       # Test gemm with mixed-precision operands?
-2000      # Problem size: first to test
-2000     # Problem size: maximum to test
-200      # Problem size: increment between experiments
+100     # Problem size: first to test
+500     # Problem size: maximum to test
+100     # Problem size: increment between experiments
        # Complex level-3 implementations to test:
 0       #   3mh  ('1' = enable; '0' = disable)
 0       #   3m1  ('1' = enable; '0' = disable)
@@ -45,5 +45,5 @@ d       # Datatype(s) to test:
        #   '0' = disable error checking; '1' = full error checking
 i       # Reaction to test failure:
        #   'i' = ignore; 's' = sleep() and continue; 'a' = abort
-1       # Output results in matlab/octave format? ('1' = yes; '0' = no)
+0       # Output results in matlab/octave format? ('1' = yes; '0' = no)
 0       # Output results to stdout AND files? ('1' = yes; '0' = no)
--- a/testsuite/input.operations
+++ b/testsuite/input.operations
@@ -276,9 +276,9 @@

 # --- Level-3 --------------------------------------------------------------

-2        # gemm
-1 -1 -1  #   dimensions: m n k
-nn       #   parameters: transa transb
+1        # gemm
+-1 -1 -1 #   dimensions: m n k
+??       #   parameters: transa transb

 1        # hemm
 -1 -1    #   dimensions: m n
--- a/testsuite/old/jobscripts/cfig.out
+++ b/testsuite/old/jobscripts/cfig.out
--- a/testsuite/old/jobscripts/cfig.sh
+++ b/testsuite/old/jobscripts/cfig.sh
--- a/testsuite/old/jobscripts/jb-cfig.sh
+++ b/testsuite/old/jobscripts/jb-cfig.sh
--- a/testsuite/old/jobscripts/jb-mk.sh
+++ b/testsuite/old/jobscripts/jb-mk.sh
--- a/testsuite/old/jobscripts/jb-runtest.sh
+++ b/testsuite/old/jobscripts/jb-runtest.sh
--- a/testsuite/old/jobscripts/mk.out
+++ b/testsuite/old/jobscripts/mk.out
--- a/testsuite/old/jobscripts/mk.sh
+++ b/testsuite/old/jobscripts/mk.sh
--- a/testsuite/old/jobscripts/runtest.sh
+++ b/testsuite/old/jobscripts/runtest.sh
--- a/testsuite/src/test_gemm.c
+++ b/testsuite/src/test_gemm.c
@@ -259,6 +259,7 @@ void libblis_test_gemm_experiment
 	libblis_test_mobj_randomize( params, TRUE, &b );
 	libblis_test_mobj_randomize( params, TRUE, &c );
 	bli_copym( &c, &c_save );
+
 //bli_setm( &BLIS_ONE, &a );
 //bli_setsc(  1.0,  0.0, &alpha );
 //bli_setsc(  0.0,  0.0, &beta );
@@ -272,23 +273,11 @@ void libblis_test_gemm_experiment
 	{
 		bli_copym( &c_save, &c );

-#if 0
-bli_printm( "alpha", &alpha, "%5.2f", "" );
-bli_printm( "beta", &beta, "%5.2f", "" );
-bli_printm( "a = [", &a, "%7.6f", "];" );
-bli_printm( "b = [", &b, "%7.6f", "];" );
-bli_printm( "c = [", &c, "%7.6f", "];" );
-#endif
-
 		time = bli_clock();

 		libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );

 		time_min = bli_clock_min_diff( time_min, time );
-#if 0
-bli_printm( "c_after = [", &c, "%7.6f", "];" );
-#endif
-
 	}

 	// Estimate the performance of the best experiment repeat.
@@ -417,6 +406,7 @@ void libblis_test_gemm_md

 		libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );

+		time_min = bli_clock_min_diff( time_min, time );
 	}

 	// Estimate the performance of the best experiment repeat.
@@ -453,18 +443,20 @@ void libblis_test_gemm_impl
 	{
 		case BLIS_TEST_SEQ_FRONT_END:
 #if 0
-bli_printm( "alpha", alpha, "%5.2f", "" );
-bli_printm( "beta", beta, "%5.2f", "" );
-bli_printm( "a", a, "%6.3f", "" );
-bli_printm( "b", b, "%6.3f", "" );
-bli_printm( "c", c, "%6.3f", "" );
+//bli_printm( "alpha", alpha, "%5.2f", "" );
+//bli_printm( "beta", beta, "%5.2f", "" );
+bli_printm( "a", a, "%5.2f", "" );
+bli_printm( "b", b, "%5.2f", "" );
+bli_printm( "c", c, "%5.2f", "" );
 #endif
 //if ( bli_obj_length( b ) == 16 &&
 //     bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR )
 //bli_printm( "c before", c, "%6.3f", "" );
 		bli_gemm( alpha, a, b, beta, c );
 #if 0
-bli_printm( "c after", c, "%6.3f", "");
+if ( bli_obj_length( c ) == 12 &&
+     bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR )
+bli_printm( "c after", c, "%6.3f", "" );
 #endif
 		break;