Updated copyright headers of emscripten configuration files.

2026-06-06 04:34:02 +00:00 · 2014-08-06 14:13:46 -05:00
parent 30833ed71d
commit 9526ce9881
13 changed files with 68 additions and 730 deletions
--- a/config/emscripten/bli_config.h
+++ b/config/emscripten/bli_config.h
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2014, The University of Texas
+   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -14,9 +14,9 @@
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
--- a/config/emscripten/bli_kernel.h
+++ b/config/emscripten/bli_kernel.h
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2014, The University of Texas
+   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -14,9 +14,9 @@
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
--- a/config/emscripten/make_defs.mk
+++ b/config/emscripten/make_defs.mk
@@ -4,7 +4,7 @@
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
-#  Copyright (C) 2014, The University of Texas
+#  Copyright (C) 2014, The University of Texas at Austin
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -14,9 +14,9 @@
 #   - Redistributions in binary form must reproduce the above copyright
 #     notice, this list of conditions and the following disclaimer in the
 #     documentation and/or other materials provided with the distribution.
-#   - Neither the name of The University of Texas nor the names of its
-#     contributors may be used to endorse or promote products derived
-#     from this software without specific prior written permission.
+#   - Neither the name of The University of Texas at Austin nor the names
+#     of its contributors may be used to endorse or promote products
+#     derived from this software without specific prior written permission.
 #
 #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 #  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
--- a/config/reference/bli_config.h
+++ b/config/reference/bli_config.h
@@ -48,7 +48,7 @@
 // of the C99 type "long int". Note that this ONLY affects integers used
 // internally within BLIS as well as those exposed in the native BLAS-like BLIS
 // interface.
-#define BLIS_INT_TYPE_SIZE               32
+#define BLIS_INT_TYPE_SIZE               64



@@ -99,7 +99,7 @@

 // Alignment size needed by the instruction set for aligned SIMD/vector
 // instructions.
-#define BLIS_SIMD_ALIGN_SIZE             16
+#define BLIS_SIMD_ALIGN_SIZE             32

 // Alignment size used to align local stack buffers within macro-kernel
 // functions.
@@ -151,7 +151,7 @@
 // while 64 results in 64-bit integers. Any other value results in use of the
 // C99 type "long int". Note that this ONLY affects integers used within the
 // BLAS compatibility layer.
-#define BLIS_BLAS2BLIS_INT_TYPE_SIZE     32
+#define BLIS_BLAS2BLIS_INT_TYPE_SIZE     64

 // Fortran-77 name-mangling macros.
 #define PASTEF770(name)                                   name ## _
--- a/config/sandybridge/bli_config.h
+++ b/config/sandybridge/bli_config.h
@@ -48,7 +48,7 @@
 // of the C99 type "long int". Note that this ONLY affects integers used
 // internally within BLIS as well as those exposed in the native BLAS-like BLIS
 // interface.
-#define BLIS_INT_TYPE_SIZE               32
+#define BLIS_INT_TYPE_SIZE               64



@@ -69,7 +69,7 @@
 // -- MULTITHREADING -----------------------------------------------------------

 // The maximum number of BLIS threads that will run concurrently.
-#define BLIS_MAX_NUM_THREADS             24
+#define BLIS_MAX_NUM_THREADS             1



@@ -80,7 +80,7 @@
 // The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
 // contiguous memory pools.
 #define BLIS_NUM_MC_X_KC_BLOCKS          BLIS_MAX_NUM_THREADS
-#define BLIS_NUM_KC_X_NC_BLOCKS          1
+#define BLIS_NUM_KC_X_NC_BLOCKS          BLIS_MAX_NUM_THREADS
 #define BLIS_NUM_MC_X_NC_BLOCKS          0

 // The maximum preload byte offset is used to pad the end of the contiguous
@@ -151,7 +151,7 @@
 // while 64 results in 64-bit integers. Any other value results in use of the
 // C99 type "long int". Note that this ONLY affects integers used within the
 // BLAS compatibility layer.
-#define BLIS_BLAS2BLIS_INT_TYPE_SIZE     32
+#define BLIS_BLAS2BLIS_INT_TYPE_SIZE     64

 // Fortran-77 name-mangling macros.
 #define PASTEF770(name)                        name ## _
--- a/config/sandybridge/bli_kernel.h
+++ b/config/sandybridge/bli_kernel.h
@@ -54,12 +54,12 @@
 //     (b) NR (for triangular operations such as trmm and trsm).
 //

-#define BLIS_DEFAULT_MC_S              64
-#define BLIS_DEFAULT_KC_S              128
+#define BLIS_DEFAULT_MC_S              256
+#define BLIS_DEFAULT_KC_S              384
 #define BLIS_DEFAULT_NC_S              4096

-#define BLIS_DEFAULT_MC_D              96
-#define BLIS_DEFAULT_KC_D              256
+#define BLIS_DEFAULT_MC_D              128
+#define BLIS_DEFAULT_KC_D              384
 #define BLIS_DEFAULT_NC_D              4096

 #define BLIS_DEFAULT_MC_C              64
@@ -70,10 +70,13 @@
 #define BLIS_DEFAULT_KC_Z              128
 #define BLIS_DEFAULT_NC_Z              4096

+//#define BLIS_DEFAULT_4M_MC_Z           128
+//#define BLIS_DEFAULT_4M_KC_Z           128
+
 // -- Register blocksizes --

 #define BLIS_DEFAULT_MR_S              8
-#define BLIS_DEFAULT_NR_S              4
+#define BLIS_DEFAULT_NR_S              8

 #define BLIS_DEFAULT_MR_D              8
 #define BLIS_DEFAULT_NR_D              4
@@ -152,7 +155,10 @@

 // -- gemm --

-#define BLIS_DGEMM_UKERNEL         bli_dgemm_opt_8x4_ref_u4_nodupl_avx1
+#define BLIS_SGEMM_UKERNEL         bli_sgemm_asm_8x8
+
+//#define BLIS_DGEMM_UKERNEL         bli_dgemm_int_8x4
+#define BLIS_DGEMM_UKERNEL         bli_dgemm_asm_8x4

 // -- trsm-related --

--- a/config/sandybridge/make_defs.mk
+++ b/config/sandybridge/make_defs.mk
@@ -85,6 +85,7 @@ CPICFLAGS      := -fPIC
 CDBGFLAGS      := #-g
 CWARNFLAGS     := -Wall
 COPTFLAGS      := -O3 -march=native
+#COPTFLAGS      := -O1 -march=native
 CKOPTFLAGS     := $(COPTFLAGS)
 CVECFLAGS      := -mavx -mfpmath=sse #-msse3 -march=native # -mfpmath=sse

--- a/kernels/x86_64/avx/3/bli_gemm_opt_8x4_ref_u4_nodupl_avx1.c
+++ b/kernels/x86_64/avx/3/bli_gemm_opt_8x4_ref_u4_nodupl_avx1.c
@@ -1,675 +0,0 @@
-/*
-
-   BLIS    
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas at Austin nor the names
-      of its contributors may be used to endorse or promote products
-      derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include <immintrin.h> 
-
-
-
-void bli_sgemm_opt_8x4_ref_u4_nodupl_avx1(
-                        dim_t              k,
-                        float*    restrict alpha,
-                        float*    restrict a,
-                        float*    restrict b,
-                        float*    restrict beta,
-                        float*    restrict c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t*         data
-                      )
-{
-	/* Just call the reference implementation. */
-	BLIS_SGEMM_UKERNEL_REF( k,
-	                   alpha,
-	                   a,
-	                   b,
-	                   beta,
-	                   c, rs_c, cs_c,
-	                   data );
-}
-
-
-
-void bli_dgemm_opt_8x4_ref_u4_nodupl_avx1(
-                        dim_t              k,
-                        double*   restrict alpha,
-                        double*   restrict a,
-                        double*   restrict b,
-                        double*   restrict beta,
-                        double*   restrict c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t*         data
-                      )
-{
-	//void* a_next = bli_auxinfo_next_a( data );
-	void* b_next = bli_auxinfo_next_b( data );
-
-	dim_t k_iter  = k / 2;
-	dim_t k_left  = k % 2;
-
-	dim_t i;
-
-        double *c00, *c01, *c02, *c03;
-        double *c40, *c41, *c42, *c43;
-
-	// Quad registers.
-	__m256d va0_3, va4_7;
-	__m256d vA0_3, vA4_7;
-	__m256d vb0, vb1, vb2, vb3;
-	__m256d vb;
-	__m256d vB0;
-
-	__m256d va0_3b_0, va4_7b_0; 
-	__m256d va0_3b_1, va4_7b_1; 
-	__m256d va0_3b_2, va4_7b_2; 
-	__m256d va0_3b_3, va4_7b_3; 
-
-	__m256d va0_3b0, va4_7b0; 
-	__m256d va0_3b1, va4_7b1; 
-	__m256d va0_3b2, va4_7b2; 
-	__m256d va0_3b3, va4_7b3; 
-
-
-	__m256d valpha, vbeta, vtmp; 
-	__m256d vc0_3_0, vc0_3_1, vc0_3_2, vc0_3_3;
-	__m256d vc4_7_0, vc4_7_1, vc4_7_2, vc4_7_3;
-
-	__m128d aa, bb;
-	
-	__asm__ volatile( "prefetcht0 0(%0)          \n\t" : :"r"(a)  );
-	__asm__ volatile( "prefetcht2 0(%0)          \n\t" : :"r"(b_next)  );
-	__asm__ volatile( "prefetcht0 0(%0)          \n\t" : :"r"(c)  );
-
-	va0_3b0 = _mm256_setzero_pd();
-	va0_3b1 = _mm256_setzero_pd();
-	va0_3b2 = _mm256_setzero_pd();
-	va0_3b3 = _mm256_setzero_pd();
-
-	va4_7b0 = _mm256_setzero_pd();
-	va4_7b1 = _mm256_setzero_pd();
-	va4_7b2 = _mm256_setzero_pd();
-	va4_7b3 = _mm256_setzero_pd();
-
-	va0_3b_0 = _mm256_setzero_pd();
-	va0_3b_1 = _mm256_setzero_pd();
-	va0_3b_2 = _mm256_setzero_pd();
-	va0_3b_3 = _mm256_setzero_pd();
-
-	va4_7b_0 = _mm256_setzero_pd();
-	va4_7b_1 = _mm256_setzero_pd();
-	va4_7b_2 = _mm256_setzero_pd();
-	va4_7b_3 = _mm256_setzero_pd();
-
-	// Load va0_3
- 	va0_3 = _mm256_load_pd( a );
-	// Load va4_7
- 	va4_7 = _mm256_load_pd( a + 4 );
-
-	// Load vb (b0,b1,b2,b3) 
- 	vb0 = _mm256_load_pd( b );
-
-	for( i = 0; i < k_iter; ++i )
-	{
-		__asm__ volatile( "prefetcht0 192(%0)          \n\t" : :"r"(a)  );
-
-		// Load va0_3 (Prefetch)
- 		vA0_3 = _mm256_load_pd( a + 8 );
-
-		// Iteration 0.
-		vtmp = _mm256_mul_pd( va0_3, vb0 );
-		va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
-
-		vtmp = _mm256_mul_pd( va4_7, vb0 );
-		va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
-
-		// Load va4_7 (Prefetch)
- 		vA4_7 = _mm256_load_pd( a + 12 );
-
-		// Shuffle vb (b1,b0,b3,b2)
- 		vb1 = _mm256_shuffle_pd( vb0, vb0, 0x5 );
-
-		vtmp = _mm256_mul_pd( va0_3, vb1 );
-		va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
-
-		vtmp = _mm256_mul_pd( va4_7, vb1 );
-		va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
-
-		// Permute vb (b3,b2,b1,b0)
- 		vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );
-
-		// Load vb (b0,b1,b2,b3) (Prefetch)
- 		vB0 = _mm256_load_pd( b + 4 ); 
-
-		vtmp = _mm256_mul_pd( va0_3, vb2 );
-		va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
-
-		vtmp = _mm256_mul_pd( va4_7, vb2 );
-		va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
-
-		// Shuffle vb (b3,b2,b1,b0)
- 		vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );
-
-		vtmp = _mm256_mul_pd( va0_3, vb3 );
-		va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
-
-		vtmp = _mm256_mul_pd( va4_7, vb3 );
-		va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );
-
-		// Iteration 1.
-
-		__asm__ volatile( "prefetcht0 512(%0)          \n\t" : :"r"(a)  );
-		
-		// Load va0_3 (Next iteration)
- 		va0_3 = _mm256_load_pd( a + 16 );
-
-		vtmp = _mm256_mul_pd( vA0_3, vB0 );
-		va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
-
- 		vb1 = _mm256_shuffle_pd( vB0, vB0, 0x5 );
-
-		vtmp = _mm256_mul_pd( vA4_7, vB0 );
-		va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
-
-		vtmp = _mm256_mul_pd( vA0_3, vb1 );
-		va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
-
-		// Load va4_7 (Next iteration)
- 		va4_7 = _mm256_load_pd( a + 20 );
-
- 		vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );
-
-		vtmp = _mm256_mul_pd( vA4_7, vb1 );
-		va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
-
-		vtmp = _mm256_mul_pd( vA0_3, vb2 );
-		va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
-
- 		vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );
-
-		vtmp = _mm256_mul_pd( vA4_7, vb2 );
-		va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
-
-		// Load vb0(Next iteration)
- 		vb0 = _mm256_load_pd( b + 8 ); 
-
-		vtmp = _mm256_mul_pd( vA0_3, vb3 );
-		va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
-
-		vtmp = _mm256_mul_pd( vA4_7, vb3 );
-		va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );
-
-		a += 16;
-		b += 8;
-
-	}
-
-	for( i = 0; i < k_left; ++i )
-	{
-		// Iteration 0.
-
-		// Load va0_3
- 		va0_3 = _mm256_load_pd( a );
-		// Load va4_7
- 		va4_7 = _mm256_load_pd( a + 4 );
-
-		// Load vb (b0,b1,b2,b3) 
- 		vb = _mm256_load_pd( b );
-
-		vtmp = _mm256_mul_pd( va0_3, vb );
-		va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
-
-		vtmp = _mm256_mul_pd( va4_7, vb );
-		va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
-
-		// Shuffle vb (b1,b0,b3,b2)
- 		vb = _mm256_shuffle_pd( vb, vb, 0x5 );
-
-		vtmp = _mm256_mul_pd( va0_3, vb );
-		va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
-
-		vtmp = _mm256_mul_pd( va4_7, vb );
-		va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
-
-		// Permute vb (b3,b2,b1,b0)
- 		vb = _mm256_permute2f128_pd( vb, vb, 0x1 );
-
-		vtmp = _mm256_mul_pd( va0_3, vb );
-		va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
-
-		vtmp = _mm256_mul_pd( va4_7, vb );
-		va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
-
-		// Shuffle vb (b3,b2,b1,b0)
- 		vb = _mm256_shuffle_pd( vb, vb, 0x5 );
-
-		vtmp = _mm256_mul_pd( va0_3, vb );
-		va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
-
-		vtmp = _mm256_mul_pd( va4_7, vb );
-		va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );
-
-		a += 8;
-		b += 4;
-
-	}
-
-	vbeta = _mm256_broadcast_sd( beta );
-
-	__m256d vtmpa_0_3b_0 = _mm256_blend_pd( va0_3b_0, va0_3b_1, 0x6 );
-	__m256d vtmpa_0_3b_1 = _mm256_blend_pd( va0_3b_1, va0_3b_0, 0x6 );
-
-	__m256d vtmpa_0_3b_2 = _mm256_blend_pd( va0_3b_2, va0_3b_3, 0x6 );
-	__m256d vtmpa_0_3b_3 = _mm256_blend_pd( va0_3b_3, va0_3b_2, 0x6 );
-
-	__m256d vtmpa_4_7b_0 = _mm256_blend_pd( va4_7b_0, va4_7b_1, 0x6 );
-	__m256d vtmpa_4_7b_1 = _mm256_blend_pd( va4_7b_1, va4_7b_0, 0x6 );
-
-	__m256d vtmpa_4_7b_2 = _mm256_blend_pd( va4_7b_2, va4_7b_3, 0x6 );
-	__m256d vtmpa_4_7b_3 = _mm256_blend_pd( va4_7b_3, va4_7b_2, 0x6 );
-
-	valpha = _mm256_broadcast_sd( alpha );
-
-	va0_3b0 = _mm256_permute2f128_pd( vtmpa_0_3b_0, vtmpa_0_3b_2, 0x30 );
-	va0_3b3 = _mm256_permute2f128_pd( vtmpa_0_3b_2, vtmpa_0_3b_0, 0x30 );
-
-	va0_3b1 = _mm256_permute2f128_pd( vtmpa_0_3b_1, vtmpa_0_3b_3, 0x30 );
-	va0_3b2 = _mm256_permute2f128_pd( vtmpa_0_3b_3, vtmpa_0_3b_1, 0x30 );
-
-	va4_7b0 = _mm256_permute2f128_pd( vtmpa_4_7b_0, vtmpa_4_7b_2, 0x30 );
-	va4_7b3 = _mm256_permute2f128_pd( vtmpa_4_7b_2, vtmpa_4_7b_0, 0x30 );
-
-	va4_7b1 = _mm256_permute2f128_pd( vtmpa_4_7b_1, vtmpa_4_7b_3, 0x30 );
-	va4_7b2 = _mm256_permute2f128_pd( vtmpa_4_7b_3, vtmpa_4_7b_1, 0x30 );
-
-	if( rs_c == 1 )
-	{
-		// Calculate address
-		c00 = ( c + 0*rs_c + 0*cs_c );
-		// Load
-		//vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c  );
-		vc0_3_0 = _mm256_load_pd( c00  );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va0_3b0);
-		// Scale by beta
-		vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 );
-		// Add gemm result
-		vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp );
-		// Store back to memory
-		_mm256_store_pd( c00, vc0_3_0 );
-	
-		// Calculate address
-		c40 = ( c + 4*rs_c + 0*cs_c );
-		// Load
-		//vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c  );
-		vc4_7_0 = _mm256_load_pd( c40  );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va4_7b0);
-		// Scale by beta
-		vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 );
-		// Add gemm result
-		vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp );
-		// Store back to memory
-		_mm256_store_pd( c40, vc4_7_0 );
-	
-		// Calculate address
-		c01 = ( c + 0*rs_c + 1*cs_c );
-		// Load
-		//vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c  );
-		vc0_3_1 = _mm256_load_pd( c01  );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va0_3b1);
-		// Scale by beta
-		vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 );
-		// Add gemm result
-		vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp );
-		// Store back to memory
-		_mm256_store_pd( c01, vc0_3_1 );
-	
-		// Calculate address
-		c41 = ( c + 4*rs_c + 1*cs_c );
-		// Load
-		//vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c  );
-		vc4_7_1 = _mm256_load_pd( c41  );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va4_7b1);
-		// Scale by beta
-		vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 );
-		// Add gemm result
-		vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp );
-		// Store back to memory
-		_mm256_store_pd( c41, vc4_7_1 );
-	
-		// Calculate address
-		c02 = ( c + 0*rs_c + 2*cs_c );
-		// Load
-		//vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c  );
-		vc0_3_2 = _mm256_load_pd( c02 );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va0_3b2);
-		// Scale by beta
-		vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 );
-		// Add gemm result
-		vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp );
-		// Store back to memory
-		_mm256_store_pd( c02, vc0_3_2 );
-	
-		// Calculate address
-		c42 = ( c + 4*rs_c + 2*cs_c );
-		// Load
-		//vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c  );
-		vc4_7_2 = _mm256_load_pd( c42 );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va4_7b2);
-		// Scale by beta
-		vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 );
-		// Add gemm result
-		vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp );
-		// Store back to memory
-		_mm256_store_pd( c42, vc4_7_2 );
-		
-		// Calculate address
-		c03 = ( c + 0*rs_c + 3*cs_c );
-		// Load
-		//vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c  );
-		vc0_3_3 = _mm256_load_pd( c03 );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va0_3b3);
-		// Scale by beta
-		vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 );
-		// Add gemm result
-		vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp );
-		// Store back to memory
-		_mm256_store_pd( c03, vc0_3_3 );
-	
-		// Calculate address
-		c43 = ( c + 4*rs_c + 3*cs_c );
-		// Load
-		//vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c  );
-		vc4_7_3 = _mm256_load_pd( c43 );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va4_7b3);
-		// Scale by beta
-		vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 );
-		// Add gemm result
-		vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp );
-		// Store back to memory
-		_mm256_store_pd( c43, vc4_7_3 );
-	
-	}
-	else
-	{
-		// Calculate address
-		c00 = ( c + 0*rs_c + 0*cs_c );
-		// Load
-		//vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c  );
-		vc0_3_0 = _mm256_set_pd( *(c + 3*rs_c + 0*cs_c ),  
-                                         *(c + 2*rs_c + 0*cs_c ), 
-                                         *(c + 1*rs_c + 0*cs_c ), 
-                                         *(c + 0*rs_c + 0*cs_c ) );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va0_3b0);
-		// Scale by beta
-		vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 );
-		// Add gemm result
-		vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp );
-		// Store back to memory
-		//_mm256_store_pd( c00, vc0_3_0 );
-	
-		aa = _mm256_extractf128_pd( vc0_3_0, 0 ) ;
-		bb = _mm256_extractf128_pd( vc0_3_0, 1 ) ;
-
-		_mm_storel_pd( c + 0*rs_c + 0*cs_c, aa );
-		_mm_storeh_pd( c + 1*rs_c + 0*cs_c, aa );
-		_mm_storel_pd( c + 2*rs_c + 0*cs_c, bb );
-		_mm_storeh_pd( c + 3*rs_c + 0*cs_c, bb );
-
-		// Calculate address
-		c40 = ( c + 4*rs_c + 0*cs_c );
-		// Load
-		//vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c  );
-		vc4_7_0 = _mm256_set_pd( *(c + 7*rs_c + 0*cs_c ),  
-                                         *(c + 6*rs_c + 0*cs_c ), 
-                                         *(c + 5*rs_c + 0*cs_c ), 
-                                         *(c + 4*rs_c + 0*cs_c ) );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va4_7b0);
-		// Scale by beta
-		vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 );
-		// Add gemm result
-		vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp );
-		// Store back to memory
-		//_mm256_store_pd( c40, vc4_7_0 );
-	
-		aa = _mm256_extractf128_pd( vc4_7_0, 0 ) ;
-		bb = _mm256_extractf128_pd( vc4_7_0, 1 ) ;
-
-		_mm_storel_pd( c + 4*rs_c + 0*cs_c, aa );
-		_mm_storeh_pd( c + 5*rs_c + 0*cs_c, aa );
-		_mm_storel_pd( c + 6*rs_c + 0*cs_c, bb );
-		_mm_storeh_pd( c + 7*rs_c + 0*cs_c, bb );
-
-		// Calculate address
-		c01 = ( c + 0*rs_c + 1*cs_c );
-		// Load
-		//vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c  );
-		vc0_3_1 = _mm256_set_pd( *(c + 3*rs_c + 1*cs_c ),  
-                                         *(c + 2*rs_c + 1*cs_c ), 
-                                         *(c + 1*rs_c + 1*cs_c ), 
-                                         *(c + 0*rs_c + 1*cs_c ) );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va0_3b1);
-		// Scale by beta
-		vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 );
-		// Add gemm result
-		vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp );
-		// Store back to memory
-		//_mm256_store_pd( c01, vc0_3_1 );
-	
-		aa = _mm256_extractf128_pd( vc0_3_1, 0 ) ;
-		bb = _mm256_extractf128_pd( vc0_3_1, 1 ) ;
-
-		_mm_storel_pd( c + 0*rs_c + 1*cs_c, aa );
-		_mm_storeh_pd( c + 1*rs_c + 1*cs_c, aa );
-		_mm_storel_pd( c + 2*rs_c + 1*cs_c, bb );
-		_mm_storeh_pd( c + 3*rs_c + 1*cs_c, bb );
-
-		// Calculate address
-		c41 = ( c + 4*rs_c + 1*cs_c );
-		// Load
-		//vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c  );
-		vc4_7_1 = _mm256_set_pd( *(c + 7*rs_c + 1*cs_c ),  
-                                         *(c + 6*rs_c + 1*cs_c ), 
-                                         *(c + 5*rs_c + 1*cs_c ), 
-                                         *(c + 4*rs_c + 1*cs_c ) );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va4_7b1);
-		// Scale by beta
-		vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 );
-		// Add gemm result
-		vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp );
-		// Store back to memory
-		//_mm256_store_pd( c41, vc4_7_1 );
-	
-		aa = _mm256_extractf128_pd( vc4_7_1, 0 ) ;
-		bb = _mm256_extractf128_pd( vc4_7_1, 1 ) ;
-
-		_mm_storel_pd( c + 4*rs_c + 1*cs_c, aa );
-		_mm_storeh_pd( c + 5*rs_c + 1*cs_c, aa );
-		_mm_storel_pd( c + 6*rs_c + 1*cs_c, bb );
-		_mm_storeh_pd( c + 7*rs_c + 1*cs_c, bb );
-
-		// Calculate address
-		c02 = ( c + 0*rs_c + 2*cs_c );
-		// Load
-		//vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c  );
-		vc0_3_2 = _mm256_set_pd( *(c + 3*rs_c + 2*cs_c ),  
-                                         *(c + 2*rs_c + 2*cs_c ), 
-                                         *(c + 1*rs_c + 2*cs_c ), 
-                                         *(c + 0*rs_c + 2*cs_c ) );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va0_3b2);
-		// Scale by beta
-		vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 );
-		// Add gemm result
-		vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp );
-		// Store back to memory
-		//_mm256_store_pd( c02, vc0_3_2 );
-	
-		aa = _mm256_extractf128_pd( vc0_3_2, 0 ) ;
-		bb = _mm256_extractf128_pd( vc0_3_2, 1 ) ;
-
-		_mm_storel_pd( c + 0*rs_c + 2*cs_c, aa );
-		_mm_storeh_pd( c + 1*rs_c + 2*cs_c, aa );
-		_mm_storel_pd( c + 2*rs_c + 2*cs_c, bb );
-		_mm_storeh_pd( c + 3*rs_c + 2*cs_c, bb );
-
-		// Calculate address
-		c42 = ( c + 4*rs_c + 2*cs_c );
-		// Load
-		//vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c  );
-		vc4_7_2 = _mm256_set_pd( *(c + 7*rs_c + 2*cs_c ),  
-                                         *(c + 6*rs_c + 2*cs_c ), 
-                                         *(c + 5*rs_c + 2*cs_c ), 
-                                         *(c + 4*rs_c + 2*cs_c ) );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va4_7b2);
-		// Scale by beta
-		vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 );
-		// Add gemm result
-		vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp );
-		// Store back to memory
-		//_mm256_store_pd( c42, vc4_7_2 );
-		
-		aa = _mm256_extractf128_pd( vc4_7_2, 0 ) ;
-		bb = _mm256_extractf128_pd( vc4_7_2, 1 ) ;
-
-		_mm_storel_pd( c + 4*rs_c + 2*cs_c, aa );
-		_mm_storeh_pd( c + 5*rs_c + 2*cs_c, aa );
-		_mm_storel_pd( c + 6*rs_c + 2*cs_c, bb );
-		_mm_storeh_pd( c + 7*rs_c + 2*cs_c, bb );
-
-		// Calculate address
-		c03 = ( c + 0*rs_c + 3*cs_c );
-		// Load
-		//vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c  );
-		vc0_3_3 = _mm256_set_pd( *(c + 3*rs_c + 3*cs_c ),  
-                                         *(c + 2*rs_c + 3*cs_c ), 
-                                         *(c + 1*rs_c + 3*cs_c ), 
-                                         *(c + 0*rs_c + 3*cs_c ) );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va0_3b3);
-		// Scale by beta
-		vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 );
-		// Add gemm result
-		vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp );
-		// Store back to memory
-		//_mm256_store_pd( c03, vc0_3_3 );
-	
-		aa = _mm256_extractf128_pd( vc0_3_3, 0 ) ;
-		bb = _mm256_extractf128_pd( vc0_3_3, 1 ) ;
-
-		_mm_storel_pd( c + 0*rs_c + 3*cs_c, aa );
-		_mm_storeh_pd( c + 1*rs_c + 3*cs_c, aa );
-		_mm_storel_pd( c + 2*rs_c + 3*cs_c, bb );
-		_mm_storeh_pd( c + 3*rs_c + 3*cs_c, bb );
-
-		// Calculate address
-		c43 = ( c + 4*rs_c + 3*cs_c );
-		// Load
-		//vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c  );
-		vc4_7_3 = _mm256_set_pd( *(c + 7*rs_c + 3*cs_c ),  
-                                         *(c + 6*rs_c + 3*cs_c ), 
-                                         *(c + 5*rs_c + 3*cs_c ), 
-                                         *(c + 4*rs_c + 3*cs_c ) );
-		// Scale by alpha
-		vtmp = _mm256_mul_pd( valpha, va4_7b3);
-		// Scale by beta
-		vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 );
-		// Add gemm result
-		vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp );
-		// Store back to memory
-		//_mm256_store_pd( c43, vc4_7_3 );
-
-		aa = _mm256_extractf128_pd( vc4_7_3, 0 ) ;
-		bb = _mm256_extractf128_pd( vc4_7_3, 1 ) ;
-
-		_mm_storel_pd( c + 4*rs_c + 3*cs_c, aa );
-		_mm_storeh_pd( c + 5*rs_c + 3*cs_c, aa );
-		_mm_storel_pd( c + 6*rs_c + 3*cs_c, bb );
-		_mm_storeh_pd( c + 7*rs_c + 3*cs_c, bb );
-	}
-
-}
-
-
-
-void bli_cgemm_opt_8x4_ref_u4_nodupl_avx1(
-                        dim_t              k,
-                        scomplex* restrict alpha,
-                        scomplex* restrict a,
-                        scomplex* restrict b,
-                        scomplex* restrict beta,
-                        scomplex* restrict c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t*         data
-                      )
-{
-	/* Just call the reference implementation. */
-	BLIS_CGEMM_UKERNEL_REF( k,
-	                   alpha,
-	                   a,
-	                   b,
-	                   beta,
-	                   c, rs_c, cs_c,
-	                   data );
-}
-
-
-
-void bli_zgemm_opt_8x4_ref_u4_nodupl_avx1(
-                        dim_t              k,
-                        dcomplex* restrict alpha,
-                        dcomplex* restrict a,
-                        dcomplex* restrict b,
-                        dcomplex* restrict beta,
-                        dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t*         data
-                      )
-{
-	/* Just call the reference implementation. */
-	BLIS_ZGEMM_UKERNEL_REF( k,
-	                   alpha,
-	                   a,
-	                   b,
-	                   beta,
-	                   c, rs_c, cs_c,
-	                   data );
-}
-
--- a/test/Makefile
+++ b/test/Makefile
@@ -126,7 +126,7 @@ BLIS_LIB       := $(BLIS_LIB_PATH)/libblis.a

 # BLAS library path(s). This is where the BLAS libraries reside.
 BLAS_LIB_PATH  := $(HOME)/flame/lib
-MKL_LIB_PATH   := $(HOME)/intel/mkl/lib/intel64/
+MKL_LIB_PATH   := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
 ESSL_LIB_PATH  := $(HOME)/path/to/essl/changeme

 # OpenBLAS
@@ -174,7 +174,7 @@ TEST_OBJS      := $(patsubst $(TEST_SRC_PATH)/%.c, \
 CFLAGS         += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)

 LINKER         := $(CC)
-LDFLAGS        := -L/home/00146/field/gnu/gcc-4.8.2/lib64
+LDFLAGS        := #-L/home/00146/field/gnu/gcc-4.8.2/lib64
 LDFLAGS        += -lgfortran -lm -lpthread


@@ -187,7 +187,7 @@ LDFLAGS        += -lgfortran -lm -lpthread
 #
 #   blis openblas atlas mkl mac essl
 #
-all: blis openblas atlas mkl
+all: blis openblas #mkl

 blis: test_gemv_blis.x \
      test_ger_blis.x \
--- a/test/test_gemm.c
+++ b/test/test_gemm.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2014, The University of Texas

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -14,9 +14,9 @@
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas at Austin nor the names
-      of its contributors may be used to endorse or promote products
-      derived from this software without specific prior written permission.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -81,7 +81,11 @@ int main( int argc, char** argv )
 #endif

 #if 1
-	dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DOUBLE;
+	dt_a = BLIS_DOUBLE;
+	dt_b = BLIS_DOUBLE;
+	dt_c = BLIS_DOUBLE;
+	dt_alpha = BLIS_DOUBLE;
+	dt_beta = BLIS_DOUBLE;
 #else
 	dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX;
 #endif
--- a/testsuite/input.general
+++ b/testsuite/input.general
@@ -8,7 +8,7 @@
 #  accepted values.
 #

-1       # Number of repeats per experiment (best result is reported)
+3       # Number of repeats per experiment (best result is reported)
 c       # Matrix storage scheme(s) to test:
        #   'c' = col-major storage; 'g' = general stride storage;
        #   'r' = row-major storage
@@ -17,12 +17,12 @@ c       # Vector storage scheme(s) to test:
        #   'r' = rowvec / unit stride; 'i' = rowvec / non-unit stride
 0       # Test all combinations of storage schemes?
 32      # General stride spacing (for cases when testing general stride)
-sdcz    # Datatype(s) to test:
+s #sdcz    # Datatype(s) to test:
        #   's' = single real; 'c' = single complex;
        #   'd' = double real; 'z' = double complex
-100     # Problem size: first to test
-300     # Problem size: maximum to test
-100     # Problem size: increment between experiments
+128     # Problem size: first to test
+2048    # Problem size: maximum to test
+128     # Problem size: increment between experiments
 1       # Error-checking level:
        #   '0' = disable error checking; '1' = full error checking
 i       # Reaction to test failure:
--- a/testsuite/input.operations
+++ b/testsuite/input.operations
@@ -78,11 +78,11 @@
 # --- Section overrides ----------------------------------------------------

 1        # Utility
-1        # Level-1v
-1        # Level-1m
-1        # Level-1f kernels
-1        # Level-2
-1        # Level-3 micro-kernels
+0        # Level-1v
+0        # Level-1m
+0        # Level-1f kernels
+0        # Level-2
+0        # Level-3 micro-kernels
 1        # Level-3


@@ -291,49 +291,49 @@
 1        # gemm
 1        #   test sequential front-end
 -1 -1 -1 #   dimensions: m n k
-??       #   parameters: transa transb
+nn       #   parameters: transa transb

-1        # hemm
+0        # hemm
 1        #   test sequential front-end
 -1 -1    #   dimensions: m n
 ????     #   parameters: side uploa conja transb

-1        # herk
+0        # herk
 1        #   test sequential front-end
 -1 -1    #   dimensions: m k
 ??       #   parameters: uploc transa

-1        # her2k
+0        # her2k
 1        #   test sequential front-end
 -1 -1    #   dimensions: m k
 ???      #   parameters: uploc transa transb

-1        # symm
+0        # symm
 1        #   test sequential front-end
 -1 -1    #   dimensions: m n
 ????     #   parameters: side uploa conja transb

-1        # syrk
+0        # syrk
 1        #   test sequential front-end
 -1 -1    #   dimensions: m k
 ??       #   parameters: uploc transa

-1        # syr2k
+0        # syr2k
 1        #   test sequential front-end
 -1 -1    #   dimensions: m k
 ???      #   parameters: uploc transa transb

-1        # trmm
+0        # trmm
 1        #   test sequential front-end
 -1 -1    #   dimensions: m n
 ????     #   parameters: side uploa transa diaga

-1        # trmm3
+0        # trmm3
 1        #   test sequential front-end
 -1 -1    #   dimensions: m n
 ?????    #   parameters: side uploa transa diaga transb

-1        # trsm
+0        # trsm
 1        #   test sequential front-end
 -1 -1    #   dimensions: m n
 ????     #   parameters: side uploa transa diaga
--- a/testsuite/src/test_gemm.c
+++ b/testsuite/src/test_gemm.c
@@ -178,8 +178,10 @@ void libblis_test_gemm_experiment( test_params_t* params,
 	}
 	else
 	{
-		bli_setsc(  1.2,  0.8, &alpha );
-		bli_setsc( -1.0,  1.0, &beta );
+		//bli_setsc(  1.2,  0.8, &alpha );
+		//bli_setsc( -1.0,  1.0, &beta );
+		bli_setsc(  1.2,  0.0, &alpha );
+		bli_setsc( -1.0,  0.0, &beta );
 	}

 	// Randomize A, B, and C, and save C.