diff --git a/config/clarksville/make_defs.mk b/config/clarksville/make_defs.mk
index c91e0ddef..dbb7155f7 100644
--- a/config/clarksville/make_defs.mk
+++ b/config/clarksville/make_defs.mk
@@ -80,9 +80,9 @@ CC           := gcc
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS  := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS   := -std=c99 # -fopenmp -pg
-CDBGFLAGS    := #-g
+CDBGFLAGS    := -g
 CWARNFLAGS   := -Wall
-COPTFLAGS    := -O2 -fomit-frame-pointer
+COPTFLAGS    := -O2 #-fomit-frame-pointer
 CVECFLAGS    := -msse3 -march=nocona -mfpmath=sse
 
 # Aggregate all of the flags into two groups: one for optimizable code, and
diff --git a/kernels/x86_64/3/bli_gemm_opt_d4x4.c b/kernels/x86_64/3/bli_gemm_opt_d4x4.c
index 4a372d837..93cd0db2b 100644
--- a/kernels/x86_64/3/bli_gemm_opt_d4x4.c
+++ b/kernels/x86_64/3/bli_gemm_opt_d4x4.c
@@ -320,23 +320,12 @@ void bli_dgemm_opt_d4x4(
 		"movddup (%%rbx), %%xmm7         \n\t" // load beta and duplicate
 		"                                \n\t"
 		"                                \n\t"
-		//"movq    %6, %%rcx               \n\t" // load address of c
-		//"movq    %8, %%rdi               \n\t" // load cs_c
-		//"salq    $3, %%rdi               \n\t" // cs_c *= sizeof(double)
-		"                                \n\t"
 		"movq    %7, %%rsi               \n\t" // load rs_c
 		"movq    %%rsi, %%r8             \n\t" // make a copy of rs_c
-		//"salq    $3, %%rsi               \n\t" // rs_c *= sizeof(double)
 		"leaq    (,%%rsi,8), %%rsi       \n\t" // rs_c *= sizeof(double)
 		"                                \n\t"
-		"                                \n\t"
-		"                                \n\t"
-		"                                \n\t"
-		"                                \n\t"
 		"leaq   (%%rcx,%%rsi,2), %%rdx   \n\t" // load address of c + 2*rs_c;
 		"                                \n\t"
-		"                                \n\t"
-		"                                \n\t"
 		"                                \n\t" // xmm8:   xmm9:   xmm10:  xmm11:
 		"                                \n\t" // ( ab01  ( ab00  ( ab03  ( ab02
 		"                                \n\t" //   ab10 )  ab11 )  ab12 )  ab13 )
@@ -368,12 +357,26 @@ void bli_dgemm_opt_d4x4(
 		"                                \n\t" //   ab30 )  ab31 )  ab32 )  ab33 )
 		"                                \n\t"
 		"                                \n\t"
-		"                                \n\t" // assert: c % 16 == 0 && rs_c == 1
+		"                                \n\t"
+		"                                \n\t" // determine if
+		"                                \n\t" //   c % 16 == 0, AND
+		"                                \n\t" //   rs_c == 1
+		"                                \n\t" // ie: aligned and column-stored
 		"                                \n\t"
 		"cmpq       $1, %%r8             \n\t" // set ZF if rs_c == 1.
 		"sete           %%bl             \n\t" // bl = ( ZF == 1 ? 1 : 0 );
 		"testq     $15, %%rcx            \n\t" // set ZF if c & 16 is zero.
 		"setz           %%bh             \n\t" // bh = ( ZF == 1 ? 1 : 0 );
+		"                                \n\t" // and(bl,bh) will reveal result
+		"                                \n\t"
+		"                                \n\t" // now avoid loading C if beta == 0
+		"                                \n\t"
+		//"xorpd     %%xmm0,  %%xmm0       \n\t" // set xmm0 to zero.
+		//"ucomisd   %%xmm0,  %%xmm7       \n\t" // check if beta == 0.
+		//"je      .BETAZERO               \n\t" // if ZF = 1, jump to beta == 0 case
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t" // check if aligned/column-stored
 		"andb     %%bl, %%bh             \n\t" // set ZF if bl & bh == 1.
 		"jne     .COLSTORED              \n\t" // jump to column storage case
 		"                                \n\t"
@@ -381,8 +384,6 @@ void bli_dgemm_opt_d4x4(
 		"                                \n\t"
 		".GENSTORED:                     \n\t"
 		"                                \n\t"
-		"                                \n\t"
-		"                                \n\t"
 		"movlpd  (%%rcx),       %%xmm0   \n\t" // load c00 and c10,
 		"movhpd  (%%rcx,%%rsi), %%xmm0   \n\t"
 		"mulpd   %%xmm6,  %%xmm8         \n\t" // scale by alpha,
@@ -450,7 +451,7 @@ void bli_dgemm_opt_d4x4(
 		"addpd  %%xmm11,  %%xmm0         \n\t" // add the gemm result,
 		"movlpd  %%xmm0,  (%%rcx)        \n\t" // and store back to memory.
 		"movhpd  %%xmm0,  (%%rcx,%%rsi)  \n\t"
-		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t"
 		"                                \n\t"
 		"movlpd  (%%rdx),       %%xmm1   \n\t" // load c23 and c33,
 		"movhpd  (%%rdx,%%rsi), %%xmm1   \n\t"
@@ -460,8 +461,6 @@ void bli_dgemm_opt_d4x4(
 		"movlpd  %%xmm1,  (%%rdx)        \n\t" // and store back to memory.
 		"movhpd  %%xmm1,  (%%rdx,%%rsi)  \n\t"
 		"                                \n\t"
-		"                                \n\t"
-		"                                \n\t"
 		"jmp    .DONE                    \n\t" // jump to end.
 		"                                \n\t"
 		"                                \n\t"
@@ -521,7 +520,7 @@ void bli_dgemm_opt_d4x4(
 		"mulpd   %%xmm7,  %%xmm0         \n\t" // scale by beta,
 		"addpd  %%xmm11,  %%xmm0         \n\t" // add the gemm result,
 		"movaps  %%xmm0,  (%%rcx)        \n\t" // and store back to memory.
-		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t"
 		"                                \n\t"
 		"movaps  (%%rdx),       %%xmm1   \n\t" // load c23 and c33,
 		"mulpd   %%xmm6,  %%xmm15        \n\t" // scale by alpha,
@@ -529,6 +528,114 @@ void bli_dgemm_opt_d4x4(
 		"addpd  %%xmm15,  %%xmm1         \n\t" // add the gemm result,
 		"movaps  %%xmm1,  (%%rdx)        \n\t" // and store back to memory.
 		"                                \n\t"
+		"jmp    .DONE                    \n\t" // jump to end.
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		".BETAZERO:                      \n\t"
+		"                                \n\t" // check if aligned/column-stored
+		"andb     %%bl, %%bh             \n\t" // set ZF if bl & bh == 1.
+		"jne     .COLSTORBZ              \n\t" // jump to column storage case
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		".GENSTORBZ:                     \n\t"
+		"                                \n\t" // skip loading c00 and c10,
+		"mulpd   %%xmm6,  %%xmm8         \n\t" // scale by alpha,
+		"movlpd  %%xmm8,  (%%rcx)        \n\t" // and store back to memory.
+		"movhpd  %%xmm8,  (%%rcx,%%rsi)  \n\t"
+		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t" // skip loading c20 and c30,
+		"mulpd   %%xmm6,  %%xmm12        \n\t" // scale by alpha,
+		"movlpd  %%xmm12, (%%rdx)        \n\t" // and store back to memory.
+		"movhpd  %%xmm12, (%%rdx,%%rsi)  \n\t"
+		"addq     %%rdi, %%rdx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t" // skip loading c01 and c11,
+		"mulpd   %%xmm6,  %%xmm9         \n\t" // scale by alpha,
+		"movlpd  %%xmm9,  (%%rcx)        \n\t" // and store back to memory.
+		"movhpd  %%xmm9,  (%%rcx,%%rsi)  \n\t"
+		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t" // skip loading c21 and c31,
+		"mulpd   %%xmm6,  %%xmm13        \n\t" // scale by alpha,
+		"movlpd  %%xmm13, (%%rdx)        \n\t" // and store back to memory.
+		"movhpd  %%xmm13, (%%rdx,%%rsi)  \n\t"
+		"addq     %%rdi, %%rdx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t" // skip loading c02 and c12,
+		"mulpd   %%xmm6,  %%xmm10        \n\t" // scale by alpha,
+		"movlpd  %%xmm10, (%%rcx)        \n\t" // and store back to memory.
+		"movhpd  %%xmm10, (%%rcx,%%rsi)  \n\t"
+		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t" // skip loading c22 and c32,
+		"mulpd   %%xmm6,  %%xmm14        \n\t" // scale by alpha,
+		"movlpd  %%xmm14, (%%rdx)        \n\t" // and store back to memory.
+		"movhpd  %%xmm14, (%%rdx,%%rsi)  \n\t"
+		"addq     %%rdi, %%rdx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t" // skip loading c03 and c13,
+		"mulpd   %%xmm6,  %%xmm11        \n\t" // scale by alpha,
+		"movlpd  %%xmm11, (%%rcx)        \n\t" // and store back to memory.
+		"movhpd  %%xmm11, (%%rcx,%%rsi)  \n\t"
+		"                                \n\t"
+		"                                \n\t" // skip loading c23 and c33,
+		"mulpd   %%xmm6,  %%xmm15        \n\t" // scale by alpha,
+		"movlpd  %%xmm15, (%%rdx)        \n\t" // and store back to memory.
+		"movhpd  %%xmm15, (%%rdx,%%rsi)  \n\t"
+		"                                \n\t"
+		"jmp    .DONE                    \n\t" // jump to end.
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		".COLSTORBZ:                     \n\t"
+		"                                \n\t"
+		"                                \n\t" // skip loading c00 and c10,
+		"mulpd   %%xmm6,  %%xmm8         \n\t" // scale by alpha,
+		"movaps  %%xmm8,  (%%rcx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t" // skip loading c20 and c30,
+		"mulpd   %%xmm6,  %%xmm12        \n\t" // scale by alpha,
+		"movaps  %%xmm12, (%%rdx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rdx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t" // skip loading c01 and c11,
+		"mulpd   %%xmm6,  %%xmm9         \n\t" // scale by alpha,
+		"movaps  %%xmm9,  (%%rcx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t" // skip loading c21 and c31,
+		"mulpd   %%xmm6,  %%xmm13        \n\t" // scale by alpha,
+		"movaps  %%xmm13, (%%rdx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rdx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t" // skip loading c02 and c12,
+		"mulpd   %%xmm6,  %%xmm10        \n\t" // scale by alpha,
+		"movaps  %%xmm10, (%%rcx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t" // skip loading c22 and c32,
+		"mulpd   %%xmm6,  %%xmm14        \n\t" // scale by alpha,
+		"movaps  %%xmm14, (%%rdx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rdx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t" // skip loading c03 and c13,
+		"mulpd   %%xmm6,  %%xmm11        \n\t" // scale by alpha,
+		"movaps  %%xmm11, (%%rcx)        \n\t" // and store back to memory.
+		"                                \n\t"
+		"                                \n\t" // skip loading c23 and c33,
+		"mulpd   %%xmm6,  %%xmm15        \n\t" // scale by alpha,
+		"movaps  %%xmm15, (%%rdx)        \n\t" // and store back to memory.
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
 		"                                \n\t"
 		"                                \n\t"
 		".DONE:                          \n\t"
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index d001d42bb..afae782e5 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -978,7 +978,34 @@ void libblis_test_op_driver( test_params_t* params,
 		// Only run combinations where all operands of either type (matrices
 		// or vectors) are stored in one storage scheme or another (no mixing
 		// of schemes within the same operand type).
-		n_store_combos = n_mstorage * n_vstorage;
+		unsigned int n_mat_operands = 0;
+		unsigned int n_vec_operands = 0;
+
+		for ( o = 0; o < n_operands; ++o )
+		{
+			operand_t operand_type
+			          = libblis_test_get_operand_type_for_char( o_types[o] );
+			if      ( operand_type == BLIS_TEST_MATRIX_OPERAND ) ++n_mat_operands;
+			else if ( operand_type == BLIS_TEST_VECTOR_OPERAND ) ++n_vec_operands;
+		}
+
+		// We compute the total number of storage combinations based on whether
+		// the current operation has only matrix operands, only vector operands,
+		// or both.
+		if      ( n_vec_operands == 0 )
+		{
+			n_store_combos = n_mstorage;
+			n_vstorage = 1;
+		}
+		else if ( n_mat_operands == 0 )
+		{
+			n_store_combos = n_vstorage;
+			n_mstorage = 1;
+		}
+		else
+		{
+			n_store_combos = n_mstorage * n_vstorage;
+		}
 
 		sc_str = ( char** ) malloc( n_store_combos * sizeof( char* ) );