From d352c746e5683037d41b5061dfb5ce08e1d0843b Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 27 Aug 2013 13:41:46 -0500
Subject: [PATCH] Added single/real gemm micro-kernel for x86_64.

Details:
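- Added a single-precision real gemm micro-kernel in
  kernels/x86_64/3/bli_gemm_opt_d4x4.c, and prefixed the assembly labels of
  the existing double-precision kernel with "D" (the new kernel's labels
  are prefixed with "S").
- Adjusted the single-precision real register blocksizes in
  config/clarksville/bli_kernel.h to be 8x4, raised BLIS_DEFAULT_MC_S from
  256 to 768, and set BLIS_EXTEND_MC_D and BLIS_EXTEND_KC_D to 0.
- Added a missing comment to bli_packm_blk_var2.c that was present in
  bli_packm_blk_var3.c.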
---
 config/clarksville/bli_kernel.h      |   8 +-
 frame/1m/packm/bli_packm_blk_var2.c  |   3 +
 kernels/x86_64/3/bli_gemm_opt_d4x4.c | 714 ++++++++++++++++++++++++++-
 3 files changed, 698 insertions(+), 27 deletions(-)

diff --git a/config/clarksville/bli_kernel.h b/config/clarksville/bli_kernel.h
index 8992df6a7..dca826c64 100644
--- a/config/clarksville/bli_kernel.h
+++ b/config/clarksville/bli_kernel.h
@@ -54,7 +54,7 @@
 //     (b) NR (for triangular operations such as trmm and trsm).
 // 
 
-#define BLIS_DEFAULT_MC_S              256
+#define BLIS_DEFAULT_MC_S              768
 #define BLIS_DEFAULT_KC_S              256
 #define BLIS_DEFAULT_NC_S              8192
 
@@ -82,8 +82,8 @@
 #define BLIS_EXTEND_KC_S               0 //(BLIS_DEFAULT_KC_S/4)
 #define BLIS_EXTEND_NC_S               0 //(BLIS_DEFAULT_NC_S/4)
 
-#define BLIS_EXTEND_MC_D               (BLIS_DEFAULT_MC_D/2)
-#define BLIS_EXTEND_KC_D               (BLIS_DEFAULT_KC_D/2)
+#define BLIS_EXTEND_MC_D               0 //(BLIS_DEFAULT_MC_D/2)
+#define BLIS_EXTEND_KC_D               0 //(BLIS_DEFAULT_KC_D/2)
 #define BLIS_EXTEND_NC_D               0 //(BLIS_DEFAULT_NC_D/4)
 
 #define BLIS_EXTEND_MC_C               0 //(BLIS_DEFAULT_MC_C/4)
@@ -100,7 +100,7 @@
 // in the m and n dimensions should all be equal to the size expected by
 // the reference micro-kernel(s).
 
-#define BLIS_DEFAULT_MR_S              4
+#define BLIS_DEFAULT_MR_S              8
 #define BLIS_DEFAULT_NR_S              4
 
 #define BLIS_DEFAULT_MR_D              4
diff --git a/frame/1m/packm/bli_packm_blk_var2.c b/frame/1m/packm/bli_packm_blk_var2.c
index c9e394b80..6a1b15cf3 100644
--- a/frame/1m/packm/bli_packm_blk_var2.c
+++ b/frame/1m/packm/bli_packm_blk_var2.c
@@ -260,6 +260,9 @@ void PASTEMAC(ch,varname )( \
 		{ \
 			diagoffc_i_abs = bli_abs( diagoffc_i ); \
 \
+			/* Sanity check. Diagonals should not intersect the short end of
+			   a micro-panel, but we can probably still support those cases if
+			   it happens. */ \
 			if ( ( bli_is_col_stored( rs_p, cs_p ) && diagoffc_i < 0 ) || \
 			     ( bli_is_row_stored( rs_p, cs_p ) && diagoffc_i > 0 ) ) \
 				bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
diff --git a/kernels/x86_64/3/bli_gemm_opt_d4x4.c b/kernels/x86_64/3/bli_gemm_opt_d4x4.c
index fcf8f76cd..f133d8e64 100644
--- a/kernels/x86_64/3/bli_gemm_opt_d4x4.c
+++ b/kernels/x86_64/3/bli_gemm_opt_d4x4.c
@@ -45,7 +45,675 @@ void bli_sgemm_opt_d4x4(
                          float* restrict    b_next
                        )
 {
-	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+	dim_t   k_iter; // trip count of the 4x-unrolled main loop.
+	dim_t   k_left; // trip count of the one-step edge loop.
+
+	k_iter  = k / 4;
+	k_left  = k % 4;
+
+	__asm__ volatile
+	(
+		"                                \n\t" // register roles set up below: rax = a, rbx = b, r9 = b_next,
+		"                                \n\t" // rcx = c, rdi = cs_c in bytes, r10 = c + 2*cs_c, rsi = loop counter.
+		"movq          %2, %%rax         \n\t" // load address of a.
+		"movq          %3, %%rbx         \n\t" // load address of b.
+		"movq          %9, %%r9          \n\t" // load address of b_next.
+		"                                \n\t"
+		"subq    $-8 * 16, %%rax         \n\t" // bias a and b by +128 bytes so that the loads in the
+		"subq    $-8 * 16, %%rbx         \n\t" // unrolled iterations use offsets -8*16 .. -1*16.
+		"                                \n\t"
+		"movaps  -8 * 16(%%rax), %%xmm0  \n\t" // initialize loop by pre-loading elements
+		"movaps  -7 * 16(%%rax), %%xmm1  \n\t" // of a (8 floats: xmm0, xmm1) and b (4 floats: xmm2).
+		"movaps  -8 * 16(%%rbx), %%xmm2  \n\t" // movaps requires these addresses to be 16-byte aligned.
+		"                                \n\t"
+		"movq          %6, %%rcx         \n\t" // load address of c
+		"movq          %8, %%rdi         \n\t" // load cs_c
+		"leaq        (,%%rdi,4), %%rdi   \n\t" // cs_c *= sizeof(float)
+		"leaq   (%%rcx,%%rdi,2), %%r10   \n\t" // load address of c + 2*cs_c;
+		"                                \n\t"
+		"prefetcht2   0 * 4(%%r9)        \n\t" // prefetch b_next
+		"                                \n\t"
+		"xorpd     %%xmm3,  %%xmm3       \n\t" // zero xmm3-xmm6: the loops begin by adding these
+		"xorpd     %%xmm4,  %%xmm4       \n\t" // registers (products of the previous step) into the
+		"xorpd     %%xmm5,  %%xmm5       \n\t" // accumulators, so they must start out as zero.
+		"xorpd     %%xmm6,  %%xmm6       \n\t" // (xorpd is a bitwise xor, so it also zeroes float lanes.)
+		"                                \n\t"
+		"prefetcht0   6 * 4(%%rcx)       \n\t" // prefetch c + 0*cs_c
+		"xorpd     %%xmm8,  %%xmm8       \n\t" // zero accumulator xmm8 ...
+		"movaps    %%xmm8,  %%xmm9       \n\t" // ... and copy the zero into xmm9-xmm15.
+		"prefetcht0   6 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
+		"movaps    %%xmm8, %%xmm10       \n\t"
+		"movaps    %%xmm8, %%xmm11       \n\t"
+		"prefetcht0   6 * 4(%%r10)       \n\t" // prefetch c + 2*cs_c
+		"movaps    %%xmm8, %%xmm12       \n\t"
+		"movaps    %%xmm8, %%xmm13       \n\t"
+		"prefetcht0   6 * 4(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c
+		"movaps    %%xmm8, %%xmm14       \n\t"
+		"movaps    %%xmm8, %%xmm15       \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"movq      %0, %%rsi             \n\t" // i = k_iter;
+		"testq  %%rsi, %%rsi             \n\t" // check i via logical AND.
+		"je     .SCONSIDKLEFT            \n\t" // if i == 0, jump to code that
+		"                                \n\t" // contains the k_left loop.
+		"                                \n\t"
+		"                                \n\t"
+		".SLOOPKITER:                    \n\t" // MAIN LOOP
+		"                                \n\t" // each trip performs four k-steps on 8 floats of a and 4 floats of b.
+		"prefetcht0  (4*35+1) * 8(%%rax) \n\t" // prefetch a at byte offset 1128 from the (biased) rax.
+		"                                \n\t"
+		"addps   %%xmm6, %%xmm10         \n\t" // iteration 0
+		"addps   %%xmm3, %%xmm14         \n\t" // (first add products left over from the previous step.)
+		"movaps  %%xmm2, %%xmm3          \n\t" // copy b so it can be multiplied by both halves of a.
+		"pshufd   $0x39, %%xmm2, %%xmm7  \n\t" // xmm7 = b rotated by one lane (b1 b2 b3 b0).
+		"mulps   %%xmm0, %%xmm2          \n\t" // a0..a3 * b            (accumulated into xmm8)
+		"mulps   %%xmm1, %%xmm3          \n\t" // a4..a7 * b            (accumulated into xmm12)
+		"                                \n\t"
+		"addps   %%xmm4, %%xmm11         \n\t"
+		"addps   %%xmm5, %%xmm15         \n\t"
+		"movaps  %%xmm7, %%xmm5          \n\t"
+		"pshufd   $0x39, %%xmm7, %%xmm6  \n\t" // xmm6 = b rotated by two lanes.
+		"mulps   %%xmm0, %%xmm7          \n\t" // a0..a3 * rot1(b)      (accumulated into xmm9)
+		"mulps   %%xmm1, %%xmm5          \n\t" // a4..a7 * rot1(b)      (accumulated into xmm13)
+		"                                \n\t"
+		"addps   %%xmm2, %%xmm8          \n\t"
+		"movaps  -7 * 16(%%rbx), %%xmm2  \n\t" // load b for the next step.
+		"addps   %%xmm3, %%xmm12         \n\t"
+		"movaps  %%xmm6, %%xmm3          \n\t"
+		"pshufd   $0x39, %%xmm6, %%xmm4  \n\t" // xmm4 = b rotated by three lanes.
+		"mulps   %%xmm0, %%xmm6          \n\t" // a0..a3 * rot2(b)      (added to xmm10 in the next step)
+		"mulps   %%xmm1, %%xmm3          \n\t" // a4..a7 * rot2(b)      (added to xmm14 in the next step)
+		"                                \n\t"
+		"addps   %%xmm7, %%xmm9          \n\t"
+		"addps   %%xmm5, %%xmm13         \n\t"
+		"movaps  %%xmm4, %%xmm5          \n\t"
+		"mulps   %%xmm0, %%xmm4          \n\t" // a0..a3 * rot3(b)      (added to xmm11 in the next step)
+		"movaps  -6 * 16(%%rax), %%xmm0  \n\t" // load a0..a3 for the next step.
+		"mulps   %%xmm1, %%xmm5          \n\t" // a4..a7 * rot3(b)      (added to xmm15 in the next step)
+		"movaps  -5 * 16(%%rax), %%xmm1  \n\t" // load a4..a7 for the next step.
+		"                                \n\t"
+		"                                \n\t"
+		"addps   %%xmm6, %%xmm10         \n\t" // iteration 1
+		"addps   %%xmm3, %%xmm14         \n\t"
+		"movaps  %%xmm2, %%xmm3          \n\t"
+		"pshufd   $0x39, %%xmm2, %%xmm7  \n\t"
+		"mulps   %%xmm0, %%xmm2          \n\t"
+		"mulps   %%xmm1, %%xmm3          \n\t"
+		"                                \n\t"
+		"addps   %%xmm4, %%xmm11         \n\t"
+		"addps   %%xmm5, %%xmm15         \n\t"
+		"movaps  %%xmm7, %%xmm5          \n\t"
+		"pshufd   $0x39, %%xmm7, %%xmm6  \n\t"
+		"mulps   %%xmm0, %%xmm7          \n\t"
+		"mulps   %%xmm1, %%xmm5          \n\t"
+		"                                \n\t"
+		"addps   %%xmm2, %%xmm8          \n\t"
+		"movaps  -6 * 16(%%rbx), %%xmm2  \n\t"
+		"addps   %%xmm3, %%xmm12         \n\t"
+		"movaps  %%xmm6, %%xmm3          \n\t"
+		"pshufd   $0x39, %%xmm6, %%xmm4  \n\t"
+		"mulps   %%xmm0, %%xmm6          \n\t"
+		"mulps   %%xmm1, %%xmm3          \n\t"
+		"                                \n\t"
+		"addps   %%xmm7, %%xmm9          \n\t"
+		"addps   %%xmm5, %%xmm13         \n\t"
+		"movaps  %%xmm4, %%xmm5          \n\t"
+		"mulps   %%xmm0, %%xmm4          \n\t"
+		"movaps  -4 * 16(%%rax), %%xmm0  \n\t"
+		"mulps   %%xmm1, %%xmm5          \n\t"
+		"movaps  -3 * 16(%%rax), %%xmm1  \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"addps   %%xmm6, %%xmm10         \n\t" // iteration 2
+		"addps   %%xmm3, %%xmm14         \n\t"
+		"movaps  %%xmm2, %%xmm3          \n\t"
+		"pshufd   $0x39, %%xmm2, %%xmm7  \n\t"
+		"mulps   %%xmm0, %%xmm2          \n\t"
+		"mulps   %%xmm1, %%xmm3          \n\t"
+		"                                \n\t"
+		"addps   %%xmm4, %%xmm11         \n\t"
+		"addps   %%xmm5, %%xmm15         \n\t"
+		"movaps  %%xmm7, %%xmm5          \n\t"
+		"pshufd   $0x39, %%xmm7, %%xmm6  \n\t"
+		"mulps   %%xmm0, %%xmm7          \n\t"
+		"mulps   %%xmm1, %%xmm5          \n\t"
+		"                                \n\t"
+		"addps   %%xmm2, %%xmm8          \n\t"
+		"movaps  -5 * 16(%%rbx), %%xmm2  \n\t"
+		"addps   %%xmm3, %%xmm12         \n\t"
+		"movaps  %%xmm6, %%xmm3          \n\t"
+		"pshufd   $0x39, %%xmm6, %%xmm4  \n\t"
+		"mulps   %%xmm0, %%xmm6          \n\t"
+		"mulps   %%xmm1, %%xmm3          \n\t"
+		"                                \n\t"
+		"addps   %%xmm7, %%xmm9          \n\t"
+		"addps   %%xmm5, %%xmm13         \n\t"
+		"movaps  %%xmm4, %%xmm5          \n\t"
+		"mulps   %%xmm0, %%xmm4          \n\t"
+		"movaps  -2 * 16(%%rax), %%xmm0  \n\t"
+		"mulps   %%xmm1, %%xmm5          \n\t"
+		"movaps  -1 * 16(%%rax), %%xmm1  \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"addps   %%xmm6, %%xmm10         \n\t" // iteration 3
+		"addps   %%xmm3, %%xmm14         \n\t"
+		"movaps  %%xmm2, %%xmm3          \n\t"
+		"pshufd   $0x39, %%xmm2, %%xmm7  \n\t"
+		"mulps   %%xmm0, %%xmm2          \n\t"
+		"mulps   %%xmm1, %%xmm3          \n\t"
+		"                                \n\t"
+		"subq  $-4 * 8 * 4, %%rax        \n\t" // a += 4*8 (unroll x mr)
+		"                                \n\t"
+		"addps   %%xmm4, %%xmm11         \n\t"
+		"addps   %%xmm5, %%xmm15         \n\t"
+		"movaps  %%xmm7, %%xmm5          \n\t"
+		"pshufd   $0x39, %%xmm7, %%xmm6  \n\t"
+		"mulps   %%xmm0, %%xmm7          \n\t"
+		"mulps   %%xmm1, %%xmm5          \n\t"
+		"                                \n\t"
+		"subq  $-4 * 4 * 4, %%r9         \n\t" // b_next += 4*4 (unroll x nr)
+		"                                \n\t"
+		"addps   %%xmm2, %%xmm8          \n\t"
+		"movaps  -4 * 16(%%rbx), %%xmm2  \n\t" // b for the next trip (rbx is advanced just below).
+		"addps   %%xmm3, %%xmm12         \n\t"
+		"movaps  %%xmm6, %%xmm3          \n\t"
+		"pshufd   $0x39, %%xmm6, %%xmm4  \n\t"
+		"mulps   %%xmm0, %%xmm6          \n\t"
+		"mulps   %%xmm1, %%xmm3          \n\t"
+		"                                \n\t"
+		"subq  $-4 * 4 * 4, %%rbx        \n\t" // b += 4*4 (unroll x nr)
+		"                                \n\t"
+		"addps   %%xmm7, %%xmm9          \n\t"
+		"addps   %%xmm5, %%xmm13         \n\t"
+		"movaps  %%xmm4, %%xmm5          \n\t"
+		"mulps   %%xmm0, %%xmm4          \n\t"
+		"movaps  -8 * 16(%%rax), %%xmm0  \n\t" // a for the next trip (rax was already advanced above).
+		"mulps   %%xmm1, %%xmm5          \n\t"
+		"movaps  -7 * 16(%%rax), %%xmm1  \n\t"
+		"                                \n\t"
+		"prefetcht2        0 * 4(%%r9)   \n\t" // prefetch b_next[0]
+		"prefetcht2       16 * 4(%%r9)   \n\t" // prefetch b_next[16]
+		"                                \n\t"
+		"                                \n\t"
+		"decq   %%rsi                    \n\t" // i -= 1;
+		"jne    .SLOOPKITER              \n\t" // iterate again if i != 0.
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		".SCONSIDKLEFT:                  \n\t"
+		"                                \n\t"
+		"movq      %1, %%rsi             \n\t" // i = k_left;
+		"testq  %%rsi, %%rsi             \n\t" // check i via logical AND.
+		"je     .SPOSTACCUM              \n\t" // if i == 0, we're done; jump to end.
+		"                                \n\t" // else, we prepare to enter k_left loop.
+		"                                \n\t"
+		"                                \n\t"
+		".SLOOPKLEFT:                    \n\t" // EDGE LOOP
+		"                                \n\t" // one k-step per trip; same instruction sequence as one main-loop iteration.
+		"addps   %%xmm6, %%xmm10         \n\t" // iteration 0
+		"addps   %%xmm3, %%xmm14         \n\t"
+		"movaps  %%xmm2, %%xmm3          \n\t"
+		"pshufd   $0x39, %%xmm2, %%xmm7  \n\t" // rotate b by one lane.
+		"mulps   %%xmm0, %%xmm2          \n\t"
+		"mulps   %%xmm1, %%xmm3          \n\t"
+		"                                \n\t"
+		"addps   %%xmm4, %%xmm11         \n\t"
+		"addps   %%xmm5, %%xmm15         \n\t"
+		"movaps  %%xmm7, %%xmm5          \n\t"
+		"pshufd   $0x39, %%xmm7, %%xmm6  \n\t"
+		"mulps   %%xmm0, %%xmm7          \n\t"
+		"mulps   %%xmm1, %%xmm5          \n\t"
+		"                                \n\t"
+		"addps   %%xmm2, %%xmm8          \n\t"
+		"movaps  -7 * 16(%%rbx), %%xmm2  \n\t" // load b for the next step.
+		"addps   %%xmm3, %%xmm12         \n\t"
+		"movaps  %%xmm6, %%xmm3          \n\t"
+		"pshufd   $0x39, %%xmm6, %%xmm4  \n\t"
+		"mulps   %%xmm0, %%xmm6          \n\t"
+		"mulps   %%xmm1, %%xmm3          \n\t"
+		"                                \n\t"
+		"addps   %%xmm7, %%xmm9          \n\t"
+		"addps   %%xmm5, %%xmm13         \n\t"
+		"movaps  %%xmm4, %%xmm5          \n\t"
+		"mulps   %%xmm0, %%xmm4          \n\t"
+		"movaps  -6 * 16(%%rax), %%xmm0  \n\t" // load a for the next step.
+		"mulps   %%xmm1, %%xmm5          \n\t"
+		"movaps  -5 * 16(%%rax), %%xmm1  \n\t"
+		"                                \n\t"
+		"subq  $-1 * 8 * 4, %%rax        \n\t" // a += 8 (1 x mr)
+		"subq  $-1 * 4 * 4, %%rbx        \n\t" // b += 4 (1 x nr)
+		"                                \n\t"
+		"                                \n\t"
+		"decq   %%rsi                    \n\t" // i -= 1;
+		"jne    .SLOOPKLEFT              \n\t" // iterate again if i != 0.
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		".SPOSTACCUM:                    \n\t"
+		"                                \n\t" // add the products still pending from the
+		"addps   %%xmm6, %%xmm10         \n\t" // last k-step into their accumulators.
+		"addps   %%xmm3, %%xmm14         \n\t"
+		"addps   %%xmm4, %%xmm11         \n\t"
+		"addps   %%xmm5, %%xmm15         \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"movq    %4, %%rax               \n\t" // load address of alpha
+		"movq    %5, %%rbx               \n\t" // load address of beta
+		"movss   (%%rax), %%xmm6         \n\t" // load alpha to bottom 4 bytes of xmm6
+		"movss   (%%rbx), %%xmm7         \n\t" // load beta to bottom 4 bytes of xmm7
+		"pshufd  $0x00, %%xmm6, %%xmm6   \n\t" // populate xmm6 with four alphas
+		"pshufd  $0x00, %%xmm7, %%xmm7   \n\t" // populate xmm7 with four betas
+		"                                \n\t" // (rax and rbx are free for reuse from here on.)
+		"                                \n\t"
+		"movq    %7, %%rsi               \n\t" // load rs_c
+		"movq    %%rsi, %%r8             \n\t" // make a copy of rs_c
+		"                                \n\t"
+		"leaq    (,%%rsi,4), %%rsi       \n\t" // rsi = rs_c * sizeof(float)
+		"leaq    (%%rsi,%%rsi,2), %%r11  \n\t" // r11 = 3*(rs_c * sizeof(float))
+		"                                \n\t"
+		"leaq   (%%rcx,%%rsi,4), %%rdx   \n\t" // load address of c + 4*rs_c;
+		"                                \n\t"
+		"                                \n\t" // xmm8:   xmm9:   xmm10:  xmm11:
+		"                                \n\t" // ( ab00  ( ab01  ( ab02  ( ab03
+		"                                \n\t" //   ab11    ab12    ab13    ab10
+		"                                \n\t" //   ab22    ab23    ab20    ab21
+		"                                \n\t" //   ab33 )  ab30 )  ab31 )  ab32 )
+		"                                \n\t" //
+		"                                \n\t" // xmm12:  xmm13:  xmm14:  xmm15:
+		"                                \n\t" // ( ab40  ( ab41  ( ab42  ( ab43
+		"                                \n\t" //   ab51    ab52    ab53    ab50
+		"                                \n\t" //   ab62    ab63    ab60    ab61
+		"                                \n\t" //   ab73 )  ab70 )  ab71 )  ab72 )
+		"movaps  %%xmm9, %%xmm4          \n\t" // un-rotate the top 4x4 (xmm8-xmm11) into columns.
+		"shufps   $0xd8, %%xmm8,  %%xmm9 \n\t"
+		"shufps   $0xd8, %%xmm11, %%xmm8 \n\t"
+		"shufps   $0xd8, %%xmm10, %%xmm11\n\t"
+		"shufps   $0xd8, %%xmm4,  %%xmm10\n\t"
+		"                                \n\t"
+		"movaps  %%xmm8, %%xmm4          \n\t"
+		"shufps   $0xd8, %%xmm10, %%xmm8 \n\t"
+		"shufps   $0xd8, %%xmm4, %%xmm10 \n\t"
+		"movaps  %%xmm9, %%xmm5          \n\t"
+		"shufps   $0xd8, %%xmm11, %%xmm9 \n\t"
+		"shufps   $0xd8, %%xmm5, %%xmm11 \n\t"
+		"                                \n\t"
+		"movaps  %%xmm13, %%xmm4         \n\t" // same permutation for the bottom 4x4 (xmm12-xmm15).
+		"shufps   $0xd8, %%xmm12, %%xmm13\n\t"
+		"shufps   $0xd8, %%xmm15, %%xmm12\n\t"
+		"shufps   $0xd8, %%xmm14, %%xmm15\n\t"
+		"shufps   $0xd8, %%xmm4,  %%xmm14\n\t"
+		"                                \n\t"
+		"movaps  %%xmm12, %%xmm4         \n\t"
+		"shufps   $0xd8, %%xmm14, %%xmm12\n\t"
+		"shufps   $0xd8, %%xmm4, %%xmm14 \n\t"
+		"movaps  %%xmm13, %%xmm5         \n\t"
+		"shufps   $0xd8, %%xmm15, %%xmm13\n\t"
+		"shufps   $0xd8, %%xmm5, %%xmm15 \n\t"
+		"                                \n\t" // xmm8:   xmm9:   xmm10:  xmm11:
+		"                                \n\t" // ( ab00  ( ab01  ( ab02  ( ab03
+		"                                \n\t" //   ab10    ab11    ab12    ab13
+		"                                \n\t" //   ab20    ab21    ab22    ab23
+		"                                \n\t" //   ab30 )  ab31 )  ab32 )  ab33 )
+		"                                \n\t" //
+		"                                \n\t" // xmm12:  xmm13:  xmm14:  xmm15:
+		"                                \n\t" // ( ab40  ( ab41  ( ab42  ( ab43
+		"                                \n\t" //   ab50    ab51    ab52    ab53
+		"                                \n\t" //   ab60    ab61    ab62    ab63
+		"                                \n\t" //   ab70 )  ab71 )  ab72 )  ab73 )
+		"                                \n\t"
+		"                                \n\t" // the aligned store paths use movaps on every column of c, so
+		"                                \n\t" // the column stride must be 16-byte aligned as well as c itself.
+		"                                \n\t" // determine if
+		"                                \n\t" //   c % 16 == 0, AND
+		"                                \n\t" //   ( cs_c * sizeof(float) ) % 16 == 0, AND
+		"                                \n\t" //   rs_c == 1
+		"                                \n\t" // ie: aligned, aligned column stride, and column-stored
+		"cmpq       $1, %%r8             \n\t" // set ZF if rs_c == 1.
+		"sete           %%bl             \n\t" // bl = ( ZF == 1 ? 1 : 0 );
+		"testq     $15, %%rcx            \n\t" // set ZF if ( c & 15 ) is zero.
+		"setz           %%bh             \n\t" // bh = ( ZF == 1 ? 1 : 0 );
+		"testq     $15, %%rdi            \n\t" // set ZF if ( cs_c*sizeof(float) & 15 ) is zero.
+		"setz           %%al             \n\t" // al = ( ZF == 1 ? 1 : 0 );
+		"andb     %%al, %%bh             \n\t" // bh = ( c aligned AND column stride aligned );
+		"                                \n\t" // and(bl,bh) will reveal result. now avoid loading C if beta == 0
+		"xorps     %%xmm0,  %%xmm0       \n\t" // set xmm0 to zero.
+		"ucomiss   %%xmm0,  %%xmm7       \n\t" // compare beta (single precision) against 0.0f.
+		"jp      .SBETANONZERO           \n\t" // unordered (beta is NaN): use the beta != 0 path.
+		"je      .SBETAZERO              \n\t" // if ZF = 1, jump to beta == 0 case
+		".SBETANONZERO:                  \n\t"
+		"                                \n\t" // check if aligned/column-stored
+		"andb     %%bl, %%bh             \n\t" // bh &= bl; ZF is cleared iff both flags are 1.
+		"jne     .SCOLSTORED             \n\t" // jump to column storage case
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		".SGENSTORED:                    \n\t"
+		"                                \n\t" // general-stride update: rcx walks rows 0-3, rdx rows 4-7, one column at a time.
+		"movlps  (%%rcx        ), %%xmm0 \n\t" // load c00 ~ c30
+		"movhps  (%%rcx,%%rsi,1), %%xmm0 \n\t" // NOTE(review): movlps/movhps each read 8 bytes, i.e. the
+		"movlps  (%%rcx,%%rsi,2), %%xmm1 \n\t" // wanted float plus the 4 bytes that follow it in memory.
+		"movhps  (%%rcx,%%r11  ), %%xmm1 \n\t"
+		"shufps    $0x88, %%xmm1, %%xmm0 \n\t" // keep lanes 0 and 2 of each register: the four wanted floats.
+		"                                \n\t"
+		"mulps   %%xmm6,  %%xmm8         \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm0         \n\t" // scale by beta,
+		"addps   %%xmm8,  %%xmm0         \n\t" // add the gemm result,
+		"                                \n\t"
+		"movss   %%xmm0, (%%rcx        ) \n\t" // and store back to memory.
+		"pshufd    $0x39, %%xmm0, %%xmm1 \n\t" // rotate so the next element is in lane 0.
+		"movss   %%xmm1, (%%rcx,%%rsi,1) \n\t"
+		"pshufd    $0x39, %%xmm1, %%xmm2 \n\t"
+		"movss   %%xmm2, (%%rcx,%%rsi,2) \n\t"
+		"pshufd    $0x39, %%xmm2, %%xmm3 \n\t"
+		"movss   %%xmm3, (%%rcx,%%r11  ) \n\t"
+		"                                \n\t"
+		"addq     %%rdi, %%rcx           \n\t" // advance rcx by one column (cs_c * sizeof(float)).
+		"                                \n\t"
+		"                                \n\t"
+		"movlps  (%%rdx        ), %%xmm0 \n\t" // load c40 ~ c70
+		"movhps  (%%rdx,%%rsi,1), %%xmm0 \n\t"
+		"movlps  (%%rdx,%%rsi,2), %%xmm1 \n\t"
+		"movhps  (%%rdx,%%r11  ), %%xmm1 \n\t"
+		"shufps    $0x88, %%xmm1, %%xmm0 \n\t"
+		"                                \n\t"
+		"mulps   %%xmm6,  %%xmm12        \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm0         \n\t" // scale by beta,
+		"addps   %%xmm12, %%xmm0         \n\t" // add the gemm result,
+		"                                \n\t"
+		"movss   %%xmm0, (%%rdx        ) \n\t" // and store back to memory.
+		"pshufd    $0x39, %%xmm0, %%xmm1 \n\t"
+		"movss   %%xmm1, (%%rdx,%%rsi,1) \n\t"
+		"pshufd    $0x39, %%xmm1, %%xmm2 \n\t"
+		"movss   %%xmm2, (%%rdx,%%rsi,2) \n\t"
+		"pshufd    $0x39, %%xmm2, %%xmm3 \n\t"
+		"movss   %%xmm3, (%%rdx,%%r11  ) \n\t"
+		"                                \n\t"
+		"addq     %%rdi, %%rdx           \n\t" // advance rdx by one column.
+		"                                \n\t"
+		"                                \n\t"
+		"movlps  (%%rcx        ), %%xmm0 \n\t" // load c01 ~ c31
+		"movhps  (%%rcx,%%rsi,1), %%xmm0 \n\t"
+		"movlps  (%%rcx,%%rsi,2), %%xmm1 \n\t"
+		"movhps  (%%rcx,%%r11  ), %%xmm1 \n\t"
+		"shufps    $0x88, %%xmm1, %%xmm0 \n\t"
+		"                                \n\t"
+		"mulps   %%xmm6,  %%xmm9         \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm0         \n\t" // scale by beta,
+		"addps   %%xmm9,  %%xmm0         \n\t" // add the gemm result,
+		"                                \n\t"
+		"movss   %%xmm0, (%%rcx        ) \n\t" // and store back to memory.
+		"pshufd    $0x39, %%xmm0, %%xmm1 \n\t"
+		"movss   %%xmm1, (%%rcx,%%rsi,1) \n\t"
+		"pshufd    $0x39, %%xmm1, %%xmm2 \n\t"
+		"movss   %%xmm2, (%%rcx,%%rsi,2) \n\t"
+		"pshufd    $0x39, %%xmm2, %%xmm3 \n\t"
+		"movss   %%xmm3, (%%rcx,%%r11  ) \n\t"
+		"                                \n\t"
+		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"movlps  (%%rdx        ), %%xmm0 \n\t" // load c41 ~ c71
+		"movhps  (%%rdx,%%rsi,1), %%xmm0 \n\t"
+		"movlps  (%%rdx,%%rsi,2), %%xmm1 \n\t"
+		"movhps  (%%rdx,%%r11  ), %%xmm1 \n\t"
+		"shufps    $0x88, %%xmm1, %%xmm0 \n\t"
+		"                                \n\t"
+		"mulps   %%xmm6,  %%xmm13        \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm0         \n\t" // scale by beta,
+		"addps   %%xmm13, %%xmm0         \n\t" // add the gemm result,
+		"                                \n\t"
+		"movss   %%xmm0, (%%rdx        ) \n\t" // and store back to memory.
+		"pshufd    $0x39, %%xmm0, %%xmm1 \n\t"
+		"movss   %%xmm1, (%%rdx,%%rsi,1) \n\t"
+		"pshufd    $0x39, %%xmm1, %%xmm2 \n\t"
+		"movss   %%xmm2, (%%rdx,%%rsi,2) \n\t"
+		"pshufd    $0x39, %%xmm2, %%xmm3 \n\t"
+		"movss   %%xmm3, (%%rdx,%%r11  ) \n\t"
+		"                                \n\t"
+		"addq     %%rdi, %%rdx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"movlps  (%%rcx        ), %%xmm0 \n\t" // load c02 ~ c32
+		"movhps  (%%rcx,%%rsi,1), %%xmm0 \n\t"
+		"movlps  (%%rcx,%%rsi,2), %%xmm1 \n\t"
+		"movhps  (%%rcx,%%r11  ), %%xmm1 \n\t"
+		"shufps    $0x88, %%xmm1, %%xmm0 \n\t"
+		"                                \n\t"
+		"mulps   %%xmm6,  %%xmm10        \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm0         \n\t" // scale by beta,
+		"addps   %%xmm10, %%xmm0         \n\t" // add the gemm result,
+		"                                \n\t"
+		"movss   %%xmm0, (%%rcx        ) \n\t" // and store back to memory.
+		"pshufd    $0x39, %%xmm0, %%xmm1 \n\t"
+		"movss   %%xmm1, (%%rcx,%%rsi,1) \n\t"
+		"pshufd    $0x39, %%xmm1, %%xmm2 \n\t"
+		"movss   %%xmm2, (%%rcx,%%rsi,2) \n\t"
+		"pshufd    $0x39, %%xmm2, %%xmm3 \n\t"
+		"movss   %%xmm3, (%%rcx,%%r11  ) \n\t"
+		"                                \n\t"
+		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"movlps  (%%rdx        ), %%xmm0 \n\t" // load c42 ~ c72
+		"movhps  (%%rdx,%%rsi,1), %%xmm0 \n\t"
+		"movlps  (%%rdx,%%rsi,2), %%xmm1 \n\t"
+		"movhps  (%%rdx,%%r11  ), %%xmm1 \n\t"
+		"shufps    $0x88, %%xmm1, %%xmm0 \n\t"
+		"                                \n\t"
+		"mulps   %%xmm6,  %%xmm14        \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm0         \n\t" // scale by beta,
+		"addps   %%xmm14, %%xmm0         \n\t" // add the gemm result,
+		"                                \n\t"
+		"movss   %%xmm0, (%%rdx        ) \n\t" // and store back to memory.
+		"pshufd    $0x39, %%xmm0, %%xmm1 \n\t"
+		"movss   %%xmm1, (%%rdx,%%rsi,1) \n\t"
+		"pshufd    $0x39, %%xmm1, %%xmm2 \n\t"
+		"movss   %%xmm2, (%%rdx,%%rsi,2) \n\t"
+		"pshufd    $0x39, %%xmm2, %%xmm3 \n\t"
+		"movss   %%xmm3, (%%rdx,%%r11  ) \n\t"
+		"                                \n\t"
+		"addq     %%rdi, %%rdx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"movlps  (%%rcx        ), %%xmm0 \n\t" // load c03 ~ c33
+		"movhps  (%%rcx,%%rsi,1), %%xmm0 \n\t"
+		"movlps  (%%rcx,%%rsi,2), %%xmm1 \n\t"
+		"movhps  (%%rcx,%%r11  ), %%xmm1 \n\t"
+		"shufps    $0x88, %%xmm1, %%xmm0 \n\t"
+		"                                \n\t"
+		"mulps   %%xmm6,  %%xmm11        \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm0         \n\t" // scale by beta,
+		"addps   %%xmm11, %%xmm0         \n\t" // add the gemm result,
+		"                                \n\t"
+		"movss   %%xmm0, (%%rcx        ) \n\t" // and store back to memory.
+		"pshufd    $0x39, %%xmm0, %%xmm1 \n\t"
+		"movss   %%xmm1, (%%rcx,%%rsi,1) \n\t"
+		"pshufd    $0x39, %%xmm1, %%xmm2 \n\t"
+		"movss   %%xmm2, (%%rcx,%%rsi,2) \n\t"
+		"pshufd    $0x39, %%xmm2, %%xmm3 \n\t"
+		"movss   %%xmm3, (%%rcx,%%r11  ) \n\t"
+		"                                \n\t" // (last column: rcx is not advanced again.)
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"movlps  (%%rdx        ), %%xmm0 \n\t" // load c43 ~ c73
+		"movhps  (%%rdx,%%rsi,1), %%xmm0 \n\t"
+		"movlps  (%%rdx,%%rsi,2), %%xmm1 \n\t"
+		"movhps  (%%rdx,%%r11  ), %%xmm1 \n\t"
+		"shufps    $0x88, %%xmm1, %%xmm0 \n\t"
+		"                                \n\t"
+		"mulps   %%xmm6,  %%xmm15        \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm0         \n\t" // scale by beta,
+		"addps   %%xmm15, %%xmm0         \n\t" // add the gemm result,
+		"                                \n\t"
+		"movss   %%xmm0, (%%rdx        ) \n\t" // and store back to memory.
+		"pshufd    $0x39, %%xmm0, %%xmm1 \n\t"
+		"movss   %%xmm1, (%%rdx,%%rsi,1) \n\t"
+		"pshufd    $0x39, %%xmm1, %%xmm2 \n\t"
+		"movss   %%xmm2, (%%rdx,%%rsi,2) \n\t"
+		"pshufd    $0x39, %%xmm2, %%xmm3 \n\t"
+		"movss   %%xmm3, (%%rdx,%%r11  ) \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"jmp    .SDONE                   \n\t" // jump to end.
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		".SCOLSTORED:                    \n\t"
+		"                                \n\t" // contiguous-column update using aligned 16-byte loads and stores (movaps).
+		"movaps  (%%rcx),       %%xmm0   \n\t" // load c00 ~ c30,
+		"mulps   %%xmm6,  %%xmm8         \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm0         \n\t" // scale by beta,
+		"addps   %%xmm8,  %%xmm0         \n\t" // add the gemm result,
+		"movaps  %%xmm0,  (%%rcx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rcx           \n\t" // advance rcx by one column (cs_c * sizeof(float)).
+		"                                \n\t"
+		"movaps  (%%rdx),       %%xmm1   \n\t" // load c40 ~ c70,
+		"mulps   %%xmm6,  %%xmm12        \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm1         \n\t" // scale by beta,
+		"addps  %%xmm12,  %%xmm1         \n\t" // add the gemm result,
+		"movaps  %%xmm1,  (%%rdx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rdx           \n\t" // advance rdx by one column.
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"movaps  (%%rcx),       %%xmm0   \n\t" // load c01 ~ c31,
+		"mulps   %%xmm6,  %%xmm9         \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm0         \n\t" // scale by beta,
+		"addps   %%xmm9,  %%xmm0         \n\t" // add the gemm result,
+		"movaps  %%xmm0,  (%%rcx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t"
+		"movaps  (%%rdx),       %%xmm1   \n\t" // load c41 ~ c71,
+		"mulps   %%xmm6,  %%xmm13        \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm1         \n\t" // scale by beta,
+		"addps  %%xmm13,  %%xmm1         \n\t" // add the gemm result,
+		"movaps  %%xmm1,  (%%rdx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rdx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"movaps  (%%rcx),       %%xmm0   \n\t" // load c02 ~ c32,
+		"mulps   %%xmm6,  %%xmm10        \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm0         \n\t" // scale by beta,
+		"addps  %%xmm10,  %%xmm0         \n\t" // add the gemm result,
+		"movaps  %%xmm0,  (%%rcx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t"
+		"movaps  (%%rdx),       %%xmm1   \n\t" // load c42 ~ c72,
+		"mulps   %%xmm6,  %%xmm14        \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm1         \n\t" // scale by beta,
+		"addps  %%xmm14,  %%xmm1         \n\t" // add the gemm result,
+		"movaps  %%xmm1,  (%%rdx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rdx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"movaps  (%%rcx),       %%xmm0   \n\t" // load c03 ~ c33,
+		"mulps   %%xmm6,  %%xmm11        \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm0         \n\t" // scale by beta,
+		"addps  %%xmm11,  %%xmm0         \n\t" // add the gemm result,
+		"movaps  %%xmm0,  (%%rcx)        \n\t" // and store back to memory.
+		"                                \n\t"
+		"                                \n\t"
+		"movaps  (%%rdx),       %%xmm1   \n\t" // load c43 ~ c73,
+		"mulps   %%xmm6,  %%xmm15        \n\t" // scale by alpha,
+		"mulps   %%xmm7,  %%xmm1         \n\t" // scale by beta,
+		"addps  %%xmm15,  %%xmm1         \n\t" // add the gemm result,
+		"movaps  %%xmm1,  (%%rdx)        \n\t" // and store back to memory.
+		"                                \n\t"
+		"jmp    .SDONE                   \n\t" // jump to end.
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		".SBETAZERO:                     \n\t"
+		"                                \n\t" // check if aligned/column-stored
+		"andb     %%bl, %%bh             \n\t" // bh &= bl; ZF is cleared iff both flags are 1.
+		"jne     .SCOLSTORBZ             \n\t" // jump to column storage case
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		".SGENSTORBZ:                    \n\t"
+		"                                \n\t" // NOTE: .SGENSTORED still loads c and multiplies it by beta.
+		"jmp    .SGENSTORED              \n\t" // use gen-stored beta != 0 case for now
+		//"jmp    .SDONE                   \n\t" // jump to end.
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		".SCOLSTORBZ:                    \n\t"
+		"                                \n\t" // beta == 0, contiguous columns: write alpha * ab without reading c.
+		"                                \n\t" // skip loading c00 ~ c30,
+		"mulps   %%xmm6,  %%xmm8         \n\t" // scale by alpha,
+		"movaps  %%xmm8,  (%%rcx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rcx           \n\t" // advance rcx by one column (cs_c * sizeof(float)).
+		"                                \n\t" // skip loading c40 ~ c70,
+		"mulps   %%xmm6,  %%xmm12        \n\t" // scale by alpha,
+		"movaps  %%xmm12, (%%rdx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rdx           \n\t" // advance rdx by one column.
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t" // skip loading c01 ~ c31,
+		"mulps   %%xmm6,  %%xmm9         \n\t" // scale by alpha,
+		"movaps  %%xmm9,  (%%rcx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t" // skip loading c41 ~ c71,
+		"mulps   %%xmm6,  %%xmm13        \n\t" // scale by alpha,
+		"movaps  %%xmm13, (%%rdx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rdx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t" // skip loading c02 ~ c32,
+		"mulps   %%xmm6,  %%xmm10        \n\t" // scale by alpha,
+		"movaps  %%xmm10, (%%rcx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rcx           \n\t"
+		"                                \n\t" // skip loading c42 ~ c72,
+		"mulps   %%xmm6,  %%xmm14        \n\t" // scale by alpha,
+		"movaps  %%xmm14, (%%rdx)        \n\t" // and store back to memory.
+		"addq     %%rdi, %%rdx           \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t" // skip loading c03 ~ c33,
+		"mulps   %%xmm6,  %%xmm11        \n\t" // scale by alpha,
+		"movaps  %%xmm11, (%%rcx)        \n\t" // and store back to memory.
+		"                                \n\t"
+		"                                \n\t" // skip loading c43 ~ c73,
+		"mulps   %%xmm6,  %%xmm15        \n\t" // scale by alpha,
+		"movaps  %%xmm15, (%%rdx)        \n\t" // and store back to memory.
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t"
+		"                                \n\t" // falls through to .SDONE.
+		".SDONE:                         \n\t" // common exit label of all store paths.
+		"                                \n\t"
+
+		: // output operands (none)
+		: // input operands
+		  "m" (k_iter), // %0
+		  "m" (k_left), // %1
+		  "m" (a),      // %2
+		  "m" (b),      // %3
+		  "m" (alpha),  // %4
+		  "m" (beta),   // %5
+		  "m" (c),      // %6
+		  "m" (rs_c),   // %7
+		  "m" (cs_c),   // %8
+		  "m" (b_next)  // %9
+		: // register clobber list
+		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11",
+		  "xmm0", "xmm1", "xmm2", "xmm3",
+		  "xmm4", "xmm5", "xmm6", "xmm7",
+		  "xmm8", "xmm9", "xmm10", "xmm11",
+		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "memory" // the asm writes c through a pointer the compiler cannot see.
+	);
 }
 
 void bli_dgemm_opt_d4x4(
@@ -109,11 +777,11 @@ void bli_dgemm_opt_d4x4(
 		"                                \n\t"
 		"movq      %0, %%rsi             \n\t" // i = k_iter;
 		"testq  %%rsi, %%rsi             \n\t" // check i via logical AND.
-		"je     .CONSIDERKLEFT           \n\t" // if i == 0, jump to code that
+		"je     .DCONSIDKLEFT            \n\t" // if i == 0, jump to code that
 		"                                \n\t" // contains the k_left loop.
 		"                                \n\t"
 		"                                \n\t"
-		".LOOPKITER:                     \n\t" // MAIN LOOP
+		".DLOOPKITER:                    \n\t" // MAIN LOOP
 		"                                \n\t"
 		"prefetcht0  (4*35+1) * 8(%%rax) \n\t"
 		"                                \n\t"
@@ -252,19 +920,19 @@ void bli_dgemm_opt_d4x4(
 		"                                \n\t"
 		"                                \n\t"
 		"decq   %%rsi                    \n\t" // i -= 1;
-		"jne    .LOOPKITER               \n\t" // iterate again if i != 0.
+		"jne    .DLOOPKITER              \n\t" // iterate again if i != 0.
 		"                                \n\t"
 		"                                \n\t"
 		"                                \n\t"
-		".CONSIDERKLEFT:                 \n\t"
+		".DCONSIDKLEFT:                  \n\t"
 		"                                \n\t"
 		"movq      %1, %%rsi             \n\t" // i = k_left;
 		"testq  %%rsi, %%rsi             \n\t" // check i via logical AND.
-		"je     .POSTACCUM               \n\t" // if i == 0, we're done; jump to end.
+		"je     .DPOSTACCUM              \n\t" // if i == 0, we're done; jump to end.
 		"                                \n\t" // else, we prepare to enter k_left loop.
 		"                                \n\t"
 		"                                \n\t"
-		".LOOPKLEFT:                     \n\t" // EDGE LOOP
+		".DLOOPKLEFT:                    \n\t" // EDGE LOOP
 		"                                \n\t"
 		"addpd   %%xmm3, %%xmm11         \n\t" // iteration 0
 		"movaps  -7 * 16(%%rbx), %%xmm3  \n\t"
@@ -302,11 +970,11 @@ void bli_dgemm_opt_d4x4(
 		"                                \n\t"
 		"                                \n\t"
 		"decq   %%rsi                    \n\t" // i -= 1;
-		"jne    .LOOPKLEFT               \n\t" // iterate again if i != 0.
+		"jne    .DLOOPKLEFT              \n\t" // iterate again if i != 0.
 		"                                \n\t"
 		"                                \n\t"
 		"                                \n\t"
-		".POSTACCUM:                     \n\t"
+		".DPOSTACCUM:                    \n\t"
 		"                                \n\t"
 		"addpd   %%xmm3, %%xmm11         \n\t"
 		"addpd   %%xmm4, %%xmm15         \n\t"
@@ -322,7 +990,8 @@ void bli_dgemm_opt_d4x4(
 		"                                \n\t"
 		"movq    %7, %%rsi               \n\t" // load rs_c
 		"movq    %%rsi, %%r8             \n\t" // make a copy of rs_c
-		"leaq    (,%%rsi,8), %%rsi       \n\t" // rs_c *= sizeof(double)
+		"                                \n\t"
+		"leaq    (,%%rsi,8), %%rsi       \n\t" // rsi = rs_c * sizeof(double)
 		"                                \n\t"
 		"leaq   (%%rcx,%%rsi,2), %%rdx   \n\t" // load address of c + 2*rs_c;
 		"                                \n\t"
@@ -373,16 +1042,16 @@ void bli_dgemm_opt_d4x4(
 		"                                \n\t"
 		"xorpd     %%xmm0,  %%xmm0       \n\t" // set xmm0 to zero.
 		"ucomisd   %%xmm0,  %%xmm7       \n\t" // check if beta == 0.
-		"je      .BETAZERO               \n\t" // if ZF = 1, jump to beta == 0 case
+		"je      .DBETAZERO              \n\t" // if ZF = 1, jump to beta == 0 case
 		"                                \n\t"
 		"                                \n\t"
 		"                                \n\t" // check if aligned/column-stored
 		"andb     %%bl, %%bh             \n\t" // set ZF if bl & bh == 1.
-		"jne     .COLSTORED              \n\t" // jump to column storage case
+		"jne     .DCOLSTORED             \n\t" // jump to column storage case
 		"                                \n\t"
 		"                                \n\t"
 		"                                \n\t"
-		".GENSTORED:                     \n\t"
+		".DGENSTORED:                    \n\t"
 		"                                \n\t"
 		"movlpd  (%%rcx),       %%xmm0   \n\t" // load c00 and c10,
 		"movhpd  (%%rcx,%%rsi), %%xmm0   \n\t"
@@ -461,11 +1130,11 @@ void bli_dgemm_opt_d4x4(
 		"movlpd  %%xmm1,  (%%rdx)        \n\t" // and store back to memory.
 		"movhpd  %%xmm1,  (%%rdx,%%rsi)  \n\t"
 		"                                \n\t"
-		"jmp    .DONE                    \n\t" // jump to end.
+		"jmp    .DDONE                   \n\t" // jump to end.
 		"                                \n\t"
 		"                                \n\t"
 		"                                \n\t"
-		".COLSTORED:                     \n\t"
+		".DCOLSTORED:                    \n\t"
 		"                                \n\t"
 		"movaps  (%%rcx),       %%xmm0   \n\t" // load c00 and c10,
 		"mulpd   %%xmm6,  %%xmm8         \n\t" // scale by alpha,
@@ -528,19 +1197,19 @@ void bli_dgemm_opt_d4x4(
 		"addpd  %%xmm15,  %%xmm1         \n\t" // add the gemm result,
 		"movaps  %%xmm1,  (%%rdx)        \n\t" // and store back to memory.
 		"                                \n\t"
-		"jmp    .DONE                    \n\t" // jump to end.
+		"jmp    .DDONE                   \n\t" // jump to end.
 		"                                \n\t"
 		"                                \n\t"
 		"                                \n\t"
 		"                                \n\t"
-		".BETAZERO:                      \n\t"
+		".DBETAZERO:                     \n\t"
 		"                                \n\t" // check if aligned/column-stored
 		"andb     %%bl, %%bh             \n\t" // set ZF if bl & bh == 1.
-		"jne     .COLSTORBZ              \n\t" // jump to column storage case
+		"jne     .DCOLSTORBZ             \n\t" // jump to column storage case
 		"                                \n\t"
 		"                                \n\t"
 		"                                \n\t"
-		".GENSTORBZ:                     \n\t"
+		".DGENSTORBZ:                    \n\t"
 		"                                \n\t" // skip loading c00 and c10,
 		"mulpd   %%xmm6,  %%xmm8         \n\t" // scale by alpha,
 		"movlpd  %%xmm8,  (%%rcx)        \n\t" // and store back to memory.
@@ -587,11 +1256,11 @@ void bli_dgemm_opt_d4x4(
 		"movlpd  %%xmm15, (%%rdx)        \n\t" // and store back to memory.
 		"movhpd  %%xmm15, (%%rdx,%%rsi)  \n\t"
 		"                                \n\t"
-		"jmp    .DONE                    \n\t" // jump to end.
+		"jmp    .DDONE                   \n\t" // jump to end.
 		"                                \n\t"
 		"                                \n\t"
 		"                                \n\t"
-		".COLSTORBZ:                     \n\t"
+		".DCOLSTORBZ:                    \n\t"
 		"                                \n\t"
 		"                                \n\t" // skip loading c00 and c10,
 		"mulpd   %%xmm6,  %%xmm8         \n\t" // scale by alpha,
@@ -638,7 +1307,7 @@ void bli_dgemm_opt_d4x4(
 		"                                \n\t"
 		"                                \n\t"
 		"                                \n\t"
-		".DONE:                          \n\t"
+		".DDONE:                         \n\t"
 		"                                \n\t"
 
 		: // output operands (none)
@@ -661,7 +1330,6 @@ void bli_dgemm_opt_d4x4(
 		  "xmm12", "xmm13", "xmm14", "xmm15",
 		  "memory"
 	);
-
 }
 
 void bli_cgemm_opt_d4x4(