diff --git a/kernels/x86_64/core2-sse3/3/bli_trsm_l_opt_d4x4.c b/kernels/x86_64/core2-sse3/3/bli_trsm_l_opt_d4x4.c
new file mode 100644
index 000000000..b6eaeafa8
--- /dev/null
+++ b/kernels/x86_64/core2-sse3/3/bli_trsm_l_opt_d4x4.c
@@ -0,0 +1,222 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+/* float variant of the trsm_l_opt_d4x4 kernel: placeholder with no solve logic. */
+void bli_strsm_l_opt_d4x4( float* restrict a11, float* restrict b11,
+                           float* restrict c11, inc_t rs_c, inc_t cs_c )
+{
+	/* Operands are unused; the body only hands BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code. */
+	( void )a11; ( void )b11; ( void )c11; ( void )rs_c; ( void )cs_c;
+	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+}
+
+void bli_dtrsm_l_opt_d4x4( // in-place 4x4 lower-triangular solve (forward substitution); each solved row is also copied to c11
+                           double* restrict   a11, // triangle: element (i,j) is read at a11[i + 4*j]; diagonal slots are multiplied in, so they must hold 1/alpha_ii
+                           double* restrict   b11, // right-hand sides: row i lives at b11[4*i .. 4*i+3]; overwritten with the solution; accessed with movaps (16-byte aligned)
+                           double* restrict   c11, inc_t rs_c, inc_t cs_c // output block and its row/column strides, counted in doubles
+                         )
+{
+	__asm__ volatile
+	(
+		"                                  \n\t" // register map: row 0 = xmm8/xmm12, row 1 = xmm9/xmm13, row 2 = xmm10/xmm14, row 3 = xmm11/xmm15
+		"movq      %1, %%rbx               \n\t" // rbx = address of b11
+		"                                  \n\t"
+		"movaps  0 * 16(%%rbx), %%xmm8     \n\t" // xmm8  = ( beta00 beta01 )
+		"movaps  1 * 16(%%rbx), %%xmm12    \n\t" // xmm12 = ( beta02 beta03 )
+		"movaps  2 * 16(%%rbx), %%xmm9     \n\t" // xmm9  = ( beta10 beta11 )
+		"movaps  3 * 16(%%rbx), %%xmm13    \n\t" // xmm13 = ( beta12 beta13 )
+		"movaps  4 * 16(%%rbx), %%xmm10    \n\t" // xmm10 = ( beta20 beta21 )
+		"movaps  5 * 16(%%rbx), %%xmm14    \n\t" // xmm14 = ( beta22 beta23 )
+		"movaps  6 * 16(%%rbx), %%xmm11    \n\t" // xmm11 = ( beta30 beta31 )
+		"movaps  7 * 16(%%rbx), %%xmm15    \n\t" // xmm15 = ( beta32 beta33 )
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t"
+		"movq     %0, %%rax                \n\t" // rax = address of a11
+		"movq     %2, %%rcx                \n\t" // rcx = address of c11 (row cursor, columns 0-1)
+		"                                  \n\t"
+		"movq     %3, %%rsi                \n\t" // load rs_c; NOTE(review): 8-byte load, assumes inc_t is 64 bits wide -- confirm
+		"movq     %4, %%rdi                \n\t" // load cs_c (same 8-byte assumption)
+		"salq     $3, %%rsi                \n\t" // rs_c *= sizeof( double )
+		"salq     $3, %%rdi                \n\t" // cs_c *= sizeof( double )
+		"                                  \n\t"
+		"leaq   (%%rcx,%%rdi,2), %%rdx     \n\t" // rdx = c11 + 2*cs_c (row cursor, columns 2-3)
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t" // iteration 0: row 0 only needs scaling by the diagonal
+		"                                  \n\t"
+		"movddup (0+0*4)*8(%%rax), %%xmm0  \n\t" // xmm0 = (1/alpha00) duplicated in both lanes
+		"                                  \n\t"
+		"mulpd    %%xmm0, %%xmm8           \n\t" // xmm8  *= (1/alpha00);
+		"mulpd    %%xmm0, %%xmm12          \n\t" // xmm12 *= (1/alpha00);
+		"                                  \n\t"
+		"movaps   %%xmm8,  0 * 16(%%rbx)   \n\t" // store ( beta00 beta01 ) = xmm8
+		"movaps   %%xmm12, 1 * 16(%%rbx)   \n\t" // store ( beta02 beta03 ) = xmm12
+		"movlpd   %%xmm8,  (%%rcx)         \n\t" // store ( gamma00 ) = xmm8[0]
+		"movhpd   %%xmm8,  (%%rcx,%%rdi)   \n\t" // store ( gamma01 ) = xmm8[1]
+		"movlpd   %%xmm12, (%%rdx)         \n\t" // store ( gamma02 ) = xmm12[0]
+		"movhpd   %%xmm12, (%%rdx,%%rdi)   \n\t" // store ( gamma03 ) = xmm12[1]
+		"addq     %%rsi, %%rcx             \n\t" // c11   += rs_c (advance to row 1)
+		"addq     %%rsi, %%rdx             \n\t" // c11_2 += rs_c
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t" // iteration 1: row 1 = ( row 1 - alpha10 * row 0 ) * (1/alpha11)
+		"                                  \n\t"
+		"movddup (1+0*4)*8(%%rax), %%xmm0  \n\t" // load xmm0 = alpha10
+		"movddup (1+1*4)*8(%%rax), %%xmm1  \n\t" // load xmm1 = (1/alpha11)
+		"                                  \n\t"
+		"movaps   %%xmm0,  %%xmm4          \n\t" // xmm4 = xmm0 (second copy for columns 2-3)
+		"mulpd    %%xmm8,  %%xmm0          \n\t" // xmm0 = alpha10 * ( beta00 beta01 )
+		"mulpd    %%xmm12, %%xmm4          \n\t" // xmm4 = alpha10 * ( beta02 beta03 )
+		"subpd    %%xmm0,  %%xmm9          \n\t" // xmm9  -= xmm0
+		"subpd    %%xmm4,  %%xmm13         \n\t" // xmm13 -= xmm4
+		"mulpd    %%xmm1,  %%xmm9          \n\t" // xmm9  *= (1/alpha11);
+		"mulpd    %%xmm1,  %%xmm13         \n\t" // xmm13 *= (1/alpha11);
+		"                                  \n\t"
+		"movaps   %%xmm9,  2 * 16(%%rbx)   \n\t" // store ( beta10 beta11 ) = xmm9
+		"movaps   %%xmm13, 3 * 16(%%rbx)   \n\t" // store ( beta12 beta13 ) = xmm13
+		"movlpd   %%xmm9,  (%%rcx)         \n\t" // store ( gamma10 ) = xmm9[0]
+		"movhpd   %%xmm9,  (%%rcx,%%rdi)   \n\t" // store ( gamma11 ) = xmm9[1]
+		"movlpd   %%xmm13, (%%rdx)         \n\t" // store ( gamma12 ) = xmm13[0]
+		"movhpd   %%xmm13, (%%rdx,%%rdi)   \n\t" // store ( gamma13 ) = xmm13[1]
+		"addq     %%rsi, %%rcx             \n\t" // c11   += rs_c (advance to row 2)
+		"addq     %%rsi, %%rdx             \n\t" // c11_2 += rs_c
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t" // iteration 2: row 2 = ( row 2 - alpha20 * row 0 - alpha21 * row 1 ) * (1/alpha22)
+		"                                  \n\t"
+		"movddup (2+0*4)*8(%%rax), %%xmm0  \n\t" // load xmm0 = alpha20
+		"movddup (2+1*4)*8(%%rax), %%xmm1  \n\t" // load xmm1 = alpha21
+		"movddup (2+2*4)*8(%%rax), %%xmm2  \n\t" // load xmm2 = (1/alpha22)
+		"                                  \n\t"
+		"movaps   %%xmm0,  %%xmm4          \n\t" // xmm4 = xmm0
+		"movaps   %%xmm1,  %%xmm5          \n\t" // xmm5 = xmm1
+		"mulpd    %%xmm8,  %%xmm0          \n\t" // xmm0 = alpha20 * ( beta00 beta01 )
+		"mulpd    %%xmm12, %%xmm4          \n\t" // xmm4 = alpha20 * ( beta02 beta03 )
+		"mulpd    %%xmm9,  %%xmm1          \n\t" // xmm1 = alpha21 * ( beta10 beta11 )
+		"mulpd    %%xmm13, %%xmm5          \n\t" // xmm5 = alpha21 * ( beta12 beta13 )
+		"addpd    %%xmm1,  %%xmm0          \n\t" // xmm0 += xmm1;
+		"addpd    %%xmm5,  %%xmm4          \n\t" // xmm4 += xmm5;
+		"subpd    %%xmm0,  %%xmm10         \n\t" // xmm10 -= xmm0
+		"subpd    %%xmm4,  %%xmm14         \n\t" // xmm14 -= xmm4
+		"mulpd    %%xmm2,  %%xmm10         \n\t" // xmm10 *= (1/alpha22);
+		"mulpd    %%xmm2,  %%xmm14         \n\t" // xmm14 *= (1/alpha22);
+		"                                  \n\t"
+		"movaps   %%xmm10, 4 * 16(%%rbx)   \n\t" // store ( beta20 beta21 ) = xmm10
+		"movaps   %%xmm14, 5 * 16(%%rbx)   \n\t" // store ( beta22 beta23 ) = xmm14
+		"movlpd   %%xmm10, (%%rcx)         \n\t" // store ( gamma20 ) = xmm10[0]
+		"movhpd   %%xmm10, (%%rcx,%%rdi)   \n\t" // store ( gamma21 ) = xmm10[1]
+		"movlpd   %%xmm14, (%%rdx)         \n\t" // store ( gamma22 ) = xmm14[0]
+		"movhpd   %%xmm14, (%%rdx,%%rdi)   \n\t" // store ( gamma23 ) = xmm14[1]
+		"addq     %%rsi, %%rcx             \n\t" // c11   += rs_c (advance to row 3)
+		"addq     %%rsi, %%rdx             \n\t" // c11_2 += rs_c
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t" // iteration 3: row 3 = ( row 3 - alpha30 * row 0 - alpha31 * row 1 - alpha32 * row 2 ) * (1/alpha33)
+		"                                  \n\t"
+		"movddup (3+0*4)*8(%%rax), %%xmm0  \n\t" // load xmm0 = alpha30
+		"movddup (3+1*4)*8(%%rax), %%xmm1  \n\t" // load xmm1 = alpha31
+		"movddup (3+2*4)*8(%%rax), %%xmm2  \n\t" // load xmm2 = alpha32
+		"movddup (3+3*4)*8(%%rax), %%xmm3  \n\t" // load xmm3 = (1/alpha33)
+		"                                  \n\t"
+		"movaps   %%xmm0,  %%xmm4          \n\t" // xmm4 = xmm0
+		"movaps   %%xmm1,  %%xmm5          \n\t" // xmm5 = xmm1
+		"movaps   %%xmm2,  %%xmm6          \n\t" // xmm6 = xmm2
+		"mulpd    %%xmm8,  %%xmm0          \n\t" // xmm0 = alpha30 * ( beta00 beta01 )
+		"mulpd    %%xmm12, %%xmm4          \n\t" // xmm4 = alpha30 * ( beta02 beta03 )
+		"mulpd    %%xmm9,  %%xmm1          \n\t" // xmm1 = alpha31 * ( beta10 beta11 )
+		"mulpd    %%xmm13, %%xmm5          \n\t" // xmm5 = alpha31 * ( beta12 beta13 )
+		"mulpd    %%xmm10, %%xmm2          \n\t" // xmm2 = alpha32 * ( beta20 beta21 )
+		"mulpd    %%xmm14, %%xmm6          \n\t" // xmm6 = alpha32 * ( beta22 beta23 )
+		"addpd    %%xmm1,  %%xmm0          \n\t" // xmm0 += xmm1;
+		"addpd    %%xmm5,  %%xmm4          \n\t" // xmm4 += xmm5;
+		"addpd    %%xmm2,  %%xmm0          \n\t" // xmm0 += xmm2;
+		"addpd    %%xmm6,  %%xmm4          \n\t" // xmm4 += xmm6;
+		"subpd    %%xmm0,  %%xmm11         \n\t" // xmm11 -= xmm0
+		"subpd    %%xmm4,  %%xmm15         \n\t" // xmm15 -= xmm4
+		"mulpd    %%xmm3,  %%xmm11         \n\t" // xmm11 *= (1/alpha33);
+		"mulpd    %%xmm3,  %%xmm15         \n\t" // xmm15 *= (1/alpha33);
+		"                                  \n\t"
+		"movaps   %%xmm11, 6 * 16(%%rbx)   \n\t" // store ( beta30 beta31 ) = xmm11
+		"movaps   %%xmm15, 7 * 16(%%rbx)   \n\t" // store ( beta32 beta33 ) = xmm15
+		"movlpd   %%xmm11, (%%rcx)         \n\t" // store ( gamma30 ) = xmm11[0]
+		"movhpd   %%xmm11, (%%rcx,%%rdi)   \n\t" // store ( gamma31 ) = xmm11[1]
+		"movlpd   %%xmm15, (%%rdx)         \n\t" // store ( gamma32 ) = xmm15[0]
+		"movhpd   %%xmm15, (%%rdx,%%rdi)   \n\t" // store ( gamma33 ) = xmm15[1]
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t"
+
+		: // output operands (none); results are written through the b11/c11 pointers
+		: // input operands, all read from memory by the movq instructions above
+		  "m" (a11),    // 0
+		  "m" (b11),    // 1
+		  "m" (c11),    // 2
+		  "m" (rs_c),   // 3
+		  "m" (cs_c)    // 4
+		: // register clobber list
+		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", // address and stride registers used above
+		  "xmm0", "xmm1", "xmm2", "xmm3",
+		  "xmm4", "xmm5", "xmm6", "xmm7",
+		  "xmm8", "xmm9", "xmm10", "xmm11",
+		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "memory" // the asm stores to b11 and c11
+	);
+
+}
+
+/* scomplex variant of the trsm_l_opt_d4x4 kernel: placeholder with no solve logic. */
+void bli_ctrsm_l_opt_d4x4( scomplex* restrict a11, scomplex* restrict b11,
+                           scomplex* restrict c11, inc_t rs_c, inc_t cs_c )
+{
+	/* Operands are unused; the body only hands BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code. */
+	( void )a11; ( void )b11; ( void )c11; ( void )rs_c; ( void )cs_c;
+	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+}
+
+/* dcomplex variant of the trsm_l_opt_d4x4 kernel: placeholder with no solve logic. */
+void bli_ztrsm_l_opt_d4x4( dcomplex* restrict a11, dcomplex* restrict b11,
+                           dcomplex* restrict c11, inc_t rs_c, inc_t cs_c )
+{
+	/* Operands are unused; the body only hands BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code. */
+	( void )a11; ( void )b11; ( void )c11; ( void )rs_c; ( void )cs_c;
+	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+}
+
diff --git a/kernels/x86_64/core2-sse3/3/bli_trsm_l_opt_d4x4.h b/kernels/x86_64/core2-sse3/3/bli_trsm_l_opt_d4x4.h
new file mode 100644
index 000000000..4ab828999
--- /dev/null
+++ b/kernels/x86_64/core2-sse3/3/bli_trsm_l_opt_d4x4.h
@@ -0,0 +1,46 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+#undef  GENTPROT /* discard any earlier GENTPROT so the template below is the one in effect */
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           ctype* restrict a11, \
+                           ctype* restrict b11, \
+                           ctype* restrict c11, inc_t rs_c, inc_t cs_c  \
+                         );
+/* Prototype template above; presumably instantiated below once per datatype prefix (s, d, c, z) -- confirm against INSERT_GENTPROT_BASIC. */
+INSERT_GENTPROT_BASIC( trsm_l_opt_d4x4 )
+
diff --git a/kernels/x86_64/core2-sse3/3/bli_trsm_u_opt_d4x4.c b/kernels/x86_64/core2-sse3/3/bli_trsm_u_opt_d4x4.c
new file mode 100644
index 000000000..2155031a5
--- /dev/null
+++ b/kernels/x86_64/core2-sse3/3/bli_trsm_u_opt_d4x4.c
@@ -0,0 +1,225 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+/* float variant of the trsm_u_opt_d4x4 kernel: placeholder with no solve logic. */
+void bli_strsm_u_opt_d4x4( float* restrict a11, float* restrict b11,
+                           float* restrict c11, inc_t rs_c, inc_t cs_c )
+{
+	/* Operands are unused; the body only hands BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code. */
+	( void )a11; ( void )b11; ( void )c11; ( void )rs_c; ( void )cs_c;
+	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+}
+
+void bli_dtrsm_u_opt_d4x4( // in-place 4x4 upper-triangular solve (back substitution, row 3 first); each solved row is also copied to c11
+                           double* restrict   a11, // triangle: element (i,j) is read at a11[i + 4*j]; diagonal slots are multiplied in, so they must hold 1/alpha_ii
+                           double* restrict   b11, // right-hand sides: row i lives at b11[4*i .. 4*i+3]; overwritten with the solution; accessed with movaps (16-byte aligned)
+                           double* restrict   c11, inc_t rs_c, inc_t cs_c // output block and its row/column strides, counted in doubles
+                         )
+{
+	__asm__ volatile
+	(
+		"                                  \n\t" // register map: row 0 = xmm8/xmm12, row 1 = xmm9/xmm13, row 2 = xmm10/xmm14, row 3 = xmm11/xmm15
+		"movq      %1, %%rbx               \n\t" // rbx = address of b11
+		"                                  \n\t"
+		"movaps  0 * 16(%%rbx), %%xmm8     \n\t" // xmm8  = ( beta00 beta01 )
+		"movaps  1 * 16(%%rbx), %%xmm12    \n\t" // xmm12 = ( beta02 beta03 )
+		"movaps  2 * 16(%%rbx), %%xmm9     \n\t" // xmm9  = ( beta10 beta11 )
+		"movaps  3 * 16(%%rbx), %%xmm13    \n\t" // xmm13 = ( beta12 beta13 )
+		"movaps  4 * 16(%%rbx), %%xmm10    \n\t" // xmm10 = ( beta20 beta21 )
+		"movaps  5 * 16(%%rbx), %%xmm14    \n\t" // xmm14 = ( beta22 beta23 )
+		"movaps  6 * 16(%%rbx), %%xmm11    \n\t" // xmm11 = ( beta30 beta31 )
+		"movaps  7 * 16(%%rbx), %%xmm15    \n\t" // xmm15 = ( beta32 beta33 )
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t"
+		"movq     %0, %%rax                \n\t" // rax = address of a11
+		"movq     %2, %%rcx                \n\t" // rcx = address of c11 (row cursor, columns 0-1)
+		"                                  \n\t"
+		"movq     %3, %%rsi                \n\t" // load rs_c; NOTE(review): 8-byte load, assumes inc_t is 64 bits wide -- confirm
+		"movq     %4, %%rdi                \n\t" // load cs_c (same 8-byte assumption)
+		"salq     $3, %%rsi                \n\t" // rs_c *= sizeof( double )
+		"salq     $3, %%rdi                \n\t" // cs_c *= sizeof( double )
+		"                                  \n\t"
+		"addq  %%rsi, %%rcx                \n\t" // c11 += (4-1)*rs_c, done as three adds: start at the last row
+		"addq  %%rsi, %%rcx                \n\t"
+		"addq  %%rsi, %%rcx                \n\t"
+		"leaq   (%%rcx,%%rdi,2), %%rdx     \n\t" // rdx = c11 + 2*cs_c (row cursor, columns 2-3)
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t" // iteration 0: row 3 only needs scaling by the diagonal
+		"                                  \n\t"
+		"movddup (3+3*4)*8(%%rax), %%xmm3  \n\t" // xmm3 = (1/alpha33) duplicated in both lanes
+		"                                  \n\t"
+		"mulpd    %%xmm3, %%xmm11          \n\t" // xmm11 *= (1/alpha33);
+		"mulpd    %%xmm3, %%xmm15          \n\t" // xmm15 *= (1/alpha33);
+		"                                  \n\t"
+		"movaps   %%xmm11, 6 * 16(%%rbx)   \n\t" // store ( beta30 beta31 ) = xmm11
+		"movaps   %%xmm15, 7 * 16(%%rbx)   \n\t" // store ( beta32 beta33 ) = xmm15
+		"movlpd   %%xmm11, (%%rcx)         \n\t" // store ( gamma30 ) = xmm11[0]
+		"movhpd   %%xmm11, (%%rcx,%%rdi)   \n\t" // store ( gamma31 ) = xmm11[1]
+		"movlpd   %%xmm15, (%%rdx)         \n\t" // store ( gamma32 ) = xmm15[0]
+		"movhpd   %%xmm15, (%%rdx,%%rdi)   \n\t" // store ( gamma33 ) = xmm15[1]
+		"subq     %%rsi, %%rcx             \n\t" // c11   -= rs_c (step back to row 2)
+		"subq     %%rsi, %%rdx             \n\t" // c11_2 -= rs_c
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t" // iteration 1: row 2 = ( row 2 - alpha23 * row 3 ) * (1/alpha22)
+		"                                  \n\t"
+		"movddup (2+2*4)*8(%%rax), %%xmm2  \n\t" // load xmm2 = (1/alpha22)
+		"movddup (2+3*4)*8(%%rax), %%xmm3  \n\t" // load xmm3 = alpha23
+		"                                  \n\t"
+		"movaps   %%xmm3,  %%xmm7          \n\t" // xmm7 = xmm3 (second copy for columns 2-3)
+		"mulpd    %%xmm11, %%xmm3          \n\t" // xmm3 = alpha23 * ( beta30 beta31 )
+		"mulpd    %%xmm15, %%xmm7          \n\t" // xmm7 = alpha23 * ( beta32 beta33 )
+		"subpd    %%xmm3,  %%xmm10         \n\t" // xmm10 -= xmm3
+		"subpd    %%xmm7,  %%xmm14         \n\t" // xmm14 -= xmm7
+		"mulpd    %%xmm2,  %%xmm10         \n\t" // xmm10 *= (1/alpha22);
+		"mulpd    %%xmm2,  %%xmm14         \n\t" // xmm14 *= (1/alpha22);
+		"                                  \n\t"
+		"movaps   %%xmm10, 4 * 16(%%rbx)   \n\t" // store ( beta20 beta21 ) = xmm10
+		"movaps   %%xmm14, 5 * 16(%%rbx)   \n\t" // store ( beta22 beta23 ) = xmm14
+		"movlpd   %%xmm10, (%%rcx)         \n\t" // store ( gamma20 ) = xmm10[0]
+		"movhpd   %%xmm10, (%%rcx,%%rdi)   \n\t" // store ( gamma21 ) = xmm10[1]
+		"movlpd   %%xmm14, (%%rdx)         \n\t" // store ( gamma22 ) = xmm14[0]
+		"movhpd   %%xmm14, (%%rdx,%%rdi)   \n\t" // store ( gamma23 ) = xmm14[1]
+		"subq     %%rsi, %%rcx             \n\t" // c11   -= rs_c (step back to row 1)
+		"subq     %%rsi, %%rdx             \n\t" // c11_2 -= rs_c
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t" // iteration 2: row 1 = ( row 1 - alpha12 * row 2 - alpha13 * row 3 ) * (1/alpha11)
+		"                                  \n\t"
+		"movddup (1+1*4)*8(%%rax), %%xmm1  \n\t" // load xmm1 = (1/alpha11)
+		"movddup (1+2*4)*8(%%rax), %%xmm2  \n\t" // load xmm2 = alpha12
+		"movddup (1+3*4)*8(%%rax), %%xmm3  \n\t" // load xmm3 = alpha13
+		"                                  \n\t"
+		"movaps   %%xmm2,  %%xmm6          \n\t" // xmm6 = xmm2
+		"movaps   %%xmm3,  %%xmm7          \n\t" // xmm7 = xmm3
+		"mulpd    %%xmm10, %%xmm2          \n\t" // xmm2 = alpha12 * ( beta20 beta21 )
+		"mulpd    %%xmm14, %%xmm6          \n\t" // xmm6 = alpha12 * ( beta22 beta23 )
+		"mulpd    %%xmm11, %%xmm3          \n\t" // xmm3 = alpha13 * ( beta30 beta31 )
+		"mulpd    %%xmm15, %%xmm7          \n\t" // xmm7 = alpha13 * ( beta32 beta33 )
+		"addpd    %%xmm3,  %%xmm2          \n\t" // xmm2 += xmm3;
+		"addpd    %%xmm7,  %%xmm6          \n\t" // xmm6 += xmm7;
+		"subpd    %%xmm2,  %%xmm9          \n\t" // xmm9  -= xmm2
+		"subpd    %%xmm6,  %%xmm13         \n\t" // xmm13 -= xmm6
+		"mulpd    %%xmm1,  %%xmm9          \n\t" // xmm9  *= (1/alpha11);
+		"mulpd    %%xmm1,  %%xmm13         \n\t" // xmm13 *= (1/alpha11);
+		"                                  \n\t"
+		"movaps   %%xmm9,  2 * 16(%%rbx)   \n\t" // store ( beta10 beta11 ) = xmm9
+		"movaps   %%xmm13, 3 * 16(%%rbx)   \n\t" // store ( beta12 beta13 ) = xmm13
+		"movlpd   %%xmm9,  (%%rcx)         \n\t" // store ( gamma10 ) = xmm9[0]
+		"movhpd   %%xmm9,  (%%rcx,%%rdi)   \n\t" // store ( gamma11 ) = xmm9[1]
+		"movlpd   %%xmm13, (%%rdx)         \n\t" // store ( gamma12 ) = xmm13[0]
+		"movhpd   %%xmm13, (%%rdx,%%rdi)   \n\t" // store ( gamma13 ) = xmm13[1]
+		"subq     %%rsi, %%rcx             \n\t" // c11   -= rs_c (step back to row 0)
+		"subq     %%rsi, %%rdx             \n\t" // c11_2 -= rs_c
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t" // iteration 3: row 0 = ( row 0 - alpha01 * row 1 - alpha02 * row 2 - alpha03 * row 3 ) * (1/alpha00)
+		"                                  \n\t"
+		"movddup (0+0*4)*8(%%rax), %%xmm0  \n\t" // load xmm0 = (1/alpha00)
+		"movddup (0+1*4)*8(%%rax), %%xmm1  \n\t" // load xmm1 = alpha01
+		"movddup (0+2*4)*8(%%rax), %%xmm2  \n\t" // load xmm2 = alpha02
+		"movddup (0+3*4)*8(%%rax), %%xmm3  \n\t" // load xmm3 = alpha03
+		"                                  \n\t"
+		"movaps   %%xmm1,  %%xmm5          \n\t" // xmm5 = xmm1
+		"movaps   %%xmm2,  %%xmm6          \n\t" // xmm6 = xmm2
+		"movaps   %%xmm3,  %%xmm7          \n\t" // xmm7 = xmm3
+		"mulpd    %%xmm9,  %%xmm1          \n\t" // xmm1 = alpha01 * ( beta10 beta11 )
+		"mulpd    %%xmm13, %%xmm5          \n\t" // xmm5 = alpha01 * ( beta12 beta13 )
+		"mulpd    %%xmm10, %%xmm2          \n\t" // xmm2 = alpha02 * ( beta20 beta21 )
+		"mulpd    %%xmm14, %%xmm6          \n\t" // xmm6 = alpha02 * ( beta22 beta23 )
+		"mulpd    %%xmm11, %%xmm3          \n\t" // xmm3 = alpha03 * ( beta30 beta31 )
+		"mulpd    %%xmm15, %%xmm7          \n\t" // xmm7 = alpha03 * ( beta32 beta33 )
+		"addpd    %%xmm2,  %%xmm1          \n\t" // xmm1 += xmm2;
+		"addpd    %%xmm6,  %%xmm5          \n\t" // xmm5 += xmm6;
+		"addpd    %%xmm3,  %%xmm1          \n\t" // xmm1 += xmm3;
+		"addpd    %%xmm7,  %%xmm5          \n\t" // xmm5 += xmm7;
+		"subpd    %%xmm1,  %%xmm8          \n\t" // xmm8  -= xmm1
+		"subpd    %%xmm5,  %%xmm12         \n\t" // xmm12 -= xmm5
+		"mulpd    %%xmm0,  %%xmm8          \n\t" // xmm8  *= (1/alpha00);
+		"mulpd    %%xmm0,  %%xmm12         \n\t" // xmm12 *= (1/alpha00);
+		"                                  \n\t"
+		"movaps   %%xmm8,  0 * 16(%%rbx)   \n\t" // store ( beta00 beta01 ) = xmm8
+		"movaps   %%xmm12, 1 * 16(%%rbx)   \n\t" // store ( beta02 beta03 ) = xmm12
+		"movlpd   %%xmm8,  (%%rcx)         \n\t" // store ( gamma00 ) = xmm8[0]
+		"movhpd   %%xmm8,  (%%rcx,%%rdi)   \n\t" // store ( gamma01 ) = xmm8[1]
+		"movlpd   %%xmm12, (%%rdx)         \n\t" // store ( gamma02 ) = xmm12[0]
+		"movhpd   %%xmm12, (%%rdx,%%rdi)   \n\t" // store ( gamma03 ) = xmm12[1]
+		"                                  \n\t"
+		"                                  \n\t"
+		"                                  \n\t"
+
+		: // output operands (none); results are written through the b11/c11 pointers
+		: // input operands, all read from memory by the movq instructions above
+		  "m" (a11),    // 0
+		  "m" (b11),    // 1
+		  "m" (c11),    // 2
+		  "m" (rs_c),   // 3
+		  "m" (cs_c)    // 4
+		: // register clobber list
+		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", // address and stride registers used above
+		  "xmm0", "xmm1", "xmm2", "xmm3",
+		  "xmm4", "xmm5", "xmm6", "xmm7",
+		  "xmm8", "xmm9", "xmm10", "xmm11",
+		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "memory" // the asm stores to b11 and c11
+	);
+
+}
+
+/* scomplex variant of the trsm_u_opt_d4x4 kernel: placeholder with no solve logic. */
+void bli_ctrsm_u_opt_d4x4( scomplex* restrict a11, scomplex* restrict b11,
+                           scomplex* restrict c11, inc_t rs_c, inc_t cs_c )
+{
+	/* Operands are unused; the body only hands BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code. */
+	( void )a11; ( void )b11; ( void )c11; ( void )rs_c; ( void )cs_c;
+	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+}
+
+/* dcomplex variant of the trsm_u_opt_d4x4 kernel: placeholder with no solve logic. */
+void bli_ztrsm_u_opt_d4x4( dcomplex* restrict a11, dcomplex* restrict b11,
+                           dcomplex* restrict c11, inc_t rs_c, inc_t cs_c )
+{
+	/* Operands are unused; the body only hands BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code. */
+	( void )a11; ( void )b11; ( void )c11; ( void )rs_c; ( void )cs_c;
+	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+}
+
diff --git a/kernels/x86_64/core2-sse3/3/bli_trsm_u_opt_d4x4.h b/kernels/x86_64/core2-sse3/3/bli_trsm_u_opt_d4x4.h
new file mode 100644
index 000000000..7349b29b8
--- /dev/null
+++ b/kernels/x86_64/core2-sse3/3/bli_trsm_u_opt_d4x4.h
@@ -0,0 +1,46 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+#undef  GENTPROT /* discard any earlier GENTPROT so the template below is the one in effect */
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           ctype* restrict a11, \
+                           ctype* restrict b11, \
+                           ctype* restrict c11, inc_t rs_c, inc_t cs_c  \
+                         );
+/* Prototype template above; presumably instantiated below once per datatype prefix (s, d, c, z) -- confirm against INSERT_GENTPROT_BASIC. */
+INSERT_GENTPROT_BASIC( trsm_u_opt_d4x4 )
+