From 4423e33dc593115cda92c5763d756d7ad1298aa9 Mon Sep 17 00:00:00 2001 From: dnp Date: Wed, 6 Dec 2017 16:35:03 -0600 Subject: [PATCH] Adding SKX kernels and configuration. --- config/skx/bli_cntx_init_skx.c | 78 +++ config/skx/bli_family_skx.h | 133 +++++ config/skx/make_defs.mk | 115 +++++ config_registry | 1 + frame/base/bli_arch.c | 3 + frame/base/bli_cpuid.c | 120 +++++ frame/base/bli_cpuid.h | 5 + frame/base/bli_gks.c | 5 + frame/include/bli_arch_config.h | 12 +- frame/include/bli_type_defs.h | 5 +- kernels/skx/3/bli_avx512_macros.h | 171 ++++++ kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c | 547 ++++++++++++++++++++ kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c | 572 +++++++++++++++++++++ kernels/skx/bli_kernels_skx.h | 40 ++ 14 files changed, 1802 insertions(+), 5 deletions(-) create mode 100644 config/skx/bli_cntx_init_skx.c create mode 100644 config/skx/bli_family_skx.h create mode 100644 config/skx/make_defs.mk create mode 100644 kernels/skx/3/bli_avx512_macros.h create mode 100644 kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c create mode 100644 kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c create mode 100644 kernels/skx/bli_kernels_skx.h diff --git a/config/skx/bli_cntx_init_skx.c b/config/skx/bli_cntx_init_skx.c new file mode 100644 index 000000000..bc23295ac --- /dev/null +++ b/config/skx/bli_cntx_init_skx.c @@ -0,0 +1,78 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_cntx_init_skx( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_skx_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Update the context with optimized native gemm micro-kernels and + // their storage preferences. 
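+	// (A preference of FALSE marks each micro-kernel as column-preferential:
+	// both microtiles keep C in 24 of the 32 ZMM accumulators -- two vectors
+	// per column of the 16x12 or 32x12 microtile, times 12 columns -- so C
+	// is updated most cheaply when it is column-stored.)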
+ bli_cntx_set_l3_nat_ukrs + ( + 2, + BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x12_l2, FALSE, + cntx + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 12, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 72 ); + bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 384, 256, 256, + 480, 480, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3072, 4080, 4080 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + BLIS_NAT, 5, + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + cntx + ); +} + diff --git a/config/skx/bli_family_skx.h b/config/skx/bli_family_skx.h new file mode 100644 index 000000000..13d0f788a --- /dev/null +++ b/config/skx/bli_family_skx.h @@ -0,0 +1,133 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+//#ifndef BLIS_FAMILY_H
+//#define BLIS_FAMILY_H
+
+// -- THREADING PARAMETERS -----------------------------------------------------
+
+#define BLIS_DEFAULT_M_THREAD_RATIO 3
+#define BLIS_DEFAULT_N_THREAD_RATIO 2
+
+#define BLIS_DEFAULT_MR_THREAD_MAX 1
+#define BLIS_DEFAULT_NR_THREAD_MAX 4
+
+// -- MEMORY ALLOCATION --------------------------------------------------------
+
+#define BLIS_SIMD_ALIGN_SIZE 64
+
+#define BLIS_SIMD_SIZE 64
+#define BLIS_SIMD_NUM_REGISTERS 32
+
+#ifdef BLIS_NO_HBWMALLOC
+
+#include <stdlib.h>
+
+#define BLIS_MALLOC_POOL malloc
+#define BLIS_FREE_POOL free
+
+#else
+
+#include <hbwmalloc.h>
+
+#define BLIS_MALLOC_POOL hbw_malloc
+#define BLIS_FREE_POOL hbw_free
+
+#endif
+
+
+#if 0
+// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
+
+// -- Cache and register blocksizes --
+
+//
+// Constraints:
+//
+// (1) MC must be a multiple of:
+//     (a) MR (for zero-padding purposes)
+//     (b) NR (for zero-padding purposes when MR and NR are "swapped")
+// (2) NC must be a multiple of:
+//     (a) NR (for zero-padding purposes)
+//     (b) MR (for zero-padding purposes when MR and NR are "swapped")
+//
+
+#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_16x12_l2
+#define BLIS_DEFAULT_MC_D  144
+#define BLIS_DEFAULT_KC_D  336
+#define BLIS_DEFAULT_NC_D  5760
+#define BLIS_DEFAULT_MR_D  16
+#define BLIS_DEFAULT_NR_D  12
+#define BLIS_PACKDIM_MR_D  16
+#define BLIS_PACKDIM_NR_D  12
+
+// NOTE: If the micro-kernel, which is typically unrolled to a factor
+// of f, handles leftover edge cases (i.e., when k % f > 0), then these
+// register blocksizes in the k dimension can be defined to 1.
+
+//#define BLIS_DEFAULT_KR_S 1
+//#define BLIS_DEFAULT_KR_D 1
+//#define BLIS_DEFAULT_KR_C 1
+//#define BLIS_DEFAULT_KR_Z 1
+
+// -- Maximum cache blocksizes (for optimizing edge cases) --
+
+// NOTE: These cache blocksize "extensions" have the same constraints as
+// the corresponding default blocksizes above. When these values are
+// larger than the default blocksizes, blocksizes used at edge cases are
+// enlarged if such an extension would encompass the remaining portion of
+// the matrix dimension.
+
+#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4)
+#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4)
+#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0)
+
+#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4)
+#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4)
+#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0)
+
+//#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4)
+//#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4)
+//#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4)
+
+//#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4)
+//#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4)
+//#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4)
+
+
+#endif
+
+
+//#endif
+
diff --git a/config/skx/make_defs.mk b/config/skx/make_defs.mk
new file mode 100644
index 000000000..4655c652d
--- /dev/null
+++ b/config/skx/make_defs.mk
@@ -0,0 +1,115 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+# +# Copyright (C) 2014, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := skx +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +ifeq ($(CC),) +CC := gcc +CC_VENDOR := gcc +endif + +# Enable IEEE Standard 1003.1-2004 (POSIX.1d). +# NOTE: This is needed to enable posix_memalign(). +CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L +CMISCFLAGS := -std=c99 -m64 +CPICFLAGS := -fPIC +CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 -DBLIS_NO_HBWMALLOC +else +COPTFLAGS := -O3 +endif + +ifeq ($(DEBUG_TYPE),sde) +CPPROCFLAGS += -DBLIS_NO_HBWMALLOC +endif + +CKOPTFLAGS := $(COPTFLAGS) + +ifeq ($(CC_VENDOR),gcc) +CVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=skylake-avx512 +else +ifeq ($(CC_VENDOR),icc) +CVECFLAGS := -xCORE-AVX512 +else +ifeq ($(CC_VENDOR),clang) +CVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=skylake-avx512 +else +$(error gcc, icc, or clang is required for this configuration.) +endif +endif +endif + +# The assembler on OS X won't recognize AVX512 without help +ifneq ($(CC_VENDOR),icc) +ifeq ($(OS_NAME),Darwin) +CVECFLAGS += -Wa,-march=skylake-avx512 +endif +endif + +# --- Determine the archiver and related flags --- +AR := ar +ARFLAGS := cr + +# --- Determine the linker and related flags --- +LINKER := $(CC) +SOFLAGS := -shared + +ifneq ($(DEBUG_TYPE),sde) +LDFLAGS := -lmemkind +else +LDFLAGS := +endif + +ifneq ($(CC_VENDOR),icc) +LDFLAGS += -lm +endif + +# Store all of the variables here to new variables containing the +# configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config_registry b/config_registry
index 780f39455..70667b860 100644
--- a/config_registry
+++ b/config_registry
@@ -18,6 +18,7 @@ haswell: haswell
 sandybridge: sandybridge
 penryn: penryn
 knl: knl
+skx: skx
 
 # AMD architectures.
 zen: zen/haswell
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index d8e33672c..a5ef20191 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -48,6 +48,9 @@ arch_t bli_arch_query_id( void )
 #endif
 
 	// Intel microarchitectures.
+#ifdef BLIS_FAMILY_SKX
+	id = BLIS_ARCH_SKX;
+#endif
 #ifdef BLIS_FAMILY_KNL
 	id = BLIS_ARCH_KNL;
 #endif
diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c
index a9d77d1fb..d3d8a47bd 100644
--- a/frame/base/bli_cpuid.c
+++ b/frame/base/bli_cpuid.c
@@ -47,6 +47,10 @@ arch_t bli_cpuid_query_id( void )
 	{
 		// Check for each Intel configuration that is enabled, check for that
 		// microarchitecture. We check from most recent to most dated.
+#ifdef BLIS_CONFIG_SKX
+		if ( bli_cpuid_is_skx( family, model, features ) )
+			return BLIS_ARCH_SKX;
+#endif
 #ifdef BLIS_CONFIG_KNL
 		if ( bli_cpuid_is_knl( family, model, features ) )
 			return BLIS_ARCH_KNL;
@@ -65,6 +69,7 @@ arch_t bli_cpuid_query_id( void )
 #endif
 		// If none of the other sub-configurations were detected, return
 		// the 'generic' arch_t id value.
+
 		return BLIS_ARCH_GENERIC;
 	}
 	else if ( vendor == VENDOR_AMD )
@@ -105,6 +111,31 @@
 }
 
 // -----------------------------------------------------------------------------
+
+bool_t bli_cpuid_is_skx
+     (
+       uint32_t family,
+       uint32_t model,
+       uint32_t features
+     )
+{
+	// Check for expected CPU features.
+	const uint32_t expected = FEATURE_AVX      |
+	                          FEATURE_FMA3     |
+	                          FEATURE_AVX2     |
+	                          FEATURE_AVX512F  |
+	                          FEATURE_AVX512DQ |
+	                          FEATURE_AVX512BW |
+	                          FEATURE_AVX512VL;
+
+	// Also require two VPUs (AVX-512 FMA units) per core; single-VPU
+	// Skylake-X parts run AVX-512 code at reduced throughput.
+	int nvpu = vpu_count();
+
+	if ( !bli_cpuid_has_features( features, expected ) || nvpu != 2 )
+		return FALSE;
+
+	return TRUE;
+}
 
 bool_t bli_cpuid_is_knl
      (
       uint32_t family,
       uint32_t model,
       uint32_t features
      )
@@ -629,6 +660,93 @@ uint32_t bli_cpuid_query
 	return VENDOR_UNKNOWN;
 }
 
+// Query the CPU brand string (e.g. "Intel(R) Xeon(R) Platinum 8168 ...")
+// via cpuid leaves 0x80000002-0x80000004. cpu_name must hold 48 bytes.
+void get_cpu_name(char *cpu_name)
+{
+	uint32_t eax, ebx, ecx, edx;
+
+	__cpuid(0x80000002u, eax, ebx, ecx, edx);
+
+	*(uint32_t *)&cpu_name[0]  = eax;
+	*(uint32_t *)&cpu_name[4]  = ebx;
+	*(uint32_t *)&cpu_name[8]  = ecx;
+	*(uint32_t *)&cpu_name[12] = edx;
+
+	__cpuid(0x80000003u, eax, ebx, ecx, edx);
+
+	*(uint32_t *)&cpu_name[16+0]  = eax;
+	*(uint32_t *)&cpu_name[16+4]  = ebx;
+	*(uint32_t *)&cpu_name[16+8]  = ecx;
+	*(uint32_t *)&cpu_name[16+12] = edx;
+
+	__cpuid(0x80000004u, eax, ebx, ecx, edx);
+
+	*(uint32_t *)&cpu_name[32+0]  = eax;
+	*(uint32_t *)&cpu_name[32+4]  = ebx;
+	*(uint32_t *)&cpu_name[32+8]  = ecx;
+	*(uint32_t *)&cpu_name[32+12] = edx;
+}
+
+// Infer the number of AVX-512 VPUs per core from the marketing name,
+// since cpuid does not report it directly. Returns 1 or 2, or -1 if
+// the model cannot be identified.
+int vpu_count()
+{
+	char cpu_name[48] = {0};
+	char *loc;
+	char model_num[5];
+	int sku;
+
+	get_cpu_name(cpu_name);
+
+	if (strstr(cpu_name, "Intel(R) Xeon(R)") != NULL)
+	{
+		// Find the four-digit model number that follows the brand
+		// modifier (Platinum, Gold, Silver, Bronze, or W).
+		loc = strstr(cpu_name, "Platinum");
+		if (loc == NULL)
+			loc = strstr(cpu_name, "Gold");
+		if (loc == NULL)
+			loc = strstr(cpu_name, "Silver");
+		if (loc == NULL)
+			loc = strstr(cpu_name, "Bronze");
+		if (loc == NULL)
+			loc = strstr(cpu_name, "W");
+		if (loc == NULL)
+			return -1;
+
+		loc = strstr(loc+1, " ");
+		if (loc == NULL)
+			return -1;
+
+		strncpy(model_num, loc+1, 4);
+		model_num[4] = '\0'; // last valid index of model_num[5]
+
+		sku = atoi(model_num);
+
+		if (8199 >= sku && sku >=
8100) return 2; + else if (6199 >= sku && sku >= 6100) return 2; + else if (sku == 5122) return 2; + else if (5199 >= sku && sku >= 5100) return 1; + else if (4199 >= sku && sku >= 4100) return 1; + else if (3199 >= sku && sku >= 3100) return 1; + else if (2199 >= sku && sku >= 2120) return 2; + else if (2119 >= sku && sku >= 2100) return 1; + else return -1; + } + else if (strstr(cpu_name, "Intel(R) Core(TM) i9") != NULL) + { + return 1; + } + else if (strstr(cpu_name, "Intel(R) Core(TM) i7") != NULL) + { + if (strstr(cpu_name, "7800X") != NULL || + strstr(cpu_name, "7820X") != NULL) + return 1; + else return -1; + } + else + { + return -1; + } +} + #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) int get_cpu_type( int* model, int* part, int* features ) diff --git a/frame/base/bli_cpuid.h b/frame/base/bli_cpuid.h index a9c99fef4..6cdc02387 100644 --- a/frame/base/bli_cpuid.h +++ b/frame/base/bli_cpuid.h @@ -37,6 +37,7 @@ arch_t bli_cpuid_query_id( void ); +bool_t bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features ); bool_t bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); bool_t bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); bool_t bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); @@ -100,6 +101,10 @@ static bool_t bli_cpuid_has_features( uint32_t have, uint32_t want ) #include "cpuid.h" +void get_cpu_name(char *cpu_name); +int vpu_count(); + + enum { VENDOR_INTEL, diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 2ae0223de..e1c0ed23f 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -71,6 +71,11 @@ void bli_gks_init( void ) // bli_config.h. // Intel architectures +#ifdef BLIS_CONFIG_SKX + bli_gks_register_cntx( BLIS_ARCH_SKX, bli_cntx_init_skx, + bli_cntx_init_skx_ref, + bli_cntx_init_skx_ind ); +#endif #ifdef BLIS_CONFIG_KNL bli_gks_register_cntx( BLIS_ARCH_KNL, bli_cntx_init_knl, bli_cntx_init_knl_ref, diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index 339cb1f1c..65dad0d71 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -41,7 +41,9 @@ // // -- Intel64 architectures -- - +#ifdef BLIS_CONFIG_SKX +CNTX_INIT_PROTS( skx ) +#endif #ifdef BLIS_CONFIG_KNL CNTX_INIT_PROTS( knl ) #endif @@ -121,7 +123,9 @@ CNTX_INIT_PROTS( generic ) #endif // -- Intel64 architectures -- - +#ifdef BLIS_FAMILY_SKX +#include "bli_family_skx.h" +#endif #ifdef BLIS_FAMILY_KNL #include "bli_family_knl.h" #endif @@ -189,7 +193,9 @@ CNTX_INIT_PROTS( generic ) // // -- Intel64 architectures -- - +#ifdef BLIS_KERNELS_SKX +#include "bli_kernels_skx.h" +#endif #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" #endif diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index c15b0ed17..d6c1be0d7 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -815,7 +815,8 @@ typedef enum typedef enum { // Intel - BLIS_ARCH_KNL = 0, + BLIS_ARCH_SKX =0, + BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, @@ -842,7 +843,7 @@ typedef enum } arch_t; -#define BLIS_NUM_ARCHS 16 +#define BLIS_NUM_ARCHS 17 // diff --git a/kernels/skx/3/bli_avx512_macros.h b/kernels/skx/3/bli_avx512_macros.h new file mode 100644 index 000000000..2dfe6c6f6 --- /dev/null +++ b/kernels/skx/3/bli_avx512_macros.h @@ -0,0 +1,171 @@ +#ifndef BLIS_AVX512_MACROS_H +#define BLIS_AVX512_MACROS_H + +// +// Assembly macros to make AVX-512 with AT&T syntax somewhat less painful +// + 
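+// All macros take their operands in Intel (destination-first) order and
+// emit AT&T-order text; for example, inside an __asm__ block
+//
+//     MOV(RSI, VAR(k))
+//
+// expands to the string "mov %[k], %%rsi\n\t". This keeps the kernels
+// readable while still assembling as GCC extended inline assembly.
+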
+#define COMMENT_BEGIN "#" +#define COMMENT_END + +#define STRINGIFY(...) #__VA_ARGS__ +#define ASM(...) STRINGIFY(__VA_ARGS__) "\n\t" +#define LABEL(label) STRINGIFY(label) ":\n\t" + +#define XMM(x) %%xmm##x +#define YMM(x) %%ymm##x +#define ZMM(x) %%zmm##x +#define EAX %%eax +#define EBX %%ebx +#define ECX %%ecx +#define EDX %%edx +#define EBP %%ebp +#define EDI %%edi +#define ESI %%esi +#define RAX %%rax +#define RBX %%rbx +#define RCX %%rcx +#define RDX %%rdx +#define RBP %%rbp +#define RDI %%rdi +#define RSI %%rsi +#define K(x) %%k##x +#define R(x) %%r##x +#define R8 %%r8 +#define R9 %%r9 +#define R10 %%r10 +#define R11 %%r11 +#define R12 %%r12 +#define R13 %%r13 +#define R14 %%r14 +#define R15 %%r15 +#define RD(x) %%r##x##d +#define R8D %%r8d +#define R9D %%r9d +#define R10D %%r10d +#define R11D %%r11d +#define R12D %%r12d +#define R13D %%r13d +#define R14D %%r14d +#define R15D %%r15d +#define IMM(x) $##x +#define VAR(x) %[x] + +#define MEM_4(reg,off,scale,disp) disp(reg,off,scale) +#define MEM_3(reg,off,scale) (reg,off,scale) +#define MEM_2(reg,disp) disp(reg) +#define MEM_1(reg) (reg) + +#define MEM_1TO8_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to8%} +#define MEM_1TO8_3(reg,off,scale) MEM(reg,off,scale) %{1to8%} +#define MEM_1TO8_2(reg,disp) MEM(reg,disp) %{1to8%} +#define MEM_1TO8_1(reg) MEM(reg) %{1to8%} + +#define MEM_1TO16_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to16%} +#define MEM_1TO16_3(reg,off,scale) MEM(reg,off,scale) %{1to16%} +#define MEM_1TO16_2(reg,disp) MEM(reg,disp) %{1to16%} +#define MEM_1TO16_1(reg) MEM(reg) %{1to16%} + +#define GET_MACRO(_1,_2,_3,_4,NAME,...) NAME +#define MEM(...) GET_MACRO(__VA_ARGS__,MEM_4,MEM_3,MEM_2,MEM_1)(__VA_ARGS__) +#define MEM_1TO8(...) GET_MACRO(__VA_ARGS__,MEM_1TO8_4,MEM_1TO8_3,MEM_1TO8_2,MEM_1TO8_1)(__VA_ARGS__) +#define MEM_1TO16(...) 
GET_MACRO(__VA_ARGS__,MEM_1TO16_4,MEM_1TO16_3,MEM_1TO16_2,MEM_1TO16_1)(__VA_ARGS__) + +#define MASK_K(n) %{%%k##n%} +#define MASK_KZ(n) %{%%k##n%}%{z%} +#define KMOV(to,from) ASM(kmovw from, to) +#define JKNZD(kreg,label) \ + ASM(kortestw kreg, kreg) \ + ASM(jnz label) +#define KXNORW(_0, _1, _2) ASM(kxnorw _2, _1, _0) +#define KSHIFTRW(_0, _1, _2) ASM(kshiftrw _2, _1, _0) + +#define ALIGN16 ASM(.p2align 4) +#define ALIGN32 ASM(.p2align 5) +#define RDTSC ASM(rdstc) +#define MOV(_0, _1) ASM(mov _1, _0) +#define MOVD(_0, _1) ASM(movd _1, _0) +#define MOVL(_0, _1) ASM(movl _1, _0) +#define MOVQ(_0, _1) ASM(movq _1, _0) +#define VMOVD(_0, _1) ASM(vmovd _1, _0) +#define VMOVQ(_0, _1) ASM(vmovq _1, _0) +#define CMP(_0, _1) ASM(cmp _1, _0) +#define AND(_0, _1) ASM(and _1, _0) +#define ADD(_0, _1) ASM(add _1, _0) +#define SUB(_0, _1) ASM(sub _1, _0) +#define SAL(_0, _1) ASM(sal _1, _0) +#define SHLX(_0, _1, _2) ASM(shlx _2, _1, _0) +#define SAR(_0, _1) ASM(sar _1, _0) +#define SAL1(_0) ASM(sal _0) +#define SAR1(_0) ASM(sar _0) +#define LEA(_0, _1) ASM(lea _1, _0) +#define TEST(_0, _1) ASM(test _1, _0) +#define DEC(_0) ASM(dec _0) +#define JLE(_0) ASM(jle _0) +#define JL(_0) ASM(jl _0) +#define JNZ(_0) ASM(jnz _0) +#define JZ(_0) ASM(jz _0) +#define JNE(_0) ASM(jne _0) +#define JE(_0) ASM(je _0) +#define JNC(_0) ASM(jnc _0) +#define JC(_0) ASM(jc _0) +#define JMP(_0) ASM(jmp _0) +#define VCOMISS(_0, _1) ASM(vcomiss _1, _0) +#define VCOMISD(_0, _1) ASM(vcomisd _1, _0) +#define VGATHERDPS(_0, _1) ASM(vgatherdps _1, _0) +#define VSCATTERDPS(_0, _1) ASM(vscatterdps _1, _0) +#define VGATHERDPD(_0, _1) ASM(vgatherdpd _1, _0) +#define VSCATTERDPD(_0, _1) ASM(vscatterdpd _1, _0) +#define VGATHERQPS(_0, _1) ASM(vgatherqps _1, _0) +#define VSCATTERQPS(_0, _1) ASM(vscatterqps _1, _0) +#define VGATHERQPD(_0, _1) ASM(vgatherqpd _1, _0) +#define VSCATTERQPD(_0, _1) ASM(vscatterqpd _1, _0) +#define VMULSS(_0, _1, _2) ASM(vmulss _2, _1, _0) +#define VMULSD(_0, _1, _2) ASM(vmulsd _2, _1, _0) +#define VMULPS(_0, _1, _2) ASM(vmulps _2, _1, _0) +#define VMULPD(_0, _1, _2) ASM(vmulpd _2, _1, _0) +#define VPMULLD(_0, _1, _2) ASM(vpmulld _2, _1, _0) +#define VPMULLQ(_0, _1, _2) ASM(vpmullq _2, _1, _0) +#define VPADDD(_0, _1, _2) ASM(vpaddd _2, _1, _0) +#define VPSLLD(_0, _1, _2) ASM(vpslld _2, _1, _0) +#define VPXORD(_0, _1, _2) ASM(vpxord _2, _1, _0) +#define VXORPD(_0, _1, _2) ASM(vxorpd _2, _1, _0) +#define VFMADD132PS(_0, _1, _2) ASM(vfmadd132ps _2, _1, _0) +#define VFMADD213PS(_0, _1, _2) ASM(vfmadd213ps _2, _1, _0) +#define VFMADD231PS(_0, _1, _2) ASM(vfmadd231ps _2, _1, _0) +#define VFMADD132PD(_0, _1, _2) ASM(vfmadd132pd _2, _1, _0) +#define VFMADD213PD(_0, _1, _2) ASM(vfmadd213pd _2, _1, _0) +#define VFMADD231PD(_0, _1, _2) ASM(vfmadd231pd _2, _1, _0) +#define VMOVDQA(_0, _1) ASM(vmovdqa _1, _0) +#define VMOVDQA32(_0, _1) ASM(vmovdqa32 _1, _0) +#define VMOVDQA64(_0, _1) ASM(vmovdqa64 _1, _0) +#define VMOVSS(_0, _1) ASM(vmovss _1, _0) +#define VMOVSD(_0, _1) ASM(vmovsd _1, _0) +#define VMOVAPS(_0, _1) ASM(vmovaps _1, _0) +#define VMOVUPS(_0, _1) ASM(vmovups _1, _0) +#define VMOVAPD(_0, _1) ASM(vmovapd _1, _0) +#define VMOVUPD(_0, _1) ASM(vmovupd _1, _0) +#define VBROADCASTSS(_0, _1) ASM(vbroadcastss _1, _0) +#define VBROADCASTSD(_0, _1) ASM(vbroadcastsd _1, _0) +#define VPBROADCASTD(_0, _1) ASM(vpbroadcastd _1, _0) +#define VPBROADCASTQ(_0, _1) ASM(vpbroadcastq _1, _0) +#define VBROADCASTF64X4(_0, _1) ASM(vbroadcastf64x4 _1, _0) +#define VINSERTF64X4(_0, _1, _2, _3) ASM(vinsertf64x4 _3, _2, _1, _0) +#define 
VEXTRACTF64X4(_0, _1, _2) ASM(vextractf64x4 _2, _1, _0)
+#define VUNPCKLPD(_0, _1, _2) ASM(vunpcklpd _2, _1, _0)
+#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0)
+#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0)
+#define VUNPCKLPS(_0, _1, _2) ASM(vunpcklps _2, _1, _0)
+#define VUNPCKHPS(_0, _1, _2) ASM(vunpckhps _2, _1, _0)
+#define VSHUFPS(_0, _1, _2, _3) ASM(vshufps _3, _2, _1, _0)
+#define VPERM2F128(_0, _1, _2, _3) ASM(vperm2f128 _3, _2, _1, _0)
+#define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS)
+#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS)
+#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS)
+#define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS)
+#define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS)
+#define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS)
+#define VSCATTERPFDPD(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dpd ADDRESS)
+#define VZEROUPPER() ASM(vzeroupper)
+
+#endif
diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
new file mode 100644
index 000000000..0e705f763
--- /dev/null
+++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
@@ -0,0 +1,547 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <stdint.h>
+
+#include "bli_avx512_macros.h"
+
+#define A_L1_PREFETCH_DIST 4 // should be a multiple of 2
+
+/* The pointer to B is moved ahead by one iteration of k before the
+   loop starts, so a prefetch distance of 4 actually prefetches
+   3 k iterations ahead. */
+#define B_L1_PREFETCH_DIST 4
+
+#define TAIL_NITER 8
+
+#define CACHELINE_SIZE 64 // size of a cache line in bytes
+
+/* During each subiteration, prefetch 2 cache lines of A, UNROLL
+ * factor ahead. 2 cache lines = 16 doubles (MR).
+ * */
+#define PREFETCH_A_L1(n, k) \
+    PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*16*8 + (2*n+k) * CACHELINE_SIZE))
+
+/* Preloading B for the first iteration of the main loop,
+ * for subiter(1), subiter(2), and subiter(3) */ +#define PREFETCH_B_L1_1ITER \ + PREFETCH(0, MEM(RBX )) \ + PREFETCH(0, MEM(RBX, CACHELINE_SIZE)) \ + PREFETCH(0, MEM(RBX, 2*CACHELINE_SIZE)) \ + PREFETCH(0, MEM(RBX, 3*CACHELINE_SIZE)) \ + PREFETCH(0, MEM(RBX, 4*CACHELINE_SIZE)) \ + PREFETCH(0, MEM(RBX, 5*CACHELINE_SIZE)) + +#define LOOP_ALIGN ALIGN16 + +#define UPDATE_C(R1,R2,R3,R4) \ +\ + VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ + VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ + VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ + VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ + VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX,0*64)) \ + VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX,1*64)) \ + VFMADD231PD(ZMM(R3), ZMM(1), MEM(RCX,RAX,1,0*64)) \ + VFMADD231PD(ZMM(R4), ZMM(1), MEM(RCX,RAX,1,1*64)) \ + VMOVUPD(MEM(RCX,0*64), ZMM(R1)) \ + VMOVUPD(MEM(RCX,1*64), ZMM(R2)) \ + VMOVUPD(MEM(RCX,RAX,1,0*64), ZMM(R3)) \ + VMOVUPD(MEM(RCX,RAX,1,1*64), ZMM(R4)) \ + LEA(RCX, MEM(RCX,RAX,2)) + +#define UPDATE_C_BZ(R1,R2,R3,R4) \ +\ + VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ + VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ + VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ + VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ + VMOVUPD(MEM(RCX,0*64), ZMM(R1)) \ + VMOVUPD(MEM(RCX,1*64), ZMM(R2)) \ + VMOVUPD(MEM(RCX,RAX,1,0*64), ZMM(R3)) \ + VMOVUPD(MEM(RCX,RAX,1,1*64), ZMM(R4)) \ + LEA(RCX, MEM(RCX,RAX,2)) + +#define UPDATE_C_ROW_SCATTERED(R1,R2,R3,R4) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ + VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \ + VFMADD231PD(ZMM(R1), ZMM(6), ZMM(1)) \ + VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(2), ZMM(R1)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ + VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(3),8)) \ + VFMADD231PD(ZMM(R2), ZMM(6), ZMM(1)) \ + VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(2), ZMM(R2)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ + VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \ + VFMADD231PD(ZMM(R3), ZMM(6), ZMM(1)) \ + VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(2), ZMM(R3)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ + VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(3),8)) \ + VFMADD231PD(ZMM(R4), ZMM(6), ZMM(1)) \ + VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(2), ZMM(R4)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) + +#define UPDATE_C_BZ_ROW_SCATTERED(R1,R2,R3,R4) \ +\ + KXNORW(K(1), K(0), K(0)) \ + VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ + VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(1), ZMM(R1)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ + VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(1), ZMM(R2)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ + VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(1), ZMM(R3)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ + VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(1), ZMM(R4)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) + +#ifdef PREFETCH_C_L2 +#undef PREFETCH_C_L2 +#define PREFETCH_C_L2 \ +\ + PREFETCH(1, MEM(RCX, 0*64)) \ + PREFETCH(1, MEM(RCX, 1*64)) \ + \ + PREFETCH(1, MEM(RCX,R12,1,0*64)) \ + PREFETCH(1, MEM(RCX,R12,1,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R12,2,0*64)) \ + PREFETCH(1, MEM(RCX,R12,2,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R13,1,0*64)) \ + PREFETCH(1, MEM(RCX,R13,1,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R12,4,0*64)) \ + PREFETCH(1, MEM(RCX,R12,4,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R14,1,0*64)) \ + PREFETCH(1, MEM(RCX,R14,1,1*64)) \ + \ + PREFETCH(1, 
MEM(RCX,R13,2,0*64)) \ + PREFETCH(1, MEM(RCX,R13,2,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R15,1,0*64)) \ + PREFETCH(1, MEM(RCX,R15,1,1*64)) \ + \ + PREFETCH(1, MEM(RDX, 0*64)) \ + PREFETCH(1, MEM(RDX, 1*64)) \ + \ + PREFETCH(1, MEM(RDX,R12,1,0*64)) \ + PREFETCH(1, MEM(RDX,R12,1,1*64)) \ + \ + PREFETCH(1, MEM(RDX,R12,2,0*64)) \ + PREFETCH(1, MEM(RDX,R12,2,1*64)) \ + \ + PREFETCH(1, MEM(RDX,R13,1,0*64)) \ + PREFETCH(1, MEM(RDX,R13,1,1*64)) + +#else +#undef PREFETCH_C_L2 +#define PREFETCH_C_L2 +#endif + + +#define PREFETCH_C_L1 \ +\ + PREFETCHW0(MEM(RCX, 0*64)) \ + PREFETCHW0(MEM(RCX, 1*64)) \ + PREFETCHW0(MEM(RCX,R12,1,0*64)) \ + PREFETCHW0(MEM(RCX,R12,1,1*64)) \ + PREFETCHW0(MEM(RCX,R12,2,0*64)) \ + PREFETCHW0(MEM(RCX,R12,2,1*64)) \ + PREFETCHW0(MEM(RCX,R13,1,0*64)) \ + PREFETCHW0(MEM(RCX,R13,1,1*64)) \ + PREFETCHW0(MEM(RCX,R12,4,0*64)) \ + PREFETCHW0(MEM(RCX,R12,4,1*64)) \ + PREFETCHW0(MEM(RCX,R14,1,0*64)) \ + PREFETCHW0(MEM(RCX,R14,1,1*64)) \ + PREFETCHW0(MEM(RCX,R13,2,0*64)) \ + PREFETCHW0(MEM(RCX,R13,2,1*64)) \ + PREFETCHW0(MEM(RCX,R15,1,0*64)) \ + PREFETCHW0(MEM(RCX,R15,1,1*64)) \ + PREFETCHW0(MEM(RDX, 0*64)) \ + PREFETCHW0(MEM(RDX, 1*64)) \ + PREFETCHW0(MEM(RDX,R12,1,0*64)) \ + PREFETCHW0(MEM(RDX,R12,1,1*64)) \ + PREFETCHW0(MEM(RDX,R12,2,0*64)) \ + PREFETCHW0(MEM(RDX,R12,2,1*64)) \ + PREFETCHW0(MEM(RDX,R13,1,0*64)) \ + PREFETCHW0(MEM(RDX,R13,1,1*64)) + +// +// n: index in unrolled loop +// +// a: ZMM register to load into +// b: ZMM register to read from +// +// ...: addressing for A, except for offset +// +#define SUBITER(n) \ +\ + PREFETCH_A_L1(n, 0) \ + \ + VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 0)*8)) \ + VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 1)*8)) \ + VFMADD231PD(ZMM( 8), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM( 9), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(10), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(11), ZMM(1), ZMM(4)) \ + \ + VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 2)*8)) \ + VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 3)*8)) \ + VFMADD231PD(ZMM(12), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(13), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(14), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(15), ZMM(1), ZMM(4)) \ + \ + VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 4)*8)) \ + VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 5)*8)) \ + VFMADD231PD(ZMM(16), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(17), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(18), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(19), ZMM(1), ZMM(4)) \ + \ + PREFETCH_A_L1(n, 1) \ + \ + VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 6)*8)) \ + VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 7)*8)) \ + VFMADD231PD(ZMM(20), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(21), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(22), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(23), ZMM(1), ZMM(4)) \ + \ + VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 8)*8)) \ + VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 9)*8)) \ + VFMADD231PD(ZMM(24), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(25), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(26), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(27), ZMM(1), ZMM(4)) \ + \ + VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+10)*8)) \ + VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+11)*8)) \ + VFMADD231PD(ZMM(28), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(29), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(30), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(31), ZMM(1), ZMM(4)) \ + \ + VMOVAPD(ZMM(0), MEM(RAX,(16*n+0)*8)) \ + VMOVAPD(ZMM(1), MEM(RAX,(16*n+8)*8)) + +//This is an array used for the scatter/gather instructions. 
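+// The update code loads offsets 0-7 into zmm2 and 8-15 into zmm3,
+// broadcasts rs_c, and multiplies the two element-wise, so that a
+// gather such as VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(2),8))
+// fetches c[i*rs_c] for i = 0..7 (the scale of 8 is sizeof(double)).
+// This is what lets the kernel update a general-stride C one column
+// of the 16x12 microtile at a time.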
+static int64_t offsets[16] __attribute__((aligned(64))) = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}; + + +void bli_dgemm_skx_asm_16x12_l2( + dim_t k_, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c_, inc_t cs_c_, + auxinfo_t* data, + cntx_t* restrict cntx + ) +{ + (void)data; + (void)cntx; + + const int64_t* offsetPtr = &offsets[0]; + const int64_t k = k_; + const int64_t rs_c = rs_c_; + const int64_t cs_c = cs_c_; + + __asm__ volatile + ( + + VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers + VMOVAPD(YMM( 7), YMM(8)) + VMOVAPD(YMM( 9), YMM(8)) + VMOVAPD(YMM(10), YMM(8)) MOV(RSI, VAR(k)) //loop index + VMOVAPD(YMM(11), YMM(8)) MOV(RAX, VAR(a)) //load address of a + VMOVAPD(YMM(12), YMM(8)) MOV(RBX, VAR(b)) //load address of b + VMOVAPD(YMM(13), YMM(8)) MOV(RCX, VAR(c)) //load address of c + VMOVAPD(YMM(14), YMM(8)) + VMOVAPD(YMM(15), YMM(8)) VMOVAPD(ZMM(0), MEM(RAX, 0*8)) //pre-load a + VMOVAPD(YMM(16), YMM(8)) VMOVAPD(ZMM(1), MEM(RAX, 8*8)) //pre-load a + VMOVAPD(YMM(17), YMM(8)) + VMOVAPD(YMM(18), YMM(8)) + VMOVAPD(YMM(19), YMM(8)) MOV(R12, VAR(cs_c)) //cs_c + VMOVAPD(YMM(20), YMM(8)) LEA(R13, MEM(R12,R12,2)) //*3 + VMOVAPD(YMM(21), YMM(8)) LEA(R14, MEM(R12,R12,4)) //*5 + VMOVAPD(YMM(22), YMM(8)) LEA(R15, MEM(R14,R12,2)) //*7 + VMOVAPD(YMM(23), YMM(8)) LEA(RDX, MEM(RCX,R12,8)) //c + 8*cs_c + VMOVAPD(YMM(24), YMM(8)) + VMOVAPD(YMM(25), YMM(8)) MOV(R8, IMM(16*8)) //mr*sizeof(double) + VMOVAPD(YMM(26), YMM(8)) MOV(R9, IMM(12*8)) //nr*sizeof(double) + VMOVAPD(YMM(27), YMM(8)) + VMOVAPD(YMM(28), YMM(8)) LEA(RAX, MEM(RAX,R8,1)) //adjust a for pre-load + VMOVAPD(YMM(29), YMM(8)) + VMOVAPD(YMM(30), YMM(8)) + VMOVAPD(YMM(31), YMM(8)) + + TEST(RSI, RSI) + JZ(POSTACCUM) + +#ifdef PREFETCH_A_BEFORE + PREFETCH(0, MEM(RAX,0*64)) + PREFETCH(0, MEM(RAX,1*64)) + PREFETCH(0, MEM(RAX,2*64)) + PREFETCH(0, MEM(RAX,3*64)) + PREFETCH(0, MEM(RAX,4*64)) + PREFETCH(0, MEM(RAX,5*64)) + PREFETCH(0, MEM(RAX,6*64)) + PREFETCH(0, MEM(RAX,7*64)) + +#endif + +#ifdef PREFETCH_B_BEFORE + PREFETCH(0, MEM(RBX,0*64)) + PREFETCH(0, MEM(RBX,1*64)) + PREFETCH(0, MEM(RBX,2*64)) + PREFETCH(0, MEM(RBX,3*64)) + PREFETCH(0, MEM(RBX,4*64)) + PREFETCH(0, MEM(RBX,5*64)) +#endif + + PREFETCH_C_L2 + + MOV(RDI, RSI) + AND(RSI, IMM(3)) + SAR(RDI, IMM(2)) + + SUB(RDI, IMM(0+TAIL_NITER)) + JLE(K_SMALL) + + LOOP_ALIGN + LABEL(MAIN_LOOP) + + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8)) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64)) + SUBITER(0) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+128)) + SUBITER(1) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+192)) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+256)) + SUBITER(2) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+320)) + SUBITER(3) + + LEA(RAX, MEM(RAX,R8,4)) + LEA(RBX, MEM(RBX,R9,4)) + + DEC(RDI) + + JNZ(MAIN_LOOP) + + LABEL(K_SMALL) + + PREFETCH_C_L1 + + ADD(RDI, IMM(0+TAIL_NITER)) + JZ(TAIL_LOOP) + + LOOP_ALIGN + LABEL(SMALL_LOOP) + + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8)) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64)) + SUBITER(0) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+128)) + SUBITER(1) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+192)) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+256)) + SUBITER(2) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+320)) + SUBITER(3) + + LEA(RAX, MEM(RAX,R8,4)) + LEA(RBX, MEM(RBX,R9,4)) + + DEC(RDI) + + JNZ(SMALL_LOOP) + + TEST(RSI, RSI) + JZ(POSTACCUM) + + LOOP_ALIGN + LABEL(TAIL_LOOP) + + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8)) 
+ PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64)) + SUBITER(0) + + ADD(RAX, R8) + ADD(RBX, R9) + + DEC(RSI) + + JNZ(TAIL_LOOP) + + LABEL(POSTACCUM) + +#ifdef PREFETCH_A_AFTER + MOV(R8, VAR(a)) + PREFETCH(0, MEM(R8,0*64)) + PREFETCH(0, MEM(R8,1*64)) + PREFETCH(0, MEM(R8,2*64)) + PREFETCH(0, MEM(R8,3*64)) + PREFETCH(0, MEM(R8,4*64)) + PREFETCH(0, MEM(R8,5*64)) + PREFETCH(0, MEM(R8,6*64)) + PREFETCH(0, MEM(R8,7*64)) +#endif + +#ifdef PREFETCH_B_AFTER + MOV(R9, VAR(b)) + PREFETCH(0, MEM(R9,0*64)) + PREFETCH(0, MEM(R9,1*64)) + PREFETCH(0, MEM(R9,2*64)) + PREFETCH(0, MEM(R9,3*64)) + PREFETCH(0, MEM(R9,4*64)) + PREFETCH(0, MEM(R9,5*64)) +#endif + + MOV(RAX, VAR(alpha)) + MOV(RBX, VAR(beta)) + VBROADCASTSD(ZMM(0), MEM(RAX)) + VBROADCASTSD(ZMM(1), MEM(RBX)) + + MOV(RAX, VAR(cs_c)) + LEA(RAX, MEM(,RAX,8)) + MOV(RBX, VAR(rs_c)) + + // Check if C is column stride. If not, jump to the slow scattered update + CMP(RBX, IMM(1)) + JNE(SCATTEREDUPDATE) + + VCOMISD(XMM(1), XMM(7)) + JE(COLSTORBZ) + + UPDATE_C( 8, 9,10,11) + UPDATE_C(12,13,14,15) + UPDATE_C(16,17,18,19) + UPDATE_C(20,21,22,23) + UPDATE_C(24,25,26,27) + UPDATE_C(28,29,30,31) + + JMP(END) + LABEL(COLSTORBZ) + + UPDATE_C_BZ( 8, 9,10,11) + UPDATE_C_BZ(12,13,14,15) + UPDATE_C_BZ(16,17,18,19) + UPDATE_C_BZ(20,21,22,23) + UPDATE_C_BZ(24,25,26,27) + UPDATE_C_BZ(28,29,30,31) + + JMP(END) + LABEL(SCATTEREDUPDATE) + + MOV(RDI, VAR(offsetPtr)) + VMOVDQA64(ZMM(2), MEM(RDI,0*64)) + VMOVDQA64(ZMM(3), MEM(RDI,1*64)) + VPBROADCASTQ(ZMM(6), RBX) + VPMULLQ(ZMM(2), ZMM(6), ZMM(2)) + VPMULLQ(ZMM(3), ZMM(6), ZMM(3)) + + VCOMISD(XMM(1), XMM(7)) + JE(SCATTERBZ) + + UPDATE_C_ROW_SCATTERED( 8, 9,10,11) + UPDATE_C_ROW_SCATTERED(12,13,14,15) + UPDATE_C_ROW_SCATTERED(16,17,18,19) + UPDATE_C_ROW_SCATTERED(20,21,22,23) + UPDATE_C_ROW_SCATTERED(24,25,26,27) + UPDATE_C_ROW_SCATTERED(28,29,30,31) + + JMP(END) + LABEL(SCATTERBZ) + + UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11) + UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15) + UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19) + UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23) + UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27) + UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31) + + LABEL(END) + + VZEROUPPER() + + : // output operands + : // input operands + [k] "m" (k), + [a] "m" (a), + [b] "m" (b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [offsetPtr] "m" (offsetPtr) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", + "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", + "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", + "zmm30", "zmm31", "memory" + ); +} diff --git a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c new file mode 100644 index 000000000..a69b92086 --- /dev/null +++ b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c @@ -0,0 +1,572 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#include "bli_avx512_macros.h" + +#define CACHELINE_SIZE 64 //size of cache line in bytes + +#define A_L1_PREFETCH_DIST 4 //should be multiple of 2 + +/*The pointer of B is moved ahead by one iteration of k +before the loop starts.Therefore, prefetching 3 k iterations +ahead*/ +#define B_L1_PREFETCH_DIST 4 + +#define TAIL_NITER 8 + + +/* During each subiteration, prefetching 2 cache lines of B + * UNROLL factor ahead. 2cache lines = 32 floats (NR). + * */ +#define PREFETCH_A_L1(n, k) \ + PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*32*4 + (2*n+k) * CACHELINE_SIZE)) + +#define LOOP_ALIGN ALIGN16 + +#define UPDATE_C(R1,R2,R3,R4) \ +\ + VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \ + VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \ + VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \ + VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \ + VFMADD231PS(ZMM(R1), ZMM(1), MEM(RCX,0*64)) \ + VFMADD231PS(ZMM(R2), ZMM(1), MEM(RCX,1*64)) \ + VFMADD231PS(ZMM(R3), ZMM(1), MEM(RCX,RAX,1,0*64)) \ + VFMADD231PS(ZMM(R4), ZMM(1), MEM(RCX,RAX,1,1*64)) \ + VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \ + VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \ + VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \ + VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \ + LEA(RCX, MEM(RCX,RAX,2)) + +#define UPDATE_C_BZ(R1,R2,R3,R4) \ +\ + VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \ + VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \ + VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \ + VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \ + VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \ + VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \ + VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \ + VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \ + LEA(RCX, MEM(RCX,RAX,2)) + +#define UPDATE_C_ROW_SCATTERED(R1,R2,R3,R4) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + KXNORW(K(3), K(0), K(0)) \ + KXNORW(K(4), K(0), K(0)) \ + VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R1), IMM(1)) \ + VGATHERQPS(YMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \ + VGATHERQPS(YMM(7) MASK_K(2), MEM(RCX,ZMM(3),8)) \ + VFMADD231PS(YMM(R1), YMM(6), YMM(1)) \ + VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \ + VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(3), YMM(R1)) \ + VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(4), YMM( 5)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + KXNORW(K(3), K(0), K(0)) \ + KXNORW(K(4), K(0), K(0)) \ + VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R2), IMM(1)) \ + VGATHERQPS(YMM(6) MASK_K(1), 
MEM(RDX,ZMM(2),8)) \ + VGATHERQPS(YMM(7) MASK_K(2), MEM(RDX,ZMM(3),8)) \ + VFMADD231PS(YMM(R2), YMM(6), YMM(1)) \ + VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \ + VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(3), YMM(R2)) \ + VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(4), YMM( 5)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) \ + LEA(RDX, MEM(RDX,RAX,1)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + KXNORW(K(3), K(0), K(0)) \ + KXNORW(K(4), K(0), K(0)) \ + VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R3), IMM(1)) \ + VGATHERQPS(YMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \ + VGATHERQPS(YMM(7) MASK_K(2), MEM(RCX,ZMM(3),8)) \ + VFMADD231PS(YMM(R3), YMM(6), YMM(1)) \ + VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \ + VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(3), YMM(R3)) \ + VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(4), YMM( 5)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + KXNORW(K(3), K(0), K(0)) \ + KXNORW(K(4), K(0), K(0)) \ + VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R4), IMM(1)) \ + VGATHERQPS(YMM(6) MASK_K(1), MEM(RDX,ZMM(2),8)) \ + VGATHERQPS(YMM(7) MASK_K(2), MEM(RDX,ZMM(3),8)) \ + VFMADD231PS(YMM(R4), YMM(6), YMM(1)) \ + VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \ + VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(3), YMM(R4)) \ + VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(4), YMM( 5)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) \ + LEA(RDX, MEM(RDX,RAX,1)) + +#define UPDATE_C_BZ_ROW_SCATTERED(R1,R2,R3,R4) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R1), IMM(1)) \ + VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(1), YMM(R1)) \ + VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(2), YMM( 5)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R2), IMM(1)) \ + VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(1), YMM(R2)) \ + VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(2), YMM( 5)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) \ + LEA(RDX, MEM(RDX,RAX,1)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R3), IMM(1)) \ + VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(1), YMM(R3)) \ + VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(2), YMM( 5)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R4), IMM(1)) \ + VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(1), YMM(R4)) \ + VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(2), YMM( 5)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) \ + LEA(RDX, MEM(RDX,RAX,1)) + +#ifdef PREFETCH_C_L2 +#undef PREFETCH_C_L2 +#define PREFETCH_C_L2 \ +\ + PREFETCH(1, MEM(RCX, 0*64)) \ + PREFETCH(1, MEM(RCX, 1*64)) \ + \ + PREFETCH(1, MEM(RCX,R12,1,0*64)) \ + PREFETCH(1, MEM(RCX,R12,1,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R12,2,0*64)) \ + PREFETCH(1, MEM(RCX,R12,2,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R13,1,0*64)) \ + PREFETCH(1, MEM(RCX,R13,1,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R12,4,0*64)) \ + PREFETCH(1, MEM(RCX,R12,4,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R14,1,0*64)) \ + PREFETCH(1, MEM(RCX,R14,1,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R13,2,0*64)) \ + PREFETCH(1, MEM(RCX,R13,2,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R15,1,0*64)) \ + PREFETCH(1, MEM(RCX,R15,1,1*64)) \ + \ + PREFETCH(1, MEM(RDX, 0*64)) \ + PREFETCH(1, MEM(RDX, 1*64)) \ + \ + PREFETCH(1, MEM(RDX,R12,1,0*64)) \ + PREFETCH(1, MEM(RDX,R12,1,1*64)) \ + \ + PREFETCH(1, MEM(RDX,R12,2,0*64)) \ + PREFETCH(1, MEM(RDX,R12,2,1*64)) \ + \ + PREFETCH(1, MEM(RDX,R13,1,0*64)) \ + PREFETCH(1, MEM(RDX,R13,1,1*64)) + +#else +#undef PREFETCH_C_L2 
+#define PREFETCH_C_L2
+#endif
+
+
+#define PREFETCH_C_L1 \
+\
+    PREFETCHW0(MEM(RCX,      0*64)) \
+    PREFETCHW0(MEM(RCX,      1*64)) \
+    PREFETCHW0(MEM(RCX,R12,1,0*64)) \
+    PREFETCHW0(MEM(RCX,R12,1,1*64)) \
+    PREFETCHW0(MEM(RCX,R12,2,0*64)) \
+    PREFETCHW0(MEM(RCX,R12,2,1*64)) \
+    PREFETCHW0(MEM(RCX,R13,1,0*64)) \
+    PREFETCHW0(MEM(RCX,R13,1,1*64)) \
+    PREFETCHW0(MEM(RCX,R12,4,0*64)) \
+    PREFETCHW0(MEM(RCX,R12,4,1*64)) \
+    PREFETCHW0(MEM(RCX,R14,1,0*64)) \
+    PREFETCHW0(MEM(RCX,R14,1,1*64)) \
+    PREFETCHW0(MEM(RCX,R13,2,0*64)) \
+    PREFETCHW0(MEM(RCX,R13,2,1*64)) \
+    PREFETCHW0(MEM(RCX,R15,1,0*64)) \
+    PREFETCHW0(MEM(RCX,R15,1,1*64)) \
+    PREFETCHW0(MEM(RDX,      0*64)) \
+    PREFETCHW0(MEM(RDX,      1*64)) \
+    PREFETCHW0(MEM(RDX,R12,1,0*64)) \
+    PREFETCHW0(MEM(RDX,R12,1,1*64)) \
+    PREFETCHW0(MEM(RDX,R12,2,0*64)) \
+    PREFETCHW0(MEM(RDX,R12,2,1*64)) \
+    PREFETCHW0(MEM(RDX,R13,1,0*64)) \
+    PREFETCHW0(MEM(RDX,R13,1,1*64))
+
+//
+// n: index in the unrolled loop (0..3)
+//
+// Each subiteration broadcasts two elements of B at a time into
+// zmm3/zmm4, accumulates them against the two preloaded A vectors
+// (zmm0/zmm1), and then preloads A for the next subiteration. Note
+// that B elements are floats here, so offsets scale by 4 bytes.
+//
+#define SUBITER(n) \
+\
+    PREFETCH_A_L1(n, 0) \
+    \
+    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 0)*4)) \
+    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 1)*4)) \
+    VFMADD231PS(ZMM( 8), ZMM(0), ZMM(3)) \
+    VFMADD231PS(ZMM( 9), ZMM(1), ZMM(3)) \
+    VFMADD231PS(ZMM(10), ZMM(0), ZMM(4)) \
+    VFMADD231PS(ZMM(11), ZMM(1), ZMM(4)) \
+    \
+    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 2)*4)) \
+    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 3)*4)) \
+    VFMADD231PS(ZMM(12), ZMM(0), ZMM(3)) \
+    VFMADD231PS(ZMM(13), ZMM(1), ZMM(3)) \
+    VFMADD231PS(ZMM(14), ZMM(0), ZMM(4)) \
+    VFMADD231PS(ZMM(15), ZMM(1), ZMM(4)) \
+    \
+    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 4)*4)) \
+    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 5)*4)) \
+    VFMADD231PS(ZMM(16), ZMM(0), ZMM(3)) \
+    VFMADD231PS(ZMM(17), ZMM(1), ZMM(3)) \
+    VFMADD231PS(ZMM(18), ZMM(0), ZMM(4)) \
+    VFMADD231PS(ZMM(19), ZMM(1), ZMM(4)) \
+    \
+    PREFETCH_A_L1(n, 1) \
+    \
+    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 6)*4)) \
+    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 7)*4)) \
+    VFMADD231PS(ZMM(20), ZMM(0), ZMM(3)) \
+    VFMADD231PS(ZMM(21), ZMM(1), ZMM(3)) \
+    VFMADD231PS(ZMM(22), ZMM(0), ZMM(4)) \
+    VFMADD231PS(ZMM(23), ZMM(1), ZMM(4)) \
+    \
+    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 8)*4)) \
+    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 9)*4)) \
+    VFMADD231PS(ZMM(24), ZMM(0), ZMM(3)) \
+    VFMADD231PS(ZMM(25), ZMM(1), ZMM(3)) \
+    VFMADD231PS(ZMM(26), ZMM(0), ZMM(4)) \
+    VFMADD231PS(ZMM(27), ZMM(1), ZMM(4)) \
+    \
+    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+10)*4)) \
+    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+11)*4)) \
+    VFMADD231PS(ZMM(28), ZMM(0), ZMM(3)) \
+    VFMADD231PS(ZMM(29), ZMM(1), ZMM(3)) \
+    VFMADD231PS(ZMM(30), ZMM(0), ZMM(4)) \
+    VFMADD231PS(ZMM(31), ZMM(1), ZMM(4)) \
+    \
+    VMOVAPD(ZMM(0), MEM(RAX,(32*n+ 0)*4)) \
+    VMOVAPD(ZMM(1), MEM(RAX,(32*n+16)*4))
+
+//This is an array used for the scatter/gather instructions.
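+// In the sgemm update, each 16-float ZMM accumulator is split into two
+// YMM halves (VEXTRACTF64X4) so that VGATHERQPS/VSCATTERQPS can move
+// eight floats per instruction using the qword offsets in zmm2/zmm3
+// scaled by the row stride of C; a general-stride C is thus updated
+// one 32-float microtile column at a time, four gathers per column.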
+static int64_t offsets[16] __attribute__((aligned(64))) = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}; + +void bli_sgemm_skx_asm_32x12_l2( + dim_t k_, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c_, inc_t cs_c_, + auxinfo_t* data, + cntx_t* restrict cntx + ) +{ + (void)data; + (void)cntx; + + const int64_t* offsetPtr = &offsets[0]; + const int64_t k = k_; + const int64_t rs_c = rs_c_; + const int64_t cs_c = cs_c_; + + __asm__ volatile + ( + + VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers + VMOVAPD(YMM( 7), YMM(8)) + VMOVAPD(YMM( 9), YMM(8)) + VMOVAPD(YMM(10), YMM(8)) MOV(RSI, VAR(k)) //loop index + VMOVAPD(YMM(11), YMM(8)) MOV(RAX, VAR(a)) //load address of a + VMOVAPD(YMM(12), YMM(8)) MOV(RBX, VAR(b)) //load address of b + VMOVAPD(YMM(13), YMM(8)) MOV(RCX, VAR(c)) //load address of c + VMOVAPD(YMM(14), YMM(8)) + VMOVAPD(YMM(15), YMM(8)) VMOVAPD(ZMM(0), MEM(RAX, 0*4)) //pre-load a + VMOVAPD(YMM(16), YMM(8)) VMOVAPD(ZMM(1), MEM(RAX, 16*4)) //pre-load a + VMOVAPD(YMM(17), YMM(8)) + VMOVAPD(YMM(18), YMM(8)) + VMOVAPD(YMM(19), YMM(8)) MOV(R12, VAR(cs_c)) //cs_c + VMOVAPD(YMM(20), YMM(8)) LEA(R13, MEM(R12,R12,2)) //*3 + VMOVAPD(YMM(21), YMM(8)) LEA(R14, MEM(R12,R12,4)) //*5 + VMOVAPD(YMM(22), YMM(8)) LEA(R15, MEM(R14,R12,2)) //*7 + VMOVAPD(YMM(23), YMM(8)) LEA(RDX, MEM(RCX,R12,8)) //c + 8*cs_c + VMOVAPD(YMM(24), YMM(8)) + VMOVAPD(YMM(25), YMM(8)) MOV(R8, IMM(32*4)) //mr*sizeof(float) + VMOVAPD(YMM(26), YMM(8)) MOV(R9, IMM(12*4)) //nr*sizeof(float) + VMOVAPD(YMM(27), YMM(8)) + VMOVAPD(YMM(28), YMM(8)) LEA(RAX, MEM(RAX,R8,1)) //adjust a for pre-load + VMOVAPD(YMM(29), YMM(8)) + VMOVAPD(YMM(30), YMM(8)) + VMOVAPD(YMM(31), YMM(8)) + + TEST(RSI, RSI) + JZ(POSTACCUM) + +#ifdef PREFETCH_A_BEFORE + /* Prefetching 8 cachlines of A (4 iterations worth of data + (32 (MR) x4 (sizeof(float)) x4 iter /64 = 8 cachelines) */ + PREFETCH(0, MEM(RAX,0*64)) + PREFETCH(0, MEM(RAX,1*64)) + PREFETCH(0, MEM(RAX,2*64)) + PREFETCH(0, MEM(RAX,3*64)) + PREFETCH(0, MEM(RAX,4*64)) + PREFETCH(0, MEM(RAX,5*64)) + PREFETCH(0, MEM(RAX,6*64)) + PREFETCH(0, MEM(RAX,7*64)) +#endif + +#ifdef PREFETCH_B_BEFORE + /* Prefetching 3 cachlines of B (4 iterations worth of data + (12 (NR) x 4 (sizeof(float)) x 4 iter /64 = 3 cachelines) */ + PREFETCH(0, MEM(RBX,0*64)) + PREFETCH(0, MEM(RBX,1*64)) + PREFETCH(0, MEM(RBX,2*64)) +#endif + + PREFETCH_C_L2 + + MOV(RDI, RSI) + AND(RSI, IMM(3)) + SAR(RDI, IMM(2)) + + SUB(RDI, IMM(0+TAIL_NITER)) + JLE(K_SMALL) + + LOOP_ALIGN + LABEL(MAIN_LOOP) + + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4)) + SUBITER(0) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+64)) + SUBITER(1) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+128)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,R8,4)) + LEA(RBX, MEM(RBX,R9,4)) + + DEC(RDI) + + JNZ(MAIN_LOOP) + + LABEL(K_SMALL) + + PREFETCH_C_L1 + + ADD(RDI, IMM(0+TAIL_NITER)) + JZ(TAIL_LOOP) + + LOOP_ALIGN + LABEL(SMALL_LOOP) + + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4)) + SUBITER(0) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+64)) + SUBITER(1) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+128)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,R8,4)) + LEA(RBX, MEM(RBX,R9,4)) + + DEC(RDI) + + JNZ(SMALL_LOOP) + + TEST(RSI, RSI) + JZ(POSTACCUM) + + LOOP_ALIGN + LABEL(TAIL_LOOP) + + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4)) + SUBITER(0) + + ADD(RAX, R8) + ADD(RBX, R9) + + DEC(RSI) + + JNZ(TAIL_LOOP) + + + LABEL(POSTACCUM) + +#ifdef PREFETCH_A_AFTER + MOV(R8, VAR(a)) + PREFETCH(0, 
MEM(R8,0*64)) + PREFETCH(0, MEM(R8,1*64)) + PREFETCH(0, MEM(R8,2*64)) + PREFETCH(0, MEM(R8,3*64)) + PREFETCH(0, MEM(R8,4*64)) + PREFETCH(0, MEM(R8,5*64)) + PREFETCH(0, MEM(R8,6*64)) + PREFETCH(0, MEM(R8,7*64)) +#endif + +#ifdef PREFETCH_B_AFTER + MOV(R9, VAR(b)) + PREFETCH(0, MEM(R9,0*64)) + PREFETCH(0, MEM(R9,1*64)) + PREFETCH(0, MEM(R9,2*64)) +#endif + + MOV(RAX, VAR(alpha)) + MOV(RBX, VAR(beta)) + VBROADCASTSS(ZMM(0), MEM(RAX)) + VBROADCASTSS(ZMM(1), MEM(RBX)) + + MOV(RAX, VAR(cs_c)) + LEA(RAX, MEM(,RAX,4)) + MOV(RBX, VAR(rs_c)) + LEA(RBX, MEM(,RBX,4)) + + + // Check if C is column major (rs_c = 1). If not, jump to the slow scattered update + CMP(RBX, IMM(4)) + JNE(SCATTEREDUPDATE) + + VCOMISD(XMM(1), XMM(7)) + JE(COLSTORBZ) + + UPDATE_C( 8, 9,10,11) + UPDATE_C(12,13,14,15) + UPDATE_C(16,17,18,19) + UPDATE_C(20,21,22,23) + UPDATE_C(24,25,26,27) + UPDATE_C(28,29,30,31) + + JMP(END) + LABEL(COLSTORBZ) + + UPDATE_C_BZ( 8, 9,10,11) + UPDATE_C_BZ(12,13,14,15) + UPDATE_C_BZ(16,17,18,19) + UPDATE_C_BZ(20,21,22,23) + UPDATE_C_BZ(24,25,26,27) + UPDATE_C_BZ(28,29,30,31) + + JMP(END) + LABEL(SCATTEREDUPDATE) + + LEA(RDX, MEM(RCX,RBX,8)) + LEA(RDX, MEM(RDX,RBX,8)) + + MOV(RDI, VAR(offsetPtr)) + VMOVDQA64(ZMM(2), MEM(RDI,0*64)) + VMOVDQA64(ZMM(3), MEM(RDI,1*64)) + VPBROADCASTQ(ZMM(6), RBX) + VPMULLQ(ZMM(2), ZMM(6), ZMM(2)) + VPMULLQ(ZMM(3), ZMM(6), ZMM(3)) + + VCOMISD(XMM(1), XMM(7)) + JE(SCATTERBZ) + + UPDATE_C_ROW_SCATTERED( 8, 9,10,11) + UPDATE_C_ROW_SCATTERED(12,13,14,15) + UPDATE_C_ROW_SCATTERED(16,17,18,19) + UPDATE_C_ROW_SCATTERED(20,21,22,23) + UPDATE_C_ROW_SCATTERED(24,25,26,27) + UPDATE_C_ROW_SCATTERED(28,29,30,31) + + JMP(END) + LABEL(SCATTERBZ) + + UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11) + UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15) + UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19) + UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23) + UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27) + UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31) + + LABEL(END) + + VZEROUPPER() + + : // output operands + : // input operands + [k] "m" (k), + [a] "m" (a), + [b] "m" (b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [offsetPtr] "m" (offsetPtr) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", + "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", + "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", + "zmm30", "zmm31", "memory" + ); +} diff --git a/kernels/skx/bli_kernels_skx.h b/kernels/skx/bli_kernels_skx.h new file mode 100644 index 000000000..1217277e2 --- /dev/null +++ b/kernels/skx/bli_kernels_skx.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +GEMM_UKR_PROT( float , s, gemm_skx_asm_32x12_l2 ) +GEMM_UKR_PROT( float , s, gemm_skx_asm_12x32_l2 ) + +GEMM_UKR_PROT( double, d, gemm_skx_asm_16x12_l2 ) + +
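
A note for reviewers: the new micro-kernels can be smoke-tested on hand-packed
panels before exercising them through the full framework. The sketch below is
illustrative only (not part of the patch); it assumes C11 aligned_alloc() and
a built BLIS to link against. Only the packed A panel needs 64-byte alignment
(the kernel loads it with VMOVAPD), B is read via broadcasts and C via
unaligned stores, and rs_c = 1, cs_c = 16 selects the fast column-stored
update path.

    #include <stdio.h>
    #include <stdlib.h>
    #include "blis.h"

    int main( void )
    {
        dim_t  k     = 64;
        double alpha = 1.0, beta = 0.0;

        // One packed micro-panel each: A is 16 x k (column-major),
        // B is k x 12 (row-major), C is a 16x12 column-major microtile.
        double* a = aligned_alloc( 64, 16 * k  * sizeof( double ) );
        double* b = malloc(        12 * k  * sizeof( double ) );
        double* c = malloc(        16 * 12 * sizeof( double ) );

        for ( dim_t i = 0; i < 16 * k; i++ ) a[ i ] = 1.0;
        for ( dim_t i = 0; i < 12 * k; i++ ) b[ i ] = 1.0;

        // beta == 0 takes the store-only (BZ) path; the trailing
        // auxinfo_t*/cntx_t* arguments are unused by this kernel.
        bli_dgemm_skx_asm_16x12_l2( k, &alpha, a, b, &beta,
                                    c, 1, 16, NULL, NULL );

        // With all-ones inputs, every element of C should equal k.
        printf( "c[0] = %.1f (expected %d)\n", c[ 0 ], (int)k );

        free( a ); free( b ); free( c );
        return 0;
    }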