mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Added single-precision micro-kernel for Knights Corner aka MIC aka Xeon Phi
This commit is contained in:
@@ -97,7 +97,7 @@
|
||||
// It is sometimes useful to define the various memory alignments in terms
|
||||
// of some other characteristics of the system, such as the cache line size
|
||||
// and the page size.
|
||||
#define BLIS_CACHE_LINE_SIZE 64
|
||||
#define BLIS_CACHE_LINE_SIZE 256
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
|
||||
@@ -54,35 +54,42 @@
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 256
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 8192
|
||||
#define BLIS_DEFAULT_MC_S 240
|
||||
#define BLIS_DEFAULT_KC_S 240
|
||||
#define BLIS_DEFAULT_NC_S 9600
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 120
|
||||
#define BLIS_DEFAULT_KC_D 240
|
||||
#define BLIS_DEFAULT_NC_D 14400
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 128
|
||||
#define BLIS_DEFAULT_KC_C 256
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
#define BLIS_DEFAULT_4M_MC_C BLIS_DEFAULT_MC_S
|
||||
#define BLIS_DEFAULT_4M_KC_C BLIS_DEFAULT_KC_S
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 64
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
#define BLIS_DEFAULT_3M_MC_C BLIS_DEFAULT_MC_S
|
||||
#define BLIS_DEFAULT_3M_KC_C BLIS_DEFAULT_KC_S
|
||||
|
||||
/*
|
||||
#define BLIS_DEFAULT_MC_C 120
|
||||
#define BLIS_DEFAULT_KC_C 240
|
||||
#define BLIS_DEFAULT_NC_C 9600
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 120
|
||||
#define BLIS_DEFAULT_KC_Z 240
|
||||
#define BLIS_DEFAULT_NC_Z 9600
|
||||
*/
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
#define BLIS_DEFAULT_MR_S 30
|
||||
#define BLIS_DEFAULT_NR_S 16
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 30
|
||||
#define BLIS_DEFAULT_NR_D 8
|
||||
|
||||
#define BLIS_DEFAULT_MR_C 8
|
||||
#define BLIS_DEFAULT_NR_C 4
|
||||
//#define BLIS_DEFAULT_MR_C 8
|
||||
//#define BLIS_DEFAULT_NR_C 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_Z 8
|
||||
#define BLIS_DEFAULT_NR_Z 4
|
||||
//#define BLIS_DEFAULT_MR_Z 8
|
||||
//#define BLIS_DEFAULT_NR_Z 4
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
@@ -123,8 +130,8 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
#define BLIS_EXTEND_MR_S 2
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 2
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
@@ -153,12 +160,11 @@
|
||||
// -- gemm --
|
||||
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_30x16
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- packm --
|
||||
|
||||
@@ -40,25 +40,6 @@
|
||||
#define B_L1_PREFETCH_DIST 2
|
||||
#define L2_PREFETCH_DIST 16 // Must be greater than 10, because of the way the loop is constructed.
|
||||
|
||||
void bli_sgemm_opt_30x8(
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
BLIS_SGEMM_UKERNEL_REF( k,
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c, rs_c, cs_c,
|
||||
data );
|
||||
}
|
||||
|
||||
//Alternate code path uused if C is not row-major
|
||||
#define UPDATE_C_ROW_SCATTERED(REG1, NUM, BASE_DEST) \
|
||||
{ \
|
||||
@@ -268,7 +249,7 @@ void bli_sgemm_opt_30x8(
|
||||
}
|
||||
|
||||
//This is an array used for the scattter/gather instructions.
|
||||
int offsets[16] __attribute__((aligned(0x1000))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
extern int offsets[16];
|
||||
|
||||
|
||||
//#define MONITORS
|
||||
575
kernels/mic/3/bli_sgemm_opt_30x16.c
Normal file
575
kernels/mic/3/bli_sgemm_opt_30x16.c
Normal file
@@ -0,0 +1,575 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2012, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
|
||||
OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include <assert.h>
|
||||
|
||||
|
||||
#define A_L1_PREFETCH_DIST 4
|
||||
#define B_L1_PREFETCH_DIST 2
|
||||
#define L2_PREFETCH_DIST 16 // Must be greater than 10, because of the way the loop is constructed.
|
||||
|
||||
//Alternate code path uused if C is not row-major
|
||||
#define UPDATE_C_ROW_SCATTERED(REG1, NUM, BASE_DEST) \
|
||||
{ \
|
||||
__asm kmov k3, ebx \
|
||||
__asm GATHER##NUM: \
|
||||
__asm vgatherdps zmm31{k3}, [BASE_DEST + zmm30 * 4] \
|
||||
__asm jknzd k3, GATHER##NUM \
|
||||
\
|
||||
__asm vmulps REG1, REG1, 0[r12]{1to16} /*scale by alpha*/ \
|
||||
__asm vfmadd132ps zmm31, REG1, 0[r13]{1to16} /*scale by beta, add in result*/\
|
||||
__asm kmov k3, ebx \
|
||||
\
|
||||
__asm SCATTER##NUM: \
|
||||
__asm vscatterdps [BASE_DEST + zmm30 * 4]{k3}, zmm31 \
|
||||
__asm jknzd k3, SCATTER##NUM \
|
||||
__asm add BASE_DEST, r11 \
|
||||
}
|
||||
|
||||
|
||||
//One iteration of the k_r loop.
|
||||
//Each iteration, we prefetch A into L1 and into L2
|
||||
#define ONE_ITER_MAIN_LOOP(C_ADDR, COUNTER) \
|
||||
{\
|
||||
__asm vbroadcastf32x4 zmm30, 0[r15] \
|
||||
__asm vmovaps zmm31, 0[rbx] \
|
||||
\
|
||||
__asm vfmadd231ps zmm0, zmm31, zmm30{aaaa} \
|
||||
__asm vfmadd231ps zmm4, zmm31, 4*4[r15]{1to16} \
|
||||
__asm vprefetch0 A_L1_PREFETCH_DIST*256[r15] \
|
||||
__asm vfmadd231ps zmm5, zmm31, 5*4[r15]{1to16} \
|
||||
__asm vprefetch0 A_L1_PREFETCH_DIST*256+64[r15] \
|
||||
__asm vfmadd231ps zmm6, zmm31, 6*4[r15]{1to16} \
|
||||
__asm vprefetch0 A_L1_PREFETCH_DIST*256+128[r15]\
|
||||
__asm vfmadd231ps zmm7, zmm31, 7*4[r15]{1to16} \
|
||||
__asm vprefetch0 A_L1_PREFETCH_DIST*256+192[r15]\
|
||||
__asm vfmadd231ps zmm8, zmm31, 8*4[r15]{1to16} \
|
||||
\
|
||||
__asm vprefetch1 0[r15 + r14] \
|
||||
__asm vfmadd231ps zmm9, zmm31, 9*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm1, zmm31, zmm30{bbbb} \
|
||||
__asm vfmadd231ps zmm2, zmm31, zmm30{cccc} \
|
||||
__asm vfmadd231ps zmm3, zmm31, zmm30{dddd} \
|
||||
__asm vfmadd231ps zmm10, zmm31, 10*4[r15]{1to16} \
|
||||
\
|
||||
__asm vprefetch1 64[r15 + r14] \
|
||||
__asm vfmadd231ps zmm11, zmm31, 11*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm12, zmm31, 12*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm13, zmm31, 13*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm14, zmm31, 14*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm15, zmm31, 15*4[r15]{1to16} \
|
||||
\
|
||||
__asm vprefetch1 2*64[r15 + r14] \
|
||||
__asm vfmadd231ps zmm16, zmm31, 16*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm17, zmm31, 17*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm18, zmm31, 18*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm19, zmm31, 19*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm20, zmm31, 20*4[r15]{1to16} \
|
||||
\
|
||||
__asm vprefetch1 3*64[r15 + r14] \
|
||||
__asm vfmadd231ps zmm21, zmm31, 21*4[r15]{1to16} \
|
||||
__asm add r15, r12 \
|
||||
__asm vfmadd231ps zmm22, zmm31, -10*4[r15]{1to16}\
|
||||
__asm vfmadd231ps zmm23, zmm31, -9*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm24, zmm31, -8*4[r15]{1to16} \
|
||||
__asm dec COUNTER \
|
||||
__asm vfmadd231ps zmm25, zmm31, -7*4[r15]{1to16} \
|
||||
\
|
||||
\
|
||||
__asm vprefetch1 0[rbx + r13] \
|
||||
__asm vfmadd231ps zmm26, zmm31, -6*4[r15]{1to16} \
|
||||
__asm vprefetch0 B_L1_PREFETCH_DIST*16*4[rbx] \
|
||||
__asm vfmadd231ps zmm27, zmm31, -5*4[r15]{1to16} \
|
||||
__asm add rbx, r9 \
|
||||
__asm vfmadd231ps zmm28, zmm31, -4*4[r15]{1to16} \
|
||||
__asm cmp COUNTER, 0 \
|
||||
__asm vfmadd231ps zmm29, zmm31, -3*4[r15]{1to16} \
|
||||
}
|
||||
|
||||
//One iteration of the k_r loop.
|
||||
//Same as ONE_ITER_MAIN_LOOP, but additionally, we prefetch one line of C into the L2 cache
|
||||
//Current placement of this prefetch instruction is somewhat arbitrary.
|
||||
#define ONE_ITER_PC_L2(C_ADDR) \
|
||||
{\
|
||||
__asm vbroadcastf32x4 zmm30, 0[r15] \
|
||||
__asm vmovaps zmm31, 0[rbx] \
|
||||
\
|
||||
__asm vfmadd231ps zmm0, zmm31, zmm30{aaaa} \
|
||||
__asm vfmadd231ps zmm4, zmm31, 4*4[r15]{1to16} \
|
||||
__asm vprefetch0 A_L1_PREFETCH_DIST*256[r15] \
|
||||
__asm vfmadd231ps zmm5, zmm31, 5*4[r15]{1to16} \
|
||||
__asm vprefetch0 A_L1_PREFETCH_DIST*256+64[r15] \
|
||||
__asm vfmadd231ps zmm6, zmm31, 6*4[r15]{1to16} \
|
||||
__asm vprefetch0 A_L1_PREFETCH_DIST*256+128[r15]\
|
||||
__asm vfmadd231ps zmm7, zmm31, 7*4[r15]{1to16} \
|
||||
__asm vprefetch0 A_L1_PREFETCH_DIST*256+192[r15]\
|
||||
__asm vfmadd231ps zmm8, zmm31, 8*4[r15]{1to16} \
|
||||
\
|
||||
__asm vprefetch1 0[r15 + r14] \
|
||||
__asm vfmadd231ps zmm9, zmm31, 9*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm1, zmm31, zmm30{bbbb} \
|
||||
__asm vfmadd231ps zmm2, zmm31, zmm30{cccc} \
|
||||
__asm vfmadd231ps zmm3, zmm31, zmm30{dddd} \
|
||||
__asm vfmadd231ps zmm10, zmm31, 10*4[r15]{1to16} \
|
||||
\
|
||||
__asm vprefetch1 64[r15 + r14] \
|
||||
__asm vfmadd231ps zmm11, zmm31, 11*4[r15]{1to16} \
|
||||
__asm vprefetch1 0[C_ADDR] \
|
||||
__asm vfmadd231ps zmm12, zmm31, 12*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm13, zmm31, 13*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm14, zmm31, 14*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm15, zmm31, 15*4[r15]{1to16} \
|
||||
\
|
||||
__asm vprefetch1 2*64[r15 + r14] \
|
||||
__asm vfmadd231ps zmm16, zmm31, 16*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm17, zmm31, 17*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm18, zmm31, 18*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm19, zmm31, 19*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm20, zmm31, 20*4[r15]{1to16} \
|
||||
\
|
||||
__asm vprefetch1 3*64[r15 + r14] \
|
||||
__asm vfmadd231ps zmm21, zmm31, 21*4[r15]{1to16} \
|
||||
__asm add r15, r12 \
|
||||
__asm vfmadd231ps zmm22, zmm31, -10*4[r15]{1to16}\
|
||||
__asm vfmadd231ps zmm23, zmm31, -9*4[r15]{1to16} \
|
||||
__asm add C_ADDR, r11 \
|
||||
__asm vfmadd231ps zmm24, zmm31, -8*4[r15]{1to16} \
|
||||
__asm dec r8 \
|
||||
__asm vfmadd231ps zmm25, zmm31, -7*4[r15]{1to16} \
|
||||
\
|
||||
\
|
||||
__asm vprefetch1 0[rbx + r13] \
|
||||
__asm vfmadd231ps zmm26, zmm31, -6*4[r15]{1to16} \
|
||||
__asm vprefetch0 B_L1_PREFETCH_DIST*16*4[rbx] \
|
||||
__asm vfmadd231ps zmm27, zmm31, -5*4[r15]{1to16} \
|
||||
__asm add rbx, r9 \
|
||||
__asm vfmadd231ps zmm28, zmm31, -4*4[r15]{1to16} \
|
||||
__asm cmp r8, 0 \
|
||||
__asm vfmadd231ps zmm29, zmm31, -3*4[r15]{1to16} \
|
||||
\
|
||||
}
|
||||
|
||||
//One iteration of the k_r loop.
|
||||
//Same as ONE_ITER_MAIN_LOOP, but additionally, we prefetch 3 cache lines of C into the L1 cache
|
||||
//Current placement of these prefetch instructions is somewhat arbitrary.
|
||||
#define ONE_ITER_PC_L1(C_ADDR) \
|
||||
{\
|
||||
__asm vbroadcastf32x4 zmm30, 0[r15] \
|
||||
__asm vmovaps zmm31, 0[rbx] \
|
||||
\
|
||||
__asm vfmadd231ps zmm0, zmm31, zmm30{aaaa} \
|
||||
__asm vfmadd231ps zmm4, zmm31, 4*4[r15]{1to16} \
|
||||
__asm vprefetch0 A_L1_PREFETCH_DIST*256[r15] \
|
||||
__asm vfmadd231ps zmm5, zmm31, 5*4[r15]{1to16} \
|
||||
__asm vprefetch0 A_L1_PREFETCH_DIST*256+64[r15] \
|
||||
__asm vfmadd231ps zmm6, zmm31, 6*4[r15]{1to16} \
|
||||
__asm vprefetch0 A_L1_PREFETCH_DIST*256+128[r15]\
|
||||
__asm vfmadd231ps zmm7, zmm31, 7*4[r15]{1to16} \
|
||||
__asm vprefetch0 A_L1_PREFETCH_DIST*256+192[r15]\
|
||||
__asm vfmadd231ps zmm8, zmm31, 8*4[r15]{1to16} \
|
||||
\
|
||||
__asm vprefetch1 0[r15 + r14] \
|
||||
__asm vfmadd231ps zmm9, zmm31, 9*4[r15]{1to16} \
|
||||
__asm vprefetch0 0[C_ADDR] \
|
||||
__asm vfmadd231ps zmm1, zmm31, zmm30{bbbb} \
|
||||
__asm add C_ADDR, r11 \
|
||||
__asm vfmadd231ps zmm2, zmm31, zmm30{cccc} \
|
||||
__asm vfmadd231ps zmm3, zmm31, zmm30{dddd} \
|
||||
__asm vfmadd231ps zmm10, zmm31, 10*4[r15]{1to16} \
|
||||
\
|
||||
__asm vprefetch1 64[r15 + r14] \
|
||||
__asm vfmadd231ps zmm11, zmm31, 11*4[r15]{1to16} \
|
||||
__asm vprefetch0 0[C_ADDR] \
|
||||
__asm vfmadd231ps zmm12, zmm31, 12*4[r15]{1to16} \
|
||||
__asm add C_ADDR, r11 \
|
||||
__asm vfmadd231ps zmm13, zmm31, 13*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm14, zmm31, 14*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm15, zmm31, 15*4[r15]{1to16} \
|
||||
\
|
||||
__asm vprefetch1 2*64[r15 + r14] \
|
||||
__asm vfmadd231ps zmm16, zmm31, 16*4[r15]{1to16} \
|
||||
__asm vprefetch0 0[C_ADDR] \
|
||||
__asm vfmadd231ps zmm17, zmm31, 17*4[r15]{1to16} \
|
||||
__asm add C_ADDR, r11 \
|
||||
__asm vfmadd231ps zmm18, zmm31, 18*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm19, zmm31, 19*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm20, zmm31, 20*4[r15]{1to16} \
|
||||
\
|
||||
__asm vprefetch1 3*64[r15 + r14] \
|
||||
__asm vfmadd231ps zmm21, zmm31, 21*4[r15]{1to16} \
|
||||
__asm add r15, r12 \
|
||||
__asm vfmadd231ps zmm22, zmm31, -10*4[r15]{1to16}\
|
||||
__asm vfmadd231ps zmm23, zmm31, -9*4[r15]{1to16} \
|
||||
__asm vfmadd231ps zmm24, zmm31, -8*4[r15]{1to16} \
|
||||
__asm dec r8 \
|
||||
__asm vfmadd231ps zmm25, zmm31, -7*4[r15]{1to16} \
|
||||
\
|
||||
\
|
||||
__asm vprefetch1 0[rbx + r13] \
|
||||
__asm vfmadd231ps zmm26, zmm31, -6*4[r15]{1to16} \
|
||||
__asm vprefetch0 B_L1_PREFETCH_DIST*16*4[rbx] \
|
||||
__asm vfmadd231ps zmm27, zmm31, -5*4[r15]{1to16} \
|
||||
__asm add rbx, r9 \
|
||||
__asm vfmadd231ps zmm28, zmm31, -4*4[r15]{1to16} \
|
||||
__asm cmp r8, 0 \
|
||||
__asm vfmadd231ps zmm29, zmm31, -3*4[r15]{1to16} \
|
||||
\
|
||||
}
|
||||
|
||||
//This is an array used for the scattter/gather instructions.
|
||||
int offsets[16] __attribute__((aligned(0x1000))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
|
||||
|
||||
//#define MONITORS
|
||||
//#define LOOPMON
|
||||
void bli_sgemm_opt_30x16(
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
float * a_next = bli_auxinfo_next_a( data );
|
||||
float * b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
int * offsetPtr = &offsets[0];
|
||||
|
||||
#ifdef MONITORS
|
||||
int toph, topl, both, botl, midl, midh, mid2l, mid2h;
|
||||
#endif
|
||||
#ifdef LOOPMON
|
||||
int tlooph, tloopl, blooph, bloopl;
|
||||
#endif
|
||||
|
||||
__asm
|
||||
{
|
||||
#ifdef MONITORS
|
||||
rdtsc
|
||||
mov topl, eax
|
||||
mov toph, edx
|
||||
#endif
|
||||
vpxord zmm0, zmm0, zmm0
|
||||
vmovaps zmm1, zmm0 //clear out registers
|
||||
vmovaps zmm2, zmm0
|
||||
mov rsi, k //loop index
|
||||
vmovaps zmm3, zmm0
|
||||
|
||||
mov r11, rs_c //load row stride
|
||||
vmovaps zmm4, zmm0
|
||||
sal r11, 2 //scale row stride
|
||||
vmovaps zmm5, zmm0
|
||||
mov r15, a //load address of a
|
||||
vmovaps zmm6, zmm0
|
||||
mov rbx, b //load address of b
|
||||
vmovaps zmm7, zmm0
|
||||
|
||||
vmovaps zmm8, zmm0
|
||||
lea r10, [r11 + 2*r11 + 0] //r10 has 3 * r11
|
||||
vmovaps zmm9, zmm0
|
||||
vmovaps zmm10, zmm0
|
||||
mov rdi, r11
|
||||
vmovaps zmm11, zmm0
|
||||
sal rdi, 2 //rdi has 4*r11
|
||||
|
||||
vmovaps zmm12, zmm0
|
||||
mov rcx, c //load address of c for prefetching
|
||||
vmovaps zmm13, zmm0
|
||||
vmovaps zmm14, zmm0
|
||||
mov r8, k
|
||||
vmovaps zmm15, zmm0
|
||||
|
||||
vmovaps zmm16, zmm0
|
||||
vmovaps zmm17, zmm0
|
||||
mov r13, L2_PREFETCH_DIST*4*16
|
||||
vmovaps zmm18, zmm0
|
||||
mov r14, L2_PREFETCH_DIST*4*32
|
||||
vmovaps zmm19, zmm0
|
||||
vmovaps zmm20, zmm0
|
||||
vmovaps zmm21, zmm0
|
||||
vmovaps zmm22, zmm0
|
||||
|
||||
vmovaps zmm23, zmm0
|
||||
sub r8, 30 + L2_PREFETCH_DIST //Check if we have over 40 operations to do.
|
||||
vmovaps zmm24, zmm0
|
||||
mov r8, 30
|
||||
vmovaps zmm25, zmm0
|
||||
mov r9, 16*4 //amount to increment b* by each iteration
|
||||
vmovaps zmm26, zmm0
|
||||
mov r12, 32*4 //amount to increment a* by each iteration
|
||||
vmovaps zmm27, zmm0
|
||||
vmovaps zmm28, zmm0
|
||||
vmovaps zmm29, zmm0
|
||||
|
||||
#ifdef MONITORS
|
||||
rdtsc
|
||||
mov midl, eax
|
||||
mov midh, edx
|
||||
#endif
|
||||
jle CONSIDER_UNDER_40
|
||||
sub rsi, 30 + L2_PREFETCH_DIST
|
||||
|
||||
//First 30 iterations
|
||||
LOOPREFECHCL2:
|
||||
ONE_ITER_PC_L2(rcx)
|
||||
jne LOOPREFECHCL2
|
||||
mov rcx, c
|
||||
|
||||
//Main Loop.
|
||||
LOOPMAIN:
|
||||
ONE_ITER_MAIN_LOOP(rcx, rsi)
|
||||
jne LOOPMAIN
|
||||
|
||||
//Penultimate 22 iterations.
|
||||
//Break these off from the main loop to avoid prefetching extra shit.
|
||||
mov r14, a_next
|
||||
mov r13, b_next
|
||||
sub r14, r15
|
||||
sub r13, rbx
|
||||
|
||||
mov rsi, L2_PREFETCH_DIST-10
|
||||
LOOPMAIN2:
|
||||
ONE_ITER_MAIN_LOOP(rcx, rsi)
|
||||
jne LOOPMAIN2
|
||||
|
||||
|
||||
//Last 10 iterations
|
||||
mov r8, 10
|
||||
LOOPREFETCHCL1:
|
||||
ONE_ITER_PC_L1(rcx)
|
||||
jne LOOPREFETCHCL1
|
||||
|
||||
|
||||
jmp POSTACCUM
|
||||
|
||||
//Alternate main loop, with no prefetching of C
|
||||
//Used when <= 40 iterations
|
||||
CONSIDER_UNDER_40:
|
||||
mov rsi, k
|
||||
LOOP_UNDER_40:
|
||||
ONE_ITER_MAIN_LOOP(rcx, rsi)
|
||||
jne LOOP_UNDER_40
|
||||
|
||||
|
||||
|
||||
POSTACCUM:
|
||||
|
||||
#ifdef MONITORS
|
||||
rdtsc
|
||||
mov mid2l, eax
|
||||
mov mid2h, edx
|
||||
#endif
|
||||
|
||||
mov r9, c //load address of c for update
|
||||
mov r12, alpha //load address of alpha
|
||||
|
||||
// Check if C is row stride. If not, jump to the slow scattered update
|
||||
mov r14, cs_c
|
||||
dec r14
|
||||
jne SCATTEREDUPDATE
|
||||
|
||||
mov r14, beta
|
||||
vbroadcastss zmm31, 0[r14]
|
||||
|
||||
|
||||
vmulps zmm0, zmm0, 0[r12]{1to16}
|
||||
vmulps zmm1, zmm1, 0[r12]{1to16}
|
||||
vmulps zmm2, zmm2, 0[r12]{1to16}
|
||||
vmulps zmm3, zmm3, 0[r12]{1to16}
|
||||
vfmadd231ps zmm0, zmm31, [r9+0]
|
||||
vfmadd231ps zmm1, zmm31, [r9+r11+0]
|
||||
vfmadd231ps zmm2, zmm31, [r9+2*r11+0]
|
||||
vfmadd231ps zmm3, zmm31, [r9+r10+0]
|
||||
vmovaps [r9+0], zmm0
|
||||
vmovaps [r9+r11+0], zmm1
|
||||
vmovaps [r9+2*r11+0], zmm2
|
||||
vmovaps [r9+r10+0], zmm3
|
||||
add r9, rdi
|
||||
|
||||
vmulps zmm4, zmm4, 0[r12]{1to16}
|
||||
vmulps zmm5, zmm5, 0[r12]{1to16}
|
||||
vmulps zmm6, zmm6, 0[r12]{1to16}
|
||||
vmulps zmm7, zmm7, 0[r12]{1to16}
|
||||
vfmadd231ps zmm4, zmm31, [r9+0]
|
||||
vfmadd231ps zmm5, zmm31, [r9+r11+0]
|
||||
vfmadd231ps zmm6, zmm31, [r9+2*r11+0]
|
||||
vfmadd231ps zmm7, zmm31, [r9+r10+0]
|
||||
vmovaps [r9+0], zmm4
|
||||
vmovaps [r9+r11+0], zmm5
|
||||
vmovaps [r9+2*r11+0], zmm6
|
||||
vmovaps [r9+r10+0], zmm7
|
||||
add r9, rdi
|
||||
|
||||
vmulps zmm8, zmm8, 0[r12]{1to16}
|
||||
vmulps zmm9, zmm9, 0[r12]{1to16}
|
||||
vmulps zmm10, zmm10, 0[r12]{1to16}
|
||||
vmulps zmm11, zmm11, 0[r12]{1to16}
|
||||
vfmadd231ps zmm8, zmm31, [r9+0]
|
||||
vfmadd231ps zmm9, zmm31, [r9+r11+0]
|
||||
vfmadd231ps zmm10, zmm31, [r9+2*r11+0]
|
||||
vfmadd231ps zmm11, zmm31, [r9+r10+0]
|
||||
vmovaps [r9+0], zmm8
|
||||
vmovaps [r9+r11+0], zmm9
|
||||
vmovaps [r9+2*r11+0], zmm10
|
||||
vmovaps [r9+r10+0], zmm11
|
||||
add r9, rdi
|
||||
|
||||
vmulps zmm12, zmm12, 0[r12]{1to16}
|
||||
vmulps zmm13, zmm13, 0[r12]{1to16}
|
||||
vmulps zmm14, zmm14, 0[r12]{1to16}
|
||||
vmulps zmm15, zmm15, 0[r12]{1to16}
|
||||
vfmadd231ps zmm12, zmm31, [r9+0]
|
||||
vfmadd231ps zmm13, zmm31, [r9+r11+0]
|
||||
vfmadd231ps zmm14, zmm31, [r9+2*r11+0]
|
||||
vfmadd231ps zmm15, zmm31, [r9+r10+0]
|
||||
vmovaps [r9+0], zmm12
|
||||
vmovaps [r9+r11+0], zmm13
|
||||
vmovaps [r9+2*r11+0], zmm14
|
||||
vmovaps [r9+r10+0], zmm15
|
||||
add r9, rdi
|
||||
|
||||
vmulps zmm16, zmm16, 0[r12]{1to16}
|
||||
vmulps zmm17, zmm17, 0[r12]{1to16}
|
||||
vmulps zmm18, zmm18, 0[r12]{1to16}
|
||||
vmulps zmm19, zmm19, 0[r12]{1to16}
|
||||
vfmadd231ps zmm16, zmm31, [r9+0]
|
||||
vfmadd231ps zmm17, zmm31, [r9+r11+0]
|
||||
vfmadd231ps zmm18, zmm31, [r9+2*r11+0]
|
||||
vfmadd231ps zmm19, zmm31, [r9+r10+0]
|
||||
vmovaps [r9+0], zmm16
|
||||
vmovaps [r9+r11+0], zmm17
|
||||
vmovaps [r9+2*r11+0], zmm18
|
||||
vmovaps [r9+r10+0], zmm19
|
||||
add r9, rdi
|
||||
|
||||
vmulps zmm20, zmm20, 0[r12]{1to16}
|
||||
vmulps zmm21, zmm21, 0[r12]{1to16}
|
||||
vmulps zmm22, zmm22, 0[r12]{1to16}
|
||||
vmulps zmm23, zmm23, 0[r12]{1to16}
|
||||
vfmadd231ps zmm20, zmm31, [r9+0]
|
||||
vfmadd231ps zmm21, zmm31, [r9+r11+0]
|
||||
vfmadd231ps zmm22, zmm31, [r9+2*r11+0]
|
||||
vfmadd231ps zmm23, zmm31, [r9+r10+0]
|
||||
vmovaps [r9+0], zmm20
|
||||
vmovaps [r9+r11+0], zmm21
|
||||
vmovaps [r9+2*r11+0], zmm22
|
||||
vmovaps [r9+r10+0], zmm23
|
||||
add r9, rdi
|
||||
|
||||
vmulps zmm24, zmm24, 0[r12]{1to16}
|
||||
vmulps zmm25, zmm25, 0[r12]{1to16}
|
||||
vmulps zmm26, zmm26, 0[r12]{1to16}
|
||||
vmulps zmm27, zmm27, 0[r12]{1to16}
|
||||
vfmadd231ps zmm24, zmm31, [r9+0]
|
||||
vfmadd231ps zmm25, zmm31, [r9+r11+0]
|
||||
vfmadd231ps zmm26, zmm31, [r9+2*r11+0]
|
||||
vfmadd231ps zmm27, zmm31, [r9+r10+0]
|
||||
vmovaps [r9+0], zmm24
|
||||
vmovaps [r9+r11+0], zmm25
|
||||
vmovaps [r9+2*r11+0], zmm26
|
||||
vmovaps [r9+r10+0], zmm27
|
||||
add r9, rdi
|
||||
|
||||
vmulps zmm28, zmm28, 0[r12]{1to16}
|
||||
vmulps zmm29, zmm29, 0[r12]{1to16}
|
||||
vfmadd231ps zmm28, zmm31, [r9+0]
|
||||
vfmadd231ps zmm29, zmm31, [r9+r11+0]
|
||||
vmovaps [r9+0], zmm28
|
||||
vmovaps [r9+r11+0], zmm29
|
||||
|
||||
jmp END
|
||||
|
||||
SCATTEREDUPDATE:
|
||||
|
||||
mov r10, offsetPtr
|
||||
vmovaps zmm31, 0[r10]
|
||||
vpbroadcastd zmm30, cs_c
|
||||
mov r13, beta
|
||||
vpmulld zmm30, zmm31, zmm30
|
||||
|
||||
mov ebx, 0xFFFF
|
||||
UPDATE_C_ROW_SCATTERED(zmm0, 0, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm1, 1, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm2, 2, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm3, 3, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm4, 4, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm5, 5, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm6, 6, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm7, 7, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm8, 8, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm9, 9, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm10, 10, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm11, 11, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm12, 12, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm13, 13, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm14, 14, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm15, 15, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm16, 16, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm17, 17, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm18, 18, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm19, 19, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm20, 20, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm21, 21, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm22, 22, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm23, 23, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm24, 24, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm25, 25, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm26, 26, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm27, 27, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm28, 28, r9)
|
||||
UPDATE_C_ROW_SCATTERED(zmm29, 29, r9)
|
||||
|
||||
END:
|
||||
#ifdef MONITORS
|
||||
rdtsc
|
||||
mov botl, eax
|
||||
mov both, edx
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef LOOPMON
|
||||
printf("looptime = \t%d\n", bloopl - tloopl);
|
||||
#endif
|
||||
#ifdef MONITORS
|
||||
dim_t top = ((dim_t)toph << 32) | topl;
|
||||
dim_t mid = ((dim_t)midh << 32) | midl;
|
||||
dim_t mid2 = ((dim_t)mid2h << 32) | mid2l;
|
||||
dim_t bot = ((dim_t)both << 32) | botl;
|
||||
printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - top);
|
||||
#endif
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user