mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Added beta == 0 optimization to x86_64 ukernel.
Details:
- Modified the x86_64 gemm microkernel so that when beta is zero, C is not read from memory (nor scaled by beta).
- Fixed a minor bug in the test suite driver: when the "Test all combinations of storage schemes?" switch was disabled, redundant tests were executed for matrix-only operations (e.g. level-1m, level-3) if multiple vector storage schemes were specified.
- Restored debug flags as the default in the clarksville configuration.
This commit is contained in:
@@ -80,9 +80,9 @@ CC := gcc
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 # -fopenmp -pg
|
||||
CDBGFLAGS := #-g
|
||||
CDBGFLAGS := -g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O2 -fomit-frame-pointer
|
||||
COPTFLAGS := -O2 #-fomit-frame-pointer
|
||||
CVECFLAGS := -msse3 -march=nocona -mfpmath=sse
|
||||
|
||||
# Aggregate all of the flags into two groups: one for optimizable code, and
|
||||
|
||||
@@ -320,23 +320,12 @@ void bli_dgemm_opt_d4x4(
|
||||
"movddup (%%rbx), %%xmm7 \n\t" // load beta and duplicate
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
//"movq %6, %%rcx \n\t" // load address of c
|
||||
//"movq %8, %%rdi \n\t" // load cs_c
|
||||
//"salq $3, %%rdi \n\t" // cs_c *= sizeof(double)
|
||||
" \n\t"
|
||||
"movq %7, %%rsi \n\t" // load rs_c
|
||||
"movq %%rsi, %%r8 \n\t" // make a copy of rs_c
|
||||
//"salq $3, %%rsi \n\t" // rs_c *= sizeof(double)
|
||||
"leaq (,%%rsi,8), %%rsi \n\t" // rs_c *= sizeof(double)
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"leaq (%%rcx,%%rsi,2), %%rdx \n\t" // load address of c + 2*rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // xmm8: xmm9: xmm10: xmm11:
|
||||
" \n\t" // ( ab01 ( ab00 ( ab03 ( ab02
|
||||
" \n\t" // ab10 ) ab11 ) ab12 ) ab13 )
|
||||
@@ -368,12 +357,26 @@ void bli_dgemm_opt_d4x4(
|
||||
" \n\t" // ab30 ) ab31 ) ab32 ) ab33 )
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // assert: c % 16 == 0 && rs_c == 1
|
||||
" \n\t"
|
||||
" \n\t" // determine if
|
||||
" \n\t" // c % 16 == 0, AND
|
||||
" \n\t" // rs_c == 1
|
||||
" \n\t" // ie: aligned and column-stored
|
||||
" \n\t"
|
||||
"cmpq $1, %%r8 \n\t" // set ZF if rs_c == 1.
|
||||
"sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
|
||||
"testq $15, %%rcx \n\t" // set ZF if c & 15 is zero (c % 16 == 0).
|
||||
"setz %%bh \n\t" // bh = ( ZF == 1 ? 1 : 0 );
|
||||
" \n\t" // and(bl,bh) will reveal result
|
||||
" \n\t"
|
||||
" \n\t" // now avoid loading C if beta == 0
|
||||
" \n\t"
|
||||
//"xorpd %%xmm0, %%xmm0 \n\t" // set xmm0 to zero.
|
||||
//"ucomisd %%xmm0, %%xmm7 \n\t" // check if beta == 0.
|
||||
//"je .BETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // check if aligned/column-stored
|
||||
"andb %%bl, %%bh \n\t" // ZF is cleared if bl & bh == 1.
|
||||
"jne .COLSTORED \n\t" // jump to column storage case
|
||||
" \n\t"
|
||||
@@ -381,8 +384,6 @@ void bli_dgemm_opt_d4x4(
|
||||
" \n\t"
|
||||
".GENSTORED: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movlpd (%%rcx), %%xmm0 \n\t" // load c00 and c10,
|
||||
"movhpd (%%rcx,%%rsi), %%xmm0 \n\t"
|
||||
"mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha,
|
||||
@@ -450,7 +451,7 @@ void bli_dgemm_opt_d4x4(
|
||||
"addpd %%xmm11, %%xmm0 \n\t" // add the gemm result,
|
||||
"movlpd %%xmm0, (%%rcx) \n\t" // and store back to memory.
|
||||
"movhpd %%xmm0, (%%rcx,%%rsi) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movlpd (%%rdx), %%xmm1 \n\t" // load c23 and c33,
|
||||
"movhpd (%%rdx,%%rsi), %%xmm1 \n\t"
|
||||
@@ -460,8 +461,6 @@ void bli_dgemm_opt_d4x4(
|
||||
"movlpd %%xmm1, (%%rdx) \n\t" // and store back to memory.
|
||||
"movhpd %%xmm1, (%%rdx,%%rsi) \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"jmp .DONE \n\t" // jump to end.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -521,7 +520,7 @@ void bli_dgemm_opt_d4x4(
|
||||
"mulpd %%xmm7, %%xmm0 \n\t" // scale by beta,
|
||||
"addpd %%xmm11, %%xmm0 \n\t" // add the gemm result,
|
||||
"movaps %%xmm0, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movaps (%%rdx), %%xmm1 \n\t" // load c23 and c33,
|
||||
"mulpd %%xmm6, %%xmm15 \n\t" // scale by alpha,
|
||||
@@ -529,6 +528,114 @@ void bli_dgemm_opt_d4x4(
|
||||
"addpd %%xmm15, %%xmm1 \n\t" // add the gemm result,
|
||||
"movaps %%xmm1, (%%rdx) \n\t" // and store back to memory.
|
||||
" \n\t"
|
||||
"jmp .DONE \n\t" // jump to end.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".BETAZERO: \n\t"
|
||||
" \n\t" // check if aligned/column-stored
|
||||
"andb %%bl, %%bh \n\t" // ZF is cleared if bl & bh == 1.
|
||||
"jne .COLSTORBZ \n\t" // jump to column storage case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".GENSTORBZ: \n\t"
|
||||
" \n\t" // skip loading c00 and c10,
|
||||
"mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha,
|
||||
"movlpd %%xmm8, (%%rcx) \n\t" // and store back to memory.
|
||||
"movhpd %%xmm8, (%%rcx,%%rsi) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t" // skip loading c20 and c30,
|
||||
"mulpd %%xmm6, %%xmm12 \n\t" // scale by alpha,
|
||||
"movlpd %%xmm12, (%%rdx) \n\t" // and store back to memory.
|
||||
"movhpd %%xmm12, (%%rdx,%%rsi) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c01 and c11,
|
||||
"mulpd %%xmm6, %%xmm9 \n\t" // scale by alpha,
|
||||
"movlpd %%xmm9, (%%rcx) \n\t" // and store back to memory.
|
||||
"movhpd %%xmm9, (%%rcx,%%rsi) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t" // skip loading c21 and c31,
|
||||
"mulpd %%xmm6, %%xmm13 \n\t" // scale by alpha,
|
||||
"movlpd %%xmm13, (%%rdx) \n\t" // and store back to memory.
|
||||
"movhpd %%xmm13, (%%rdx,%%rsi) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c02 and c12,
|
||||
"mulpd %%xmm6, %%xmm10 \n\t" // scale by alpha,
|
||||
"movlpd %%xmm10, (%%rcx) \n\t" // and store back to memory.
|
||||
"movhpd %%xmm10, (%%rcx,%%rsi) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t" // skip loading c22 and c32,
|
||||
"mulpd %%xmm6, %%xmm14 \n\t" // scale by alpha,
|
||||
"movlpd %%xmm14, (%%rdx) \n\t" // and store back to memory.
|
||||
"movhpd %%xmm14, (%%rdx,%%rsi) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c03 and c13,
|
||||
"mulpd %%xmm6, %%xmm11 \n\t" // scale by alpha,
|
||||
"movlpd %%xmm11, (%%rcx) \n\t" // and store back to memory.
|
||||
"movhpd %%xmm11, (%%rcx,%%rsi) \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c23 and c33,
|
||||
"mulpd %%xmm6, %%xmm15 \n\t" // scale by alpha,
|
||||
"movlpd %%xmm15, (%%rdx) \n\t" // and store back to memory.
|
||||
"movhpd %%xmm15, (%%rdx,%%rsi) \n\t"
|
||||
" \n\t"
|
||||
"jmp .DONE \n\t" // jump to end.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".COLSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c00 and c10,
|
||||
"mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha,
|
||||
"movaps %%xmm8, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t" // skip loading c20 and c30,
|
||||
"mulpd %%xmm6, %%xmm12 \n\t" // scale by alpha,
|
||||
"movaps %%xmm12, (%%rdx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c01 and c11,
|
||||
"mulpd %%xmm6, %%xmm9 \n\t" // scale by alpha,
|
||||
"movaps %%xmm9, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t" // skip loading c21 and c31,
|
||||
"mulpd %%xmm6, %%xmm13 \n\t" // scale by alpha,
|
||||
"movaps %%xmm13, (%%rdx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c02 and c12,
|
||||
"mulpd %%xmm6, %%xmm10 \n\t" // scale by alpha,
|
||||
"movaps %%xmm10, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t" // skip loading c22 and c32,
|
||||
"mulpd %%xmm6, %%xmm14 \n\t" // scale by alpha,
|
||||
"movaps %%xmm14, (%%rdx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c03 and c13,
|
||||
"mulpd %%xmm6, %%xmm11 \n\t" // scale by alpha,
|
||||
"movaps %%xmm11, (%%rcx) \n\t" // and store back to memory.
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c23 and c33,
|
||||
"mulpd %%xmm6, %%xmm15 \n\t" // scale by alpha,
|
||||
"movaps %%xmm15, (%%rdx) \n\t" // and store back to memory.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".DONE: \n\t"
|
||||
|
||||
@@ -978,7 +978,34 @@ void libblis_test_op_driver( test_params_t* params,
|
||||
// Only run combinations where all operands of either type (matrices
|
||||
// or vectors) are stored in one storage scheme or another (no mixing
|
||||
// of schemes within the same operand type).
|
||||
n_store_combos = n_mstorage * n_vstorage;
|
||||
unsigned int n_mat_operands = 0;
|
||||
unsigned int n_vec_operands = 0;
|
||||
|
||||
for ( o = 0; o < n_operands; ++o )
|
||||
{
|
||||
operand_t operand_type
|
||||
= libblis_test_get_operand_type_for_char( o_types[o] );
|
||||
if ( operand_type == BLIS_TEST_MATRIX_OPERAND ) ++n_mat_operands;
|
||||
else if ( operand_type == BLIS_TEST_VECTOR_OPERAND ) ++n_vec_operands;
|
||||
}
|
||||
|
||||
// We compute the total number of storage combinations based on whether
|
||||
// the current operation has only matrix operands, only vector operands,
|
||||
// or both.
|
||||
if ( n_vec_operands == 0 )
|
||||
{
|
||||
n_store_combos = n_mstorage;
|
||||
n_vstorage = 1;
|
||||
}
|
||||
else if ( n_mat_operands == 0 )
|
||||
{
|
||||
n_store_combos = n_vstorage;
|
||||
n_mstorage = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
n_store_combos = n_mstorage * n_vstorage;
|
||||
}
|
||||
|
||||
sc_str = ( char** ) malloc( n_store_combos * sizeof( char* ) );
|
||||
|
||||
|
||||
Reference in New Issue
Block a user