Added beta == 0 optimization to x86_64 ukernel.

Details:
- Modified x86_64 gemm microkernel so that when beta is zero, C is not read
  from memory (nor scaled by beta).
- Fixed minor bug in test suite driver when "Test all combinations of storage
  schemes?" switch is disabled, which would result in redundant tests being
  executed for matrix-only (e.g. level-1m, level-3) operations if multiple
  vector storage schemes were specified.
- Restored debug flags as default in clarksville configuration.
This commit is contained in:
Field G. Van Zee
2013-06-07 11:04:10 -05:00
parent f1aa6b81cc
commit 05a657a6b9
3 changed files with 155 additions and 21 deletions

View File

@@ -80,9 +80,9 @@ CC := gcc
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 # -fopenmp -pg
CDBGFLAGS := #-g
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -O2 -fomit-frame-pointer
COPTFLAGS := -O2 #-fomit-frame-pointer
CVECFLAGS := -msse3 -march=nocona -mfpmath=sse
# Aggregate all of the flags into two groups: one for optimizable code, and

View File

@@ -320,23 +320,12 @@ void bli_dgemm_opt_d4x4(
"movddup (%%rbx), %%xmm7 \n\t" // load beta and duplicate
" \n\t"
" \n\t"
//"movq %6, %%rcx \n\t" // load address of c
//"movq %8, %%rdi \n\t" // load cs_c
//"salq $3, %%rdi \n\t" // cs_c *= sizeof(double)
" \n\t"
"movq %7, %%rsi \n\t" // load rs_c
"movq %%rsi, %%r8 \n\t" // make a copy of rs_c
//"salq $3, %%rsi \n\t" // rs_c *= sizeof(double)
"leaq (,%%rsi,8), %%rsi \n\t" // rs_c *= sizeof(double)
" \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
"leaq (%%rcx,%%rsi,2), %%rdx \n\t" // load address of c + 2*rs_c;
" \n\t"
" \n\t"
" \n\t"
" \n\t" // xmm8: xmm9: xmm10: xmm11:
" \n\t" // ( ab01 ( ab00 ( ab03 ( ab02
" \n\t" // ab10 ) ab11 ) ab12 ) ab13 )
@@ -368,12 +357,26 @@ void bli_dgemm_opt_d4x4(
" \n\t" // ab30 ) ab31 ) ab32 ) ab33 )
" \n\t"
" \n\t"
" \n\t" // assert: c % 16 == 0 && rs_c == 1
" \n\t"
" \n\t" // determine if
" \n\t" // c % 16 == 0, AND
" \n\t" // rs_c == 1
" \n\t" // ie: aligned and column-stored
" \n\t"
"cmpq $1, %%r8 \n\t" // set ZF if rs_c == 1.
"sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
"testq $15, %%rcx \n\t" // set ZF if ( c & 15 ) == 0, ie: c % 16 == 0.
"setz %%bh \n\t" // bh = ( ZF == 1 ? 1 : 0 );
" \n\t" // and(bl,bh) will reveal result
" \n\t"
" \n\t" // now avoid loading C if beta == 0
" \n\t"
//"xorpd %%xmm0, %%xmm0 \n\t" // set xmm0 to zero.
//"ucomisd %%xmm0, %%xmm7 \n\t" // check if beta == 0.
//"je .BETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
" \n\t"
" \n\t"
" \n\t" // check if aligned/column-stored
"andb %%bl, %%bh \n\t" // set ZF if ( bl & bh ) == 0.
"jne .COLSTORED \n\t" // if ZF == 0 (both hold), jump to column storage case
" \n\t"
@@ -381,8 +384,6 @@ void bli_dgemm_opt_d4x4(
" \n\t"
".GENSTORED: \n\t"
" \n\t"
" \n\t"
" \n\t"
"movlpd (%%rcx), %%xmm0 \n\t" // load c00 and c10,
"movhpd (%%rcx,%%rsi), %%xmm0 \n\t"
"mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha,
@@ -450,7 +451,7 @@ void bli_dgemm_opt_d4x4(
"addpd %%xmm11, %%xmm0 \n\t" // add the gemm result,
"movlpd %%xmm0, (%%rcx) \n\t" // and store back to memory.
"movhpd %%xmm0, (%%rcx,%%rsi) \n\t"
"addq %%rdi, %%rcx \n\t"
" \n\t"
" \n\t"
"movlpd (%%rdx), %%xmm1 \n\t" // load c23 and c33,
"movhpd (%%rdx,%%rsi), %%xmm1 \n\t"
@@ -460,8 +461,6 @@ void bli_dgemm_opt_d4x4(
"movlpd %%xmm1, (%%rdx) \n\t" // and store back to memory.
"movhpd %%xmm1, (%%rdx,%%rsi) \n\t"
" \n\t"
" \n\t"
" \n\t"
"jmp .DONE \n\t" // jump to end.
" \n\t"
" \n\t"
@@ -521,7 +520,7 @@ void bli_dgemm_opt_d4x4(
"mulpd %%xmm7, %%xmm0 \n\t" // scale by beta,
"addpd %%xmm11, %%xmm0 \n\t" // add the gemm result,
"movaps %%xmm0, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t"
" \n\t"
" \n\t"
"movaps (%%rdx), %%xmm1 \n\t" // load c23 and c33,
"mulpd %%xmm6, %%xmm15 \n\t" // scale by alpha,
@@ -529,6 +528,114 @@ void bli_dgemm_opt_d4x4(
"addpd %%xmm15, %%xmm1 \n\t" // add the gemm result,
"movaps %%xmm1, (%%rdx) \n\t" // and store back to memory.
" \n\t"
"jmp .DONE \n\t" // jump to end.
" \n\t"
" \n\t"
" \n\t"
" \n\t"
.BETAZERO: \n\t"
" \n\t" // check if aligned/column-stored
"andb %%bl, %%bh \n\t" // set ZF if ( bl & bh ) == 0.
"jne .COLSTORBZ \n\t" // if ZF == 0 (both hold), jump to column storage case
" \n\t"
" \n\t"
" \n\t"
".GENSTORBZ: \n\t"
" \n\t" // skip loading c00 and c10,
"mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha,
"movlpd %%xmm8, (%%rcx) \n\t" // and store back to memory.
"movhpd %%xmm8, (%%rcx,%%rsi) \n\t"
"addq %%rdi, %%rcx \n\t"
" \n\t" // skip loading c20 and c30,
"mulpd %%xmm6, %%xmm12 \n\t" // scale by alpha,
"movlpd %%xmm12, (%%rdx) \n\t" // and store back to memory.
"movhpd %%xmm12, (%%rdx,%%rsi) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t" // skip loading c01 and c11,
"mulpd %%xmm6, %%xmm9 \n\t" // scale by alpha,
"movlpd %%xmm9, (%%rcx) \n\t" // and store back to memory.
"movhpd %%xmm9, (%%rcx,%%rsi) \n\t"
"addq %%rdi, %%rcx \n\t"
" \n\t" // skip loading c21 and c31,
"mulpd %%xmm6, %%xmm13 \n\t" // scale by alpha,
"movlpd %%xmm13, (%%rdx) \n\t" // and store back to memory.
"movhpd %%xmm13, (%%rdx,%%rsi) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t" // skip loading c02 and c12,
"mulpd %%xmm6, %%xmm10 \n\t" // scale by alpha,
"movlpd %%xmm10, (%%rcx) \n\t" // and store back to memory.
"movhpd %%xmm10, (%%rcx,%%rsi) \n\t"
"addq %%rdi, %%rcx \n\t"
" \n\t" // skip loading c22 and c32,
"mulpd %%xmm6, %%xmm14 \n\t" // scale by alpha,
"movlpd %%xmm14, (%%rdx) \n\t" // and store back to memory.
"movhpd %%xmm14, (%%rdx,%%rsi) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t" // skip loading c03 and c13,
"mulpd %%xmm6, %%xmm11 \n\t" // scale by alpha,
"movlpd %%xmm11, (%%rcx) \n\t" // and store back to memory.
"movhpd %%xmm11, (%%rcx,%%rsi) \n\t"
" \n\t"
" \n\t" // skip loading c23 and c33,
"mulpd %%xmm6, %%xmm15 \n\t" // scale by alpha,
"movlpd %%xmm15, (%%rdx) \n\t" // and store back to memory.
"movhpd %%xmm15, (%%rdx,%%rsi) \n\t"
" \n\t"
"jmp .DONE \n\t" // jump to end.
" \n\t"
" \n\t"
" \n\t"
".COLSTORBZ: \n\t"
" \n\t"
" \n\t" // skip loading c00 and c10,
"mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha,
"movaps %%xmm8, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t"
" \n\t" // skip loading c20 and c30,
"mulpd %%xmm6, %%xmm12 \n\t" // scale by alpha,
"movaps %%xmm12, (%%rdx) \n\t" // and store back to memory.
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t" // skip loading c01 and c11,
"mulpd %%xmm6, %%xmm9 \n\t" // scale by alpha,
"movaps %%xmm9, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t"
" \n\t" // skip loading c21 and c31,
"mulpd %%xmm6, %%xmm13 \n\t" // scale by alpha,
"movaps %%xmm13, (%%rdx) \n\t" // and store back to memory.
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t" // skip loading c02 and c12,
"mulpd %%xmm6, %%xmm10 \n\t" // scale by alpha,
"movaps %%xmm10, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t"
" \n\t" // skip loading c22 and c32,
"mulpd %%xmm6, %%xmm14 \n\t" // scale by alpha,
"movaps %%xmm14, (%%rdx) \n\t" // and store back to memory.
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t" // skip loading c03 and c13,
"mulpd %%xmm6, %%xmm11 \n\t" // scale by alpha,
"movaps %%xmm11, (%%rcx) \n\t" // and store back to memory.
" \n\t"
" \n\t" // skip loading c23 and c33,
"mulpd %%xmm6, %%xmm15 \n\t" // scale by alpha,
"movaps %%xmm15, (%%rdx) \n\t" // and store back to memory.
" \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
".DONE: \n\t"

View File

@@ -978,7 +978,34 @@ void libblis_test_op_driver( test_params_t* params,
// Only run combinations where all operands of either type (matrices
// or vectors) are stored in one storage scheme or another (no mixing
// of schemes within the same operand type).
n_store_combos = n_mstorage * n_vstorage;
unsigned int n_mat_operands = 0;
unsigned int n_vec_operands = 0;
for ( o = 0; o < n_operands; ++o )
{
operand_t operand_type
= libblis_test_get_operand_type_for_char( o_types[o] );
if ( operand_type == BLIS_TEST_MATRIX_OPERAND ) ++n_mat_operands;
else if ( operand_type == BLIS_TEST_VECTOR_OPERAND ) ++n_vec_operands;
}
// We compute the total number of storage combinations based on whether
// the current operation has only matrix operands, only vector operands,
// or both.
if ( n_vec_operands == 0 )
{
n_store_combos = n_mstorage;
n_vstorage = 1;
}
else if ( n_mat_operands == 0 )
{
n_store_combos = n_vstorage;
n_mstorage = 1;
}
else
{
n_store_combos = n_mstorage * n_vstorage;
}
sc_str = ( char** ) malloc( n_store_combos * sizeof( char* ) );