Removed support for duplication.

Details:
- Removed support for duplication from the gemmtrsm/trsm micro-kernels
  and all framework code.
- Updated test suite modules according to above changes.
This commit is contained in:
Field G. Van Zee
2013-11-08 11:17:34 -06:00
parent 68a5910974
commit 376bbb59c8
72 changed files with 206 additions and 1873 deletions

View File

@@ -152,34 +152,6 @@
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount
// of duplication needed when configuring level-3 macro-kernels that
// copy and duplicate elements of B to a temporary duplication buffer
// (so that element-wise vector multiplication and addition instructions
// can be used).
#define BLIS_NUM_ELEM_PER_REG_S 4
#define BLIS_NUM_ELEM_PER_REG_D 2
#define BLIS_NUM_ELEM_PER_REG_C 2
#define BLIS_NUM_ELEM_PER_REG_Z 1
// -- Default switch for duplication of B --
// NOTE: Setting these values to 1 disables duplication. Any value
// d > 1 results in a d-1 duplicates created within special macro-kernel
// buffer of dimension k x NR*d.
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
#define BLIS_DEFAULT_NUM_DUPL_S 1
#define BLIS_DEFAULT_NUM_DUPL_D 1
#define BLIS_DEFAULT_NUM_DUPL_C 1
#define BLIS_DEFAULT_NUM_DUPL_Z 1
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -265,10 +237,6 @@
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- dupl --
#define DUPL_KERNEL dupl_unb_var1
// -- gemm --
#include "bli_gemm_8x8.h"

View File

@@ -146,30 +146,6 @@
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount
// of duplication needed when configuring level-3 macro-kernels that
// copy and duplicate elements of B to a temporary duplication buffer
// (so that element-wise vector multiplication and addition instructions
// can be used).
#define BLIS_NUM_ELEM_PER_REG_S 4
#define BLIS_NUM_ELEM_PER_REG_D 2
#define BLIS_NUM_ELEM_PER_REG_C 2
#define BLIS_NUM_ELEM_PER_REG_Z 1
// -- Default switch for duplication of B --
// NOTE: Setting these values to 1 disables duplication. Any value
// d > 1 results in a d-1 duplicates created within special macro-kernel
// buffer of dimension k x NR*d.
#define BLIS_DEFAULT_NUM_DUPL_S 1
#define BLIS_DEFAULT_NUM_DUPL_D 1
#define BLIS_DEFAULT_NUM_DUPL_C 1
#define BLIS_DEFAULT_NUM_DUPL_Z 1
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -262,10 +238,6 @@
//#include "bli_trsm_l_ref_4x4.h"
//#include "bli_trsm_u_ref_4x4.h"
// -- dupl --
#define DUPL_KERNEL dupl_unb_var1
// -- gemm --
#define GEMM_UKERNEL gemm_opt_d4x4

View File

@@ -146,34 +146,6 @@
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount
// of duplication needed when configuring level-3 macro-kernels that
// copy and duplicate elements of B to a temporary duplication buffer
// (so that element-wise vector multiplication and addition instructions
// can be used).
#define BLIS_NUM_ELEM_PER_REG_S 4
#define BLIS_NUM_ELEM_PER_REG_D 1
#define BLIS_NUM_ELEM_PER_REG_C 2
#define BLIS_NUM_ELEM_PER_REG_Z 1
// -- Default switch for duplication of B --
// NOTE: Setting these values to 1 disables duplication. Any value
// d > 1 results in a d-1 duplicates created within special macro-kernel
// buffer of dimension k x NR*d.
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
#define BLIS_DEFAULT_NUM_DUPL_S 1
#define BLIS_DEFAULT_NUM_DUPL_D 1
#define BLIS_DEFAULT_NUM_DUPL_C 1
#define BLIS_DEFAULT_NUM_DUPL_Z 1
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -261,10 +233,6 @@
#include "bli_gemm_opt_d4x4.h"
// -- dupl --
#define DUPL_KERNEL dupl_unb_var1
// -- gemm --
#define GEMM_UKERNEL gemm_opt_d4x4

View File

@@ -146,34 +146,6 @@
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount
// of duplication needed when configuring level-3 macro-kernels that
// copy and duplicate elements of B to a temporary duplication buffer
// (so that element-wise vector multiplication and addition instructions
// can be used).
#define BLIS_NUM_ELEM_PER_REG_S 4
#define BLIS_NUM_ELEM_PER_REG_D 8
#define BLIS_NUM_ELEM_PER_REG_C 2
#define BLIS_NUM_ELEM_PER_REG_Z 1
// -- Default switch for duplication of B --
// NOTE: Setting these values to 1 disables duplication. Any value
// d > 1 results in a d-1 duplicates created within special macro-kernel
// buffer of dimension k x NR*d.
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
#define BLIS_DEFAULT_NUM_DUPL_S 1
#define BLIS_DEFAULT_NUM_DUPL_D 1
#define BLIS_DEFAULT_NUM_DUPL_C 1
#define BLIS_DEFAULT_NUM_DUPL_Z 1
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -259,10 +231,6 @@
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- dupl --
#define DUPL_KERNEL dupl_unb_var1
// -- gemm --
#include "bli_gemm_opt_30x8.h"

View File

@@ -146,34 +146,6 @@
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount
// of duplication needed when configuring level-3 macro-kernels that
// copy and duplicate elements of B to a temporary duplication buffer
// (so that element-wise vector multiplication and addition instructions
// can be used).
#define BLIS_NUM_ELEM_PER_REG_S 4
#define BLIS_NUM_ELEM_PER_REG_D 2
#define BLIS_NUM_ELEM_PER_REG_C 2
#define BLIS_NUM_ELEM_PER_REG_Z 1
// -- Default switch for duplication of B --
// NOTE: Setting these values to 1 disables duplication. Any value
// d > 1 results in a d-1 duplicates created within special macro-kernel
// buffer of dimension k x NR*d.
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
#define BLIS_DEFAULT_NUM_DUPL_S 1
#define BLIS_DEFAULT_NUM_DUPL_D 1
#define BLIS_DEFAULT_NUM_DUPL_C 1
#define BLIS_DEFAULT_NUM_DUPL_Z 1
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -259,10 +231,6 @@
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- dupl --
#define DUPL_KERNEL dupl_unb_var1
// -- gemm --
#include "bli_gemm_4x6.h"

View File

@@ -146,34 +146,6 @@
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount
// of duplication needed when configuring level-3 macro-kernels that
// copy and duplicate elements of B to a temporary duplication buffer
// (so that element-wise vector multiplication and addition instructions
// can be used).
#define BLIS_NUM_ELEM_PER_REG_S 4
#define BLIS_NUM_ELEM_PER_REG_D 2
#define BLIS_NUM_ELEM_PER_REG_C 2
#define BLIS_NUM_ELEM_PER_REG_Z 1
// -- Default switch for duplication of B --
// NOTE: Setting these values to 1 disables duplication. Any value
// d > 1 results in a d-1 duplicates created within special macro-kernel
// buffer of dimension k x NR*d.
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
#define BLIS_DEFAULT_NUM_DUPL_S 1
#define BLIS_DEFAULT_NUM_DUPL_D 1
#define BLIS_DEFAULT_NUM_DUPL_C 1
#define BLIS_DEFAULT_NUM_DUPL_Z 1
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -259,10 +231,6 @@
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- dupl --
#define DUPL_KERNEL dupl_unb_var1
// -- gemm --
//#define GEMM_UKERNEL gemm_ref_mxn

View File

@@ -146,34 +146,6 @@
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount
// of duplication needed when configuring level-3 macro-kernels that
// copy and duplicate elements of B to a temporary duplication buffer
// (so that element-wise vector multiplication and addition instructions
// can be used).
#define BLIS_NUM_ELEM_PER_REG_S 4
#define BLIS_NUM_ELEM_PER_REG_D 2
#define BLIS_NUM_ELEM_PER_REG_C 2
#define BLIS_NUM_ELEM_PER_REG_Z 1
// -- Default switch for duplication of B --
// NOTE: Setting these values to 1 disables duplication. Any value
// d > 1 results in a d-1 duplicates created within special macro-kernel
// buffer of dimension k x NR*d.
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
#define BLIS_DEFAULT_NUM_DUPL_S 1
#define BLIS_DEFAULT_NUM_DUPL_D 1
#define BLIS_DEFAULT_NUM_DUPL_C 1
#define BLIS_DEFAULT_NUM_DUPL_Z 1
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -259,10 +231,6 @@
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- dupl --
#define DUPL_KERNEL dupl_unb_var1
// -- gemm --
#define GEMM_UKERNEL gemm_ref_mxn

View File

@@ -146,34 +146,6 @@
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount
// of duplication needed when configuring level-3 macro-kernels that
// copy and duplicate elements of B to a temporary duplication buffer
// (so that element-wise vector multiplication and addition instructions
// can be used).
#define BLIS_NUM_ELEM_PER_REG_S 4
#define BLIS_NUM_ELEM_PER_REG_D 2
#define BLIS_NUM_ELEM_PER_REG_C 2
#define BLIS_NUM_ELEM_PER_REG_Z 1
// -- Default switch for duplication of B --
// NOTE: Setting these values to 1 disables duplication. Any value
// d > 1 results in a d-1 duplicates created within special macro-kernel
// buffer of dimension k x NR*d.
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
#define BLIS_DEFAULT_NUM_DUPL_S 1
#define BLIS_DEFAULT_NUM_DUPL_D 1
#define BLIS_DEFAULT_NUM_DUPL_C 1
#define BLIS_DEFAULT_NUM_DUPL_Z 1
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -261,10 +233,6 @@
#include "bli_gemm_opt_8x4_ref_u4_nodupl_avx1.h"
// -- dupl --
#define DUPL_KERNEL dupl_unb_var1
// -- gemm --
#define GEMM_UKERNEL gemm_opt_8x4_ref_u4_nodupl_avx1

View File

@@ -146,34 +146,6 @@
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount
// of duplication needed when configuring level-3 macro-kernels that
// copy and duplicate elements of B to a temporary duplication buffer
// (so that element-wise vector multiplication and addition instructions
// can be used).
#define BLIS_NUM_ELEM_PER_REG_S 4
#define BLIS_NUM_ELEM_PER_REG_D 2
#define BLIS_NUM_ELEM_PER_REG_C 2
#define BLIS_NUM_ELEM_PER_REG_Z 1
// -- Default switch for duplication of B --
// NOTE: Setting these values to 1 disables duplication. Any value
// d > 1 results in a d-1 duplicates created within special macro-kernel
// buffer of dimension k x NR*d.
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
#define BLIS_DEFAULT_NUM_DUPL_S 1
#define BLIS_DEFAULT_NUM_DUPL_D 1
#define BLIS_DEFAULT_NUM_DUPL_C 1
#define BLIS_DEFAULT_NUM_DUPL_Z 1
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -265,10 +237,6 @@
#include "bli_gemmtrsm_l_opt_mxn.h"
#include "bli_gemmtrsm_u_opt_mxn.h"
// -- dupl --
#define DUPL_KERNEL dupl_unb_var1
// -- gemm --
#define GEMM_UKERNEL gemm_opt_mxn

View File

@@ -41,8 +41,7 @@ void bli_sgemmtrsm_l_opt_mxn(
float* restrict alpha,
float* restrict a10,
float* restrict a11,
float* restrict bd01,
float* restrict bd11,
float* restrict b01,
float* restrict b11,
float* restrict c11, inc_t rs_c, inc_t cs_c,
float* restrict a_next,
@@ -58,7 +57,7 @@ void bli_sgemmtrsm_l_opt_mxn(
bli_sgemm_opt_mxn( k,
minus_one,
a10,
bd01,
b01,
alpha,
b11, rs_b, cs_b,
a_next,
@@ -66,7 +65,6 @@ void bli_sgemmtrsm_l_opt_mxn(
bli_strsm_l_opt_mxn( a11,
b11,
bd11,
c11, rs_c, cs_c );
}
@@ -77,8 +75,7 @@ void bli_dgemmtrsm_l_opt_mxn(
double* restrict alpha,
double* restrict a10,
double* restrict a11,
double* restrict bd01,
double* restrict bd11,
double* restrict b01,
double* restrict b11,
double* restrict c11, inc_t rs_c, inc_t cs_c,
double* restrict a_next,
@@ -171,19 +168,6 @@ void bli_dgemmtrsm_l_opt_mxn(
k MR
Thus, with duplication enabled, the operation takes the form of:
b11 = alpha * b11 - a10 * bd01;
b11 = inv(a11) * b11;
bd11 = b11; (skipped if duplication is disabled)
c11 = b11;
And if duplication is disabled, the operation reduces to:
b11 = alpha * b11 - a10 * b01; (Note: Here, b01 == bd01.)
b11 = inv(a11) * b11;
c11 = b11;
A note on optimization:
- This implementation simply calls the gemm micro-kernel and then the
trsm micro-kernel. Let's assume that the gemm micro-kernel has already
@@ -208,24 +192,20 @@ void bli_dgemmtrsm_l_opt_mxn(
double* restrict minus_one = bli_dm1;
/* Reminder: if duplication is disabled, then bd01 == b01, bd11 == b11. */
/* b11 = alpha * b11 - a10 * bd01; */
/* b11 = alpha * b11 - a10 * b01; */
bli_dgemm_opt_mxn( k,
minus_one,
a10,
bd01,
b01,
alpha,
b11, rs_b, cs_b,
a_next,
b_next );
/* b11 = inv(a11) * b11;
bd11 = b11; (skipped if duplication is disabled)
c11 = b11; */
/* b11 = inv(a11) * b11;
c11 = b11; */
bli_dtrsm_l_opt_mxn( a11,
b11,
bd11,
c11, rs_c, cs_c );
}
@@ -236,8 +216,7 @@ void bli_cgemmtrsm_l_opt_mxn(
scomplex* restrict alpha,
scomplex* restrict a10,
scomplex* restrict a11,
scomplex* restrict bd01,
scomplex* restrict bd11,
scomplex* restrict b01,
scomplex* restrict b11,
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
scomplex* restrict a_next,
@@ -253,7 +232,7 @@ void bli_cgemmtrsm_l_opt_mxn(
bli_cgemm_opt_mxn( k,
minus_one,
a10,
bd01,
b01,
alpha,
b11, rs_b, cs_b,
a_next,
@@ -261,7 +240,6 @@ void bli_cgemmtrsm_l_opt_mxn(
bli_ctrsm_l_opt_mxn( a11,
b11,
bd11,
c11, rs_c, cs_c );
}
@@ -272,8 +250,7 @@ void bli_zgemmtrsm_l_opt_mxn(
dcomplex* restrict alpha,
dcomplex* restrict a10,
dcomplex* restrict a11,
dcomplex* restrict bd01,
dcomplex* restrict bd11,
dcomplex* restrict b01,
dcomplex* restrict b11,
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
dcomplex* restrict a_next,
@@ -289,7 +266,7 @@ void bli_zgemmtrsm_l_opt_mxn(
bli_zgemm_opt_mxn( k,
minus_one,
a10,
bd01,
b01,
alpha,
b11, rs_b, cs_b,
a_next,
@@ -297,7 +274,6 @@ void bli_zgemmtrsm_l_opt_mxn(
bli_ztrsm_l_opt_mxn( a11,
b11,
bd11,
c11, rs_c, cs_c );
}

View File

@@ -44,8 +44,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict alpha, \
ctype* restrict a10, \
ctype* restrict a11, \
ctype* restrict bd01, \
ctype* restrict bd11, \
ctype* restrict b01, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \

View File

@@ -41,8 +41,7 @@ void bli_sgemmtrsm_u_opt_mxn(
float* restrict alpha,
float* restrict a12,
float* restrict a11,
float* restrict bd21,
float* restrict bd11,
float* restrict b21,
float* restrict b11,
float* restrict c11, inc_t rs_c, inc_t cs_c,
float* restrict a_next,
@@ -58,7 +57,7 @@ void bli_sgemmtrsm_u_opt_mxn(
bli_sgemm_opt_mxn( k,
minus_one,
a12,
bd21,
b21,
alpha,
b11, rs_b, cs_b,
a_next,
@@ -66,7 +65,6 @@ void bli_sgemmtrsm_u_opt_mxn(
bli_strsm_u_opt_mxn( a11,
b11,
bd11,
c11, rs_c, cs_c );
}
@@ -77,8 +75,7 @@ void bli_dgemmtrsm_u_opt_mxn(
double* restrict alpha,
double* restrict a12,
double* restrict a11,
double* restrict bd21,
double* restrict bd11,
double* restrict b21,
double* restrict b11,
double* restrict c11, inc_t rs_c, inc_t cs_c,
double* restrict a_next,
@@ -207,24 +204,20 @@ void bli_dgemmtrsm_u_opt_mxn(
double* restrict minus_one = bli_dm1;
/* Reminder: if duplication is disabled, then bd21 == b21, bd11 == b11. */
/* b11 = alpha * b11 - a12 * bd21; */
/* b11 = alpha * b11 - a12 * b21; */
bli_dgemm_opt_mxn( k,
minus_one,
a12,
bd21,
b21,
alpha,
b11, rs_b, cs_b,
a_next,
b_next );
/* b11 = inv(a11) * b11;
bd11 = b11; (skipped if duplication is disabled)
c11 = b11; */
/* b11 = inv(a11) * b11;
c11 = b11; */
bli_dtrsm_u_opt_mxn( a11,
b11,
bd11,
c11, rs_c, cs_c );
}
@@ -235,8 +228,7 @@ void bli_cgemmtrsm_u_opt_mxn(
scomplex* restrict alpha,
scomplex* restrict a12,
scomplex* restrict a11,
scomplex* restrict bd21,
scomplex* restrict bd11,
scomplex* restrict b21,
scomplex* restrict b11,
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
scomplex* restrict a_next,
@@ -252,7 +244,7 @@ void bli_cgemmtrsm_u_opt_mxn(
bli_cgemm_opt_mxn( k,
minus_one,
a12,
bd21,
b21,
alpha,
b11, rs_b, cs_b,
a_next,
@@ -260,7 +252,6 @@ void bli_cgemmtrsm_u_opt_mxn(
bli_ctrsm_u_opt_mxn( a11,
b11,
bd11,
c11, rs_c, cs_c );
}
@@ -271,8 +262,7 @@ void bli_zgemmtrsm_u_opt_mxn(
dcomplex* restrict alpha,
dcomplex* restrict a12,
dcomplex* restrict a11,
dcomplex* restrict bd21,
dcomplex* restrict bd11,
dcomplex* restrict b21,
dcomplex* restrict b11,
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
dcomplex* restrict a_next,
@@ -288,7 +278,7 @@ void bli_zgemmtrsm_u_opt_mxn(
bli_zgemm_opt_mxn( k,
minus_one,
a12,
bd21,
b21,
alpha,
b11, rs_b, cs_b,
a_next,
@@ -296,7 +286,6 @@ void bli_zgemmtrsm_u_opt_mxn(
bli_ztrsm_u_opt_mxn( a11,
b11,
bd11,
c11, rs_c, cs_c );
}

View File

@@ -44,8 +44,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict alpha, \
ctype* restrict a12, \
ctype* restrict a11, \
ctype* restrict bd21, \
ctype* restrict bd11, \
ctype* restrict b21, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \

View File

@@ -39,14 +39,12 @@
void bli_strsm_l_opt_mxn(
float* restrict a,
float* restrict b,
float* restrict bd,
float* restrict c, inc_t rs_c, inc_t cs_c
)
{
/* Just call the reference implementation. */
bli_strsm_l_ref_mxn( a,
b,
bd,
c, rs_c, cs_c );
}
@@ -55,7 +53,6 @@ void bli_strsm_l_opt_mxn(
void bli_dtrsm_l_opt_mxn(
double* restrict a,
double* restrict b,
double* restrict bd,
double* restrict c, inc_t rs_c, inc_t cs_c
)
{
@@ -76,11 +73,6 @@ void bli_dtrsm_l_opt_mxn(
where A11 is MR x MR and lower triangular, B11 is MR x NR, and C11 is
MR x NR.
NOTE: Here, this trsm micro-kernel supports element "duplication", a
feature that is enabled or disabled in bli_kernel.h. Duplication factors
are also defined in the aforementioned header. Duplication is NOT
commonly used and most developers may assume it is disabled.
Parameters:
- a11: The address of A11, which is the MR x MR lower triangular block
@@ -89,8 +81,6 @@ void bli_dtrsm_l_opt_mxn(
been inverted and the strictly upper triangle contains zeros.
- b11: The address of B11, which is the MR x NR subpartition of the
current packed (row-stored) micro-panel of B.
- bd11: The address of the duplicated copy of B11. If duplication is
disabled, then bd11 == b11.
- c11: The address of C11, which is the MR x NR block of the output
matrix (ie: the matrix provided by the user to the highest-level
trsm API call). C11 corresponds to the elements that exist in
@@ -110,12 +100,6 @@ void bli_dtrsm_l_opt_mxn(
- Note that the diagonal of the triangular matrix A11 contains the INVERSE
of those elements. This is done during packing so that we can avoid
expensive division instructions within this micro-kernel.
- This micro-kernel assumes duplication is NOT enabled. If it IS enabled,
then the result must be written to three places: the sub-block within the
duplicated copy of the current micro-panel of B, the sub-block within the
current packed micro-panel of B, and the sub-block of the output matrix C.
When duplication is not used, the micro-kernel should update only the
latter two locations.
For more info, please refer to the BLIS website and/or contact the
blis-devel mailing list.
@@ -191,14 +175,12 @@ void bli_dtrsm_l_opt_mxn(
void bli_ctrsm_l_opt_mxn(
scomplex* restrict a,
scomplex* restrict b,
scomplex* restrict bd,
scomplex* restrict c, inc_t rs_c, inc_t cs_c
)
{
/* Just call the reference implementation. */
bli_ctrsm_l_ref_mxn( a,
b,
bd,
c, rs_c, cs_c );
}
@@ -207,14 +189,12 @@ void bli_ctrsm_l_opt_mxn(
void bli_ztrsm_l_opt_mxn(
dcomplex* restrict a,
dcomplex* restrict b,
dcomplex* restrict bd,
dcomplex* restrict c, inc_t rs_c, inc_t cs_c
)
{
/* Just call the reference implementation. */
bli_ztrsm_l_ref_mxn( a,
b,
bd,
c, rs_c, cs_c );
}

View File

@@ -42,7 +42,6 @@
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
);

View File

@@ -39,14 +39,12 @@
void bli_strsm_u_opt_mxn(
float* restrict a,
float* restrict b,
float* restrict bd,
float* restrict c, inc_t rs_c, inc_t cs_c
)
{
/* Just call the reference implementation. */
bli_strsm_u_ref_mxn( a,
b,
bd,
c, rs_c, cs_c );
}
@@ -55,7 +53,6 @@ void bli_strsm_u_opt_mxn(
void bli_dtrsm_u_opt_mxn(
double* restrict a,
double* restrict b,
double* restrict bd,
double* restrict c, inc_t rs_c, inc_t cs_c
)
{
@@ -76,11 +73,6 @@ void bli_dtrsm_u_opt_mxn(
where A11 is MR x MR and upper triangular, B11 is MR x NR, and C11 is
MR x NR.
NOTE: Here, this trsm micro-kernel supports element "duplication", a
feature that is enabled or disabled in bli_kernel.h. Duplication factors
are also defined in the aforementioned header. Duplication is NOT
commonly used and most developers may assume it is disabled.
Parameters:
- a11: The address of A11, which is the MR x MR upper triangular block
@@ -89,8 +81,6 @@ void bli_dtrsm_u_opt_mxn(
been inverted and the strictly lower triangle contains zeros.
- b11: The address of B11, which is the MR x NR subpartition of the
current packed (row-stored) micro-panel of B.
- bd11: The address of the duplicated copy of B11. If duplication is
disabled, then bd11 == b11.
- c11: The address of C11, which is the MR x NR block of the output
matrix (ie: the matrix provided by the user to the highest-level
trsm API call). C11 corresponds to the elements that exist in
@@ -110,12 +100,6 @@ void bli_dtrsm_u_opt_mxn(
- Note that the diagonal of the triangular matrix A11 contains the INVERSE
of those elements. This is done during packing so that we can avoid
expensive division instructions within this micro-kernel.
- This micro-kernel assumes duplication is NOT enabled. If it IS enabled,
then the result must be written to three places: the sub-block within the
duplicated copy of the current micro-panel of B, the sub-block within the
current packed micro-panel of B, and the sub-block of the output matrix C.
When duplication is not used, the micro-kernel should update only the
latter two locations.
For more info, please refer to the BLIS website and/or contact the
blis-devel mailing list.
@@ -191,14 +175,12 @@ void bli_dtrsm_u_opt_mxn(
void bli_ctrsm_u_opt_mxn(
scomplex* restrict a,
scomplex* restrict b,
scomplex* restrict bd,
scomplex* restrict c, inc_t rs_c, inc_t cs_c
)
{
/* Just call the reference implementation. */
bli_ctrsm_u_ref_mxn( a,
b,
bd,
c, rs_c, cs_c );
}
@@ -207,14 +189,12 @@ void bli_ctrsm_u_opt_mxn(
void bli_ztrsm_u_opt_mxn(
dcomplex* restrict a,
dcomplex* restrict b,
dcomplex* restrict bd,
dcomplex* restrict c, inc_t rs_c, inc_t cs_c
)
{
/* Just call the reference implementation. */
bli_ztrsm_u_ref_mxn( a,
b,
bd,
c, rs_c, cs_c );
}

View File

@@ -42,7 +42,6 @@
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
);

View File

@@ -53,10 +53,10 @@ void bli_axpyd( obj_t* alpha,
dt_x = bli_obj_datatype( *x );
// Create an object to hold a copy-cast of alpha.
bli_obj_init_scalar_copy_of( dt_x,
BLIS_NO_CONJUGATE,
alpha,
&alpha_local );
bli_obj_init_scalar_copy_of( dt_x,
BLIS_NO_CONJUGATE,
alpha,
&alpha_local );
bli_axpyd_unb_var1( &alpha_local,
x,

View File

@@ -53,10 +53,10 @@ void bli_scal2d( obj_t* beta,
dt_x = bli_obj_datatype( *x );
// Create an object to hold a copy-cast of beta.
bli_obj_init_scalar_copy_of( dt_x,
BLIS_NO_CONJUGATE,
beta,
&beta_local );
bli_obj_init_scalar_copy_of( dt_x,
BLIS_NO_CONJUGATE,
beta,
&beta_local );
bli_scal2d_unb_var1( &beta_local,
x,

View File

@@ -52,10 +52,10 @@ void bli_scald( obj_t* beta,
dt_x = bli_obj_datatype( *x );
// Create an object to hold a copy-cast of beta.
bli_obj_init_scalar_copy_of( dt_x,
BLIS_NO_CONJUGATE,
beta,
&beta_local );
bli_obj_init_scalar_copy_of( dt_x,
BLIS_NO_CONJUGATE,
beta,
&beta_local );
bli_scald_unb_var1( &beta_local,
x );

View File

@@ -52,10 +52,10 @@ void bli_setd( obj_t* beta,
dt_x = bli_obj_datatype( *x );
// Create an object to hold a copy-cast of beta.
bli_obj_init_scalar_copy_of( dt_x,
BLIS_NO_CONJUGATE,
beta,
&beta_local );
bli_obj_init_scalar_copy_of( dt_x,
BLIS_NO_CONJUGATE,
beta,
&beta_local );
bli_setd_unb_var1( &beta_local,
x );

View File

@@ -143,13 +143,6 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,packnr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,mr) * \
PASTEMAC(ch,nr) ] \
@@ -160,8 +153,6 @@ void PASTEMAC(ch,varname)( \
/* Alias some constants to shorter names. */ \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
@@ -176,7 +167,6 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a2; \
ctype* restrict b2; \
\
dim_t k_nr; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t i, j; \
@@ -215,9 +205,6 @@ void PASTEMAC(ch,varname)( \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Compute the number of elements in B to duplicate per iteration. */ \
k_nr = k * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
@@ -229,12 +216,6 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* If the micro-kernel needs elements of B duplicated, set bp to
point to the duplication buffer. If no duplication is called for,
bp will be set to the current column panel of B for each iteration
of the outer loop below. */ \
if ( DUPB ) bp = bd; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
@@ -243,11 +224,6 @@ void PASTEMAC(ch,varname)( \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* If duplication is needed, copy the current iteration's NR
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -274,7 +250,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -285,7 +261,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -306,7 +282,7 @@ void PASTEMAC(ch,varname)( \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, b1, NR*NDUP, 1, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
}

View File

@@ -143,12 +143,11 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,maxkc) * \
/* Temporary buffer for incremental packing of B. */ \
ctype bp[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,packnr) * \
PASTEMAC(ch,nifac) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,mr) * \
@@ -226,12 +225,9 @@ void PASTEMAC(ch,varname)( \
b1 = b_cast; \
c1 = c_cast; \
\
/* The current packed micro-panel of B will always be stored in bd. */ \
bp = bd; \
\
/* Since we pack micro-panels of B incrementall, one at a time, the
/* Since we pack micro-panels of B incrementally, one at a time, the
address of the next micro-panel of B remains constant. */ \
b2 = bd; \
b2 = bp; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \

View File

@@ -152,13 +152,6 @@ void PASTEMAC(ch,varname)( \
\
guint_t t_id = omp_get_thread_num(); \
guint_t n_threads = omp_get_num_threads(); \
\
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,nr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,mr) * \
@@ -170,8 +163,6 @@ void PASTEMAC(ch,varname)( \
/* Alias some constants to shorter names. */ \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
@@ -186,7 +177,6 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a2; \
ctype* restrict b2; \
\
dim_t k_nr; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t i, j; \
@@ -217,9 +207,6 @@ void PASTEMAC(ch,varname)( \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* Compute the number of elements in B to duplicate per iteration. */ \
k_nr = k * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
@@ -231,12 +218,6 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* If the micro-kernel needs elements of B duplicated, set bp to
point to the duplication buffer. If no duplication is called for,
bp will be set to the current column panel of B for each iteration
of the outer loop below. */ \
if ( DUPB ) bp = bd; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = t_id; j < n_iter; j += n_threads ) \
@@ -246,11 +227,6 @@ void PASTEMAC(ch,varname)( \
\
a1 = a_cast; \
c11 = c1; \
\
/* If duplication is needed, copy the current iteration's NR
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -272,7 +248,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -295,7 +271,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -318,11 +294,6 @@ void PASTEMAC(ch,varname)( \
\
a1 = a_cast; \
c11 = c1; \
\
/* If duplication is needed, copy the n_left (+ padding) columns
of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -342,7 +313,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -368,7 +339,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -384,7 +355,7 @@ void PASTEMAC(ch,varname)( \
} /* end omp parallel */ \
\
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, bp, NR, 1, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
}

View File

@@ -1,377 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* FUNCPTR_T is a file-local alias for gemm_fp: the signature of the typed
   implementations that bli_gemm_ker_var2() dispatches to. The arguments are
   the m, n, k dimensions, the alpha and beta scalars as untyped pointers, and
   the untyped buffers of A, B and C with their row and column strides (plus
   a panel stride for A and B). */
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)(
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c
);
/* Table of typed implementations; bli_gemm_ker_var2() indexes it with the
   execution datatype. NOTE(review): GENARRAY presumably expands to the
   array declarator and an initializer naming one gemm_ker_var2 instance per
   datatype -- confirm against its definition. */
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
/*
  bli_gemm_ker_var2()

  Object-based front end of the gemm macro-kernel. It unpacks dimensions,
  buffers and strides from the operand objects, resolves the alpha and beta
  scalars to raw buffers, and forwards everything to the typed
  implementation selected by the execution datatype of c.

  The cntl argument is accepted but not consulted here.
*/
void bli_gemm_ker_var2( obj_t*  alpha,
                        obj_t*  a,
                        obj_t*  b,
                        obj_t*  beta,
                        obj_t*  c,
                        gemm_t* cntl )
{
	// The execution datatype of C picks the typed implementation and is
	// also the fallback datatype for constant scalars.
	num_t dt_exec = bli_obj_execution_datatype( *c );

	// Problem dimensions: C is m x n, and k is taken from the width of A.
	dim_t m = bli_obj_length( *c );
	dim_t n = bli_obj_width( *c );
	dim_t k = bli_obj_width( *a );

	// Operand A: buffer (at the object's offset) plus row, column and
	// panel strides.
	void* a_buf = bli_obj_buffer_at_off( *a );
	inc_t a_rs  = bli_obj_row_stride( *a );
	inc_t a_cs  = bli_obj_col_stride( *a );
	inc_t a_ps  = bli_obj_panel_stride( *a );

	// Operand B: same set of properties as A.
	void* b_buf = bli_obj_buffer_at_off( *b );
	inc_t b_rs  = bli_obj_row_stride( *b );
	inc_t b_cs  = bli_obj_col_stride( *b );
	inc_t b_ps  = bli_obj_panel_stride( *b );

	// Operand C: buffer plus row and column strides only.
	void* c_buf = bli_obj_buffer_at_off( *c );
	inc_t c_rs  = bli_obj_row_stride( *c );
	inc_t c_cs  = bli_obj_col_stride( *c );

	// Outputs of the scalar-resolution macro invoked below.
	num_t dt_alpha;
	void* buf_alpha;
	num_t dt_beta;
	void* buf_beta;

	// NOTE: A special case for complex c and a combined with real b is
	// intentionally left disabled. When active it doubled the m dimension,
	// the column stride of c, the column stride (ie: the panel length) of
	// a, and the panel stride of a, since the execution datatype would be
	// real in that situation.

	// Resolve alpha: if it is a scalar constant, dt_exec is used to extract
	// the address of the matching constant value; otherwise the datatype
	// encoded in the alpha object and its buffer at offset are used.
	bli_set_scalar_dt_buffer( alpha, dt_exec, dt_alpha, buf_alpha );

	// Resolve beta by the same rule.
	bli_set_scalar_dt_buffer( beta, dt_exec, dt_beta, buf_beta );

	// Look up the typed function for dt_exec and invoke it.
	ftypes[dt_exec]( m,
	                 n,
	                 k,
	                 buf_alpha,
	                 a_buf, a_rs, a_cs, a_ps,
	                 b_buf, b_rs, b_cs, b_ps,
	                 buf_beta,
	                 c_buf, c_rs, c_cs );
}
/*
  GENTFUNC template for the typed gemm_ker_var2 macro-kernel.

  Template arguments:
    ctype   - element type of the A, B and C buffers.
    ch      - type character pasted into names via PASTEMAC.
    varname - base name of the generated function.
    ukrname - base name of the micro-kernel that is invoked.

  The generated function walks C in MR x NR micro-tiles. The outer loop (j)
  steps over column panels of B, NR columns at a time; the inner loop (i)
  steps over row panels of A, MR rows at a time. Interior tiles are handed to
  the micro-kernel together with beta and the address of the tile in C. Edge
  tiles (m_left rows and/or n_left columns) are first computed into the local
  MR x NR buffer ct with a zero beta, and only the m_left x n_left (or
  MR x n_left, m_left x NR) part is then merged into C by xpbys_mxn using
  beta.

  Duplication of B: when NDUP != 1, each column panel of B is first copied
  into the stack buffer bd by the dupl routine and the micro-kernel reads from
  bd; otherwise bp simply aliases the current panel b1.

  a2 and b2 carry the addresses of the panels of A and B that will be used
  next; they are passed to the micro-kernel as its last two arguments.
  NOTE(review): presumably these exist so the micro-kernel can prefetch --
  confirm against the micro-kernel documentation.

  INSERT_GENTFUNC_BASIC at the bottom instantiates this template for
  gemm_ker_var2 with GEMM_UKERNEL as the micro-kernel.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrname ) \
\
void PASTEMAC(ch,varname)( \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. It is sized for one
column panel (maxkc x nr) times the duplication factor. */ \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,nr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases. It holds one MR x NR micro-tile and
is addressed with unit row stride and a column stride of MR. */ \
ctype ct[ PASTEMAC(ch,mr) * \
PASTEMAC(ch,nr) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = PASTEMAC(ch,mr); \
\
/* Alias some constants to shorter names. DUPB is true only when the
duplication factor differs from 1. */ \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict a1; \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict a2; \
ctype* restrict b2; \
\
dim_t k_nr; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == GEMM_MR
ps_a == stride to next row panel of A
rs_b == GEMM_NR
cs_b == 1
ps_b == stride to next column panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* Compute the number of elements in B to duplicate per iteration. */ \
k_nr = k * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* If the micro-kernel needs elements of B duplicated, set bp to
point to the duplication buffer. If no duplication is called for,
bp will be set to the current column panel of B for each iteration
of the outer loop below. */ \
if ( DUPB ) bp = bd; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
a1 = a_cast; \
c11 = c1; \
\
/* If duplication is needed, copy the current iteration's NR
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
/* Compute the addresses of the next panels of A and B. On the last
micro-tile of this column panel (and only if no bottom edge follows),
a2 wraps to the first panel of A and b2 moves to the next panel of B,
or wraps to the first panel of B if no further panels remain. */ \
a2 = a1 + rstep_a; \
if ( i == m_iter - 1 && m_left == 0 ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( j == n_iter - 1 && n_left == 0 ) \
b2 = b_cast; \
} \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
beta_cast, \
c11, rs_c, cs_c, \
a2, b2 ); \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
/* Bottom edge handling. */ \
if ( m_left ) \
{ \
/* Compute the addresses of the next panels of A and B. */ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( j == n_iter - 1 && n_left == 0 ) \
b2 = b_cast; \
\
\
/* Invoke the gemm micro-kernel, writing a full micro-tile into ct
with a zero beta. */ \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_left, NR, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
if ( n_left ) \
{ \
a1 = a_cast; \
c11 = c1; \
\
/* If duplication is needed, copy the n_left (+ padding) columns
of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Right edge loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
/* Compute the addresses of the next panels of A and B. This is the
last column panel, so at the final micro-tile both wrap to the
beginning. */ \
a2 = a1 + rstep_a; \
if ( i == m_iter - 1 && m_left == 0 ) \
{ \
a2 = a_cast; \
b2 = b_cast; \
} \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale the right edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( MR, n_left, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
/* Bottom-right corner handling. */ \
if ( m_left ) \
{ \
/* Compute the addresses of the next panels of A and B; both wrap
to the beginning since this is the final micro-tile. */ \
a2 = a_cast; \
b2 = b_cast; \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale the bottom-right corner of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_left, n_left, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC( gemm_ker_var2, GEMM_UKERNEL )

View File

@@ -147,13 +147,6 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,packnr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,mr) * \
PASTEMAC(ch,nr) ] \
@@ -164,8 +157,6 @@ void PASTEMAC(ch,varname)( \
/* Alias some constants to shorter names. */ \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
const bool_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
@@ -181,7 +172,6 @@ void PASTEMAC(ch,varname)( \
ctype* restrict b2; \
\
doff_t diagoffc_ij; \
dim_t k_nr; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
@@ -245,9 +235,6 @@ void PASTEMAC(ch,varname)( \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Compute the number of elements in B to duplicate per iteration. */ \
k_nr = k * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
@@ -259,12 +246,6 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* If the micro-kernel needs elements of B duplicated, set bp to
point to the duplication buffer. If no duplication is called for,
bp will be set to the current column panel of B for each iteration
of the outer loop below. */ \
if ( DUPB ) bp = bd; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
@@ -273,11 +254,6 @@ void PASTEMAC(ch,varname)( \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* If duplication is needed, copy the current iteration's NR
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -313,7 +289,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -334,7 +310,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -345,7 +321,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \

View File

@@ -147,13 +147,6 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,packnr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,mr) * \
PASTEMAC(ch,nr) ] \
@@ -164,8 +157,6 @@ void PASTEMAC(ch,varname)( \
/* Alias some constants to shorter names. */ \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
const bool_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
@@ -181,7 +172,6 @@ void PASTEMAC(ch,varname)( \
ctype* restrict b2; \
\
doff_t diagoffc_ij; \
dim_t k_nr; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
@@ -245,9 +235,6 @@ void PASTEMAC(ch,varname)( \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Compute the number of elements in B to duplicate per iteration. */ \
k_nr = k * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
@@ -259,12 +246,6 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* If the micro-kernel needs elements of B duplicated, set bp to
point to the duplication buffer. If no duplication is called for,
bp will be set to the current column panel of B for each iteration
of the outer loop below. */ \
if ( DUPB ) bp = bd; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
@@ -273,11 +254,6 @@ void PASTEMAC(ch,varname)( \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* If duplication is needed, copy the current iteration's NR
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -313,7 +289,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -334,7 +310,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -345,7 +321,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \

View File

@@ -130,13 +130,6 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,packnr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,mr) * \
PASTEMAC(ch,nr) ] \
@@ -148,8 +141,6 @@ void PASTEMAC(ch,varname)( \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t PACKMR = PASTEMAC(ch,packmr); \
const dim_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
@@ -162,7 +153,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict bp_i; \
ctype* restrict b1_i; \
ctype* restrict a2; \
ctype* restrict b2; \
\
@@ -171,7 +162,6 @@ void PASTEMAC(ch,varname)( \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_nr; \
dim_t k_a1011; \
dim_t off_a1011; \
dim_t i, j; \
@@ -232,10 +222,6 @@ void PASTEMAC(ch,varname)( \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Compute the number of elements in B to duplicate per iteration. */ \
k_a1011 = bli_min( k, diagoffa + m ); \
k_nr = k_a1011 * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = k * PACKMR; \
@@ -247,12 +233,6 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* If the micro-kernel needs elements of B duplicated, set bp to
point to the duplication buffer. If no duplication is called for,
bp will be set to the current column panel of B for each iteration
of the outer loop below. */ \
if ( DUPB ) bp = bd; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
@@ -261,11 +241,6 @@ void PASTEMAC(ch,varname)( \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* If duplication is needed, copy the current iteration's NR
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -285,11 +260,11 @@ void PASTEMAC(ch,varname)( \
{ \
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
bp. */ \
b1. */ \
off_a1011 = 0; \
k_a1011 = bli_min( k, diagoffa_i + MR ); \
\
bp_i = bp + off_a1011 * NR * NDUP; \
b1_i = b1 + off_a1011 * NR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + k_a1011 * PACKMR; \
@@ -308,7 +283,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k_a1011, \
alpha_cast, \
a1, \
bp_i, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -324,7 +299,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k_a1011, \
alpha_cast, \
a1, \
bp_i, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -356,7 +331,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
one, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -367,7 +342,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \

View File

@@ -130,13 +130,6 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,packnr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,mr) * \
PASTEMAC(ch,nr) ] \
@@ -148,8 +141,6 @@ void PASTEMAC(ch,varname)( \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t PACKMR = PASTEMAC(ch,packmr); \
const dim_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
@@ -162,7 +153,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict bp_i; \
ctype* restrict b1_i; \
ctype* restrict a2; \
ctype* restrict b2; \
\
@@ -171,7 +162,6 @@ void PASTEMAC(ch,varname)( \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_nr; \
dim_t k_a1112; \
dim_t off_a1112; \
dim_t i, j; \
@@ -240,10 +230,6 @@ void PASTEMAC(ch,varname)( \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Compute the number of elements in B to duplicate per iteration. */ \
k_a1112 = k; \
k_nr = k_a1112 * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = k * PACKMR; \
@@ -255,12 +241,6 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* If the micro-kernel needs elements of B duplicated, set bp to
point to the duplication buffer. If no duplication is called for,
bp will be set to the current column panel of B for each iteration
of the outer loop below. */ \
if ( DUPB ) bp = bd; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
@@ -269,11 +249,6 @@ void PASTEMAC(ch,varname)( \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* If duplication is needed, copy the current iteration's NR
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -293,11 +268,11 @@ void PASTEMAC(ch,varname)( \
{ \
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
bp. */ \
b1. */ \
off_a1112 = bli_max( diagoffa_i, 0 ); \
k_a1112 = k - off_a1112; \
\
bp_i = bp + off_a1112 * NR * NDUP; \
b1_i = b1 + off_a1112 * NR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + k_a1112 * PACKMR; \
@@ -316,7 +291,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k_a1112, \
alpha_cast, \
a1, \
bp_i, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -332,7 +307,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k_a1112, \
alpha_cast, \
a1, \
bp_i, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -364,7 +339,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
one, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -375,7 +350,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -397,7 +372,7 @@ void PASTEMAC(ch,varname)( \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, bp_i, NR, 1, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC( trmm_lu_ker_var2, GEMM_UKERNEL )

View File

@@ -130,13 +130,6 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,packnr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,mr) * \
PASTEMAC(ch,nr) ] \
@@ -149,8 +142,6 @@ void PASTEMAC(ch,varname)( \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t PACKMR = PASTEMAC(ch,packmr); \
const dim_t PACKNR = PASTEMAC(ch,packnr); \
const dim_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
@@ -172,7 +163,6 @@ void PASTEMAC(ch,varname)( \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_nr; \
dim_t k_b1121; \
dim_t off_b1121; \
dim_t i, j; \
@@ -252,12 +242,6 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* If the micro-kernel needs elements of B duplicated, set bp to
point to the duplication buffer. If no duplication is called for,
bp will be set to the current column panel of B for each iteration
of the outer loop below. */ \
if ( DUPB ) bp = bd; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
@@ -273,12 +257,6 @@ void PASTEMAC(ch,varname)( \
in A. Then compute the length of that panel. */ \
off_b1121 = bli_max( -diagoffb_j, 0 ); \
k_b1121 = k - off_b1121; \
k_nr = k_b1121 * NR; \
\
/* If duplication is needed, copy the current iteration's NR
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -313,7 +291,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k_b1121, \
alpha_cast, \
a1_i, \
bp, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -329,7 +307,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k_b1121, \
alpha_cast, \
a1_i, \
bp, \
b1, \
beta_cast, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -368,7 +346,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
one, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -379,7 +357,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -400,7 +378,7 @@ void PASTEMAC(ch,varname)( \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, bp_i, NR, 1, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC( trmm_rl_ker_var2, GEMM_UKERNEL )

View File

@@ -130,13 +130,6 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,packnr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,mr) * \
PASTEMAC(ch,nr) ] \
@@ -149,8 +142,6 @@ void PASTEMAC(ch,varname)( \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t PACKMR = PASTEMAC(ch,packmr); \
const dim_t PACKNR = PASTEMAC(ch,packnr); \
const dim_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
@@ -172,7 +163,6 @@ void PASTEMAC(ch,varname)( \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_nr; \
dim_t k_b0111; \
dim_t off_b0111; \
dim_t i, j; \
@@ -253,12 +243,6 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* If the micro-kernel needs elements of B duplicated, set bp to
point to the duplication buffer. If no duplication is called for,
bp will be set to the current column panel of B for each iteration
of the outer loop below. */ \
if ( DUPB ) bp = bd; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
@@ -273,12 +257,6 @@ void PASTEMAC(ch,varname)( \
so we can index into the corresponding location in A. */ \
off_b0111 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
k_nr = k_b0111 * NR; \
\
/* If duplication is needed, copy the current iteration's NR
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -313,7 +291,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k_b0111, \
alpha_cast, \
a1_i, \
bp, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -329,7 +307,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k_b0111, \
alpha_cast, \
a1_i, \
bp, \
b1, \
beta_cast, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -368,7 +346,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
one, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -379,7 +357,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -400,7 +378,7 @@ void PASTEMAC(ch,varname)( \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, bp_i, NR, 1, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC( trmm_ru_ker_var2, GEMM_UKERNEL )

View File

@@ -140,12 +140,6 @@ void bli_trsm( side_t side,
alpha,
&alpha_local );
//
// NOTE: we need to disable the use of the right-hand side control tree
// if duplication is enabled since the trsm_r macrokernels do not support
// duplication.
//
// Choose the control tree.
if ( bli_is_left( side ) ) cntl = trsm_l_cntl;
else cntl = trsm_r_cntl;

View File

@@ -119,13 +119,6 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,packnr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,mr) * \
PASTEMAC(ch,nr) ] \
@@ -138,8 +131,6 @@ void PASTEMAC(ch,varname)( \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t PACKMR = PASTEMAC(ch,packmr); \
const dim_t PACKNR = PASTEMAC(ch,packnr); \
const dim_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
@@ -151,11 +142,10 @@ void PASTEMAC(ch,varname)( \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict b11; \
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict bp01; \
ctype* restrict bp11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict a2; \
ctype* restrict b2; \
\
@@ -164,7 +154,6 @@ void PASTEMAC(ch,varname)( \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_nr; \
dim_t k_a1011; \
dim_t k_a10; \
dim_t off_a10; \
@@ -237,10 +226,6 @@ void PASTEMAC(ch,varname)( \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Compute the number of elements in B to duplicate per iteration. */ \
k_a1011 = bli_min( k, diagoffa + m ); \
k_nr = k_a1011 * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = k * PACKMR; \
@@ -252,12 +237,6 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* If the micro-kernel needs elements of B duplicated, set bp to
point to the duplication buffer. If no duplication is called for,
bp will be set to the current column panel of B for each iteration
of the outer loop below. */ \
if ( DUPB ) bp = bd; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
@@ -266,11 +245,6 @@ void PASTEMAC(ch,varname)( \
c11 = c1 + (0 )*rstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* If duplication is needed, copy the current iteration's NR
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -298,16 +272,13 @@ void PASTEMAC(ch,varname)( \
\
/* Compute the addresses of the panel A10 and the triangular
block A11. */ \
a10 = a1; \
a11 = a1 + k_a10 * PACKMR; \
a10 = a1; \
a11 = a1 + k_a10 * PACKMR; \
\
/* Now compute the corresponding addresses in Bd. */ \
bp01 = bp + off_a10 * NR * NDUP; \
bp11 = bp + off_a11 * NR * NDUP; \
\
/* Index into b1 to locate the MR x NR block of b1 that will
be updated by the trsm subproblem. */ \
b11 = b1 + off_a11 * PACKNR; \
/* Compute the addresses of the panel B01 and the block
B11. */ \
b01 = b1 + off_a10 * PACKNR; \
b11 = b1 + off_a11 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + k_a1011 * PACKMR; \
@@ -327,8 +298,7 @@ void PASTEMAC(ch,varname)( \
alpha_cast, \
a10, \
a11, \
bp01, \
bp11, \
b01, \
b11, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -340,8 +310,7 @@ void PASTEMAC(ch,varname)( \
alpha_cast, \
a10, \
a11, \
bp01, \
bp11, \
b01, \
b11, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -373,7 +342,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
a1, \
bp, \
b1, \
alpha_cast, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -384,7 +353,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \

View File

@@ -119,13 +119,6 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,packnr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,mr) * \
PASTEMAC(ch,nr) ] \
@@ -138,8 +131,6 @@ void PASTEMAC(ch,varname)( \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t PACKMR = PASTEMAC(ch,packmr); \
const dim_t PACKNR = PASTEMAC(ch,packnr); \
const dim_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
@@ -151,11 +142,10 @@ void PASTEMAC(ch,varname)( \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict b11; \
ctype* restrict a12; \
ctype* restrict a11; \
ctype* restrict bp21; \
ctype* restrict bp11; \
ctype* restrict b21; \
ctype* restrict b11; \
ctype* restrict a2; \
ctype* restrict b2; \
\
@@ -164,7 +154,6 @@ void PASTEMAC(ch,varname)( \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_nr; \
dim_t k_a1112; \
dim_t k_a11; \
dim_t k_a12; \
@@ -246,10 +235,6 @@ void PASTEMAC(ch,varname)( \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Compute the number of elements in B to duplicate per iteration. */ \
k_a1112 = k; \
k_nr = k_a1112 * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = k * PACKMR; \
@@ -261,12 +246,6 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* If the micro-kernel needs elements of B duplicated, set bp to
point to the duplication buffer. If no duplication is called for,
bp will be set to the current column panel of B for each iteration
of the outer loop below. */ \
if ( DUPB ) bp = bd; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
@@ -275,11 +254,6 @@ void PASTEMAC(ch,varname)( \
c11 = c1 + (m_iter-1)*rstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* If duplication is needed, copy the current iteration's NR
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -309,16 +283,13 @@ void PASTEMAC(ch,varname)( \
\
/* Compute the addresses of the triangular block A11 and the
panel A12. */ \
a11 = a1; \
a12 = a1 + k_a11 * PACKMR; \
a11 = a1; \
a12 = a1 + k_a11 * PACKMR; \
\
/* Now compute the corresponding addresses in Bd. */ \
bp11 = bp + off_a11 * NR * NDUP; \
bp21 = bp + off_a12 * NR * NDUP; \
\
/* Index into b1 to locate the MR x NR block of b1 that will be
updated by the trsm subproblem. */ \
b11 = b1 + off_a11 * PACKNR; \
/* Compute the addresses of the block B11 and the panel
B21. */ \
b11 = b1 + off_a11 * PACKNR; \
b21 = b1 + off_a12 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + k_a1112 * PACKMR; \
@@ -338,8 +309,7 @@ void PASTEMAC(ch,varname)( \
alpha_cast, \
a12, \
a11, \
bp21, \
bp11, \
b21, \
b11, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -351,8 +321,7 @@ void PASTEMAC(ch,varname)( \
alpha_cast, \
a12, \
a11, \
bp21, \
bp11, \
b21, \
b11, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -384,7 +353,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
a1, \
bp, \
b1, \
alpha_cast, \
c11, rs_c, cs_c, \
a2, b2 ); \
@@ -395,7 +364,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
a1, \
bp, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
a2, b2 ); \
@@ -433,7 +402,7 @@ printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: bp11 after (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
*/ \

View File

@@ -314,7 +314,6 @@ void PASTEMAC(ch,varname)( \
b11, \
a12, \
a11, \
a11, \
c11, cs_c, rs_c, \
b2, a2 ); \
} \
@@ -327,7 +326,6 @@ void PASTEMAC(ch,varname)( \
b11, \
a12, \
a11, \
a11, \
ct, cs_ct, rs_ct, \
b2, a2 ); \
\

View File

@@ -308,7 +308,6 @@ void PASTEMAC(ch,varname)( \
b11, \
a10, \
a11, \
a11, \
c11, cs_c, rs_c, \
b2, a2 ); \
} \
@@ -321,7 +320,6 @@ void PASTEMAC(ch,varname)( \
b11, \
a10, \
a11, \
a11, \
ct, cs_ct, rs_ct, \
b2, a2 ); \
\
@@ -390,18 +388,6 @@ void PASTEMAC(ch,varname)( \
b1 += k_b0111 * PACKNR; \
c1 += cstep_c; \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC2( trsm_ru_ker_var2, GEMMTRSM_L_UKERNEL, GEMM_UKERNEL )

View File

@@ -43,8 +43,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict alpha, \
ctype* restrict a10, \
ctype* restrict a11, \
ctype* restrict bd01, \
ctype* restrict bd11, \
ctype* restrict b01, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
@@ -56,22 +55,20 @@ void PASTEMAC(ch,varname)( \
\
ctype* restrict minus_one = PASTEMAC(ch,m1); \
\
/* b11 = alpha * b11 - a10 * bd01; */ \
/* b11 = alpha * b11 - a10 * b01; */ \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
a10, \
bd01, \
b01, \
alpha, \
b11, rs_b, cs_b, \
a_next, \
b_next ); \
\
/* b11 = inv(a11) * b11;
bd11 = b11; (skipped if duplication is disabled)
c11 = b11; */ \
/* b11 = inv(a11) * b11;
c11 = b11; */ \
PASTEMAC(ch,trsmukr)( a11, \
b11, \
bd11, \
c11, rs_c, cs_c ); \
}

View File

@@ -44,8 +44,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict alpha, \
ctype* restrict a10, \
ctype* restrict a11, \
ctype* restrict bd01, \
ctype* restrict bd11, \
ctype* restrict b01, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \

View File

@@ -43,8 +43,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict alpha, \
ctype* restrict a12, \
ctype* restrict a11, \
ctype* restrict bd21, \
ctype* restrict bd11, \
ctype* restrict b21, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
@@ -60,18 +59,16 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
a12, \
bd21, \
b21, \
alpha, \
b11, rs_b, cs_b, \
a_next, \
b_next ); \
\
/* b11 = inv(a11) * b11;
bd11 = b11; (skipped if duplication is disabled)
c11 = b11; */ \
/* b11 = inv(a11) * b11;
c11 = b11; */ \
PASTEMAC(ch,trsmukr)( a11, \
b11, \
bd11, \
c11, rs_c, cs_c ); \
}

View File

@@ -44,8 +44,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict alpha, \
ctype* restrict a12, \
ctype* restrict a11, \
ctype* restrict bd21, \
ctype* restrict bd11, \
ctype* restrict b21, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \

View File

@@ -41,7 +41,6 @@
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
) \
{ \

View File

@@ -42,7 +42,6 @@
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
);

View File

@@ -41,7 +41,6 @@
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
) \
{ \

View File

@@ -42,7 +42,6 @@
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
);

View File

@@ -53,17 +53,17 @@
if ( incx_blas < 0 ) \
{ \
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think of
this is that negative strides effectively reverse the order of
the vector, but without any explicit data movements.) This is
also how BLIS interprets negative strides. The difference is
that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */ \
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The difference
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */ \
x_blis = (x_blas) + (n-1)*(-incx_blas); \
incx_blis = ( inc_t )(incx_blas); \
} \

View File

@@ -234,13 +234,6 @@
#define bli_zpackkr BLIS_PACKDIM_KR_Z
#define bli_zpacknr BLIS_PACKDIM_NR_Z
// Duplication factors
#define bli_sndup BLIS_DEFAULT_NUM_DUPL_S
#define bli_dndup BLIS_DEFAULT_NUM_DUPL_D
#define bli_cndup BLIS_DEFAULT_NUM_DUPL_C
#define bli_zndup BLIS_DEFAULT_NUM_DUPL_Z
// Incremental packing factors
#define bli_snifac BLIS_DEFAULT_NI_FAC

View File

@@ -189,11 +189,6 @@ extern "C" {
#include "bli_trsv.h"
// -- Helper operands for ukernels --
#include "bli_dupl.h"
// -- Level-3 operations --
#include "bli_gemm.h"

View File

@@ -1,73 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define object-based interface.
//
// GENFRONT( opname, varname ) defines the object-based front end
// PASTEMAC0(opname): a function taking the two obj_t operands b and bd and
// forwarding them, unchanged, to PASTEMAC0(varname). It is instantiated once
// below, for dupl, with DUPL_KERNEL as the variant to call.
#undef GENFRONT
#define GENFRONT( opname, varname ) \
\
void PASTEMAC0(opname)( \
obj_t* b, \
obj_t* bd \
) \
{ \
PASTEMAC0(varname)( b, \
bd ); \
}
GENFRONT( dupl, DUPL_KERNEL )
//
// Define BLAS-like interfaces.
//
// GENTFUNC( ctype, ch, opname, varname ) defines the typed front end
// PASTEMAC(ch,opname): it forwards k, b and bd, unchanged, to
// PASTEMAC(ch,varname). INSERT_GENTFUNC_BASIC instantiates it for dupl with
// DUPL_KERNEL as the variant (presumably once per supported datatype --
// confirm against the definition of INSERT_GENTFUNC_BASIC).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,opname)( \
dim_t k, \
ctype* b, \
ctype* bd \
) \
{ \
PASTEMAC(ch,varname)( k, \
b, \
bd ); \
}
INSERT_GENTFUNC_BASIC( dupl, DUPL_KERNEL )

View File

@@ -1,58 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_dupl_unb_var1.h"
//
// Prototype object-based interface.
//
// Object-based dupl front end: b is the source object and bd is the object
// that receives the duplicated elements.
void bli_dupl( obj_t* b,
obj_t* bd );
//
// Prototype BLAS-like interfaces.
//
// GENTPROT( ctype, ch, opname ) declares the typed dupl front end
// PASTEMAC(ch,opname)( k, b, bd ), where b and bd point to elements of type
// ctype. INSERT_GENTPROT_BASIC emits the declarations for dupl (presumably
// one per supported datatype -- confirm against the macro definition).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname)( \
dim_t k, \
ctype* b, \
ctype* bd \
);
INSERT_GENTPROT_BASIC( dupl )

View File

@@ -1,108 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T (dupl_fp) is the common signature of the type-specific
// dupl_unb_var1 implementations: a dimension plus untyped source (b) and
// destination (bd) buffers.
#define FUNCPTR_T dupl_fp
typedef void (*FUNCPTR_T)(
dim_t k,
void* b,
void* bd
);
// Table of implementations declared through GENARRAY; bli_dupl_unb_var1()
// indexes it with the datatype of b. (Presumably GENARRAY fills it with the
// per-datatype dupl_unb_var1 functions -- confirm against its definition.)
static FUNCPTR_T GENARRAY(ftypes,dupl_unb_var1);
// Object-based entry point of the unblocked dupl variant. Looks up the
// type-specific implementation from the datatype of b and invokes it on the
// buffers of b (source) and bd (destination).
void bli_dupl_unb_var1( obj_t* b,
                        obj_t* bd )
{
	// Pick the implementation that matches the datatype of b.
	const FUNCPTR_T dupl_fn = ftypes[ bli_obj_datatype( *b ) ];

	// Locate the source and destination buffers.
	void* const     src     = bli_obj_buffer_at_off( *b );
	void* const     dst     = bli_obj_buffer_at_off( *bd );

	// The k dimension is the one "perpendicular" to the storage dimension:
	// the length of b when it is row-stored, and its width otherwise.
	const dim_t     k_dim   = ( bli_obj_is_row_stored( *b )
	                            ? bli_obj_length( *b )
	                            : bli_obj_width( *b ) );

	// Hand off to the typed implementation.
	dupl_fn( k_dim, src, dst );
}
/* Typed implementation of the unblocked dupl variant.

   For each of the n elements of b, addressed as row el / NR and column
   el % NR of a panel whose rows are PACKNR elements apart, NDUP consecutive
   copies are written to bd starting at offset el * NDUP. */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname)( \
                           dim_t n, \
                           void* b, \
                           void* bd \
                         ) \
{ \
	const dim_t NR     = PASTEMAC(ch,nr); \
	const dim_t PACKNR = PASTEMAC(ch,packnr); \
	const dim_t NDUP   = PASTEMAC(ch,ndup); \
\
	ctype*      src    = b; \
	ctype*      dst    = bd; \
\
	dim_t       idx; \
	dim_t       rep; \
\
	for ( idx = 0; idx < n; ++idx ) \
	{ \
		/* Offset of the current element within the packed panel, and the
		   start of its group of NDUP copies in the destination. */ \
		const dim_t src_off = ( idx / NR ) * PACKNR + ( idx % NR ); \
		ctype*      out     = dst + idx * NDUP; \
\
		for ( rep = 0; rep < NDUP; ++rep ) \
			out[ rep ] = src[ src_off ]; \
	} \
}
INSERT_GENTFUNC_BASIC( dupl_unb_var1, dupl_unb_var1 )

View File

@@ -1,56 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interface.
//
// Object-based unblocked dupl variant: b is the source object and bd is the
// object whose buffer receives the duplicated elements.
void bli_dupl_unb_var1( obj_t* b,
obj_t* bd );
//
// Prototype BLAS-like interfaces.
//
// GENTPROT( ctype, ch, varname ) declares the typed variant
// PASTEMAC(ch,varname)( n, b, bd ), which takes untyped source (b) and
// destination (bd) buffers. INSERT_GENTPROT_BASIC emits the declarations for
// dupl_unb_var1 (presumably one per supported datatype -- confirm against
// the macro definition).
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t n, \
void* b, \
void* bd \
);
INSERT_GENTPROT_BASIC( dupl_unb_var1 )

View File

@@ -43,8 +43,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict alpha, \
ctype* restrict aL, \
ctype* restrict a, \
ctype* restrict bdT, \
ctype* restrict bd, \
ctype* restrict bT, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
@@ -59,7 +58,7 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
aL, \
bdT, \
bT, \
alpha, \
b, rs_b, cs_b, \
a_next, \
@@ -67,7 +66,6 @@ void PASTEMAC(ch,varname)( \
\
PASTEMAC(ch,trsmukr)( a, \
b, \
bd, \
c, rs_c, cs_c ); \
}

View File

@@ -41,8 +41,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict alpha, \
ctype* restrict aL, \
ctype* restrict a, \
ctype* restrict bdT, \
ctype* restrict bd, \
ctype* restrict bT, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \

View File

@@ -43,8 +43,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict alpha, \
ctype* restrict aR, \
ctype* restrict a, \
ctype* restrict bdB, \
ctype* restrict bd, \
ctype* restrict bB, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
@@ -59,14 +58,13 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
aR, \
bdB, \
bB, \
alpha, \
b, rs_b, cs_b, \
a_next, b_next ); \
\
PASTEMAC(ch,trsmukr)( a, \
b, \
bd, \
c, rs_c, cs_c ); \
}

View File

@@ -41,8 +41,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict alpha, \
ctype* restrict aR, \
ctype* restrict a, \
ctype* restrict bdB, \
ctype* restrict bd, \
ctype* restrict bB, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \

View File

@@ -41,7 +41,6 @@
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
) \
{ \

View File

@@ -39,7 +39,6 @@
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
);

View File

@@ -41,7 +41,6 @@
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
) \
{ \

View File

@@ -39,7 +39,6 @@
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
);

View File

@@ -1,152 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Single-precision real dupl kernel: not implemented. The only action taken
// is to pass BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code(); n_elem, b
// and bd are not used.
void bli_sdupl_opt_var1(
dim_t n_elem,
float* b,
float* bd
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
// Double-precision real dupl kernel written in inline assembly.
//
// Reads n_elem doubles from b and writes each one twice, back to back, to bd:
// movddup loads one double into both halves of an xmm register and movapd
// stores the full 16-byte register. b therefore advances 8 bytes and bd
// 16 bytes per element.
//
// The work is split into a main loop that handles 8 elements per iteration
// (n_iter = n_elem / 8) and an edge loop that handles the remaining
// n_left = n_elem % 8 elements one at a time.
//
// bd must be 16-byte aligned because the stores use movapd.
//
// NOTE(review): pointers are loaded with movl into 32-bit registers
// (eax/ebx), and the "r" operands n_iter/n_left are moved with movl as well;
// this presumably targets 32-bit x86 with a 32-bit dim_t -- confirm.
// NOTE(review): the labels (.LOOPNITER, .CONSIDERNLEFT, .LOOPNLEFT, .DONE)
// are not local to this asm statement, so they would clash if the statement
// were ever emitted more than once in a translation unit -- confirm this
// function is never inlined or duplicated.
void bli_ddupl_opt_var1(
dim_t n_elem,
double* b,
double* bd
)
{
dim_t n_iter = n_elem / 8;
dim_t n_left = n_elem % 8;
__asm__ volatile
(
" \n\t"
"movl %2, %%eax \n\t" // load address of b.
"movl %3, %%ebx \n\t" // load address of bd.
" \n\t"
" \n\t"
" \n\t"
"movl %0, %%esi \n\t" // i = n_iter;
"testl %%esi, %%esi \n\t" // check n_iter via logical AND.
"je .CONSIDERNLEFT \n\t" // if i == 0, jump to code that
" \n\t" // contains the n_left loop.
" \n\t"
" \n\t"
".LOOPNITER: \n\t" // MAIN LOOP
" \n\t"
"movddup 0 * 8(%%eax), %%xmm0 \n\t"
"movddup 1 * 8(%%eax), %%xmm1 \n\t"
"movddup 2 * 8(%%eax), %%xmm2 \n\t"
"movddup 3 * 8(%%eax), %%xmm3 \n\t"
"movddup 4 * 8(%%eax), %%xmm4 \n\t"
"movddup 5 * 8(%%eax), %%xmm5 \n\t"
"movddup 6 * 8(%%eax), %%xmm6 \n\t"
"movddup 7 * 8(%%eax), %%xmm7 \n\t"
"addl $64, %%eax \n\t" // b += 8;
" \n\t"
"movapd %%xmm0, 0 * 16(%%ebx) \n\t"
"movapd %%xmm1, 1 * 16(%%ebx) \n\t"
"movapd %%xmm2, 2 * 16(%%ebx) \n\t"
"movapd %%xmm3, 3 * 16(%%ebx) \n\t"
"movapd %%xmm4, 4 * 16(%%ebx) \n\t"
"movapd %%xmm5, 5 * 16(%%ebx) \n\t"
"movapd %%xmm6, 6 * 16(%%ebx) \n\t"
"movapd %%xmm7, 7 * 16(%%ebx) \n\t"
"addl $128, %%ebx \n\t" // bd += 16;
" \n\t"
"decl %%esi \n\t" // i -= 1;
"jne .LOOPNITER \n\t" // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".CONSIDERNLEFT: \n\t"
" \n\t"
"movl %1, %%esi \n\t" // i = n_left;
"testl %%esi, %%esi \n\t" // check n_left via logical AND.
"je .DONE \n\t" // if i == 0, we're done; jump to end.
" \n\t" // else, we prepare to enter n_left loop.
" \n\t"
" \n\t"
".LOOPNLEFT: \n\t" // EDGE LOOP
" \n\t"
"movddup 0 * 8(%%eax), %%xmm0 \n\t"
"addl $8, %%eax \n\t" // b += 1;
" \n\t"
"movapd %%xmm0, 0 * 16(%%ebx) \n\t"
"addl $16, %%ebx \n\t" // bd += 2;
" \n\t"
"decl %%esi \n\t" // i -= 1;
"jne .LOOPNLEFT \n\t" // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".DONE: \n\t"
" \n\t"
: // output operands (none)
: // input operands
"r" (n_iter),
"r" (n_left),
"m" (b),
"m" (bd)
: // register clobber list
"eax", "ebx", "esi",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
"memory"
);
}
// Single-precision complex dupl kernel: not implemented. The only action
// taken is to pass BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code(); k, b
// and bd are not used.
void bli_cdupl_opt_var1(
dim_t k,
scomplex* b,
scomplex* bd
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
// Double-precision complex dupl kernel: not implemented. The only action
// taken is to pass BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code(); k, b
// and bd are not used.
void bli_zdupl_opt_var1(
dim_t k,
dcomplex* b,
dcomplex* bd
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}

View File

@@ -1,46 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// GENTPROT( ctype, ch, varname ) declares the typed optimized dupl kernel
// PASTEMAC(ch,varname)( n_elem, b, bd ), where b and bd point to elements of
// type ctype. INSERT_GENTPROT_BASIC emits the declarations for dupl_opt_var1
// (presumably one per supported datatype -- confirm against the macro
// definition).
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t n_elem, \
ctype* b, \
ctype* bd \
);
INSERT_GENTPROT_BASIC( dupl_opt_var1 )

View File

@@ -39,8 +39,7 @@ void bli_sgemmtrsm_l_opt_d4x4(
float* restrict alpha,
float* restrict a10,
float* restrict a11,
float* restrict bd01,
float* restrict bd11,
float* restrict b01,
float* restrict b11,
float* restrict c11, inc_t rs_c, inc_t cs_c,
float* restrict a_next,
@@ -55,8 +54,7 @@ void bli_dgemmtrsm_l_opt_d4x4(
double* restrict alpha,
double* restrict a10,
double* restrict a11,
double* restrict bd01,
double* restrict bd11,
double* restrict b01,
double* restrict b11,
double* restrict c11, inc_t rs_c, inc_t cs_c,
double* restrict a_next,
@@ -73,8 +71,8 @@ void bli_dgemmtrsm_l_opt_d4x4(
(
" \n\t"
"movq %2, %%rax \n\t" // load address of a10.
"movq %4, %%rbx \n\t" // load address of bd01.
//"movq %11, %%r9 \n\t" // load address of b_next.
"movq %4, %%rbx \n\t" // load address of b01.
//"movq %10, %%r9 \n\t" // load address of b_next.
" \n\t"
"subq $-8 * 16, %%rax \n\t" // increment pointers to allow byte
"subq $-8 * 16, %%rbx \n\t" // offsets in the unrolled iterations.
@@ -83,7 +81,7 @@ void bli_dgemmtrsm_l_opt_d4x4(
"movaps -7 * 16(%%rax), %%xmm1 \n\t" // of a and b.
"movaps -8 * 16(%%rbx), %%xmm2 \n\t"
" \n\t"
//"movq %7, %%rcx \n\t" // load address of c11
//"movq %6, %%rcx \n\t" // load address of c11
//"movq %8, %%rdi \n\t" // load cs_c
//"leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double)
//"leaq (%%rcx,%%rdi,2), %%rdx \n\t" // load address of c + 2*cs_c;
@@ -320,7 +318,7 @@ void bli_dgemmtrsm_l_opt_d4x4(
" \n\t"
" \n\t"
" \n\t"
"movq %6, %%rbx \n\t" // load address of b11.
"movq %5, %%rbx \n\t" // load address of b11.
" \n\t"
" \n\t" // xmm8: xmm9: xmm10: xmm11:
" \n\t" // ( ab01 ( ab00 ( ab03 ( ab02
@@ -354,7 +352,7 @@ void bli_dgemmtrsm_l_opt_d4x4(
" \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
" \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
" \n\t"
"movq %10, %%rax \n\t" // load address of alpha
"movq %9, %%rax \n\t" // load address of alpha
"movddup (%%rax), %%xmm15 \n\t" // load alpha and duplicate
" \n\t"
"movaps 0 * 16(%%rbx), %%xmm8 \n\t"
@@ -394,10 +392,10 @@ void bli_dgemmtrsm_l_opt_d4x4(
" \n\t"
" \n\t"
"movq %3, %%rax \n\t" // load address of a11
"movq %7, %%rcx \n\t" // load address of c11
"movq %6, %%rcx \n\t" // load address of c11
" \n\t"
"movq %8, %%rsi \n\t" // load rs_c
"movq %9, %%rdi \n\t" // load cs_c
"movq %7, %%rsi \n\t" // load rs_c
"movq %8, %%rdi \n\t" // load cs_c
"salq $3, %%rsi \n\t" // rs_c *= sizeof( double )
"salq $3, %%rdi \n\t" // cs_c *= sizeof( double )
" \n\t"
@@ -514,18 +512,17 @@ void bli_dgemmtrsm_l_opt_d4x4(
: // output operands (none)
: // input operands
"m" (k_iter),
"m" (k_left),
"m" (a10),
"m" (a11),
"m" (bd01),
"m" (bd11),
"m" (b11),
"m" (c11),
"m" (rs_c),
"m" (cs_c),
"m" (alpha),
"m" (b_next)
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a10), // 2
"m" (a11), // 3
"m" (b01), // 4
"m" (b11), // 5
"m" (c11), // 6
"m" (rs_c), // 7
"m" (cs_c), // 8
"m" (alpha), // 9
"m" (b_next) // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"r8", "r9", "r10",
"xmm0", "xmm1", "xmm2", "xmm3",
@@ -542,8 +539,7 @@ void bli_cgemmtrsm_l_opt_d4x4(
scomplex* restrict alpha,
scomplex* restrict a10,
scomplex* restrict a11,
scomplex* restrict bd01,
scomplex* restrict bd11,
scomplex* restrict b01,
scomplex* restrict b11,
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
scomplex* restrict a_next,
@@ -558,8 +554,7 @@ void bli_zgemmtrsm_l_opt_d4x4(
dcomplex* restrict alpha,
dcomplex* restrict a10,
dcomplex* restrict a11,
dcomplex* restrict bd01,
dcomplex* restrict bd11,
dcomplex* restrict b01,
dcomplex* restrict b11,
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
dcomplex* restrict a_next,

View File

@@ -41,8 +41,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict alpha, \
ctype* restrict a10, \
ctype* restrict a11, \
ctype* restrict bd01, \
ctype* restrict bd11, \
ctype* restrict b01, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \

View File

@@ -39,8 +39,7 @@ void bli_sgemmtrsm_u_opt_d4x4(
float* restrict alpha,
float* restrict a12,
float* restrict a11,
float* restrict bd21,
float* restrict bd11,
float* restrict b21,
float* restrict b11,
float* restrict c11, inc_t rs_c, inc_t cs_c,
float* restrict a_next,
@@ -55,8 +54,7 @@ void bli_dgemmtrsm_u_opt_d4x4(
double* restrict alpha,
double* restrict a12,
double* restrict a11,
double* restrict bd21,
double* restrict bd11,
double* restrict b21,
double* restrict b11,
double* restrict c11, inc_t rs_c, inc_t cs_c,
double* restrict a_next,
@@ -73,7 +71,8 @@ void bli_dgemmtrsm_u_opt_d4x4(
(
" \n\t"
"movq %2, %%rax \n\t" // load address of a12.
"movq %4, %%rbx \n\t" // load address of bd21.
"movq %4, %%rbx \n\t" // load address of b21.
//"movq %10, %%r9 \n\t" // load address of b_next.
" \n\t"
"addq $8 * 16, %%rax \n\t" // increment pointers to allow byte
"addq $8 * 16, %%rbx \n\t" // offsets in the unrolled iterations.
@@ -302,7 +301,7 @@ void bli_dgemmtrsm_u_opt_d4x4(
" \n\t"
" \n\t"
" \n\t"
"movq %6, %%rbx \n\t" // load address of b11.
"movq %5, %%rbx \n\t" // load address of b11.
" \n\t"
" \n\t" // xmm8: xmm9: xmm10: xmm11:
" \n\t" // ( ab01 ( ab00 ( ab03 ( ab02
@@ -336,7 +335,7 @@ void bli_dgemmtrsm_u_opt_d4x4(
" \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
" \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
" \n\t"
"movq %10, %%rax \n\t" // load address of alpha
"movq %9, %%rax \n\t" // load address of alpha
"movddup (%%rax), %%xmm15 \n\t" // load alpha and duplicate
" \n\t"
"movaps 0 * 16(%%rbx), %%xmm8 \n\t"
@@ -376,10 +375,10 @@ void bli_dgemmtrsm_u_opt_d4x4(
" \n\t"
" \n\t"
"movq %3, %%rax \n\t" // load address of a11
"movq %7, %%rcx \n\t" // load address of c11
"movq %6, %%rcx \n\t" // load address of c11
" \n\t"
"movq %8, %%rsi \n\t" // load rs_c
"movq %9, %%rdi \n\t" // load cs_c
"movq %7, %%rsi \n\t" // load rs_c
"movq %8, %%rdi \n\t" // load cs_c
"salq $3, %%rsi \n\t" // rs_c *= sizeof( double )
"salq $3, %%rdi \n\t" // cs_c *= sizeof( double )
" \n\t"
@@ -499,17 +498,17 @@ void bli_dgemmtrsm_u_opt_d4x4(
: // output operands (none)
: // input operands
"m" (k_iter),
"m" (k_left),
"m" (a12),
"m" (a11),
"m" (bd21),
"m" (bd11),
"m" (b11),
"m" (c11),
"m" (rs_c),
"m" (cs_c),
"m" (alpha)
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a12), // 2
"m" (a11), // 3
"m" (b21), // 4
"m" (b11), // 5
"m" (c11), // 6
"m" (rs_c), // 7
"m" (cs_c), // 8
"m" (alpha), // 9
"m" (b_next) // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"xmm0", "xmm1", "xmm2", "xmm3",
@@ -526,8 +525,7 @@ void bli_cgemmtrsm_u_opt_d4x4(
scomplex* restrict alpha,
scomplex* restrict a12,
scomplex* restrict a11,
scomplex* restrict bd21,
scomplex* restrict bd11,
scomplex* restrict b21,
scomplex* restrict b11,
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
scomplex* restrict a_next,
@@ -542,8 +540,7 @@ void bli_zgemmtrsm_u_opt_d4x4(
dcomplex* restrict alpha,
dcomplex* restrict a12,
dcomplex* restrict a11,
dcomplex* restrict bd21,
dcomplex* restrict bd11,
dcomplex* restrict b21,
dcomplex* restrict b11,
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
dcomplex* restrict a_next,

View File

@@ -41,8 +41,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict alpha, \
ctype* restrict a12, \
ctype* restrict a11, \
ctype* restrict bd21, \
ctype* restrict bd11, \
ctype* restrict b21, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \

View File

@@ -3,7 +3,7 @@ c #rg # Matrix storage scheme(s) to test ('c' = col-major; 'r' = row-major;
c #rji # Vector storage scheme(s) to test ('c' = colvec/unit; 'r' = rowvec/unit; 'j' = colvec/non-unit; 'i' = rowvec/non-unit)
0 # Test all combinations of storage schemes?
32 # General stride spacing (for cases when testing general stride)
sdcz #sdcz # Datatype(s) to test
d #sdcz # Datatype(s) to test
100 # Problem size: first to test
300 # Problem size: maximum to test
100 # Problem size: increment between experiments

View File

@@ -64,8 +64,7 @@ void libblis_test_gemmtrsm_ukr_impl( mt_impl_t impl,
obj_t* alpha,
obj_t* a1x,
obj_t* a11,
obj_t* bdx1,
obj_t* bd11,
obj_t* bx1,
obj_t* b11,
obj_t* c11 );
@@ -82,13 +81,10 @@ void libblis_test_gemmtrsm_ukr_check( side_t side,
void bli_gemmtrsm_ukr_make_subparts( dim_t k,
obj_t* a,
obj_t* b,
obj_t* bd,
obj_t* a1x,
obj_t* a11,
obj_t* bx1,
obj_t* b11,
obj_t* bdx1,
obj_t* bd11 );
obj_t* b11 );
void libblis_test_gemmtrsm_ukr_deps( test_params_t* params, test_op_t* op )
@@ -166,10 +162,10 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
obj_t kappa;
obj_t alpha;
obj_t a_big, a, b, bd;
obj_t a_big, a, b;
obj_t b11, c11;
obj_t ap, bp;
obj_t a1xp, a11p, bdx1, bd11, bx1p, b11p;
obj_t a1xp, a11p, bx1p, b11p;
obj_t c11_save;
@@ -201,8 +197,6 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
sc_str[0], m, n, &c11 );
libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
sc_str[0], m, n, &c11_save );
libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
sc_b, k+m, 4*n, &bd );
// Set alpha.
if ( bli_obj_is_real( b ) )
@@ -264,8 +258,8 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
// Create subpartitions from the a and b panels.
bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, &bd,
&a1xp, &a11p, &bx1p, &b11p, &bdx1, &bd11 );
bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp,
&a1xp, &a11p, &bx1p, &b11p );
// Repeat the experiment n_repeats times and record results.
@@ -279,7 +273,7 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
time = bli_clock();
libblis_test_gemmtrsm_ukr_impl( impl, side, &alpha,
&a1xp, &a11p, &bdx1, &bd11, &b11p, &c11 );
&a1xp, &a11p, &bx1p, &b11p, &c11 );
time_min = bli_clock_min_diff( time_min, time );
}
@@ -304,7 +298,6 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
bli_obj_free( &b );
bli_obj_free( &c11 );
bli_obj_free( &c11_save );
bli_obj_free( &bd );
}
@@ -314,15 +307,14 @@ void libblis_test_gemmtrsm_ukr_impl( mt_impl_t impl,
obj_t* alpha,
obj_t* a1x,
obj_t* a11,
obj_t* bdx1,
obj_t* bd11,
obj_t* bx1,
obj_t* b11,
obj_t* c11 )
{
switch ( impl )
{
case BLIS_TEST_SEQ_UKERNEL:
bli_gemmtrsm_ukr( alpha, a1x, a11, bdx1, bd11, b11, c11 );
bli_gemmtrsm_ukr( alpha, a1x, a11, bx1, b11, c11 );
break;
default:
@@ -431,20 +423,16 @@ void libblis_test_gemmtrsm_ukr_check( side_t side,
void bli_gemmtrsm_ukr_make_subparts( dim_t k,
obj_t* a,
obj_t* b,
obj_t* bd,
obj_t* a1x,
obj_t* a11,
obj_t* bx1,
obj_t* b11,
obj_t* bdx1,
obj_t* bd11 )
obj_t* b11 )
{
dim_t mr = bli_obj_length( *a );
dim_t nr = bli_obj_width( *b );
dim_t off_a1x, off_a11;
dim_t off_bx1, off_b11;
dim_t off_bdx1, off_bd11;
if ( bli_obj_is_lower( *a ) )
{
@@ -452,8 +440,6 @@ void bli_gemmtrsm_ukr_make_subparts( dim_t k,
off_a11 = k;
off_bx1 = 0;
off_b11 = k;
off_bdx1 = 0;
off_bd11 = k;
}
else
{
@@ -461,8 +447,6 @@ void bli_gemmtrsm_ukr_make_subparts( dim_t k,
off_a11 = 0;
off_bx1 = mr;
off_b11 = 0;
off_bdx1 = mr;
off_bd11 = 0;
}
bli_obj_init_subpart_from( *a, *a1x );
@@ -488,28 +472,6 @@ void bli_gemmtrsm_ukr_make_subparts( dim_t k,
// Set the diagonal offset of a11 to 0 (which overwrites the diagonal
// offset value it inherited from a).
bli_obj_set_diag_offset( 0, *a11 );
// If duplication is disabled, alias bdxx objects to bxx.
if ( TRUE )
{
bli_obj_alias_to( *bx1, *bdx1 );
bli_obj_alias_to( *b11, *bd11 );
}
else // if duplication is enabled
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
bli_obj_init_subpart_from( *b, *bdx1 );
bli_obj_set_dims( k, nr, *bdx1 );
bli_obj_inc_offs( off_bdx1, 0, *bdx1 );
bli_obj_init_subpart_from( *b, *bd11 );
bli_obj_set_dims( mr, nr, *bd11 );
bli_obj_inc_offs( off_bd11, 0, *bd11 );
// Now update the buffer fields of bdx1, bd11, and then call
// bli_dupl().
}
}
@@ -527,8 +489,7 @@ typedef void (*FUNCPTR_T)(
void* alpha,
void* a1x,
void* a11,
void* bdx1,
void* bd11,
void* bx1,
void* b11,
void* c11, inc_t rs_c, inc_t cs_c,
void* a_next,
@@ -542,8 +503,7 @@ static FUNCPTR_T GENARRAY(ftypes_u,gemmtrsm_u_ukr);
void bli_gemmtrsm_ukr( obj_t* alpha,
obj_t* a1x,
obj_t* a11,
obj_t* bdx1,
obj_t* bd11,
obj_t* bx1,
obj_t* b11,
obj_t* c11 )
{
@@ -555,9 +515,7 @@ void bli_gemmtrsm_ukr( obj_t* alpha,
void* buf_a11 = bli_obj_buffer_at_off( *a11 );
void* buf_bdx1 = bli_obj_buffer_at_off( *bdx1 );
void* buf_bd11 = bli_obj_buffer_at_off( *bd11 );
void* buf_bx1 = bli_obj_buffer_at_off( *bx1 );
void* buf_b11 = bli_obj_buffer_at_off( *b11 );
@@ -579,12 +537,11 @@ void bli_gemmtrsm_ukr( obj_t* alpha,
buf_alpha,
buf_a1x,
buf_a11,
buf_bdx1,
buf_bd11,
buf_bx1,
buf_b11,
buf_c11, rs_c, cs_c,
buf_a1x,
buf_bdx1 );
buf_bx1 );
}
@@ -596,8 +553,7 @@ void PASTEMAC(ch,varname)( \
void* alpha, \
void* a1x, \
void* a11, \
void* bdx1, \
void* bd11, \
void* bx1, \
void* b11, \
void* c11, inc_t rs_c, inc_t cs_c, \
void* a_next, \
@@ -608,8 +564,7 @@ void PASTEMAC(ch,varname)( \
alpha, \
a1x, \
a11, \
bdx1, \
bd11, \
bx1, \
b11, \
c11, rs_c, cs_c, \
a_next, \

View File

@@ -40,8 +40,7 @@ void libblis_test_gemmtrsm_ukr( test_params_t* params, test_op_t* op );
void bli_gemmtrsm_ukr( obj_t* alpha,
obj_t* a1x,
obj_t* a11,
obj_t* bdx1,
obj_t* bd11,
obj_t* bx1,
obj_t* b11,
obj_t* c11 );
@@ -53,8 +52,7 @@ void PASTEMAC(ch,varname)( \
void* alpha, \
void* a1x, \
void* a11, \
void* bdx1, \
void* bd11, \
void* bx1, \
void* b11, \
void* c11, inc_t rs_c, inc_t cs_c, \
void* a_next, \

View File

@@ -653,18 +653,6 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
BLIS_DEFAULT_NI_Z );
*/
libblis_test_fprintf_c( os, "\n" );
libblis_test_fprintf_c( os, "level-3 packing duplication s d c z \n" );
libblis_test_fprintf_c( os, " dupl. factors for B %5u %5u %5u %5u\n",
BLIS_DEFAULT_NUM_DUPL_S,
BLIS_DEFAULT_NUM_DUPL_D,
BLIS_DEFAULT_NUM_DUPL_C,
BLIS_DEFAULT_NUM_DUPL_Z );
libblis_test_fprintf_c( os, " elements per register %5u %5u %5u %5u\n",
BLIS_NUM_ELEM_PER_REG_S,
BLIS_NUM_ELEM_PER_REG_D,
BLIS_NUM_ELEM_PER_REG_C,
BLIS_NUM_ELEM_PER_REG_Z );
libblis_test_fprintf_c( os, "\n" );
libblis_test_fprintf_c( os, "level-2 cache blocksizes s d c z \n" );
libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n",
BLIS_DEFAULT_L2_MC_S,

View File

@@ -63,7 +63,6 @@ void libblis_test_trsm_ukr_impl( mt_impl_t impl,
side_t side,
obj_t* a,
obj_t* b,
obj_t* bd,
obj_t* c );
void libblis_test_trsm_ukr_check( side_t side,
@@ -148,7 +147,7 @@ void libblis_test_trsm_ukr_experiment( test_params_t* params,
uplo_t uploa;
obj_t kappa;
obj_t a, b, bd, c;
obj_t a, b, c;
obj_t ap, bp;
obj_t c_save;
@@ -177,8 +176,6 @@ void libblis_test_trsm_ukr_experiment( test_params_t* params,
sc_str[0], m, n, &c );
libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
sc_str[0], m, n, &c_save );
libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
sc_b, m, 4*n, &bd );
// Set the structure, uplo, and diagonal offset properties of A.
bli_obj_set_struc( BLIS_TRIANGULAR, a );
@@ -229,14 +226,11 @@ void libblis_test_trsm_ukr_experiment( test_params_t* params,
// Re-pack the contents of b to bp.
bli_packm_blk_var2( &BLIS_ONE, &b, &bp );
// Re-duplicate the contents of bp to bd.
bli_dupl( &bp, &bd );
bli_copym( &c_save, &c );
time = bli_clock();
libblis_test_trsm_ukr_impl( impl, side, &ap, &bp, &bd, &c );
libblis_test_trsm_ukr_impl( impl, side, &ap, &bp, &c );
time_min = bli_clock_min_diff( time_min, time );
}
@@ -268,13 +262,12 @@ void libblis_test_trsm_ukr_impl( mt_impl_t impl,
side_t side,
obj_t* a,
obj_t* b,
obj_t* bd,
obj_t* c )
{
switch ( impl )
{
case BLIS_TEST_SEQ_UKERNEL:
bli_trsm_ukr( a, b, bd, c );
bli_trsm_ukr( a, b, c );
break;
default:
@@ -386,7 +379,6 @@ void libblis_test_trsm_ukr_check( side_t side,
typedef void (*FUNCPTR_T)(
void* a,
void* b,
void* bd,
void* c, inc_t rs_c, inc_t cs_c
);
@@ -396,7 +388,6 @@ static FUNCPTR_T GENARRAY(ftypes_u,trsm_u_ukr);
void bli_trsm_ukr( obj_t* a,
obj_t* b,
obj_t* bd,
obj_t* c )
{
num_t dt = bli_obj_datatype( *c );
@@ -405,8 +396,6 @@ void bli_trsm_ukr( obj_t* a,
void* buf_b = bli_obj_buffer_at_off( *b );
void* buf_bd = bli_obj_buffer_at_off( *bd );
void* buf_c = bli_obj_buffer_at_off( *c );
inc_t rs_c = bli_obj_row_stride( *c );
inc_t cs_c = bli_obj_col_stride( *c );
@@ -421,7 +410,6 @@ void bli_trsm_ukr( obj_t* a,
// Invoke the function.
f( buf_a,
buf_b,
buf_bd,
buf_c, rs_c, cs_c );
}
@@ -432,13 +420,11 @@ void bli_trsm_ukr( obj_t* a,
void PASTEMAC(ch,varname)( \
void* a, \
void* b, \
void* bd, \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
PASTEMAC(ch,ukrname)( a, \
b, \
bd, \
c, rs_c, cs_c ); \
}

View File

@@ -39,7 +39,6 @@ void libblis_test_trsm_ukr( test_params_t* params, test_op_t* op );
//
void bli_trsm_ukr( obj_t* a,
obj_t* b,
obj_t* bd,
obj_t* c );
#undef GENTPROT
@@ -48,7 +47,6 @@ void bli_trsm_ukr( obj_t* a,
void PASTEMAC(ch,varname)( \
void* a, \
void* b, \
void* bd, \
void* c, inc_t rs_c, inc_t cs_c \
);

View File

@@ -146,34 +146,6 @@
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount
// of duplication needed when configuring level-3 macro-kernels that
// copy and duplicate elements of B to a temporary duplication buffer
// (so that element-wise vector multiplication and addition instructions
// can be used).
#define BLIS_NUM_ELEM_PER_REG_S 4
#define BLIS_NUM_ELEM_PER_REG_D 2
#define BLIS_NUM_ELEM_PER_REG_C 2
#define BLIS_NUM_ELEM_PER_REG_Z 1
// -- Default switch for duplication of B --
// NOTE: Setting these values to 1 disables duplication. Any value
// d > 1 results in d-1 duplicates being created within a special
// macro-kernel buffer of dimension k x NR*d.
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
#define BLIS_DEFAULT_NUM_DUPL_S 1
#define BLIS_DEFAULT_NUM_DUPL_D 1
#define BLIS_DEFAULT_NUM_DUPL_C 1
#define BLIS_DEFAULT_NUM_DUPL_Z 1
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -259,10 +231,6 @@
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- dupl --
#define DUPL_KERNEL dupl_unb_var1
// -- gemm --
#define GEMM_UKERNEL gemm_ref_mxn