Merge branch 'master' of github.com:flame/blis

This commit is contained in:
Field G. Van Zee
2014-08-24 15:56:21 -05:00
6 changed files with 78 additions and 63 deletions

View File

@@ -49,7 +49,7 @@
// -- Define default 3m-specific kernel names ----------------------------------
//
// Level-3 3m
// Level-3
//
// gemm3m micro-kernels
@@ -216,44 +216,44 @@
// 3m cache blocksizes
// Each guard supplies a fallback only when the macro has not already been
// defined. The _C macros are derived from the corresponding _S macros and
// the _Z macros from the corresponding _D macros: MC and NC take the source
// value as-is, while KC takes half of it.
// NOTE(review): every guard below holds two #define lines for the same
// macro. Where the two replacement lists differ (e.g. ((X)/1) vs. X) this is
// not a valid redefinition in C; it looks like the before/after pair of a
// rendered diff -- confirm which line is meant to be live.
#ifndef BLIS_DEFAULT_3M_MC_C
#define BLIS_DEFAULT_3M_MC_C ((BLIS_DEFAULT_MC_S)/1)
#define BLIS_DEFAULT_3M_MC_C BLIS_DEFAULT_MC_S
#endif
#ifndef BLIS_DEFAULT_3M_KC_C
#define BLIS_DEFAULT_3M_KC_C ((BLIS_DEFAULT_KC_S)/2)
#define BLIS_DEFAULT_3M_KC_C ((BLIS_DEFAULT_KC_S)/2)
#endif
#ifndef BLIS_DEFAULT_3M_NC_C
#define BLIS_DEFAULT_3M_NC_C ((BLIS_DEFAULT_NC_S)/1)
#define BLIS_DEFAULT_3M_NC_C BLIS_DEFAULT_NC_S
#endif
#ifndef BLIS_DEFAULT_3M_MC_Z
#define BLIS_DEFAULT_3M_MC_Z ((BLIS_DEFAULT_MC_D)/1)
#define BLIS_DEFAULT_3M_MC_Z BLIS_DEFAULT_MC_D
#endif
#ifndef BLIS_DEFAULT_3M_KC_Z
#define BLIS_DEFAULT_3M_KC_Z ((BLIS_DEFAULT_KC_D)/2)
#define BLIS_DEFAULT_3M_KC_Z ((BLIS_DEFAULT_KC_D)/2)
#endif
#ifndef BLIS_DEFAULT_3M_NC_Z
#define BLIS_DEFAULT_3M_NC_Z ((BLIS_DEFAULT_NC_D)/1)
#define BLIS_DEFAULT_3M_NC_Z BLIS_DEFAULT_NC_D
#endif
// 3m cache blocksize extensions
// Each guard supplies a fallback only when the macro has not already been
// defined. Two candidate values are listed per macro: a literal 0, and a
// value derived from the matching BLIS_EXTEND_*_S (for _C) or
// BLIS_EXTEND_*_D (for _Z) macro, with the KC extension halved.
// NOTE(review): the two #define lines in each guard have different
// replacement lists, which is not a valid redefinition in C; this looks like
// the before/after pair of a rendered diff -- confirm which line is live.
#ifndef BLIS_EXTEND_3M_MC_C
#define BLIS_EXTEND_3M_MC_C 0
#define BLIS_EXTEND_3M_MC_C BLIS_EXTEND_MC_S
#endif
#ifndef BLIS_EXTEND_3M_KC_C
#define BLIS_EXTEND_3M_KC_C 0
#define BLIS_EXTEND_3M_KC_C ((BLIS_EXTEND_KC_S)/2)
#endif
#ifndef BLIS_EXTEND_3M_NC_C
#define BLIS_EXTEND_3M_NC_C 0
#define BLIS_EXTEND_3M_NC_C BLIS_EXTEND_NC_S
#endif
#ifndef BLIS_EXTEND_3M_MC_Z
#define BLIS_EXTEND_3M_MC_Z 0
#define BLIS_EXTEND_3M_MC_Z BLIS_EXTEND_MC_D
#endif
#ifndef BLIS_EXTEND_3M_KC_Z
#define BLIS_EXTEND_3M_KC_Z 0
#define BLIS_EXTEND_3M_KC_Z ((BLIS_EXTEND_KC_D)/2)
#endif
#ifndef BLIS_EXTEND_3M_NC_Z
#define BLIS_EXTEND_3M_NC_Z 0
#define BLIS_EXTEND_3M_NC_Z BLIS_EXTEND_NC_D
#endif

View File

@@ -49,7 +49,7 @@
// -- Define default 4m-specific kernel names ----------------------------------
//
// Level-3 4m
// Level-3
//
// gemm4m micro-kernels
@@ -216,44 +216,44 @@
// 4m cache blocksizes
// Each guard supplies a fallback only when the macro has not already been
// defined. The _C macros are derived from the corresponding _S macros and
// the _Z macros from the corresponding _D macros: MC and NC take the source
// value as-is, while KC takes half of it.
// NOTE(review): every guard below holds two #define lines for the same
// macro. Where the two replacement lists differ (e.g. ((X)/1) vs. X) this is
// not a valid redefinition in C; it looks like the before/after pair of a
// rendered diff -- confirm which line is meant to be live.
#ifndef BLIS_DEFAULT_4M_MC_C
#define BLIS_DEFAULT_4M_MC_C ((BLIS_DEFAULT_MC_S)/1)
#define BLIS_DEFAULT_4M_MC_C BLIS_DEFAULT_MC_S
#endif
#ifndef BLIS_DEFAULT_4M_KC_C
#define BLIS_DEFAULT_4M_KC_C ((BLIS_DEFAULT_KC_S)/2)
#define BLIS_DEFAULT_4M_KC_C ((BLIS_DEFAULT_KC_S)/2)
#endif
#ifndef BLIS_DEFAULT_4M_NC_C
#define BLIS_DEFAULT_4M_NC_C ((BLIS_DEFAULT_NC_S)/1)
#define BLIS_DEFAULT_4M_NC_C BLIS_DEFAULT_NC_S
#endif
#ifndef BLIS_DEFAULT_4M_MC_Z
#define BLIS_DEFAULT_4M_MC_Z ((BLIS_DEFAULT_MC_D)/1)
#define BLIS_DEFAULT_4M_MC_Z BLIS_DEFAULT_MC_D
#endif
#ifndef BLIS_DEFAULT_4M_KC_Z
#define BLIS_DEFAULT_4M_KC_Z ((BLIS_DEFAULT_KC_D)/2)
#define BLIS_DEFAULT_4M_KC_Z ((BLIS_DEFAULT_KC_D)/2)
#endif
#ifndef BLIS_DEFAULT_4M_NC_Z
#define BLIS_DEFAULT_4M_NC_Z ((BLIS_DEFAULT_NC_D)/1)
#define BLIS_DEFAULT_4M_NC_Z BLIS_DEFAULT_NC_D
#endif
// 4m cache blocksize extensions
// Each guard supplies a fallback only when the macro has not already been
// defined. Two candidate values are listed per macro: a literal 0, and a
// value derived from the matching BLIS_EXTEND_*_S (for _C) or
// BLIS_EXTEND_*_D (for _Z) macro, with the KC extension halved.
// NOTE(review): the two #define lines in each guard have different
// replacement lists, which is not a valid redefinition in C; this looks like
// the before/after pair of a rendered diff -- confirm which line is live.
#ifndef BLIS_EXTEND_4M_MC_C
#define BLIS_EXTEND_4M_MC_C 0
#define BLIS_EXTEND_4M_MC_C BLIS_EXTEND_MC_S
#endif
#ifndef BLIS_EXTEND_4M_KC_C
#define BLIS_EXTEND_4M_KC_C 0
#define BLIS_EXTEND_4M_KC_C ((BLIS_EXTEND_KC_S)/2)
#endif
#ifndef BLIS_EXTEND_4M_NC_C
#define BLIS_EXTEND_4M_NC_C 0
#define BLIS_EXTEND_4M_NC_C BLIS_EXTEND_NC_S
#endif
#ifndef BLIS_EXTEND_4M_MC_Z
#define BLIS_EXTEND_4M_MC_Z 0
#define BLIS_EXTEND_4M_MC_Z BLIS_EXTEND_MC_D
#endif
#ifndef BLIS_EXTEND_4M_KC_Z
#define BLIS_EXTEND_4M_KC_Z 0
#define BLIS_EXTEND_4M_KC_Z ((BLIS_EXTEND_KC_D)/2)
#endif
#ifndef BLIS_EXTEND_4M_NC_Z
#define BLIS_EXTEND_4M_NC_Z 0
#define BLIS_EXTEND_4M_NC_Z BLIS_EXTEND_NC_D
#endif

View File

@@ -269,6 +269,21 @@
// -- Maximum register blocksize search ----------------------------------------
// The macro-kernels often need to statically allocate a temporary
// MR x NR micro-tile of C. This micro-tile must be sized so that it works
// for both native and 4m/3m implementations, since the user can switch
// between them at runtime. To facilitate the sizing of those micro-tiles,
// we must determine the largest register blocksizes needed to accommodate
// both native and 4m/3m-based complex micro-kernels. For real datatypes,
// the maximum is never larger than the actual s and d register blocksizes.
// However, for complex datatypes, the "native" register blocksizes may
// differ from the "virtual" register blocksizes used by the 4m/3m
// implementations. Usually, the register blocksizes used for 4m/3m-based
// complex micro-kernels are the larger ones, and thus determine the maximum
// for the c and z datatypes. We prefer not to assume this, however, so we
// always take the larger of the two values.
//
// Find the largest register blocksize MR.
//

View File

@@ -215,11 +215,11 @@
// Tests whether the info field of obj marks the object as 4m-packed.
// NOTE(review): two candidate bodies follow the continuation lines -- a test
// of BLIS_PACK_4M_BIT, and a comparison of the BLIS_PACK_FORMAT_BITS field
// against BLIS_BITVAL_4M. This looks like the before/after pair of a
// rendered diff; confirm which body is live.
#define bli_obj_is_4m_packed( obj ) \
\
( ( (obj).info & BLIS_PACK_4M_BIT ) )
( ( (obj).info & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_4M )
// Tests whether the info field of obj marks the object as 3m-packed.
// NOTE(review): two candidate bodies follow the continuation lines -- a test
// of BLIS_PACK_3M_BIT, and a comparison of the BLIS_PACK_FORMAT_BITS field
// against BLIS_BITVAL_3M. This looks like the before/after pair of a
// rendered diff; confirm which body is live.
#define bli_obj_is_3m_packed( obj ) \
\
( ( (obj).info & BLIS_PACK_3M_BIT ) )
( ( (obj).info & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_3M )
#define bli_obj_pack_buffer_type( obj ) \
\

View File

@@ -524,11 +524,11 @@
// Tests whether a pack schema value denotes the 4m format.
// NOTE(review): two candidate bodies follow the continuation lines -- a test
// of BLIS_PACK_4M_BIT, and a comparison of the BLIS_PACK_FORMAT_BITS field
// against BLIS_BITVAL_4M. This looks like the before/after pair of a
// rendered diff; confirm which body is live.
// NOTE(review): `schema` is expanded without parentheses, so an argument
// containing an operator of lower precedence than `&` (e.g. `a | b`) would
// be grouped incorrectly -- consider `(schema)`.
#define bli_is_4m_packed( schema ) \
\
( ( schema & BLIS_PACK_4M_BIT ) )
( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_4M )
// Tests whether a pack schema value denotes the 3m format.
// NOTE(review): two candidate bodies follow the continuation lines -- a test
// of BLIS_PACK_3M_BIT, and a comparison of the BLIS_PACK_FORMAT_BITS field
// against BLIS_BITVAL_3M. This looks like the before/after pair of a
// rendered diff; confirm which body is live.
// NOTE(review): `schema` is expanded without parentheses, so an argument
// containing an operator of lower precedence than `&` (e.g. `a | b`) would
// be grouped incorrectly -- consider `(schema)`.
#define bli_is_3m_packed( schema ) \
\
( ( schema & BLIS_PACK_3M_BIT ) )
( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_3M )

View File

@@ -214,32 +214,32 @@ typedef struct
- 11: precision (0 == single, 1 == double)
- 12: unused
15 ~ 13 Execution numerical datatype
- 13 domain (0 == real, 1 == complex)
- 13: domain (0 == real, 1 == complex)
- 14: precision (0 == single, 1 == double)
- 15: unused
20 ~ 16 Packed type/status
- 00000 0 == not packed
- 10000 1 == packed (unspecified; row, column, or vector)
- 10000 2 == packed by rows
- 10001 3 == packed by columns
- 10010 4 == packed by row panels
- 10011 5 == packed by column panels
- 10100 6 == packed by row panels (4m)
- 10101 7 == packed by column panels (4m)
- 11000 8 == packed by row panels (3m)
- 11001 9 == packed by column panels (3m)
21 Packed panel order if upper-stored
21 ~ 16 Packed type/status
- 000000: not packed
- 100000: packed (unspecified; by rows, columns, or vector)
- 100000: packed by rows
- 100001: packed by columns
- 100010: packed by row panels
- 100011: packed by column panels
- 100110: packed by 4m row panels
- 100111: packed by 4m column panels
- 101010: packed by 3m row panels
- 101011: packed by 3m column panels
22 Packed panel order if upper-stored
- 0 == forward order if upper
- 1 == reverse order if upper
22 Packed panel order if lower-stored
23 Packed panel order if lower-stored
- 0 == forward order if lower
- 1 == reverse order if lower
24 ~ 23 Packed buffer type
25 ~ 24 Packed buffer type
- 0 == block of A
- 1 == panel of B
- 2 == panel of C
- 3 == general use
26 ~ 25 Structure type
27 ~ 26 Structure type
- 0 == general
- 1 == Hermitian
- 2 == symmetric
@@ -263,13 +263,12 @@ typedef struct
// Bit offsets of the pack-related fields (and the structure field) within
// the info bit field. The pack schema starts at bit 16; the row/column bit
// shares that offset and the panel bit sits at 17.
// NOTE(review): several names are listed twice with different offsets
// (e.g. BLIS_PACK_SHIFT as 20 and 21, BLIS_STRUC_SHIFT as 25 and 26), and
// BLIS_PACK_4M_SHIFT / BLIS_PACK_3M_SHIFT appear alongside
// BLIS_PACK_FORMAT_SHIFT. Redefining a macro with a different value is not
// valid C; this looks like the before/after lines of a rendered diff --
// confirm which set is live.
#define BLIS_PACK_SCHEMA_SHIFT 16
#define BLIS_PACK_RC_SHIFT 16
#define BLIS_PACK_PANEL_SHIFT 17
#define BLIS_PACK_4M_SHIFT 18
#define BLIS_PACK_3M_SHIFT 19
#define BLIS_PACK_SHIFT 20
#define BLIS_PACK_REV_IF_UPPER_SHIFT 21
#define BLIS_PACK_REV_IF_LOWER_SHIFT 22
#define BLIS_PACK_BUFFER_SHIFT 23
#define BLIS_STRUC_SHIFT 25
#define BLIS_PACK_FORMAT_SHIFT 18
#define BLIS_PACK_SHIFT 21
#define BLIS_PACK_REV_IF_UPPER_SHIFT 22
#define BLIS_PACK_REV_IF_LOWER_SHIFT 23
#define BLIS_PACK_BUFFER_SHIFT 24
#define BLIS_STRUC_SHIFT 26
//
// -- BLIS info bit field masks ------------------------------------------------
@@ -292,8 +291,7 @@ typedef struct
// Mask covering the whole packed type/status field. That field is six bits
// wide (bits 16 through 21: row/column bit, panel bit, three format bits,
// and the packed bit), so the mask must be 0x3F; a five-bit 0x1F mask would
// drop the packed bit at position 21.
#define BLIS_PACK_SCHEMA_BITS ( 0x3F << BLIS_PACK_SCHEMA_SHIFT )
// Masks for the individual pack-related fields, each formed by shifting a
// constant into place with the matching *_SHIFT offset. All are single-bit
// masks except BLIS_PACK_FORMAT_BITS, which is three bits wide (0x7).
// NOTE(review): BLIS_PACK_4M_BIT / BLIS_PACK_3M_BIT and
// BLIS_PACK_FORMAT_BITS are both listed here; this looks like the
// before/after lines of a rendered diff -- confirm which are live.
#define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT )
#define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT )
#define BLIS_PACK_4M_BIT ( 0x1 << BLIS_PACK_4M_SHIFT )
#define BLIS_PACK_3M_BIT ( 0x1 << BLIS_PACK_3M_SHIFT )
#define BLIS_PACK_FORMAT_BITS ( 0x7 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT )
@@ -328,15 +326,17 @@ typedef struct
// Named values for fields of the info bit field, expressed in terms of the
// field masks.
#define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED 0x0
// Values of the pack format field (the bits selected by
// BLIS_PACK_FORMAT_BITS): 0x1 for 4m and 0x2 for 3m, shifted into place.
#define BLIS_BITVAL_4M ( 0x1 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_3M ( 0x2 << BLIS_PACK_FORMAT_SHIFT )
// Pack schema values. Every packed schema includes BLIS_PACK_BIT; the
// row/column bit, the panel bit, and the 4m/3m format value are OR-ed in as
// applicable. BLIS_BITVAL_PACKED_UNSPEC and BLIS_BITVAL_PACKED_ROWS share
// the same value (BLIS_PACK_BIT alone).
// NOTE(review): the ROWS ... COL_PANELS_3M names are each listed twice, once
// built from BLIS_PACK_4M_BIT / BLIS_PACK_3M_BIT and once from
// BLIS_BITVAL_4M / BLIS_BITVAL_3M. This looks like the before/after lines of
// a rendered diff -- confirm which set is live.
#define BLIS_BITVAL_PACKED_UNSPEC BLIS_PACK_BIT
#define BLIS_BITVAL_PACKED_ROWS BLIS_PACK_BIT
#define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_4M ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_4M_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_4M ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_4M_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_3M ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_3M_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_3M ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_3M_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT )
#define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_4M ( BLIS_PACK_BIT | BLIS_BITVAL_4M | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_4M ( BLIS_PACK_BIT | BLIS_BITVAL_4M | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_3M ( BLIS_PACK_BIT | BLIS_BITVAL_3M | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_3M ( BLIS_PACK_BIT | BLIS_BITVAL_3M | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
// Panel-order values: forward order is the zero value, reverse order sets
// the corresponding bit.
#define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0