From 37308f9a502b56d94fa52a7df71c676a46c3be3d Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 26 Mar 2013 12:43:14 -0500 Subject: [PATCH] Align packed panel strides with system alignment. Details: - Pass panel strides through bli_align_dim_to_sys() to ensure that each subsequent packed panel of A and B begins at an aligned address. (The first panel is presumably aligned to system alignment because it is aligned to a page boundary, which is typically much larger.) - Rearranged code in packm_init_pack() to prevent additional conditional blocks as a result of the aforementioned change. - Adjusted contiguous memory allocator so that the system memory alignment is used to allocate enough space for each block no matter what kind of register blocking is used (even if register blocksize is unit and every row/column needs maximal padding). - Adjusted default blocksizes in reference configuration so that MC*KC and KC*NC result in identical footprints for all datatypes. --- config/reference/bli_kernel.h | 10 +-- frame/1m/packm/bli_packm_init.c | 127 +++++++++++++++++++++----------- frame/base/bli_mem.c | 35 ++++++--- frame/base/bli_obj.c | 2 +- frame/base/bli_obj.h | 2 +- 5 files changed, 112 insertions(+), 64 deletions(-) diff --git a/config/reference/bli_kernel.h b/config/reference/bli_kernel.h index 5909cb7e4..e5f4de29e 100644 --- a/config/reference/bli_kernel.h +++ b/config/reference/bli_kernel.h @@ -61,21 +61,21 @@ // and thus it would be constraint (2b) that would be needed instead of (1b). // -#define BLIS_DEFAULT_MC_S 128 +#define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8192 #define BLIS_DEFAULT_MC_D 128 #define BLIS_DEFAULT_KC_D 256 -#define BLIS_DEFAULT_NC_D 8192 +#define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MC_C 128 #define BLIS_DEFAULT_KC_C 256 -#define BLIS_DEFAULT_NC_C 8192 +#define BLIS_DEFAULT_NC_C 4096 -#define BLIS_DEFAULT_MC_Z 128 +#define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 256 -#define BLIS_DEFAULT_NC_Z 8192 +#define BLIS_DEFAULT_NC_Z 2048 // -- Default register blocksizes for inner kernel -- diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index b34099fba..69718891a 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -175,6 +175,7 @@ void bli_packm_init_pack( bool_t densify, mem_t* mem_p; dim_t m_p_pad, n_p_pad; siz_t size_p; + siz_t elem_size_p; inc_t rs_p, cs_p; void* buf; @@ -220,7 +221,7 @@ void bli_packm_init_pack( bool_t densify, mem_p = bli_obj_pack_mem( *p ); // Compute the dimensions padded by the dimension multiples. These - // dimensions represent the dimensions of the packed matrices, including + // dimensions will be the dimensions of the packed matrices, including // zero-padding, and will be used by the macro- and micro-kernels. // We compute them by starting with the effective dimensions of c (now // in p) and aligning them to the dimension multiples (typically equal @@ -229,64 +230,59 @@ void bli_packm_init_pack( bool_t densify, m_p_pad = bli_align_dim_to_mult( bli_obj_length( *p ), mult_m_dim ); n_p_pad = bli_align_dim_to_mult( bli_obj_width( *p ), mult_n_dim ); - // Compute the size of the packed buffer. - size_p = m_p_pad * n_p_pad * bli_obj_elem_size( *p ); - - if ( bli_mem_is_unalloc( mem_p ) ) - { - // If the mem_t object of p has not yet been allocated, then acquire - // a memory block of type pack_buf_type. - bli_mem_acquire_m( size_p, - pack_buf_type, - mem_p ); - } - else - { - // If the mem_t object is currently allocated and smaller than is - // needed, then something is very wrong, since the cache blocksizes - // that drive the level-3 blocked algorithms are the same ones that - // determine the sizes of the blocks within our memory allocator's - // memory pools. This branch should never be executed. - if ( bli_mem_size( mem_p ) < size_p ) bli_abort(); - } - - // Save the padded (packed) dimensions into the packed object. It is - // important to save these dimensions since they represent the actual - // dimensions of the zero-padded matrix. + // Save the padded dimensions into the packed object. It is important + // to save these dimensions since they represent the actual dimensions + // of the zero-padded matrix. bli_obj_set_packed_dims( m_p_pad, n_p_pad, *p ); - // Grab the buffer address from the mem_t object and copy it to the - // main object buffer field. (Sometimes this buffer address will be - // copied when the value is already up-to-date, because it persists - // in the main object buffer field across loop iterations.) - buf = bli_mem_buffer( mem_p ); - bli_obj_set_buffer( buf, *p ); + // Now we prepare to compute strides, align them, and compute the + // total number of bytes needed for the packed buffer. After that, + // we will acquire an appropriate block of memory from the memory + // allocator. + // Extract the element size for the packed object. + elem_size_p = bli_obj_elem_size( *p ); // Set the row and column strides of p based on the pack schema. if ( pack_schema == BLIS_PACKED_ROWS ) { - // For regular row storage, the packed width of our mem_t region + // For regular row storage, the padded width of our matrix // should be used for the row stride, with the column stride set // to one. By using the WIDTH of the mem_t region, we allow for // zero-padding (if necessary/desired) along the right edge of // the matrix. - rs_p = bli_obj_packed_width( *p ); + rs_p = n_p_pad; cs_p = 1; + // Align the leading dimension according to the system alignment so + // that the second, third, etc rows begin at aligned addresses. + rs_p = bli_align_dim_to_sys( rs_p, elem_size_p ); + + // Store the strides in p. bli_obj_set_incs( rs_p, cs_p, *p ); + + // Compute the size of the packed buffer. + size_p = m_p_pad * rs_p * elem_size_p; } else if ( pack_schema == BLIS_PACKED_COLUMNS ) { - // For regular column storage, the packed length of our mem_t region + // For regular column storage, the padded length of our matrix // should be used for the column stride, with the row stride set // to one. By using the LENGTH of the mem_t region, we allow for // zero-padding (if necessary/desired) along the bottom edge of // the matrix. - cs_p = bli_obj_packed_length( *p ); + cs_p = m_p_pad; rs_p = 1; + // Align the leading dimension according to the system alignment so + // that the second, third, etc columns begin at aligned addresses. + cs_p = bli_align_dim_to_sys( cs_p, elem_size_p ); + + // Store the strides in p. bli_obj_set_incs( rs_p, cs_p, *p ); + + // Compute the size of the packed buffer. + size_p = cs_p * n_p_pad * elem_size_p; } else if ( pack_schema == BLIS_PACKED_ROW_PANELS ) { @@ -308,15 +304,21 @@ void bli_packm_init_pack( bool_t densify, // The "panel stride" of a panel packed object is interpreted as the // distance between the (0,0) element of panel k and the (0,0) - // element of panel k+1. We use the WIDTH of the packed mem_t region - // to determine the panel "width"; this will allow for zero-padding - // (if necessary/desired) along the far end of each panel (ie: the - // right edge of the matrix). - ps_p = cs_p * bli_obj_packed_width( *p ); + // element of panel k+1. We use the padded width computed above to + // allow for zero-padding (if necessary/desired) along the far end + // of each panel (ie: the right edge of the matrix). + ps_p = cs_p * n_p_pad; + + // Align the panel dimension according to the system alignment so + // that the second, third, etc panels begin at aligned addresses. + ps_p = bli_align_dim_to_sys( ps_p, elem_size_p ); // Store the strides in p. bli_obj_set_incs( rs_p, cs_p, *p ); bli_obj_set_panel_stride( ps_p, *p ); + + // Compute the size of the packed buffer. + size_p = ps_p * (m_p_pad / m_panel) * elem_size_p; } else if ( pack_schema == BLIS_PACKED_COL_PANELS ) { @@ -338,21 +340,56 @@ void bli_packm_init_pack( bool_t densify, // The "panel stride" of a panel packed object is interpreted as the // distance between the (0,0) element of panel k and the (0,0) - // element of panel k+1. We use the LENGTH of the packed mem_t region - // to determine the panel "length"; this will allow for zero-padding - // (if necessary/desired) along the far end of each panel (ie: the - // bottom edge of the matrix). - ps_p = bli_obj_packed_length( *p ) * rs_p; + // element of panel k+1. We use the padded length computed above to + // allow for zero-padding (if necessary/desired) along the far end + // of each panel (ie: the bottom edge of the matrix). + ps_p = m_p_pad * rs_p; + + // Align the panel dimension according to the system alignment so + // that the second, third, etc panels begin at aligned addresses. + ps_p = bli_align_dim_to_sys( ps_p, elem_size_p ); // Store the strides in p. bli_obj_set_incs( rs_p, cs_p, *p ); bli_obj_set_panel_stride( ps_p, *p ); + + // Compute the size of the packed buffer. + size_p = ps_p * (n_p_pad / n_panel) * elem_size_p; } else { // If the pack schema is something else, we assume stride information // of p is set later on, by the implementation. + + size_p = 0; } + + + if ( bli_mem_is_unalloc( mem_p ) ) + { + // If the mem_t object of p has not yet been allocated, then acquire + // a memory block of type pack_buf_type. + bli_mem_acquire_m( size_p, + pack_buf_type, + mem_p ); + } + else + { + // If the mem_t object is currently allocated and smaller than is + // needed, then something is very wrong, since the cache blocksizes + // that drive the level-3 blocked algorithms are the same ones that + // determine the sizes of the blocks within our memory allocator's + // memory pools. This branch should never be executed. + if ( bli_mem_size( mem_p ) < size_p ) bli_abort(); + } + + // Grab the buffer address from the mem_t object and copy it to the + // main object buffer field. (Sometimes this buffer address will be + // copied when the value is already up-to-date, because it persists + // in the main object buffer field across loop iterations.) + buf = bli_mem_buffer( mem_p ); + bli_obj_set_buffer( buf, *p ); + } diff --git a/frame/base/bli_mem.c b/frame/base/bli_mem.c index 799258d4a..db3060667 100644 --- a/frame/base/bli_mem.c +++ b/frame/base/bli_mem.c @@ -37,22 +37,33 @@ // Define the size of pool blocks. These may be adjusted so that they can // handle inflated blocksizes at edge cases. -#define BLIS_POOL_MC_Z BLIS_DEFAULT_MC_Z -#define BLIS_POOL_KC_Z BLIS_DEFAULT_KC_Z -#define BLIS_POOL_NC_Z BLIS_DEFAULT_NC_Z +#define BLIS_POOL_MC_D BLIS_DEFAULT_MC_D +#define BLIS_POOL_KC_D BLIS_DEFAULT_KC_D +#define BLIS_POOL_NC_D BLIS_DEFAULT_NC_D // Define each pool's block size. -#define BLIS_MK_BLOCK_SIZE ( BLIS_POOL_MC_Z * \ - BLIS_POOL_KC_Z * \ - sizeof( dcomplex ) \ +// NOTE: Here we assume the "worst" case of the register blocking +// being unit and every row of A and column of B needing maximum +// padding to conform to the system alignment. +#define BLIS_MK_BLOCK_SIZE ( BLIS_POOL_MC_D * \ + ( BLIS_POOL_KC_D + \ + ( BLIS_MEMORY_ALIGNMENT_BOUNDARY / \ + sizeof( double ) \ + ) \ + ) * \ + sizeof( double ) \ ) -#define BLIS_KN_BLOCK_SIZE ( BLIS_POOL_KC_Z * \ - BLIS_POOL_NC_Z * \ - sizeof( dcomplex ) \ +#define BLIS_KN_BLOCK_SIZE ( ( BLIS_POOL_KC_D + \ + ( BLIS_MEMORY_ALIGNMENT_BOUNDARY / \ + sizeof( double ) \ + ) \ + ) * \ + BLIS_POOL_NC_D * \ + sizeof( double ) \ ) -#define BLIS_MN_BLOCK_SIZE ( BLIS_POOL_MC_Z * \ - BLIS_POOL_NC_Z * \ - sizeof( dcomplex ) \ +#define BLIS_MN_BLOCK_SIZE ( BLIS_POOL_MC_D * \ + BLIS_POOL_NC_D * \ + sizeof( double ) \ ) // Define each pool's total size. diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index eb87e8a45..e999023f2 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -425,7 +425,7 @@ dim_t bli_align_dim_to_mult( dim_t dim, dim_t dim_mult ) return dim; } -dim_t bli_align_dim_to_sys( dim_t dim, dim_t elem_size ) +dim_t bli_align_dim_to_sys( dim_t dim, siz_t elem_size ) { dim = ( ( dim * elem_size + BLIS_MEMORY_ALIGNMENT_BOUNDARY - 1 ) / BLIS_MEMORY_ALIGNMENT_BOUNDARY ) * diff --git a/frame/base/bli_obj.h b/frame/base/bli_obj.h index bcc58c2f8..89b4c2619 100644 --- a/frame/base/bli_obj.h +++ b/frame/base/bli_obj.h @@ -95,7 +95,7 @@ siz_t bli_datatype_size( num_t dt ); dim_t bli_align_dim_to_mult( dim_t dim, dim_t dim_mult ); -dim_t bli_align_dim_to_sys( dim_t dim, dim_t elem_size ); +dim_t bli_align_dim_to_sys( dim_t dim, siz_t elem_size ); num_t bli_datatype_union( num_t dt1, num_t dt2 );