diff --git a/config/reference/bli_kernel.h b/config/reference/bli_kernel.h index 5909cb7e4..e5f4de29e 100644 --- a/config/reference/bli_kernel.h +++ b/config/reference/bli_kernel.h @@ -61,21 +61,21 @@ // and thus it would be constraint (2b) that would be needed instead of (1b). // -#define BLIS_DEFAULT_MC_S 128 +#define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8192 #define BLIS_DEFAULT_MC_D 128 #define BLIS_DEFAULT_KC_D 256 -#define BLIS_DEFAULT_NC_D 8192 +#define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MC_C 128 #define BLIS_DEFAULT_KC_C 256 -#define BLIS_DEFAULT_NC_C 8192 +#define BLIS_DEFAULT_NC_C 4096 -#define BLIS_DEFAULT_MC_Z 128 +#define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 256 -#define BLIS_DEFAULT_NC_Z 8192 +#define BLIS_DEFAULT_NC_Z 2048 // -- Default register blocksizes for inner kernel -- diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index b34099fba..69718891a 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -175,6 +175,7 @@ void bli_packm_init_pack( bool_t densify, mem_t* mem_p; dim_t m_p_pad, n_p_pad; siz_t size_p; + siz_t elem_size_p; inc_t rs_p, cs_p; void* buf; @@ -220,7 +221,7 @@ void bli_packm_init_pack( bool_t densify, mem_p = bli_obj_pack_mem( *p ); // Compute the dimensions padded by the dimension multiples. These - // dimensions represent the dimensions of the packed matrices, including + // dimensions will be the dimensions of the packed matrices, including // zero-padding, and will be used by the macro- and micro-kernels. // We compute them by starting with the effective dimensions of c (now // in p) and aligning them to the dimension multiples (typically equal @@ -229,64 +230,59 @@ void bli_packm_init_pack( bool_t densify, m_p_pad = bli_align_dim_to_mult( bli_obj_length( *p ), mult_m_dim ); n_p_pad = bli_align_dim_to_mult( bli_obj_width( *p ), mult_n_dim ); - // Compute the size of the packed buffer. - size_p = m_p_pad * n_p_pad * bli_obj_elem_size( *p ); - - if ( bli_mem_is_unalloc( mem_p ) ) - { - // If the mem_t object of p has not yet been allocated, then acquire - // a memory block of type pack_buf_type. - bli_mem_acquire_m( size_p, - pack_buf_type, - mem_p ); - } - else - { - // If the mem_t object is currently allocated and smaller than is - // needed, then something is very wrong, since the cache blocksizes - // that drive the level-3 blocked algorithms are the same ones that - // determine the sizes of the blocks within our memory allocator's - // memory pools. This branch should never be executed. - if ( bli_mem_size( mem_p ) < size_p ) bli_abort(); - } - - // Save the padded (packed) dimensions into the packed object. It is - // important to save these dimensions since they represent the actual - // dimensions of the zero-padded matrix. + // Save the padded dimensions into the packed object. It is important + // to save these dimensions since they represent the actual dimensions + // of the zero-padded matrix. bli_obj_set_packed_dims( m_p_pad, n_p_pad, *p ); - // Grab the buffer address from the mem_t object and copy it to the - // main object buffer field. (Sometimes this buffer address will be - // copied when the value is already up-to-date, because it persists - // in the main object buffer field across loop iterations.) - buf = bli_mem_buffer( mem_p ); - bli_obj_set_buffer( buf, *p ); + // Now we prepare to compute strides, align them, and compute the + // total number of bytes needed for the packed buffer. After that, + // we will acquire an appropriate block of memory from the memory + // allocator. + // Extract the element size for the packed object. + elem_size_p = bli_obj_elem_size( *p ); // Set the row and column strides of p based on the pack schema. if ( pack_schema == BLIS_PACKED_ROWS ) { - // For regular row storage, the packed width of our mem_t region + // For regular row storage, the padded width of our matrix // should be used for the row stride, with the column stride set // to one. By using the WIDTH of the mem_t region, we allow for // zero-padding (if necessary/desired) along the right edge of // the matrix. - rs_p = bli_obj_packed_width( *p ); + rs_p = n_p_pad; cs_p = 1; + // Align the leading dimension according to the system alignment so + // that the second, third, etc rows begin at aligned addresses. + rs_p = bli_align_dim_to_sys( rs_p, elem_size_p ); + + // Store the strides in p. bli_obj_set_incs( rs_p, cs_p, *p ); + + // Compute the size of the packed buffer. + size_p = m_p_pad * rs_p * elem_size_p; } else if ( pack_schema == BLIS_PACKED_COLUMNS ) { - // For regular column storage, the packed length of our mem_t region + // For regular column storage, the padded length of our matrix // should be used for the column stride, with the row stride set // to one. By using the LENGTH of the mem_t region, we allow for // zero-padding (if necessary/desired) along the bottom edge of // the matrix. - cs_p = bli_obj_packed_length( *p ); + cs_p = m_p_pad; rs_p = 1; + // Align the leading dimension according to the system alignment so + // that the second, third, etc columns begin at aligned addresses. + cs_p = bli_align_dim_to_sys( cs_p, elem_size_p ); + + // Store the strides in p. bli_obj_set_incs( rs_p, cs_p, *p ); + + // Compute the size of the packed buffer. + size_p = cs_p * n_p_pad * elem_size_p; } else if ( pack_schema == BLIS_PACKED_ROW_PANELS ) { @@ -308,15 +304,21 @@ void bli_packm_init_pack( bool_t densify, // The "panel stride" of a panel packed object is interpreted as the // distance between the (0,0) element of panel k and the (0,0) - // element of panel k+1. We use the WIDTH of the packed mem_t region - // to determine the panel "width"; this will allow for zero-padding - // (if necessary/desired) along the far end of each panel (ie: the - // right edge of the matrix). - ps_p = cs_p * bli_obj_packed_width( *p ); + // element of panel k+1. We use the padded width computed above to + // allow for zero-padding (if necessary/desired) along the far end + // of each panel (ie: the right edge of the matrix). + ps_p = cs_p * n_p_pad; + + // Align the panel dimension according to the system alignment so + // that the second, third, etc panels begin at aligned addresses. + ps_p = bli_align_dim_to_sys( ps_p, elem_size_p ); // Store the strides in p. bli_obj_set_incs( rs_p, cs_p, *p ); bli_obj_set_panel_stride( ps_p, *p ); + + // Compute the size of the packed buffer. + size_p = ps_p * (m_p_pad / m_panel) * elem_size_p; } else if ( pack_schema == BLIS_PACKED_COL_PANELS ) { @@ -338,21 +340,56 @@ void bli_packm_init_pack( bool_t densify, // The "panel stride" of a panel packed object is interpreted as the // distance between the (0,0) element of panel k and the (0,0) - // element of panel k+1. We use the LENGTH of the packed mem_t region - // to determine the panel "length"; this will allow for zero-padding - // (if necessary/desired) along the far end of each panel (ie: the - // bottom edge of the matrix). - ps_p = bli_obj_packed_length( *p ) * rs_p; + // element of panel k+1. We use the padded length computed above to + // allow for zero-padding (if necessary/desired) along the far end + // of each panel (ie: the bottom edge of the matrix). + ps_p = m_p_pad * rs_p; + + // Align the panel dimension according to the system alignment so + // that the second, third, etc panels begin at aligned addresses. + ps_p = bli_align_dim_to_sys( ps_p, elem_size_p ); // Store the strides in p. bli_obj_set_incs( rs_p, cs_p, *p ); bli_obj_set_panel_stride( ps_p, *p ); + + // Compute the size of the packed buffer. + size_p = ps_p * (n_p_pad / n_panel) * elem_size_p; } else { // If the pack schema is something else, we assume stride information // of p is set later on, by the implementation. + + size_p = 0; } + + + if ( bli_mem_is_unalloc( mem_p ) ) + { + // If the mem_t object of p has not yet been allocated, then acquire + // a memory block of type pack_buf_type. + bli_mem_acquire_m( size_p, + pack_buf_type, + mem_p ); + } + else + { + // If the mem_t object is currently allocated and smaller than is + // needed, then something is very wrong, since the cache blocksizes + // that drive the level-3 blocked algorithms are the same ones that + // determine the sizes of the blocks within our memory allocator's + // memory pools. This branch should never be executed. + if ( bli_mem_size( mem_p ) < size_p ) bli_abort(); + } + + // Grab the buffer address from the mem_t object and copy it to the + // main object buffer field. (Sometimes this buffer address will be + // copied when the value is already up-to-date, because it persists + // in the main object buffer field across loop iterations.) + buf = bli_mem_buffer( mem_p ); + bli_obj_set_buffer( buf, *p ); + } diff --git a/frame/base/bli_mem.c b/frame/base/bli_mem.c index 799258d4a..db3060667 100644 --- a/frame/base/bli_mem.c +++ b/frame/base/bli_mem.c @@ -37,22 +37,33 @@ // Define the size of pool blocks. These may be adjusted so that they can // handle inflated blocksizes at edge cases. -#define BLIS_POOL_MC_Z BLIS_DEFAULT_MC_Z -#define BLIS_POOL_KC_Z BLIS_DEFAULT_KC_Z -#define BLIS_POOL_NC_Z BLIS_DEFAULT_NC_Z +#define BLIS_POOL_MC_D BLIS_DEFAULT_MC_D +#define BLIS_POOL_KC_D BLIS_DEFAULT_KC_D +#define BLIS_POOL_NC_D BLIS_DEFAULT_NC_D // Define each pool's block size. -#define BLIS_MK_BLOCK_SIZE ( BLIS_POOL_MC_Z * \ - BLIS_POOL_KC_Z * \ - sizeof( dcomplex ) \ +// NOTE: Here we assume the "worst" case of the register blocking +// being unit and every row of A and column of B needing maximum +// padding to conform to the system alignment. +#define BLIS_MK_BLOCK_SIZE ( BLIS_POOL_MC_D * \ + ( BLIS_POOL_KC_D + \ + ( BLIS_MEMORY_ALIGNMENT_BOUNDARY / \ + sizeof( double ) \ + ) \ + ) * \ + sizeof( double ) \ ) -#define BLIS_KN_BLOCK_SIZE ( BLIS_POOL_KC_Z * \ - BLIS_POOL_NC_Z * \ - sizeof( dcomplex ) \ +#define BLIS_KN_BLOCK_SIZE ( ( BLIS_POOL_KC_D + \ + ( BLIS_MEMORY_ALIGNMENT_BOUNDARY / \ + sizeof( double ) \ + ) \ + ) * \ + BLIS_POOL_NC_D * \ + sizeof( double ) \ ) -#define BLIS_MN_BLOCK_SIZE ( BLIS_POOL_MC_Z * \ - BLIS_POOL_NC_Z * \ - sizeof( dcomplex ) \ +#define BLIS_MN_BLOCK_SIZE ( BLIS_POOL_MC_D * \ + BLIS_POOL_NC_D * \ + sizeof( double ) \ ) // Define each pool's total size. diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index eb87e8a45..e999023f2 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -425,7 +425,7 @@ dim_t bli_align_dim_to_mult( dim_t dim, dim_t dim_mult ) return dim; } -dim_t bli_align_dim_to_sys( dim_t dim, dim_t elem_size ) +dim_t bli_align_dim_to_sys( dim_t dim, siz_t elem_size ) { dim = ( ( dim * elem_size + BLIS_MEMORY_ALIGNMENT_BOUNDARY - 1 ) / BLIS_MEMORY_ALIGNMENT_BOUNDARY ) * diff --git a/frame/base/bli_obj.h b/frame/base/bli_obj.h index bcc58c2f8..89b4c2619 100644 --- a/frame/base/bli_obj.h +++ b/frame/base/bli_obj.h @@ -95,7 +95,7 @@ siz_t bli_datatype_size( num_t dt ); dim_t bli_align_dim_to_mult( dim_t dim, dim_t dim_mult ); -dim_t bli_align_dim_to_sys( dim_t dim, dim_t elem_size ); +dim_t bli_align_dim_to_sys( dim_t dim, siz_t elem_size ); num_t bli_datatype_union( num_t dt1, num_t dt2 );