Perform her2k var1 loops in sequence.

Details:
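- Changed variant 1 of her2k so that the two rank-k products
  (first C = C + alpha * A * B', then C = C + conj(alpha) * B * A') are
  computed and accumulated in sequence, each in its own loop over the m
  dimension, rather than fused into one loop. This is necessary if BLIS is
  to be configured to provide only enough contiguous memory for one panel
  of B, since the packed B' and the packed A' (both packed via the
  packm_b control node) no longer need to exist at the same time.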
This commit is contained in:
Field G. Van Zee
2013-03-15 12:21:42 -05:00
parent c95c270eba
commit 1e76cae00c
2 changed files with 167 additions and 64 deletions

View File

@@ -71,17 +71,85 @@ void bl2_her2k_l_blk_var1( obj_t* alpha,
c,
cntl_sub_scalm( cntl ) );
// Initialize objects for packing B' and A'.
//
// Perform first rank-k update: C = C + alpha * A * B'.
//
// Initialize object for packing B'.
bl2_packm_init( bh, &bh_pack,
cntl_sub_packm_b( cntl ) );
bl2_packm_init( ah, &ah_pack,
cntl_sub_packm_b( cntl ) );
// Pack B' and scale by alpha (if instructed).
bl2_packm_int( alpha,
bh, &bh_pack,
cntl_sub_packm_b( cntl ) );
// Partition along the m dimension.
for ( i = 0; i < m_trans; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bl2_determine_blocksize_b( i, m_trans, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
i, b_alg, a, &a1 );
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
i, b_alg, c, &c1 );
// Partition off the stored region of C1 and the corresponding region
// of Bh_pack. We compute the width of the subpartition taking the
// location of the diagonal into account.
offL = 0;
nL = bl2_min( bl2_obj_width_after_trans( c1 ),
bl2_obj_diag_offset_after_trans( c1 ) + b_alg );
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
offL, nL, &c1, &c1L );
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
offL, nL, &bh_pack, &bhL_pack );
// Initialize objects for packing A1 and C1.
bl2_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bl2_packm_init( &c1L, &c1L_pack,
cntl_sub_packm_c( cntl ) );
// Pack A1 and scale by alpha (if instructed).
bl2_packm_int( alpha,
&a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
// Pack C1 and scale by beta (if instructed).
bl2_packm_int( beta,
&c1L, &c1L_pack,
cntl_sub_packm_c( cntl ) );
// Perform herk subproblem.
bl2_herk_int( alpha,
&a1_pack,
&bhL_pack,
beta,
&c1L_pack,
cntl_sub_herk( cntl ) );
// Unpack C1 (if C1 was packed).
bl2_unpackm_int( &c1L_pack, &c1L,
cntl_sub_unpackm_c( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bl2_obj_release_pack( &a1_pack );
bl2_obj_release_pack( &bh_pack );
//
// Perform second rank-k update: C = C + conj(alpha) * B * A'.
//
// Initialize object for packing A'.
bl2_packm_init( ah, &ah_pack,
cntl_sub_packm_b( cntl ) );
// Pack A' and scale by alpha_conj (if instructed).
bl2_packm_int( alpha_conj,
ah, &ah_pack,
@@ -91,43 +159,32 @@ void bl2_her2k_l_blk_var1( obj_t* alpha,
for ( i = 0; i < m_trans; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bl2_determine_blocksize_b( i, m_trans, a,
b_alg = bl2_determine_blocksize_b( i, m_trans, b,
cntl_blocksize( cntl ) );
// Acquire partitions for A1, B1 and C1.
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
i, b_alg, a, &a1 );
// Acquire partitions for B1 and C1.
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
i, b_alg, b, &b1 );
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
i, b_alg, c, &c1 );
// Partition off the stored region of C1 and the corresponding regions
// of Bh_pack and Ah_pack. We compute the width of the subpartition
// taking the location of the diagonal into account.
// Partition off the stored region of C1 and the corresponding region
// of Ah_pack. We compute the width of the subpartition taking the
// location of the diagonal into account.
offL = 0;
nL = bl2_min( bl2_obj_width_after_trans( c1 ),
bl2_obj_diag_offset_after_trans( c1 ) + b_alg );
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
offL, nL, &c1, &c1L );
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
offL, nL, &bh_pack, &bhL_pack );
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
offL, nL, &ah_pack, &ahL_pack );
// Initialize objects for packing A1, B1, and C1.
bl2_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
// Initialize objects for packing B1 and C1.
bl2_packm_init( &b1, &b1_pack,
cntl_sub_packm_a( cntl ) );
bl2_packm_init( &c1L, &c1L_pack,
cntl_sub_packm_c( cntl ) );
// Pack A1 and scale by alpha (if instructed).
bl2_packm_int( alpha,
&a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
// Pack B1 and scale by alpha_conj (if instructed).
bl2_packm_int( alpha_conj,
&b1, &b1_pack,
@@ -138,16 +195,13 @@ void bl2_her2k_l_blk_var1( obj_t* alpha,
&c1L, &c1L_pack,
cntl_sub_packm_c( cntl ) );
// Perform her2k subproblem.
bl2_her2k_int( alpha,
&a1_pack,
&bhL_pack,
alpha_conj,
&b1_pack,
&ahL_pack,
beta,
&c1L_pack,
cntl_sub_her2k( cntl ) );
// Perform herk subproblem.
bl2_herk_int( alpha_conj,
&b1_pack,
&ahL_pack,
&BLIS_ONE,
&c1L_pack,
cntl_sub_herk( cntl ) );
// Unpack C1 (if C1 was packed).
bl2_unpackm_int( &c1L_pack, &c1L,
@@ -156,8 +210,6 @@ void bl2_her2k_l_blk_var1( obj_t* alpha,
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bl2_obj_release_pack( &a1_pack );
bl2_obj_release_pack( &bh_pack );
bl2_obj_release_pack( &b1_pack );
bl2_obj_release_pack( &ah_pack );
bl2_obj_release_pack( &c1L_pack );

View File

@@ -71,17 +71,84 @@ void bl2_her2k_u_blk_var1( obj_t* alpha,
c,
cntl_sub_scalm( cntl ) );
// Initialize objects for packing B' and A'.
//
// Perform first rank-k update: C = C + alpha * A * B'.
//
// Initialize object for packing B'.
bl2_packm_init( bh, &bh_pack,
cntl_sub_packm_b( cntl ) );
bl2_packm_init( ah, &ah_pack,
cntl_sub_packm_b( cntl ) );
// Pack B' and scale by alpha (if instructed).
bl2_packm_int( alpha,
bh, &bh_pack,
cntl_sub_packm_b( cntl ) );
// Partition along the m dimension.
for ( i = 0; i < m_trans; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bl2_determine_blocksize_f( i, m_trans, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
i, b_alg, a, &a1 );
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
i, b_alg, c, &c1 );
// Partition off the stored region of C1 and the corresponding region
// of Bh_pack. We compute the width of the subpartition taking the
// location of the diagonal into account.
offR = bl2_max( 0, bl2_obj_diag_offset_after_trans( c1 ) );
nR = bl2_obj_width_after_trans( c1 ) - offR;
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
offR, nR, &c1, &c1R );
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
offR, nR, &bh_pack, &bhR_pack );
// Initialize objects for packing A1 and C1.
bl2_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bl2_packm_init( &c1R, &c1R_pack,
cntl_sub_packm_c( cntl ) );
// Pack A1 and scale by alpha (if instructed).
bl2_packm_int( alpha,
&a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
// Pack C1 and scale by beta (if instructed).
bl2_packm_int( beta,
&c1R, &c1R_pack,
cntl_sub_packm_c( cntl ) );
// Perform herk subproblem.
bl2_herk_int( alpha,
&a1_pack,
&bhR_pack,
beta,
&c1R_pack,
cntl_sub_herk( cntl ) );
// Unpack C1 (if C1 was packed).
bl2_unpackm_int( &c1R_pack, &c1R,
cntl_sub_unpackm_c( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bl2_obj_release_pack( &a1_pack );
bl2_obj_release_pack( &bh_pack );
//
// Perform second rank-k update: C = C + conj(alpha) * B * A'.
//
// Initialize object for packing A'.
bl2_packm_init( ah, &ah_pack,
cntl_sub_packm_b( cntl ) );
// Pack A' and scale by alpha_conj (if instructed).
bl2_packm_int( alpha_conj,
ah, &ah_pack,
@@ -91,42 +158,31 @@ void bl2_her2k_u_blk_var1( obj_t* alpha,
for ( i = 0; i < m_trans; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bl2_determine_blocksize_f( i, m_trans, a,
b_alg = bl2_determine_blocksize_f( i, m_trans, b,
cntl_blocksize( cntl ) );
// Acquire partitions for A1, B1, and C1.
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
i, b_alg, a, &a1 );
// Acquire partitions for B1 and C1.
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
i, b_alg, b, &b1 );
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
i, b_alg, c, &c1 );
// Partition off the stored region of C1 and the corresponding regions
// of Bh_pack and Ah_pack. We compute the width of the subpartition
// taking the location of the diagonal into account.
// Partition off the stored region of C1 and the corresponding region
// of Ah_pack. We compute the width of the subpartition taking the
// location of the diagonal into account.
offR = bl2_max( 0, bl2_obj_diag_offset_after_trans( c1 ) );
nR = bl2_obj_width_after_trans( c1 ) - offR;
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
offR, nR, &c1, &c1R );
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
offR, nR, &bh_pack, &bhR_pack );
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
offR, nR, &ah_pack, &ahR_pack );
// Initialize objects for packing A1, B1, and C1.
bl2_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
// Initialize objects for packing B1 and C1.
bl2_packm_init( &b1, &b1_pack,
cntl_sub_packm_a( cntl ) );
bl2_packm_init( &c1R, &c1R_pack,
cntl_sub_packm_c( cntl ) );
// Pack A1 and scale by alpha (if instructed).
bl2_packm_int( alpha,
&a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
// Pack B1 and scale by alpha_conj (if instructed).
bl2_packm_int( alpha_conj,
&b1, &b1_pack,
@@ -137,16 +193,13 @@ void bl2_her2k_u_blk_var1( obj_t* alpha,
&c1R, &c1R_pack,
cntl_sub_packm_c( cntl ) );
// Perform her2k subproblem.
bl2_her2k_int( alpha,
&a1_pack,
&bhR_pack,
alpha_conj,
&b1_pack,
&ahR_pack,
beta,
&c1R_pack,
cntl_sub_her2k( cntl ) );
// Perform herk subproblem.
bl2_herk_int( alpha_conj,
&b1_pack,
&ahR_pack,
&BLIS_ONE,
&c1R_pack,
cntl_sub_herk( cntl ) );
// Unpack C1 (if C1 was packed).
bl2_unpackm_int( &c1R_pack, &c1R,
@@ -155,8 +208,6 @@ void bl2_her2k_u_blk_var1( obj_t* alpha,
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bl2_obj_release_pack( &a1_pack );
bl2_obj_release_pack( &bh_pack );
bl2_obj_release_pack( &b1_pack );
bl2_obj_release_pack( &ah_pack );
bl2_obj_release_pack( &c1R_pack );