From 1e76cae00cb0a04544aaae1ade878686b238d283 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 15 Mar 2013 12:21:42 -0500 Subject: [PATCH] Perform her2k var1 loops in sequence. Details: - Changed variant 1 of her2k so that the two rank-k products are computed and accumulated in sequence rather than fused into one loop. This is necessary if BLIS is to be configured to provide only enough contiguous memory for one panel of B. --- frame/3/her2k/bl2_her2k_l_blk_var1.c | 116 +++++++++++++++++++-------- frame/3/her2k/bl2_her2k_u_blk_var1.c | 115 ++++++++++++++++++-------- 2 files changed, 167 insertions(+), 64 deletions(-) diff --git a/frame/3/her2k/bl2_her2k_l_blk_var1.c b/frame/3/her2k/bl2_her2k_l_blk_var1.c index cc86f2327..863826134 100644 --- a/frame/3/her2k/bl2_her2k_l_blk_var1.c +++ b/frame/3/her2k/bl2_her2k_l_blk_var1.c @@ -71,17 +71,85 @@ void bl2_her2k_l_blk_var1( obj_t* alpha, c, cntl_sub_scalm( cntl ) ); - // Initialize objects for packing B' and A'. + // + // Perform first rank-k update: C = C + alpha * A * B'. + // + + // Initialize object for packing B'. bl2_packm_init( bh, &bh_pack, cntl_sub_packm_b( cntl ) ); - bl2_packm_init( ah, &ah_pack, - cntl_sub_packm_b( cntl ) ); // Pack B' and scale by alpha (if instructed). bl2_packm_int( alpha, bh, &bh_pack, cntl_sub_packm_b( cntl ) ); + // Partition along the m dimension. + for ( i = 0; i < m_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bl2_determine_blocksize_b( i, m_trans, a, + cntl_blocksize( cntl ) ); + + // Acquire partitions for A1 and C1. + bl2_acquire_mpart_b2t( BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bl2_acquire_mpart_b2t( BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Partition off the stored region of C1 and the corresponding region + // of Bh_pack. We compute the width of the subpartition taking the + // location of the diagonal into account. + offL = 0; + nL = bl2_min( bl2_obj_width_after_trans( c1 ), + bl2_obj_diag_offset_after_trans( c1 ) + b_alg ); + bl2_acquire_mpart_l2r( BLIS_SUBPART1, + offL, nL, &c1, &c1L ); + bl2_acquire_mpart_l2r( BLIS_SUBPART1, + offL, nL, &bh_pack, &bhL_pack ); + + // Initialize objects for packing A1 and C1. + bl2_packm_init( &a1, &a1_pack, + cntl_sub_packm_a( cntl ) ); + bl2_packm_init( &c1L, &c1L_pack, + cntl_sub_packm_c( cntl ) ); + + // Pack A1 and scale by alpha (if instructed). + bl2_packm_int( alpha, + &a1, &a1_pack, + cntl_sub_packm_a( cntl ) ); + + // Pack C1 and scale by beta (if instructed). + bl2_packm_int( beta, + &c1L, &c1L_pack, + cntl_sub_packm_c( cntl ) ); + + // Perform herk subproblem. + bl2_herk_int( alpha, + &a1_pack, + &bhL_pack, + beta, + &c1L_pack, + cntl_sub_herk( cntl ) ); + + // Unpack C1 (if C1 was packed). + bl2_unpackm_int( &c1L_pack, &c1L, + cntl_sub_unpackm_c( cntl ) ); + } + + // If any packing buffers were acquired within packm, release them back + // to the memory manager. + bl2_obj_release_pack( &a1_pack ); + bl2_obj_release_pack( &bh_pack ); + + // + // Perform second rank-k update: C = C + conj(alpha) * B * A'. + // + + // Initialize object for packing A'. + bl2_packm_init( ah, &ah_pack, + cntl_sub_packm_b( cntl ) ); + // Pack A' and scale by alpha_conj (if instructed). bl2_packm_int( alpha_conj, ah, &ah_pack, @@ -91,43 +159,32 @@ void bl2_her2k_l_blk_var1( obj_t* alpha, for ( i = 0; i < m_trans; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bl2_determine_blocksize_b( i, m_trans, a, + b_alg = bl2_determine_blocksize_b( i, m_trans, b, cntl_blocksize( cntl ) ); - // Acquire partitions for A1, B1 and C1. - bl2_acquire_mpart_b2t( BLIS_SUBPART1, - i, b_alg, a, &a1 ); + // Acquire partitions for B1 and C1. bl2_acquire_mpart_b2t( BLIS_SUBPART1, i, b_alg, b, &b1 ); bl2_acquire_mpart_b2t( BLIS_SUBPART1, i, b_alg, c, &c1 ); - // Partition off the stored region of C1 and the corresponding regions - // of Bh_pack and Ah_pack. We compute the width of the subpartition - // taking the location of the diagonal into account. + // Partition off the stored region of C1 and the corresponding region + // of Ah_pack. We compute the width of the subpartition taking the + // location of the diagonal into account. offL = 0; nL = bl2_min( bl2_obj_width_after_trans( c1 ), bl2_obj_diag_offset_after_trans( c1 ) + b_alg ); bl2_acquire_mpart_l2r( BLIS_SUBPART1, offL, nL, &c1, &c1L ); - bl2_acquire_mpart_l2r( BLIS_SUBPART1, - offL, nL, &bh_pack, &bhL_pack ); bl2_acquire_mpart_l2r( BLIS_SUBPART1, offL, nL, &ah_pack, &ahL_pack ); - // Initialize objects for packing A1, B1, and C1. - bl2_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + // Initialize objects for packing B1 and C1. bl2_packm_init( &b1, &b1_pack, cntl_sub_packm_a( cntl ) ); bl2_packm_init( &c1L, &c1L_pack, cntl_sub_packm_c( cntl ) ); - // Pack A1 and scale by alpha (if instructed). - bl2_packm_int( alpha, - &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - // Pack B1 and scale by alpha_conj (if instructed). bl2_packm_int( alpha_conj, &b1, &b1_pack, @@ -138,16 +195,13 @@ void bl2_her2k_l_blk_var1( obj_t* alpha, &c1L, &c1L_pack, cntl_sub_packm_c( cntl ) ); - // Perform her2k subproblem. - bl2_her2k_int( alpha, - &a1_pack, - &bhL_pack, - alpha_conj, - &b1_pack, - &ahL_pack, - beta, - &c1L_pack, - cntl_sub_her2k( cntl ) ); + // Perform herk subproblem. + bl2_herk_int( alpha_conj, + &b1_pack, + &ahL_pack, + &BLIS_ONE, + &c1L_pack, + cntl_sub_herk( cntl ) ); // Unpack C1 (if C1 was packed). bl2_unpackm_int( &c1L_pack, &c1L, @@ -156,8 +210,6 @@ void bl2_her2k_l_blk_var1( obj_t* alpha, // If any packing buffers were acquired within packm, release them back // to the memory manager. - bl2_obj_release_pack( &a1_pack ); - bl2_obj_release_pack( &bh_pack ); bl2_obj_release_pack( &b1_pack ); bl2_obj_release_pack( &ah_pack ); bl2_obj_release_pack( &c1L_pack ); diff --git a/frame/3/her2k/bl2_her2k_u_blk_var1.c b/frame/3/her2k/bl2_her2k_u_blk_var1.c index ad106419b..dbfe59c71 100644 --- a/frame/3/her2k/bl2_her2k_u_blk_var1.c +++ b/frame/3/her2k/bl2_her2k_u_blk_var1.c @@ -71,17 +71,84 @@ void bl2_her2k_u_blk_var1( obj_t* alpha, c, cntl_sub_scalm( cntl ) ); - // Initialize objects for packing B' and A'. + // + // Perform first rank-k update: C = C + alpha * A * B'. + // + + // Initialize object for packing B'. bl2_packm_init( bh, &bh_pack, cntl_sub_packm_b( cntl ) ); - bl2_packm_init( ah, &ah_pack, - cntl_sub_packm_b( cntl ) ); // Pack B' and scale by alpha (if instructed). bl2_packm_int( alpha, bh, &bh_pack, cntl_sub_packm_b( cntl ) ); + // Partition along the m dimension. + for ( i = 0; i < m_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bl2_determine_blocksize_f( i, m_trans, a, + cntl_blocksize( cntl ) ); + + // Acquire partitions for A1 and C1. + bl2_acquire_mpart_t2b( BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bl2_acquire_mpart_t2b( BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Partition off the stored region of C1 and the corresponding region + // of Bh_pack. We compute the width of the subpartition taking the + // location of the diagonal into account. + offR = bl2_max( 0, bl2_obj_diag_offset_after_trans( c1 ) ); + nR = bl2_obj_width_after_trans( c1 ) - offR; + bl2_acquire_mpart_l2r( BLIS_SUBPART1, + offR, nR, &c1, &c1R ); + bl2_acquire_mpart_l2r( BLIS_SUBPART1, + offR, nR, &bh_pack, &bhR_pack ); + + // Initialize objects for packing A1 and C1. + bl2_packm_init( &a1, &a1_pack, + cntl_sub_packm_a( cntl ) ); + bl2_packm_init( &c1R, &c1R_pack, + cntl_sub_packm_c( cntl ) ); + + // Pack A1 and scale by alpha (if instructed). + bl2_packm_int( alpha, + &a1, &a1_pack, + cntl_sub_packm_a( cntl ) ); + + // Pack C1 and scale by beta (if instructed). + bl2_packm_int( beta, + &c1R, &c1R_pack, + cntl_sub_packm_c( cntl ) ); + + // Perform herk subproblem. + bl2_herk_int( alpha, + &a1_pack, + &bhR_pack, + beta, + &c1R_pack, + cntl_sub_herk( cntl ) ); + + // Unpack C1 (if C1 was packed). + bl2_unpackm_int( &c1R_pack, &c1R, + cntl_sub_unpackm_c( cntl ) ); + } + + // If any packing buffers were acquired within packm, release them back + // to the memory manager. + bl2_obj_release_pack( &a1_pack ); + bl2_obj_release_pack( &bh_pack ); + + // + // Perform second rank-k update: C = C + conj(alpha) * B * A'. + // + + // Initialize object for packing A'. + bl2_packm_init( ah, &ah_pack, + cntl_sub_packm_b( cntl ) ); + // Pack A' and scale by alpha_conj (if instructed). bl2_packm_int( alpha_conj, ah, &ah_pack, @@ -91,42 +158,31 @@ void bl2_her2k_u_blk_var1( obj_t* alpha, for ( i = 0; i < m_trans; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bl2_determine_blocksize_f( i, m_trans, a, + b_alg = bl2_determine_blocksize_f( i, m_trans, b, cntl_blocksize( cntl ) ); - // Acquire partitions for A1, B1, and C1. - bl2_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); + // Acquire partitions for B1 and C1. bl2_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, b, &b1 ); bl2_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, c, &c1 ); - // Partition off the stored region of C1 and the corresponding regions - // of Bh_pack and Ah_pack. We compute the width of the subpartition - // taking the location of the diagonal into account. + // Partition off the stored region of C1 and the corresponding region + // of Ah_pack. We compute the width of the subpartition taking the + // location of the diagonal into account. offR = bl2_max( 0, bl2_obj_diag_offset_after_trans( c1 ) ); nR = bl2_obj_width_after_trans( c1 ) - offR; bl2_acquire_mpart_l2r( BLIS_SUBPART1, offR, nR, &c1, &c1R ); - bl2_acquire_mpart_l2r( BLIS_SUBPART1, - offR, nR, &bh_pack, &bhR_pack ); bl2_acquire_mpart_l2r( BLIS_SUBPART1, offR, nR, &ah_pack, &ahR_pack ); - // Initialize objects for packing A1, B1, and C1. - bl2_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + // Initialize objects for packing B1 and C1. bl2_packm_init( &b1, &b1_pack, cntl_sub_packm_a( cntl ) ); bl2_packm_init( &c1R, &c1R_pack, cntl_sub_packm_c( cntl ) ); - // Pack A1 and scale by alpha (if instructed). - bl2_packm_int( alpha, - &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - // Pack B1 and scale by alpha_conj (if instructed). bl2_packm_int( alpha_conj, &b1, &b1_pack, @@ -137,16 +193,13 @@ void bl2_her2k_u_blk_var1( obj_t* alpha, &c1R, &c1R_pack, cntl_sub_packm_c( cntl ) ); - // Perform her2k subproblem. - bl2_her2k_int( alpha, - &a1_pack, - &bhR_pack, - alpha_conj, - &b1_pack, - &ahR_pack, - beta, - &c1R_pack, - cntl_sub_her2k( cntl ) ); + // Perform herk subproblem. + bl2_herk_int( alpha_conj, + &b1_pack, + &ahR_pack, + &BLIS_ONE, + &c1R_pack, + cntl_sub_herk( cntl ) ); // Unpack C1 (if C1 was packed). bl2_unpackm_int( &c1R_pack, &c1R, @@ -155,8 +208,6 @@ void bl2_her2k_u_blk_var1( obj_t* alpha, // If any packing buffers were acquired within packm, release them back // to the memory manager. - bl2_obj_release_pack( &a1_pack ); - bl2_obj_release_pack( &bh_pack ); bl2_obj_release_pack( &b1_pack ); bl2_obj_release_pack( &ah_pack ); bl2_obj_release_pack( &c1R_pack );