From dedda523dc5dc779ecc34e6a03dc74cb8eb220de Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 19 Aug 2013 12:07:41 -0500 Subject: [PATCH] Fixed bug in bli_acquire_mpart_t2b(), _l2r(). Details: - Fixed a bug in bli_acquire_mpart_t2b() and bli_acquire_mpart_l2r() that caused incorrect partitioning when SUBPART0 was requested. This bug was introduced in 46d3d09d49ad. Thanks to Bryan for isolating this bug. - Removed dupl kernels from kernels/x86_64/3 directory. - Uncommented beta == 0 optimization code in kernels/x86_64/3/bli_gemm_opt_d4x4.c. --- frame/base/bli_part.c | 4 +- kernels/x86_64/3/bli_dupl_opt_var1.c | 186 --------------------------- kernels/x86_64/3/bli_dupl_opt_var1.h | 46 ------- kernels/x86_64/3/bli_gemm_opt_d4x4.c | 6 +- 4 files changed, 5 insertions(+), 237 deletions(-) delete mode 100644 kernels/x86_64/3/bli_dupl_opt_var1.c delete mode 100644 kernels/x86_64/3/bli_dupl_opt_var1.h diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c index ac1e4bf13..a417cd8a3 100644 --- a/frame/base/bli_part.c +++ b/frame/base/bli_part.c @@ -99,7 +99,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, m_part = i; n_part = n; } - if ( requested_part == BLIS_SUBPART1T ) + else if ( requested_part == BLIS_SUBPART1T ) { // A1T (offm,offn) unchanged. // A1T is (i+b) x n. @@ -287,7 +287,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, m_part = m; n_part = j; } - if ( requested_part == BLIS_SUBPART1L ) + else if ( requested_part == BLIS_SUBPART1L ) { // A1L (offm,offn) unchanged. // A1L is m x (j+b). diff --git a/kernels/x86_64/3/bli_dupl_opt_var1.c b/kernels/x86_64/3/bli_dupl_opt_var1.c deleted file mode 100644 index 189e39135..000000000 --- a/kernels/x86_64/3/bli_dupl_opt_var1.c +++ /dev/null @@ -1,186 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2013, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_sdupl_opt_var1( - dim_t n_elem, - float* b, - float* bd - ) -{ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); -} - -void bli_ddupl_opt_var1( - dim_t n_elem, - double* b, - double* bd - ) -{ - dim_t n_iter = n_elem / 16; - dim_t n_left = n_elem % 16; - - __asm__ volatile - ( - " \n\t" - "movq %2, %%rax \n\t" // load address of b. - "movq %3, %%rbx \n\t" // load address of bd. 
- " \n\t" - "addq $8 * 16, %%rax \n\t" // increment pointers to allow byte - "addq $8 * 16, %%rbx \n\t" // offsets in the unrolled iterations. - " \n\t" - "movq %0, %%rsi \n\t" // i = n_iter; - "testq %%rsi, %%rsi \n\t" // check n_iter via logical AND. - "je .CONSIDERNLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the n_left loop. - " \n\t" - " \n\t" - ".LOOPNITER: \n\t" // MAIN LOOP - " \n\t" - "movapd -8 * 16(%%rax), %%xmm1 \n\t" - "movapd -7 * 16(%%rax), %%xmm3 \n\t" - "movapd -6 * 16(%%rax), %%xmm5 \n\t" - "movapd -5 * 16(%%rax), %%xmm7 \n\t" - " \n\t" - "movapd -4 * 16(%%rax), %%xmm9 \n\t" - "movapd -3 * 16(%%rax), %%xmm11 \n\t" - "movapd -2 * 16(%%rax), %%xmm13 \n\t" - "movapd -1 * 16(%%rax), %%xmm15 \n\t" - " \n\t" - " \n\t" - "movddup %%xmm1, %%xmm0 \n\t" - "unpckhpd %%xmm1, %%xmm1 \n\t" - "movddup %%xmm3, %%xmm2 \n\t" - "unpckhpd %%xmm3, %%xmm3 \n\t" - "movddup %%xmm5, %%xmm4 \n\t" - "unpckhpd %%xmm5, %%xmm5 \n\t" - "movddup %%xmm7, %%xmm6 \n\t" - "unpckhpd %%xmm7, %%xmm7 \n\t" - " \n\t" - "movddup %%xmm9, %%xmm8 \n\t" - "unpckhpd %%xmm9, %%xmm9 \n\t" - "movddup %%xmm11, %%xmm10 \n\t" - "unpckhpd %%xmm11, %%xmm11 \n\t" - "movddup %%xmm13, %%xmm12 \n\t" - "unpckhpd %%xmm13, %%xmm13 \n\t" - "movddup %%xmm15, %%xmm14 \n\t" - "unpckhpd %%xmm15, %%xmm15 \n\t" - " \n\t" - " \n\t" - "movapd %%xmm0, -8 * 16(%%rbx) \n\t" - "movapd %%xmm1, -7 * 16(%%rbx) \n\t" - "movapd %%xmm2, -6 * 16(%%rbx) \n\t" - "movapd %%xmm3, -5 * 16(%%rbx) \n\t" - "movapd %%xmm4, -4 * 16(%%rbx) \n\t" - "movapd %%xmm5, -3 * 16(%%rbx) \n\t" - "movapd %%xmm6, -2 * 16(%%rbx) \n\t" - "movapd %%xmm7, -1 * 16(%%rbx) \n\t" - " \n\t" - "movapd %%xmm8, 0 * 16(%%rbx) \n\t" - "movapd %%xmm9, 1 * 16(%%rbx) \n\t" - "movapd %%xmm10, 2 * 16(%%rbx) \n\t" - "movapd %%xmm11, 3 * 16(%%rbx) \n\t" - "movapd %%xmm12, 4 * 16(%%rbx) \n\t" - "movapd %%xmm13, 5 * 16(%%rbx) \n\t" - "movapd %%xmm14, 6 * 16(%%rbx) \n\t" - "movapd %%xmm15, 7 * 16(%%rbx) \n\t" - " \n\t" - "addq $8 * 16, %%rax \n\t" // b 
+= 16; - "addq $16 * 16, %%rbx \n\t" // bd += 16*2; - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .LOOPNITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".CONSIDERNLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = n_left; - "testq %%rsi, %%rsi \n\t" // check n_left via logical AND. - "je .DONE \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter n_left loop. - " \n\t" - " \n\t" - ".LOOPNLEFT: \n\t" // EDGE LOOP - " \n\t" - "movddup -8 * 16(%%rax), %%xmm0 \n\t" - "addq $8, %%rax \n\t" // b += 1; - " \n\t" - "movapd %%xmm0, -8 * 16(%%rbx) \n\t" - "addq $16, %%rbx \n\t" // bd += 2; - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .LOOPNLEFT \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".DONE: \n\t" - " \n\t" - - : // output operands (none) - : // input operands - "r" (n_iter), - "r" (n_left), - "m" (b), - "m" (bd) - : // register clobber list - "rax", "rbx", "rsi", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "memory" - ); - -} - -void bli_cdupl_opt_var1( - dim_t k, - scomplex* b, - scomplex* bd - ) -{ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); -} - -void bli_zdupl_opt_var1( - dim_t k, - dcomplex* b, - dcomplex* bd - ) -{ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); -} - diff --git a/kernels/x86_64/3/bli_dupl_opt_var1.h b/kernels/x86_64/3/bli_dupl_opt_var1.h deleted file mode 100644 index 8ad9c598c..000000000 --- a/kernels/x86_64/3/bli_dupl_opt_var1.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2013, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - dim_t n_elem, \ - ctype* b, \ - ctype* bd \ - ); - -INSERT_GENTPROT_BASIC( dupl_opt_var1 ) - diff --git a/kernels/x86_64/3/bli_gemm_opt_d4x4.c b/kernels/x86_64/3/bli_gemm_opt_d4x4.c index 93cd0db2b..fcf8f76cd 100644 --- a/kernels/x86_64/3/bli_gemm_opt_d4x4.c +++ b/kernels/x86_64/3/bli_gemm_opt_d4x4.c @@ -371,9 +371,9 @@ void bli_dgemm_opt_d4x4( " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" - //"xorpd %%xmm0, %%xmm0 \n\t" // set xmm0 to zero. - //"ucomisd %%xmm0, %%xmm7 \n\t" // check if beta == 0. - //"je .BETAZERO \n\t" // if ZF = 1, jump to beta == 0 case + "xorpd %%xmm0, %%xmm0 \n\t" // set xmm0 to zero. + "ucomisd %%xmm0, %%xmm7 \n\t" // check if beta == 0. + "je .BETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" " \n\t" // check if aligned/column-stored