Gtestsuite: Memory testing of ZGEMM micro kernels

- Tests out-of-bound reads and writes of the input and output matrices
  for SUP and Native micro kernels
- The protected-buffer and memory-testing features available in gtestsuite
  are used to detect memory errors

AMD_Internal: [CPUPL-4623]

Change-Id: I620fd3cd4eed1002e08b6233effb89b47beb073f
This commit is contained in:
mangala v
2024-02-23 07:33:02 +05:30
parent 53bbc7866f
commit 0ec3581940
2 changed files with 1122 additions and 520 deletions

View File

@@ -0,0 +1,452 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <stdexcept>
#include <signal.h>
#include "level3/ref_gemm.h"
#include "inc/check_error.h"
#include "blis.h"
#include "common/testing_helpers.h"
/**********************************************************************/
/************ Code path when memory test is disabled **************/
/* 1. Compute Leading dimension of all matrix based on */
/* storage, size and trans parameters */
/* 2. Compute size of matrices for which memory needs to be allocated */
/* 3. Allocate memory for all matrices */
/* 4. Initialise matrices with random numbers */
/* 5. Copy blis output matrix content to reference output matrix */
/* 6. Call blis micro kernel with output matrix */
/* 7. Call reference kernel with reference output matrix */
/* 8. Compute difference of blis and reference output */
/* based on threshold set */
/**********************************************************************/
/************ Code path when memory test is enabled **************/
/* 1. Compute Leading dimension of all matrix based on */
/* storage, size and trans parameters */
/* 2. Compute size of matrices for which memory needs to be allocated */
/* 3. Allocate two sets of memory for each of the A, B and C matrices */
/* green_zone1: Memory near red_zone1 */
/* green_zone2: Memory near red_zone2 */
/* Two sets of memory are required to detect out-of-bound accesses */
/* before the start of the buffer or after the end of the buffer */
/* 4. Initialise matrices with random numbers */
/* 5. Call blis micro kernel with output matrix with green_zone1 ptr */
/* 6. Call blis micro kernel again with green_zone2 ptr */
/* 7. Failure is reported if there is out of bound read/write error */
/* 8. Call reference kernel with reference output matrix to */
/* check for any accuracy failures */
/* 9. Compute difference of blis and reference output */
/* based on threshold set */
/**********************************************************************/
template<typename T, typename FT>
static void test_zgemmsup_ukr( char storage, char trnsa, char trnsb, gtint_t m, gtint_t n, gtint_t k, T alpha, T beta, double thresh, FT ukr_fp, bool is_memory_test = false )
{
// Compute the leading dimensions of a, b, and c.
gtint_t lda = testinghelpers::get_leading_dimension( storage, trnsa, m, k, 0 );
gtint_t ldb = testinghelpers::get_leading_dimension( storage, trnsb, k, n, 0 );
gtint_t ldc = testinghelpers::get_leading_dimension( storage, 'n', m, n, 0 );
//----------------------------------------------------------
// Compute size of Matrix: A, B, C
//----------------------------------------------------------
gtint_t sizea = testinghelpers::matsize( storage, trnsa, m, k, lda ) * sizeof(T);
gtint_t sizeb = testinghelpers::matsize( storage, trnsb, k, n, ldb ) * sizeof(T);
gtint_t sizec = testinghelpers::matsize( storage, 'n', m, n, ldc ) * sizeof(T);
// Allocate memory for Matrix: A, B, C, CRef
testinghelpers::ProtectedBuffer buf_a_ptrs( sizea, false, is_memory_test );
testinghelpers::ProtectedBuffer buf_b_ptrs( sizeb, false , is_memory_test );
testinghelpers::ProtectedBuffer buf_c_ptrs( sizec, false , is_memory_test );
/* No need to check for memory errors for reference code path, */
/* hence is_memory_test is set to false */
testinghelpers::ProtectedBuffer buf_cref_ptrs( sizec, false , false );
/* GreenZone-1 and GreenZone-2 might overlap hence we need */
/* additional buffer to copy contents of GreenZone-1 before */
/* copying it to GreenZone-2 */
testinghelpers::ProtectedBuffer buf_aref_ptrs( sizea, false , false );
testinghelpers::ProtectedBuffer buf_bref_ptrs( sizeb, false , false );
T* buf_a = (T*)buf_a_ptrs.greenzone_1;
T* buf_b = (T*)buf_b_ptrs.greenzone_1;
T* buf_c = (T*)buf_c_ptrs.greenzone_1;
T* buf_cref = (T*)buf_cref_ptrs.greenzone_1;
T* buf_aref = (T*)buf_aref_ptrs.greenzone_1;
T* buf_bref = (T*)buf_bref_ptrs.greenzone_1;
// Check if the memory has been successfully allocated
if ((buf_a == NULL) || (buf_b == NULL) ||(buf_c == NULL) || (buf_cref == NULL)
|| (buf_aref == NULL) || (buf_bref == NULL) ) {
printf("Memory not allocated for input or output Matrix.\n");
return ;
}
testinghelpers::datagenerators::randomgenerators<T>( -2, 8, storage, m, k, (T*)(buf_a), trnsa, lda);
testinghelpers::datagenerators::randomgenerators<T>( -5, 2, storage, k, n, (T*)(buf_b), trnsb, ldb);
testinghelpers::datagenerators::randomgenerators<T>( -3, 5, storage, m, n, (T*)(buf_c), 'n', ldc);
// Create a copy of c so that we can check reference results.
memcpy(buf_cref, buf_c, sizec);
memcpy(buf_aref, buf_a, sizea);
memcpy(buf_bref, buf_b, sizeb);
gtint_t rs_a = 1, cs_a = 1, rs_b = 1, cs_b = 1, rs_c = 1, cs_c = 1;
gtint_t rs_a0 = 1, cs_a0 = 1, rs_b0 = 1, cs_b0 = 1;
if(storage == 'r')
{
rs_a = lda;
rs_b = ldb;
rs_c = ldc;
cs_a = 1;
cs_b = 1;
cs_c = 1;
rs_a0 = lda;
rs_b0 = ldb;
cs_a0 = 1;
cs_b0 = 1;
}
else
{
cs_a = lda;
cs_b = ldb;
cs_c = ldc;
rs_a = 1;
rs_b = 1;
rs_c = 1;
cs_a0 = lda;
cs_b0 = ldb;
rs_a0 = 1;
rs_b0 = 1;
}
if(trnsb == 't' || trnsb == 'T')
{
rs_b = cs_b0;
cs_b = rs_b0;
}
if(trnsa == 't' || trnsa == 'T')
{
rs_a = cs_a0;
cs_a = rs_a0;
}
// add signal handler for segmentation fault
testinghelpers::ProtectedBuffer::start_signal_handler();
try
{
auxinfo_t data;
//Panel stride update is required only for zen4 sup kernels
inc_t ps_a_use = (12 * rs_a); //12 = MR
bli_auxinfo_set_ps_a( ps_a_use, &data );
ukr_fp(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
m,
n,
k,
&alpha,
buf_a, rs_a, cs_a,
buf_b, rs_b, cs_b,
&beta,
buf_c, rs_c, cs_c,
&data,
NULL
);
if (is_memory_test)
{
// set pointers to second buffer
buf_a = (T*)buf_a_ptrs.greenzone_2;
buf_b = (T*)buf_b_ptrs.greenzone_2;
buf_c = (T*)buf_c_ptrs.greenzone_2;
// Check if the memory has been successfully allocated
if ((buf_a == NULL) || (buf_b == NULL) || (buf_c == NULL)) {
printf("Memory not allocated for input or output Matrix for memory test.\n");
return ;
}
// copy data from 1st buffer of A and B to second buffer
memcpy(buf_a, buf_aref, sizea);
memcpy(buf_b, buf_bref, sizeb);
//buf_c_ptrs.greenzone_1 has been updated with output from previous
// gemm call, hence use buf_cref
memcpy(buf_c, buf_cref, sizec);
// second call to ukr
auxinfo_t data;
inc_t ps_a_use = (12 * rs_a); //12 = MR
bli_auxinfo_set_ps_a( ps_a_use, &data );
ukr_fp(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
m,
n,
k,
&alpha,
buf_a, rs_a, cs_a,
buf_b, rs_b, cs_b,
&beta,
buf_c, rs_c, cs_c,
&data,
NULL
);
}
}
catch(const std::exception& e)
{
// reset to default signal handler
testinghelpers::ProtectedBuffer::stop_signal_handler();
// show failure in case seg fault was detected
FAIL() << "Memory Test Failed";
}
// reset to default signal handler
testinghelpers::ProtectedBuffer::stop_signal_handler();
// call reference implementation
testinghelpers::ref_gemm<T>( storage, trnsa, trnsb, m, n, k, alpha,
buf_a, lda, buf_b, ldb, beta, buf_cref, ldc);
// Check component-wise error
computediff<T>( storage, m, n, buf_c, buf_cref, ldc, thresh );
}
// The function is templatized based on the datatype and function-pointer type to the kernel.
template<typename T, typename FT>
static void test_gemmnat_ukr( char storage, gtint_t m, gtint_t n, gtint_t k, T alpha, T beta, double thresh, FT ukr_fp, bool is_memory_test = false )
{
// In case of memory test:
// Allocate packed buffer size for Matrix A, B native kernel works on packed buffer
// Native kernel has preload or prebroadcase design
// If we allocate size required by dimension then memtest fails
obj_t a, b;
obj_t ap, bp; // for packed buffers
cntx_t* cntx;
num_t dt = BLIS_DCOMPLEX;
cntx = bli_gks_query_cntx();
bli_obj_create(dt, m, k, 1, m, &a);
bli_obj_create(dt, k, n, n, 1, &b);
bli_obj_create(dt, m, k, 1, m, &ap);
bli_obj_create(dt, k, n, n, 1, &bp);
gtint_t sizea = bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_GEMM, BLIS_PACKED_ROW_PANELS,
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
BLIS_MR, BLIS_KR, &a, &ap, cntx) * sizeof(T);
gtint_t sizeb = bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_GEMM, BLIS_PACKED_COL_PANELS,
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
BLIS_KR, BLIS_NR, &b, &bp, cntx ) * sizeof(T);
// Create test operands
// matrix A will be in col-storage
// matrix B will be in row-storage
// column * row = matrix -- rank-k update
// Set matrix A dimensions
gtint_t rs = 1;
gtint_t cs = m;
gtint_t lda = cs;
//gtint_t sizea = m * k * sizeof(T);
// Set matrix B dimensions
rs = n;
cs = 1;
gtint_t ldb = rs;
//gtint_t sizeb = k * n * sizeof(T);
// Set matrix C dimensions
gtint_t ldc = m;
if(storage == 'r' || storage == 'R')
{
rs = n;
cs = 1;
ldc = rs;
}
else
{
rs = 1;
cs = m;
ldc = cs;
}
gtint_t sizec = m * n * sizeof(T);
// Allocating aligned memory for A and B matrix as Native microkernel issues
// VMOVAPD which expects memory to be accessed to be aligned.
// Matrix C need not be aligned
testinghelpers::ProtectedBuffer buf_a_ptrs( sizea, true, is_memory_test );
testinghelpers::ProtectedBuffer buf_b_ptrs( sizeb, true, is_memory_test );
testinghelpers::ProtectedBuffer buf_c_ptrs( sizec, false, is_memory_test );
// Allocate memory for C Matrix used for reference computation
testinghelpers::ProtectedBuffer buf_c_ref_ptrs( sizec, false , false );
/* GreenZone-1 and GreenZone-2 might overlap hence we need */
/* additional buffer to copy contents of GreenZone-1 before */
/* copying it to GreenZone-2 */
testinghelpers::ProtectedBuffer buf_a_ref_ptrs( sizea, false , false );
testinghelpers::ProtectedBuffer buf_b_ref_ptrs( sizeb, false , false );
T* buf_a = (T*)buf_a_ptrs.greenzone_1;
T* buf_b = (T*)buf_b_ptrs.greenzone_1;
T* buf_c = (T*)buf_c_ptrs.greenzone_1;
T* buf_cref = (T*)buf_c_ref_ptrs.greenzone_1;
T* buf_aref = (T*)buf_a_ref_ptrs.greenzone_1;
T* buf_bref = (T*)buf_b_ref_ptrs.greenzone_1;
// Check if the memory has been successfully allocated
if (( buf_a == NULL ) || ( buf_b == NULL ) || ( buf_c == NULL ) ||
( buf_cref == NULL ) || ( buf_aref == NULL ) || ( buf_bref == NULL )) {
printf("Matrix: Memory not allocated.\n");
return ;
}
/* Initialize Matrices with random numbers */
testinghelpers::datagenerators::randomgenerators<T>( -2, 8, 'c', m, k, (T*)(buf_a), 'n', lda);
testinghelpers::datagenerators::randomgenerators<T>( -5, 2, 'r', k, n, (T*)(buf_b), 'n', ldb);
testinghelpers::datagenerators::randomgenerators<T>( -5, 2, storage , m, n, (T*)(buf_c), 'n', ldc);
// Create a copy of c so that we can check reference results.
memcpy(buf_cref, buf_c, sizec);
memcpy(buf_aref, buf_a, sizea);
memcpy(buf_bref, buf_b, sizeb);
/* Fill the auxinfo_t struct in case the micro-kernel uses it. */
auxinfo_t data;
bli_auxinfo_set_ps_a(0, &data);
// add signal handler for segmentation fault
testinghelpers::ProtectedBuffer::start_signal_handler();
try
{
// call micro-kernel
ukr_fp (
k,
&alpha,
buf_a,
buf_b,
&beta,
buf_c,
rs,
cs,
&data,
NULL
);
if(is_memory_test)
{
// set pointers to second buffer
buf_a = (T*)buf_a_ptrs.greenzone_2;
buf_b = (T*)buf_b_ptrs.greenzone_2;
buf_c = (T*)buf_c_ptrs.greenzone_2;
// copy data from 1st buffer of A and B to second buffer
memcpy(buf_a, buf_aref, sizea);
memcpy(buf_b, buf_bref, sizeb);
//buf_c_ptrs.greenzone_1 has been updated with output from previous
// gemm call, hence use buf_cref
memcpy(buf_c, buf_cref, sizec);
ukr_fp (
k,
&alpha,
buf_a,
buf_b,
&beta,
buf_c,
rs,
cs,
&data,
NULL
);
}
}
catch(const std::exception& e)
{
// reset to default signal handler
testinghelpers::ProtectedBuffer::stop_signal_handler();
// show failure in case seg fault was detected
FAIL() << "Memory Test Failed";
}
// reset to default signal handler
testinghelpers::ProtectedBuffer::stop_signal_handler();
// In native micro-kernel
// op(A) = No transpose & op(B) = transpose
// for column-storage
char transa = 'n';
char transb = 't';
// The objective here is to make storage of all matrices same
// To do this we set transpose of A and B appropriatley.
if (storage == 'r' || storage == 'R')
{
// if row-storage
transa = 't';
transb = 'n';
// because matrix A is created with col-storage
// and matrix B is created with row-storage
// Generally storage parameter in cblas signifies
// storage of all matrices A, B and C.
// since A is col-storage, A' will be row-storage
}
// call reference implementation
testinghelpers::ref_gemm<T>( storage, transa, transb, m, n, k, alpha,
buf_a, lda, buf_b, ldb, beta, (T*)buf_cref, ldc);
// Check component-wise error
computediff<T>( storage, m, n, (T*)buf_c, (T*)buf_cref, ldc, thresh );
}

File diff suppressed because it is too large Load Diff