From e7dbbb1194e9ae31a358653e1c2402f83dc0eb0e Mon Sep 17 00:00:00 2001 From: Chithra Sankar Date: Mon, 19 Aug 2019 10:29:58 +0530 Subject: [PATCH] Level 3 BLAS CPP routines(Except trmm) + Doxygen Commenting + Test application for dgemm CPP API Change-Id: I97d1203ce466f3adb991341c17db9070c1eaf104 --- cpp/blis.hh | 1240 +++++++++++++++++++++++++++++++++++++++--- cpp/cblas.hh | 367 ++++++++++++- testcpp/test.sh | 32 +- testcpp/test_gemm.cc | 279 +++++++--- testcpp/test_gemm.hh | 191 +++++++ 5 files changed, 1919 insertions(+), 190 deletions(-) create mode 100644 testcpp/test_gemm.hh diff --git a/cpp/blis.hh b/cpp/blis.hh index fc25ebff7..b3f637aa0 100644 --- a/cpp/blis.hh +++ b/cpp/blis.hh @@ -1,102 +1,1176 @@ +/****************************************************************************** +* Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +* THE SOFTWARE. +*******************************************************************************/ + +/*! @file blis.hh + * blis.hh defines all the CPP templated public interfaces + * */ #ifndef BLIS_HH #define BLIS_HH -#include "blis_util.hh" #include "cblas.hh" +#include "blis_util.hh" #include namespace blis { +/*! @brief \b GEMM -// ============================================================================= -/// General matrix-matrix multiply, -/// \f[ C = \alpha op(A) \times op(B) + \beta C, \f] -/// where op(X) is one of -/// \f[ op(X) = X, \f] -/// \f[ op(X) = X^T, \f] -/// \f[ op(X) = X^H, \f] -/// alpha and beta are scalars, and A, B, and C are matrices, with -/// op(A) an m-by-k matrix, op(B) a k-by-n matrix, and C an m-by-n matrix. -/// -/// Generic implementation for arbitrary data types. -/// TODO: generic version not yet implemented. -/// -/// @param[in] layout -/// Matrix storage, Layout::ColMajor or Layout::RowMajor. -/// -/// @param[in] transA -/// The operation op(A) to be used: -/// - Op::NoTrans: \f$ op(A) = A. \f$ -/// - Op::Trans: \f$ op(A) = A^T. \f$ -/// - Op::ConjTrans: \f$ op(A) = A^H. \f$ -/// -/// @param[in] transB -/// The operation op(B) to be used: -/// - Op::NoTrans: \f$ op(B) = B. \f$ -/// - Op::Trans: \f$ op(B) = B^T. \f$ -/// - Op::ConjTrans: \f$ op(B) = B^H. \f$ -/// -/// @param[in] m -/// Number of rows of the matrix C and op(A). m >= 0. -/// -/// @param[in] n -/// Number of columns of the matrix C and op(B). n >= 0. -/// -/// @param[in] k -/// Number of columns of op(A) and rows of op(B). k >= 0. -/// -/// @param[in] alpha -/// Scalar alpha. If alpha is zero, A and B are not accessed. -/// -/// @param[in] A -/// - If transA = NoTrans: -/// the m-by-k matrix A, stored in an lda-by-k array [RowMajor: m-by-lda]. -/// - Otherwise: -/// the k-by-m matrix A, stored in an lda-by-m array [RowMajor: k-by-lda]. -/// -/// @param[in] lda -/// Leading dimension of A. -/// - If transA = NoTrans: lda >= max(1, m) [RowMajor: lda >= max(1, k)]. -/// - Otherwise: lda >= max(1, k) [RowMajor: lda >= max(1, m)]. -/// -/// @param[in] B -/// - If transB = NoTrans: -/// the k-by-n matrix B, stored in an ldb-by-n array [RowMajor: k-by-ldb]. -/// - Otherwise: -/// the n-by-k matrix B, stored in an ldb-by-k array [RowMajor: n-by-ldb]. -/// -/// @param[in] ldb -/// Leading dimension of B. -/// - If transB = NoTrans: ldb >= max(1, k) [RowMajor: ldb >= max(1, n)]. -/// - Otherwise: ldb >= max(1, n) [RowMajor: ldb >= max(1, k)]. -/// -/// @param[in] beta -/// Scalar beta. If beta is zero, C need not be set on input. -/// -/// @param[in] C -/// The m-by-n matrix C, stored in an ldc-by-n array [RowMajor: m-by-ldc]. -/// -/// @param[in] ldc -/// Leading dimension of C. ldc >= max(1, m) [RowMajor: ldc >= max(1, n)]. -/// -/// @ingroup gemm + \verbatim -template< typename TA, typename TB, typename TC > + GEMM performs general matrix-matrix multiply for arbitrary data types + Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL, + SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16) + + C := alpha*op( A )*op( B ) + beta*C, + + where op( X ) is one of + + op( X ) = X or op( X ) = X**T or op( X ) = X**H, + + alpha and beta are scalars, and A, B and C are matrices, with op( A ) + an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. + \endverbatim + + \param[in] layout + \verbatim + layout is enum CBLAS_ORDER + + layout specifies Matrix storage as follows: + + layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor. + \endverbatim + + \param[in] transA + \verbatim + + transA is CBLAS_TRANSPOSE + On entry, transA specifies the form of op( A ) to be used in + the matrix multiplication as follows: + + transA = CBLAS_TRANSPOSE::CblasNoTrans, op( A ) = A. + + transA = CBLAS_TRANSPOSE::CblasTrans, op( A ) = A**T. + + transA = CBLAS_TRANSPOSE::CblasConjTrans, op( A ) = A**H. + \endverbatim + + \param[in] transB + \verbatim + transB is CBLAS_TRANSPOSE + On entry, transB specifies the form of op( B ) to be used in + the matrix multiplication as follows: + + transB = CBLAS_TRANSPOSE::CblasNoTrans, op( B ) = B. + + transB = CBLAS_TRANSPOSE::CblasTrans, op( B ) = B**T. + + transB = CBLAS_TRANSPOSE::CblasConjTrans, op( B ) = B**H. + \endverbatim + + \param[in] m + \verbatim + m is INTEGER + On entry, m specifies the number of rows of the matrix + op( A ) and of the matrix C. m must be at least zero. + \endverbatim + + \param[in] n + \verbatim + n is INTEGER + On entry, n specifies the number of columns of the matrix + op( B ) and the number of columns of the matrix C. n must be + at least zero. + \endverbatim + + \param[in] k + \verbatim + k is INTEGER + On entry, k specifies the number of columns of the matrix + op( A ) and the number of rows of the matrix op( B ). k must + be at least zero. + \endverbatim + + \param[in] alpha + \verbatim + alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 + On entry, alpha specifies the scalar alpha. + \endverbatim + + \param[in] A + \verbatim + A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + If transA = CblasNoTrans: + m-by-k , stored in an lda-by-k array [RowMajor: m-by-lda]. + Otherwise: + k-by-m , stored in an lda-by-m array [RowMajor: k-by-lda]. + \endverbatim + + \param[in] lda + \verbatim + lda is INTEGER + On entry, lda specifies the Leading dimension of A + If transA = CblasNoTrans: lda >= max(1, m) [RowMajor: lda >= max(1, k)]. + Otherwise: lda >= max(1, k) [RowMajor: lda >= max(1, m)]. + \endverbatim + + \param[in] B + \verbatim + B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + If transA = CblasNoTrans: + k-by-n , stored in an ldb-by-n array [RowMajor: k-by-ldb]. + Otherwise: + n-by-k , stored in an ldb-by-k array [RowMajor: n-by-ldb]. + \endverbatim + + \param[in] ldb + \verbatim + ldb is INTEGER + On entry, ldb specifies the Leading dimension of B + If transA = CblasNoTrans: ldb >= max(1, k) [RowMajor: ldb >= max(1, n)]. + Otherwise: ldb >= max(1, n) [RowMajor: ldb >= max(1, k)]. + \endverbatim + + \param[in] beta + \verbatim + beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 + On entry, beta specifies the scalar alpha.When beta is + supplied as zero then C need not be set on input. + \endverbatim + + \param[in,out] C + \verbatim + C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : + m-by-n stored in an ldc-by-n array [RowMajor: m-by-ldc]. + Before entry, the leading m by n part of the array C must + contain the matrix C, except when beta is zero, in which + case C need not be set on entry. + On exit, the array C is overwritten by the m by n matrix + ( alpha*op( A )*op( B ) + beta*C ). + \endverbatim + + \param[in] ldc + \verbatim + ldc is INTEGER + On entry, ldc specifies the first dimension of C + ldc >= max(1, m) [RowMajor: ldc >= max(1, n)]. + \endverbatim + + */ +template< typename T > void gemm( - CBLAS_LAYOUT layout, + CBLAS_ORDER layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int64_t m, int64_t n, int64_t k, - scalar_type alpha, - TA const *A, int64_t lda, - TB const *B, int64_t ldb, - scalar_type beta, - TC *C, int64_t ldc ) + T alpha, + T const *A, int64_t lda, + T const *B, int64_t ldb, + T beta, + T *C, int64_t ldc ) { -// printf("In gemm.cpp\n"); cblas_gemm(layout, transA, transB, m, n, k, alpha, A,lda, B, ldb, beta, C, ldc); -}; +} +/*! @brief \b TRSM + + \verbatim + + TRSM performs solves one of the matrix equations for arbitrary data types + Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL, + SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16) + + op( A )*X = alpha*B, or X*op( A ) = alpha*B, + + where alpha is a scalar, X and B are m by n matrices, A is a unit, or + non-unit, upper or lower triangular matrix and op( A ) is one of + where op( X ) is one of + + op( A ) = A or op( A ) = A**T or op( A ) = A**H. + + The matrix X is overwritten on B. + \endverbatim + + \param[in] layout + \verbatim + layout is enum CBLAS_ORDER + + layout specifies Matrix storage as follows: + + layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor. + \endverbatim + + \param[in] side + \verbatim + side is enum CBLAS_SIDE + + side specifies specifies whether op( A ) appears on the left + or right of X as follows: + + side = CBLAS_SIDE::CblasLeft op( A )*X = alpha*B. + + side = CBLAS_SIDE::CblasRight op( A )*X = alpha*B. + \endverbatim + + \param[in] uplo + \verbatim + uplo is enum CBLAS_UPLO + + uplo specifies specifies whether the matrix A is an upper or + lower triangular matrix as follows: + + uplo = CBLAS_UPLO::CblasUpper A is an upper triangular matrix. + + uplo = CBLAS_UPLO::CblasLower A is a lower triangular matrix. + \endverbatim + + \param[in] trans + \verbatim + + trans is CBLAS_TRANSPOSE + On entry, trans specifies the form of op( A ) to be used in + the matrix multiplication as follows: + + trans = CBLAS_TRANSPOSE::CblasNoTrans, op( A ) = A. + + trans = CBLAS_TRANSPOSE::CblasTrans, op( A ) = A**T. + + trans = CBLAS_TRANSPOSE::CblasConjTrans, op( A ) = A**H. + \endverbatim + + \param[in] diag + \verbatim + diag is enum CBLAS_DIAG + + diag specifies specifies whether or not A is unit triangular + as follows: + + diag = CBLAS_DIAG::CblasUnit A is assumed to be unit triangular. + + diag = CBLAS_DIAG::CblasNonUnit A is not assumed to be unit + triangular. + \endverbatim + + \param[in] m + \verbatim + m is INTEGER + On entry, m specifies the number of rows of the matrix + B. m must be at least zero. + \endverbatim + + \param[in] n + \verbatim + n is INTEGER + On entry, n specifies the number of columns of the matrix + B. n must be at least zero. + \endverbatim + + \param[in] alpha + \verbatim + alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 + On entry, alpha specifies the scalar alpha. + \endverbatim + + \param[in] A + \verbatim + A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + If side = CblasLeft: + the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda]. + If side = CblasRight: + the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda]. + \endverbatim + + \param[in] lda + \verbatim + lda is INTEGER + On entry, lda specifies the Leading dimension of A + If side = CblasLeft: lda >= max(1, m) . + If side = CblasRight:lda >= max(1, k) . + \endverbatim + + \param[in] B + \verbatim + B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb]. + on exit is overwritten by the solution matrix X. + \endverbatim + + \param[in] ldb + \verbatim + ldb is INTEGER + On entry, ldb specifies the Leading dimension of B + ldb >= max(1, m) [RowMajor: ldb >= max(1, n)]. + \endverbatim + + */ +template< typename T > +void trsm( + CBLAS_ORDER layout, + CBLAS_SIDE side, + CBLAS_UPLO uplo, + CBLAS_TRANSPOSE trans, + CBLAS_DIAG diag, + int64_t m, + int64_t n, + T alpha, + T const *A, int64_t lda, + T *B, int64_t ldb ) +{ + cblas_trsm( layout, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} + +/*! @brief \b HEMM + + \verbatim + + HEMM performs solves one of the matrix-matrix operations for arbitrary data types + Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL, + SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16) + + C := alpha*A*B + beta*C + or + C := alpha*B*A + beta*C, + + where alpha is a scalar, A is an hermitian matrix + C and B are m by n matrices + \endverbatim + + \param[in] layout + \verbatim + layout is enum CBLAS_ORDER + + layout specifies Matrix storage as follows: + + layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor. + \endverbatim + + \param[in] side + \verbatim + side is enum CBLAS_SIDE + + side specifies specifies whether the hermitian matrix A + appears on the left or right in the operation as follows: + + side = CBLAS_SIDE::CblasLeft C := alpha*A*B + beta*C, + + side = CBLAS_SIDE::CblasRight C := alpha*B*A + beta*C + \endverbatim + + \param[in] uplo + \verbatim + uplo is enum CBLAS_UPLO + + uplo specifies specifies whether the upper or lower + triangular part of the hermitian matrix A is to be + referenced as follows: + + uplo = CBLAS_UPLO::CblasUpper Only the upper triangular part of the + hermitian matrix is to be referenced. + + uplo = CBLAS_UPLO::CblasLower Only the lower triangular part of the + hermitian matrix is to be referenced. + \endverbatim + + \param[in] m + \verbatim + m is INTEGER + On entry, m specifies the number of rows of the matrix + C. m must be at least zero. + \endverbatim + + \param[in] n + \verbatim + n is INTEGER + On entry, n specifies the number of columns of the matrix + C. n must be at least zero. + \endverbatim + + \param[in] alpha + \verbatim + alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 + On entry, alpha specifies the scalar alpha. + \endverbatim + + \param[in] A + \verbatim + A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + If side = CblasLeft: + the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda]. + If side = CblasRight: + the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda]. + \endverbatim + + \param[in] lda + \verbatim + lda is INTEGER + On entry, lda specifies the Leading dimension of A + If side = CblasLeft: lda >= max(1, m) . + If side = CblasRight:lda >= max(1, k) . + \endverbatim + + \param[in] B + \verbatim + B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb]. + \endverbatim + + \param[in] ldb + \verbatim + ldb is INTEGER + On entry, ldb specifies the Leading dimension of B + ldb >= max(1, m) [RowMajor: ldb >= max(1, n)]. + \endverbatim + + \param[in] beta + \verbatim + beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 + On entry, beta specifies the scalar beta. + If beta is zero, C need not be set on input + \endverbatim + + \param[in] C + \verbatim + C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + m-by-n , stored in an ldc-by-n array [RowMajor: m-by-ldc]. + \endverbatim + + \param[in] ldc + \verbatim + ldc is INTEGER + On entry, ldc specifies the Leading dimension of C + ldc >= max(1, m) [RowMajor: ldc >= max(1, n)]. + \endverbatim + + */ +template< typename T > +void hemm( + CBLAS_ORDER layout, + CBLAS_SIDE side, + CBLAS_UPLO uplo, + int64_t m, int64_t n, + T alpha, + T const *A, int64_t lda, + T const *B, int64_t ldb, + T beta, + T *C, int64_t ldc ) +{ + cblas_hemm( layout, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} + +/*! @brief \b SYMM + + \verbatim + + SYMM performs solves one of the matrix-matrix operations for arbitrary data types + Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL, + SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16) + + C := alpha*A*B + beta*C + or + C := alpha*B*A + beta*C, + + where alpha is a scalar, A is an symmetric matrix + C and B are m by n matrices + \endverbatim + + \param[in] layout + \verbatim + layout is enum CBLAS_ORDER + + layout specifies Matrix storage as follows: + + layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor. + \endverbatim + + \param[in] side + \verbatim + side is enum CBLAS_SIDE + + side specifies specifies whether the symmetric matrix A + appears on the left or right in the operation as follows: + + side = CBLAS_SIDE::CblasLeft C := alpha*A*B + beta*C, + + side = CBLAS_SIDE::CblasRight C := alpha*B*A + beta*C + \endverbatim + + \param[in] uplo + \verbatim + uplo is enum CBLAS_UPLO + + uplo specifies specifies whether the upper or lower + triangular part of the symmetric matrix A is to be + referenced as follows: + + uplo = CBLAS_UPLO::CblasUpper Only the upper triangular part of the + symmetric matrix is to be referenced. + + uplo = CBLAS_UPLO::CblasLower Only the lower triangular part of the + symmetric matrix is to be referenced. + \endverbatim + + \param[in] m + \verbatim + m is INTEGER + On entry, m specifies the number of rows of the matrix + C. m must be at least zero. + \endverbatim + + \param[in] n + \verbatim + n is INTEGER + On entry, n specifies the number of columns of the matrix + C. n must be at least zero. + \endverbatim + + \param[in] alpha + \verbatim + alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 + On entry, alpha specifies the scalar alpha. + \endverbatim + + \param[in] A + \verbatim + A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + If side = CblasLeft: + the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda]. + If side = CblasRight: + the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda]. + \endverbatim + + \param[in] lda + \verbatim + lda is INTEGER + On entry, lda specifies the Leading dimension of A + If side = CblasLeft: lda >= max(1, m) . + If side = CblasRight:lda >= max(1, k) . + \endverbatim + + \param[in] B + \verbatim + B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb]. + \endverbatim + + \param[in] ldb + \verbatim + ldb is INTEGER + On entry, ldb specifies the Leading dimension of B + ldb >= max(1, m) [RowMajor: ldb >= max(1, n)]. + \endverbatim + + \param[in] beta + \verbatim + beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 + On entry, beta specifies the scalar beta. + If beta is zero, C need not be set on input + \endverbatim + + \param[in] C + \verbatim + C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + m-by-n , stored in an ldc-by-n array [RowMajor: m-by-ldc]. + \endverbatim + + \param[in] ldc + \verbatim + ldc is INTEGER + On entry, ldc specifies the Leading dimension of C + ldc >= max(1, m) [RowMajor: ldc >= max(1, n)]. + \endverbatim + + */ +template< typename T > +void symm( + CBLAS_ORDER layout, + CBLAS_SIDE side, + CBLAS_UPLO uplo, + int64_t m, int64_t n, + T alpha, + T const *A, int64_t lda, + T const *B, int64_t ldb, + T beta, + T *C, int64_t ldc ) +{ + cblas_symm( layout, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} + +/*! @brief \b SYRK + + \verbatim + + SYRK performs one of the symmetric rank k operations for arbitrary data types + Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL, + SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16) + + C := alpha*A*A**T + beta*C, + + or + + C := alpha*A**T*A + beta*C, + + where alpha and beta are scalars, C is an n by n symmetric matrix + and A is an n by k matrix in the first case and a k by n matrix + in the second case. + \endverbatim + + \param[in] layout + \verbatim + layout is enum CBLAS_LAYOUT + + layout specifies Matrix storage as follows: + + layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor. + \endverbatim + + \param[in] uplo + \verbatim + uplo is enum CBLAS_UPLO + + uplo specifies specifies whether the upper or lower + triangular part of the array C is to be referenced + as follows: + + uplo = CBLAS_UPLO::CblasUpper Only the upper triangular part of C + is to be referenced. + + uplo = CBLAS_UPLO::CblasLower Only the lower triangular part of C + is to be referenced. + \endverbatim + + \param[in] trans + \verbatim + + trans is CBLAS_TRANSPOSE + On entry, trans specifies the operation to be used as follows: + + trans = CBLAS_TRANSPOSE::CblasNoTrans,C := alpha*A*A**T + beta*C. + + trans = CBLAS_TRANSPOSE::CblasTrans,C := alpha*A**T*A + beta*C. + \endverbatim + + \param[in] n + \verbatim + n is INTEGER + On entry, n specifies the order of the matrix C. n must be + at least zero. + \endverbatim + + \param[in] k + \verbatim + k is INTEGER + If trans = CblasNoTrans: k is number of columns of the matrix A. + Otherwise: k is number of rows of the matrix A. + k must be at least zero. + \endverbatim + + \param[in] alpha + \verbatim + alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 + On entry, alpha specifies the scalar alpha. + \endverbatim + + \param[in] A + \verbatim + A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + If transA = CblasNoTrans: + n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda]. + Otherwise: + k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda]. + \endverbatim + + \param[in] lda + \verbatim + lda is INTEGER + On entry, lda specifies the Leading dimension of A + If transA = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)]. + Otherwise: lda >= max(1, k) [RowMajor: lda >= max(1, n)]. + \endverbatim + + \param[in] beta + \verbatim + beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 + On entry, beta specifies the scalar alpha.When beta is + supplied as zero then C need not be set on input. + \endverbatim + + \param[in,out] C + \verbatim + C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : + The n-by-n symmetric matrix C, + stored in an ldc-by-n array [RowMajor: n-by-ldc]. + On exit, the array C is overwritten by the lower/upper + triangular part of the updated matrix. + \endverbatim + + \param[in] ldc + \verbatim + ldc is INTEGER + On entry, ldc specifies the first dimension of C + ldc >= max(1, n) + \endverbatim + + */ +template< typename T > +void syrk( + CBLAS_ORDER layout, + CBLAS_UPLO uplo, + CBLAS_TRANSPOSE trans, + int64_t n, int64_t k, + T alpha, + T const *A, int64_t lda, + T beta, + T *C, int64_t ldc ) +{ + cblas_syrk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +/*! @brief \b SYR2K + + \verbatim + + SYR2K performs one of the symmetric rank 2k operations for arbitrary data types + Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL, + SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16) + + C := alpha*A*B**T + alpha*B*A**T + beta*C, + + or + + C := alpha*A**T*B + alpha*B**T*A + beta*C, + + where alpha and beta are scalars, C is an n by n symmetric matrix + and A and B are n by k matrices in the first case and k by n matrices + in the second case. + \endverbatim + + \param[in] layout + \verbatim + layout is enum CBLAS_LAYOUT + + layout specifies Matrix storage as follows: + + layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor. + \endverbatim + + \param[in] uplo + \verbatim + uplo is enum CBLAS_UPLO + + uplo specifies specifies whether the upper or lower + triangular part of the array C is to be referenced + as follows: + + uplo = CBLAS_UPLO::CblasUpper Only the upper triangular part of C + is to be referenced. + + uplo = CBLAS_UPLO::CblasLower Only the lower triangular part of C + is to be referenced. + \endverbatim + + \param[in] trans + \verbatim + + trans is CBLAS_TRANSPOSE + On entry, trans specifies the operation to be used as follows: + + trans = CBLAS_TRANSPOSE::CblasNoTrans,C := alpha*A*B**T + alpha*B*A**T + + beta*C. + + trans = CBLAS_TRANSPOSE::CblasTrans, C := alpha*A**T*B + alpha*B**T*A + + beta*C. + \endverbatim + + \param[in] n + \verbatim + n is INTEGER + On entry, n specifies the order of the matrix C. n must be + at least zero. + \endverbatim + + \param[in] k + \verbatim + k is INTEGER + If trans = CblasNoTrans: k is number of columns of the matrices A & B. + Otherwise: k is number of rows of the matrices A & B. + k must be at least zero. + \endverbatim + + \param[in] alpha + \verbatim + alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 + On entry, alpha specifies the scalar alpha. + \endverbatim + + \param[in] A + \verbatim + A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + If trans = CblasNoTrans: + n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda]. + Otherwise: + k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda]. + \endverbatim + + \param[in] lda + \verbatim + lda is INTEGER + On entry, lda specifies the Leading dimension of A + If trans = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)]. + Otherwise: lda >= max(1, k) [RowMajor: lda >= max(1, n)]. + \endverbatim + + \param[in] B + \verbatim + B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + If trans = CblasNoTrans: + n-by-k , stored in an ldb-by-k array [RowMajor: n-by-ldb]. + Otherwise: + k-by-n , stored in an ldb-by-n array [RowMajor: k-by-ldb] + \endverbatim + + \param[in] ldb + \verbatim + ldb is INTEGER + On entry, ldb specifies the Leading dimension of B + If trans = CblasNoTrans: ldb >= max(1, n) [RowMajor: ldb >= max(1, k)]. + Otherwise: ldb >= max(1, k) [RowMajor: ldb >= max(1, n)]. + \endverbatim + + \param[in] beta + \verbatim + beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 + On entry, beta specifies the scalar alpha.When beta is + supplied as zero then C need not be set on input. + \endverbatim + + \param[in,out] C + \verbatim + C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : + The n-by-n symmetric matrix C, + stored in an ldc-by-n array [RowMajor: n-by-ldc]. + On exit, the array C is overwritten by the lower/upper + triangular part of the updated matrix. + \endverbatim + + \param[in] ldc + \verbatim + ldc is INTEGER + On entry, ldc specifies the first dimension of C + ldc >= max(1, n) + \endverbatim + + */ +template< typename T > +void syr2k( + CBLAS_ORDER layout, + CBLAS_UPLO uplo, + CBLAS_TRANSPOSE trans, + int64_t n, int64_t k, + T alpha, + T const *A, int64_t lda, + T const *B, int64_t ldb, + T beta, + T *C, int64_t ldc ) +{ + cblas_syr2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc ); +} + +/*! @brief \b HERK + + \verbatim + + HERK performs one of the hermitian rank k operations for arbitrary data types + Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL, + SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16) + + C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C, + + or + + C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C, + + where alpha and beta are real scalars, C is an n by n hermitian + matrix and A is an n by k matrix in the first case and + k by n matrix in the second case. + \endverbatim + + \param[in] layout + \verbatim + layout is enum CBLAS_LAYOUT + + layout specifies Matrix storage as follows: + + layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor. + \endverbatim + + \param[in] uplo + \verbatim + uplo is enum CBLAS_UPLO + + uplo specifies specifies whether the upper or lower + triangular part of the array C is to be referenced + as follows: + + uplo = CBLAS_UPLO::CblasUpper Only the upper triangular part of C + is to be referenced. + + uplo = CBLAS_UPLO::CblasLower Only the lower triangular part of C + is to be referenced. + \endverbatim + + \param[in] trans + \verbatim + + trans is CBLAS_TRANSPOSE + On entry, trans specifies the operation to be used as follows: + + trans = CBLAS_TRANSPOSE::CblasNoTrans, C := alpha*A*A**H + beta*C. + + trans = CBLAS_TRANSPOSE::CblasConjTrans,C := alpha*A**H*A + beta*C. + \endverbatim + + \param[in] n + \verbatim + n is INTEGER + On entry, n specifies the order of the matrix C. n must be + at least zero. + \endverbatim + + \param[in] k + \verbatim + k is INTEGER + If trans = CblasNoTrans: k is number of columns of the matrix A. + Otherwise: k is number of rows of the matrix A. + k must be at least zero. + \endverbatim + + \param[in] alpha + \verbatim + alpha is REAL/DOUBLE PRECISION + On entry, alpha specifies the scalar alpha. + \endverbatim + + \param[in] A + \verbatim + A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + If trans = CblasNoTrans: + n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda]. + Otherwise: + k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda]. + \endverbatim + + \param[in] lda + \verbatim + lda is INTEGER + On entry, lda specifies the Leading dimension of A + If trans = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)]. + Otherwise: lda >= max(1, k) [RowMajor: lda >= max(1, n)]. + \endverbatim + + \param[in] beta + \verbatim + beta is REAL/DOUBLE PRECISION + On entry, beta specifies the scalar alpha.When beta is + supplied as zero then C need not be set on input. + \endverbatim + + \param[in,out] C + \verbatim + C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : + The n-by-n Hermitian matrix C, + stored in an ldc-by-n array [RowMajor: n-by-ldc]. + On exit, the array C is overwritten by the lower/upper + triangular part of the updated matrix. + \endverbatim + + \param[in] ldc + \verbatim + ldc is INTEGER + On entry, ldc specifies the first dimension of C + ldc >= max(1, n) + \endverbatim + */ +template< typename T > +void herk( + CBLAS_ORDER layout, + CBLAS_UPLO uplo, + CBLAS_TRANSPOSE trans, + int64_t n, int64_t k, + real_type alpha, + T const *A, int64_t lda, + real_type beta, + T *C, int64_t ldc ) +{ + cblas_herk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc ); +} + +/*! @brief \b HER2K + + \verbatim + + HER2K performs one of the hermitian rank 2k operations for arbitrary data types + Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL, + SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16) + + C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C, + + or + + C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C, + + where alpha and beta are scalars with beta real, C is an n by n + hermitian matrix and A and B are n by k matrices in the first case + and k by n matrices in the second case. + \endverbatim + + \param[in] layout + \verbatim + layout is enum CBLAS_LAYOUT + + layout specifies Matrix storage as follows: + + layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor. + \endverbatim + + \param[in] uplo + \verbatim + uplo is enum CBLAS_UPLO + + uplo specifies specifies whether the upper or lower + triangular part of the array C is to be referenced + as follows: + + uplo = CBLAS_UPLO::CblasUpper Only the upper triangular part of C + is to be referenced. + + uplo = CBLAS_UPLO::CblasLower Only the lower triangular part of C + is to be referenced. + \endverbatim + + \param[in] trans + \verbatim + + trans is CBLAS_TRANSPOSE + On entry, trans specifies the operation to be used as follows: + + trans = CBLAS_TRANSPOSE::CblasNoTrans, C := alpha*A*B**H + + conjg( alpha )*B*A**H + + beta*C. + + trans = CBLAS_TRANSPOSE::CblasConjTrans,C := alpha*A**H*B + + conjg( alpha )*B**H*A + + beta*C. + \endverbatim + + \param[in] n + \verbatim + n is INTEGER + On entry, n specifies the order of the matrix C. n must be + at least zero. + \endverbatim + + \param[in] k + \verbatim + k is INTEGER + If trans = CblasNoTrans: k is number of columns of the matrices A & B. + Otherwise: k is number of rows of the matrices A & B. + k must be at least zero. + \endverbatim + + \param[in] alpha + \verbatim + alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 + On entry, alpha specifies the scalar alpha. + \endverbatim + + \param[in] A + \verbatim + A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + If trans = CblasNoTrans: + n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda]. + Otherwise: + k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda]. + \endverbatim + + \param[in] lda + \verbatim + lda is INTEGER + On entry, lda specifies the Leading dimension of A + If trans = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)]. + Otherwise: lda >= max(1, k) [RowMajor: lda >= max(1, n)]. + \endverbatim + + \param[in] B + \verbatim + B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : + If trans = CblasNoTrans: + n-by-k , stored in an ldb-by-k array [RowMajor: n-by-ldb]. + Otherwise: + k-by-n , stored in an ldb-by-n array [RowMajor: k-by-ldb] + \endverbatim + + \param[in] ldb + \verbatim + ldb is INTEGER + On entry, ldb specifies the Leading dimension of B + If trans = CblasNoTrans: ldb >= max(1, n) [RowMajor: ldb >= max(1, k)]. + Otherwise: ldb >= max(1, k) [RowMajor: ldb >= max(1, n)]. + \endverbatim + + \param[in] beta + \verbatim + beta is REAL/DOUBLE PRECISION + On entry, beta specifies the scalar alpha.When beta is + supplied as zero then C need not be set on input. + \endverbatim + + \param[in,out] C + \verbatim + C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : + The n-by-n Hermitian matrix C, + stored in an ldc-by-n array [RowMajor: n-by-ldc]. + On exit, the array C is overwritten by the lower/upper + triangular part of the updated matrix. + \endverbatim + + \param[in] ldc + \verbatim + ldc is INTEGER + On entry, ldc specifies the first dimension of C + ldc >= max(1, n) + \endverbatim + */ +template< typename T > +void her2k( + CBLAS_ORDER layout, + CBLAS_UPLO uplo, + CBLAS_TRANSPOSE trans, + int64_t n, int64_t k, + T alpha, + T const *A, int64_t lda, + T const *B, int64_t ldb, + real_type beta, + T *C, int64_t ldc ) +{ + cblas_her2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc ); +} } // namespace blis - #endif // #ifndef BLIS_HH diff --git a/cpp/cblas.hh b/cpp/cblas.hh index de67a6c17..7d70fda5d 100644 --- a/cpp/cblas.hh +++ b/cpp/cblas.hh @@ -5,7 +5,6 @@ extern "C" { #include #include } -typedef CBLAS_ORDER CBLAS_LAYOUT; #include @@ -18,7 +17,7 @@ namespace blis{ // ----------------------------------------------------------------------------- inline void cblas_gemm( - CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, + CBLAS_ORDER layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int m, int n, int k, float alpha, float const *A, int lda, @@ -34,7 +33,7 @@ cblas_gemm( inline void cblas_gemm( - CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, + CBLAS_ORDER layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int m, int n, int k, double alpha, double const *A, int lda, @@ -50,7 +49,7 @@ cblas_gemm( inline void cblas_gemm( - CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, + CBLAS_ORDER layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int m, int n, int k, std::complex alpha, std::complex const *A, int lda, @@ -66,7 +65,7 @@ cblas_gemm( inline void cblas_gemm( - CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, + CBLAS_ORDER layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int m, int n, int k, std::complex alpha, std::complex const *A, int lda, @@ -79,7 +78,365 @@ cblas_gemm( &alpha, A, lda, B, ldb, &beta, C, ldc ); } +// ----------------------------------------------------------------------------- +inline void +cblas_trsm( + CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE trans, CBLAS_DIAG diag, + int m, int n, + float alpha, + float const *A, int lda, + float *B, int ldb ) +{ + cblas_strsm( layout, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} +inline void +cblas_trsm( + CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE trans, CBLAS_DIAG diag, + int m, int n, + double alpha, + double const *A, int lda, + double *B, int ldb ) +{ + cblas_dtrsm( layout, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} + +inline void +cblas_trsm( + CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE trans, CBLAS_DIAG diag, + int m, int n, + std::complex alpha, + std::complex const *A, int lda, + std::complex *B, int ldb ) +{ + cblas_ctrsm( layout, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb ); +} + +inline void +cblas_trsm( + CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE trans, CBLAS_DIAG diag, + int m, int n, + std::complex alpha, + std::complex const *A, int lda, + std::complex *B, int ldb ) +{ + cblas_ztrsm( layout, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb ); +} + +// ----------------------------------------------------------------------------- +inline void +cblas_hemm( + CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo, + int m, int n, + float alpha, + float const *A, int lda, + float const *B, int ldb, + float beta, + float* C, int ldc ) +{ + cblas_ssymm( layout, side, uplo, m, n, + alpha, A, lda, B, ldb, + beta, C, ldc ); +} + +inline void +cblas_hemm( + CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo, + int m, int n, + double alpha, + double const *A, int lda, + double const *B, int ldb, + double beta, + double* C, int ldc ) +{ + cblas_dsymm( layout, side, uplo, m, n, + alpha, A, lda, B, ldb, + beta, C, ldc ); +} + +inline void +cblas_hemm( + CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo, + int m, int n, + std::complex alpha, + std::complex const *A, int lda, + std::complex const *B, int ldb, + std::complex beta, + std::complex* C, int ldc ) +{ + cblas_chemm( layout, side, uplo, m, n, + &alpha, A, lda, B, ldb, + &beta, C, ldc ); +} + +inline void +cblas_hemm( + CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo, + int m, int n, + std::complex alpha, + std::complex const *A, int lda, + std::complex const *B, int ldb, + std::complex beta, + std::complex* C, int ldc ) +{ + cblas_zhemm( layout, side, uplo, m, n, + &alpha, A, lda, B, ldb, + &beta, C, ldc ); +} + +// ----------------------------------------------------------------------------- +inline void +cblas_symm( + CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo, + int m, int n, + float alpha, + float const *A, int lda, + float const *B, int ldb, + float beta, + float* C, int ldc ) +{ + cblas_ssymm( layout, side, uplo, m, n, + alpha, A, lda, B, ldb, + beta, C, ldc ); +} + +inline void +cblas_symm( + CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo, + int m, int n, + double alpha, + double const *A, int lda, + double const *B, int ldb, + double beta, + double* C, int ldc ) +{ + cblas_dsymm( layout, side, uplo, m, n, + alpha, A, lda, B, ldb, + beta, C, ldc ); +} + +inline void +cblas_symm( + CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo, + int m, int n, + std::complex alpha, + std::complex const *A, int lda, + std::complex const *B, int ldb, + std::complex beta, + std::complex* C, int ldc ) +{ + cblas_csymm( layout, side, uplo, m, n, + &alpha, A, lda, B, ldb, + &beta, C, ldc ); +} + +inline void +cblas_symm( + CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo, + int m, int n, + std::complex alpha, + std::complex const *A, int lda, + std::complex const *B, int ldb, + std::complex beta, + std::complex* C, int ldc ) +{ + cblas_zsymm( layout, side, uplo, m, n, + &alpha, A, lda, B, ldb, + &beta, C, ldc ); +} + + +// ----------------------------------------------------------------------------- +inline void +cblas_syrk( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + float alpha, + float const *A, int lda, + float beta, + float* C, int ldc ) +{ + cblas_ssyrk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc ); +} + +inline void +cblas_syrk( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + double alpha, + double const *A, int lda, + double beta, + double* C, int ldc ) +{ + cblas_dsyrk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc ); +} + +inline void +cblas_syrk( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + std::complex alpha, + std::complex const *A, int lda, + std::complex beta, + std::complex* C, int ldc ) +{ + cblas_csyrk( layout, uplo, trans, n, k, &alpha, A, lda, &beta, C, ldc ); +} + +inline void +cblas_syrk( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + std::complex alpha, + std::complex const *A, int lda, + std::complex beta, + std::complex* C, int ldc ) +{ + cblas_zsyrk( layout, uplo, trans, n, k, &alpha, A, lda, &beta, C, ldc ); +} + +// ----------------------------------------------------------------------------- +inline void +cblas_herk( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + float alpha, + float const *A, int lda, + float beta, + float* C, int ldc ) +{ + cblas_ssyrk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc ); +} + +inline void +cblas_herk( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + double alpha, + double const *A, int lda, + double beta, + double* C, int ldc ) +{ + cblas_dsyrk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc ); +} + +inline void +cblas_herk( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + float alpha, // note: real + std::complex const *A, int lda, + float beta, // note: real + std::complex* C, int ldc ) +{ + cblas_cherk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc ); +} + +inline void +cblas_herk( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + double alpha, // note: real + std::complex const *A, int lda, + double beta, // note: real + std::complex* C, int ldc ) +{ + cblas_zherk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc ); +} + +// ----------------------------------------------------------------------------- +inline void +cblas_syr2k( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + float alpha, + float const *A, int lda, + float const *B, int ldb, + float beta, + float* C, int ldc ) +{ + cblas_ssyr2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc ); +} + +inline void +cblas_syr2k( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + double alpha, + double const *A, int lda, + double const *B, int ldb, + double beta, + double* C, int ldc ) +{ + cblas_dsyr2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc ); +} + +inline void +cblas_syr2k( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + std::complex alpha, + std::complex const *A, int lda, + std::complex const *B, int ldb, + std::complex beta, + std::complex* C, int ldc ) +{ + cblas_csyr2k( layout, uplo, trans, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc ); +} + +inline void +cblas_syr2k( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + std::complex alpha, + std::complex const *A, int lda, + std::complex const *B, int ldb, + std::complex beta, + std::complex* C, int ldc ) +{ + cblas_zsyr2k( layout, uplo, trans, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc ); +} + +// ----------------------------------------------------------------------------- +inline void +cblas_her2k( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + float alpha, + float const *A, int lda, + float const *B, int ldb, + float beta, + float* C, int ldc ) +{ + cblas_ssyr2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc ); +} + +inline void +cblas_her2k( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + double alpha, + double const *A, int lda, + double const *B, int ldb, + double beta, + double* C, int ldc ) +{ + cblas_dsyr2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc ); +} + +inline void +cblas_her2k( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + std::complex alpha, + std::complex const *A, int lda, + std::complex const *B, int ldb, + float beta, // note: real + std::complex* C, int ldc ) +{ + cblas_cher2k( layout, uplo, trans, n, k, &alpha, A, lda, B, ldb, beta, C, ldc ); +} + +inline void +cblas_her2k( + CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k, + std::complex alpha, + std::complex const *A, int lda, + std::complex const *B, int ldb, + double beta, // note: real + std::complex* C, int ldc ) +{ + cblas_zher2k( layout, uplo, trans, n, k, &alpha, A, lda, B, ldb, beta, C, ldc ); +} }//namespace blis #endif // #ifndef CBLAS_HH diff --git a/testcpp/test.sh b/testcpp/test.sh index b6f34e4b5..19ec47700 100755 --- a/testcpp/test.sh +++ b/testcpp/test.sh @@ -2,31 +2,29 @@ CWD=$(pwd) echo $CWD make clean make blis CFLAGS+="-DFLOAT" -numactl -C 1 ./test_gemm1_blis.x -make clean -make blis CFLAGS+="-DDOUBLE" -numactl -C 1 ./test_gemm1_blis.x -make clean -make blis CFLAGS+="-DSCOMPLEX" -numactl -C 1 ./test_gemm1_blis.x -make clean -make blis CFLAGS+="-DDCOMPLEX" -numactl -C 1 ./test_gemm1_blis.x - - -cd ../test/ -CWD=$(pwd) -echo $CWD -make clean -make blis CFLAGS+="-DFLOAT" numactl -C 1 ./test_gemm_blis.x +numactl -C 1 ./test_trsm_blis.x +numactl -C 1 ./test_hemm_blis.x +numactl -C 1 ./test_symm_blis.x + make clean make blis CFLAGS+="-DDOUBLE" numactl -C 1 ./test_gemm_blis.x +numactl -C 1 ./test_trsm_blis.x +numactl -C 1 ./test_hemm_blis.x +numactl -C 1 ./test_symm_blis.x + make clean make blis CFLAGS+="-DSCOMPLEX" numactl -C 1 ./test_gemm_blis.x +numactl -C 1 ./test_trsm_blis.x +numactl -C 1 ./test_hemm_blis.x +numactl -C 1 ./test_symm_blis.x + make clean make blis CFLAGS+="-DDCOMPLEX" numactl -C 1 ./test_gemm_blis.x +numactl -C 1 ./test_trsm_blis.x +numactl -C 1 ./test_hemm_blis.x +numactl -C 1 ./test_symm_blis.x diff --git a/testcpp/test_gemm.cc b/testcpp/test_gemm.cc index 05929636b..f251dd948 100644 --- a/testcpp/test_gemm.cc +++ b/testcpp/test_gemm.cc @@ -1,3 +1,36 @@ +/* + + BLISPP + C++ test driver for BLIS CPP gemm routine and reference cblas gemm routine. + + Copyright (C) 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + #include #include @@ -5,98 +38,174 @@ #include #include #include "blis.hh" +#include "test_gemm.hh" using namespace std; - -#define DIM 2 -template -void print_matrix(T * matrix , int m , int n) +#define PRINT +int computeError( + int ldc, + int ldc_ref, + int m, + int n, + double *C, + double *C_ref + ) { - for ( int L=0; L < m; L ++ ) { - for ( int J = 0; J < n; J ++ ) { - cout<< matrix[L * n + J]<<" "; - } - cout<<"\n"; - } + int i, j; + int ret = 0; + for ( i = 0; i < m; i ++ ) { + for ( j = 0; j < n; j ++ ) { + if ( C( i, j ) != C_ref( i, j ) ) { + printf( "C[ %d ][ %d ] != C_ref, %E, %E\n", i, j, C( i, j ), C_ref( i, j ) ); + ret = 1; + break; + } + } + } + return ret; } +void test_dgemm( ) +{ + int i, j, p, nx; + double *A, *B, *C, *C_ref; + double alpha, beta; + double tmp, error, flops; + double ref_beg, ref_time, bl_dgemm_beg, bl_dgemm_time; + int nrepeats; + int m,n,k; + int lda, ldb, ldc, ldc_ref; + double ref_rectime, bl_dgemm_rectime; + + alpha = 1.0; + beta = 0.0; + m = 5; + k = 6; + n = 4; + + A = (double*)malloc( sizeof(double) * m * k ); + B = (double*)malloc( sizeof(double) * k * n ); + + lda = m; + ldb = k; + ldc = m; + ldc_ref = m; + C = bl_malloc_aligned( ldc, n + 4, sizeof(double) ); + C_ref = (double*)malloc( sizeof(double) * m * n ); + + nrepeats = 3; + + srand48 (time(NULL)); + + // Randonly generate points in [ 0, 1 ]. + for ( p = 0; p < k; p ++ ) { + for ( i = 0; i < m; i ++ ) { + A( i, p ) = (double)( drand48() ); + } + } + for ( j = 0; j < n; j ++ ) { + for ( p = 0; p < k; p ++ ) { + B( p, j ) = (double)( drand48() ); + } + } + + for ( j = 0; j < n; j ++ ) { + for ( i = 0; i < m; i ++ ) { + C_ref( i, j ) = (double)( 0.0 ); + C( i, j ) = (double)( 0.0 ); + } + } +#ifdef PRINT + bl_dgemm_printmatrix(A, lda ,m,k); + bl_dgemm_printmatrix(B, ldb ,k,n); + bl_dgemm_printmatrix(C, ldc ,m,n); +#endif + for ( i = 0; i < nrepeats; i ++ ) { + bl_dgemm_beg = bl_clock(); + { + blis::gemm( + CblasColMajor, + CblasNoTrans, + CblasNoTrans, + m, + n, + k, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc + ); + } + bl_dgemm_time = bl_clock() - bl_dgemm_beg; + + if ( i == 0 ) { + bl_dgemm_rectime = bl_dgemm_time; + } else { + bl_dgemm_rectime = bl_dgemm_time < bl_dgemm_rectime ? bl_dgemm_time : bl_dgemm_rectime; + } + } + +#ifdef PRINT + bl_dgemm_printmatrix(C, ldc ,m,n); +#endif + for ( i = 0; i < nrepeats; i ++ ) { + ref_beg = bl_clock(); + { + cblas_dgemm( + CblasColMajor, + CblasNoTrans, + CblasNoTrans, + m, + n, + k, + alpha, + A, + lda, + B, + ldb, + beta, + C_ref, + ldc_ref + ); + } + ref_time = bl_clock() - ref_beg; + + if ( i == 0 ) { + ref_rectime = ref_time; + } else { + ref_rectime = ref_time < ref_rectime ? ref_time : ref_rectime; + } + } + +#ifdef PRINT + bl_dgemm_printmatrix(C_ref, ldc_ref ,m,n); +#endif + if(computeError(ldc, ldc_ref, m, n, C, C_ref )==1) + printf("%s TEST FAIL\n" ,__func__); + else + printf("%s TEST PASS\n" , __func__); + + + // Compute overall floating point operations. + flops = ( m * n / ( 1000.0 * 1000.0 * 1000.0 ) ) * ( 2 * k ); + + printf( "%5d\t %5d\t %5d\t %5.2lf\t %5.2lf\n", + m, n, k, flops / bl_dgemm_rectime, flops / ref_rectime ); + + free( A ); + free( B ); + free( C ); + free( C_ref ); +} + // ----------------------------------------------------------------------------- int main( int argc, char** argv ) { - int M, N, K, lda, ldb, ldc; - double a_d[DIM * DIM] = { 1.111, 2.222, 3.333, 4.444 }; - double b_d[DIM * DIM] = { 5.555, 6.666, 7.777, 8.888 }; - double c_d[DIM * DIM]; - double alpha_d, beta_d; - float a_f[DIM * DIM] = { 1.1, 2.2, 3.3, 4.4 }; - float b_f[DIM * DIM] = { 5.5, 6.6, 7.7, 8.8 }; - float c_f[DIM * DIM]; - float alpha_f, beta_f; - std::complex a_c[DIM * DIM]={{1, 2},{3, 4},{5,6},{7,8}}; - std::complex b_c[DIM * DIM]={{1, 2},{3, 4},{5,6},{7,8}}; - std::complex c_c[DIM * DIM]; - std::complex alpha_c, beta_c; - std::complex a_z[DIM * DIM]={{1.1, 2.2},{3.3, 4.4},{5.5,6.6},{7.7,8.8}}; - std::complex b_z[DIM * DIM]={{1.1, 2.2},{3.3, 4.4},{5.5,6.6},{7.7,8.8}}; - std::complex c_z[DIM * DIM]; - std::complex alpha_z, beta_z; - M = DIM; - N = M; - K = M; - lda = M; - ldb = K; - ldc = M; - alpha_d = 1.0; - beta_d = 0.0; - alpha_f = 1.0; - beta_f = 0.0; - alpha_c = {1.0,1.0}; - beta_c = {0.0,0.0}; - alpha_z = {1.0,1.0}; - beta_z = {0.0,0.0}; - - /*cblis_sgemm*/ - cout<<"a_f= \n"; - print_matrix(a_f , M , K); - cout<<"b_f= \n"; - print_matrix(b_f , K , N); - blis::gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha_f, a_f, - lda, b_f, ldb, beta_f, c_f, ldc); - cout<<"c_f= \n"; - print_matrix(c_f , M , N); - - - /*cblis_dgemm*/ - printf("a_d = \n"); - print_matrix(a_d , M , K); - printf("b_d = \n"); - print_matrix(b_d , K , N); - blis::gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha_d, a_d, - lda, b_d, ldb, beta_d, c_d, ldc); - printf("c_d = \n"); - print_matrix(c_d , M , N); - - - /*cblis_cgemm*/ - printf("a_c = \n"); - print_matrix>(a_c , M , K); - printf("b_c = \n"); - print_matrix>(b_c , K , N); - blis::gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha_c, a_c, - lda, b_c, ldb, beta_c, c_c, ldc); - printf("c_c = \n"); - print_matrix>(c_c , M , N); - - - /*cblis_zgemm*/ - printf("a_z = \n"); - print_matrix>(a_z , M , K); - printf("b_z = \n"); - print_matrix>(b_z , K , N); - blis::gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha_z, a_z, - lda, b_z, ldb, beta_z, c_z, ldc); - printf("c_z = \n"); - print_matrix>(c_z , M , N); - return 0; + test_dgemm( ); + return 0; } diff --git a/testcpp/test_gemm.hh b/testcpp/test_gemm.hh new file mode 100644 index 000000000..a4525b035 --- /dev/null +++ b/testcpp/test_gemm.hh @@ -0,0 +1,191 @@ +/* + * -------------------------------------------------------------------------- + * BLISLAB + * -------------------------------------------------------------------------- + * Copyright (C) 2016, The University of Texas at Austin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - Neither the name of The University of Texas nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * + * test_gemm.hh + * + * + * Purpose: + * this header file contains all function prototypes. + * + * Todo: + * + * + * Modification: + * + * + * */ + + +#ifndef TEST_GEMM_HH +#define TEST_GEMM_HH + +#include + +#include +#include + +// Determine the target operating system +#if defined(_WIN32) || defined(__CYGWIN__) +#define BL_OS_WINDOWS 1 +#elif defined(__APPLE__) || defined(__MACH__) +#define BL_OS_OSX 1 +#elif defined(__ANDROID__) +#define BL_OS_ANDROID 1 +#elif defined(__linux__) +#define BL_OS_LINUX 1 +#elif defined(__bgq__) +#define BL_OS_BGQ 1 +#elif defined(__bg__) +#define BL_OS_BGP 1 +#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__bsdi__) || defined(__DragonFly__) +#define BL_OS_BSD 1 +#else +#error "Cannot determine operating system" +#endif + +// gettimeofday() needs this. +#if BL_OS_WINDOWS + #include +#elif BL_OS_OSX + #include +#else + #include + #include +#endif + +//#include "bl_config.h" + +#define min( i, j ) ( (i)<(j) ? (i): (j) ) + +#define A( i, j ) A[ (j)*lda + (i) ] +#define B( i, j ) B[ (j)*ldb + (i) ] +#define C( i, j ) C[ (j)*ldc + (i) ] +#define C_ref( i, j ) C_ref[ (j)*ldc_ref + (i) ] +#define GEMM_SIMD_ALIGN_SIZE 32 +struct aux_s { + double *b_next; + float *b_next_s; + int ldr; + char *flag; + int pc; + int m; + int n; +}; +typedef struct aux_s aux_t; + +void bl_dgemm( + int m, + int n, + int k, + double *A, + int lda, + double *B, + int ldb, + double *C, + int ldc + ); + +/* + * + * + */ +double *bl_malloc_aligned( + int m, + int n, + int size + ) +{ + double *ptr; + int err; + + err = posix_memalign( (void**)&ptr, (size_t)GEMM_SIMD_ALIGN_SIZE, size * m * n ); + + if ( err ) { + printf( "bl_malloc_aligned(): posix_memalign() failures" ); + exit( 1 ); + } + + return ptr; +} + + + +/* + * + * + */ +void bl_dgemm_printmatrix( + double *A, + int lda, + int m, + int n + ) +{ + int i, j; + for ( i = 0; i < m; i ++ ) { + for ( j = 0; j < n; j ++ ) { + printf("%lf\t", A[j * lda + i]); + } + printf("\n"); + } + printf("\n"); +} + +/* + * The timer functions are copied directly from BLIS 0.2.0 + * + */ +static double gtod_ref_time_sec = 0.0; +double bl_clock_helper() +{ + double the_time, norm_sec; + struct timespec ts; + + clock_gettime( CLOCK_MONOTONIC, &ts ); + + if ( gtod_ref_time_sec == 0.0 ) + gtod_ref_time_sec = ( double ) ts.tv_sec; + + norm_sec = ( double ) ts.tv_sec - gtod_ref_time_sec; + + the_time = norm_sec + ts.tv_nsec * 1.0e-9; + + return the_time; +} + + +double bl_clock( void ) +{ + return bl_clock_helper(); +} + +#endif