From e7dbbb1194e9ae31a358653e1c2402f83dc0eb0e Mon Sep 17 00:00:00 2001
From: Chithra Sankar <Chithra.Sankar@amd.com>
Date: Mon, 19 Aug 2019 10:29:58 +0530
Subject: [PATCH] Level 3 BLAS CPP routines(Except trmm) + Doxygen Commenting +
 Test application for dgemm CPP API

Change-Id: I97d1203ce466f3adb991341c17db9070c1eaf104
---
 cpp/blis.hh          | 1240 +++++++++++++++++++++++++++++++++++++++---
 cpp/cblas.hh         |  367 ++++++++++++-
 testcpp/test.sh      |   32 +-
 testcpp/test_gemm.cc |  279 +++++++---
 testcpp/test_gemm.hh |  191 +++++++
 5 files changed, 1919 insertions(+), 190 deletions(-)
 create mode 100644 testcpp/test_gemm.hh
diff --git a/cpp/blis.hh b/cpp/blis.hh
index fc25ebff7..b3f637aa0 100644
--- a/cpp/blis.hh
+++ b/cpp/blis.hh
@@ -1,102 +1,1176 @@
+/******************************************************************************
+* Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*******************************************************************************/
+
+/*! @file blis.hh
+ *  blis.hh defines all the CPP templated public interfaces
+ *  */
 #ifndef BLIS_HH
 #define BLIS_HH
 
-#include "blis_util.hh"
 #include "cblas.hh"
+#include "blis_util.hh"
 #include <limits>
 
 namespace blis {
+/*! @brief \b GEMM
 
-// =============================================================================
-/// General matrix-matrix multiply,
-///     \f[ C = \alpha op(A) \times op(B) + \beta C, \f]
-/// where op(X) is one of
-///     \f[ op(X) = X,   \f]
-///     \f[ op(X) = X^T, \f]
-///     \f[ op(X) = X^H, \f]
-/// alpha and beta are scalars, and A, B, and C are matrices, with
-/// op(A) an m-by-k matrix, op(B) a k-by-n matrix, and C an m-by-n matrix.
-///
-/// Generic implementation for arbitrary data types.
-/// TODO: generic version not yet implemented.
-///
-/// @param[in] layout
-///     Matrix storage, Layout::ColMajor or Layout::RowMajor.
-///
-/// @param[in] transA
-///     The operation op(A) to be used:
-///     - Op::NoTrans:   \f$ op(A) = A.   \f$
-///     - Op::Trans:     \f$ op(A) = A^T. \f$
-///     - Op::ConjTrans: \f$ op(A) = A^H. \f$
-///
-/// @param[in] transB
-///     The operation op(B) to be used:
-///     - Op::NoTrans:   \f$ op(B) = B.   \f$
-///     - Op::Trans:     \f$ op(B) = B^T. \f$
-///     - Op::ConjTrans: \f$ op(B) = B^H. \f$
-///
-/// @param[in] m
-///     Number of rows of the matrix C and op(A). m >= 0.
-///
-/// @param[in] n
-///     Number of columns of the matrix C and op(B). n >= 0.
-///
-/// @param[in] k
-///     Number of columns of op(A) and rows of op(B). k >= 0.
-///
-/// @param[in] alpha
-///     Scalar alpha. If alpha is zero, A and B are not accessed.
-///
-/// @param[in] A
-///     - If transA = NoTrans:
-///       the m-by-k matrix A, stored in an lda-by-k array [RowMajor: m-by-lda].
-///     - Otherwise:
-///       the k-by-m matrix A, stored in an lda-by-m array [RowMajor: k-by-lda].
-///
-/// @param[in] lda
-///     Leading dimension of A.
-///     - If transA = NoTrans: lda >= max(1, m) [RowMajor: lda >= max(1, k)].
-///     - Otherwise:           lda >= max(1, k) [RowMajor: lda >= max(1, m)].
-///
-/// @param[in] B
-///     - If transB = NoTrans:
-///       the k-by-n matrix B, stored in an ldb-by-n array [RowMajor: k-by-ldb].
-///     - Otherwise:
-///       the n-by-k matrix B, stored in an ldb-by-k array [RowMajor: n-by-ldb].
-///
-/// @param[in] ldb
-///     Leading dimension of B.
-///     - If transB = NoTrans: ldb >= max(1, k) [RowMajor: ldb >= max(1, n)].
-///     - Otherwise:           ldb >= max(1, n) [RowMajor: ldb >= max(1, k)].
-///
-/// @param[in] beta
-///     Scalar beta. If beta is zero, C need not be set on input.
-///
-/// @param[in] C
-///     The m-by-n matrix C, stored in an ldc-by-n array [RowMajor: m-by-ldc].
-///
-/// @param[in] ldc
-///     Leading dimension of C. ldc >= max(1, m) [RowMajor: ldc >= max(1, n)].
-///
-/// @ingroup gemm
+  \verbatim
 
-template< typename TA, typename TB, typename TC >
+  GEMM  performs general matrix-matrix multiply for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  C := alpha*op( A )*op( B ) + beta*C,
+
+  where  op( X ) is one of
+
+  op( X ) = X   or   op( X ) = X**T   or   op( X ) = X**H,
+
+  alpha and beta are scalars, and A, B and C are matrices, with op( A )
+  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+  \endverbatim
+
+  \param[in] layout
+  \verbatim
+  layout is enum CBLAS_ORDER
+
+  layout specifies Matrix storage as follows:
+
+  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
+  \endverbatim
+
+  \param[in] transA
+  \verbatim
+  
+  transA is CBLAS_TRANSPOSE
+  On entry, transA specifies the form of op( A ) to be used in
+  the matrix multiplication as follows:
+
+  transA = CBLAS_TRANSPOSE::CblasNoTrans,  op( A ) = A.
+
+  transA = CBLAS_TRANSPOSE::CblasTrans,  op( A ) = A**T.
+
+  transA = CBLAS_TRANSPOSE::CblasConjTrans,  op( A ) = A**H.
+  \endverbatim
+
+  \param[in] transB
+  \verbatim
+  transB is CBLAS_TRANSPOSE
+  On entry, transB specifies the form of op( B ) to be used in
+  the matrix multiplication as follows:
+
+  transB = CBLAS_TRANSPOSE::CblasNoTrans,  op( B ) = B.
+
+  transB = CBLAS_TRANSPOSE::CblasTrans,  op( B ) = B**T.
+
+  transB = CBLAS_TRANSPOSE::CblasConjTrans,  op( B ) = B**H.
+  \endverbatim
+
+  \param[in] m
+  \verbatim
+  m is INTEGER
+  On entry,  m  specifies  the number  of rows  of the  matrix
+  op( A )  and of the  matrix  C.  m  must  be at least  zero.
+  \endverbatim
+
+  \param[in] n
+  \verbatim
+  n is INTEGER
+  On entry,  n  specifies the number  of columns of the matrix
+  op( B ) and the number of columns of the matrix C. n must be
+  at least zero.
+  \endverbatim
+
+  \param[in] k
+  \verbatim
+  k is INTEGER
+  On entry,  k  specifies  the number of columns of the matrix
+  op( A ) and the number of rows of the matrix op( B ). k must
+  be at least  zero.
+  \endverbatim
+
+  \param[in] alpha
+  \verbatim
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+  \endverbatim
+
+  \param[in] A
+  \verbatim
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If transA = CblasNoTrans:
+  m-by-k , stored in an lda-by-k array [RowMajor: m-by-lda].
+  Otherwise:
+  k-by-m , stored in an lda-by-m array [RowMajor: k-by-lda].
+  \endverbatim
+
+  \param[in] lda
+  \verbatim
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If transA = CblasNoTrans: lda >= max(1, m) [RowMajor: lda >= max(1, k)].
+  Otherwise:                lda >= max(1, k) [RowMajor: lda >= max(1, m)].
+  \endverbatim
+
+  \param[in] B
+  \verbatim
+  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If transA = CblasNoTrans:
+  k-by-n , stored in an ldb-by-n array [RowMajor: k-by-ldb].
+  Otherwise:
+  n-by-k , stored in an ldb-by-k array [RowMajor: n-by-ldb].
+  \endverbatim
+
+  \param[in] ldb
+  \verbatim
+  ldb is INTEGER
+  On entry, ldb specifies the Leading dimension of B
+  If transA = CblasNoTrans: ldb >= max(1, k) [RowMajor: ldb >= max(1, n)].
+  Otherwise:                ldb >= max(1, n) [RowMajor: ldb >= max(1, k)].
+  \endverbatim
+
+  \param[in] beta
+  \verbatim
+  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar alpha.When  beta  is
+  supplied as zero then C need not be set on input.
+  \endverbatim
+
+  \param[in,out] C
+  \verbatim
+  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension :
+  m-by-n stored in an ldc-by-n array [RowMajor: m-by-ldc].
+  Before entry, the leading  m by n  part of the array  C must
+  contain the matrix  C,  except when  beta  is zero, in which
+  case C need not be set on entry.
+  On exit, the array  C  is overwritten by the  m by n  matrix
+  ( alpha*op( A )*op( B ) + beta*C ).
+  \endverbatim
+
+  \param[in] ldc
+  \verbatim
+  ldc is INTEGER
+  On entry, ldc specifies the first dimension of C
+  ldc >= max(1, m) [RowMajor: ldc >= max(1, n)].
+  \endverbatim
+
+  */	
+template< typename T >
 void gemm(
-    CBLAS_LAYOUT layout,
+    CBLAS_ORDER layout,
     CBLAS_TRANSPOSE transA,
     CBLAS_TRANSPOSE transB,
     int64_t m, int64_t n, int64_t k,
-    scalar_type<TA, TB, TC> alpha,
-    TA const *A, int64_t lda,
-    TB const *B, int64_t ldb,
-    scalar_type<TA, TB, TC> beta,
-    TC       *C, int64_t ldc )
+    T alpha,
+    T const *A, int64_t lda,
+    T const *B, int64_t ldb,
+    T beta,
+    T       *C, int64_t ldc )
 {
-//    printf("In gemm.cpp\n");
     cblas_gemm(layout, transA, transB, m, n, k, alpha, A,lda, B, ldb, beta, C, ldc);
 
-};
+}
 
+/*! @brief \b TRSM
+
+  \verbatim
+
+  TRSM  performs solves one of the matrix equations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+      op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+
+  where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+  where  op( X ) is one of
+      
+	  op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
+
+  The matrix X is overwritten on B.
+  \endverbatim
+
+  \param[in] layout
+  \verbatim
+  layout is enum CBLAS_ORDER
+
+  layout specifies Matrix storage as follows:
+
+  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
+  \endverbatim
+
+  \param[in] side
+  \verbatim
+  side is enum CBLAS_SIDE
+
+  side specifies specifies whether op( A ) appears on the left
+  or right of X as follows:
+  
+  side = CBLAS_SIDE::CblasLeft   op( A )*X = alpha*B.
+	  
+  side = CBLAS_SIDE::CblasRight   op( A )*X = alpha*B.
+  \endverbatim
+
+  \param[in] uplo
+  \verbatim
+  uplo is enum CBLAS_UPLO
+
+  uplo specifies specifies whether the matrix A is an upper or
+  lower triangular matrix as follows:
+  
+  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix.
+	  
+  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
+  \endverbatim
+  
+  \param[in] trans
+  \verbatim
+
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the form of op( A ) to be used in
+  the matrix multiplication as follows:
+
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,  op( A ) = A.
+
+  trans = CBLAS_TRANSPOSE::CblasTrans,  op( A ) = A**T.
+
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,  op( A ) = A**H.
+  \endverbatim
+  
+  \param[in] diag
+  \verbatim
+  diag is enum CBLAS_DIAG
+
+  diag specifies specifies whether or not A is unit triangular
+  as follows:
+  
+  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.
+	  
+  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
+                                 triangular.
+  \endverbatim
+
+  \param[in] m
+  \verbatim
+  m is INTEGER
+  On entry,  m  specifies  the number  of rows  of the  matrix
+  B.  m  must  be at least  zero.
+  \endverbatim
+
+  \param[in] n
+  \verbatim
+  n is INTEGER
+  On entry,  n  specifies the number  of columns of the matrix
+  B. n must be at least zero.
+  \endverbatim
+
+  \param[in] alpha
+  \verbatim
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+  \endverbatim
+
+  \param[in] A
+  \verbatim
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If side = CblasLeft:
+  the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda].
+  If side = CblasRight:
+  the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda].
+  \endverbatim
+
+  \param[in] lda
+  \verbatim
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If side = CblasLeft: lda >= max(1, m) .
+  If side = CblasRight:lda >= max(1, k) .
+  \endverbatim
+
+  \param[in] B
+  \verbatim
+  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb]. 
+  on exit  is overwritten by the solution matrix  X.
+  \endverbatim
+
+  \param[in] ldb
+  \verbatim
+  ldb is INTEGER
+  On entry, ldb specifies the Leading dimension of B
+  ldb >= max(1, m) [RowMajor: ldb >= max(1, n)].
+  \endverbatim
+
+  */
+template< typename T >
+void trsm(
+    CBLAS_ORDER layout,
+    CBLAS_SIDE side,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    CBLAS_DIAG diag,
+    int64_t m,
+    int64_t n,
+    T alpha,
+    T const *A, int64_t lda,
+    T       *B, int64_t ldb )
+{
+    cblas_trsm( layout, side, uplo, trans, diag, m, n,  alpha, A, lda, B, ldb);
+}
+
+/*! @brief \b HEMM
+
+  \verbatim
+
+  HEMM  performs solves one of the matrix-matrix operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+      C := alpha*A*B + beta*C
+  or
+      C := alpha*B*A + beta*C,
+
+  where alpha is a scalar,  A is an hermitian matrix
+  C and B are m by n matrices
+  \endverbatim
+
+  \param[in] layout
+  \verbatim
+  layout is enum CBLAS_ORDER
+
+  layout specifies Matrix storage as follows:
+
+  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
+  \endverbatim
+
+  \param[in] side
+  \verbatim
+  side is enum CBLAS_SIDE
+
+  side specifies specifies whether the  hermitian matrix  A
+  appears on the  left or right  in the  operation as follows:
+
+  side = CBLAS_SIDE::CblasLeft   C := alpha*A*B + beta*C,
+
+  side = CBLAS_SIDE::CblasRight   C := alpha*B*A + beta*C
+  \endverbatim
+
+  \param[in] uplo
+  \verbatim
+  uplo is enum CBLAS_UPLO
+
+  uplo specifies specifies whether  the  upper  or  lower
+  triangular  part  of  the  hermitian  matrix   A  is  to  be
+  referenced as follows:
+
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of the
+                                  hermitian matrix is to be referenced.
+
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of the
+                                  hermitian matrix is to be referenced.
+  \endverbatim
+
+  \param[in] m
+  \verbatim
+  m is INTEGER
+  On entry,  m  specifies  the number  of rows  of the  matrix
+  C.  m  must  be at least  zero.
+  \endverbatim
+
+  \param[in] n
+  \verbatim
+  n is INTEGER
+  On entry,  n  specifies the number  of columns of the matrix
+  C. n must be at least zero.
+  \endverbatim
+
+  \param[in] alpha
+  \verbatim
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+  \endverbatim
+
+  \param[in] A
+  \verbatim
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If side = CblasLeft:
+  the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda].
+  If side = CblasRight:
+  the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda].
+  \endverbatim
+
+  \param[in] lda
+  \verbatim
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If side = CblasLeft: lda >= max(1, m) .
+  If side = CblasRight:lda >= max(1, k) .
+  \endverbatim
+
+  \param[in] B
+  \verbatim
+  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb].
+  \endverbatim
+
+  \param[in] ldb
+  \verbatim
+  ldb is INTEGER
+  On entry, ldb specifies the Leading dimension of B
+  ldb >= max(1, m) [RowMajor: ldb >= max(1, n)].
+  \endverbatim
+
+  \param[in] beta
+  \verbatim
+  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar beta.
+  If beta is zero, C need not be set on input
+  \endverbatim
+
+  \param[in] C
+  \verbatim
+  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  m-by-n , stored in an ldc-by-n array [RowMajor: m-by-ldc].
+  \endverbatim
+
+  \param[in] ldc
+  \verbatim
+  ldc is INTEGER
+  On entry, ldc specifies the Leading dimension of C
+  ldc >= max(1, m) [RowMajor: ldc >= max(1, n)].
+  \endverbatim
+
+  */
+template< typename T >
+void hemm(
+    CBLAS_ORDER layout,
+    CBLAS_SIDE side,
+    CBLAS_UPLO uplo,
+    int64_t m, int64_t n,
+    T alpha,
+    T const *A, int64_t lda,
+    T const *B, int64_t ldb,
+    T beta,
+    T       *C, int64_t ldc )
+{
+    cblas_hemm( layout, side, uplo, m, n,  alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+/*! @brief \b SYMM
+
+  \verbatim
+
+  SYMM  performs solves one of the matrix-matrix operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+      C := alpha*A*B + beta*C
+  or
+      C := alpha*B*A + beta*C,
+
+  where alpha is a scalar,  A is an symmetric matrix
+  C and B are m by n matrices
+  \endverbatim
+
+  \param[in] layout
+  \verbatim
+  layout is enum CBLAS_ORDER
+
+  layout specifies Matrix storage as follows:
+
+  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
+  \endverbatim
+
+  \param[in] side
+  \verbatim
+  side is enum CBLAS_SIDE
+
+  side specifies specifies whether the  symmetric matrix  A
+  appears on the  left or right  in the  operation as follows:
+
+  side = CBLAS_SIDE::CblasLeft   C := alpha*A*B + beta*C,
+
+  side = CBLAS_SIDE::CblasRight   C := alpha*B*A + beta*C
+  \endverbatim
+
+  \param[in] uplo
+  \verbatim
+  uplo is enum CBLAS_UPLO
+
+  uplo specifies specifies whether  the  upper  or  lower
+  triangular  part  of  the  symmetric  matrix   A  is  to  be
+  referenced as follows:
+
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of the
+                                  symmetric matrix is to be referenced.
+
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of the
+                                  symmetric matrix is to be referenced.
+  \endverbatim
+
+  \param[in] m
+  \verbatim
+  m is INTEGER
+  On entry,  m  specifies  the number  of rows  of the  matrix
+  C.  m  must  be at least  zero.
+  \endverbatim
+
+  \param[in] n
+  \verbatim
+  n is INTEGER
+  On entry,  n  specifies the number  of columns of the matrix
+  C. n must be at least zero.
+  \endverbatim
+
+  \param[in] alpha
+  \verbatim
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+  \endverbatim
+
+  \param[in] A
+  \verbatim
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If side = CblasLeft:
+  the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda].
+  If side = CblasRight:
+  the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda].
+  \endverbatim
+
+  \param[in] lda
+  \verbatim
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If side = CblasLeft: lda >= max(1, m) .
+  If side = CblasRight:lda >= max(1, k) .
+  \endverbatim
+
+  \param[in] B
+  \verbatim
+  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb].
+  \endverbatim
+
+  \param[in] ldb
+  \verbatim
+  ldb is INTEGER
+  On entry, ldb specifies the Leading dimension of B
+  ldb >= max(1, m) [RowMajor: ldb >= max(1, n)].
+  \endverbatim
+
+  \param[in] beta
+  \verbatim
+  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar beta.
+  If beta is zero, C need not be set on input
+  \endverbatim
+
+  \param[in] C
+  \verbatim
+  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  m-by-n , stored in an ldc-by-n array [RowMajor: m-by-ldc].
+  \endverbatim
+
+  \param[in] ldc
+  \verbatim
+  ldc is INTEGER
+  On entry, ldc specifies the Leading dimension of C
+  ldc >= max(1, m) [RowMajor: ldc >= max(1, n)].
+  \endverbatim
+
+  */
+template< typename T >
+void symm(
+    CBLAS_ORDER layout,
+    CBLAS_SIDE side,
+    CBLAS_UPLO uplo,
+    int64_t m, int64_t n,
+    T alpha,
+    T const *A, int64_t lda,
+    T const *B, int64_t ldb,
+    T beta,
+    T       *C, int64_t ldc )
+{
+    cblas_symm( layout, side, uplo, m, n,  alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+/*! @brief \b SYRK
+
+  \verbatim
+
+  SYRK  performs one of the symmetric rank k operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  C := alpha*A*A**T + beta*C,
+
+  or
+
+  C := alpha*A**T*A + beta*C,
+
+  where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+  and  A  is an  n by k  matrix in the first case and a  k by n  matrix
+  in the second case.
+  \endverbatim
+
+  \param[in] layout
+  \verbatim
+  layout is enum CBLAS_LAYOUT
+
+  layout specifies Matrix storage as follows:
+
+  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
+  \endverbatim
+
+  \param[in] uplo
+  \verbatim
+  uplo is enum CBLAS_UPLO
+
+  uplo specifies specifies whether  the  upper  or  lower
+  triangular  part  of  the  array C  is  to  be  referenced
+  as follows:
+
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of C
+                                  is to be referenced.
+
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of C
+                                  is to be referenced.
+  \endverbatim
+
+  \param[in] trans
+  \verbatim
+
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be used as follows:
+
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,C := alpha*A*A**T + beta*C.
+
+  trans = CBLAS_TRANSPOSE::CblasTrans,C := alpha*A**T*A + beta*C.
+  \endverbatim
+
+  \param[in] n
+  \verbatim
+  n is INTEGER
+  On entry,  n specifies the order of the matrix C.  n must be
+  at least zero.
+  \endverbatim
+
+  \param[in] k
+  \verbatim
+  k is INTEGER
+  If trans = CblasNoTrans: k is number of columns of the matrix A.
+  Otherwise:               k is number of rows    of the matrix A.
+  k must be at least  zero.
+  \endverbatim
+
+  \param[in] alpha
+  \verbatim
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+  \endverbatim
+
+  \param[in] A
+  \verbatim
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If transA = CblasNoTrans:
+  n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda].
+  Otherwise:
+  k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda].
+  \endverbatim
+
+  \param[in] lda
+  \verbatim
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If transA = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)].
+  Otherwise:                lda >= max(1, k) [RowMajor: lda >= max(1, n)].
+  \endverbatim
+
+  \param[in] beta
+  \verbatim
+  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar alpha.When  beta  is
+  supplied as zero then C need not be set on input.
+  \endverbatim
+
+  \param[in,out] C
+  \verbatim
+  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension :
+  The n-by-n symmetric matrix C,
+  stored in an ldc-by-n array [RowMajor: n-by-ldc].
+  On exit, the array  C  is overwritten by the  lower/upper
+  triangular part of the updated matrix.
+  \endverbatim
+
+  \param[in] ldc
+  \verbatim
+  ldc is INTEGER
+  On entry, ldc specifies the first dimension of C
+  ldc >= max(1, n)
+  \endverbatim
+
+  */
+template< typename T >
+void syrk(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    int64_t n, int64_t k,
+    T alpha,
+    T const *A, int64_t lda,
+    T beta,
+    T       *C, int64_t ldc )
+{
+    cblas_syrk( layout, uplo, trans,  n, k,  alpha, A, lda, beta, C, ldc);
+}
+
+/*! @brief \b SYR2K
+
+  \verbatim
+
+  SYR2K  performs one of the symmetric rank 2k operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  C := alpha*A*B**T + alpha*B*A**T + beta*C,
+
+  or
+
+  C := alpha*A**T*B + alpha*B**T*A + beta*C,
+
+  where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+  and  A  and B are n by k  matrices in the first case and k by n  matrices
+  in the second case.
+  \endverbatim
+
+  \param[in] layout
+  \verbatim
+  layout is enum CBLAS_LAYOUT
+
+  layout specifies Matrix storage as follows:
+
+  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
+  \endverbatim
+
+  \param[in] uplo
+  \verbatim
+  uplo is enum CBLAS_UPLO
+
+  uplo specifies specifies whether  the  upper  or  lower
+  triangular  part  of  the  array C  is  to  be  referenced
+  as follows:
+
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of C
+                                  is to be referenced.
+
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of C
+                                  is to be referenced.
+  \endverbatim
+
+  \param[in] trans
+  \verbatim
+
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be used as follows:
+
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,C := alpha*A*B**T + alpha*B*A**T
+                                             + beta*C.
+
+  trans = CBLAS_TRANSPOSE::CblasTrans,  C := alpha*A**T*B + alpha*B**T*A
+                                             + beta*C.
+  \endverbatim
+
+  \param[in] n
+  \verbatim
+  n is INTEGER
+  On entry,  n specifies the order of the matrix C.  n must be
+  at least zero.
+  \endverbatim
+
+  \param[in] k
+  \verbatim
+  k is INTEGER
+  If trans = CblasNoTrans: k is number of columns of the matrices A & B.
+  Otherwise:               k is number of rows    of the matrices A & B.
+  k must be at least  zero.
+  \endverbatim
+
+  \param[in] alpha
+  \verbatim
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+  \endverbatim
+
+  \param[in] A
+  \verbatim
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If trans = CblasNoTrans:
+  n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda].
+  Otherwise:
+  k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda].
+  \endverbatim
+
+  \param[in] lda
+  \verbatim
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If trans = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)].
+  Otherwise:                lda >= max(1, k) [RowMajor: lda >= max(1, n)].
+  \endverbatim
+
+  \param[in] B
+  \verbatim
+  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If trans = CblasNoTrans:
+  n-by-k , stored in an ldb-by-k array [RowMajor: n-by-ldb].
+  Otherwise:
+  k-by-n , stored in an ldb-by-n array [RowMajor: k-by-ldb]
+  \endverbatim
+
+  \param[in] ldb
+  \verbatim
+  ldb is INTEGER
+  On entry, ldb specifies the Leading dimension of B
+  If trans = CblasNoTrans: ldb >= max(1, n) [RowMajor: ldb >= max(1, k)].
+  Otherwise:               ldb >= max(1, k) [RowMajor: ldb >= max(1, n)].
+  \endverbatim
+
+  \param[in] beta
+  \verbatim
+  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar alpha.When  beta  is
+  supplied as zero then C need not be set on input.
+  \endverbatim
+
+  \param[in,out] C
+  \verbatim
+  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension :
+  The n-by-n symmetric matrix C,
+  stored in an ldc-by-n array [RowMajor: n-by-ldc].
+  On exit, the array  C  is overwritten by the  lower/upper
+  triangular part of the updated matrix.
+  \endverbatim
+
+  \param[in] ldc
+  \verbatim
+  ldc is INTEGER
+  On entry, ldc specifies the first dimension of C
+  ldc >= max(1, n)
+  \endverbatim
+
+  */
+template< typename T >
+void syr2k(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    int64_t n, int64_t k,
+    T alpha,
+    T const *A, int64_t lda,
+    T const *B, int64_t ldb,
+    T beta,
+    T       *C, int64_t ldc )
+{
+    cblas_syr2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc );	
+}
+
+/*! @brief \b HERK
+
+  \verbatim
+
+  HERK  performs one of the hermitian rank k operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+    C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C,
+  
+  or
+  
+    C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C,
+
+  where  alpha and beta  are real scalars,  C is an  n by n  hermitian 
+  matrix and  A  is an n by k  matrix in the first case and 
+  k by n  matrix in the second case.
+  \endverbatim
+
+  \param[in] layout
+  \verbatim
+  layout is enum CBLAS_LAYOUT
+
+  layout specifies Matrix storage as follows:
+
+  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
+  \endverbatim
+
+  \param[in] uplo
+  \verbatim
+  uplo is enum CBLAS_UPLO
+
+  uplo specifies specifies whether  the  upper  or  lower
+  triangular  part  of  the  array C  is  to  be  referenced 
+  as follows:
+
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of C
+                                  is to be referenced.
+
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of C
+                                  is to be referenced.
+  \endverbatim
+
+  \param[in] trans
+  \verbatim
+
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be used as follows:
+
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,  C := alpha*A*A**H + beta*C.
+
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,C := alpha*A**H*A + beta*C.
+  \endverbatim
+
+  \param[in] n
+  \verbatim
+  n is INTEGER
+  On entry,  n specifies the order of the matrix C.  n must be
+  at least zero.
+  \endverbatim
+
+  \param[in] k
+  \verbatim
+  k is INTEGER
+  If trans = CblasNoTrans: k is number of columns of the matrix   A. 
+  Otherwise:               k is number of rows    of the matrix   A.
+  k must be at least  zero.
+  \endverbatim
+
+  \param[in] alpha
+  \verbatim
+  alpha is REAL/DOUBLE PRECISION
+  On entry, alpha specifies the scalar alpha.
+  \endverbatim
+
+  \param[in] A
+  \verbatim
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If trans = CblasNoTrans:
+  n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda].
+  Otherwise:
+  k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda].
+  \endverbatim
+
+  \param[in] lda
+  \verbatim
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If trans = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)].
+  Otherwise:                lda >= max(1, k) [RowMajor: lda >= max(1, n)].
+  \endverbatim
+  
+  \param[in] beta
+  \verbatim
+  beta is REAL/DOUBLE PRECISION
+  On entry, beta specifies the scalar alpha.When  beta  is
+  supplied as zero then C need not be set on input.
+  \endverbatim
+
+  \param[in,out] C
+  \verbatim
+  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension :
+  The n-by-n Hermitian matrix C,
+  stored in an ldc-by-n array [RowMajor: n-by-ldc].
+  On exit, the array  C  is overwritten by the  lower/upper 
+  triangular part of the updated matrix.
+  \endverbatim
+
+  \param[in] ldc
+  \verbatim
+  ldc is INTEGER
+  On entry, ldc specifies the first dimension of C
+  ldc >= max(1, n) 
+  \endverbatim
+  */
+template< typename T >
+void herk(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    int64_t n, int64_t k,
+    real_type<T> alpha,
+    T const *A, int64_t lda,
+    real_type<T> beta,
+    T       *C, int64_t ldc )
+{
+    cblas_herk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc );	
+}
+
+/*! @brief \b HER2K
+
+  \verbatim
+
+  HER2K  performs one of the hermitian rank 2k operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+    C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C,
+  
+  or
+  
+    C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C,
+
+  where  alpha and beta  are scalars with  beta  real,  C is an  n by n 
+  hermitian matrix and  A  and B are n by k  matrices in the first case 
+  and k by n  matrices in the second case.
+  \endverbatim
+
+  \param[in] layout
+  \verbatim
+  layout is enum CBLAS_LAYOUT
+
+  layout specifies Matrix storage as follows:
+
+  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
+  \endverbatim
+
+  \param[in] uplo
+  \verbatim
+  uplo is enum CBLAS_UPLO
+
+  uplo specifies specifies whether  the  upper  or  lower
+  triangular  part  of  the  array C  is  to  be  referenced 
+  as follows:
+
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of C
+                                  is to be referenced.
+
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of C
+                                  is to be referenced.
+  \endverbatim
+
+  \param[in] trans
+  \verbatim
+
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be used as follows:
+
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,  C := alpha*A*B**H          +
+                                             conjg( alpha )*B*A**H +
+                                             beta*C.
+
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,C := alpha*A**H*B          +
+                                             conjg( alpha )*B**H*A +  
+                                             beta*C.
+  \endverbatim
+
+  \param[in] n
+  \verbatim
+  n is INTEGER
+  On entry,  n specifies the order of the matrix C.  n must be
+  at least zero.
+  \endverbatim
+
+  \param[in] k
+  \verbatim
+  k is INTEGER
+  If trans = CblasNoTrans: k is number of columns of the matrices A & B. 
+  Otherwise:               k is number of rows    of the matrices A & B.
+  k must be at least  zero.
+  \endverbatim
+
+  \param[in] alpha
+  \verbatim
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+  \endverbatim
+
+  \param[in] A
+  \verbatim
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If trans = CblasNoTrans:
+  n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda].
+  Otherwise:
+  k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda].
+  \endverbatim
+
+  \param[in] lda
+  \verbatim
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If trans = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)].
+  Otherwise:                lda >= max(1, k) [RowMajor: lda >= max(1, n)].
+  \endverbatim
+
+  \param[in] B
+  \verbatim
+  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If trans = CblasNoTrans:
+  n-by-k , stored in an ldb-by-k array [RowMajor: n-by-ldb].
+  Otherwise:
+  k-by-n , stored in an ldb-by-n array [RowMajor: k-by-ldb]
+  \endverbatim
+
+  \param[in] ldb
+  \verbatim
+  ldb is INTEGER
+  On entry, ldb specifies the Leading dimension of B
+  If trans = CblasNoTrans: ldb >= max(1, n) [RowMajor: ldb >= max(1, k)].
+  Otherwise:               ldb >= max(1, k) [RowMajor: ldb >= max(1, n)].
+  \endverbatim
+  
+  \param[in] beta
+  \verbatim
+  beta is REAL/DOUBLE PRECISION
+  On entry, beta specifies the scalar alpha.When  beta  is
+  supplied as zero then C need not be set on input.
+  \endverbatim
+
+  \param[in,out] C
+  \verbatim
+  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension :
+  The n-by-n Hermitian matrix C,
+  stored in an ldc-by-n array [RowMajor: n-by-ldc].
+  On exit, the array  C  is overwritten by the  lower/upper 
+  triangular part of the updated matrix.
+  \endverbatim
+
+  \param[in] ldc
+  \verbatim
+  ldc is INTEGER
+  On entry, ldc specifies the first dimension of C
+  ldc >= max(1, n) 
+  \endverbatim
+  */
+template< typename T >
+void her2k(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    int64_t n, int64_t k,
+    T alpha,
+    T const *A, int64_t lda,
+    T const *B, int64_t ldb,
+    real_type<T> beta,
+    T       *C, int64_t ldc )
+{
+    cblas_her2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc );	
+}
 }  // namespace blis
-
 #endif        //  #ifndef BLIS_HH
diff --git a/cpp/cblas.hh b/cpp/cblas.hh
index de67a6c17..7d70fda5d 100644
--- a/cpp/cblas.hh
+++ b/cpp/cblas.hh
@@ -5,7 +5,6 @@ extern "C" {
 #include <cblas.h>
 #include <blis.h>
 }
-typedef CBLAS_ORDER CBLAS_LAYOUT;
 
 
 #include <complex>
@@ -18,7 +17,7 @@ namespace blis{
 // -----------------------------------------------------------------------------
 inline void
 cblas_gemm(
-    CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB,
+    CBLAS_ORDER layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB,
     int m, int n, int k,
     float  alpha,
     float const *A, int lda,
@@ -34,7 +33,7 @@ cblas_gemm(
 
 inline void
 cblas_gemm(
-    CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB,
+    CBLAS_ORDER layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB,
     int m, int n, int k,
     double  alpha,
     double const *A, int lda,
@@ -50,7 +49,7 @@ cblas_gemm(
 
 inline void
 cblas_gemm(
-    CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB,
+    CBLAS_ORDER layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB,
     int m, int n, int k,
     std::complex<float>  alpha,
     std::complex<float> const *A, int lda,
@@ -66,7 +65,7 @@ cblas_gemm(
 
 inline void
 cblas_gemm(
-    CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB,
+    CBLAS_ORDER layout, CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB,
     int m, int n, int k,
     std::complex<double>  alpha,
     std::complex<double> const *A, int lda,
@@ -79,7 +78,365 @@ cblas_gemm(
                  &alpha, A, lda, B, ldb,
                  &beta,  C, ldc );
 }
+// -----------------------------------------------------------------------------
+inline void
+cblas_trsm(
+    CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans, CBLAS_DIAG diag,
+    int m, int n,
+    float alpha,
+    float const *A, int lda,
+    float       *B, int ldb )
+{
+    cblas_strsm( layout, side, uplo, trans, diag, m, n,  alpha, A, lda, B, ldb);
+}
 
+inline void
+cblas_trsm(
+    CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans, CBLAS_DIAG diag,
+    int m, int n,
+    double alpha,
+    double const *A, int lda,
+    double       *B, int ldb )
+{
+    cblas_dtrsm( layout, side, uplo, trans, diag, m, n,  alpha, A, lda, B, ldb);
+}
+
+inline void
+cblas_trsm(
+    CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans, CBLAS_DIAG diag,
+    int m, int n,
+    std::complex<float> alpha,
+    std::complex<float> const *A, int lda,
+    std::complex<float>       *B, int ldb )
+{
+    cblas_ctrsm( layout, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb );
+}
+
+inline void
+cblas_trsm(
+    CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans, CBLAS_DIAG diag,
+    int m, int n,
+    std::complex<double> alpha,
+    std::complex<double> const *A, int lda,
+    std::complex<double>       *B, int ldb )
+{
+    cblas_ztrsm( layout, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb );
+}
+
+// -----------------------------------------------------------------------------
+inline void
+cblas_hemm(
+    CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+    int m, int n,
+    float  alpha,
+    float const *A, int lda,
+    float const *B, int ldb,
+    float  beta,
+    float* C, int ldc )
+{
+    cblas_ssymm( layout, side, uplo, m, n,
+                 alpha, A, lda, B, ldb,
+                 beta,  C, ldc );
+}
+
+inline void
+cblas_hemm(
+    CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+    int m, int n,
+    double  alpha,
+    double const *A, int lda,
+    double const *B, int ldb,
+    double  beta,
+    double* C, int ldc )
+{
+    cblas_dsymm( layout, side, uplo, m, n,
+                 alpha, A, lda, B, ldb,
+                 beta,  C, ldc );
+}
+
+inline void
+cblas_hemm(
+    CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+    int m, int n,
+    std::complex<float>  alpha,
+    std::complex<float> const *A, int lda,
+    std::complex<float> const *B, int ldb,
+    std::complex<float>  beta,
+    std::complex<float>* C, int ldc )
+{
+    cblas_chemm( layout, side, uplo, m, n,
+                 &alpha, A, lda, B, ldb,
+                 &beta,  C, ldc );
+}
+
+inline void
+cblas_hemm(
+    CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+    int m, int n,
+    std::complex<double>  alpha,
+    std::complex<double> const *A, int lda,
+    std::complex<double> const *B, int ldb,
+    std::complex<double>  beta,
+    std::complex<double>* C, int ldc )
+{
+    cblas_zhemm( layout, side, uplo, m, n,
+                 &alpha, A, lda, B, ldb,
+                 &beta,  C, ldc );
+}
+
+// -----------------------------------------------------------------------------
+inline void
+cblas_symm(
+    CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+    int m, int n,
+    float  alpha,
+    float const *A, int lda,
+    float const *B, int ldb,
+    float  beta,
+    float* C, int ldc )
+{
+    cblas_ssymm( layout, side, uplo, m, n,
+                 alpha, A, lda, B, ldb,
+                 beta,  C, ldc );
+}
+
+inline void
+cblas_symm(
+    CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+    int m, int n,
+    double  alpha,
+    double const *A, int lda,
+    double const *B, int ldb,
+    double  beta,
+    double* C, int ldc )
+{
+    cblas_dsymm( layout, side, uplo, m, n,
+                 alpha, A, lda, B, ldb,
+                 beta,  C, ldc );
+}
+
+inline void
+cblas_symm(
+    CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+    int m, int n,
+    std::complex<float>  alpha,
+    std::complex<float> const *A, int lda,
+    std::complex<float> const *B, int ldb,
+    std::complex<float>  beta,
+    std::complex<float>* C, int ldc )
+{
+    cblas_csymm( layout, side, uplo, m, n,
+                 &alpha, A, lda, B, ldb,
+                 &beta,  C, ldc );
+}
+
+inline void
+cblas_symm(
+    CBLAS_ORDER layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+    int m, int n,
+    std::complex<double>  alpha,
+    std::complex<double> const *A, int lda,
+    std::complex<double> const *B, int ldb,
+    std::complex<double>  beta,
+    std::complex<double>* C, int ldc )
+{
+    cblas_zsymm( layout, side, uplo, m, n,
+                 &alpha, A, lda, B, ldb,
+                 &beta,  C, ldc );
+}
+
+
+// -----------------------------------------------------------------------------
+inline void
+cblas_syrk(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    float alpha,
+    float const *A, int lda,
+    float beta,
+    float*       C, int ldc )
+{
+    cblas_ssyrk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc );
+}
+
+inline void
+cblas_syrk(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    double alpha,
+    double const *A, int lda,
+    double beta,
+    double*       C, int ldc )
+{
+    cblas_dsyrk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc );
+}
+
+inline void
+cblas_syrk(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    std::complex<float> alpha,
+    std::complex<float> const *A, int lda,
+    std::complex<float> beta,
+    std::complex<float>*       C, int ldc )
+{
+    cblas_csyrk( layout, uplo, trans, n, k, &alpha, A, lda, &beta, C, ldc );
+}
+
+inline void
+cblas_syrk(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    std::complex<double> alpha,
+    std::complex<double> const *A, int lda,
+    std::complex<double> beta,
+    std::complex<double>* C, int ldc )
+{
+    cblas_zsyrk( layout, uplo, trans, n, k, &alpha, A, lda, &beta, C, ldc );
+}
+
+// -----------------------------------------------------------------------------
+inline void
+cblas_herk(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    float alpha,
+    float const *A, int lda,
+    float beta,
+    float*       C, int ldc )
+{
+    cblas_ssyrk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc );
+}
+
+inline void
+cblas_herk(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    double alpha,
+    double const *A, int lda,
+    double beta,
+    double*       C, int ldc )
+{
+    cblas_dsyrk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc );
+}
+
+inline void
+cblas_herk(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    float alpha,  // note: real
+    std::complex<float> const *A, int lda,
+    float beta,   // note: real
+    std::complex<float>*       C, int ldc )
+{
+    cblas_cherk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc );
+}
+
+inline void
+cblas_herk(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    double alpha,  // note: real
+    std::complex<double> const *A, int lda,
+    double beta,   // note: real
+    std::complex<double>* C, int ldc )
+{
+    cblas_zherk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc );
+}
+
+// -----------------------------------------------------------------------------
+inline void
+cblas_syr2k(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    float alpha,
+    float const *A, int lda,
+    float const *B, int ldb,
+    float beta,
+    float*       C, int ldc )
+{
+    cblas_ssyr2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc );
+}
+
+inline void
+cblas_syr2k(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    double alpha,
+    double const *A, int lda,
+    double const *B, int ldb,
+    double beta,
+    double*       C, int ldc )
+{
+    cblas_dsyr2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc );
+}
+
+inline void
+cblas_syr2k(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    std::complex<float> alpha,
+    std::complex<float> const *A, int lda,
+    std::complex<float> const *B, int ldb,
+    std::complex<float> beta,
+    std::complex<float>*       C, int ldc )
+{
+    cblas_csyr2k( layout, uplo, trans, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc );
+}
+
+inline void
+cblas_syr2k(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    std::complex<double> alpha,
+    std::complex<double> const *A, int lda,
+    std::complex<double> const *B, int ldb,
+    std::complex<double> beta,
+    std::complex<double>* C, int ldc )
+{
+    cblas_zsyr2k( layout, uplo, trans, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc );
+}
+
+// -----------------------------------------------------------------------------
+inline void
+cblas_her2k(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    float alpha,
+    float const *A, int lda,
+    float const *B, int ldb,
+    float beta,
+    float*       C, int ldc )
+{
+    cblas_ssyr2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc );
+}
+
+inline void
+cblas_her2k(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    double alpha,
+    double const *A, int lda,
+    double const *B, int ldb,
+    double beta,
+    double*       C, int ldc )
+{
+    cblas_dsyr2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc );
+}
+
+inline void
+cblas_her2k(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    std::complex<float> alpha,
+    std::complex<float> const *A, int lda,
+    std::complex<float> const *B, int ldb,
+    float beta,  // note: real
+    std::complex<float>*       C, int ldc )
+{
+    cblas_cher2k( layout, uplo, trans, n, k, &alpha, A, lda, B, ldb, beta, C, ldc );
+}
+
+inline void
+cblas_her2k(
+    CBLAS_ORDER layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, int n, int k,
+    std::complex<double> alpha,
+    std::complex<double> const *A, int lda,
+    std::complex<double> const *B, int ldb,
+    double beta,  // note: real
+    std::complex<double>* C, int ldc )
+{
+    cblas_zher2k( layout, uplo, trans, n, k, &alpha, A, lda, B, ldb, beta, C, ldc );
+}
 }//namespace blis
 
 #endif        //  #ifndef CBLAS_HH
diff --git a/testcpp/test.sh b/testcpp/test.sh
index b6f34e4b5..19ec47700 100755
--- a/testcpp/test.sh
+++ b/testcpp/test.sh
@@ -2,31 +2,29 @@ CWD=$(pwd)
 echo $CWD
 make clean
 make blis CFLAGS+="-DFLOAT"
-numactl -C 1 ./test_gemm1_blis.x
-make clean
-make blis CFLAGS+="-DDOUBLE"
-numactl -C 1 ./test_gemm1_blis.x
-make clean
-make blis CFLAGS+="-DSCOMPLEX"
-numactl -C 1 ./test_gemm1_blis.x
-make clean
-make blis CFLAGS+="-DDCOMPLEX"
-numactl -C 1 ./test_gemm1_blis.x
-
-
-cd ../test/
-CWD=$(pwd)
-echo $CWD
-make clean
-make blis CFLAGS+="-DFLOAT"
 numactl -C 1 ./test_gemm_blis.x
+numactl -C 1 ./test_trsm_blis.x
+numactl -C 1 ./test_hemm_blis.x
+numactl -C 1 ./test_symm_blis.x
+
 make clean
 make blis CFLAGS+="-DDOUBLE"
 numactl -C 1 ./test_gemm_blis.x
+numactl -C 1 ./test_trsm_blis.x
+numactl -C 1 ./test_hemm_blis.x
+numactl -C 1 ./test_symm_blis.x
+
 make clean
 make blis CFLAGS+="-DSCOMPLEX"
 numactl -C 1 ./test_gemm_blis.x
+numactl -C 1 ./test_trsm_blis.x
+numactl -C 1 ./test_hemm_blis.x
+numactl -C 1 ./test_symm_blis.x
+
 make clean
 make blis CFLAGS+="-DDCOMPLEX"
 numactl -C 1 ./test_gemm_blis.x
+numactl -C 1 ./test_trsm_blis.x
+numactl -C 1 ./test_hemm_blis.x
+numactl -C 1 ./test_symm_blis.x
 
diff --git a/testcpp/test_gemm.cc b/testcpp/test_gemm.cc
index 05929636b..f251dd948 100644
--- a/testcpp/test_gemm.cc
+++ b/testcpp/test_gemm.cc
@@ -1,3 +1,36 @@
+/*
+
+   BLISPP
+   C++ test driver for BLIS CPP gemm routine and reference cblas gemm routine.
+
+   Copyright (C) 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
 #include <complex>
 
 #include <stdio.h>
@@ -5,98 +38,174 @@
 #include <string.h>
 #include <unistd.h>
 #include "blis.hh"
+#include "test_gemm.hh"
 
 using namespace std;
-
-#define DIM 2
-template <typename T>
-void print_matrix(T * matrix , int m , int n)
+#define PRINT
+int computeError(
+        int    ldc,
+        int    ldc_ref,
+        int    m,
+        int    n,
+        double *C,
+        double *C_ref
+        )
 {
-	for ( int L=0; L < m; L ++ ) {
-		for ( int J = 0; J < n; J ++ ) {
-			cout<< matrix[L * n + J]<<" ";
-		}
-		cout<<"\n";
-	}
+    int    i, j;
+    int ret = 0;
+    for ( i = 0; i < m; i ++ ) {
+        for ( j = 0; j < n; j ++ ) {
+            if (  C( i, j ) != C_ref( i, j ) ) {
+                printf( "C[ %d ][ %d ] != C_ref, %E, %E\n", i, j, C( i, j ), C_ref( i, j ) );
+		ret = 1;
+                break;
+            }
+        }
+    }
+    return ret;
 
 }
+void test_dgemm(  ) 
+{
+    int    i, j, p, nx;
+    double *A, *B, *C, *C_ref;
+    double alpha, beta;
+    double tmp, error, flops;
+    double ref_beg, ref_time, bl_dgemm_beg, bl_dgemm_time;
+    int    nrepeats;
+    int m,n,k;
+    int    lda, ldb, ldc, ldc_ref;
+    double ref_rectime, bl_dgemm_rectime;
+
+    alpha = 1.0;
+    beta = 0.0;
+    m = 5;
+    k = 6;
+    n = 4;
+
+    A    = (double*)malloc( sizeof(double) * m * k );
+    B    = (double*)malloc( sizeof(double) * k * n );
+
+    lda = m;
+    ldb = k;
+    ldc     = m;
+    ldc_ref = m;
+    C     = bl_malloc_aligned( ldc, n + 4, sizeof(double) );
+    C_ref = (double*)malloc( sizeof(double) * m * n );
+
+    nrepeats = 3;
+
+    srand48 (time(NULL));
+
+    // Randonly generate points in [ 0, 1 ].
+    for ( p = 0; p < k; p ++ ) {
+        for ( i = 0; i < m; i ++ ) {
+            A( i, p ) = (double)( drand48() );	
+        }
+    }
+    for ( j = 0; j < n; j ++ ) {
+        for ( p = 0; p < k; p ++ ) {
+            B( p, j ) = (double)( drand48() );
+        }
+    }
+
+    for ( j = 0; j < n; j ++ ) {
+        for ( i = 0; i < m; i ++ ) {
+            C_ref( i, j ) = (double)( 0.0 );	
+                C( i, j ) = (double)( 0.0 );	
+        }
+    }
+#ifdef PRINT
+    bl_dgemm_printmatrix(A, lda ,m,k);
+    bl_dgemm_printmatrix(B, ldb ,k,n);
+    bl_dgemm_printmatrix(C, ldc ,m,n);
+#endif
+    for ( i = 0; i < nrepeats; i ++ ) {
+        bl_dgemm_beg = bl_clock();
+        {
+		blis::gemm(
+		    CblasColMajor,
+		    CblasNoTrans,
+		    CblasNoTrans,
+                    m,
+                    n,
+                    k,
+		    alpha,
+                    A,
+                    lda,
+                    B,
+                    ldb,
+		    beta,
+                    C,
+                    ldc
+                    );
+        }
+        bl_dgemm_time = bl_clock() - bl_dgemm_beg;
+
+        if ( i == 0 ) {
+            bl_dgemm_rectime = bl_dgemm_time;
+        } else {
+            bl_dgemm_rectime = bl_dgemm_time < bl_dgemm_rectime ? bl_dgemm_time : bl_dgemm_rectime;
+        }
+    }
+
+#ifdef PRINT
+    bl_dgemm_printmatrix(C, ldc ,m,n);
+#endif
+    for ( i = 0; i < nrepeats; i ++ ) {
+        ref_beg = bl_clock();
+        {
+		cblas_dgemm(
+		    CblasColMajor,
+		    CblasNoTrans,
+		    CblasNoTrans,
+                    m,
+                    n,
+                    k,
+		    alpha,
+                    A,
+                    lda,
+                    B,
+                    ldb,
+		    beta,
+                    C_ref,
+                    ldc_ref
+                    );
+        }
+        ref_time = bl_clock() - ref_beg;
+
+        if ( i == 0 ) {
+            ref_rectime = ref_time;
+        } else {
+            ref_rectime = ref_time < ref_rectime ? ref_time : ref_rectime;
+        }
+    }
+
+#ifdef PRINT
+    bl_dgemm_printmatrix(C_ref, ldc_ref ,m,n);
+#endif
+    if(computeError(ldc, ldc_ref, m, n, C, C_ref )==1)
+	    printf("%s TEST FAIL\n" ,__func__);
+    else
+	    printf("%s TEST PASS\n" , __func__);
+
+
+    // Compute overall floating point operations.
+    flops = ( m * n / ( 1000.0 * 1000.0 * 1000.0 ) ) * ( 2 * k );
+
+    printf( "%5d\t %5d\t %5d\t %5.2lf\t %5.2lf\n", 
+            m, n, k, flops / bl_dgemm_rectime, flops / ref_rectime );
+
+    free( A     );
+    free( B     );
+    free( C     );
+    free( C_ref );
+}
+
 // -----------------------------------------------------------------------------
 int main( int argc, char** argv )
 {
-	int M, N, K, lda, ldb, ldc;
-	double a_d[DIM * DIM] = { 1.111, 2.222, 3.333, 4.444 };
-	double b_d[DIM * DIM] = { 5.555, 6.666, 7.777, 8.888 };
-	double c_d[DIM * DIM];
-	double alpha_d, beta_d;
-	float a_f[DIM * DIM] = { 1.1, 2.2, 3.3, 4.4 };
-	float b_f[DIM * DIM] = { 5.5, 6.6, 7.7, 8.8 };
-	float c_f[DIM * DIM];
-	float alpha_f, beta_f;
-	std::complex<float> a_c[DIM * DIM]={{1, 2},{3, 4},{5,6},{7,8}};
-	std::complex<float> b_c[DIM * DIM]={{1, 2},{3, 4},{5,6},{7,8}};
-	std::complex<float> c_c[DIM * DIM];
-	std::complex<float> alpha_c, beta_c;
-	std::complex<double> a_z[DIM * DIM]={{1.1, 2.2},{3.3, 4.4},{5.5,6.6},{7.7,8.8}};
-	std::complex<double> b_z[DIM * DIM]={{1.1, 2.2},{3.3, 4.4},{5.5,6.6},{7.7,8.8}};
-	std::complex<double> c_z[DIM * DIM];
-	std::complex<double> alpha_z, beta_z;
-	M = DIM;
-	N = M;
-	K = M;
-	lda = M;
-	ldb = K;
-	ldc = M;
-	alpha_d = 1.0;
-	beta_d = 0.0;
-	alpha_f = 1.0;
-	beta_f = 0.0;
-	alpha_c = {1.0,1.0};
-	beta_c = {0.0,0.0};
-	alpha_z = {1.0,1.0};
-	beta_z = {0.0,0.0};
-
-	/*cblis_sgemm*/	
-	cout<<"a_f= \n";
-	print_matrix<float>(a_f , M , K);
-	cout<<"b_f= \n";
-	print_matrix<float>(b_f , K , N);
-	blis::gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha_f, a_f,
-			lda, b_f, ldb, beta_f, c_f, ldc);
-	cout<<"c_f= \n";
-	print_matrix<float>(c_f , M , N);
-
-
-	/*cblis_dgemm*/	
-	printf("a_d = \n");
-	print_matrix<double>(a_d , M , K);
-	printf("b_d = \n");
-	print_matrix<double>(b_d , K , N);
-	blis::gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha_d, a_d,
-			lda, b_d, ldb, beta_d, c_d, ldc);
-	printf("c_d = \n");
-	print_matrix<double>(c_d , M , N);
-
-
-	/*cblis_cgemm*/	
-	printf("a_c = \n");
-	print_matrix<std::complex<float>>(a_c , M , K);
-	printf("b_c = \n");
-	print_matrix<std::complex<float>>(b_c , K , N);
-	blis::gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha_c, a_c,
-			lda, b_c, ldb, beta_c, c_c, ldc);
-	printf("c_c = \n");
-	print_matrix<std::complex<float>>(c_c , M , N);
-
-
-	/*cblis_zgemm*/	
-	printf("a_z = \n");
-	print_matrix<std::complex<double>>(a_z , M , K);
-	printf("b_z = \n");
-	print_matrix<std::complex<double>>(b_z , K , N);
-	blis::gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha_z, a_z,
-			lda, b_z, ldb, beta_z, c_z, ldc);
-	printf("c_z = \n");
-	print_matrix<std::complex<double>>(c_z , M , N);
-	return 0;
+    test_dgemm( );
+    return 0;
 
 }
diff --git a/testcpp/test_gemm.hh b/testcpp/test_gemm.hh
new file mode 100644
index 000000000..a4525b035
--- /dev/null
+++ b/testcpp/test_gemm.hh
@@ -0,0 +1,191 @@
+/*
+ * --------------------------------------------------------------------------
+ * BLISLAB 
+ * --------------------------------------------------------------------------
+ * Copyright (C) 2016, The University of Texas at Austin
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  - Neither the name of The University of Texas nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * test_gemm.hh
+ *
+ *
+ * Purpose:
+ * this header file contains all function prototypes.
+ *
+ * Todo:
+ *
+ *
+ * Modification:
+ *
+ * 
+ * */
+
+
+#ifndef TEST_GEMM_HH
+#define TEST_GEMM_HH
+
+#include <math.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// Determine the target operating system
+#if defined(_WIN32) || defined(__CYGWIN__)
+#define BL_OS_WINDOWS 1
+#elif defined(__APPLE__) || defined(__MACH__)
+#define BL_OS_OSX 1
+#elif defined(__ANDROID__)
+#define BL_OS_ANDROID 1
+#elif defined(__linux__)
+#define BL_OS_LINUX 1
+#elif defined(__bgq__)
+#define BL_OS_BGQ 1
+#elif defined(__bg__)
+#define BL_OS_BGP 1
+#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+      defined(__bsdi__) || defined(__DragonFly__)
+#define BL_OS_BSD 1
+#else
+#error "Cannot determine operating system"
+#endif
+
+// gettimeofday() needs this.
+#if BL_OS_WINDOWS
+  #include <time.h>
+#elif BL_OS_OSX
+  #include <mach/mach_time.h>
+#else
+  #include <sys/time.h>
+  #include <time.h>
+#endif
+
+//#include "bl_config.h"
+
+#define min( i, j ) ( (i)<(j) ? (i): (j) )
+
+#define A( i, j )     A[ (j)*lda + (i) ]
+#define B( i, j )     B[ (j)*ldb + (i) ]
+#define C( i, j )     C[ (j)*ldc + (i) ]
+#define C_ref( i, j ) C_ref[ (j)*ldc_ref + (i) ]
+#define GEMM_SIMD_ALIGN_SIZE 32
+struct aux_s {
+    double *b_next;
+    float  *b_next_s;
+    int    ldr;
+    char   *flag;
+    int    pc;
+    int    m;
+    int    n;
+};
+typedef struct aux_s aux_t;
+
+void bl_dgemm(
+        int    m,
+        int    n,
+        int    k,
+        double *A,
+        int    lda,
+        double *B,
+        int    ldb,
+        double *C,
+        int    ldc
+        );
+
+/*
+ *
+ *
+ */ 
+double *bl_malloc_aligned(
+        int    m,
+        int    n,
+        int    size
+        )
+{
+    double *ptr;
+    int    err;
+
+    err = posix_memalign( (void**)&ptr, (size_t)GEMM_SIMD_ALIGN_SIZE, size * m * n );
+
+    if ( err ) {
+        printf( "bl_malloc_aligned(): posix_memalign() failures" );
+        exit( 1 );    
+    }
+
+    return ptr;
+}
+
+
+
+/*
+ *
+ *
+ */
+void bl_dgemm_printmatrix(
+        double *A,
+        int    lda,
+        int    m,
+        int    n
+        )
+{
+    int    i, j;
+    for ( i = 0; i < m; i ++ ) {
+        for ( j = 0; j < n; j ++ ) {
+            printf("%lf\t", A[j * lda + i]);
+        }
+        printf("\n");
+    }
+    printf("\n");
+}
+
+/*
+ * The timer functions are copied directly from BLIS 0.2.0
+ *
+ */
+static double gtod_ref_time_sec = 0.0;
+double bl_clock_helper()
+{
+    double the_time, norm_sec;
+    struct timespec ts;
+
+    clock_gettime( CLOCK_MONOTONIC, &ts );
+
+    if ( gtod_ref_time_sec == 0.0 )
+        gtod_ref_time_sec = ( double ) ts.tv_sec;
+
+    norm_sec = ( double ) ts.tv_sec - gtod_ref_time_sec;
+
+    the_time = norm_sec + ts.tv_nsec * 1.0e-9;
+
+    return the_time;
+}
+
+
+double bl_clock( void )
+{
+	return bl_clock_helper();
+}
+
+#endif