diff --git a/cpp/blis.hh b/cpp/blis.hh
index fc25ebff7..602c094fd 100644
--- a/cpp/blis.hh
+++ b/cpp/blis.hh
@@ -1,102 +1,3824 @@
+/******************************************************************************
+* Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*******************************************************************************/
+
+/*! @file blis.hh
+ *  blis.hh defines all the BLAS CPP templated public interfaces
+ *  */
 #ifndef BLIS_HH
 #define BLIS_HH
 
-#include "blis_util.hh"
 #include "cblas.hh"
-#include <limits>
+#include "blis_util.hh"
 
 namespace blis {
 
-// =============================================================================
-/// General matrix-matrix multiply,
-///     \f[ C = \alpha op(A) \times op(B) + \beta C, \f]
-/// where op(X) is one of
-///     \f[ op(X) = X,   \f]
-///     \f[ op(X) = X^T, \f]
-///     \f[ op(X) = X^H, \f]
-/// alpha and beta are scalars, and A, B, and C are matrices, with
-/// op(A) an m-by-k matrix, op(B) a k-by-n matrix, and C an m-by-n matrix.
-///
-/// Generic implementation for arbitrary data types.
-/// TODO: generic version not yet implemented.
-///
-/// @param[in] layout
-///     Matrix storage, Layout::ColMajor or Layout::RowMajor.
-///
-/// @param[in] transA
-///     The operation op(A) to be used:
-///     - Op::NoTrans:   \f$ op(A) = A.   \f$
-///     - Op::Trans:     \f$ op(A) = A^T. \f$
-///     - Op::ConjTrans: \f$ op(A) = A^H. \f$
-///
-/// @param[in] transB
-///     The operation op(B) to be used:
-///     - Op::NoTrans:   \f$ op(B) = B.   \f$
-///     - Op::Trans:     \f$ op(B) = B^T. \f$
-///     - Op::ConjTrans: \f$ op(B) = B^H. \f$
-///
-/// @param[in] m
-///     Number of rows of the matrix C and op(A). m >= 0.
-///
-/// @param[in] n
-///     Number of columns of the matrix C and op(B). n >= 0.
-///
-/// @param[in] k
-///     Number of columns of op(A) and rows of op(B). k >= 0.
-///
-/// @param[in] alpha
-///     Scalar alpha. If alpha is zero, A and B are not accessed.
-///
-/// @param[in] A
-///     - If transA = NoTrans:
-///       the m-by-k matrix A, stored in an lda-by-k array [RowMajor: m-by-lda].
-///     - Otherwise:
-///       the k-by-m matrix A, stored in an lda-by-m array [RowMajor: k-by-lda].
-///
-/// @param[in] lda
-///     Leading dimension of A.
-///     - If transA = NoTrans: lda >= max(1, m) [RowMajor: lda >= max(1, k)].
-///     - Otherwise:           lda >= max(1, k) [RowMajor: lda >= max(1, m)].
-///
-/// @param[in] B
-///     - If transB = NoTrans:
-///       the k-by-n matrix B, stored in an ldb-by-n array [RowMajor: k-by-ldb].
-///     - Otherwise:
-///       the n-by-k matrix B, stored in an ldb-by-k array [RowMajor: n-by-ldb].
-///
-/// @param[in] ldb
-///     Leading dimension of B.
-///     - If transB = NoTrans: ldb >= max(1, k) [RowMajor: ldb >= max(1, n)].
-///     - Otherwise:           ldb >= max(1, n) [RowMajor: ldb >= max(1, k)].
-///
-/// @param[in] beta
-///     Scalar beta. If beta is zero, C need not be set on input.
-///
-/// @param[in] C
-///     The m-by-n matrix C, stored in an ldc-by-n array [RowMajor: m-by-ldc].
-///
-/// @param[in] ldc
-///     Leading dimension of C. ldc >= max(1, m) [RowMajor: ldc >= max(1, n)].
-///
-/// @ingroup gemm
+/*! \brief Construct plane rotation for arbitrary data types 
 
-template< typename TA, typename TB, typename TC >
+  \b Purpose:	
+
+  ROTG constructs a plane rotation that eliminates b for arbitrary data types, such that \n
+
+  [ z ] = [  c  s ] [ a ] \n
+  [ 0 ]   [ -s  c ] [ b ] \n
+  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
+  
+  \param[in, out] a
+  SINGLE/DOUBLE PRECISION REAL
+  On entry, scalar a. On exit, set to z.
+ 
+  \param[in, out] b
+  SINGLE/DOUBLE PRECISION REAL
+  On entry, scalar b. On exit, set to s, 1/c, or 0.
+ 
+  \param[out] c
+  Cosine of rotation; SINGLE/DOUBLE PRECISION REAL.
+ 
+  \param[out] s
+  Sine of rotation; SINGLE/DOUBLE PRECISION REAL.
+  */
+template< typename T >
+void rotg(
+    T *a,
+    T *b,
+    T *c,
+    T *s )
+{   // Thin wrapper: performs no computation of its own and returns nothing.
+    cblas_rotg(a, b, c, s);  // all four pointers forwarded unchanged; cblas_rotg is presumably the per-type wrapper from cblas.hh -- confirm
+}
+
+/*! \brief Construct the modified givens transformation matrix for arbitrary data types 
+
+  \b Purpose:	
+
+  ROTMG constructs a modified (fast) plane rotation, H, that eliminates b, such that \n
+  [ z ] = H [ sqrt(d1)    0  ] [ a ] \n
+  [ 0 ]     [  0    sqrt(d2) ] [ b ] \n
+  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
+
+  \param[in, out] d1
+  SINGLE/DOUBLE PRECISION REAL
+  sqrt(d1) is scaling factor for vector x.
+ 
+  \param[in, out] d2
+  SINGLE/DOUBLE PRECISION REAL
+  sqrt(d2) is scaling factor for vector y.
+ 
+  \param[in, out] a
+  On entry, scalar a. On exit, set to z. SINGLE/DOUBLE PRECISION REAL.
+ 
+  \param[in] b
+  On entry, scalar b. SINGLE/DOUBLE PRECISION REAL.  
+  
+  \param[out] param
+  SINGLE/DOUBLE PRECISION REAL array, dimension (5),giving parameters 
+  of modified plane rotation 
+  param(1)=DFLAG
+  param(2)=DH11
+  param(3)=DH21
+  param(4)=DH12
+  param(5)=DH22
+  */
+template< typename T >
+void rotmg(
+    T *d1,
+    T *d2,
+    T *a,
+    T  b,          // passed by value, so b can only be an input
+    T  param[5] )  // array parameter decays to T*; the bound 5 documents the expected length and is not enforced
+{
+    cblas_rotmg(d1, d2, a, b, param );  // forwarded unchanged; cblas_rotmg is presumably the per-type wrapper from cblas.hh -- confirm
+}
+
+/*! \brief Apply plane rotation for arbitrary data types 
+
+  \b Purpose:	
+
+  ROT applies a plane rotation:  \n
+  [ x^T ]   [  c  s ] [ x^T ]  \n
+  [ y^T ] = [ -s  c ] [ y^T ]  \n
+  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
+
+  \param[in] n
+  Number of elements in x and y. n >= 0.
+ 
+  \param[in, out] x
+  SINGLE/DOUBLE PRECISION REAL array
+  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
+ 
+  \param[in] incx
+  incx is INTEGER
+  Stride between elements of x. incx must not be zero.
+  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
+ 
+  \param[in, out] y
+  SINGLE/DOUBLE PRECISION REAL array
+  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
+ 
+  \param[in] incy
+  incy is INTEGER
+  Stride between elements of y. incy must not be zero.
+  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
+ 
+  \param[in] c
+  Cosine of rotation; SINGLE/DOUBLE PRECISION REAL.
+ 
+  \param[in] s
+  Sine of rotation; SINGLE/DOUBLE PRECISION REAL.
+  */
+template< typename T >
+void rot(
+    int64_t n,
+    T *x, int64_t incx,
+    T *y, int64_t incy,
+    T c,   // by value: input only
+    T s )  // by value: input only
+{   // Thin wrapper: performs no computation of its own and returns nothing.
+    cblas_rot( n, x, incx, y, incy, c, s );  // forwarded unchanged; cblas_rot is presumably the per-type wrapper from cblas.hh -- confirm
+}
+
+/*! \brief Apply the modified givens transformation for arbitrary data types 
+
+  \b Purpose:	
+
+  ROTM applies modified (fast) plane rotation, H:  \n
+  [ x^T ] = H [ x^T ]  \n
+  [ y^T ]     [ y^T ]  \n
+  
+  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
+
+  \param[in] n
+  Number of elements in x and y. n >= 0.
+ 
+  \param[in, out] x
+  SINGLE/DOUBLE PRECISION REAL array
+  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
+ 
+  \param[in] incx
+  incx is INTEGER
+  Stride between elements of x. incx must not be zero.
+  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
+ 
+  \param[in, out] y
+  SINGLE/DOUBLE PRECISION REAL array
+  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
+ 
+  \param[in] incy
+  incy is INTEGER
+  Stride between elements of y. incy must not be zero.
+  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
+ 
+  \param[in] P
+  SINGLE/DOUBLE PRECISION REAL array, dimension (5),giving parameters 
+  of modified plane rotation 
+  param(1)=DFLAG
+  param(2)=DH11
+  param(3)=DH21
+  param(4)=DH12
+  param(5)=DH22
+  */
+template< typename T >
+void rotm(
+    int64_t n,
+    T *x, int64_t incx,
+    T *y, int64_t incy,
+    const T *P)  // pointer-to-const: the parameter array is only read through this pointer
+{   // Thin wrapper: performs no computation of its own and returns nothing.
+    cblas_rotm( n, x, incx, y, incy, P );  // forwarded unchanged; cblas_rotm is presumably the per-type wrapper from cblas.hh -- confirm
+}
+
+/*! \brief Interchanges two vectors of arbitrary data types 
+
+  \b Purpose:	
+
+  SWAP interchanges two vectors; uses unrolled loops for increments equal to 1.\n
+  x <=> y  \n
+  Data precisions supported include SINGLE/DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  \param[in] n
+  n is INTEGER
+  Number of elements in x and y. n >= 0.
+ 
+  \param[in, out] x
+  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array.
+  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
+ 
+  \param[in] incx
+  incx is INTEGER.
+  Stride between elements of x. incx must not be zero.
+  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
+ 
+  \param[in, out] y
+  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array.
+  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
+ 
+  \param[in] incy
+  incy is INTEGER.
+  Stride between elements of y. incy must not be zero.
+  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
+  */
+template< typename T >
+void swap(
+    int64_t n,
+    T *x, int64_t incx,  // non-const: x is written as well as read
+    T *y, int64_t incy )  // non-const: y is written as well as read
+{   // Thin wrapper: performs no computation of its own and returns nothing.
+    cblas_swap( n, x, incx, y, incy );  // forwarded unchanged; cblas_swap is presumably the per-type wrapper from cblas.hh -- confirm
+}
+
+/*! \brief Scales a vector of arbitrary data types by a constant.
+
+  \b Purpose:	
+
+  SCAL scales a vector by a constant, uses unrolled loops for increment equal to 1.\n
+  x = alpha * x \n
+  Data precisions of vector & constant include SINGLE/DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  \param[in] n
+  n is INTEGER
+  Number of elements in x. n >= 0.
+ 
+  \param[in] alpha
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha. 
+  
+  \param[in, out] x
+  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array
+  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
+ 
+  \param[in] incx
+  incx is INTEGER
+  Stride between elements of x. incx must not be zero.
+  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
+  */
+template< typename TA, typename TB >
+void scal(
+    int64_t n,
+    TA alpha,  // scalar type is deduced independently of the vector element type TB
+    TB* x, int64_t incx )
+{   // Thin wrapper: performs no computation of its own and returns nothing.
+    cblas_scal( n, alpha, x, incx );  // forwarded unchanged; which (TA, TB) pairs are accepted is decided by cblas_scal, presumably in cblas.hh -- confirm
+}
+
+/*! \brief Copies a vector x to a vector y for arbitrary data types 
+
+  \b Purpose:	
+
+  COPY copies a vector x to a vector y.\n
+  y = x  \n
+  Data precisions supported include SINGLE/DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  \param[in] n
+  n is INTEGER
+  Number of elements in x and y. n >= 0.
+ 
+  \param[in] x
+  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array.
+  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
+ 
+  \param[in] incx
+  incx is INTEGER.
+  Stride between elements of x. incx must not be zero.
+  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
+ 
+  \param[out] y
+  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array.
+  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
+ 
+  \param[in] incy
+  incy is INTEGER.
+  Stride between elements of y. incy must not be zero.
+  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
+  */
+template< typename T >
+void copy(
+    int64_t n,
+    T const *x, int64_t incx,  // source: only read through this pointer
+    T       *y, int64_t incy )  // destination
+{   // Thin wrapper: performs no computation of its own and returns nothing.
+    cblas_copy( n, x, incx, y, incy );  // forwarded unchanged; cblas_copy is presumably the per-type wrapper from cblas.hh -- confirm
+}
+
+/*! \brief Performs addition of scaled vector for arbitrary data types 
+
+  \b Purpose:	
+
+  AXPY constant times a vector plus a vector.\n
+  y = alpha*x + y  \n
+  Data precisions supported include SINGLE/DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  \param[in] n
+  n is INTEGER
+  Number of elements in x and y. n >= 0.
+ 
+  \param[in] alpha
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.\n
+  If alpha is zero, y is not updated.
+  
+  \param[in] x
+  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array.
+  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
+ 
+  \param[in] incx
+  incx is INTEGER.
+  Stride between elements of x. incx must not be zero.
+  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
+ 
+  \param[in, out] y
+  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array.
+  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
+ 
+  \param[in] incy
+  incy is INTEGER.
+  Stride between elements of y. incy must not be zero.
+  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
+  */
+template< typename T >
+void axpy(
+    int64_t n,
+    T alpha,
+    T const *x, int64_t incx,  // only read through this pointer
+    T       *y, int64_t incy )  // non-const: receives the result
+{   // Thin wrapper: performs no computation of its own and returns nothing.
+    cblas_axpy( n, alpha, x, incx, y, incy );  // forwarded unchanged; cblas_axpy is presumably the per-type wrapper from cblas.hh -- confirm
+}
+
+/*! \brief Performs the dot product of two vectors for arbitrary data types 
+
+  \b Purpose:	
+
+  DOT forms the dot product of two vectors
+  uses unrolled loops for increments equal to one.\n
+  dot = x^T * y \n
+  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
+
+  \param[in] n
+  n is INTEGER
+  Number of elements in x and y. n >= 0.
+  
+  \param[in] x
+  REAL/DOUBLE PRECISION array.
+  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
+ 
+  \param[in] incx
+  incx is INTEGER.
+  Stride between elements of x. incx must not be zero.
+  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
+ 
+  \param[in] y
+  REAL/DOUBLE PRECISION array.
+  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
+ 
+  \param[in] incy
+  incy is INTEGER.
+  Stride between elements of y. incy must not be zero.
+  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
+  
+  \return Unconjugated dot product, x^T * y.
+  REAL/DOUBLE PRECISION 
+  */
+template< typename T, typename TR >
+TR dot(
+    int64_t n,
+    T const *x, int64_t incx,
+    T const *y, int64_t incy )
+{
+    // TR is used only as the return type, so it cannot be deduced: callers must name it, e.g. dot<float, double>(...).
+    if( std::is_same<T, float>::value && std::is_same<TR, double>::value )
+        return cblas_dsdot( n, x, incx, y, incy );  // float inputs with a double result
+    return cblas_dot( n, x, incx, y, incy );  // NOTE(review): both calls are instantiated for every T -- confirm cblas.hh accepts that
+}
+
+/*! \brief Performs the dot product of two complex vectors 
+
+  \b Purpose:	
+
+  DOTU forms the dot product of two complex vectors. \n
+  CDOTU = X^T * Y \n
+  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX
+
+  \param[in] n
+  n is INTEGER
+  Number of elements in x and y. n >= 0.
+  
+  \param[in] x
+  REAL/DOUBLE PRECISION COMPLEX array.
+  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
+ 
+  \param[in] incx
+  incx is INTEGER.
+  Stride between elements of x. incx must not be zero.
+  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
+ 
+  \param[in] y
+  REAL/DOUBLE PRECISION COMPLEX array.
+  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
+ 
+  \param[in] incy
+  incy is INTEGER.
+  Stride between elements of y. incy must not be zero.
+  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
+  
+  \return Unconjugated dot product, x^T * y.
+  REAL/DOUBLE PRECISION COMPLEX
+  */
+template< typename T >
+T dotu(
+    int64_t n,
+    T const *x, int64_t incx,  // only read through this pointer
+    T const *y, int64_t incy )  // only read through this pointer
+{   // Thin wrapper: returns whatever cblas_dotu yields, converted to T.
+    return cblas_dotu( n, x, incx, y, incy );  // forwarded unchanged; cblas_dotu is presumably the per-type wrapper from cblas.hh -- confirm
+}
+
+/*! \brief Performs the dot product of two complex vectors 
+
+  \b Purpose:	
+
+  DOTC forms the dot product of two complex vectors. \n
+  CDOTC = X^H * Y \n
+  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX
+
+  \param[in] n
+  n is INTEGER
+  Number of elements in x and y. n >= 0.
+  
+  \param[in] x
+  REAL/DOUBLE PRECISION COMPLEX array.
+  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
+ 
+  \param[in] incx
+  incx is INTEGER.
+  Stride between elements of x. incx must not be zero.
+  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
+ 
+  \param[in] y
+  REAL/DOUBLE PRECISION COMPLEX array.
+  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
+ 
+  \param[in] incy
+  incy is INTEGER.
+  Stride between elements of y. incy must not be zero.
+  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
+  
+  \return Conjugated dot product, x^H * y.
+  REAL/DOUBLE PRECISION COMPLEX
+  */
+template< typename T >
+T dotc(
+    int64_t n,
+    T const *x, int64_t incx,  // only read through this pointer
+    T const *y, int64_t incy )  // only read through this pointer
+{   // Thin wrapper: returns whatever cblas_dotc yields, converted to T.
+    return cblas_dotc( n, x, incx, y, incy );  // forwarded unchanged; cblas_dotc is presumably the per-type wrapper from cblas.hh -- confirm
+}
+
+/*! \brief Performs inner product of two vectors with extended precision accumulation
+
+  \b Purpose:	
+
+  SDSDOT forms the inner product of two vectors with extended precision accumulation. \n
+  Data precisions supported include SINGLE PRECISION REAL
+
+  \param[in] n
+  n is INTEGER\n
+  number of elements in input vector(s)
+  
+  \param[in] alpha
+  alpha is REAL\n
+  single precision scalar to be added to inner product
+  
+  \param[in] x
+  x is REAL array, dimension ( 1 + ( n - 1 )*abs( incx ) )\n
+  single precision vector with n elements
+  
+  \param[in] incx
+  incx is INTEGER\n
+  storage spacing between elements of x
+  
+  \param[in] y
+  y is REAL array, dimension ( 1 + ( n - 1 )*abs( incy ) )\n
+  single precision vector with n elements
+  
+  \param[in] incy
+  incy is INTEGER\n
+  storage spacing between elements of y
+  
+  \return S.P. result with dot product accumulated in D.P.
+  */
+template< typename T >
+T sdsdot(
+    int64_t n,
+    T alpha,
+    T const *x, int64_t incx,  // only read through this pointer
+    T const *y, int64_t incy )  // only read through this pointer
+{   // Thin wrapper: returns whatever cblas_sdsdot yields, converted to T.
+    return cblas_sdsdot( n, alpha, x, incx, y, incy );  // forwarded unchanged; NOTE(review): documented for single precision only -- confirm other T are rejected
+}
+
+/*! \brief return 2-norm of vectors of arbitrary data types
+
+  \b Purpose:	
+
+  NRM2 returns the euclidean norm of a vector via the function name, so that
+  NRM2 := sqrt( x'*x ). \n
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  \param[in] n
+  n is INTEGER\n
+  number of elements in input vector(s)
+  
+  \param[in] x
+  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, 
+  dimension ( 1 + ( n - 1 )*abs( incx ) )\n
+  vector with n elements
+  
+  \param[in] incx
+  incx is INTEGER\n
+  storage spacing between elements of x
+  
+  \return 2-norm of vector
+  REAL SINGLE/DOUBLE PRECISION
+  */
+template< typename T >
+real_type<T>  // real_type is not defined in this view; presumably supplied by blis_util.hh -- confirm
+nrm2(
+    int64_t n,
+    T const * x, int64_t incx )  // only read through this pointer
+{   // Thin wrapper: returns whatever cblas_nrm2 yields, converted to real_type<T>.
+    return cblas_nrm2( n, x, incx );  // forwarded unchanged; cblas_nrm2 is presumably the per-type wrapper from cblas.hh -- confirm
+}
+
+/*! \brief return 1-norm of vector of arbitrary data types
+
+  \b Purpose:	
+
+  ASUM takes the sum of the absolute values, uses unrolled loops for 
+  increment equal to one. \n
+  ASUM := || Re(x) ||_1 + || Im(x) ||_1. \n
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  \param[in] n
+  n is INTEGER\n
+  number of elements in input vector(s)
+  
+  \param[in] x
+  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, 
+  dimension ( 1 + ( n - 1 )*abs( incx ) )\n
+  vector with n elements
+  
+  \param[in] incx
+  incx is INTEGER\n
+  storage spacing between elements of x
+  
+  \return 1-norm of vector
+  REAL SINGLE/DOUBLE PRECISION
+  */
+template< typename T >
+real_type<T>  // real_type is not defined in this view; presumably supplied by blis_util.hh -- confirm
+asum(
+    int64_t n,
+    T const *x, int64_t incx )  // only read through this pointer
+{   // Thin wrapper: returns whatever cblas_asum yields, converted to real_type<T>.
+    return cblas_asum( n, x, incx );  // forwarded unchanged; cblas_asum is presumably the per-type wrapper from cblas.hh -- confirm
+}
+
+/*! \brief Return Index of infinity-norm of vectors of arbitrary types.
+
+  \b Purpose:	
+
+  IAMAX finds the index of the first element having maximum |Re(.)| + |Im(.)|. \n
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  \param[in] n
+  n is INTEGER\n
+  number of elements in input vector(s)
+  
+  \param[in] x
+  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, 
+  dimension ( 1 + ( n - 1 )*abs( incx ) ) \n
+  vector with n elements
+  
+  \param[in] incx
+  incx is INTEGER\n
+  storage spacing between elements of x
+  
+  \return Index of infinity-norm of vector
+  INTEGER
+  */
+template< typename T >
+int64_t iamax(
+    int64_t n,
+    T const *x, int64_t incx )  // only read through this pointer
+{   // Thin wrapper: returns whatever cblas_iamax yields, converted to int64_t.
+    return cblas_iamax( n, x, incx );  // forwarded unchanged; NOTE(review): index base (0 or 1) is set by cblas_iamax -- confirm
+}
+
+/*! \brief Solve General matrix-vector multiply for arbitrary data types 
+
+  \b Purpose:	
+
+  GEMV  performs one of the matrix-vector operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+ 
+     y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or   y := alpha*A**H*x + beta*y,
+ 
+  where alpha and beta are scalars, x and y are vectors and A is an
+  m by n matrix.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] trans
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be used as follows: \n
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,y := alpha*A*x + beta*y. \n
+  trans = CBLAS_TRANSPOSE::CblasTrans,  y := alpha*A**T*x + beta*y. \n
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,  y := alpha*A**H*x + beta*y.
+
+  \param[in] m
+  m is INTEGER
+  On entry,  m specifies the number of rows of the matrix A.
+  m must be at least zero.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the number of columns of the matrix A.
+  n must be at least zero.
+
+  \param[in] alpha
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  m-by-n , stored in an lda-by-n array [RowMajor: m-by-lda].
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  lda >= max(1, m) [RowMajor: lda >= max(1, n)].
+
+  \param[in] x
+  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : \n
+  If trans = CblasNoTrans:
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Otherwise:
+  at least ( 1 + ( m - 1 )*abs( incx ) ).
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] beta
+  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar beta. When beta is
+  supplied as zero then y need not be set on input.
+
+  \param[in,out] y
+  y is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : \n
+  If trans = CblasNoTrans:
+  at least ( 1 + ( m - 1 )*abs( incy ) ). \n
+  Otherwise:
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry with beta non-zero, the incremented array y
+  must contain the vector y. On exit, y is overwritten by the
+  updated vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+  */
+template< typename T >
+void gemv(
+    CBLAS_ORDER layout,
+    CBLAS_TRANSPOSE trans,
+    int64_t m, int64_t n,
+    T alpha,
+    T const *A, int64_t lda,  // only read through this pointer
+    T const *x, int64_t incx,  // only read through this pointer
+    T beta,
+    T *y, int64_t incy )  // non-const: receives the result
+{   // Thin wrapper: no argument checking or computation happens at this level.
+    cblas_gemv(layout, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);  // forwarded unchanged; NOTE(review): confirm cblas_gemv (presumably in cblas.hh) does not narrow the int64_t sizes
+}
+
+/*! \brief Solve General band matrix-vector multiply for arbitrary data types 
+
+  \b Purpose:	
+
+  GBMV  performs one of the matrix-vector operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+ 
+     y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or
+	 
+	 y := alpha*A**H*x + beta*y,
+ 
+  where alpha and beta are scalars, x and y are vectors and A is an
+  m by n matrix with kl sub-diagonals and ku super-diagonals.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] trans
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be used as follows: \n
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,y := alpha*A*x + beta*y. \n
+  trans = CBLAS_TRANSPOSE::CblasTrans,  y := alpha*A**T*x + beta*y. \n
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,  y := alpha*A**H*x + beta*y.
+
+  \param[in] m
+  m is INTEGER
+  On entry,  m specifies the number of rows of the matrix A.
+  m must be at least zero.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the number of columns of the matrix A.
+  n must be at least zero.
+
+  \param[in] kl
+  kl is INTEGER
+  On entry,  kl specifies the number of sub-diagonals of the matrix A.
+  kl must be at least zero.
+
+  \param[in] ku
+  ku is INTEGER
+  On entry,  ku specifies the number of super-diagonals of the matrix A.
+  ku must be at least zero.
+
+  \param[in] alpha
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension lda-by-n.
+  Before entry, the leading ( kl + ku + 1 ) by n part of the
+  array A must contain the matrix of coefficients, supplied
+  column by column, with the leading diagonal of the matrix in
+  row ( ku + 1 ) of the array, the first super-diagonal
+  starting at position 2 in row ku, the first sub-diagonal
+  starting at position 1 in row ( ku + 2 ), and so on.
+  Elements in the array A that do not correspond to elements
+  in the band matrix (such as the top left ku by ku triangle)
+  are not referenced.
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  lda >= ( kl + ku + 1 )
+
+  \param[in] x
+  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : \n
+  If trans = CblasNoTrans:
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Otherwise:
+  at least ( 1 + ( m - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the
+  vector x.  
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] beta
+  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar beta. When beta is
+  supplied as zero then y need not be set on input.
+
+  \param[in,out] y
+  y is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : \n
+  If trans = CblasNoTrans:
+  at least ( 1 + ( m - 1 )*abs( incy ) ). \n
+  Otherwise:
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry with beta non-zero, the incremented array y
+  must contain the vector y. On exit, y is overwritten by the
+  updated vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+  */
+template< typename T >
+void gbmv(
+    CBLAS_ORDER layout,
+    CBLAS_TRANSPOSE trans,
+    int64_t m, int64_t n,
+    int64_t kl, int64_t ku,
+    T alpha,
+    T const *A, int64_t lda,  // only read through this pointer
+    T const *x, int64_t incx,  // only read through this pointer
+    T beta,
+    T *y, int64_t incy )  // non-const: receives the result
+{   // Thin wrapper: no argument checking or computation happens at this level.
+    cblas_gbmv(layout, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);  // forwarded unchanged; cblas_gbmv is presumably the per-type wrapper from cblas.hh -- confirm
+}
+
+/*! \brief Solves Hermitian matrix-vector multiply for arbitrary data types 
+
+  \b Purpose:	
+
+  HEMV  performs one of the matrix-vector operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION COMPLEX, 
+  DOUBLE PRECISION COMPLEX(COMPLEX*16)
+ 
+     y := alpha*A*x + beta*y,
+ 
+  where alpha and beta are scalars, x and y are  n element vectors and 
+  A is an n by n hermitian matrix.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether the upper or lower triangular part of the
+  hermitian matrix A is to be referenced as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of A is referenced. \n
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of A is referenced.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix A.n must be at least zero.
+
+  \param[in] alpha
+  alpha is COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is COMPLEX/COMPLEX*16 array,dimension lda-by-n. \n
+  Before entry with  UPLO = CblasUpper, the leading n by n
+  upper triangular part of the array A must contain the upper
+  triangular part of the hermitian matrix and the strictly
+  lower triangular part of A is not referenced.
+  Before entry with UPLO = CblasLower, the leading n by n
+  lower triangular part of the array A must contain the lower
+  triangular part of the hermitian matrix and the strictly
+  upper triangular part of A is not referenced. \n
+  Note that the imaginary parts of the diagonal elements need
+  not be set and are assumed to be zero.
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  lda must be at least max( 1, n ).
+
+  \param[in] x
+  x is COMPLEX/COMPLEX*16 array,dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the
+  vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] beta
+  beta is COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar beta. When beta is
+  supplied as zero then y need not be set on input.
+
+  \param[in,out] y
+  y is COMPLEX/COMPLEX*16 array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry with beta non-zero, the incremented array y
+  must contain the vector y. On exit, y is overwritten by the
+  updated vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+  */
+template< typename T >
+void hemv(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    int64_t n,
+    T alpha,
+    T const *A, int64_t lda,  // only read through this pointer
+    T const *x, int64_t incx,  // only read through this pointer
+    T beta,
+    T *y, int64_t incy )  // non-const: receives the result
+{   // Thin wrapper: no argument checking or computation happens at this level.
+    cblas_hemv(layout, uplo, n, alpha, A, lda, x, incx, beta, y, incy);  // forwarded unchanged; cblas_hemv is presumably the per-type wrapper from cblas.hh -- confirm
+}
+
+/*! \brief Solves Hermitian band matrix-vector multiply for arbitrary data types 
+
+  \b Purpose:	
+
+  HBMV  performs one of the matrix-vector operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION COMPLEX, 
+  DOUBLE PRECISION COMPLEX(COMPLEX*16)
+ 
+     y := alpha*A*x + beta*y,
+ 
+  where alpha and beta are scalars, x and y are  n element vectors and 
+  A is an n by n hermitian matrix with k super-diagonals.
+
+  \param[in] layout
+  layout is enum CBLAS_LAYOUT
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_LAYOUT::CblasRowMajor or CBLAS_LAYOUT::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether the upper or lower triangular
+  part of the band matrix A is being supplied as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   The upper triangular part of A is being supplied. \n
+  uplo = CBLAS_UPLO::CblasLower   The lower triangular part of A is being supplied.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix A.n must be at least zero.
+
+  \param[in] k
+  k is INTEGER
+  On entry,  k specifies the number of super-diagonals of the matrix A.
+  k must be at least zero.
+
+  \param[in] alpha
+  alpha is COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is COMPLEX/COMPLEX*16 array,dimension lda-by-n. \n
+  Before entry with UPLO = CblasUpper, the leading ( k + 1 )
+  by n part of the array A must contain the upper triangular
+  band part of the hermitian matrix, supplied column by
+  column, with the leading diagonal of the matrix in row
+  ( k + 1 ) of the array, the first super-diagonal starting at
+  position 2 in row k, and so on. The top left k by k triangle
+  of the array A is not referenced. \n
+  Before entry with UPLO = CblasLower, the leading ( k + 1 )
+  by n part of the array A must contain the lower triangular
+  band part of the hermitian matrix, supplied column by
+  column, with the leading diagonal of the matrix in row 1 of
+  the array, the first sub-diagonal starting at position 1 in
+  row 2, and so on. The bottom right k by k triangle of the
+  array A is not referenced. \n
+  Note that the imaginary parts of the diagonal elements need
+  not be set and are assumed to be zero.
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  lda must be at least ( k + 1 ).
+
+  \param[in] x
+  x is COMPLEX/COMPLEX*16 array,dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the
+  vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] beta
+  beta is COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar alpha.
+
+  \param[in,out] y
+  y is COMPLEX/COMPLEX*16 array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry with beta non-zero, the incremented array y
+  must contain the vector y. On exit, y is overwritten by the
+  updated vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+  */
+template< typename T >
+void hbmv(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    int64_t n, int64_t k,
+    T alpha,
+    T const *A, int64_t lda,
+    T const *x, int64_t incx,
+    T beta,
+    T *y, int64_t incy )
+{
+    cblas_hbmv(layout, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+/*! \brief Performs a Hermitian packed matrix-vector multiply for arbitrary data types
+
+  \b Purpose:
+
+  HPMV  performs the matrix-vector operation for arbitrary data types
+  Data precisions supported include SINGLE PRECISION COMPLEX,
+  DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+     y := alpha*A*x + beta*y,
+
+  where alpha and beta are scalars, x and y are n element vectors and
+  A is an n by n Hermitian matrix, supplied in packed form.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether the upper or lower triangular
+  part of the matrix A is supplied in the packed array Ap as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   The upper triangular part of A is supplied in Ap. \n
+  uplo = CBLAS_UPLO::CblasLower   The lower triangular part of A is supplied in Ap.
+
+  \param[in] n
+  n is INTEGER
+  On entry, n specifies the order of the matrix A. n must be at least zero.
+
+  \param[in] alpha
+  alpha is COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] Ap
+  Ap is COMPLEX/COMPLEX*16 array, dimension at least ( ( n*( n + 1 ) )/2 ). \n
+  Before entry with UPLO = CblasUpper, the array Ap must
+  contain the upper triangular part of the Hermitian matrix
+  packed sequentially, column by column, so that Ap( 1 )
+  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 1, 2 )
+  and a( 2, 2 ) respectively, and so on. \n
+  Before entry with UPLO = CblasLower, the array Ap must
+  contain the lower triangular part of the Hermitian matrix
+  packed sequentially, column by column, so that Ap( 1 )
+  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 2, 1 )
+  and a( 3, 1 ) respectively, and so on. \n
+  Note that the imaginary parts of the diagonal elements need
+  not be set and are assumed to be zero.
+
+  \param[in] x
+  x is COMPLEX/COMPLEX*16 array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the
+  vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] beta
+  beta is COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar beta. When beta is
+  supplied as zero then y need not be set on input.
+
+  \param[in,out] y
+  y is COMPLEX/COMPLEX*16 array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry with beta non-zero, the incremented array y
+  must contain the vector y. On exit, y is overwritten by the
+  updated vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+  */
+template< typename T >
+void hpmv(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    int64_t n,
+    T alpha,
+    T const *Ap,
+    T const *x, int64_t incx,
+    T beta,
+    T *y, int64_t incy )
+{
+    cblas_hpmv(layout, uplo, n, alpha, Ap, x, incx, beta, y, incy);
+}
+
+/*! \brief Performs a symmetric matrix-vector multiply for arbitrary data types
+
+  \b Purpose:
+
+  SYMV  performs the matrix-vector operation for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL
+
+     y := alpha*A*x + beta*y,
+
+  where alpha and beta are scalars, x and y are n element vectors and
+  A is an n by n symmetric matrix.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether the upper or lower triangular part
+  of the array A is to be referenced as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of A is referenced. \n
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of A is referenced.
+
+  \param[in] n
+  n is INTEGER
+  On entry, n specifies the order of the matrix A. n must be at least zero.
+
+  \param[in] alpha
+  alpha is SINGLE/DOUBLE PRECISION REAL
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is SINGLE/DOUBLE PRECISION REAL array, dimension lda-by-n. \n
+  Before entry with UPLO = CblasUpper, the leading n by n
+  upper triangular part of the array A must contain the upper
+  triangular part of the symmetric matrix and the strictly
+  lower triangular part of A is not referenced. \n
+  Before entry with UPLO = CblasLower, the leading n by n
+  lower triangular part of the array A must contain the lower
+  triangular part of the symmetric matrix and the strictly
+  upper triangular part of A is not referenced.
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the leading dimension of A.
+  lda must be at least max( 1, n ).
+
+  \param[in] x
+  x is SINGLE/DOUBLE PRECISION REAL array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the
+  vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] beta
+  beta is SINGLE/DOUBLE PRECISION REAL
+  On entry, beta specifies the scalar beta. When beta is
+  supplied as zero then y need not be set on input.
+
+  \param[in,out] y
+  y is SINGLE/DOUBLE PRECISION REAL array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry with beta non-zero, the incremented array y
+  must contain the vector y. On exit, y is overwritten by the
+  updated vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+  */
+template< typename T >
+void symv(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    int64_t n,
+    T alpha,
+    T const *A, int64_t lda,
+    T const *x, int64_t incx,
+    T beta,
+    T *y, int64_t incy )
+{
+    cblas_symv(layout, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+/*! \brief Performs a symmetric band matrix-vector multiply for arbitrary data types
+
+  \b Purpose:
+
+  SBMV  performs the matrix-vector operation for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL
+
+     y := alpha*A*x + beta*y,
+
+  where alpha and beta are scalars, x and y are n element vectors and
+  A is an n by n symmetric band matrix with k super-diagonals.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether the upper or lower triangular
+  part of the band matrix A is being supplied as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   The upper triangular part of A is supplied. \n
+  uplo = CBLAS_UPLO::CblasLower   The lower triangular part of A is supplied.
+
+  \param[in] n
+  n is INTEGER
+  On entry, n specifies the order of the matrix A. n must be at least zero.
+
+  \param[in] k
+  k is INTEGER
+  On entry, k specifies the number of super-diagonals of the matrix A.
+  k must be at least zero.
+
+  \param[in] alpha
+  alpha is SINGLE/DOUBLE PRECISION REAL
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is SINGLE/DOUBLE PRECISION REAL array, dimension lda-by-n. \n
+  Before entry with UPLO = CblasUpper, the leading ( k + 1 )
+  by n part of the array A must contain the upper triangular
+  band part of the symmetric matrix, supplied column by
+  column, with the leading diagonal of the matrix in row
+  ( k + 1 ) of the array, the first super-diagonal starting at
+  position 2 in row k, and so on. The top left k by k triangle
+  of the array A is not referenced. \n
+  Before entry with UPLO = CblasLower, the leading ( k + 1 )
+  by n part of the array A must contain the lower triangular
+  band part of the symmetric matrix, supplied column by
+  column, with the leading diagonal of the matrix in row 1 of
+  the array, the first sub-diagonal starting at position 1 in
+  row 2, and so on. The bottom right k by k triangle of the
+  array A is not referenced. \n
+  Note that A is a REAL array, so (unlike HBMV) there are no
+  imaginary parts of the diagonal elements to consider.
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the leading dimension of A.
+  lda must be at least ( k + 1 ).
+
+  \param[in] x
+  x is SINGLE/DOUBLE PRECISION REAL array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the
+  vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] beta
+  beta is SINGLE/DOUBLE PRECISION REAL
+  On entry, beta specifies the scalar beta.
+
+  \param[in,out] y
+  y is SINGLE/DOUBLE PRECISION REAL array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry with beta non-zero, the incremented array y
+  must contain the vector y. On exit, y is overwritten by the
+  updated vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+  */
+template< typename T >
+void sbmv(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    int64_t n, int64_t k,
+    T alpha,
+    T const *A, int64_t lda,
+    T const *x, int64_t incx,
+    T beta,
+    T *y, int64_t incy )
+{
+    cblas_sbmv(layout, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+/*! \brief Performs a symmetric packed matrix-vector multiply for arbitrary data types
+
+  \b Purpose:
+
+  SPMV  performs the matrix-vector operation for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL
+
+     y := alpha*A*x + beta*y,
+
+  where alpha and beta are scalars, x and y are n element vectors and
+  A is an n by n symmetric matrix, supplied in packed form.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether the upper or lower triangular
+  part of the matrix A is supplied in the packed array Ap as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   The upper triangular part of A is supplied in Ap. \n
+  uplo = CBLAS_UPLO::CblasLower   The lower triangular part of A is supplied in Ap.
+
+  \param[in] n
+  n is INTEGER
+  On entry, n specifies the order of the matrix A. n must be at least zero.
+
+  \param[in] alpha
+  alpha is SINGLE/DOUBLE PRECISION REAL
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] Ap
+  Ap is SINGLE/DOUBLE PRECISION REAL array, dimension at least ( ( n*( n + 1 ) )/2 ). \n
+  Before entry with UPLO = CblasUpper, the array Ap must
+  contain the upper triangular part of the symmetric matrix
+  packed sequentially, column by column, so that Ap( 1 )
+  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 1, 2 )
+  and a( 2, 2 ) respectively, and so on. \n
+  Before entry with UPLO = CblasLower, the array Ap must
+  contain the lower triangular part of the symmetric matrix
+  packed sequentially, column by column, so that Ap( 1 )
+  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 2, 1 )
+  and a( 3, 1 ) respectively, and so on. \n
+  Note that Ap is a REAL array, so (unlike HPMV) there are no
+  imaginary parts of the diagonal elements to consider.
+
+  \param[in] x
+  x is SINGLE/DOUBLE PRECISION REAL array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the
+  vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] beta
+  beta is SINGLE/DOUBLE PRECISION REAL
+  On entry, beta specifies the scalar beta. When beta is
+  supplied as zero then y need not be set on input.
+
+  \param[in,out] y
+  y is SINGLE/DOUBLE PRECISION REAL array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry with beta non-zero, the incremented array y
+  must contain the vector y. On exit, y is overwritten by the
+  updated vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+  */
+template< typename T >
+void spmv(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    int64_t n,
+    T alpha,
+    T const *Ap,
+    T const *x, int64_t incx,
+    T beta,
+    T *y, int64_t incy )
+{
+    cblas_spmv(layout, uplo, n, alpha, Ap, x, incx, beta, y, incy);
+}
+
+/*! \brief Performs one of the triangular matrix-vector operations for arbitrary data types
+
+  \b Purpose:
+
+  TRMV  performs one of the matrix-vector operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+
+  where x is an n element vector and A is an n by n unit, or non-unit,
+  upper or lower triangular matrix.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the matrix A is an upper or
+  lower triangular matrix as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
+  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
+
+  \param[in] trans
+  trans is enum CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be performed as follows: \n
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,  x := A*x. \n
+  trans = CBLAS_TRANSPOSE::CblasTrans,  x := A**T*x. \n
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,  x := A**H*x.
+
+  \param[in] diag
+  diag is enum CBLAS_DIAG
+  diag specifies whether or not A is unit triangular
+  as follows: \n
+  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.\n
+  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
+                                 triangular.
+
+  \param[in] n
+  n is INTEGER
+  On entry, n specifies the order of the matrix A. n must be at least zero.
+
+  \param[in] A
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension ( lda, n )\n
+  Before entry with UPLO = CblasUpper, the leading n by n
+  upper triangular part of the array A must contain the upper
+  triangular matrix and the strictly lower triangular part of
+  A is not referenced. \n
+  Before entry with UPLO = CblasLower, the leading n by n
+  lower triangular part of the array A must contain the lower
+  triangular matrix and the strictly upper triangular part of
+  A is not referenced. \n
+  Note that when DIAG = CblasUnit, the diagonal elements of
+  A are not referenced either, but are assumed to be unity.
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the leading dimension of A.
+  lda must be at least max( 1, n ).
+
+  \param[in,out] x
+  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the
+  vector x. On exit, x is overwritten with the transformed vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+  */
+template< typename T >
+void trmv(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    CBLAS_DIAG diag,
+    int64_t n,
+    T const *A, int64_t lda,
+    T       *x, int64_t incx )
+{
+    cblas_trmv(layout, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+/*! \brief Performs one of the triangular band matrix-vector operations for arbitrary data types
+
+  \b Purpose:
+
+  TBMV  performs one of the matrix-vector operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+
+  where x is an n element vector and A is an n by n unit, or non-unit,
+  upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the matrix A is an upper or
+  lower triangular matrix as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
+  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
+
+  \param[in] trans
+  trans is enum CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be performed as follows: \n
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,  x := A*x. \n
+  trans = CBLAS_TRANSPOSE::CblasTrans,  x := A**T*x. \n
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,  x := A**H*x.
+
+  \param[in] diag
+  diag is enum CBLAS_DIAG
+  diag specifies whether or not A is unit triangular
+  as follows: \n
+  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.\n
+  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
+                                 triangular.
+
+  \param[in] n
+  n is INTEGER
+  On entry, n specifies the order of the matrix A. n must be at least zero.
+
+  \param[in] k
+  k is INTEGER
+  On entry with UPLO = CblasUpper, k specifies the number of
+  super-diagonals of the matrix A.
+  On entry with UPLO = CblasLower, k specifies the number of
+  sub-diagonals of the matrix A.
+  k must be at least zero.
+
+  \param[in] A
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension ( lda, n )\n
+  Before entry with UPLO = CblasUpper, the leading ( k + 1 )
+  by n part of the array A must contain the upper triangular
+  band part of the matrix of coefficients, supplied column by
+  column, with the leading diagonal of the matrix in row
+  ( k + 1 ) of the array, the first super-diagonal starting at
+  position 2 in row k, and so on. The top left k by k triangle
+  of the array A is not referenced. \n
+  Before entry with UPLO = CblasLower, the leading ( k + 1 )
+  by n part of the array A must contain the lower triangular
+  band part of the matrix of coefficients, supplied column by
+  column, with the leading diagonal of the matrix in row 1 of
+  the array, the first sub-diagonal starting at position 1 in
+  row 2, and so on. The bottom right k by k triangle of the
+  array A is not referenced. \n
+  Note that when DIAG = CblasUnit the elements of the array A
+  corresponding to the diagonal elements of the matrix are not
+  referenced, but are assumed to be unity.
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the leading dimension of A.
+  lda must be at least max( 1, ( k + 1 ) ).
+
+  \param[in,out] x
+  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the
+  vector x. On exit, x is overwritten with the transformed vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+  */
+template< typename T >
+void tbmv(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    CBLAS_DIAG diag,
+    int64_t n, int64_t k,
+    T const *A, int64_t lda,
+    T       *x, int64_t incx )
+{
+    cblas_tbmv(layout, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+
+/*! \brief Performs one of the triangular packed matrix-vector operations for arbitrary data types
+
+  \b Purpose:
+
+  TPMV  performs one of the matrix-vector operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+
+  where x is an n element vector and A is an n by n unit, or non-unit,
+  upper or lower triangular matrix, supplied in packed form.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the matrix A is an upper or
+  lower triangular matrix as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
+  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
+
+  \param[in] trans
+  trans is enum CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be performed as follows: \n
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,  x := A*x. \n
+  trans = CBLAS_TRANSPOSE::CblasTrans,  x := A**T*x. \n
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,  x := A**H*x.
+
+  \param[in] diag
+  diag is enum CBLAS_DIAG
+  diag specifies whether or not A is unit triangular
+  as follows: \n
+  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.\n
+  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
+                                 triangular.
+
+  \param[in] n
+  n is INTEGER
+  On entry, n specifies the order of the matrix A. n must be at least zero.
+
+  \param[in] Ap
+  Ap is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension
+  at least ( ( n*( n + 1 ) )/2 ). \n
+  Before entry with UPLO = CblasUpper, the array Ap must
+  contain the upper triangular matrix packed sequentially,
+  column by column, so that Ap( 1 ) contains a( 1, 1 ),
+  Ap( 2 ) and Ap( 3 ) contain a( 1, 2 ) and a( 2, 2 )
+  respectively, and so on. \n
+  Before entry with UPLO = CblasLower, the array Ap must
+  contain the lower triangular matrix packed sequentially,
+  column by column, so that Ap( 1 ) contains a( 1, 1 ),
+  Ap( 2 ) and Ap( 3 ) contain a( 2, 1 ) and a( 3, 1 )
+  respectively, and so on. \n
+  Note that when DIAG = CblasUnit, the diagonal elements of
+  A are not referenced, but are assumed to be unity.
+
+  \param[in,out] x
+  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the
+  vector x. On exit, x is overwritten with the transformed vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+  */
+template< typename T >
+void tpmv(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    CBLAS_DIAG diag,
+    int64_t n,
+    T const *Ap,
+    T       *x, int64_t incx )
+{
+    cblas_tpmv(layout, uplo, trans, diag, n, Ap, x, incx);
+}
+
+/*! \brief Solves one of the triangular systems of equations for arbitrary data types
+
+  \b Purpose:
+
+  TRSV  solves one of the systems of equations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+
+  where b and x are n element vectors and A is an n by n unit, or
+  non-unit, upper or lower triangular matrix.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the matrix A is an upper or
+  lower triangular matrix as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
+  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
+
+  \param[in] trans
+  trans is enum CBLAS_TRANSPOSE
+  On entry, trans specifies the equations to be solved as follows: \n
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,  A*x = b. \n
+  trans = CBLAS_TRANSPOSE::CblasTrans,  A**T*x = b. \n
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,  A**H*x = b.
+
+  \param[in] diag
+  diag is enum CBLAS_DIAG
+  diag specifies whether or not A is unit triangular
+  as follows: \n
+  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.\n
+  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
+                                 triangular.
+
+  \param[in] n
+  n is INTEGER
+  On entry, n specifies the order of the matrix A. n must be at least zero.
+
+  \param[in] A
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension ( lda, n )\n
+  Before entry with UPLO = CblasUpper, the leading n by n
+  upper triangular part of the array A must contain the upper
+  triangular matrix and the strictly lower triangular part of
+  A is not referenced. \n
+  Before entry with UPLO = CblasLower, the leading n by n
+  lower triangular part of the array A must contain the lower
+  triangular matrix and the strictly upper triangular part of
+  A is not referenced. \n
+  Note that when DIAG = CblasUnit, the diagonal elements of
+  A are not referenced either, but are assumed to be unity.
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the leading dimension of A.
+  lda must be at least max( 1, n ).
+
+  \param[in,out] x
+  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the
+  n element right-hand side vector b. On exit, x is overwritten
+  with the solution vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+  */
+template< typename T >
+void trsv(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    CBLAS_DIAG diag,
+    int64_t n,
+    T const *A, int64_t lda,
+    T       *x, int64_t incx )
+{
+    cblas_trsv(layout, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+/*! \brief Solves one of the triangular band systems of equations for arbitrary data types
+
+  \b Purpose:
+
+  TBSV  solves one of the systems of equations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+
+  where b and x are n element vectors and A is an n by n unit, or
+  non-unit, upper or lower triangular band matrix, with ( k + 1 )
+  diagonals.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the matrix A is an upper or
+  lower triangular matrix as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
+  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
+
+  \param[in] trans
+  trans is enum CBLAS_TRANSPOSE
+  On entry, trans specifies the equations to be solved as follows: \n
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,  A*x = b. \n
+  trans = CBLAS_TRANSPOSE::CblasTrans,  A**T*x = b. \n
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,  A**H*x = b.
+
+  \param[in] diag
+  diag is enum CBLAS_DIAG
+  diag specifies whether or not A is unit triangular
+  as follows: \n
+  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.\n
+  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
+                                 triangular.
+
+  \param[in] n
+  n is INTEGER
+  On entry, n specifies the order of the matrix A. n must be at least zero.
+
+  \param[in] k
+  k is INTEGER
+  On entry with UPLO = CblasUpper, k specifies the number of
+  super-diagonals of the matrix A.
+  On entry with UPLO = CblasLower, k specifies the number of
+  sub-diagonals of the matrix A.
+  k must be at least zero.
+
+  \param[in] A
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension ( lda, n )\n
+  Before entry with UPLO = CblasUpper, the leading ( k + 1 )
+  by n part of the array A must contain the upper triangular
+  band part of the matrix of coefficients, supplied column by
+  column, with the leading diagonal of the matrix in row
+  ( k + 1 ) of the array, the first super-diagonal starting at
+  position 2 in row k, and so on. The top left k by k triangle
+  of the array A is not referenced. \n
+  Before entry with UPLO = CblasLower, the leading ( k + 1 )
+  by n part of the array A must contain the lower triangular
+  band part of the matrix of coefficients, supplied column by
+  column, with the leading diagonal of the matrix in row 1 of
+  the array, the first sub-diagonal starting at position 1 in
+  row 2, and so on. The bottom right k by k triangle of the
+  array A is not referenced. \n
+  Note that when DIAG = CblasUnit, the elements of the array A
+  corresponding to the diagonal elements of the matrix are not
+  referenced, but are assumed to be unity.
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the leading dimension of A.
+  lda must be at least max( 1, k+1 ).
+
+  \param[in,out] x
+  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : \n
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the
+  n element right-hand side vector b. On exit, x is overwritten
+  with the solution vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+  */
+template< typename T >
+void tbsv(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    CBLAS_DIAG diag,
+    int64_t n, int64_t k,
+    T const *A, int64_t lda,
+    T       *x, int64_t incx )
+{
+    cblas_tbsv(layout, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+
+/*! \brief Solve one of the triangular matrix-vector equations for arbitrary data types
+
+  \b Purpose:	
+
+  TPSV  solves one of the systems of equations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+
+  where b and x are n element vectors and A is an n by n unit, or
+  non-unit, upper or lower triangular matrix, supplied in packed form.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the matrix A is an upper or
+  lower triangular matrix as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
+  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
+  
+  \param[in] trans
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be performed as follows:
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,  A*x = b. \n
+  trans = CBLAS_TRANSPOSE::CblasTrans,  A**T*x = b. \n
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,  A**H*x = b.
+  
+  \param[in] diag
+  diag is enum CBLAS_DIAG
+  diag specifies whether or not A is unit triangular
+  as follows: \n
+  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.\n
+  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
+                                 triangular.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix A. n must be at least zero.
+
+  \param[in] Ap
+  Ap is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension 
+  ( ( n*( n + 1 ) )/2 ). \n
+  Before entry with  UPLO = CblasUpper, the array Ap must
+  contain the upper triangular matrix packed sequentially,
+  column by column, so that Ap( 1 ) contains a( 1, 1 ),
+  Ap( 2 ) and Ap( 3 ) contain a( 1, 2 ) and a( 2, 2 )
+  respectively, and so on. \n
+  Before entry with UPLO = CblasLower, the array Ap must
+  contain the lower triangular matrix packed sequentially,
+  column by column, so that Ap( 1 ) contains a( 1, 1 ),
+  Ap( 2 ) and Ap( 3 ) contain a( 2, 1 ) and a( 3, 1 )
+  respectively, and so on. \n
+  Note that when  DIAG = CblasUnit, the diagonal elements of
+  A are not referenced, but are assumed to be unity.
+
+  \param[in, out] x
+  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the n
+  element right-hand side vector b. On exit, x is overwritten
+  with the solution vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+  */
+template <typename T>
+void tpsv(CBLAS_ORDER layout,
+          CBLAS_UPLO uplo,
+          CBLAS_TRANSPOSE trans,
+          CBLAS_DIAG diag,
+          int64_t n,
+          T const *Ap,
+          T *x,
+          int64_t incx)
+{
+    // Pure forwarding wrapper: the packed matrix Ap and the in/out vector x
+    // are passed straight through to cblas_tpsv with no extra processing.
+    cblas_tpsv(layout, uplo, trans, diag,
+               n,
+               Ap,
+               x, incx);
+}
+
+/*! \brief Perform the General matrix rank-1 update for arbitrary data types 
+
+  \b Purpose:	
+
+  GER  performs the rank 1 operation for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL
+
+  A := alpha*x*y**T + A,
+ 
+  where alpha is a scalar, x is an m element vector, y is an n element
+  vector and A is an m by n matrix.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] m
+  m is INTEGER
+  On entry,  m specifies the number of rows of the matrix A.
+  m must be at least zero.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the number of columns of the matrix A.
+  n must be at least zero.
+
+  \param[in] alpha
+  alpha is REAL/DOUBLE PRECISION
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] x
+  x is REAL/DOUBLE PRECISION array,dimension : 
+  at least ( 1 + ( m - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the m
+  element vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] y
+  y is REAL/DOUBLE PRECISION array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry, the incremented array y must contain the n
+  element vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+
+  \param[in,out] A
+  A is REAL/DOUBLE PRECISION array,dimension ( lda, n )\n
+  Before entry, the leading m by n part of the array A must
+  contain the matrix of coefficients. On exit, A is
+  overwritten by the updated matrix.  
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  lda must be at least max( 1, m ).
+  */
+template <typename T>
+void ger(CBLAS_ORDER layout,
+         int64_t m,
+         int64_t n,
+         T alpha,
+         T const *x,
+         int64_t incx,
+         T const *y,
+         int64_t incy,
+         T *A,
+         int64_t lda)
+{
+    // Pure forwarding wrapper: all arguments go to cblas_ger unchanged;
+    // A is the only operand passed as a mutable pointer.
+    cblas_ger(layout,
+              m, n,
+              alpha,
+              x, incx,
+              y, incy,
+              A, lda);
+}
+
+/*! \brief Perform the General matrix rank-1 update for arbitrary data types 
+
+  \b Purpose:	
+
+  GERU  performs the rank 1 operation for arbitrary data types
+  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  A := alpha*x*y**T + A,
+ 
+  where alpha is a scalar, x is an m element vector, y is an n element
+  vector and A is an m by n matrix.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] m
+  m is INTEGER
+  On entry,  m specifies the number of rows of the matrix A.
+  m must be at least zero.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the number of columns of the matrix A.
+  n must be at least zero.
+
+  \param[in] alpha
+  alpha is SINGLE/DOUBLE PRECISION COMPLEX
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] x
+  x is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
+  at least ( 1 + ( m - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the m
+  element vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] y
+  y is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry, the incremented array y must contain the n
+  element vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+
+  \param[in,out] A
+  A is SINGLE/DOUBLE PRECISION COMPLEX array,dimension ( lda, n )\n
+  Before entry, the leading m by n part of the array A must
+  contain the matrix of coefficients. On exit, A is
+  overwritten by the updated matrix.  
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  lda must be at least max( 1, m ).
+  */
+template <typename T>
+void geru(CBLAS_ORDER layout,
+          int64_t m,
+          int64_t n,
+          T alpha,
+          T const *x,
+          int64_t incx,
+          T const *y,
+          int64_t incy,
+          T *A,
+          int64_t lda)
+{
+    // Pure forwarding wrapper: all arguments go to cblas_geru unchanged;
+    // A is the only operand passed as a mutable pointer.
+    cblas_geru(layout,
+               m, n,
+               alpha,
+               x, incx,
+               y, incy,
+               A, lda);
+}
+
+/*! \brief Perform the General matrix rank-1 update for arbitrary data types 
+
+  \b Purpose:	
+
+  GERC  performs the rank 1 operation for arbitrary data types
+  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  A := alpha*x*y**H + A,
+ 
+  where alpha is a scalar, x is an m element vector, y is an n element
+  vector and A is an m by n matrix.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] m
+  m is INTEGER
+  On entry,  m specifies the number of rows of the matrix A.
+  m must be at least zero.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the number of columns of the matrix A.
+  n must be at least zero.
+
+  \param[in] alpha
+  alpha is SINGLE/DOUBLE PRECISION COMPLEX
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] x
+  x is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
+  at least ( 1 + ( m - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the m
+  element vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] y
+  y is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry, the incremented array y must contain the n
+  element vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+
+  \param[in,out] A
+  A is SINGLE/DOUBLE PRECISION COMPLEX array,dimension ( lda, n )\n
+  Before entry, the leading m by n part of the array A must
+  contain the matrix of coefficients. On exit, A is
+  overwritten by the updated matrix.  
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  lda must be at least max( 1, m ).
+  */
+template <typename T>
+void gerc(CBLAS_ORDER layout,
+          int64_t m,
+          int64_t n,
+          T alpha,
+          T const *x,
+          int64_t incx,
+          T const *y,
+          int64_t incy,
+          T *A,
+          int64_t lda)
+{
+    // Pure forwarding wrapper: all arguments go to cblas_gerc unchanged;
+    // A is the only operand passed as a mutable pointer.
+    cblas_gerc(layout,
+               m, n,
+               alpha,
+               x, incx,
+               y, incy,
+               A, lda);
+}
+
+/*! \brief Perform the hermitian rank 1 operation for arbitrary data types 
+
+  \b Purpose:	
+
+  HER  performs the hermitian rank 1 operation for arbitrary data types
+  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  A := alpha*x*x**H + A,
+ 
+  where alpha is a real scalar, x is an n element vector, A is an n by n
+  hermitian matrix.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the upper or lower triangular
+  part of the array A is to be referenced as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of A
+                                  is to be referenced. \n
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of A
+                                  is to be referenced.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix A.
+  n must be at least zero.
+
+  \param[in] alpha
+  alpha is SINGLE/DOUBLE PRECISION REAL
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] x
+  x is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the n
+  element vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in,out] A
+  A is SINGLE/DOUBLE PRECISION COMPLEX array,dimension ( lda, n )\n
+  Before entry with  UPLO = CblasUpper, the leading n by n
+  upper triangular part of the array A must contain the upper
+  triangular part of the hermitian matrix and the strictly
+  lower triangular part of A is not referenced. On exit, the
+  upper triangular part of the array A is overwritten by the
+  upper triangular part of the updated matrix. \n
+  Before entry with UPLO = CblasLower, the leading n by n
+  lower triangular part of the array A must contain the lower
+  triangular part of the hermitian matrix and the strictly
+  upper triangular part of A is not referenced. On exit, the
+  lower triangular part of the array A is overwritten by the
+  lower triangular part of the updated matrix. \n
+  Note that the imaginary parts of the diagonal elements need
+  not be set, they are assumed to be zero, and on exit they
+  are set to zero.  
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  lda must be at least max( 1, n ).
+  */
+template <typename T>
+void her(CBLAS_ORDER layout,
+         CBLAS_UPLO uplo,
+         int64_t n,
+         real_type<T> alpha,  // real-valued scalar: e.g. zher takes a double alpha
+         T const *x,
+         int64_t incx,
+         T *A,
+         int64_t lda)
+{
+    // Pure forwarding wrapper: all arguments go to cblas_her unchanged;
+    // A is the only operand passed as a mutable pointer.
+    cblas_her(layout, uplo,
+              n,
+              alpha,
+              x, incx,
+              A, lda);
+}
+
+/*! \brief Perform the hermitian rank 1 operation for arbitrary data types 
+
+  \b Purpose:	
+
+  HPR  performs the hermitian rank 1 operation for arbitrary data types
+  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  A := alpha*x*x**H + A,
+ 
+  where alpha is a real scalar, x is an n element vector, A is an n by n
+  hermitian matrix, supplied in packed form.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the upper or lower triangular
+  part of the array A is to be referenced as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   The upper triangular part of A is
+                                  supplied in Ap. \n
+  uplo = CBLAS_UPLO::CblasLower   The lower triangular part of A is
+                                  supplied in Ap.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix A.
+  n must be at least zero.
+
+  \param[in] alpha
+  alpha is SINGLE/DOUBLE PRECISION REAL
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] x
+  x is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the n
+  element vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in,out] Ap
+  Ap is SINGLE/DOUBLE PRECISION COMPLEX array,dimension 
+  at least ( ( n*( n + 1 ) )/2 ).\n
+  Before entry with  UPLO = CblasUpper, the array Ap must
+  contain the upper triangular part of the hermitian matrix
+  packed sequentially, column by column, so that Ap( 1 )
+  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 1, 2 )
+  and a( 2, 2 ) respectively, and so on. On exit, the array
+  Ap is overwritten by the upper triangular part of the
+  updated matrix. \n
+  Before entry with UPLO = CblasLower, the array Ap must
+  contain the lower triangular part of the hermitian matrix
+  packed sequentially, column by column, so that Ap( 1 )
+  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 2, 1 )
+  and a( 3, 1 ) respectively, and so on. On exit, the array
+  Ap is overwritten by the lower triangular part of the
+  updated matrix. \n
+  Note that the imaginary parts of the diagonal elements need
+  not be set, they are assumed to be zero, and on exit they
+  are set to zero.  
+  */
+template <typename T>
+void hpr(CBLAS_ORDER layout,
+         CBLAS_UPLO uplo,
+         int64_t n,
+         real_type<T> alpha,  // real-valued scalar: e.g. zhpr takes a double alpha
+         T const *x,
+         int64_t incx,
+         T *Ap)
+{
+    // Pure forwarding wrapper: all arguments go to cblas_hpr unchanged;
+    // the packed matrix Ap is the only operand passed as a mutable pointer.
+    cblas_hpr(layout, uplo,
+              n,
+              alpha,
+              x, incx,
+              Ap);
+}
+
+/*! \brief Perform the hermitian rank 2 operation for arbitrary data types 
+
+  \b Purpose:	
+
+  HER2  performs the hermitian rank 2 operation for arbitrary data types
+  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
+ 
+  where alpha is a scalar, x and y are n element vectors, A is an n by n
+  hermitian matrix.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the upper or lower triangular part of the 
+  array A is to be referenced as follows: \n
+  UPLO = CblasUpper   Only the upper triangular part of A
+                      is to be referenced. \n
+  UPLO = CblasLower   Only the lower triangular part of A
+                      is to be referenced.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix A.
+  n must be at least zero.
+
+  \param[in] alpha
+  alpha is SINGLE/DOUBLE PRECISION COMPLEX
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] x
+  x is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the n
+  element vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] y
+  y is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry, the incremented array y must contain the n
+  element vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+  
+  \param[in,out] A
+  A is SINGLE/DOUBLE PRECISION COMPLEX array,dimension ( lda, n )\n
+  Before entry with  UPLO = CblasUpper, the leading n by n
+  upper triangular part of the array A must contain the upper
+  triangular part of the hermitian matrix and the strictly
+  lower triangular part of A is not referenced. On exit, the
+  upper triangular part of the array A is overwritten by the
+  upper triangular part of the updated matrix. \n
+  Before entry with UPLO = CblasLower, the leading n by n
+  lower triangular part of the array A must contain the lower
+  triangular part of the hermitian matrix and the strictly
+  upper triangular part of A is not referenced. On exit, the
+  lower triangular part of the array A is overwritten by the
+  lower triangular part of the updated matrix. \n
+  Note that the imaginary parts of the diagonal elements need
+  not be set, they are assumed to be zero, and on exit they
+  are set to zero.  
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  lda must be at least max( 1, n ).
+  */
+template <typename T>
+void her2(CBLAS_ORDER layout,
+          CBLAS_UPLO uplo,
+          int64_t n,
+          T alpha,
+          T const *x,
+          int64_t incx,
+          T const *y,
+          int64_t incy,
+          T *A,
+          int64_t lda)
+{
+    // Pure forwarding wrapper: all arguments go to cblas_her2 unchanged;
+    // A is the only operand passed as a mutable pointer.
+    cblas_her2(layout, uplo,
+               n,
+               alpha,
+               x, incx,
+               y, incy,
+               A, lda);
+}
+
+/*! \brief Perform the hermitian rank 2 operation for arbitrary data types 
+
+  \b Purpose:	
+
+  HPR2  performs the hermitian rank 2 operation for arbitrary data types
+  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
+ 
+  where alpha is a scalar, x and y are n element vectors, A is an n by n
+  hermitian matrix, supplied in packed form.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the upper or lower triangular
+  part of the array A is to be referenced as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   The upper triangular part of A is
+                                  supplied in Ap. \n
+  uplo = CBLAS_UPLO::CblasLower   The lower triangular part of A is
+                                  supplied in Ap.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix A.
+  n must be at least zero.
+
+  \param[in] alpha
+  alpha is SINGLE/DOUBLE PRECISION COMPLEX
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] x
+  x is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the n
+  element vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] y
+  y is SINGLE/DOUBLE PRECISION COMPLEX array,dimension :
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry, the incremented array y must contain the n
+  element vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+  
+  \param[in,out] Ap
+  Ap is SINGLE/DOUBLE PRECISION COMPLEX array,dimension 
+  at least ( ( n*( n + 1 ) )/2 ).\n
+  Before entry with  UPLO = CblasUpper, the array Ap must
+  contain the upper triangular part of the hermitian matrix
+  packed sequentially, column by column, so that Ap( 1 )
+  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 1, 2 )
+  and a( 2, 2 ) respectively, and so on. On exit, the array
+  Ap is overwritten by the upper triangular part of the
+  updated matrix. \n
+  Before entry with UPLO = CblasLower, the array Ap must
+  contain the lower triangular part of the hermitian matrix
+  packed sequentially, column by column, so that Ap( 1 )
+  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 2, 1 )
+  and a( 3, 1 ) respectively, and so on. On exit, the array
+  Ap is overwritten by the lower triangular part of the
+  updated matrix. \n
+  Note that the imaginary parts of the diagonal elements need
+  not be set, they are assumed to be zero, and on exit they
+  are set to zero.  
+  */
+template <typename T>
+void hpr2(CBLAS_ORDER layout,
+          CBLAS_UPLO uplo,
+          int64_t n,
+          T alpha,
+          T const *x,
+          int64_t incx,
+          T const *y,
+          int64_t incy,
+          T *Ap)
+{
+    // Pure forwarding wrapper: all arguments go to cblas_hpr2 unchanged;
+    // the packed matrix Ap is the only operand passed as a mutable pointer.
+    cblas_hpr2(layout, uplo,
+               n,
+               alpha,
+               x, incx,
+               y, incy,
+               Ap);
+}
+
+/*! \brief Perform the symmetric rank 1 operation for arbitrary data types 
+
+  \b Purpose:	
+
+  SYR performs the symmetric rank 1 operation for arbitrary data types
+  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
+
+  A := alpha*x*x**T + A,
+ 
+  where alpha is a real scalar, x is an n element vector, A is an n by n
+  symmetric matrix.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the upper or lower triangular
+  part of the array A is to be referenced as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of A
+                                  is to be referenced. \n
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of A
+                                  is to be referenced.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix A.
+  n must be at least zero.
+
+  \param[in] alpha
+  alpha is SINGLE/DOUBLE PRECISION REAL
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] x
+  x is SINGLE/DOUBLE PRECISION REAL array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the n
+  element vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in,out] A
+  A is SINGLE/DOUBLE PRECISION REAL array,dimension ( lda, n )\n
+  Before entry with  UPLO = CblasUpper, the leading n by n
+  upper triangular part of the array A must contain the upper
+  triangular part of the symmetric matrix and the strictly
+  lower triangular part of A is not referenced. On exit, the
+  upper triangular part of the array A is overwritten by the
+  upper triangular part of the updated matrix. \n
+  Before entry with UPLO = CblasLower, the leading n by n
+  lower triangular part of the array A must contain the lower
+  triangular part of the symmetric matrix and the strictly
+  upper triangular part of A is not referenced. On exit, the
+  lower triangular part of the array A is overwritten by the
+  lower triangular part of the updated matrix. \n
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  lda must be at least max( 1, n ).
+  */
+template <typename T>
+void syr(CBLAS_ORDER layout,
+         CBLAS_UPLO uplo,
+         int64_t n,
+         T alpha,
+         T const *x,
+         int64_t incx,
+         T *A,
+         int64_t lda)
+{
+    // Pure forwarding wrapper: all arguments go to cblas_syr unchanged;
+    // A is the only operand passed as a mutable pointer.
+    cblas_syr(layout, uplo,
+              n,
+              alpha,
+              x, incx,
+              A, lda);
+}
+
+/*! \brief Perform the symmetric rank 1 operation for arbitrary data types 
+
+  \b Purpose:	
+
+  SPR  performs the symmetric rank 1 operation for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL
+
+  A := alpha*x*x**T + A,
+ 
+  where alpha is a real scalar, x is an n element vector, A is an n by n
+  symmetric matrix, supplied in packed form.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the upper or lower triangular
+  part of the array A is to be referenced as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   The upper triangular part of A is
+                                  supplied in Ap. \n
+  uplo = CBLAS_UPLO::CblasLower   The lower triangular part of A is
+                                  supplied in Ap.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix A.
+  n must be at least zero.
+
+  \param[in] alpha
+  alpha is SINGLE/DOUBLE PRECISION REAL
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] x
+  x is SINGLE/DOUBLE PRECISION REAL array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the n
+  element vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in,out] Ap
+  Ap is SINGLE/DOUBLE PRECISION REAL array,dimension 
+  at least ( ( n*( n + 1 ) )/2 ).\n
+  Before entry with  UPLO = CblasUpper, the array Ap must
+  contain the upper triangular part of the symmetric matrix
+  packed sequentially, column by column, so that Ap( 1 )
+  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 1, 2 )
+  and a( 2, 2 ) respectively, and so on. On exit, the array
+  Ap is overwritten by the upper triangular part of the
+  updated matrix. \n
+  Before entry with UPLO = CblasLower, the array Ap must
+  contain the lower triangular part of the symmetric matrix
+  packed sequentially, column by column, so that Ap( 1 )
+  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 2, 1 )
+  and a( 3, 1 ) respectively, and so on. On exit, the array
+  Ap is overwritten by the lower triangular part of the
+  updated matrix. \n  
+  */
+template <typename T>
+void spr(CBLAS_ORDER layout,
+         CBLAS_UPLO uplo,
+         int64_t n,
+         T alpha,
+         T const *x,
+         int64_t incx,
+         T *Ap)
+{
+    // Pure forwarding wrapper: all arguments go to cblas_spr unchanged;
+    // the packed matrix Ap is the only operand passed as a mutable pointer.
+    cblas_spr(layout, uplo,
+              n,
+              alpha,
+              x, incx,
+              Ap);
+}
+
+/*! \brief Perform the symmetric rank 2 operation for arbitrary data types 
+
+  \b Purpose:	
+
+  SYR2  performs the symmetric rank 2 operation for arbitrary data types
+  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
+
+  A := alpha*x*y**T + alpha*y*x**T + A,
+ 
+  where alpha is a scalar, x and y are n element vectors, A is an n by n
+  symmetric matrix.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the upper or lower triangular part of the 
+  array A is to be referenced as follows: \n
+  UPLO = CblasUpper   Only the upper triangular part of A
+                      is to be referenced. \n
+  UPLO = CblasLower   Only the lower triangular part of A
+                      is to be referenced.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix A.
+  n must be at least zero.
+
+  \param[in] alpha
+  alpha is SINGLE/DOUBLE PRECISION REAL
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] x
+  x is SINGLE/DOUBLE PRECISION REAL array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the n
+  element vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] y
+  y is SINGLE/DOUBLE PRECISION REAL array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry, the incremented array y must contain the n
+  element vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+  
+  \param[in,out] A
+  A is SINGLE/DOUBLE PRECISION REAL array,dimension ( lda, n )\n
+  Before entry with  UPLO = CblasUpper, the leading n by n
+  upper triangular part of the array A must contain the upper
+  triangular part of the symmetric matrix and the strictly
+  lower triangular part of A is not referenced. On exit, the
+  upper triangular part of the array A is overwritten by the
+  upper triangular part of the updated matrix. \n
+  Before entry with UPLO = CblasLower, the leading n by n
+  lower triangular part of the array A must contain the lower
+  triangular part of the symmetric matrix and the strictly
+  upper triangular part of A is not referenced. On exit, the
+  lower triangular part of the array A is overwritten by the
+  lower triangular part of the updated matrix. \n
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  lda must be at least max( 1, n ).
+  */
+template <typename T>
+void syr2(CBLAS_ORDER layout,
+          CBLAS_UPLO uplo,
+          int64_t n,
+          T alpha,
+          T const *x,
+          int64_t incx,
+          T const *y,
+          int64_t incy,
+          T *A,
+          int64_t lda)
+{
+    // Pure forwarding wrapper: all arguments go to cblas_syr2 unchanged;
+    // A is the only operand passed as a mutable pointer.
+    cblas_syr2(layout, uplo,
+               n,
+               alpha,
+               x, incx,
+               y, incy,
+               A, lda);
+}
+
+/*! \brief Perform the symmetric rank 2 operation for arbitrary data types 
+
+  \b Purpose:	
+
+  SPR2  performs the symmetric rank 2 operation for arbitrary data types
+  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
+
+  A := alpha*x*y**T + alpha*y*x**T + A,
+ 
+  where alpha is a scalar, x and y are n element vectors, A is an n by n
+  symmetric matrix, supplied in packed form.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO.
+  uplo specifies whether the upper or lower triangular
+  part of the array A is to be referenced as follows: \n
+  uplo = CBLAS_UPLO::CblasUpper   The upper triangular part of A is
+                                  supplied in Ap. \n
+  uplo = CBLAS_UPLO::CblasLower   The lower triangular part of A is
+                                  supplied in Ap.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix A.
+  n must be at least zero.
+
+  \param[in] alpha
+  alpha is SINGLE/DOUBLE PRECISION REAL
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] x
+  x is SINGLE/DOUBLE PRECISION REAL array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
+  Before entry, the incremented array x must contain the n
+  element vector x.
+
+  \param[in] incx
+  incx is INTEGER
+  On entry, incx specifies the increment for the elements of x.
+  incx must not be zero.
+
+  \param[in] y
+  y is SINGLE/DOUBLE PRECISION REAL array,dimension : 
+  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
+  Before entry, the incremented array y must contain the n
+  element vector y.
+
+  \param[in] incy
+  incy is INTEGER
+  On entry, incy specifies the increment for the elements of y.
+  incy must not be zero.
+  
+  \param[in,out] Ap
+  Ap is SINGLE/DOUBLE PRECISION REAL array,dimension 
+  atleast ( ( n*( n + 1 ) )/2 ).\n
+  Before entry with  UPLO = CblasUpper, the array Ap must
+  contain the upper triangular part of the symmetric matrix
+  packed sequentially, column by column, so that Ap( 1 )
+  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 1, 2 )
+  and a( 2, 2 ) respectively, and so on. On exit, the array
+  Ap is overwritten by the upper triangular part of the
+  updated matrix. \n
+  Before entry with UPLO = CblasLower, the array Ap must
+  contain the lower triangular part of the symmetric matrix
+  packed sequentially, column by column, so that Ap( 1 )
+  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 2, 1 )
+  and a( 3, 1 ) respectively, and so on. On exit, the array
+  Ap is overwritten by the lower triangular part of the
+  updated matrix. \n  
+  */
+template< typename T >
+void spr2(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    int64_t n,
+    T alpha,
+    T const *x, int64_t incx,
+    T const *y, int64_t incy,
+    T       *Ap )
+{
+    cblas_spr2(layout, uplo, n, alpha, x, incx, y, incy, Ap);
+}
+
+/*! \brief General matrix-matrix multiply for arbitrary data types
+
+  \b Purpose:
+
+  GEMM  performs general matrix-matrix multiply for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  C := alpha*op( A )*op( B ) + beta*C,
+
+  where  op( X ) is one of
+
+  op( X ) = X   or   op( X ) = X**T   or   op( X ) = X**H,
+
+  alpha and beta are scalars, and A, B and C are matrices, with op( A )
+  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] transA
+  transA is CBLAS_TRANSPOSE
+  On entry, transA specifies the form of op( A ) to be used in
+  the matrix multiplication as follows:
+  transA = CBLAS_TRANSPOSE::CblasNoTrans,  op( A ) = A.
+  transA = CBLAS_TRANSPOSE::CblasTrans,  op( A ) = A**T.
+  transA = CBLAS_TRANSPOSE::CblasConjTrans,  op( A ) = A**H.
+
+  \param[in] transB
+  transB is CBLAS_TRANSPOSE
+  On entry, transB specifies the form of op( B ) to be used in
+  the matrix multiplication as follows:
+  transB = CBLAS_TRANSPOSE::CblasNoTrans,  op( B ) = B.
+  transB = CBLAS_TRANSPOSE::CblasTrans,  op( B ) = B**T.
+  transB = CBLAS_TRANSPOSE::CblasConjTrans,  op( B ) = B**H.
+
+  \param[in] m
+  m is INTEGER
+  On entry,  m  specifies  the number  of rows  of the  matrix
+  op( A )  and of the  matrix  C.  m  must  be at least  zero.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n  specifies the number  of columns of the matrix
+  op( B ) and the number of columns of the matrix C. n must be
+  at least zero.
+
+  \param[in] k
+  k is INTEGER
+  On entry,  k  specifies  the number of columns of the matrix
+  op( A ) and the number of rows of the matrix op( B ). k must
+  be at least  zero.
+
+  \param[in] alpha
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If transA = CblasNoTrans:
+  m-by-k , stored in an lda-by-k array [RowMajor: m-by-lda].
+  Otherwise:
+  k-by-m , stored in an lda-by-m array [RowMajor: k-by-lda].
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If transA = CblasNoTrans: lda >= max(1, m) [RowMajor: lda >= max(1, k)].
+  Otherwise:                lda >= max(1, k) [RowMajor: lda >= max(1, m)].
+
+  \param[in] B
+  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If transB = CblasNoTrans:
+  k-by-n , stored in an ldb-by-n array [RowMajor: k-by-ldb].
+  Otherwise:
+  n-by-k , stored in an ldb-by-k array [RowMajor: n-by-ldb].
+
+  \param[in] ldb
+  ldb is INTEGER
+  On entry, ldb specifies the Leading dimension of B
+  If transB = CblasNoTrans: ldb >= max(1, k) [RowMajor: ldb >= max(1, n)].
+  Otherwise:                ldb >= max(1, n) [RowMajor: ldb >= max(1, k)].
+
+  \param[in] beta
+  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar beta. When  beta  is
+  supplied as zero then C need not be set on input.
+
+  \param[in,out] C
+  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension :
+  m-by-n stored in an ldc-by-n array [RowMajor: m-by-ldc].
+  Before entry, the leading  m by n  part of the array  C must
+  contain the matrix  C,  except when  beta  is zero, in which
+  case C need not be set on entry.
+  On exit, the array  C  is overwritten by the  m by n  matrix
+  ( alpha*op( A )*op( B ) + beta*C ).
+
+  \param[in] ldc
+  ldc is INTEGER
+  On entry, ldc specifies the Leading dimension of C
+  ldc >= max(1, m) [RowMajor: ldc >= max(1, n)].
+  */	
+template< typename T >
 void gemm(
-    CBLAS_LAYOUT layout,
+    CBLAS_ORDER layout,
     CBLAS_TRANSPOSE transA,
     CBLAS_TRANSPOSE transB,
     int64_t m, int64_t n, int64_t k,
-    scalar_type<TA, TB, TC> alpha,
-    TA const *A, int64_t lda,
-    TB const *B, int64_t ldb,
-    scalar_type<TA, TB, TC> beta,
-    TC       *C, int64_t ldc )
+    T alpha,
+    T const *A, int64_t lda,
+    T const *B, int64_t ldb,
+    T beta,
+    T       *C, int64_t ldc )
 {
-//    printf("In gemm.cpp\n");
     cblas_gemm(layout, transA, transB, m, n, k, alpha, A,lda, B, ldb, beta, C, ldc);
+}
 
-};
+/*! \brief Solve the triangular matrix-matrix equation for arbitrary data types
+
+  \b Purpose:
+
+  TRSM  solves  one of the matrix equations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+
+  where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+
+
+  op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
+
+  The matrix X is overwritten on B.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] side
+  side is enum CBLAS_SIDE
+  side specifies whether op( A ) appears on the left
+  or right of X as follows:
+  side = CBLAS_SIDE::CblasLeft   op( A )*X = alpha*B.
+  side = CBLAS_SIDE::CblasRight  X*op( A ) = alpha*B.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether the matrix A is an upper or
+  lower triangular matrix as follows:
+  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix.
+  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
+
+  \param[in] trans
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the form of op( A ) to be used in
+  the matrix equation as follows:
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,  op( A ) = A.
+  trans = CBLAS_TRANSPOSE::CblasTrans,  op( A ) = A**T.
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,  op( A ) = A**H.
+
+  \param[in] diag
+  diag is enum CBLAS_DIAG
+  diag specifies whether or not A is unit triangular
+  as follows:
+  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.
+  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
+                                 triangular.
+
+  \param[in] m
+  m is INTEGER
+  On entry,  m  specifies  the number  of rows  of the  matrix
+  B.  m  must  be at least  zero.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n  specifies the number  of columns of the matrix
+  B. n must be at least zero.
+
+  \param[in] alpha
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If side = CblasLeft:
+  the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda].
+  If side = CblasRight:
+  the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda].
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If side = CblasLeft: lda >= max(1, m) .
+  If side = CblasRight:lda >= max(1, n) .
+
+  \param[in,out] B
+  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb].
+  On exit, B is overwritten by the solution matrix  X.
+
+  \param[in] ldb
+  ldb is INTEGER
+  On entry, ldb specifies the Leading dimension of B
+  ldb >= max(1, m) [RowMajor: ldb >= max(1, n)].
+  */
+template< typename T >
+void trsm(
+    CBLAS_ORDER layout,
+    CBLAS_SIDE side,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    CBLAS_DIAG diag,
+    int64_t m,
+    int64_t n,
+    T alpha,
+    T const *A, int64_t lda,
+    T       *B, int64_t ldb )
+{
+    cblas_trsm( layout, side, uplo, trans, diag, m, n,  alpha, A, lda, B, ldb);
+}
+/*! \brief Perform the triangular matrix-matrix multiply for arbitrary data types
+
+  \b Purpose:
+
+  TRMM  performs one of the matrix-matrix operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  B := alpha*op( A )*B,   or   B := alpha*B*op( A ),
+
+  where alpha is a scalar, B is an m by n matrix, A is a unit, or
+  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+  op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] side
+  side is enum CBLAS_SIDE
+  side specifies whether op( A ) multiplies B from the left or the right
+  as follows:
+  side = CBLAS_SIDE::CblasLeft   B := alpha*op( A )*B.
+  side = CBLAS_SIDE::CblasRight  B := alpha*B*op( A ).
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether the matrix A is an upper or lower triangular
+  matrix as follows:
+  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix.
+  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
+
+  \param[in] trans
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the form of op( A ) to be used in
+  the matrix multiplication as follows:
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,  op( A ) = A.
+  trans = CBLAS_TRANSPOSE::CblasTrans,  op( A ) = A**T.
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,  op( A ) = A**H.
+
+  \param[in] diag
+  diag is enum CBLAS_DIAG
+  diag specifies whether or not A is unit triangular
+  as follows:
+  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.
+  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
+                                 triangular.
+
+  \param[in] m
+  m is INTEGER
+  On entry,  m  specifies  the number  of rows  of the  matrix
+  B.  m  must  be at least  zero.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n  specifies the number  of columns of the matrix
+  B. n must be at least zero.
+
+  \param[in] alpha
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha. When  alpha is
+  zero then  A is not referenced and  B need not be set before
+  entry.
+
+  \param[in] A
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If side = CblasLeft:
+  the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda].
+  If side = CblasRight:
+  the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda].
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If side = CblasLeft: lda >= max(1, m) .
+  If side = CblasRight:lda >= max(1, n) .
+
+  \param[in,out] B
+  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb].
+
+  \param[in] ldb
+  ldb is INTEGER
+  On entry, ldb specifies the Leading dimension of B
+  ldb >= max(1, m) [RowMajor: ldb >= max(1, n)].
+  */
+template< typename T >
+void trmm(
+    CBLAS_ORDER layout,
+    CBLAS_SIDE side,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    CBLAS_DIAG diag,
+    int64_t m,
+    int64_t n,
+    T alpha,
+    T const *A, int64_t lda,
+    T       *B, int64_t ldb )
+{
+    cblas_trmm( layout, side, uplo, trans, diag, m, n,  alpha, A, lda, B, ldb);
+}
+
+/*! \brief Perform the Hermitian matrix-matrix multiply for arbitrary data types
+
+  \b Purpose:
+
+  HEMM  performs one of the matrix-matrix operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  C := alpha*A*B + beta*C
+
+  or
+
+  C := alpha*B*A + beta*C,
+
+  where alpha and beta are scalars,  A is a Hermitian matrix and
+  C and B are m by n matrices
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] side
+  side is enum CBLAS_SIDE
+  side specifies whether the  Hermitian matrix  A
+  appears on the  left or right  in the  operation as follows:
+  side = CBLAS_SIDE::CblasLeft   C := alpha*A*B + beta*C,
+  side = CBLAS_SIDE::CblasRight   C := alpha*B*A + beta*C
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether  the  upper  or  lower
+  triangular  part  of  the  Hermitian  matrix   A  is  to  be
+  referenced as follows:
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of the
+                                  Hermitian matrix is to be referenced.
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of the
+                                  Hermitian matrix is to be referenced.
+
+  \param[in] m
+  m is INTEGER
+  On entry,  m  specifies  the number  of rows  of the  matrix
+  C.  m  must  be at least  zero.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n  specifies the number  of columns of the matrix
+  C. n must be at least zero.
+
+  \param[in] alpha
+  alpha is COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is COMPLEX/COMPLEX*16 array,dimension :
+  If side = CblasLeft:
+  the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda].
+  If side = CblasRight:
+  the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda].
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If side = CblasLeft: lda >= max(1, m) .
+  If side = CblasRight:lda >= max(1, n) .
+
+  \param[in] B
+  B is COMPLEX/COMPLEX*16 array,dimension :
+  m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb].
+
+  \param[in] ldb
+  ldb is INTEGER
+  On entry, ldb specifies the Leading dimension of B
+  ldb >= max(1, m) [RowMajor: ldb >= max(1, n)].
+
+  \param[in] beta
+  beta is COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar beta.
+  If beta is zero, C need not be set on input
+
+  \param[in,out] C
+  C is COMPLEX/COMPLEX*16 array,dimension :
+  m-by-n , stored in an ldc-by-n array [RowMajor: m-by-ldc].
+
+  \param[in] ldc
+  ldc is INTEGER
+  On entry, ldc specifies the Leading dimension of C
+  ldc >= max(1, m) [RowMajor: ldc >= max(1, n)].
+  */
+template< typename T >
+void hemm(
+    CBLAS_ORDER layout,
+    CBLAS_SIDE side,
+    CBLAS_UPLO uplo,
+    int64_t m, int64_t n,
+    T alpha,
+    T const *A, int64_t lda,
+    T const *B, int64_t ldb,
+    T beta,
+    T       *C, int64_t ldc )
+{
+    cblas_hemm( layout, side, uplo, m, n,  alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+/*! \brief Perform the symmetric matrix-matrix multiply for arbitrary data types
+
+  \b Purpose:
+
+  SYMM  performs one of the matrix-matrix operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  C := alpha*A*B + beta*C
+
+  or
+
+  C := alpha*B*A + beta*C,
+
+  where alpha and beta are scalars,  A is a symmetric matrix and
+  C and B are m by n matrices
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] side
+  side is enum CBLAS_SIDE
+  side specifies whether the  symmetric matrix  A
+  appears on the  left or right  in the  operation as follows:
+  side = CBLAS_SIDE::CblasLeft   C := alpha*A*B + beta*C,
+  side = CBLAS_SIDE::CblasRight   C := alpha*B*A + beta*C
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether  the  upper  or  lower
+  triangular  part  of  the  symmetric  matrix   A  is  to  be
+  referenced as follows:
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of the
+                                  symmetric matrix is to be referenced.
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of the
+                                  symmetric matrix is to be referenced.
+
+  \param[in] m
+  m is INTEGER
+  On entry,  m  specifies  the number  of rows  of the  matrix
+  C.  m  must  be at least  zero.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n  specifies the number  of columns of the matrix
+  C. n must be at least zero.
+
+  \param[in] alpha
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If side = CblasLeft:
+  the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda].
+  If side = CblasRight:
+  the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda].
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If side = CblasLeft: lda >= max(1, m) .
+  If side = CblasRight:lda >= max(1, n) .
+
+  \param[in] B
+  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb].
+
+  \param[in] ldb
+  ldb is INTEGER
+  On entry, ldb specifies the Leading dimension of B
+  ldb >= max(1, m) [RowMajor: ldb >= max(1, n)].
+
+  \param[in] beta
+  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar beta.
+  If beta is zero, C need not be set on input
+
+  \param[in,out] C
+  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  m-by-n , stored in an ldc-by-n array [RowMajor: m-by-ldc].
+
+  \param[in] ldc
+  ldc is INTEGER
+  On entry, ldc specifies the Leading dimension of C
+  ldc >= max(1, m) [RowMajor: ldc >= max(1, n)].
+  */
+template< typename T >
+void symm(
+    CBLAS_ORDER layout,
+    CBLAS_SIDE side,
+    CBLAS_UPLO uplo,
+    int64_t m, int64_t n,
+    T alpha,
+    T const *A, int64_t lda,
+    T const *B, int64_t ldb,
+    T beta,
+    T       *C, int64_t ldc )
+{
+    cblas_symm( layout, side, uplo, m, n,  alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+/*! \brief Perform the symmetric rank-k operations for arbitrary data types
+
+  \b Purpose:
+
+  SYRK  performs one of the symmetric rank k operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  C := alpha*A*A**T + beta*C,
+
+  or
+
+  C := alpha*A**T*A + beta*C,
+
+  where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+  and  A  is an  n by k  matrix in the first case and a  k by n  matrix
+  in the second case.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether  the  upper  or  lower
+  triangular  part  of  the  array C  is  to  be  referenced
+  as follows:
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of C
+                                  is to be referenced.
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of C
+                                  is to be referenced.
+
+  \param[in] trans
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be used as follows:
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,C := alpha*A*A**T + beta*C.
+  trans = CBLAS_TRANSPOSE::CblasTrans,C := alpha*A**T*A + beta*C.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix C.  n must be
+  at least zero.
+
+  \param[in] k
+  k is INTEGER
+  If trans = CblasNoTrans: k is number of columns of the matrix A.
+  Otherwise:               k is number of rows    of the matrix A.
+  k must be at least  zero.
+
+  \param[in] alpha
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If trans = CblasNoTrans:
+  n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda].
+  Otherwise:
+  k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda].
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If trans = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)].
+  Otherwise:               lda >= max(1, k) [RowMajor: lda >= max(1, n)].
+
+  \param[in] beta
+  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar beta. When  beta  is
+  supplied as zero then C need not be set on input.
+
+  \param[in,out] C
+  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension :
+  The n-by-n symmetric matrix C,
+  stored in an ldc-by-n array [RowMajor: n-by-ldc].
+  On exit, the array  C  is overwritten by the  lower/upper
+  triangular part of the updated matrix.
+
+  \param[in] ldc
+  ldc is INTEGER
+  On entry, ldc specifies the Leading dimension of C
+  ldc >= max(1, n)
+  */
+template< typename T >
+void syrk(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    int64_t n, int64_t k,
+    T alpha,
+    T const *A, int64_t lda,
+    T beta,
+    T       *C, int64_t ldc )
+{
+    cblas_syrk( layout, uplo, trans,  n, k,  alpha, A, lda, beta, C, ldc);
+}
+
+/*! \brief Perform the symmetric rank 2k operations for arbitrary data types
+
+  \b Purpose:
+
+  SYR2K  performs one of the symmetric rank 2k operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
+  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+  C := alpha*A*B**T + alpha*B*A**T + beta*C,
+
+  or
+
+  C := alpha*A**T*B + alpha*B**T*A + beta*C,
+
+  where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+  and  A  and B are n by k  matrices in the first case and k by n  matrices
+  in the second case.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether  the  upper  or  lower
+  triangular  part  of  the  array C  is  to  be  referenced
+  as follows:
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of C
+                                  is to be referenced.
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of C
+                                  is to be referenced.
+
+  \param[in] trans
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be used as follows:
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,C := alpha*A*B**T + alpha*B*A**T + beta*C.
+  trans = CBLAS_TRANSPOSE::CblasTrans,  C := alpha*A**T*B + alpha*B**T*A + beta*C.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix C.  n must be
+  at least zero.
+
+  \param[in] k
+  k is INTEGER
+  If trans = CblasNoTrans: k is number of columns of the matrices A & B.
+  Otherwise:               k is number of rows    of the matrices A & B.
+  k must be at least  zero.
+
+  \param[in] alpha
+  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If trans = CblasNoTrans:
+  n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda].
+  Otherwise:
+  k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda].
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If trans = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)].
+  Otherwise:               lda >= max(1, k) [RowMajor: lda >= max(1, n)].
+
+  \param[in] B
+  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
+  If trans = CblasNoTrans:
+  n-by-k , stored in an ldb-by-k array [RowMajor: n-by-ldb].
+  Otherwise:
+  k-by-n , stored in an ldb-by-n array [RowMajor: k-by-ldb].
+
+  \param[in] ldb
+  ldb is INTEGER
+  On entry, ldb specifies the Leading dimension of B
+  If trans = CblasNoTrans: ldb >= max(1, n) [RowMajor: ldb >= max(1, k)].
+  Otherwise:               ldb >= max(1, k) [RowMajor: ldb >= max(1, n)].
+
+  \param[in] beta
+  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
+  On entry, beta specifies the scalar beta. When  beta  is
+  supplied as zero then C need not be set on input.
+
+  \param[in,out] C
+  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension :
+  The n-by-n symmetric matrix C,
+  stored in an ldc-by-n array [RowMajor: n-by-ldc].
+  On exit, the array  C  is overwritten by the  lower/upper
+  triangular part of the updated matrix.
+
+  \param[in] ldc
+  ldc is INTEGER
+  On entry, ldc specifies the Leading dimension of C
+  ldc >= max(1, n)
+  */
+template< typename T >
+void syr2k(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    int64_t n, int64_t k,
+    T alpha,
+    T const *A, int64_t lda,
+    T const *B, int64_t ldb,
+    T beta,
+    T       *C, int64_t ldc )
+{
+    cblas_syr2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc );	
+}
+
+/*! \brief Perform the Hermitian rank k operations for arbitrary data types
+
+  \b Purpose:
+
+  HERK  performs one of the Hermitian rank k operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION COMPLEX,
+  DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+    C := alpha*A*A**H + beta*C,
+
+  or
+
+    C := alpha*A**H*A + beta*C,
+
+  where  alpha and beta  are real scalars,  C is an  n by n  Hermitian
+  matrix and  A  is an n by k  matrix in the first case and a
+  k by n  matrix in the second case.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether  the  upper  or  lower
+  triangular  part  of  the  array C  is  to  be  referenced
+  as follows:
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of C
+                                  is to be referenced.
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of C
+                                  is to be referenced.
+
+  \param[in] trans
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be used as follows:
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,  C := alpha*A*A**H + beta*C.
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,C := alpha*A**H*A + beta*C.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix C.  n must be
+  at least zero.
+
+  \param[in] k
+  k is INTEGER
+  If trans = CblasNoTrans: k is number of columns of the matrix   A.
+  Otherwise:               k is number of rows    of the matrix   A.
+  k must be at least  zero.
+
+  \param[in] alpha
+  alpha is REAL/DOUBLE PRECISION
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is COMPLEX/COMPLEX*16 array,dimension :
+  If trans = CblasNoTrans:
+  n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda].
+  Otherwise:
+  k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda].
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If trans = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)].
+  Otherwise:               lda >= max(1, k) [RowMajor: lda >= max(1, n)].
+
+  \param[in] beta
+  beta is REAL/DOUBLE PRECISION
+  On entry, beta specifies the scalar beta. When  beta  is
+  supplied as zero then C need not be set on input.
+
+  \param[in,out] C
+  C is COMPLEX/COMPLEX*16 array, dimension :
+  The n-by-n Hermitian matrix C,
+  stored in an ldc-by-n array [RowMajor: n-by-ldc].
+  On exit, the array  C  is overwritten by the  lower/upper
+  triangular part of the updated matrix.
+
+  \param[in] ldc
+  ldc is INTEGER
+  On entry, ldc specifies the Leading dimension of C
+  ldc >= max(1, n)
+  */
+template< typename T >
+void herk(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    int64_t n, int64_t k,
+    real_type<T> alpha,
+    T const *A, int64_t lda,
+    real_type<T> beta,
+    T       *C, int64_t ldc )
+{
+    cblas_herk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc );	
+}
+
+/*! \brief Perform the Hermitian rank 2k operations for arbitrary data types
+
+  \b Purpose:
+
+  HER2K  performs one of the Hermitian rank 2k operations for arbitrary data types
+  Data precisions supported include SINGLE PRECISION COMPLEX,
+  DOUBLE PRECISION COMPLEX(COMPLEX*16)
+
+    C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C,
+
+  or
+
+    C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C,
+
+  where  alpha and beta  are scalars with  beta  real,  C is an  n by n
+  Hermitian matrix and  A  and B are n by k  matrices in the first case
+  and k by n  matrices in the second case.
+
+  \param[in] layout
+  layout is enum CBLAS_ORDER
+  layout specifies Matrix storage as follows:
+  layout = CBLAS_ORDER::CblasRowMajor or CBLAS_ORDER::CblasColMajor.
+
+  \param[in] uplo
+  uplo is enum CBLAS_UPLO
+  uplo specifies whether  the  upper  or  lower
+  triangular  part  of  the  array C  is  to  be  referenced
+  as follows:
+  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of C
+                                  is to be referenced.
+  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of C
+                                  is to be referenced.
+
+  \param[in] trans
+  trans is CBLAS_TRANSPOSE
+  On entry, trans specifies the operation to be used as follows:
+  trans = CBLAS_TRANSPOSE::CblasNoTrans,  C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C.
+  trans = CBLAS_TRANSPOSE::CblasConjTrans,C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C.
+
+  \param[in] n
+  n is INTEGER
+  On entry,  n specifies the order of the matrix C.  n must be
+  at least zero.
+
+  \param[in] k
+  k is INTEGER
+  If trans = CblasNoTrans: k is number of columns of the matrices A & B.
+  Otherwise:               k is number of rows    of the matrices A & B.
+  k must be at least  zero.
+
+  \param[in] alpha
+  alpha is COMPLEX/COMPLEX*16
+  On entry, alpha specifies the scalar alpha.
+
+  \param[in] A
+  A is COMPLEX/COMPLEX*16 array,dimension :
+  If trans = CblasNoTrans:
+  n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda].
+  Otherwise:
+  k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda].
+
+  \param[in] lda
+  lda is INTEGER
+  On entry, lda specifies the Leading dimension of A
+  If trans = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)].
+  Otherwise:               lda >= max(1, k) [RowMajor: lda >= max(1, n)].
+
+  \param[in] B
+  B is COMPLEX/COMPLEX*16 array,dimension :
+  If trans = CblasNoTrans:
+  n-by-k , stored in an ldb-by-k array [RowMajor: n-by-ldb].
+  Otherwise:
+  k-by-n , stored in an ldb-by-n array [RowMajor: k-by-ldb].
+
+  \param[in] ldb
+  ldb is INTEGER
+  On entry, ldb specifies the Leading dimension of B
+  If trans = CblasNoTrans: ldb >= max(1, n) [RowMajor: ldb >= max(1, k)].
+  Otherwise:               ldb >= max(1, k) [RowMajor: ldb >= max(1, n)].
+
+  \param[in] beta
+  beta is REAL/DOUBLE PRECISION
+  On entry, beta specifies the scalar beta. When  beta  is
+  supplied as zero then C need not be set on input.
+
+  \param[in,out] C
+  C is COMPLEX/COMPLEX*16 array, dimension :
+  The n-by-n Hermitian matrix C,
+  stored in an ldc-by-n array [RowMajor: n-by-ldc].
+  On exit, the array  C  is overwritten by the  lower/upper
+  triangular part of the updated matrix.
+
+  \param[in] ldc
+  ldc is INTEGER
+  On entry, ldc specifies the Leading dimension of C
+  ldc >= max(1, n)
+  */
+template< typename T >
+void her2k(
+    CBLAS_ORDER layout,
+    CBLAS_UPLO uplo,
+    CBLAS_TRANSPOSE trans,
+    int64_t n, int64_t k,
+    T alpha,
+    T const *A, int64_t lda,
+    T const *B, int64_t ldb,
+    real_type<T> beta,
+    T       *C, int64_t ldc )
+{
+    cblas_her2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc );	
+}
 
 }  // namespace blis
-
 #endif        //  #ifndef BLIS_HH
diff --git a/test/Makefile b/test/Makefile
index 5c4aca248..a2e0391d5 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -148,10 +148,10 @@ CBLAS_HEADER_PATH  = ../frame/compat/cblas/src
 CINCFLAGS      := -I$(INC_PATH) -I$(CBLAS_HEADER_PATH)
 
 # Use the CFLAGS for the configuration family.
-override CFLAGS         += $(call get-user-cflags-for,$(CONFIG_NAME))
+CFLAGS         := $(call get-user-cflags-for,$(CONFIG_NAME))
 
 # Add local header paths to CFLAGS
-override CFLAGS         += -I$(TEST_SRC_PATH)
+CFLAGS         += -I$(TEST_SRC_PATH)
 
 # Locate the libblis library to which we will link.
 #LIBBLIS_LINK   := $(LIB_PATH)/$(LIBBLIS_L)
diff --git a/test/test_gemm.c b/test/test_gemm.c
index c2730e68d..a0908965a 100644
--- a/test/test_gemm.c
+++ b/test/test_gemm.c
@@ -91,22 +91,13 @@ int main( int argc, char** argv )
 	n_input = 4;
 #endif
 
-#if 0
+#if 1
 //	dt = BLIS_FLOAT;
-//	dt = BLIS_DOUBLE;
+	dt = BLIS_DOUBLE;
 #else
 //	dt = BLIS_SCOMPLEX;
 //	dt = BLIS_DCOMPLEX;
 #endif
-#ifdef FLOAT
-        dt = BLIS_FLOAT;
-#elif defined DOUBLE
-        dt = BLIS_DOUBLE;
-#elif defined SCOMPLEX
-        dt = BLIS_SCOMPLEX;
-#elif defined DCOMPLEX
-        dt = BLIS_DCOMPLEX;
-#endif
 
 
 	transa = BLIS_NO_TRANSPOSE;
diff --git a/testcpp/Makefile b/testcpp/Makefile
index 3f2dc1327..62d0de335 100644
--- a/testcpp/Makefile
+++ b/testcpp/Makefile
@@ -1,3 +1,5 @@
+#
+#
 #  BLIS    
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
@@ -115,11 +117,7 @@ CINCFLAGS      := -I$(INC_PATH)
 CXX	= g++
 
 # Use the CFLAGS for the configuration family.
-<<<<<<< HEAD
-override CFLAGS         += $(call get-sandbox-cxxflags-for,$(CONFIG_NAME))
-=======
 override CFLAGS         += $(call get-user-cflags-for,$(CONFIG_NAME))
->>>>>>> Code Cleanup done; Test code updated to add performance measurement
 
 # Add local header paths to CFLAGS
 #CFLAGS         = -O0 -g -Wall 
@@ -140,150 +138,15 @@ LIBBLIS_LINK   := $(LIB_PATH)/$(LIBBLIS_L)
 
 # Complete list of possible targets when defining 'all':
 #
-<<<<<<< HEAD
 #   blis 
 #
+#all: blis
 all: blis 
 
-
-blis: test_asum_blis.x \
-	test_axpy_blis.x \
-	test_copy_blis.x \
-	test_dot_blis.x \
-	test_dotc_blis.x \
-	test_gbmv_blis.x \
-	test_gemm_blis.x \
-	test_gemv_blis.x \
-	test_ger_blis.x \
-	test_gerc_blis.x \
-	test_geru_blis.x \
-	test_hemm_blis.x \
-	test_hemv_blis.x \
-	test_her2_blis.x \
-	test_her_blis.x \
-	test_herk_blis.x \
-	test_hpr2_blis.x \
-	test_hpr_blis.x \
-	test_nrm2_blis.x \
-	test_rot_blis.x \
-	test_rotg_blis.x \
-	test_rotm_blis.x \
-	test_rotmg_blis.x \
-	test_scal_blis.x \
-	test_sdsdot_blis.x \
-	test_spr2_blis.x \
-	test_spr_blis.x \
-	test_swap_blis.x \
-	test_symm_blis.x \
-	test_syr2_blis.x \
-	test_syr2k_blis.x \
-	test_syr_blis.x \
-	test_syrk_blis.x \
-	test_tbmv_blis.x \
-	test_tbsv_blis.x \
-	test_tpmv_blis.x \
-	test_tpsv_blis.x \
-	test_trmm_blis.x \
-	test_trsm_blis.x \
-	test_trsv_blis.x 
- 
-=======
-#   blis openblas atlas mkl mac essl
-#
-#all: blis openblas atlas mkl
-all: blis openblas mkl
-
-blis: test_gemm_blis.x \
-	test_gemm1_blis.x
-
-openblas: \
-      test_dotv_openblas.x \
-      test_axpyv_openblas.x \
-      test_gemv_openblas.x \
-      test_ger_openblas.x \
-      test_hemv_openblas.x \
-      test_her_openblas.x \
-      test_her2_openblas.x \
-      test_trmv_openblas.x \
-      test_trsv_openblas.x \
-      \
-      test_gemm_openblas.x \
-      test_hemm_openblas.x \
-      test_herk_openblas.x \
-      test_her2k_openblas.x \
-      test_trmm_openblas.x \
-      test_trsm_openblas.x
-
-atlas: \
-      test_dotv_atlas.x \
-      test_axpyv_atlas.x \
-      test_gemv_atlas.x \
-      test_ger_atlas.x \
-      test_hemv_atlas.x \
-      test_her_atlas.x \
-      test_her2_atlas.x \
-      test_trmv_atlas.x \
-      test_trsv_atlas.x \
-      \
-      test_gemm_atlas.x \
-      test_hemm_atlas.x \
-      test_herk_atlas.x \
-      test_her2k_atlas.x \
-      test_trmm_atlas.x \
-      test_trsm_atlas.x
-
-mkl:  test_dotv_mkl.x \
-      test_axpyv_mkl.x \
-      test_gemv_mkl.x \
-      test_ger_mkl.x \
-      test_hemv_mkl.x \
-      test_her_mkl.x \
-      test_her2_mkl.x \
-      test_trmv_mkl.x \
-      test_trsv_mkl.x \
-      \
-      test_gemm_mkl.x \
-      test_hemm_mkl.x \
-      test_herk_mkl.x \
-      test_her2k_mkl.x \
-      test_trmm_mkl.x \
-      test_trsm_mkl.x
-
-essl: test_dotv_essl.x \
-      test_axpyv_essl.x \
-      test_gemv_essl.x \
-      test_ger_essl.x \
-      test_hemv_essl.x \
-      test_her_essl.x \
-      test_her2_essl.x \
-      test_trmv_essl.x \
-      test_trsv_essl.x \
-      \
-      test_gemm_essl.x \
-      test_hemm_essl.x \
-      test_herk_essl.x \
-      test_her2k_essl.x \
-      test_trmm_essl.x \
-      test_trsm_essl.x
-
-mac:  test_dotv_mac.x \
-      test_axpyv_mac.x \
-      test_gemv_mac.x \
-      test_ger_mac.x \
-      test_hemv_mac.x \
-      test_her_mac.x \
-      test_her2_mac.x \
-      test_trmv_mac.x \
-      test_trsv_mac.x \
-      \
-      test_gemm_mac.x \
-      test_hemm_mac.x \
-      test_herk_mac.x \
-      test_her2k_mac.x \
-      test_trmm_mac.x \
-      test_trsm_mac.x
-
->>>>>>> Code Cleanup done; Test code updated to add performance measurement
+blis: test_trsm_blis.x \
+      test_symm_blis.x \
+      test_hemm_blis.x \
+      test_gemm_blis.x
 
 
 # --Object file rules --
@@ -292,18 +155,18 @@ $(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.cc
 	$(CXX) $(CFLAGS) -c $< -o $@
 
 test_%_blis.o: test_%.cc
-<<<<<<< HEAD
-	@$(CXX) $(CFLAGS) -c $< -o $@
-=======
 	$(CXX) $(CFLAGS) -c $< -o $@
->>>>>>> Code Cleanup done; Test code updated to add performance measurement
 
 
 # -- Executable file rules --
 
+# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
+# on the link command line in case BLIS was configured with the BLAS
+# compatibility layer. This prevents BLIS from inadvertently getting called
+# for the BLAS routines we are trying to test with.
+
 test_%_blis.x: test_%_blis.o $(LIBBLIS_LINK)
-	@$(LINKER) $^       $(LIBBLIS_LINK) $(LDFLAGS) -o $@
-	./$@
+	$(LINKER) $^       $(LIBBLIS_LINK) $(LDFLAGS) -o $@
 
 # -- Clean rules --
 
diff --git a/testcpp/test_gemm1.cc b/testcpp/test_gemm1.cc
deleted file mode 100644
index c428ebbe7..000000000
--- a/testcpp/test_gemm1.cc
+++ /dev/null
@@ -1,421 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <complex>
-
-#include <stdio.h>
-#include <iostream>
-#include <string.h>
-#include <unistd.h>
-#include "blis.hh"
-using namespace std;
-
-
-//#define FILE_IN_OUT      // File based input matrix dimensions
-
-
-#define PRINT
-
-int main( int argc, char** argv )
-{
-	obj_t a, b, c;
-	obj_t c_save;
-	obj_t alpha, beta;
-	dim_t m, n, k;
-	dim_t p;
-	dim_t p_begin, p_end, p_inc;
-	int   m_input, n_input, k_input;
-	num_t dt;
-	int   r, n_repeats;
-	trans_t  transa;
-	trans_t  transb;
-//	f77_char f77_transa;
-//	f77_char f77_transb;
-
-	double dtime;
-	double dtime_save;
-	double gflops;
-
-
-#ifdef FILE_IN_OUT
-	FILE* fin  = NULL;
-	FILE* fout = NULL;
-	char gemm = 's';
-
-#endif
-
-	//bli_init();
-
-	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
-
-	n_repeats = 100;
-
-#ifndef PRINT
-	p_begin = 200;
-	p_end   = 2000;
-	p_inc   = 200;
-
-	m_input = -1;
-	n_input = -1;
-	k_input = -1;
-#else
-	p_begin = 16;
-	p_end   = 16;
-	p_inc   = 1;
-
-	m_input = 5;
-	k_input = 6;
-	n_input = 4;
-#endif
-
-#ifdef FLOAT
-	dt = BLIS_FLOAT;
-#elif defined DOUBLE
-	dt = BLIS_DOUBLE;
-#elif defined SCOMPLEX
-	dt = BLIS_SCOMPLEX;
-#elif defined DCOMPLEX
-	dt = BLIS_DCOMPLEX;
-#endif
-
-	transa = BLIS_NO_TRANSPOSE;
-	transb = BLIS_NO_TRANSPOSE;
-
-//	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
-//	bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
-
-
-#ifdef FILE_IN_OUT
-	if (argc < 3)
-	  {
-	    printf("Usage: ./test_gemm_XX.x input.csv output.csv\n");
-	    exit(1);
-	  }
-	fin = fopen(argv[1], "r");
-	if (fin == NULL)
-	  {
-	    printf("Error opening the file %s\n", argv[1]);
-	    exit(1);
-	  }
-	fout = fopen(argv[2], "w");
-	if (fout == NULL)
-	  {
-	    printf("Error opening output file %s\n", argv[2]);
-	    exit(1);
-	  }
-	fprintf(fout, "m\t k\t n\t cs_a\t cs_b\t cs_c\t gflops\t GEMM_Algo\n");
-
-
-	printf("~~~~~~~~~~_BLAS\t m\t k\t n\t cs_a\t cs_b\t cs_c \t gflops\t GEMM_Algo\n");
-
-	inc_t cs_a;
-	inc_t cs_b;
-	inc_t cs_c;
-
-	while (fscanf(fin, "%ld %ld %ld %ld %ld %ld\n", &m, &k, &n, &cs_a, &cs_b, &cs_c) == 6)
-	  {
-	    if ((m > cs_a) || (k > cs_b) || (m > cs_c)) continue; // leading dimension should be greater than number of rows
-	    
-	    bli_obj_create( dt, 1, 1, 0, 0, &alpha);
-	    bli_obj_create( dt, 1, 1, 0, 0, &beta );
-
-	    bli_obj_create( dt, m, k, 1, cs_a, &a );
-	    bli_obj_create( dt, k, n, 1, cs_b, &b );
-	    bli_obj_create( dt, m, n, 1, cs_c, &c );
-	    bli_obj_create( dt, m, n, 1, cs_c, &c_save );
-
-	    bli_obj_set_conjtrans( transa, &a);
-	    bli_obj_set_conjtrans( transb, &b);
-
-	    //bli_setsc( 0.0, -1, &alpha );
-	    //bli_setsc( 0.0, 1, &beta );
-
-	    bli_setsc( -1, 0.0, &alpha );
-	    bli_setsc( 1, 0.0, &beta );
-
-	    //	    printf("%1.1f %1.1f\n", *((double *)bli_obj_buffer_for_const(BLIS_FLOAT, &alpha)), *((double *)bli_obj_buffer_for_const(BLIS_FLOAT, &beta)));
-
-#else
-	for ( p = p_begin; p <= p_end; p += p_inc )
-	{
-		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
-		else               m =     ( dim_t )    m_input;
-		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
-		else               n =     ( dim_t )    n_input;
-		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
-		else               k =     ( dim_t )    k_input;
-
-		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
-		bli_obj_create( dt, 1, 1, 0, 0, &beta );
-
-		bli_obj_create( dt, m, k, 0, 0, &a );
-		bli_obj_create( dt, k, n, 0, 0, &b );
-		bli_obj_create( dt, m, n, 0, 0, &c );
-		bli_obj_create( dt, m, n, 0, 0, &c_save );
-
-		bli_randm( &a );
-		bli_randm( &b );
-		bli_randm( &c );
-
-		bli_obj_set_conjtrans( transa, &a );
-		bli_obj_set_conjtrans( transb, &b );
-
-		bli_setsc(  (0.9/1.0), 0.2, &alpha );
-		bli_setsc( -(1.1/1.0), 0.3, &beta );
-#endif
-
-		bli_copym( &c, &c_save );
-	
-		dtime_save = DBL_MAX;
-
-		for ( r = 0; r < n_repeats; ++r )
-		{
-			bli_copym( &c_save, &c );
-
-
-			dtime = bli_clock();
-
-
-#ifdef PRINT
-			bli_printm( "a", &a, "%4.1f", "" );
-			bli_printm( "b", &b, "%4.1f", "" );
-			bli_printm( "c", &c, "%4.1f", "" );
-#endif
-
-#if 0
-
-			bli_gemm( &alpha,
-			          &a,
-			          &b,
-			          &beta,
-			          &c );
-
-#else
-
-		if ( bli_is_float( dt ) )
-		{
-			int  M     = bli_obj_length( &c );
-			int  K     = bli_obj_width_after_trans( &a );
-			int  N     = bli_obj_width( &c );
-			int  lda    = bli_obj_col_stride( &a );
-			int  ldb    = bli_obj_col_stride( &b );
-			int  ldc    = bli_obj_col_stride( &c );
-			float*   alphap = (float *)bli_obj_buffer( &alpha );
-			float*   ap     = (float *)bli_obj_buffer( &a );
-			float*   bp     = (float *)bli_obj_buffer( &b );
-			float*   betap  = (float *)bli_obj_buffer( &beta );
-			float*   cp     = (float *)bli_obj_buffer( &c );
-		
-		        blis::gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, *alphap, ap,
-                		        lda, bp, ldb, *betap, cp, ldc);
-		
-#if 0
-			sgemm_( &f77_transa,
-			        &f77_transb,
-			        &mm,
-			        &nn,
-			        &kk,
-			        alphap,
-			        ap, &lda,
-			        bp, &ldb,
-			        betap,
-			        cp, &ldc );
-#endif
-		}
-		else if ( bli_is_double( dt ) )
-		{
-			int  M     = bli_obj_length( &c );
-			int  K     = bli_obj_width_after_trans( &a );
-			int  N     = bli_obj_width( &c );
-			int  lda    = bli_obj_col_stride( &a );
-			int  ldb    = bli_obj_col_stride( &b );
-			int  ldc    = bli_obj_col_stride( &c );
-			double*  alphap = (double*)bli_obj_buffer( &alpha );
-			double*  ap     = (double*)bli_obj_buffer( &a );
-			double*  bp     = (double*)bli_obj_buffer( &b );
-			double*  betap  = (double*)bli_obj_buffer( &beta );
-			double*  cp     = (double*)bli_obj_buffer( &c );
-
-		        blis::gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, *alphap, ap,
-                		        lda, bp, ldb, *betap, cp, ldc);
-#if 0
-			dgemm_( &f77_transa,
-			        &f77_transb,
-			        &mm,
-			        &nn,
-			        &kk,
-			        alphap,
-			        ap, &lda,
-			        bp, &ldb,
-			        betap,
-			        cp, &ldc );
-#endif
-		}
-		else if ( bli_is_scomplex( dt ) )
-		{
-			int  M     = bli_obj_length( &c );
-			int  K     = bli_obj_width_after_trans( &a );
-			int  N     = bli_obj_width( &c );
-			int  lda    = bli_obj_col_stride( &a );
-			int  ldb    = bli_obj_col_stride( &b );
-			int  ldc    = bli_obj_col_stride( &c );
-			std::complex<float>*  alphap = (std::complex<float>*)bli_obj_buffer( &alpha );
-			std::complex<float>*  ap     = (std::complex<float>*)bli_obj_buffer( &a );
-			std::complex<float>*  bp     = (std::complex<float>*)bli_obj_buffer( &b );
-			std::complex<float>*  betap  = (std::complex<float>*)bli_obj_buffer( &beta );
-			std::complex<float>*  cp     = (std::complex<float>*)bli_obj_buffer( &c );
-
-		        blis::gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, *alphap, ap,
-                		        lda, bp, ldb, *betap, cp, ldc);
-#if 0
-			cgemm_( &f77_transa,
-			        &f77_transb,
-			        &mm,
-			        &nn,
-			        &kk,
-			        alphap,
-			        ap, &lda,
-			        bp, &ldb,
-			        betap,
-			        cp, &ldc );
-#endif
-		}
-		else if ( bli_is_dcomplex( dt ) )
-		{
-			f77_int  M     = bli_obj_length( &c );
-			f77_int  K     = bli_obj_width_after_trans( &a );
-			f77_int  N     = bli_obj_width( &c );
-			f77_int  lda    = bli_obj_col_stride( &a );
-			f77_int  ldb    = bli_obj_col_stride( &b );
-			f77_int  ldc    = bli_obj_col_stride( &c );
-			std::complex<double>*  alphap = (std::complex<double>*)bli_obj_buffer( &alpha );
-			std::complex<double>*  ap     = (std::complex<double>*)bli_obj_buffer( &a );
-			std::complex<double>*  bp     = (std::complex<double>*)bli_obj_buffer( &b );
-			std::complex<double>*  betap  = (std::complex<double>*)bli_obj_buffer( &beta );
-			std::complex<double>*  cp     = (std::complex<double>*)bli_obj_buffer( &c );
-
-		        blis::gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, *alphap, ap,
-                		        lda, bp, ldb, *betap, cp, ldc);
-#if 0
-			zgemm_( &f77_transa,
-			        &f77_transb,
-			        &mm,
-			        &nn,
-			        &kk,
-			        alphap,
-			        ap, &lda,
-			        bp, &ldb,
-			        betap,
-			        cp, &ldc );
-#endif
-		}
-#endif
-
-#ifdef PRINT
-			bli_printm( "c after", &c, "%4.1f", "" );
-			exit(1);
-#endif
-
-
-			dtime_save = bli_clock_min_diff( dtime_save, dtime );
-		}
-
-		gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
-
-		if ( bli_is_complex( dt ) ) gflops *= 4.0;
-
-#ifdef BLIS
-		printf( "data_gemm_blis" );
-#else
-		//printf( "data_gemm_%s", BLAS );
-#endif
-
-
-#ifdef FILE_IN_OUT
-
-		if ( bli_is_double( dt ) ) {
-
-		  if (((m * n) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES/4))  || ((m  < (BLIS_SMALL_M_RECT_MATRIX_THRES/2) ) && (k < (BLIS_SMALL_K_RECT_MATRIX_THRES/2) )))
-		    gemm = 'S';      // small gemm
-		  else gemm = 'N';   // Normal blis gemm
-		  
-		}
-		else if (bli_is_float( dt )) {
-		  if (((m * n) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES))  || ((m  < BLIS_SMALL_M_RECT_MATRIX_THRES) && (k < BLIS_SMALL_K_RECT_MATRIX_THRES)))
-		    gemm = 'S';    // small gemm
-		  else gemm = 'N'; // normal blis gemm
-		}
-		
-		
-
-		printf("%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f \t %c\n", \
-			( unsigned long )m,
-		        ( unsigned long )k,
-		       ( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c,  gflops, gemm );
-		
-		
-		fprintf(fout, "%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f \t %c\n", \
-			( unsigned long )m,
-		        ( unsigned long )k,
-		        ( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c,  gflops, gemm );
-		fflush(fout);
-
-#else
-		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
-		        ( unsigned long )m,
-		        ( unsigned long )k,
-		        ( unsigned long )n, gflops );
-		
-
-#endif
-		bli_obj_free( &alpha );
-		bli_obj_free( &beta );
-
-		bli_obj_free( &a );
-		bli_obj_free( &b );
-		bli_obj_free( &c );
-		bli_obj_free( &c_save );
-	}
-
-	//bli_finalize();
-
-#ifdef FILE_IN_OUT
-	fclose(fin);
-	fclose(fout);
-#endif
-
-	return 0;
-}
-