diff --git a/cpp/blis.hh b/cpp/blis.hh
index 39dc25864..fc25ebff7 100644
--- a/cpp/blis.hh
+++ b/cpp/blis.hh
@@ -1,3820 +1,102 @@
-/******************************************************************************
-* Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-* THE SOFTWARE.
-*******************************************************************************/
-
-/*! @file blis.hh
- *  blis.hh defines all the BLAS CPP templated public interfaces
- *  */
 #ifndef BLIS_HH
 #define BLIS_HH
 
+#include "blis_util.hh"
 #include "cblas.hh"
+#include <limits>
 
 namespace blis {
 
-/*! \brief Construct plane rotation for arbitrary data types 
-
-  \b Purpose:	
-
-  ROTG  construct plane rotation that eliminates b for arbitrary data types, such that \n
-
-  [ z ] = [  c  s ] [ a ] \n
-  [ 0 ]   [ -s  c ] [ b ] \n
-  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
-  
-  \param[in, out] a
-  SINGLE/DOUBLE PRECISION REAL
-  On entry, scalar a. On exit, set to z.
- 
-  \param[in, out] b
-  SINGLE/DOUBLE PRECISION REAL
-  On entry, scalar b. On exit, set to s, 1/c, or 0.
- 
-  \param[out] c
-  Cosine of rotation; SINGLE/DOUBLE PRECISION REAL.
- 
-  \param[out] s
-  Sine of rotation; SINGLE/DOUBLE PRECISION REAL.
-  */
-template< typename T >
-void rotg(
-    T *a,
-    T *b,
-    T *c,
-    T *s )
-{
-    cblas_rotg(a, b, c, s);
-}
-
-/*! \brief Construct the modified givens transformation matrix for arbitrary data types 
-
-  \b Purpose:	
-
-  ROTMG construct modified (fast) plane rotation, H, that eliminates b, such that \n
-  [ z ] = H [ sqrt(d1)    0  ] [ a ] \n
-  [ 0 ]     [  0    sqrt(d2) ] [ b ] \n
-  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
-
-  \param[in, out] d1
-  SINGLE/DOUBLE PRECISION REAL
-  sqrt(d1) is scaling factor for vector x.
- 
-  \param[in, out] d2
-  SINGLE/DOUBLE PRECISION REAL
-  sqrt(d2) is scaling factor for vector y.
- 
-  \param[in, out] a
-  On entry, scalar a. On exit, set to z. SINGLE/DOUBLE PRECISION REAL.
- 
-  \param[in, out] b
-  On entry, scalar b. SINGLE/DOUBLE PRECISION REAL.  
-  
-  \param[out] param
-  SINGLE/DOUBLE PRECISION REAL array, dimension (5),giving parameters 
-  of modified plane rotation 
-  param(1)=DFLAG
-  param(2)=DH11
-  param(3)=DH21
-  param(4)=DH12
-  param(5)=DH22
-  */
-template< typename T >
-void rotmg(
-    T *d1,
-    T *d2,
-    T *a,
-    T  b,
-    T  param[5] )
-{
-    cblas_rotmg(d1, d2, a, b, param );
-}
-
-/*! \brief Apply plane rotation for arbitrary data types 
-
-  \b Purpose:	
-
-  ROT applies a plane rotation:  \n
-  [ x^T ]   [  c  s ] [ x^T ]  \n
-  [ y^T ] = [ -s  c ] [ y^T ]  \n
-  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
-
-  \param[in] n
-  Number of elements in x and y. n >= 0.
- 
-  \param[in, out] x
-  SINGLE/DOUBLE PRECISION REAL array
-  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
- 
-  \param[in] incx
-  incx is INTEGER
-  Stride between elements of x. incx must not be zero.
-  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
- 
-  \param[in, out] y
-  SINGLE/DOUBLE PRECISION REAL array
-  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
- 
-  \param[in] incy
-  incy is INTEGER
-  Stride between elements of y. incy must not be zero.
-  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
- 
-  \param[in] c
-  Cosine of rotation; SINGLE/DOUBLE PRECISION REAL.
- 
-  \param[in] s
-  Sine of rotation; SINGLE/DOUBLE PRECISION REAL.
-  */
-template< typename T >
-void rot(
-    int64_t n,
-    T *x, int64_t incx,
-    T *y, int64_t incy,
-    T c,
-    T s )
-{
-    cblas_rot( n, x, incx, y, incy, c, s );
-}
-
-/*! \brief Apply the modified givens transformation for arbitrary data types 
-
-  \b Purpose:	
-
-  ROTM applies modified (fast) plane rotation, H:  \n
-  [ x^T ] = H [ x^T ]  \n
-  [ y^T ]     [ y^T ]  \n
-  
-  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
-
-  \param[in] n
-  Number of elements in x and y. n >= 0.
- 
-  \param[in, out] x
-  SINGLE/DOUBLE PRECISION REAL array
-  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
- 
-  \param[in] incx
-  incx is INTEGER
-  Stride between elements of x. incx must not be zero.
-  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
- 
-  \param[in, out] y
-  SINGLE/DOUBLE PRECISION REAL array
-  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
- 
-  \param[in] incy
-  incy is INTEGER
-  Stride between elements of y. incy must not be zero.
-  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
- 
-  \param[in] P
-  SINGLE/DOUBLE PRECISION REAL array, dimension (5),giving parameters 
-  of modified plane rotation 
-  param(1)=DFLAG
-  param(2)=DH11
-  param(3)=DH21
-  param(4)=DH12
-  param(5)=DH22
-  */
-template< typename T >
-void rotm(
-    int64_t n,
-    T *x, int64_t incx,
-    T *y, int64_t incy,
-    const T *P)
-{
-    cblas_rotm( n, x, incx, y, incy, P );
-}
-
-/*! \brief Interchanges two vectors of arbitrary data types 
-
-  \b Purpose:	
-
-  SWAP interchanges two vectors uses unrolled loops for increments equal to 1.\n
-  x <=> y  \n
-  Data precisions supported include SINGLE/DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  \param[in] n
-  n is INTEGER
-  Number of elements in x and y. n >= 0.
- 
-  \param[in] x
-  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array.
-  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
- 
-  \param[in] incx
-  incx is INTEGER.
-  Stride between elements of x. incx must not be zero.
-  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
- 
-  \param[in, out] y
-  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array.
-  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
- 
-  \param[in] incy
-  incy is INTEGER.
-  Stride between elements of y. incy must not be zero.
-  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
-  */
-template< typename T >
-void swap(
-    int64_t n,
-    T *x, int64_t incx,
-    T *y, int64_t incy )
-{
-    cblas_swap( n, x, incx, y, incy );
-}
-
-/*! \brief Scales a vector of arbitrary data types by a constant.
-
-  \b Purpose:	
-
-  SCAL scales a vector by a constant, uses unrolled loops for increment equal to 1.\n
-  x = alpha * x \n
-  Data precisions of vector & constant include SINGLE/DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  \param[in] n
-  n is INTEGER
-  Number of elements in x. n >= 0.
- 
-  \param[in] alpha
-  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha. 
-  
-  \param[in ,out] x
-  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array
-  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
- 
-  \param[in] incx
-  incx is INTEGER
-  Stride between elements of x. incx must not be zero.
-  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
-  */
-template< typename TA, typename TB >
-void scal(
-    int64_t n,
-    TA alpha,
-    TB* x, int64_t incx )
-{
-    cblas_scal( n, alpha, x, incx );
-}
-
-/*! \brief Copies a vector x to a vector y for arbitrary data types 
-
-  \b Purpose:	
-
-  COPY copies a vector x to a vector y.\n
-  y = x  \n
-  Data precisions supported include SINGLE/DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  \param[in] n
-  n is INTEGER
-  Number of elements in x and y. n >= 0.
- 
-  \param[in] x
-  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array.
-  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
- 
-  \param[in] incx
-  incx is INTEGER.
-  Stride between elements of x. incx must not be zero.
-  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
- 
-  \param[out] y
-  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array.
-  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
- 
-  \param[in] incy
-  incy is INTEGER.
-  Stride between elements of y. incy must not be zero.
-  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
-  */
-template< typename T >
-void copy(
-    int64_t n,
-    T const *x, int64_t incx,
-    T       *y, int64_t incy )
-{
-    cblas_copy( n, x, incx, y, incy );
-}
-
-/*! \brief Performs addition of scaled vector for arbitrary data types 
-
-  \b Purpose:	
-
-  AXPY constant times a vector plus a vector.\n
-  y = alpha*x + y  \n
-  Data precisions supported include SINGLE/DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  \param[in] n
-  n is INTEGER
-  Number of elements in x and y. n >= 0.
- 
-  \param[in] alpha
-  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.\n
-  If alpha is zero, y is not updated.
-  
-  \param[in] x
-  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array.
-  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
- 
-  \param[in] incx
-  incx is INTEGER.
-  Stride between elements of x. incx must not be zero.
-  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
- 
-  \param[out] y
-  REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array.
-  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
- 
-  \param[in] incy
-  incy is INTEGER.
-  Stride between elements of y. incy must not be zero.
-  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
-  */
-template< typename T >
-void axpy(
-    int64_t n,
-    T alpha,
-    T const *x, int64_t incx,
-    T       *y, int64_t incy )
-{
-    cblas_axpy( n, alpha, x, incx, y, incy );
-}
-
-/*! \brief Performs the dot product of two vectors for arbitrary data types 
-
-  \b Purpose:	
-
-  DOT forms the dot product of two vectors
-  uses unrolled loops for increments equal to one.\n
-  dot = x^T * y \n
-  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
-
-  \param[in] n
-  n is INTEGER
-  Number of elements in x and y. n >= 0.
-  
-  \param[in] x
-  REAL/DOUBLE PRECISION array.
-  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
- 
-  \param[in] incx
-  incx is INTEGER.
-  Stride between elements of x. incx must not be zero.
-  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
- 
-  \param[in] y
-  REAL/DOUBLE PRECISION array.
-  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
- 
-  \param[in] incy
-  incy is INTEGER.
-  Stride between elements of y. incy must not be zero.
-  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
-  
-  \return Unconjugated dot product, x^T * y.
-  REAL/DOUBLE PRECISION 
-  */
-template< typename T, typename TR >
-TR dot(
-    int64_t n,
-    T const *x, int64_t incx,
-    T const *y, int64_t incy )
-{
-    return cblas_dot( n, x, incx, y, incy );
-}
-
-/*! \brief Performs the dot product of two complex vectors 
-
-  \b Purpose:	
-
-  DOTU forms the dot product of two complex vectors. \n
-  CDOTU = X^T * Y \n
-  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX
-
-  \param[in] n
-  n is INTEGER
-  Number of elements in x and y. n >= 0.
-  
-  \param[in] x
-  REAL/DOUBLE PRECISION COMPLEX array.
-  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
- 
-  \param[in] incx
-  incx is INTEGER.
-  Stride between elements of x. incx must not be zero.
-  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
- 
-  \param[in] y
-  REAL/DOUBLE PRECISION COMPLEX array.
-  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
- 
-  \param[in] incy
-  incy is INTEGER.
-  Stride between elements of y. incy must not be zero.
-  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
-  
-  \return Unconjugated dot product, x^T * y.
-  REAL/DOUBLE PRECISION COMPLEX
-  */
-template< typename T >
-T dotu(
-    int64_t n,
-    T const *x, int64_t incx,
-    T const *y, int64_t incy )
-{
-    return cblas_dotu( n, x, incx, y, incy );
-}
-
-/*! \brief Performs the dot product of two complex vectors 
-
-  \b Purpose:	
-
-  DOTC forms the dot product of two complex vectors. \n
-  CDOTU = X^H * Y \n
-  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX
-
-  \param[in] n
-  n is INTEGER
-  Number of elements in x and y. n >= 0.
-  
-  \param[in] x
-  REAL/DOUBLE PRECISION COMPLEX array.
-  The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
- 
-  \param[in] incx
-  incx is INTEGER.
-  Stride between elements of x. incx must not be zero.
-  If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
- 
-  \param[in] y
-  REAL/DOUBLE PRECISION COMPLEX array.
-  The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
- 
-  \param[in] incy
-  incy is INTEGER.
-  Stride between elements of y. incy must not be zero.
-  If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
-  
-  \return Conjugated dot product, x^H * y.
-  REAL/DOUBLE PRECISION COMPLEX
-  */
-template< typename T >
-T dotc(
-    int64_t n,
-    T const *x, int64_t incx,
-    T const *y, int64_t incy )
-{
-    return cblas_dotc( n, x, incx, y, incy );
-}
-
-/*! \brief Performs inner product of two vectors with extended precision accumulation
-
-  \b Purpose:	
-
-  DOTC forms the inner product of two vectors with extended precision accumulation. \n
-  Data precisions supported include SINGLE PRECISION REAL
-
-  \param[in] n
-  n is INTEGER\n
-  number of elements in input vector(s)
-  
-  \param[in] alpha
-  alpha is REAL\n
-  single precision scalar to be added to inner product
-  
-  \param[in] x
-  x is REAL array, dimension ( 1 + ( n - 1 )*abs( incx ) )\n
-  single precision vector with n elements
-  
-  \param[in] incx
-  incx is INTEGER\n
-  storage spacing between elements of x
-  
-  \param[in] y
-  y is REAL array, dimension ( 1 + ( n - 1 )*abs( incx ) )\n
-  single precision vector with n elements
-  
-  \param[in] incy
-  incy is INTEGER\n
-  storage spacing between elements of y
-  
-  \return S.P. result with dot product accumulated in D.P.
-  */
-template< typename T >
-T sdsdot(
-    int64_t n,
-    T alpha,
-    T const *x, int64_t incx,
-    T const *y, int64_t incy )
-{
-    return cblas_sdsdot( n, alpha, x, incx, y, incy );
-}
-
-/*! \brief return 2-norm of vectors of arbitrary data types
-
-  \b Purpose:	
-
-  NRM2 returns the euclidean norm of a vector via the function name, so that
-  SNRM2 := sqrt( x'*x ). \n
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  \param[in] n
-  n is INTEGER\n
-  number of elements in input vector(s)
-  
-  \param[in] x
-  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, 
-  dimension ( 1 + ( n - 1 )*abs( incx ) )\n
-  single precision vector with n elements
-  
-  \param[in] incx
-  incx is INTEGER\n
-  storage spacing between elements of x
-  
-  \return 2-norm of vector
-  REAL SINGLE/DOUBLE PRECISION
-  */
-template< typename T >
-real_type<T>
-nrm2(
-    int64_t n,
-    T const * x, int64_t incx )
-{
-    return cblas_nrm2( n, x, incx );
-}
-
-/*! \brief return 1-norm of vector of arbitrary data types
-
-  \b Purpose:	
-
-  ASUM takes the sum of the absolute values, uses unrolled loops for 
-  increment equal to one. \n
-  ASUM := || Re(x) ||_1 + || Im(x) ||_1. \n
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  \param[in] n
-  n is INTEGER\n
-  number of elements in input vector(s)
-  
-  \param[in] x
-  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, 
-  dimension ( 1 + ( n - 1 )*abs( incx ) )\n
-  single precision vector with n elements
-  
-  \param[in] incx
-  incx is INTEGER\n
-  storage spacing between elements of x
-  
-  \return 1-norm of vector
-  REAL SINGLE/DOUBLE PRECISION
-  */
-template< typename T >
-real_type<T>
-asum(
-    int64_t n,
-    T const *x, int64_t incx )
-{
-    return cblas_asum( n, x, incx );
-}
-
-/*! \brief Return Index of infinity-norm of vectors of arbitrary types.
-
-  \b Purpose:	
-
-  IAMAX finds the index of the first element having maximum |Re(.)| + |Im(.)|. \n
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  \param[in] n
-  n is INTEGER\n
-  number of elements in input vector(s)
-  
-  \param[in] x
-  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, 
-  dimension ( 1 + ( n - 1 )*abs( incx ) ) \n
-  single precision vector with n elements
-  
-  \param[in] incx
-  incx is INTEGER\n
-  storage spacing between elements of x
-  
-  \return Index of infinity-norm of vector
-  INTEGER
-  */
-template< typename T >
-int64_t iamax(
-    int64_t n,
-    T const *x, int64_t incx )
-{
-    return cblas_iamax( n, x, incx );
-}
-
-/*! \brief Solve General matrix-vector multiply for arbitrary data types 
-
-  \b Purpose:	
-
-  GEMV  performs one of the matrix-vector operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
- 
-     y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,
- 
-  where alpha and beta are scalars, x and y are vectors and A is an
-  m by n matrix.
-
-  \param[in] layout
-  layout is enum CBLAS_LAYOUT
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the operation to be used as follows: \n
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,y := alpha*A*x + beta*y. \n
-  trans = CBLAS_TRANSPOSE::CblasTrans,  y := alpha*A**T*x + beta*y. \n
-  trans = CBLAS_TRANSPOSE::CblasConjTrans,  y := alpha*A**T*x + beta*y.
-
-  \param[in] m
-  m is INTEGER
-  On entry,  m specifies the number of rows of the matrix A.
-  m must be at least zero.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the number of columns of the matrix A.
-  n must be at least zero.
-
-  \param[in] alpha
-  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
-  m-by-n , stored in an lda-by-n array [RowMajor: m-by-lda].
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda >= max(1, m) [RowMajor: lda >= max(1, n)].
-
-  \param[in] x
-  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : \n
-  If trans = CblasNoTrans:
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Otherwise:
-  at least ( 1 + ( m - 1 )*abs( incx ) ).
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] beta
-  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, beta specifies the scalar alpha.When  beta  is
-  supplied as zero then y need not be set on input.
-
-  \param[in,out] y
-  y is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : \n
-  If trans = CblasNoTrans:
-  at least ( 1 + ( m - 1 )*abs( incy ) ). \n
-  Otherwise:
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry with beta non-zero, the incremented array y
-  must contain the vector y. On exit, y is overwritten by the
-  updated vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-  */
-template< typename T >
-void gemv(
-    CBLAS_ORDER layout,
-    CBLAS_TRANSPOSE trans,
-    int64_t m, int64_t n,
-    T alpha,
-    T const *A, int64_t lda,
-    T const *x, int64_t incx,
-    T beta,
-    T *y, int64_t incy )
-{
-    cblas_gemv(layout, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-/*! \brief Solve General matrix-vector multiply for arbitrary data types 
-
-  \b Purpose:	
-
-  GBMV  performs one of the matrix-vector operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
- 
-     y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or
-	 
-	 y := alpha*A**H*x + beta*y,
- 
-  where alpha and beta are scalars, x and y are vectors and A is an
-  m by n matrix with kl sub-diagonals and ku super-diagonals.
-
-  \param[in] layout
-  layout is enum CBLAS_LAYOUT
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the operation to be used as follows: \n
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,y := alpha*A*x + beta*y. \n
-  trans = CBLAS_TRANSPOSE::CblasTrans,  y := alpha*A**T*x + beta*y. \n
-  trans = CBLAS_TRANSPOSE::CblasConjTrans,  y := alpha*A**H*x + beta*y.
-
-  \param[in] m
-  m is INTEGER
-  On entry,  m specifies the number of rows of the matrix A.
-  m must be at least zero.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the number of columns of the matrix A.
-  n must be at least zero.
-
-  \param[in] kl
-  kl is INTEGER
-  On entry,  kl specifies the number of sub-diagonals of the matrix A.
-  kl must be at least zero.
-
-  \param[in] ku
-  ku is INTEGER
-  On entry,  ku specifies the number of super-diagonals of the matrix A.
-  ku must be at least zero.
-
-  \param[in] alpha
-  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension lda-by-n.
-  Before entry, the leading ( kl + ku + 1 ) by n part of the
-  array A must contain the matrix of coefficients, supplied
-  column by column, with the leading diagonal of the matrix in
-  row ( ku + 1 ) of the array, the first super-diagonal
-  starting at position 2 in row ku, the first sub-diagonal
-  starting at position 1 in row ( ku + 2 ), and so on.
-  Elements in the array A that do not correspond to elements
-  in the band matrix (such as the top left ku by ku triangle)
-  are not referenced.
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda >= ( kl + ku + 1 )
-
-  \param[in] x
-  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : \n
-  If trans = CblasNoTrans:
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Otherwise:
-  at least ( 1 + ( m - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the
-  vector x.  
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] beta
-  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, beta specifies the scalar alpha.When  beta  is
-  supplied as zero then y need not be set on input.
-
-  \param[in,out] y
-  y is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension : \n
-  If trans = CblasNoTrans:
-  at least ( 1 + ( m - 1 )*abs( incy ) ). \n
-  Otherwise:
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry with beta non-zero, the incremented array y
-  must contain the vector y. On exit, y is overwritten by the
-  updated vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-  */
-template< typename T >
-void gbmv(
-    CBLAS_ORDER layout,
-    CBLAS_TRANSPOSE trans,
-    int64_t m, int64_t n,
-    int64_t kl, int64_t ku,
-    T alpha,
-    T const *A, int64_t lda,
-    T const *x, int64_t incx,
-    T beta,
-    T *y, int64_t incy )
-{
-    cblas_gbmv(layout, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-/*! \brief Solves Hermitian matrix-vector multiply for arbitrary data types 
-
-  \b Purpose:	
-
-  HEMV  performs one of the matrix-vector operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION COMPLEX, 
-  DOUBLE PRECISION COMPLEX(COMPLEX*16)
- 
-     y := alpha*A*x + beta*y,
- 
-  where alpha and beta are scalars, x and y are  n element vectors and 
-  A is an n by n hermitian matrix.
-
-  \param[in] layout
-  layout is enum CBLAS_LAYOUT
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies specifies whether the matrix A is an upper or
-  lower triangular matrix as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.n must be at least zero.
-
-  \param[in] alpha
-  alpha is COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is COMPLEX/COMPLEX*16 array,dimension lda-by-n. \n
-  Before entry with  UPLO = CblasUpper, the leading n by n
-  upper triangular part of the array A must contain the upper
-  triangular part of the hermitian matrix and the strictly
-  lower triangular part of A is not referenced.
-  Before entry with UPLO = CblasLower, the leading n by n
-  lower triangular part of the array A must contain the lower
-  triangular part of the hermitian matrix and the strictly
-  upper triangular part of A is not referenced. \n
-  Note that the imaginary parts of the diagonal elements need
-  not be set and are assumed to be zero.
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least max( 1, n ).
-
-  \param[in] x
-  x is COMPLEX/COMPLEX*16 array,dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the
-  vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] beta
-  beta is COMPLEX/COMPLEX*16
-  On entry, beta specifies the scalar alpha.When  beta  is
-  supplied as zero then y need not be set on input.
-
-  \param[in,out] y
-  y is COMPLEX/COMPLEX*16 array, dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry with beta non-zero, the incremented array y
-  must contain the vector y. On exit, y is overwritten by the
-  updated vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-  */
-template< typename T >
-void hemv(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n,
-    T alpha,
-    T const *A, int64_t lda,
-    T const *x, int64_t incx,
-    T beta,
-    T *y, int64_t incy )
-{
-    cblas_hemv(layout, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-/*! \brief Solves Hermitian matrix-vector multiply for arbitrary data types 
-
-  \b Purpose:	
-
-  HBMV  performs one of the matrix-vector operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION COMPLEX, 
-  DOUBLE PRECISION COMPLEX(COMPLEX*16)
- 
-     y := alpha*A*x + beta*y,
- 
-  where alpha and beta are scalars, x and y are  n element vectors and 
-  A is an n by n hermitian matrix with k super-diagonals.
-
-  \param[in] layout
-  layout is enum CBLAS_LAYOUT
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies specifies whether the the upper or lower triangular
-  part of the band matrix A is being supplied as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.n must be at least zero.
-
-  \param[in] k
-  k is INTEGER
-  On entry,  k specifies the number of super-diagonals of the matrix A.
-  k must be at least zero.
-
-  \param[in] alpha
-  alpha is COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is COMPLEX/COMPLEX*16 array,dimension lda-by-n. \n
-  Before entry with UPLO = CblasUpper, the leading ( k + 1 )
-  by n part of the array A must contain the upper triangular
-  band part of the hermitian matrix, supplied column by
-  column, with the leading diagonal of the matrix in row
-  ( k + 1 ) of the array, the first super-diagonal starting at
-  position 2 in row k, and so on. The top left k by k triangle
-  of the array A is not referenced. \n
-  Before entry with UPLO = CblasLower, the leading ( k + 1 )
-  by n part of the array A must contain the lower triangular
-  band part of the hermitian matrix, supplied column by
-  column, with the leading diagonal of the matrix in row 1 of
-  the array, the first sub-diagonal starting at position 1 in
-  row 2, and so on. The bottom right k by k triangle of the
-  array A is not referenced. \n
-  Note that the imaginary parts of the diagonal elements need
-  not be set and are assumed to be zero.
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least ( k + 1 ).
-
-  \param[in] x
-  x is COMPLEX/COMPLEX*16 array,dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the
-  vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] beta
-  beta is COMPLEX/COMPLEX*16
-  On entry, beta specifies the scalar alpha.
-
-  \param[in,out] y
-  y is COMPLEX/COMPLEX*16 array, dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry with beta non-zero, the incremented array y
-  must contain the vector y. On exit, y is overwritten by the
-  updated vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-  */
-template< typename T >
-void hbmv(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n, int64_t k,
-    T alpha,
-    T const *A, int64_t lda,
-    T const *x, int64_t incx,
-    T beta,
-    T *y, int64_t incy )
-{
-    cblas_hbmv(layout, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-/*! \brief Solves Hermitian matrix-vector multiply for arbitrary data types 
-
-  \b Purpose:	
-
-  HPMV  performs one of the matrix-vector operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION COMPLEX, 
-  DOUBLE PRECISION COMPLEX(COMPLEX*16)
- 
-     y := alpha*A*x + beta*y,
- 
-  where alpha and beta are scalars, x and y are  n element vectors and 
-  A is an n by n hermitian matrix, supplied in packed form.
-
-  \param[in] layout
-  layout is enum CBLAS_LAYOUT
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies specifies whether the the upper or lower triangular
-  part of the band matrix A is supplied in the packed array Ap as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.n must be at least zero.
-
-  \param[in] alpha
-  alpha is COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] Ap
-  Ap is COMPLEX/COMPLEX*16 array,dimension atleast ( ( n*( n + 1 ) )/2 ). \n
-  Before entry with UPLO = CblasUpper, the array Ap must
-  contain the upper triangular part of the hermitian matrix
-  packed sequentially, column by column, so that Ap( 1 )
-  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 1, 2 )
-  and a( 2, 2 ) respectively, and so on. \n
-  Before entry with UPLO = CblasLower, the array Ap must
-  contain the lower triangular part of the hermitian matrix
-  packed sequentially, column by column, so that Ap( 1 )
-  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 2, 1 )
-  and a( 3, 1 ) respectively, and so on. \n
-  Note that the imaginary parts of the diagonal elements need
-  not be set and are assumed to be zero.
-
-  \param[in] x
-  x is COMPLEX/COMPLEX*16 array,dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the
-  vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] beta
-  beta is COMPLEX/COMPLEX*16
-  On entry, beta specifies the scalar alpha.When beta is
-  supplied as zero then y need not be set on input.
-
-  \param[in,out] y
-  y is COMPLEX/COMPLEX*16 array, dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry with beta non-zero, the incremented array y
-  must contain the vector y. On exit, y is overwritten by the
-  updated vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-  */
-template< typename T >
-void hpmv(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n,
-    T alpha,
-    T const *Ap,
-    T const *x, int64_t incx,
-    T beta,
-    T *y, int64_t incy )
-{
-    cblas_hpmv(layout, uplo, n, alpha, Ap, x, incx, beta, y, incy);
-}
-
-/*! \brief Solves Symmetric matrix-vector multiply for arbitrary data types 
-
-  \b Purpose:	
-
-  SYMV  performs one of the matrix-vector operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL
- 
-     y := alpha*A*x + beta*y,
- 
-  where alpha and beta are scalars, x and y are  n element vectors and 
-  A is an n by n symmetric matrix.
-
-  \param[in] layout
-  layout is enum CBLAS_LAYOUT
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies specifies whether the matrix A is an upper or
-  lower triangular matrix as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.n must be at least zero.
-
-  \param[in] alpha
-  alpha is SINGLE/DOUBLE PRECISION REAL
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is SINGLE/DOUBLE PRECISION REAL array,dimension lda-by-n. \n
-  Before entry with  UPLO = CblasUpper, the leading n by n
-  upper triangular part of the array A must contain the upper
-  triangular part of the symmetric matrix and the strictly
-  lower triangular part of A is not referenced.
-  Before entry with UPLO = CblasLower, the leading n by n
-  lower triangular part of the array A must contain the lower
-  triangular part of the symmetric matrix and the strictly
-  upper triangular part of A is not referenced. \n
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least max( 1, n ).
-
-  \param[in] x
-  x is SINGLE/DOUBLE PRECISION REAL array,dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the
-  vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] beta
-  beta is SINGLE/DOUBLE PRECISION REAL
-  On entry, beta specifies the scalar alpha.When  beta  is
-  supplied as zero then y need not be set on input.
-
-  \param[in,out] y
-  y is SINGLE/DOUBLE PRECISION REAL array, dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry with beta non-zero, the incremented array y
-  must contain the vector y. On exit, y is overwritten by the
-  updated vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-  */
-template< typename T >
-void symv(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n,
-    T alpha,
-    T const *A, int64_t lda,
-    T const *x, int64_t incx,
-    T beta,
-    T *y, int64_t incy )
-{
-    cblas_symv(layout, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-/*! \brief Solves symmetric matrix-vector multiply for arbitrary data types 
-
-  \b Purpose:	
-
-  SBMV  performs one of the matrix-vector operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL
- 
-     y := alpha*A*x + beta*y,
- 
-  where alpha and beta are scalars, x and y are  n element vectors and 
-  A is an n by n symmetric matrix with k super-diagonals.
-
-  \param[in] layout
-  layout is enum CBLAS_LAYOUT
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies specifies whether the the upper or lower triangular
-  part of the band matrix A is being supplied as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.n must be at least zero.
-
-  \param[in] k
-  k is INTEGER
-  On entry,  k specifies the number of super-diagonals of the matrix A.
-  k must be at least zero.
-
-  \param[in] alpha
-  alpha is SINGLE/DOUBLE PRECISION REAL
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is SINGLE/DOUBLE PRECISION REAL array,dimension lda-by-n. \n
-  Before entry with UPLO = CblasUpper, the leading ( k + 1 )
-  by n part of the array A must contain the upper triangular
-  band part of the symmetric matrix, supplied column by
-  column, with the leading diagonal of the matrix in row
-  ( k + 1 ) of the array, the first super-diagonal starting at
-  position 2 in row k, and so on. The top left k by k triangle
-  of the array A is not referenced. \n
-  Before entry with UPLO = CblasLower, the leading ( k + 1 )
-  by n part of the array A must contain the lower triangular
-  band part of the symmetric matrix, supplied column by
-  column, with the leading diagonal of the matrix in row 1 of
-  the array, the first sub-diagonal starting at position 1 in
-  row 2, and so on. The bottom right k by k triangle of the
-  array A is not referenced. \n
-  Note that the imaginary parts of the diagonal elements need
-  not be set and are assumed to be zero.
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least ( k + 1 ).
-
-  \param[in] x
-  x is SINGLE/DOUBLE PRECISION REAL array,dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the
-  vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] beta
-  beta is SINGLE/DOUBLE PRECISION REAL
-  On entry, beta specifies the scalar alpha.
-
-  \param[in,out] y
-  y is SINGLE/DOUBLE PRECISION REAL array, dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry with beta non-zero, the incremented array y
-  must contain the vector y. On exit, y is overwritten by the
-  updated vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-  */
-template< typename T >
-void sbmv(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n, int64_t k,
-    T alpha,
-    T const *A, int64_t lda,
-    T const *x, int64_t incx,
-    T beta,
-    T *y, int64_t incy )
-{
-    cblas_sbmv(layout, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-/*! \brief Solves symmetric matrix-vector multiply for arbitrary data types 
-
-  \b Purpose:	
-
-  SPMV  performs one of the matrix-vector operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL
- 
-     y := alpha*A*x + beta*y,
- 
-  where alpha and beta are scalars, x and y are  n element vectors and 
-  A is an n by n symmetric matrix, supplied in packed form.
-
-  \param[in] layout
-  layout is enum CBLAS_LAYOUT
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies specifies whether the the upper or lower triangular
-  part of the band matrix A is supplied in the packed array Ap as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.n must be at least zero.
-
-  \param[in] alpha
-  alpha is SINGLE/DOUBLE PRECISION REAL
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] Ap
-  Ap is SINGLE/DOUBLE PRECISION REAL array,dimension atleast ( ( n*( n + 1 ) )/2 ). \n
-  Before entry with UPLO = CblasUpper, the array Ap must
-  contain the upper triangular part of the symmetric matrix
-  packed sequentially, column by column, so that Ap( 1 )
-  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 1, 2 )
-  and a( 2, 2 ) respectively, and so on. \n
-  Before entry with UPLO = CblasLower, the array Ap must
-  contain the lower triangular part of the symmetric matrix
-  packed sequentially, column by column, so that Ap( 1 )
-  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 2, 1 )
-  and a( 3, 1 ) respectively, and so on. \n
-  Note that the imaginary parts of the diagonal elements need
-  not be set and are assumed to be zero.
-
-  \param[in] x
-  x is SINGLE/DOUBLE PRECISION REAL array,dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the
-  vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] beta
-  beta is SINGLE/DOUBLE PRECISION REAL
-  On entry, beta specifies the scalar alpha.When beta is
-  supplied as zero then y need not be set on input.
-
-  \param[in,out] y
-  y is SINGLE/DOUBLE PRECISION REAL array, dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry with beta non-zero, the incremented array y
-  must contain the vector y. On exit, y is overwritten by the
-  updated vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-  */
-template< typename T >
-void spmv(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n,
-    T alpha,
-    T const *Ap,
-    T const *x, int64_t incx,
-    T beta,
-    T *y, int64_t incy )
-{
-    cblas_spmv(layout, uplo, n, alpha, Ap, x, incx, beta, y, incy);
-}
-
-/*! \brief Solve the one of the matrix-vector operations for arbitrary data types 
-
-  \b Purpose:	
-
-  TRMV  performs  one of the matrix-vector operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  x := A*x,   or   x := A**T*x,
-
-  where x is an n element vector and  A is an n by n unit, or non-unit,
-  upper or lower triangular matrix.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies specifies whether the matrix A is an upper or
-  lower triangular matrix as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-  
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the operation to be performed as follows:
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,  x := A*x. \n
-  trans = CBLAS_TRANSPOSE::CblasTrans,  x := A**T*x. \n
-  trans = CBLAS_TRANSPOSE::CblasConjTrans,  x := A**T*x.
-  
-  \param[in] diag
-  diag is enum CBLAS_DIAG
-  diag specifies specifies whether or not A is unit triangular
-  as follows: \n
-  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.\n
-  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
-                                 triangular.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.n must be at least zero.
-
-  \param[in] A
-  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension ( lda, n )\n
-  Before entry with  UPLO = CblasUpper, the leading n by n
-  upper triangular part of the array A must contain the upper
-  triangular matrix and the strictly lower triangular part of
-  A is not referenced. \n
-  Before entry with UPLO = CblasLower, the leading n by n
-  lower triangular part of the array A must contain the lower
-  triangular matrix and the strictly upper triangular part of
-  A is not referenced. \n
-  Note that when  DIAG = CblasUnit, the diagonal elements of
-  A are not referenced either, but are assumed to be unity.  
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least max( 1, n ).
-
-  \param[in, out] x
-  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the
-  vector x.On exit, x is overwritten with the transformed vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-  */
-template< typename T >
-void trmv(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    CBLAS_TRANSPOSE trans,
-    CBLAS_DIAG diag,
-    int64_t n,
-    T const *A, int64_t lda,
-    T       *x, int64_t incx )
-{
-    cblas_trmv(layout, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-/*! \brief Solve the one of the matrix-vector operations for arbitrary data types 
-
-  \b Purpose:	
-
-  TBMV  performs  one of the matrix-vector operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  x := A*x,   or   x := A**T*x,
-
-  where x is an n element vector and  A is an n by n unit, or non-unit,
-  upper or lower triangular band matrix, with ( k + 1 ) diagonals.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies specifies whether the matrix A is an upper or
-  lower triangular matrix as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-  
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the operation to be performed as follows:
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,  x := A*x. \n
-  trans = CBLAS_TRANSPOSE::CblasTrans,  x := A**T*x. \n
-  trans = CBLAS_TRANSPOSE::CblasConjTrans,  x := A**T*x.
-  
-  \param[in] diag
-  diag is enum CBLAS_DIAG
-  diag specifies specifies whether or not A is unit triangular
-  as follows: \n
-  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.\n
-  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
-                                 triangular.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.n must be at least zero.
-
-  \param[in] k
-  k is INTEGER
-  On entry with UPLO = CblasUpper, k specifies the number of
-  super-diagonals of the matrix A.
-  On entry with UPLO = CblasLower, k specifies the number of
-  sub-diagonals of the matrix A.
-  k must at least zero.
- 
-  \param[in] A
-  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension ( lda, n )\n
-  Before entry with UPLO = CblasUpper, the leading ( k + 1 )
-  by n part of the array A must contain the upper triangular
-  band part of the matrix of coefficients, supplied column by
-  column, with the leading diagonal of the matrix in row
-  ( k + 1 ) of the array, the first super-diagonal starting at
-  position 2 in row k, and so on. The top left k by k triangle
-  of the array A is not referenced. \n 
-  Before entry with UPLO = CblasLower, the leading ( k + 1 )
-  by n part of the array A must contain the lower triangular
-  band part of the matrix of coefficients, supplied column by
-  column, with the leading diagonal of the matrix in row 1 of
-  the array, the first sub-diagonal starting at position 1 in
-  row 2, and so on. The bottom right k by k triangle of the
-  array A is not referenced. \n
-  Note that when DIAG = CblasUnit the elements of the array A
-  corresponding to the diagonal elements of the matrix are not
-  referenced, but are assumed to be unity.  
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least max( 1, ( k + 1 ) ).
-
-  \param[in, out] x
-  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the
-  vector x.On exit, x is overwritten with the transformed vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-  */
-template< typename T >
-void tbmv(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    CBLAS_TRANSPOSE trans,
-    CBLAS_DIAG diag,
-    int64_t n, int64_t k,
-    T const *A, int64_t lda,
-    T       *x, int64_t incx )
-{
-    cblas_tbmv(layout, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-
-/*! \brief Solve the one of the matrix-vector operations for arbitrary data types 
-
-  \b Purpose:	
-
-  TPMV  performs  one of the matrix-vector operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  x := A*x,   or   x := A**T*x,
-
-  where x is an n element vector and  A is an n by n unit, or non-unit,
-  upper or lower triangular matrix, supplied in packed form.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies specifies whether the matrix A is an upper or
-  lower triangular matrix as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-  
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the operation to be performed as follows:
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,  x := A*x. \n
-  trans = CBLAS_TRANSPOSE::CblasTrans,  x := A**T*x. \n
-  trans = CBLAS_TRANSPOSE::CblasConjTrans,  x := A**T*x.
-  
-  \param[in] diag
-  diag is enum CBLAS_DIAG
-  diag specifies specifies whether or not A is unit triangular
-  as follows: \n
-  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.\n
-  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
-                                 triangular.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.n must be at least zero.
- 
- \param[in] Ap
-  Ap is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension 
-  ( ( n*( n + 1 ) )/2 ). \n
-  Before entry with  UPLO = CblasUpper, the array Ap must
-  contain the upper triangular matrix packed sequentially,
-  column by column, so that Ap( 1 ) contains a( 1, 1 ),
-  Ap( 2 ) and Ap( 3 ) contain a( 1, 2 ) and a( 2, 2 )
-  respectively, and so on. \n
-  Before entry with UPLO = CblasLower, the array Ap must
-  contain the lower triangular matrix packed sequentially,
-  column by column, so that Ap( 1 ) contains a( 1, 1 ),
-  Ap( 2 ) and Ap( 3 ) contain a( 2, 1 ) and a( 3, 1 )
-  respectively, and so on. \n
-  Note that when  DIAG = CblasUnit, the diagonal elements of
-  A are not referenced, but are assumed to be unity.
-
-  \param[in, out] x
-  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : \n
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the
-  vector x.On exit, x is overwritten with the transformed vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-  */
-template< typename T >
-void tpmv(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    CBLAS_TRANSPOSE trans,
-    CBLAS_DIAG diag,
-    int64_t n,
-    T const *Ap,
-    T       *x, int64_t incx )
-{
-    cblas_tpmv(layout, uplo, trans, diag, n, Ap, x, incx);
-}
-
-/*! \brief Solve the one of the triangular matrix-vector equation for arbitrary data types 
-
-  \b Purpose:	
-
-  TRSV  solves one of the systems of equations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  A*x = b,   or   A**T*x = b,
-
-  where b and x are n element vectors and A is an n by n unit, or
-  non-unit, upper or lower triangular matrix
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies specifies whether the matrix A is an upper or
-  lower triangular matrix as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-  
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the operation to be performed as follows:
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,  A*x = b. \n
-  trans = CBLAS_TRANSPOSE::CblasTrans,  A**T*x = b. \n
-  trans = CBLAS_TRANSPOSE::CblasConjTrans,  A**T*x = b.
-  
-  \param[in] diag
-  diag is enum CBLAS_DIAG
-  diag specifies specifies whether or not A is unit triangular
-  as follows: \n
-  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.\n
-  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
-                                 triangular.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.n must be at least zero.
-
-  \param[in] A
-  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension ( lda, n )\n
-  Before entry with  UPLO = CblasUpper, the leading n by n
-  upper triangular part of the array A must contain the upper
-  triangular matrix and the strictly lower triangular part of
-  A is not referenced. \n
-  Before entry with UPLO = CblasLower, the leading n by n
-  lower triangular part of the array A must contain the lower
-  triangular matrix and the strictly upper triangular part of
-  A is not referenced. \n
-  Note that when  DIAG = CblasUnit, the diagonal elements of
-  A are not referenced either, but are assumed to be unity.  
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least max( 1, n ).
-
-  \param[in, out] x
-  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the
-  element right-hand side vector b.On exit, x is overwritten
-  with the transformed vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-  */
-template< typename T >
-void trsv(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    CBLAS_TRANSPOSE trans,
-    CBLAS_DIAG diag,
-    int64_t n,
-    T const *A, int64_t lda,
-    T       *x, int64_t incx )
-{
-    cblas_trsv(layout, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-/*! \brief Solve the one of the triangular matrix-vector equation for arbitrary data types 
-
-  \b Purpose:	
-
-  TBSV  solves one of the systems of equations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  A*x = b,   or   A**T*x = b,
-
-  where b and x are n element vectors and A is an n by n unit, or
-  non-unit, upper or lower triangular band matrix, with ( k + 1 )
-  diagonals.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies specifies whether the matrix A is an upper or
-  lower triangular matrix as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-  
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the operation to be performed as follows:
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,  A*x = b. \n
-  trans = CBLAS_TRANSPOSE::CblasTrans,  A**T*x = b. \n
-  trans = CBLAS_TRANSPOSE::CblasConjTrans,  A**T*x = b.
-  
-  \param[in] diag
-  diag is enum CBLAS_DIAG
-  diag specifies specifies whether or not A is unit triangular
-  as follows: \n
-  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.\n
-  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
-                                 triangular.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.n must be at least zero.
-
-  \param[in] k
-  k is INTEGER
-  On entry with UPLO = CblasUpper, k specifies the number of
-  super-diagonals of the matrix A.
-  On entry with UPLO = CblasLower, k specifies the number of
-  sub-diagonals of the matrix A.
-  k must at least zero.
-
-  \param[in] A
-  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension ( lda, n )\n
-  Before entry with  UPLO = CblasUpper, the leading ( k + 1 )
-  by n part of the array A must contain the upper triangular
-  band part of the matrix of coefficients, supplied column by
-  column, with the leading diagonal of the matrix in row
-  ( k + 1 ) of the array, the first super-diagonal starting at
-  position 2 in row k, and so on. The top left k by k triangle
-  of the array A is not referenced. \n
-  Before entry with UPLO = CblasLower, the leading ( k + 1 )
-  by n part of the array A must contain the lower triangular
-  band part of the matrix of coefficients, supplied column by
-  column, with the leading diagonal of the matrix in row 1 of
-  the array, the first sub-diagonal starting at position 1 in
-  row 2, and so on. The bottom right k by k triangle of the
-  array A is not referenced. \n
-  Note that when  DIAG = CblasUnit, the elements of the array A
-  corresponding to the diagonal elements of the matrix are not
-  referenced, but are assumed to be unity.  
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least max( 1, k+1 ).
-
-  \param[in, out] x
-  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the
-  element right-hand side vector b.On exit, x is overwritten
-  with the solution vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-  */
-template< typename T >
-void tbsv(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    CBLAS_TRANSPOSE trans,
-    CBLAS_DIAG diag,
-    int64_t n, int64_t k,
-    T const *A, int64_t lda,
-    T       *x, int64_t incx )
-{
-    cblas_tbsv(layout, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-
-/*! \brief Solve the one of the triangular matrix-vector equation for arbitrary data types 
-
-  \b Purpose:	
-
-  TPSV  solves one of the systems of equations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  A*x = b,   or   A**T*x = b,
-
-  where b and x are n element vectors and A is an n by n unit, or
-  non-unit, upper or lower triangular band matrix, supplied in packed form.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies specifies whether the matrix A is an upper or
-  lower triangular matrix as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-  
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the operation to be performed as follows:
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,  A*x = b. \n
-  trans = CBLAS_TRANSPOSE::CblasTrans,  A**T*x = b. \n
-  trans = CBLAS_TRANSPOSE::CblasConjTrans,  A**T*x = b.
-  
-  \param[in] diag
-  diag is enum CBLAS_DIAG
-  diag specifies specifies whether or not A is unit triangular
-  as follows: \n
-  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.\n
-  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
-                                 triangular.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.n must be at least zero.
-
- \param[in] Ap
-  Ap is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension 
-  ( ( n*( n + 1 ) )/2 ). \n
-  Before entry with  UPLO = CblasUpper, the array Ap must
-  contain the upper triangular matrix packed sequentially,
-  column by column, so that Ap( 1 ) contains a( 1, 1 ),
-  Ap( 2 ) and Ap( 3 ) contain a( 1, 2 ) and a( 2, 2 )
-  respectively, and so on. \n
-  Before entry with UPLO = CblasLower, the array Ap must
-  contain the lower triangular matrix packed sequentially,
-  column by column, so that Ap( 1 ) contains a( 1, 1 ),
-  Ap( 2 ) and Ap( 3 ) contain a( 2, 1 ) and a( 3, 1 )
-  respectively, and so on. \n
-  Note that when  DIAG = CblasUnit, the diagonal elements of
-  A are not referenced, but are assumed to be unity.
-
-  \param[in, out] x
-  x is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the
-  element right-hand side vector b.On exit, x is overwritten
-  with the solution vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-  */
-template< typename T >
-void tpsv(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    CBLAS_TRANSPOSE trans,
-    CBLAS_DIAG diag,
-    int64_t n,
-    T const *Ap,
-    T       *x, int64_t incx )
-{
-    cblas_tpsv(layout, uplo, trans, diag, n, Ap, x, incx);
-}
-
-/*! \brief Perform the General matrix rank-1 update for arbitrary data types 
-
-  \b Purpose:	
-
-  GER  performs the rank 1 operation for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-
-  A := alpha*x*y**T + A,
- 
-  where alpha is a scalar, x is an m element vector, y is an n element
-  vector and A is an m by n matrix.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] m
-  m is INTEGER
-  On entry,  m specifies the number of rows of the matrix A.
-  m must be at least zero.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the number of columns of the matrix A.
-  n must be at least zero.
-
-  \param[in] alpha
-  alpha is REAL/DOUBLE PRECISION
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] x
-  x is REAL/DOUBLE PRECISION array,dimension : 
-  at least ( 1 + ( m - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the m
-  element vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] y
-  y is REAL/DOUBLE PRECISION array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry, the incremented array y must contain the n
-  element vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-
-  \param[in,out] A
-  A is REAL/DOUBLE PRECISION array,dimension ( lda, n )\n
-  Before entry, the leading m by n part of the array A must
-  contain the matrix of coefficients. On exit, A is
-  overwritten by the updated matrix.  
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least max( 1, m ).
-  */
-template< typename T >
-void ger(
-    CBLAS_ORDER layout,
-    int64_t m, int64_t n,
-    T alpha,
-    T const *x, int64_t incx,
-    T const *y, int64_t incy,
-    T *A, int64_t lda )
-{
-    cblas_ger(layout, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-/*! \brief Perform the General matrix rank-1 update for arbitrary data types 
-
-  \b Purpose:	
-
-  GERU  performs the rank 1 operation for arbitrary data types
-  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  A := alpha*x*y**T + A,
- 
-  where alpha is a scalar, x is an m element vector, y is an n element
-  vector and A is an m by n matrix.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] m
-  m is INTEGER
-  On entry,  m specifies the number of rows of the matrix A.
-  m must be at least zero.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the number of columns of the matrix A.
-  n must be at least zero.
-
-  \param[in] alpha
-  alpha is SINGLE/DOUBLE PRECISION COMPLEX
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] x
-  x is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
-  at least ( 1 + ( m - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the m
-  element vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] y
-  y is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry, the incremented array y must contain the n
-  element vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-
-  \param[in,out] A
-  A is SINGLE/DOUBLE PRECISION COMPLEX array,dimension ( lda, n )\n
-  Before entry, the leading m by n part of the array A must
-  contain the matrix of coefficients. On exit, A is
-  overwritten by the updated matrix.  
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least max( 1, m ).
-  */
-template< typename T >
-void geru(
-    CBLAS_ORDER layout,
-    int64_t m, int64_t n,
-    T alpha,
-    T const *x, int64_t incx,
-    T const *y, int64_t incy,
-    T *A, int64_t lda )
-{
-    cblas_geru(layout, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-/*! \brief Perform the General matrix rank-1 update for arbitrary data types 
-
-  \b Purpose:	
-
-  GERC  performs the rank 1 operation for arbitrary data types
-  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  A := alpha*x*y**T + A,
- 
-  where alpha is a scalar, x is an m element vector, y is an n element
-  vector and A is an m by n matrix.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] m
-  m is INTEGER
-  On entry,  m specifies the number of rows of the matrix A.
-  m must be at least zero.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the number of columns of the matrix A.
-  n must be at least zero.
-
-  \param[in] alpha
-  alpha is SINGLE/DOUBLE PRECISION COMPLEX
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] x
-  x is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
-  at least ( 1 + ( m - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the m
-  element vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] y
-  y is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry, the incremented array y must contain the n
-  element vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-
-  \param[in,out] A
-  A is SINGLE/DOUBLE PRECISION COMPLEX array,dimension ( lda, n )\n
-  Before entry, the leading m by n part of the array A must
-  contain the matrix of coefficients. On exit, A is
-  overwritten by the updated matrix.  
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least max( 1, m ).
-  */
-template< typename T >
-void gerc(
-    CBLAS_ORDER layout,
-    int64_t m, int64_t n,
-    T alpha,
-    T const *x, int64_t incx,
-    T const *y, int64_t incy,
-    T *A, int64_t lda )
-{
-    cblas_gerc(layout, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-/*! \brief Perform the hermitian rank 1 operation for arbitrary data types 
-
-  \b Purpose:	
-
-  HER  performs the hermitian rank 1 operation for arbitrary data types
-  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  A := alpha*x*x**H + A,
- 
-  where alpha is a real scalar, x is an n element vector, A is an n by n
-  hermitian matrix.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies specifies whether the upper or lower triangular 
-  part of the array A is to be referenced as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.
-  n must be at least zero.
-
-  \param[in] alpha
-  alpha is SINGLE/DOUBLE PRECISION REAL
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] x
-  x is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the n
-  element vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in,out] A
-  A is SINGLE/DOUBLE PRECISION COMPLEX array,dimension ( lda, n )\n
-  Before entry with  UPLO = CblasUpper, the leading n by n
-  upper triangular part of the array A must contain the upper
-  triangular part of the hermitian matrix and the strictly
-  lower triangular part of A is not referenced. On exit, the
-  upper triangular part of the array A is overwritten by the
-  upper triangular part of the updated matrix. \n
-  Before entry with UPLO = CblasLower, the leading n by n
-  lower triangular part of the array A must contain the lower
-  triangular part of the hermitian matrix and the strictly
-  upper triangular part of A is not referenced. On exit, the
-  lower triangular part of the array A is overwritten by the
-  lower triangular part of the updated matrix. \n
-  Note that the imaginary parts of the diagonal elements need
-  not be set, they are assumed to be zero, and on exit they
-  are set to zero.  
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least max( 1, n ).
-  */
-template< typename T >
-void her(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n,
-    real_type<T> alpha,  // zher takes double alpha; use real
-    T const *x, int64_t incx,
-    T       *A, int64_t lda )
-{
-    cblas_her(layout, uplo, n, alpha, x, incx, A, lda);
-}
-
-/*! \brief Perform the hermitian rank 1 operation for arbitrary data types 
-
-  \b Purpose:	
-
-  HPR  performs the hermitian rank 1 operation for arbitrary data types
-  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  A := alpha*x*x**H + A,
- 
-  where alpha is a real scalar, x is an n element vector, A is an n by n
-  hermitian matrix, supplied in packed form.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies specifies whether the upper or lower triangular 
-  part of the array A is to be referenced as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   The upper triangular part of A is
-                                  supplied in Ap. \n
-  uplo = CBLAS_UPLO::CblasLower   The lower triangular part of A is
-                                  supplied in Ap.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.
-  n must be at least zero.
-
-  \param[in] alpha
-  alpha is SINGLE/DOUBLE PRECISION REAL
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] x
-  x is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the n
-  element vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in,out] Ap
-  Ap is SINGLE/DOUBLE PRECISION COMPLEX array,dimension 
-  atleast ( ( n*( n + 1 ) )/2 ).\n
-  Before entry with  UPLO = CblasUpper, the array Ap must
-  contain the upper triangular part of the hermitian matrix
-  packed sequentially, column by column, so that Ap( 1 )
-  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 1, 2 )
-  and a( 2, 2 ) respectively, and so on. On exit, the array
-  Ap is overwritten by the upper triangular part of the
-  updated matrix. \n
-  Before entry with UPLO = CblasLower, the array Ap must
-  contain the lower triangular part of the hermitian matrix
-  packed sequentially, column by column, so that Ap( 1 )
-  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 2, 1 )
-  and a( 3, 1 ) respectively, and so on. On exit, the array
-  Ap is overwritten by the lower triangular part of the
-  updated matrix. \n
-  Note that the imaginary parts of the diagonal elements need
-  not be set, they are assumed to be zero, and on exit they
-  are set to zero.  
-  */
-template< typename T >
-void hpr(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n,
-    real_type<T> alpha,  // zher takes double alpha; use real
-    T const *x, int64_t incx,
-    T       *Ap )
-{
-    cblas_hpr(layout, uplo, n, alpha, x, incx, Ap);
-}
-
-/*! \brief Perform the hermitian rank 2 operation for arbitrary data types 
-
-  \b Purpose:	
-
-  HER2  performs the hermitian rank 2 operation for arbitrary data types
-  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
- 
-  where alpha is a scalar, x and y are n element vector, A is an n by n
-  hermitian matrix.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies whether the upper or lower triangular part of the 
-  array A is to be referenced as follows: \n
-  UPLO = CblasUpper   Only the upper triangular part of A
-                      is to be referenced. \n
-  UPLO = CblasLower   Only the lower triangular part of A
-                      is to be referenced.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.
-  n must be at least zero.
-
-  \param[in] alpha
-  alpha is SINGLE/DOUBLE PRECISION COMPLEX
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] x
-  x is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the n
-  element vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] y
-  y is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry, the incremented array y must contain the n
-  element vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-  
-  \param[in,out] A
-  A is SINGLE/DOUBLE PRECISION COMPLEX array,dimension ( lda, n )\n
-  Before entry with  UPLO = CblasUpper, the leading n by n
-  upper triangular part of the array A must contain the upper
-  triangular part of the hermitian matrix and the strictly
-  lower triangular part of A is not referenced. On exit, the
-  upper triangular part of the array A is overwritten by the
-  upper triangular part of the updated matrix. \n
-  Before entry with UPLO = CblasLower, the leading n by n
-  lower triangular part of the array A must contain the lower
-  triangular part of the hermitian matrix and the strictly
-  upper triangular part of A is not referenced. On exit, the
-  lower triangular part of the array A is overwritten by the
-  lower triangular part of the updated matrix. \n
-  Note that the imaginary parts of the diagonal elements need
-  not be set, they are assumed to be zero, and on exit they
-  are set to zero.  
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least max( 1, n ).
-  */
-template< typename T >
-void her2(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n,
-    T alpha,  
-    T const *x, int64_t incx,
-    T const *y, int64_t incy,
-    T       *A, int64_t lda )
-{
-    cblas_her2(layout, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-/*! \brief Perform the hermitian rank 2 operation for arbitrary data types 
-
-  \b Purpose:	
-
-  HPR2  performs the hermitian rank 2 operation for arbitrary data types
-  Data precisions supported include SINGLE/DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
- 
-  where alpha is a scalar, x and y are n element vector, A is an n by n
-  hermitian matrix, supplied in packed form.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies specifies whether the upper or lower triangular 
-  part of the array A is to be referenced as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   The upper triangular part of A is
-                                  supplied in Ap. \n
-  uplo = CBLAS_UPLO::CblasLower   The lower triangular part of A is
-                                  supplied in Ap.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.
-  n must be at least zero.
-
-  \param[in] alpha
-  alpha is SINGLE/DOUBLE PRECISION COMPLEX
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] x
-  x is SINGLE/DOUBLE PRECISION COMPLEX array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the n
-  element vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] y
-  y is SINGLE/DOUBLE PRECISION REAL array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry, the incremented array y must contain the n
-  element vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-  
-  \param[in,out] Ap
-  Ap is SINGLE/DOUBLE PRECISION COMPLEX array,dimension 
-  atleast ( ( n*( n + 1 ) )/2 ).\n
-  Before entry with  UPLO = CblasUpper, the array Ap must
-  contain the upper triangular part of the hermitian matrix
-  packed sequentially, column by column, so that Ap( 1 )
-  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 1, 2 )
-  and a( 2, 2 ) respectively, and so on. On exit, the array
-  Ap is overwritten by the upper triangular part of the
-  updated matrix. \n
-  Before entry with UPLO = CblasLower, the array Ap must
-  contain the lower triangular part of the hermitian matrix
-  packed sequentially, column by column, so that Ap( 1 )
-  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 2, 1 )
-  and a( 3, 1 ) respectively, and so on. On exit, the array
-  Ap is overwritten by the lower triangular part of the
-  updated matrix. \n
-  Note that the imaginary parts of the diagonal elements need
-  not be set, they are assumed to be zero, and on exit they
-  are set to zero.  
-  */
-template< typename T >
-void hpr2(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n,
-    T alpha,  
-    T const *x, int64_t incx,
-    T const *y, int64_t incy,
-    T       *Ap )
-{
-    cblas_hpr2(layout, uplo, n, alpha, x, incx, y, incy, Ap);
-}
-
-/*! \brief Perform the symmetric rank 1 operation for arbitrary data types 
-
-  \b Purpose:	
-
-  SYR performs the symmetric rank 1 operation for arbitrary data types
-  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
-
-  A := alpha*x*x**T + A,
- 
-  where alpha is a real scalar, x is an n element vector, A is an n by n
-  symmetric matrix.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies specifies whether the upper or lower triangular 
-  part of the array A is to be referenced as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix. \n
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.
-  n must be at least zero.
-
-  \param[in] alpha
-  alpha is SINGLE/DOUBLE PRECISION REAL
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] x
-  x is SINGLE/DOUBLE PRECISION REAL array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the n
-  element vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in,out] A
-  A is SINGLE/DOUBLE PRECISION REAL array,dimension ( lda, n )\n
-  Before entry with  UPLO = CblasUpper, the leading n by n
-  upper triangular part of the array A must contain the upper
-  triangular part of the symmetric matrix and the strictly
-  lower triangular part of A is not referenced. On exit, the
-  upper triangular part of the array A is overwritten by the
-  upper triangular part of the updated matrix. \n
-  Before entry with UPLO = CblasLower, the leading n by n
-  lower triangular part of the array A must contain the lower
-  triangular part of the symmetric matrix and the strictly
-  upper triangular part of A is not referenced. On exit, the
-  lower triangular part of the array A is overwritten by the
-  lower triangular part of the updated matrix. \n
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least max( 1, n ).
-  */
-template< typename T >
-void syr(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n,
-    T alpha,  
-    T const *x, int64_t incx,
-    T       *A, int64_t lda )
-{
-    cblas_syr(layout, uplo, n, alpha, x, incx, A, lda);
-}
-
-/*! \brief Perform the symmetric rank 1 operation for arbitrary data types 
-
-  \b Purpose:	
-
-  SPR  performs the symmetric rank 1 operation for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL
-
-  A := alpha*x*x**T + A,
- 
-  where alpha is a real scalar, x is an n element vector, A is an n by n
-  symmetric matrix, supplied in packed form.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies specifies whether the upper or lower triangular 
-  part of the array A is to be referenced as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   The upper triangular part of A is
-                                  supplied in Ap. \n
-  uplo = CBLAS_UPLO::CblasLower   The lower triangular part of A is
-                                  supplied in Ap.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.
-  n must be at least zero.
-
-  \param[in] alpha
-  alpha is SINGLE/DOUBLE PRECISION REAL
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] x
-  x is SINGLE/DOUBLE PRECISION REAL array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the n
-  element vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in,out] Ap
-  Ap is SINGLE/DOUBLE PRECISION REAL array,dimension 
-  atleast ( ( n*( n + 1 ) )/2 ).\n
-  Before entry with  UPLO = CblasUpper, the array Ap must
-  contain the upper triangular part of the symmetric matrix
-  packed sequentially, column by column, so that Ap( 1 )
-  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 1, 2 )
-  and a( 2, 2 ) respectively, and so on. On exit, the array
-  Ap is overwritten by the upper triangular part of the
-  updated matrix. \n
-  Before entry with UPLO = CblasLower, the array Ap must
-  contain the lower triangular part of the symmetric matrix
-  packed sequentially, column by column, so that Ap( 1 )
-  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 2, 1 )
-  and a( 3, 1 ) respectively, and so on. On exit, the array
-  Ap is overwritten by the lower triangular part of the
-  updated matrix. \n  
-  */
-template< typename T >
-void spr(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n,
-    T alpha,  
-    T const *x, int64_t incx,
-    T       *Ap )
-{
-    cblas_spr(layout, uplo, n, alpha, x, incx, Ap);
-}
-
-/*! \brief Perform the symmetric rank 2 operation for arbitrary data types 
-
-  \b Purpose:	
-
-  SYR2  performs the symmetric rank 2 operation for arbitrary data types
-  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
-
-  A := alpha*x*y**T + alpha*y*x**T + A,
- 
-  where alpha is a scalar, x and y are n element vector, A is an n by n
-  symmetric matrix.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies whether the upper or lower triangular part of the 
-  array A is to be referenced as follows: \n
-  UPLO = CblasUpper   Only the upper triangular part of A
-                      is to be referenced. \n
-  UPLO = CblasLower   Only the lower triangular part of A
-                      is to be referenced.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.
-  n must be at least zero.
-
-  \param[in] alpha
-  alpha is SINGLE/DOUBLE PRECISION REAL
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] x
-  x is SINGLE/DOUBLE PRECISION REAL array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the n
-  element vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] y
-  y is SINGLE/DOUBLE PRECISION REAL array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry, the incremented array y must contain the n
-  element vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-  
-  \param[in,out] A
-  A is SINGLE/DOUBLE PRECISION REAL array,dimension ( lda, n )\n
-  Before entry with  UPLO = CblasUpper, the leading n by n
-  upper triangular part of the array A must contain the upper
-  triangular part of the symmetric matrix and the strictly
-  lower triangular part of A is not referenced. On exit, the
-  upper triangular part of the array A is overwritten by the
-  upper triangular part of the updated matrix. \n
-  Before entry with UPLO = CblasLower, the leading n by n
-  lower triangular part of the array A must contain the lower
-  triangular part of the symmetric matrix and the strictly
-  upper triangular part of A is not referenced. On exit, the
-  lower triangular part of the array A is overwritten by the
-  lower triangular part of the updated matrix. \n
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  lda must be at least max( 1, n ).
-  */
-template< typename T >
-void syr2(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n,
-    T alpha,
-    T const *x, int64_t incx,
-    T const *y, int64_t incy,
-    T       *A, int64_t lda )
-{
-    cblas_syr2(layout, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-/*! \brief Perform the symmetric rank 2 operation for arbitrary data types 
-
-  \b Purpose:	
-
-  SPR2  performs the symmetric rank 2 operation for arbitrary data types
-  Data precisions supported include SINGLE/DOUBLE PRECISION REAL
-
-  A := alpha*x*y**T + alpha*y*x**T + A,
- 
-  where alpha is a scalar, x and y are n element vector, A is an n by n
-  symmetric matrix, supplied in packed form.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO.
-  uplo specifies specifies whether the upper or lower triangular 
-  part of the array A is to be referenced as follows: \n
-  uplo = CBLAS_UPLO::CblasUpper   The upper triangular part of A is
-                                  supplied in Ap. \n
-  uplo = CBLAS_UPLO::CblasLower   The lower triangular part of A is
-                                  supplied in Ap.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix A.
-  n must be at least zero.
-
-  \param[in] alpha
-  alpha is SINGLE/DOUBLE PRECISION REAL
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] x
-  x is SINGLE/DOUBLE PRECISION REAL array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incx ) ). \n
-  Before entry, the incremented array x must contain the n
-  element vector x.
-
-  \param[in] incx
-  incx is INTEGER
-  On entry, incx specifies the increment for the elements of x.
-  incx must not be zero.
-
-  \param[in] y
-  y is SINGLE/DOUBLE PRECISION REAL array,dimension : 
-  at least ( 1 + ( n - 1 )*abs( incy ) ). \n
-  Before entry, the incremented array y must contain the n
-  element vector y.
-
-  \param[in] incy
-  incy is INTEGER
-  On entry, incy specifies the increment for the elements of y.
-  incy must not be zero.
-  
-  \param[in,out] Ap
-  Ap is SINGLE/DOUBLE PRECISION REAL array,dimension 
-  atleast ( ( n*( n + 1 ) )/2 ).\n
-  Before entry with  UPLO = CblasUpper, the array Ap must
-  contain the upper triangular part of the symmetric matrix
-  packed sequentially, column by column, so that Ap( 1 )
-  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 1, 2 )
-  and a( 2, 2 ) respectively, and so on. On exit, the array
-  Ap is overwritten by the upper triangular part of the
-  updated matrix. \n
-  Before entry with UPLO = CblasLower, the array Ap must
-  contain the lower triangular part of the symmetric matrix
-  packed sequentially, column by column, so that Ap( 1 )
-  contains a( 1, 1 ), Ap( 2 ) and Ap( 3 ) contain a( 2, 1 )
-  and a( 3, 1 ) respectively, and so on. On exit, the array
-  Ap is overwritten by the lower triangular part of the
-  updated matrix. \n  
-  */
-template< typename T >
-void spr2(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    int64_t n,
-    T alpha,
-    T const *x, int64_t incx,
-    T const *y, int64_t incy,
-    T       *Ap )
-{
-    cblas_spr2(layout, uplo, n, alpha, x, incx, y, incy, Ap);
-}
-
-/*! \brief General matrix-matrix multiply for arbitrary data types
-
-  \b Purpose:	
-
-  GEMM  performs general matrix-matrix multiply for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  C := alpha*op( A )*op( B ) + beta*C,
-
-  where  op( X ) is one of
-
-  op( X ) = X   or   op( X ) = X**T   or   op( X ) = X**H,
-
-  alpha and beta are scalars, and A, B and C are matrices, with op( A )
-  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] transA
-  transA is CBLAS_TRANSPOSE
-  On entry, transA specifies the form of op( A ) to be used in
-  the matrix multiplication as follows:
-  transA = CBLAS_TRANSPOSE::CblasNoTrans,  op( A ) = A.
-  transA = CBLAS_TRANSPOSE::CblasTrans,  op( A ) = A**T.
-  transA = CBLAS_TRANSPOSE::CblasConjTrans,  op( A ) = A**H.
-
-  \param[in] transB
-  transB is CBLAS_TRANSPOSE
-  On entry, transB specifies the form of op( B ) to be used in
-  the matrix multiplication as follows:
-  transB = CBLAS_TRANSPOSE::CblasNoTrans,  op( B ) = B.
-  transB = CBLAS_TRANSPOSE::CblasTrans,  op( B ) = B**T.
-  transB = CBLAS_TRANSPOSE::CblasConjTrans,  op( B ) = B**H.
-
-  \param[in] m
-  m is INTEGER
-  On entry,  m  specifies  the number  of rows  of the  matrix
-  op( A )  and of the  matrix  C.  m  must  be at least  zero.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n  specifies the number  of columns of the matrix
-  op( B ) and the number of columns of the matrix C. n must be
-  at least zero.
-
-  \param[in] k
-  k is INTEGER
-  On entry,  k  specifies  the number of columns of the matrix
-  op( A ) and the number of rows of the matrix op( B ). k must
-  be at least  zero.
-
-  \param[in] alpha
-  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
-  If transA = CblasNoTrans:
-  m-by-k , stored in an lda-by-k array [RowMajor: m-by-lda].
-  Otherwise:
-  k-by-m , stored in an lda-by-m array [RowMajor: k-by-lda].
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  If transA = CblasNoTrans: lda >= max(1, m) [RowMajor: lda >= max(1, k)].
-  Otherwise:                lda >= max(1, k) [RowMajor: lda >= max(1, m)].
-
-  \param[in] B
-  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
-  If transA = CblasNoTrans:
-  k-by-n , stored in an ldb-by-n array [RowMajor: k-by-ldb].
-  Otherwise:
-  n-by-k , stored in an ldb-by-k array [RowMajor: n-by-ldb].
-
-  \param[in] ldb
-  ldb is INTEGER
-  On entry, ldb specifies the Leading dimension of B
-  If transA = CblasNoTrans: ldb >= max(1, k) [RowMajor: ldb >= max(1, n)].
-  Otherwise:                ldb >= max(1, n) [RowMajor: ldb >= max(1, k)].
-
-  \param[in] beta
-  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, beta specifies the scalar alpha.When  beta  is
-  supplied as zero then C need not be set on input.
-
-  \param[in,out] C
-  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension :
-  m-by-n stored in an ldc-by-n array [RowMajor: m-by-ldc].
-  Before entry, the leading  m by n  part of the array  C must
-  contain the matrix  C,  except when  beta  is zero, in which
-  case C need not be set on entry.
-  On exit, the array  C  is overwritten by the  m by n  matrix
-  ( alpha*op( A )*op( B ) + beta*C ).
-
-  \param[in] ldc
-  ldc is INTEGER
-  On entry, ldc specifies the first dimension of C
-  ldc >= max(1, m) [RowMajor: ldc >= max(1, n)].
-  */	
-template< typename T >
+// =============================================================================
+/// General matrix-matrix multiply,
+///     \f[ C = \alpha op(A) \times op(B) + \beta C, \f]
+/// where op(X) is one of
+///     \f[ op(X) = X,   \f]
+///     \f[ op(X) = X^T, \f]
+///     \f[ op(X) = X^H, \f]
+/// alpha and beta are scalars, and A, B, and C are matrices, with
+/// op(A) an m-by-k matrix, op(B) a k-by-n matrix, and C an m-by-n matrix.
+///
+/// Generic implementation for arbitrary data types.
+/// TODO: generic version not yet implemented.
+///
+/// @param[in] layout
+///     Matrix storage, Layout::ColMajor or Layout::RowMajor.
+///
+/// @param[in] transA
+///     The operation op(A) to be used:
+///     - Op::NoTrans:   \f$ op(A) = A.   \f$
+///     - Op::Trans:     \f$ op(A) = A^T. \f$
+///     - Op::ConjTrans: \f$ op(A) = A^H. \f$
+///
+/// @param[in] transB
+///     The operation op(B) to be used:
+///     - Op::NoTrans:   \f$ op(B) = B.   \f$
+///     - Op::Trans:     \f$ op(B) = B^T. \f$
+///     - Op::ConjTrans: \f$ op(B) = B^H. \f$
+///
+/// @param[in] m
+///     Number of rows of the matrix C and op(A). m >= 0.
+///
+/// @param[in] n
+///     Number of columns of the matrix C and op(B). n >= 0.
+///
+/// @param[in] k
+///     Number of columns of op(A) and rows of op(B). k >= 0.
+///
+/// @param[in] alpha
+///     Scalar alpha. If alpha is zero, A and B are not accessed.
+///
+/// @param[in] A
+///     - If transA = NoTrans:
+///       the m-by-k matrix A, stored in an lda-by-k array [RowMajor: m-by-lda].
+///     - Otherwise:
+///       the k-by-m matrix A, stored in an lda-by-m array [RowMajor: k-by-lda].
+///
+/// @param[in] lda
+///     Leading dimension of A.
+///     - If transA = NoTrans: lda >= max(1, m) [RowMajor: lda >= max(1, k)].
+///     - Otherwise:           lda >= max(1, k) [RowMajor: lda >= max(1, m)].
+///
+/// @param[in] B
+///     - If transB = NoTrans:
+///       the k-by-n matrix B, stored in an ldb-by-n array [RowMajor: k-by-ldb].
+///     - Otherwise:
+///       the n-by-k matrix B, stored in an ldb-by-k array [RowMajor: n-by-ldb].
+///
+/// @param[in] ldb
+///     Leading dimension of B.
+///     - If transB = NoTrans: ldb >= max(1, k) [RowMajor: ldb >= max(1, n)].
+///     - Otherwise:           ldb >= max(1, n) [RowMajor: ldb >= max(1, k)].
+///
+/// @param[in] beta
+///     Scalar beta. If beta is zero, C need not be set on input.
+///
+/// @param[in] C
+///     The m-by-n matrix C, stored in an ldc-by-n array [RowMajor: m-by-ldc].
+///
+/// @param[in] ldc
+///     Leading dimension of C. ldc >= max(1, m) [RowMajor: ldc >= max(1, n)].
+///
+/// @ingroup gemm
+
+template< typename TA, typename TB, typename TC >
 void gemm(
-    CBLAS_ORDER layout,
+    CBLAS_LAYOUT layout,
     CBLAS_TRANSPOSE transA,
     CBLAS_TRANSPOSE transB,
     int64_t m, int64_t n, int64_t k,
-    T alpha,
-    T const *A, int64_t lda,
-    T const *B, int64_t ldb,
-    T beta,
-    T       *C, int64_t ldc )
+    scalar_type<TA, TB, TC> alpha,
+    TA const *A, int64_t lda,
+    TB const *B, int64_t ldb,
+    scalar_type<TA, TB, TC> beta,
+    TC       *C, int64_t ldc )
 {
+//    printf("In gemm.cpp\n");
     cblas_gemm(layout, transA, transB, m, n, k, alpha, A,lda, B, ldb, beta, C, ldc);
-}
 
-/*! \brief Solve the triangular matrix-matrix equation for arbitrary data types 
-
-  \b Purpose:	
-
-  TRSM  performs  one of the matrix equations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
-
-  where alpha is a scalar, X and B are m by n matrices, A is a unit, or
-  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
-  where  op( X ) is one of
-      
-  op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
-
-  The matrix X is overwritten on B.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] side
-  side is enum CBLAS_SIDE
-  side specifies specifies whether op( A ) appears on the left
-  or right of X as follows:
-  side = CBLAS_SIDE::CblasLeft   op( A )*X = alpha*B.
-  side = CBLAS_SIDE::CblasRight   op( A )*X = alpha*B.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies specifies whether the matrix A is an upper or
-  lower triangular matrix as follows:
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix.
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-  
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the form of op( A ) to be used in
-  the matrix multiplication as follows:
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,  op( A ) = A.
-  trans = CBLAS_TRANSPOSE::CblasTrans,  op( A ) = A**T.
-  trans = CBLAS_TRANSPOSE::CblasConjTrans,  op( A ) = A**H.
-  
-  \param[in] diag
-  diag is enum CBLAS_DIAG
-  diag specifies specifies whether or not A is unit triangular
-  as follows:
-  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.
-  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
-                                 triangular.
-
-  \param[in] m
-  m is INTEGER
-  On entry,  m  specifies  the number  of rows  of the  matrix
-  B.  m  must  be at least  zero.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n  specifies the number  of columns of the matrix
-  B. n must be at least zero.
-
-  \param[in] alpha
-  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
-  If side = CblasLeft:
-  the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda].
-  If side = CblasRight:
-  the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda].
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  If side = CblasLeft: lda >= max(1, m) .
-  If side = CblasRight:lda >= max(1, k) .
-
-  \param[in,out] B
-  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
-  m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb]. 
-  on exit  is overwritten by the solution matrix  X.
-
-  \param[in] ldb
-  ldb is INTEGER
-  On entry, ldb specifies the Leading dimension of B
-  ldb >= max(1, m) [RowMajor: ldb >= max(1, n)].
-  */
-template< typename T >
-void trsm(
-    CBLAS_ORDER layout,
-    CBLAS_SIDE side,
-    CBLAS_UPLO uplo,
-    CBLAS_TRANSPOSE trans,
-    CBLAS_DIAG diag,
-    int64_t m,
-    int64_t n,
-    T alpha,
-    T const *A, int64_t lda,
-    T       *B, int64_t ldb )
-{
-    cblas_trsm( layout, side, uplo, trans, diag, m, n,  alpha, A, lda, B, ldb);
-}
-/*! \brief Solve the Triangular matrix-matrix multiply for arbitrary data types 
-
-  \b Purpose:	
-
-  TRMM  performs solves one of the matrix equations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  B := alpha*op( A )*B,   or   B := alpha*B*op( A ),
-
-  where alpha is a scalar, B is an m by n matrices, A is a unit, or
-  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
-  op( A ) = A   or   op( A ) = A**T.
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] side
-  side is enum CBLAS_SIDE
-  side specifies whether op( A ) multiplies B from left or right of X
-  as follows:
-  side = CBLAS_SIDE::CblasLeft   B := alpha*op( A )*B.
-  side = CBLAS_SIDE::CblasRight  B := alpha*B*op( A ).
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies whether the matrix A is an upper or lower triangular
-  matrix as follows:
-  uplo = CBLAS_UPLO::CblasUpper   A is an upper triangular matrix.
-  uplo = CBLAS_UPLO::CblasLower   A is a lower triangular matrix.
-
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the form of op( A ) to be used in
-  the matrix multiplication as follows:
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,  op( A ) = A.
-  trans = CBLAS_TRANSPOSE::CblasTrans,  op( A ) = A**T.
-  trans = CBLAS_TRANSPOSE::CblasConjTrans,  op( A ) = A**T.
-
-  \param[in] diag
-  diag is enum CBLAS_DIAG
-  diag specifies specifies whether or not A is unit triangular
-  as follows:
-  diag = CBLAS_DIAG::CblasUnit   A is assumed to be unit triangular.
-  diag = CBLAS_DIAG::CblasNonUnit   A is not assumed to be unit
-                                 triangular.
-
-  \param[in] m
-  m is INTEGER
-  On entry,  m  specifies  the number  of rows  of the  matrix
-  B.  m  must  be at least  zero.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n  specifies the number  of columns of the matrix
-  B. n must be at least zero.
-
-  \param[in] alpha
-  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.When  alpha is
-  zero then  A is not referenced and  B need not be set before
-  entry.
-
-  \param[in] A
-  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
-  If side = CblasLeft:
-  the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda].
-  If side = CblasRight:
-  the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda].
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  If side = CblasLeft: lda >= max(1, m) .
-  If side = CblasRight:lda >= max(1, n) .
-
-  \param[in,out] B
-  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
-  m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb].
-
-  \param[in] ldb
-  ldb is INTEGER
-  On entry, ldb specifies the Leading dimension of B
-  ldb >= max(1, m) [RowMajor: ldb >= max(1, n)].
-  */
-template< typename T >
-void trmm(
-    CBLAS_ORDER layout,
-    CBLAS_SIDE side,
-    CBLAS_UPLO uplo,
-    CBLAS_TRANSPOSE trans,
-    CBLAS_DIAG diag,
-    int64_t m,
-    int64_t n,
-    T alpha,
-    T const *A, int64_t lda,
-    T       *B, int64_t ldb )
-{
-    cblas_trmm( layout, side, uplo, trans, diag, m, n,  alpha, A, lda, B, ldb);
-}
-
-/*! \brief Solve the Hermitian matrix-matrix multiply for arbitrary data types 
-
-  \b Purpose:	
-
-  HEMM  performs solves one of the matrix-matrix operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  C := alpha*A*B + beta*C
-
-  or
-
-  C := alpha*B*A + beta*C,
-
-  where alpha is a scalar,  A is an hermitian matrix
-  C and B are m by n matrices
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] side
-  side is enum CBLAS_SIDE
-  side specifies specifies whether the  hermitian matrix  A
-  appears on the  left or right  in the  operation as follows:
-  side = CBLAS_SIDE::CblasLeft   C := alpha*A*B + beta*C,
-  side = CBLAS_SIDE::CblasRight   C := alpha*B*A + beta*C
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies specifies whether  the  upper  or  lower
-  triangular  part  of  the  hermitian  matrix   A  is  to  be
-  referenced as follows:
-  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of the
-                                  hermitian matrix is to be referenced.
-  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of the
-                                  hermitian matrix is to be referenced.
-
-  \param[in] m
-  m is INTEGER
-  On entry,  m  specifies  the number  of rows  of the  matrix
-  C.  m  must  be at least  zero.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n  specifies the number  of columns of the matrix
-  C. n must be at least zero.
-
-  \param[in] alpha
-  alpha is COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is COMPLEX/COMPLEX*16 array,dimension :
-  If side = CblasLeft:
-  the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda].
-  If side = CblasRight:
-  the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda].
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  If side = CblasLeft: lda >= max(1, m) .
-  If side = CblasRight:lda >= max(1, k) .
-
-  \param[in] B
-  B is COMPLEX/COMPLEX*16 array,dimension :
-  m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb].
-
-  \param[in] ldb
-  ldb is INTEGER
-  On entry, ldb specifies the Leading dimension of B
-  ldb >= max(1, m) [RowMajor: ldb >= max(1, n)].
-
-  \param[in] beta
-  beta is COMPLEX/COMPLEX*16
-  On entry, beta specifies the scalar beta.
-  If beta is zero, C need not be set on input
-
-  \param[in,out] C
-  C is COMPLEX/COMPLEX*16 array,dimension :
-  m-by-n , stored in an ldc-by-n array [RowMajor: m-by-ldc].
-
-  \param[in] ldc
-  ldc is INTEGER
-  On entry, ldc specifies the Leading dimension of C
-  ldc >= max(1, m) [RowMajor: ldc >= max(1, n)].
-  */
-template< typename T >
-void hemm(
-    CBLAS_ORDER layout,
-    CBLAS_SIDE side,
-    CBLAS_UPLO uplo,
-    int64_t m, int64_t n,
-    T alpha,
-    T const *A, int64_t lda,
-    T const *B, int64_t ldb,
-    T beta,
-    T       *C, int64_t ldc )
-{
-    cblas_hemm( layout, side, uplo, m, n,  alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-/*! \brief Solve the Symmetric matrix-matrix multiply for arbitrary data types 
-
-  \b Purpose:	
-  
-  SYMM  performs solves one of the matrix-matrix operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  C := alpha*A*B + beta*C
-
-  or
-
-  C := alpha*B*A + beta*C,
-
-  where alpha is a scalar,  A is an symmetric matrix
-  C and B are m by n matrices
-
-  \param[in] layout
-  layout is enum CBLAS_ORDER
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_ORDER::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] side
-  side is enum CBLAS_SIDE
-  side specifies specifies whether the  symmetric matrix  A
-  appears on the  left or right  in the  operation as follows:
-  side = CBLAS_SIDE::CblasLeft   C := alpha*A*B + beta*C,
-  side = CBLAS_SIDE::CblasRight   C := alpha*B*A + beta*C
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies specifies whether  the  upper  or  lower
-  triangular  part  of  the  symmetric  matrix   A  is  to  be
-  referenced as follows:
-  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of the
-                                  symmetric matrix is to be referenced.
-  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of the
-                                  symmetric matrix is to be referenced.
-
-  \param[in] m
-  m is INTEGER
-  On entry,  m  specifies  the number  of rows  of the  matrix
-  C.  m  must  be at least  zero.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n  specifies the number  of columns of the matrix
-  C. n must be at least zero.
-
-  \param[in] alpha
-  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
-  If side = CblasLeft:
-  the m-by-m matrix A, stored in an lda-by-m array [RowMajor: m-by-lda].
-  If side = CblasRight:
-  the n-by-n matrix A, stored in an lda-by-n array [RowMajor: n-by-lda].
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  If side = CblasLeft: lda >= max(1, m) .
-  If side = CblasRight:lda >= max(1, k) .
-
-  \param[in] B
-  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
-  m-by-n , stored in an ldb-by-n array [RowMajor: m-by-ldb].
-
-  \param[in] ldb
-  ldb is INTEGER
-  On entry, ldb specifies the Leading dimension of B
-  ldb >= max(1, m) [RowMajor: ldb >= max(1, n)].
-
-  \param[in] beta
-  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, beta specifies the scalar beta.
-  If beta is zero, C need not be set on input
-
-  \param[in, out] C
-  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
-  m-by-n , stored in an ldc-by-n array [RowMajor: m-by-ldc].
-
-  \param[in] ldc
-  ldc is INTEGER
-  On entry, ldc specifies the Leading dimension of C
-  ldc >= max(1, m) [RowMajor: ldc >= max(1, n)].
-  */
-template< typename T >
-void symm(
-    CBLAS_ORDER layout,
-    CBLAS_SIDE side,
-    CBLAS_UPLO uplo,
-    int64_t m, int64_t n,
-    T alpha,
-    T const *A, int64_t lda,
-    T const *B, int64_t ldb,
-    T beta,
-    T       *C, int64_t ldc )
-{
-    cblas_symm( layout, side, uplo, m, n,  alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-/*! \brief Solve the Symmetric rank-k operations for arbitrary data types 
-
-  \b Purpose:	
-
-  SYRK  performs one of the symmetric rank k operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  C := alpha*A*A**T + beta*C,
-
-  or
-
-  C := alpha*A**T*A + beta*C,
-
-  where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
-  and  A  is an  n by k  matrix in the first case and a  k by n  matrix
-  in the second case.
-
-  \param[in] layout
-  layout is enum CBLAS_LAYOUT
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies specifies whether  the  upper  or  lower
-  triangular  part  of  the  array C  is  to  be  referenced
-  as follows:
-  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of C
-                                  is to be referenced.
-  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of C
-                                  is to be referenced.
-
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the operation to be used as follows:
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,C := alpha*A*A**T + beta*C.
-  trans = CBLAS_TRANSPOSE::CblasTrans,C := alpha*A**T*A + beta*C.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix C.  n must be
-  at least zero.
-
-  \param[in] k
-  k is INTEGER
-  If trans = CblasNoTrans: k is number of columns of the matrix A.
-  Otherwise:               k is number of rows    of the matrix A.
-  k must be at least  zero.
-
-  \param[in] alpha
-  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
-  If transA = CblasNoTrans:
-  n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda].
-  Otherwise:
-  k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda].
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  If transA = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)].
-  Otherwise:                lda >= max(1, k) [RowMajor: lda >= max(1, n)].
-
-  \param[in] beta
-  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, beta specifies the scalar alpha.When  beta  is
-  supplied as zero then C need not be set on input.
-
-  \param[in,out] C
-  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension :
-  The n-by-n symmetric matrix C,
-  stored in an ldc-by-n array [RowMajor: n-by-ldc].
-  On exit, the array  C  is overwritten by the  lower/upper
-  triangular part of the updated matrix.
-
-  \param[in] ldc
-  ldc is INTEGER
-  On entry, ldc specifies the first dimension of C
-  ldc >= max(1, n)
-  */
-template< typename T >
-void syrk(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    CBLAS_TRANSPOSE trans,
-    int64_t n, int64_t k,
-    T alpha,
-    T const *A, int64_t lda,
-    T beta,
-    T       *C, int64_t ldc )
-{
-    cblas_syrk( layout, uplo, trans,  n, k,  alpha, A, lda, beta, C, ldc);
-}
-
-/*! \brief Solve the Symmetric rank 2k operations for arbitrary data types 
-
-  \b Purpose:	
-
-  SYR2K  performs one of the symmetric rank 2k operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION REAL, DOUBLE PRECISION REAL,
-  SINGLE PRECISION COMPLEX, DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-  C := alpha*A*B**T + alpha*B*A**T + beta*C,
-
-  or
-
-  C := alpha*A**T*B + alpha*B**T*A + beta*C,
-
-  where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
-  and  A  and B are n by k  matrices in the first case and k by n  matrices
-  in the second case.
-
-  \param[in] layout
-  layout is enum CBLAS_LAYOUT
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies specifies whether  the  upper  or  lower
-  triangular  part  of  the  array C  is  to  be  referenced
-  as follows:
-  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of C
-                                  is to be referenced.
-  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of C
-                                  is to be referenced.
-
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the operation to be used as follows:
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,C := alpha*A*B**T + alpha*B*A**T + beta*C.
-  trans = CBLAS_TRANSPOSE::CblasTrans,  C := alpha*A**T*B + alpha*B**T*A + beta*C.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix C.  n must be
-  at least zero.
-
-  \param[in] k
-  k is INTEGER
-  If trans = CblasNoTrans: k is number of columns of the matrices A & B.
-  Otherwise:               k is number of rows    of the matrices A & B.
-  k must be at least  zero.
-
-  \param[in] alpha
-  alpha is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
-  If trans = CblasNoTrans:
-  n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda].
-  Otherwise:
-  k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda].
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  If trans = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)].
-  Otherwise:                lda >= max(1, k) [RowMajor: lda >= max(1, n)].
-
-  \param[in] B
-  B is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array,dimension :
-  If trans = CblasNoTrans:
-  n-by-k , stored in an ldb-by-k array [RowMajor: n-by-ldb].
-  Otherwise:
-  k-by-n , stored in an ldb-by-n array [RowMajor: k-by-ldb]
-
-  \param[in] ldb
-  ldb is INTEGER
-  On entry, ldb specifies the Leading dimension of B
-  If trans = CblasNoTrans: ldb >= max(1, n) [RowMajor: ldb >= max(1, k)].
-  Otherwise:               ldb >= max(1, k) [RowMajor: ldb >= max(1, n)].
-
-  \param[in] beta
-  beta is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16
-  On entry, beta specifies the scalar alpha.When  beta  is
-  supplied as zero then C need not be set on input.
-
-  \param[in,out] C
-  C is REAL/DOUBLE PRECISION/COMPLEX/COMPLEX*16 array, dimension :
-  The n-by-n symmetric matrix C,
-  stored in an ldc-by-n array [RowMajor: n-by-ldc].
-  On exit, the array  C  is overwritten by the  lower/upper
-  triangular part of the updated matrix.
-
-  \param[in] ldc
-  ldc is INTEGER
-  On entry, ldc specifies the first dimension of C
-  ldc >= max(1, n)
-  */
-template< typename T >
-void syr2k(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    CBLAS_TRANSPOSE trans,
-    int64_t n, int64_t k,
-    T alpha,
-    T const *A, int64_t lda,
-    T const *B, int64_t ldb,
-    T beta,
-    T       *C, int64_t ldc )
-{
-    cblas_syr2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc );	
-}
-
-/*! \brief Solve the Hermitian rank k operations for arbitrary data types 
-
-  \b Purpose:	
-
-  HERK  performs one of the hermitian rank k operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION COMPLEX, 
-  DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-    C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C,
-  
-  or
-  
-    C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C,
-
-  where  alpha and beta  are real scalars,  C is an  n by n  hermitian 
-  matrix and  A  is an n by k  matrix in the first case and 
-  k by n  matrix in the second case.
-
-  \param[in] layout
-  layout is enum CBLAS_LAYOUT
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies specifies whether  the  upper  or  lower
-  triangular  part  of  the  array C  is  to  be  referenced 
-  as follows:
-  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of C
-                                  is to be referenced.
-  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of C
-                                  is to be referenced.
-
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the operation to be used as follows:
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,  C := alpha*A*A**H + beta*C.
-  trans = CBLAS_TRANSPOSE::CblasConjTrans,C := alpha*A**H*A + beta*C.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix C.  n must be
-  at least zero.
-
-  \param[in] k
-  k is INTEGER
-  If trans = CblasNoTrans: k is number of columns of the matrix   A. 
-  Otherwise:               k is number of rows    of the matrix   A.
-  k must be at least  zero.
-
-  \param[in] alpha
-  alpha is REAL/DOUBLE PRECISION
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is COMPLEX/COMPLEX*16 array,dimension :
-  If trans = CblasNoTrans:
-  n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda].
-  Otherwise:
-  k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda].
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  If trans = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)].
-  Otherwise:                lda >= max(1, k) [RowMajor: lda >= max(1, n)].
-  
-  \param[in] beta
-  beta is REAL/DOUBLE PRECISION
-  On entry, beta specifies the scalar alpha.When  beta  is
-  supplied as zero then C need not be set on input.
-
-  \param[in,out] C
-  C is COMPLEX/COMPLEX*16 array, dimension :
-  The n-by-n Hermitian matrix C,
-  stored in an ldc-by-n array [RowMajor: n-by-ldc].
-  On exit, the array  C  is overwritten by the  lower/upper 
-  triangular part of the updated matrix.
-
-  \param[in] ldc
-  ldc is INTEGER
-  On entry, ldc specifies the first dimension of C
-  ldc >= max(1, n) 
-  */
-template< typename T >
-void herk(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    CBLAS_TRANSPOSE trans,
-    int64_t n, int64_t k,
-    real_type<T> alpha,
-    T const *A, int64_t lda,
-    real_type<T> beta,
-    T       *C, int64_t ldc )
-{
-    cblas_herk( layout, uplo, trans, n, k, alpha, A, lda, beta, C, ldc );	
-}
-
-/*! \brief Solve the Hermitian rank 2k operations for arbitrary data types 
-
-  \b Purpose:	
-
-  HER2K  performs one of the hermitian rank 2k operations for arbitrary data types
-  Data precisions supported include SINGLE PRECISION COMPLEX,
-  DOUBLE PRECISION COMPLEX(COMPLEX*16)
-
-    C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C,
-  
-  or
-  
-    C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C,
-
-  where  alpha and beta  are scalars with  beta  real,  C is an  n by n 
-  hermitian matrix and  A  and B are n by k  matrices in the first case 
-  and k by n  matrices in the second case.
-
-  \param[in] layout
-  layout is enum CBLAS_LAYOUT
-  layout specifies Matrix storage as follows:
-  layout = CBLAS_LAYOUT::CblasRowMajor or Layout::CblasColMajor.
-
-  \param[in] uplo
-  uplo is enum CBLAS_UPLO
-  uplo specifies specifies whether  the  upper  or  lower
-  triangular  part  of  the  array C  is  to  be  referenced 
-  as follows:
-  uplo = CBLAS_UPLO::CblasUpper   Only the upper triangular part of C
-                                  is to be referenced.
-  uplo = CBLAS_UPLO::CblasLower   Only the lower triangular part of C
-                                  is to be referenced.
-
-  \param[in] trans
-  trans is CBLAS_TRANSPOSE
-  On entry, trans specifies the operation to be used as follows:
-  trans = CBLAS_TRANSPOSE::CblasNoTrans,  C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C.
-  trans = CBLAS_TRANSPOSE::CblasConjTrans,C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C.
-
-  \param[in] n
-  n is INTEGER
-  On entry,  n specifies the order of the matrix C.  n must be
-  at least zero.
-
-  \param[in] k
-  k is INTEGER
-  If trans = CblasNoTrans: k is number of columns of the matrices A & B. 
-  Otherwise:               k is number of rows    of the matrices A & B.
-  k must be at least  zero.
-
-  \param[in] alpha
-  alpha is COMPLEX/COMPLEX*16
-  On entry, alpha specifies the scalar alpha.
-
-  \param[in] A
-  A is COMPLEX/COMPLEX*16 array,dimension :
-  If trans = CblasNoTrans:
-  n-by-k , stored in an lda-by-k array [RowMajor: n-by-lda].
-  Otherwise:
-  k-by-n , stored in an lda-by-n array [RowMajor: k-by-lda].
-
-  \param[in] lda
-  lda is INTEGER
-  On entry, lda specifies the Leading dimension of A
-  If trans = CblasNoTrans: lda >= max(1, n) [RowMajor: lda >= max(1, k)].
-  Otherwise:                lda >= max(1, k) [RowMajor: lda >= max(1, n)].
-
-  \param[in] B
-  B is COMPLEX/COMPLEX*16 array,dimension :
-  If trans = CblasNoTrans:
-  n-by-k , stored in an ldb-by-k array [RowMajor: n-by-ldb].
-  Otherwise:
-  k-by-n , stored in an ldb-by-n array [RowMajor: k-by-ldb]
-
-  \param[in] ldb
-  ldb is INTEGER
-  On entry, ldb specifies the Leading dimension of B
-  If trans = CblasNoTrans: ldb >= max(1, n) [RowMajor: ldb >= max(1, k)].
-  Otherwise:               ldb >= max(1, k) [RowMajor: ldb >= max(1, n)].
-  
-  \param[in] beta
-  beta is REAL/DOUBLE PRECISION
-  On entry, beta specifies the scalar alpha.When  beta  is
-  supplied as zero then C need not be set on input.
-
-  \param[in,out] C
-  C is COMPLEX/COMPLEX*16 array, dimension :
-  The n-by-n Hermitian matrix C,
-  stored in an ldc-by-n array [RowMajor: n-by-ldc].
-  On exit, the array  C  is overwritten by the  lower/upper 
-  triangular part of the updated matrix.
-
-  \param[in] ldc
-  ldc is INTEGER
-  On entry, ldc specifies the first dimension of C
-  ldc >= max(1, n) 
-  */
-template< typename T >
-void her2k(
-    CBLAS_ORDER layout,
-    CBLAS_UPLO uplo,
-    CBLAS_TRANSPOSE trans,
-    int64_t n, int64_t k,
-    T alpha,
-    T const *A, int64_t lda,
-    T const *B, int64_t ldb,
-    real_type<T> beta,
-    T       *C, int64_t ldc )
-{
-    cblas_her2k( layout, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc );	
-}
+};
 
 }  // namespace blis
+
 #endif        //  #ifndef BLIS_HH
diff --git a/cpp/blis_util.hh b/cpp/blis_util.hh
new file mode 100644
index 000000000..ebe4274e6
--- /dev/null
+++ b/cpp/blis_util.hh
@@ -0,0 +1,226 @@
+#ifndef BLIS_UTIL_HH
+#define BLIS_UTIL_HH
+
+#include <complex>
+#include <cstdarg>
+
+namespace blis {
+
+// -----------------------------------------------------------------------------
+// Extend real, imag, conj to other datatypes.
+template< typename T >
+inline T real( T x ) { return x; }
+
+template< typename T >
+inline T imag( T x ) { return 0; }
+
+template< typename T >
+inline T conj( T x ) { return x; }
+
+// -----------------------------------------------------------------------------
+// 1-norm absolute value, |Re(x)| + |Im(x)|
+template< typename T >
+T abs1( T x )
+{
+    return std::abs( x );
+}
+
+template< typename T >
+T abs1( std::complex<T> x )
+{
+    return std::abs( real(x) ) + std::abs( imag(x) );
+}
+
+// -----------------------------------------------------------------------------
+// common_type_t is defined in C++14; here's a C++11 definition
+#if __cplusplus >= 201402L
+    using std::common_type_t;
+    using std::decay_t;
+#else
+    template< typename... Ts >
+    using common_type_t = typename std::common_type< Ts... >::type;
+
+    template< typename... Ts >
+    using decay_t = typename std::decay< Ts... >::type;
+#endif
+
+//------------------------------------------------------------------------------
+/// True if T is std::complex<T2> for some type T2.
+template <typename T>
+struct is_complex:
+    std::integral_constant<bool, false>
+{};
+
+// specialize for std::complex
+template <typename T>
+struct is_complex< std::complex<T> >:
+    std::integral_constant<bool, true>
+{};
+
+// -----------------------------------------------------------------------------
+// Based on C++14 common_type implementation from
+// http://www.cplusplus.com/reference/type_traits/common_type/
+// Adds promotion of complex types based on the common type of the associated
+// real types. This fixes various cases:
+//
+// std::common_type_t< double, complex<float> > is complex<float>  (wrong)
+//        scalar_type< double, complex<float> > is complex<double> (right)
+//
+// std::common_type_t< int, complex<long> > is not defined (compile error)
+//        scalar_type< int, complex<long> > is complex<long> (right)
+
+// for zero types
+template< typename... Types >
+struct scalar_type_traits;
+
+// define scalar_type<> type alias
+template< typename... Types >
+using scalar_type = typename scalar_type_traits< Types... >::type;
+
+// for one type
+template< typename T >
+struct scalar_type_traits< T >
+{
+    using type = decay_t<T>;
+};
+
+// for two types
+// relies on type of ?: operator being the common type of its two arguments
+template< typename T1, typename T2 >
+struct scalar_type_traits< T1, T2 >
+{
+    using type = decay_t< decltype( true ? std::declval<T1>() : std::declval<T2>() ) >;
+};
+
+// for either or both complex,
+// find common type of associated real types, then add complex
+template< typename T1, typename T2 >
+struct scalar_type_traits< std::complex<T1>, T2 >
+{
+    using type = std::complex< common_type_t< T1, T2 > >;
+};
+
+template< typename T1, typename T2 >
+struct scalar_type_traits< T1, std::complex<T2> >
+{
+    using type = std::complex< common_type_t< T1, T2 > >;
+};
+
+template< typename T1, typename T2 >
+struct scalar_type_traits< std::complex<T1>, std::complex<T2> >
+{
+    using type = std::complex< common_type_t< T1, T2 > >;
+};
+
+// for three or more types
+template< typename T1, typename T2, typename... Types >
+struct scalar_type_traits< T1, T2, Types... >
+{
+    using type = scalar_type< scalar_type< T1, T2 >, Types... >;
+};
+
+// -----------------------------------------------------------------------------
+// for any combination of types, determine associated real, scalar,
+// and complex types.
+//
+// real_type< float >                               is float
+// real_type< float, double, complex<float> >       is double
+//
+// scalar_type< float >                             is float
+// scalar_type< float, complex<float> >             is complex<float>
+// scalar_type< float, double, complex<float> >     is complex<double>
+//
+// complex_type< float >                            is complex<float>
+// complex_type< float, double >                    is complex<double>
+// complex_type< float, double, complex<float> >    is complex<double>
+
+// for zero types
+template< typename... Types >
+struct real_type_traits;
+
+// define real_type<> type alias
+template< typename... Types >
+using real_type = typename real_type_traits< Types... >::real_t;
+
+// define complex_type<> type alias
+template< typename... Types >
+using complex_type = std::complex< real_type< Types... > >;
+
+// for one type
+template< typename T >
+struct real_type_traits<T>
+{
+    using real_t = T;
+};
+
+// for one complex type, strip complex
+template< typename T >
+struct real_type_traits< std::complex<T> >
+{
+    using real_t = T;
+};
+
+// for two or more types
+template< typename T1, typename... Types >
+struct real_type_traits< T1, Types... >
+{
+    using real_t = scalar_type< real_type<T1>, real_type< Types... > >;
+};
+
+// -----------------------------------------------------------------------------
+// max that works with different data types: int64_t = max( int, int64_t )
+// and any number of arguments: max( a, b, c, d )
+
+// one argument
+template< typename T >
+T max( T x )
+{
+    return x;
+}
+
+// two arguments
+template< typename T1, typename T2 >
+scalar_type< T1, T2 >
+    max( T1 x, T2 y )
+{
+    return (x >= y ? x : y);
+}
+
+// three or more arguments
+template< typename T1, typename... Types >
+scalar_type< T1, Types... >
+    max( T1 first, Types... args )
+{
+    return max( first, max( args... ) );
+}
+
+// -----------------------------------------------------------------------------
+// min that works with different data types: int64_t = min( int, int64_t )
+// and any number of arguments: min( a, b, c, d )
+
+// one argument
+template< typename T >
+T min( T x )
+{
+    return x;
+}
+
+// two arguments
+template< typename T1, typename T2 >
+scalar_type< T1, T2 >
+    min( T1 x, T2 y )
+{
+    return (x <= y ? x : y);
+}
+
+// three or more arguments
+template< typename T1, typename... Types >
+scalar_type< T1, Types... >
+    min( T1 first, Types... args )
+{
+    return min( first, min( args... ) );
+}
+
+}  // namespace blis
+
+#endif        //  #ifndef BLIS_UTIL_HH
diff --git a/test/Makefile b/test/Makefile
index a2e0391d5..5c4aca248 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -148,10 +148,10 @@ CBLAS_HEADER_PATH  = ../frame/compat/cblas/src
 CINCFLAGS      := -I$(INC_PATH) -I$(CBLAS_HEADER_PATH)
 
 # Use the CFLAGS for the configuration family.
-CFLAGS         := $(call get-user-cflags-for,$(CONFIG_NAME))
+override CFLAGS         += $(call get-user-cflags-for,$(CONFIG_NAME))
 
 # Add local header paths to CFLAGS
-CFLAGS         += -I$(TEST_SRC_PATH)
+override CFLAGS         += -I$(TEST_SRC_PATH)
 
 # Locate the libblis library to which we will link.
 #LIBBLIS_LINK   := $(LIB_PATH)/$(LIBBLIS_L)
diff --git a/test/test_gemm.c b/test/test_gemm.c
index 55e608c48..c2730e68d 100644
--- a/test/test_gemm.c
+++ b/test/test_gemm.c
@@ -91,13 +91,23 @@ int main( int argc, char** argv )
 	n_input = 4;
 #endif
 
-#if 1
-	//dt = BLIS_FLOAT;
-	dt = BLIS_DOUBLE;
+#if 0
+//	dt = BLIS_FLOAT;
+//	dt = BLIS_DOUBLE;
 #else
-	//dt = BLIS_SCOMPLEX;
-	dt = BLIS_DCOMPLEX;
+//	dt = BLIS_SCOMPLEX;
+//	dt = BLIS_DCOMPLEX;
 #endif
+#ifdef FLOAT
+        dt = BLIS_FLOAT;
+#elif defined DOUBLE
+        dt = BLIS_DOUBLE;
+#elif defined SCOMPLEX
+        dt = BLIS_SCOMPLEX;
+#elif defined DCOMPLEX
+        dt = BLIS_DCOMPLEX;
+#endif
+
 
 	transa = BLIS_NO_TRANSPOSE;
 	transb = BLIS_NO_TRANSPOSE;
diff --git a/testcpp/Makefile b/testcpp/Makefile
index ccd44172b..3f2dc1327 100644
--- a/testcpp/Makefile
+++ b/testcpp/Makefile
@@ -115,7 +115,11 @@ CINCFLAGS      := -I$(INC_PATH)
 CXX	= g++
 
 # Use the CFLAGS for the configuration family.
+<<<<<<< HEAD
 override CFLAGS         += $(call get-sandbox-cxxflags-for,$(CONFIG_NAME))
+=======
+override CFLAGS         += $(call get-user-cflags-for,$(CONFIG_NAME))
+>>>>>>> Code Cleanup done; Test code updated to add performance measurement
 
 # Add local header paths to CFLAGS
 #CFLAGS         = -O0 -g -Wall 
@@ -136,6 +140,7 @@ LIBBLIS_LINK   := $(LIB_PATH)/$(LIBBLIS_L)
 
 # Complete list of possible targets when defining 'all':
 #
+<<<<<<< HEAD
 #   blis 
 #
 all: blis 
@@ -182,6 +187,103 @@ blis: test_asum_blis.x \
 	test_trsm_blis.x \
 	test_trsv_blis.x 
  
+=======
+#   blis openblas atlas mkl mac essl
+#
+#all: blis openblas atlas mkl
+all: blis openblas mkl
+
+blis: test_gemm_blis.x \
+	test_gemm1_blis.x
+
+openblas: \
+      test_dotv_openblas.x \
+      test_axpyv_openblas.x \
+      test_gemv_openblas.x \
+      test_ger_openblas.x \
+      test_hemv_openblas.x \
+      test_her_openblas.x \
+      test_her2_openblas.x \
+      test_trmv_openblas.x \
+      test_trsv_openblas.x \
+      \
+      test_gemm_openblas.x \
+      test_hemm_openblas.x \
+      test_herk_openblas.x \
+      test_her2k_openblas.x \
+      test_trmm_openblas.x \
+      test_trsm_openblas.x
+
+atlas: \
+      test_dotv_atlas.x \
+      test_axpyv_atlas.x \
+      test_gemv_atlas.x \
+      test_ger_atlas.x \
+      test_hemv_atlas.x \
+      test_her_atlas.x \
+      test_her2_atlas.x \
+      test_trmv_atlas.x \
+      test_trsv_atlas.x \
+      \
+      test_gemm_atlas.x \
+      test_hemm_atlas.x \
+      test_herk_atlas.x \
+      test_her2k_atlas.x \
+      test_trmm_atlas.x \
+      test_trsm_atlas.x
+
+mkl:  test_dotv_mkl.x \
+      test_axpyv_mkl.x \
+      test_gemv_mkl.x \
+      test_ger_mkl.x \
+      test_hemv_mkl.x \
+      test_her_mkl.x \
+      test_her2_mkl.x \
+      test_trmv_mkl.x \
+      test_trsv_mkl.x \
+      \
+      test_gemm_mkl.x \
+      test_hemm_mkl.x \
+      test_herk_mkl.x \
+      test_her2k_mkl.x \
+      test_trmm_mkl.x \
+      test_trsm_mkl.x
+
+essl: test_dotv_essl.x \
+      test_axpyv_essl.x \
+      test_gemv_essl.x \
+      test_ger_essl.x \
+      test_hemv_essl.x \
+      test_her_essl.x \
+      test_her2_essl.x \
+      test_trmv_essl.x \
+      test_trsv_essl.x \
+      \
+      test_gemm_essl.x \
+      test_hemm_essl.x \
+      test_herk_essl.x \
+      test_her2k_essl.x \
+      test_trmm_essl.x \
+      test_trsm_essl.x
+
+mac:  test_dotv_mac.x \
+      test_axpyv_mac.x \
+      test_gemv_mac.x \
+      test_ger_mac.x \
+      test_hemv_mac.x \
+      test_her_mac.x \
+      test_her2_mac.x \
+      test_trmv_mac.x \
+      test_trsv_mac.x \
+      \
+      test_gemm_mac.x \
+      test_hemm_mac.x \
+      test_herk_mac.x \
+      test_her2k_mac.x \
+      test_trmm_mac.x \
+      test_trsm_mac.x
+
+>>>>>>> Code Cleanup done; Test code updated to add performance measurement
 
 
 # --Object file rules --
@@ -190,7 +292,11 @@ $(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.cc
 	$(CXX) $(CFLAGS) -c $< -o $@
 
 test_%_blis.o: test_%.cc
+<<<<<<< HEAD
 	@$(CXX) $(CFLAGS) -c $< -o $@
+=======
+	$(CXX) $(CFLAGS) -c $< -o $@
+>>>>>>> Code Cleanup done; Test code updated to add performance measurement
 
 
 # -- Executable file rules --
diff --git a/testcpp/test.sh b/testcpp/test.sh
index 6d06c867b..27c6c53f3 100644
--- a/testcpp/test.sh
+++ b/testcpp/test.sh
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 
 echo Build BLIS CPP Template tests
 make clean
@@ -44,3 +45,37 @@ echo Run tests
 ./test_trmm_blis.x
 ./test_trsm_blis.x
 ./test_trsv_blis.x
+=======
+CWD=$(pwd)
+echo $CWD
+make clean
+make blis CFLAGS+="-DFLOAT"
+numactl -C 1 ./test_gemm1_blis.x
+make clean
+make blis CFLAGS+="-DDOUBLE"
+numactl -C 1 ./test_gemm1_blis.x
+make clean
+make blis CFLAGS+="-DSCOMPLEX"
+numactl -C 1 ./test_gemm1_blis.x
+make clean
+make blis CFLAGS+="-DDCOMPLEX"
+numactl -C 1 ./test_gemm1_blis.x
+
+
+cd ../test/
+CWD=$(pwd)
+echo $CWD
+make clean
+make blis CFLAGS+="-DFLOAT"
+numactl -C 1 ./test_gemm_blis.x
+make clean
+make blis CFLAGS+="-DDOUBLE"
+numactl -C 1 ./test_gemm_blis.x
+make clean
+make blis CFLAGS+="-DSCOMPLEX"
+numactl -C 1 ./test_gemm_blis.x
+make clean
+make blis CFLAGS+="-DDCOMPLEX"
+numactl -C 1 ./test_gemm_blis.x
+
+>>>>>>> Code Cleanup done; Test code updated to add performance measurement
diff --git a/testcpp/test_gemm.cc b/testcpp/test_gemm.cc
index 2fe6e55a7..12d235e13 100644
--- a/testcpp/test_gemm.cc
+++ b/testcpp/test_gemm.cc
@@ -33,8 +33,14 @@
 
 #include <complex>
 #include <iostream>
+<<<<<<< HEAD
 #include "blis.hh"
 #include "test.hh"
+=======
+#include <string.h>
+#include <unistd.h>
+#include "blis.hh"
+>>>>>>> Code Cleanup done; Test code updated to add performance measurement
 
 using namespace blis;
 using namespace std;
@@ -154,10 +160,88 @@ void test_gemm(  )
 // -----------------------------------------------------------------------------
 int main( int argc, char** argv )
 {
+<<<<<<< HEAD
     test_gemm<double>( );
     test_gemm<float>( );
     test_gemm<complex<float>>( );
     test_gemm<complex<double>>( );
     return 0;
+=======
+	int M, N, K, lda, ldb, ldc;
+	double a_d[DIM * DIM] = { 1.111, 2.222, 3.333, 4.444 };
+	double b_d[DIM * DIM] = { 5.555, 6.666, 7.777, 8.888 };
+	double c_d[DIM * DIM];
+	double alpha_d, beta_d;
+	float a_f[DIM * DIM] = { 1.1, 2.2, 3.3, 4.4 };
+	float b_f[DIM * DIM] = { 5.5, 6.6, 7.7, 8.8 };
+	float c_f[DIM * DIM];
+	float alpha_f, beta_f;
+	std::complex<float> a_c[DIM * DIM]={{1, 2},{3, 4},{5,6},{7,8}};
+	std::complex<float> b_c[DIM * DIM]={{1, 2},{3, 4},{5,6},{7,8}};
+	std::complex<float> c_c[DIM * DIM];
+	std::complex<float> alpha_c, beta_c;
+	std::complex<double> a_z[DIM * DIM]={{1.1, 2.2},{3.3, 4.4},{5.5,6.6},{7.7,8.8}};
+	std::complex<double> b_z[DIM * DIM]={{1.1, 2.2},{3.3, 4.4},{5.5,6.6},{7.7,8.8}};
+	std::complex<double> c_z[DIM * DIM];
+	std::complex<double> alpha_z, beta_z;
+	M = DIM;
+	N = M;
+	K = M;
+	lda = M;
+	ldb = K;
+	ldc = M;
+	alpha_d = 1.0;
+	beta_d = 0.0;
+	alpha_f = 1.0;
+	beta_f = 0.0;
+	alpha_c = {1.0,1.0};
+	beta_c = {0.0,0.0};
+	alpha_z = {1.0,1.0};
+	beta_z = {0.0,0.0};
+
+	/*cblis_sgemm*/	
+	cout<<"a_f= \n";
+	print_matrix<float>(a_f , M , K);
+	cout<<"b_f= \n";
+	print_matrix<float>(b_f , K , N);
+	blis::gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha_f, a_f,
+			lda, b_f, ldb, beta_f, c_f, ldc);
+	cout<<"c_f= \n";
+	print_matrix<float>(c_f , M , N);
+
+
+	/*cblis_dgemm*/	
+	printf("a_d = \n");
+	print_matrix<double>(a_d , M , K);
+	printf("b_d = \n");
+	print_matrix<double>(b_d , K , N);
+	blis::gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha_d, a_d,
+			lda, b_d, ldb, beta_d, c_d, ldc);
+	printf("c_d = \n");
+	print_matrix<double>(c_d , M , N);
+
+
+	/*cblis_cgemm*/	
+	printf("a_c = \n");
+	print_matrix<std::complex<float>>(a_c , M , K);
+	printf("b_c = \n");
+	print_matrix<std::complex<float>>(b_c , K , N);
+	blis::gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha_c, a_c,
+			lda, b_c, ldb, beta_c, c_c, ldc);
+	printf("c_c = \n");
+	print_matrix<std::complex<float>>(c_c , M , N);
+
+
+	/*cblis_zgemm*/	
+	printf("a_z = \n");
+	print_matrix<std::complex<double>>(a_z , M , K);
+	printf("b_z = \n");
+	print_matrix<std::complex<double>>(b_z , K , N);
+	blis::gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha_z, a_z,
+			lda, b_z, ldb, beta_z, c_z, ldc);
+	printf("c_z = \n");
+	print_matrix<std::complex<double>>(c_z , M , N);
+	return 0;
+>>>>>>> Code Cleanup done; Test code updated to add performance measurement
 
 }
diff --git a/testcpp/test_gemm1.cc b/testcpp/test_gemm1.cc
new file mode 100644
index 000000000..c428ebbe7
--- /dev/null
+++ b/testcpp/test_gemm1.cc
@@ -0,0 +1,421 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <complex>
+
+#include <stdio.h>
+#include <iostream>
+#include <string.h>
+#include <unistd.h>
+#include "blis.hh"
+using namespace std;
+
+
+//#define FILE_IN_OUT      // File based input matrix dimensions
+
+
+#define PRINT
+
+int main( int argc, char** argv )
+{
+	obj_t a, b, c;
+	obj_t c_save;
+	obj_t alpha, beta;
+	dim_t m, n, k;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int   m_input, n_input, k_input;
+	num_t dt;
+	int   r, n_repeats;
+	trans_t  transa;
+	trans_t  transb;
+//	f77_char f77_transa;
+//	f77_char f77_transb;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+
+#ifdef FILE_IN_OUT
+	FILE* fin  = NULL;
+	FILE* fout = NULL;
+	char gemm = 's';
+
+#endif
+
+	//bli_init();
+
+	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+	n_repeats = 100;
+
+#ifndef PRINT
+	p_begin = 200;
+	p_end   = 2000;
+	p_inc   = 200;
+
+	m_input = -1;
+	n_input = -1;
+	k_input = -1;
+#else
+	p_begin = 16;
+	p_end   = 16;
+	p_inc   = 1;
+
+	m_input = 5;
+	k_input = 6;
+	n_input = 4;
+#endif
+
+#ifdef FLOAT
+	dt = BLIS_FLOAT;
+#elif defined DOUBLE
+	dt = BLIS_DOUBLE;
+#elif defined SCOMPLEX
+	dt = BLIS_SCOMPLEX;
+#elif defined DCOMPLEX
+	dt = BLIS_DCOMPLEX;
+#endif
+
+	transa = BLIS_NO_TRANSPOSE;
+	transb = BLIS_NO_TRANSPOSE;
+
+//	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
+//	bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
+
+
+#ifdef FILE_IN_OUT
+	if (argc < 3)
+	  {
+	    printf("Usage: ./test_gemm_XX.x input.csv output.csv\n");
+	    exit(1);
+	  }
+	fin = fopen(argv[1], "r");
+	if (fin == NULL)
+	  {
+	    printf("Error opening the file %s\n", argv[1]);
+	    exit(1);
+	  }
+	fout = fopen(argv[2], "w");
+	if (fout == NULL)
+	  {
+	    printf("Error opening output file %s\n", argv[2]);
+	    exit(1);
+	  }
+	fprintf(fout, "m\t k\t n\t cs_a\t cs_b\t cs_c\t gflops\t GEMM_Algo\n");
+
+
+	printf("~~~~~~~~~~_BLAS\t m\t k\t n\t cs_a\t cs_b\t cs_c \t gflops\t GEMM_Algo\n");
+
+	inc_t cs_a;
+	inc_t cs_b;
+	inc_t cs_c;
+
+	while (fscanf(fin, "%ld %ld %ld %ld %ld %ld\n", &m, &k, &n, &cs_a, &cs_b, &cs_c) == 6)
+	  {
+	    if ((m > cs_a) || (k > cs_b) || (m > cs_c)) continue; // leading dimension should be greater than number of rows
+	    
+	    bli_obj_create( dt, 1, 1, 0, 0, &alpha);
+	    bli_obj_create( dt, 1, 1, 0, 0, &beta );
+
+	    bli_obj_create( dt, m, k, 1, cs_a, &a );
+	    bli_obj_create( dt, k, n, 1, cs_b, &b );
+	    bli_obj_create( dt, m, n, 1, cs_c, &c );
+	    bli_obj_create( dt, m, n, 1, cs_c, &c_save );
+
+	    bli_obj_set_conjtrans( transa, &a);
+	    bli_obj_set_conjtrans( transb, &b);
+
+	    //bli_setsc( 0.0, -1, &alpha );
+	    //bli_setsc( 0.0, 1, &beta );
+
+	    bli_setsc( -1, 0.0, &alpha );
+	    bli_setsc( 1, 0.0, &beta );
+
+	    //	    printf("%1.1f %1.1f\n", *((double *)bli_obj_buffer_for_const(BLIS_FLOAT, &alpha)), *((double *)bli_obj_buffer_for_const(BLIS_FLOAT, &beta)));
+
+#else
+	for ( p = p_begin; p <= p_end; p += p_inc )
+	{
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else               m =     ( dim_t )    m_input;
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else               n =     ( dim_t )    n_input;
+		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
+		else               k =     ( dim_t )    k_input;
+
+		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt, 1, 1, 0, 0, &beta );
+
+		bli_obj_create( dt, m, k, 0, 0, &a );
+		bli_obj_create( dt, k, n, 0, 0, &b );
+		bli_obj_create( dt, m, n, 0, 0, &c );
+		bli_obj_create( dt, m, n, 0, 0, &c_save );
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+
+		bli_obj_set_conjtrans( transa, &a );
+		bli_obj_set_conjtrans( transb, &b );
+
+		bli_setsc(  (0.9/1.0), 0.2, &alpha );
+		bli_setsc( -(1.1/1.0), 0.3, &beta );
+#endif
+
+		bli_copym( &c, &c_save );
+	
+		dtime_save = DBL_MAX;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+
+			dtime = bli_clock();
+
+
+#ifdef PRINT
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_printm( "b", &b, "%4.1f", "" );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#if 0
+
+			bli_gemm( &alpha,
+			          &a,
+			          &b,
+			          &beta,
+			          &c );
+
+#else
+
+		if ( bli_is_float( dt ) )
+		{
+			int  M     = bli_obj_length( &c );
+			int  K     = bli_obj_width_after_trans( &a );
+			int  N     = bli_obj_width( &c );
+			int  lda    = bli_obj_col_stride( &a );
+			int  ldb    = bli_obj_col_stride( &b );
+			int  ldc    = bli_obj_col_stride( &c );
+			float*   alphap = (float *)bli_obj_buffer( &alpha );
+			float*   ap     = (float *)bli_obj_buffer( &a );
+			float*   bp     = (float *)bli_obj_buffer( &b );
+			float*   betap  = (float *)bli_obj_buffer( &beta );
+			float*   cp     = (float *)bli_obj_buffer( &c );
+		
+		        blis::gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, *alphap, ap,
+                		        lda, bp, ldb, *betap, cp, ldc);
+		
+#if 0
+			sgemm_( &f77_transa,
+			        &f77_transb,
+			        &mm,
+			        &nn,
+			        &kk,
+			        alphap,
+			        ap, &lda,
+			        bp, &ldb,
+			        betap,
+			        cp, &ldc );
+#endif
+		}
+		else if ( bli_is_double( dt ) )
+		{
+			int  M     = bli_obj_length( &c );
+			int  K     = bli_obj_width_after_trans( &a );
+			int  N     = bli_obj_width( &c );
+			int  lda    = bli_obj_col_stride( &a );
+			int  ldb    = bli_obj_col_stride( &b );
+			int  ldc    = bli_obj_col_stride( &c );
+			double*  alphap = (double*)bli_obj_buffer( &alpha );
+			double*  ap     = (double*)bli_obj_buffer( &a );
+			double*  bp     = (double*)bli_obj_buffer( &b );
+			double*  betap  = (double*)bli_obj_buffer( &beta );
+			double*  cp     = (double*)bli_obj_buffer( &c );
+
+		        blis::gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, *alphap, ap,
+                		        lda, bp, ldb, *betap, cp, ldc);
+#if 0
+			dgemm_( &f77_transa,
+			        &f77_transb,
+			        &mm,
+			        &nn,
+			        &kk,
+			        alphap,
+			        ap, &lda,
+			        bp, &ldb,
+			        betap,
+			        cp, &ldc );
+#endif
+		}
+		else if ( bli_is_scomplex( dt ) )
+		{
+			int  M     = bli_obj_length( &c );
+			int  K     = bli_obj_width_after_trans( &a );
+			int  N     = bli_obj_width( &c );
+			int  lda    = bli_obj_col_stride( &a );
+			int  ldb    = bli_obj_col_stride( &b );
+			int  ldc    = bli_obj_col_stride( &c );
+			std::complex<float>*  alphap = (std::complex<float>*)bli_obj_buffer( &alpha );
+			std::complex<float>*  ap     = (std::complex<float>*)bli_obj_buffer( &a );
+			std::complex<float>*  bp     = (std::complex<float>*)bli_obj_buffer( &b );
+			std::complex<float>*  betap  = (std::complex<float>*)bli_obj_buffer( &beta );
+			std::complex<float>*  cp     = (std::complex<float>*)bli_obj_buffer( &c );
+
+		        blis::gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, *alphap, ap,
+                		        lda, bp, ldb, *betap, cp, ldc);
+#if 0
+			cgemm_( &f77_transa,
+			        &f77_transb,
+			        &mm,
+			        &nn,
+			        &kk,
+			        alphap,
+			        ap, &lda,
+			        bp, &ldb,
+			        betap,
+			        cp, &ldc );
+#endif
+		}
+		else if ( bli_is_dcomplex( dt ) )
+		{
+			f77_int  M     = bli_obj_length( &c );
+			f77_int  K     = bli_obj_width_after_trans( &a );
+			f77_int  N     = bli_obj_width( &c );
+			f77_int  lda    = bli_obj_col_stride( &a );
+			f77_int  ldb    = bli_obj_col_stride( &b );
+			f77_int  ldc    = bli_obj_col_stride( &c );
+			std::complex<double>*  alphap = (std::complex<double>*)bli_obj_buffer( &alpha );
+			std::complex<double>*  ap     = (std::complex<double>*)bli_obj_buffer( &a );
+			std::complex<double>*  bp     = (std::complex<double>*)bli_obj_buffer( &b );
+			std::complex<double>*  betap  = (std::complex<double>*)bli_obj_buffer( &beta );
+			std::complex<double>*  cp     = (std::complex<double>*)bli_obj_buffer( &c );
+
+		        blis::gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, *alphap, ap,
+                		        lda, bp, ldb, *betap, cp, ldc);
+#if 0
+			zgemm_( &f77_transa,
+			        &f77_transb,
+			        &mm,
+			        &nn,
+			        &kk,
+			        alphap,
+			        ap, &lda,
+			        bp, &ldb,
+			        betap,
+			        cp, &ldc );
+#endif
+		}
+#endif
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%4.1f", "" );
+			exit(1);
+#endif
+
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
+
+		if ( bli_is_complex( dt ) ) gflops *= 4.0;
+
+#ifdef BLIS
+		printf( "data_gemm_blis" );
+#else
+		//printf( "data_gemm_%s", BLAS );
+#endif
+
+
+#ifdef FILE_IN_OUT
+
+		if ( bli_is_double( dt ) ) {
+
+		  if (((m * n) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES/4))  || ((m  < (BLIS_SMALL_M_RECT_MATRIX_THRES/2) ) && (k < (BLIS_SMALL_K_RECT_MATRIX_THRES/2) )))
+		    gemm = 'S';      // small gemm
+		  else gemm = 'N';   // Normal blis gemm
+		  
+		}
+		else if (bli_is_float( dt )) {
+		  if (((m * n) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES))  || ((m  < BLIS_SMALL_M_RECT_MATRIX_THRES) && (k < BLIS_SMALL_K_RECT_MATRIX_THRES)))
+		    gemm = 'S';    // small gemm
+		  else gemm = 'N'; // normal blis gemm
+		}
+		
+		
+
+		printf("%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f \t %c\n", \
+			( unsigned long )m,
+		        ( unsigned long )k,
+		       ( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c,  gflops, gemm );
+		
+		
+		fprintf(fout, "%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f \t %c\n", \
+			( unsigned long )m,
+		        ( unsigned long )k,
+		        ( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c,  gflops, gemm );
+		fflush(fout);
+
+#else
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )k,
+		        ( unsigned long )n, gflops );
+		
+
+#endif
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	//bli_finalize();
+
+#ifdef FILE_IN_OUT
+	fclose(fin);
+	fclose(fout);
+#endif
+
+	return 0;
+}
+