Optimised dotv kernel by SIMD approach and by removing framework overhead

Details: - For the complex float and complex double precisions, the kernel is called directly from the BLAS API call to avoid framework overhead. - Added SIMD code for complex float and complex double, with the for loop unrolled 5 times, to improve performance. AMD-Internal: [CPUPL-1057] Change-Id: I3b9d202398cacc0168882c9d6da2b450c27466a0
2026-04-20 07:38:53 +00:00 · 2020-10-06 20:08:12 +05:30
parent 1c6cf5c891
commit 5716dd8cf9
7 changed files with 1383 additions and 457 deletions
--- a/config/zen/bli_cntx_init_zen.c
+++ b/config/zen/bli_cntx_init_zen.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -82,7 +82,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	// Update the context with optimized level-1v kernels.
 	bli_cntx_set_l1v_kers
 	(
-	  16,
+	  18,
 #if 1
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
@@ -99,6 +99,9 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	  // dotv
 	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int,
 	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int,
+	  BLIS_DOTV_KER,   BLIS_SCOMPLEX, bli_cdotv_zen_int5,
+	  BLIS_DOTV_KER,   BLIS_DCOMPLEX, bli_zdotv_zen_int5,
+
 	  // dotxv
 	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
 	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
--- a/config/zen2/bli_cntx_init_zen2.c
+++ b/config/zen2/bli_cntx_init_zen2.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -88,7 +88,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	// Update the context with optimized level-1v kernels.
 	bli_cntx_set_l1v_kers
 	(
-	  16,
+	  18,
 #if 1
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
@@ -103,6 +103,8 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  // dotv
 	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int10,
 	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int10,
+	  BLIS_DOTV_KER,   BLIS_SCOMPLEX, bli_cdotv_zen_int5,
+	  BLIS_DOTV_KER,   BLIS_DCOMPLEX, bli_zdotv_zen_int5,

 	  // dotxv
 	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
--- a/config/zen3/bli_cntx_init_zen3.c
+++ b/config/zen3/bli_cntx_init_zen3.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -88,7 +88,7 @@ void bli_cntx_init_zen3( cntx_t* cntx )
    // Update the context with optimized level-1v kernels.
    bli_cntx_set_l1v_kers
    (
-      16,
+      18,
 #if 1
      // amaxv
      BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
@@ -103,6 +103,8 @@ void bli_cntx_init_zen3( cntx_t* cntx )
      // dotv
      BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int10,
      BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int10,
+      BLIS_DOTV_KER,   BLIS_SCOMPLEX, bli_cdotv_zen_int5,
+      BLIS_DOTV_KER,   BLIS_DCOMPLEX, bli_zdotv_zen_int5,

      // dotxv
      BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
--- a/frame/compat/bla_dot.c
+++ b/frame/compat/bla_dot.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -49,41 +49,41 @@ ftype PASTEF772(ch,blasname,chc) \
       const ftype*   y, const f77_int* incy  \
     ) \
 { \
-	dim_t  n0; \
-	ftype* x0; \
-	ftype* y0; \
-	inc_t  incx0; \
-	inc_t  incy0; \
-	ftype  rho; \
+    dim_t  n0; \
+    ftype* x0; \
+    ftype* y0; \
+    inc_t  incx0; \
+    inc_t  incy0; \
+    ftype  rho; \
 \
-	/* Initialize BLIS. */ \
-	bli_init_auto(); \
+    /* Initialize BLIS. */ \
+    bli_init_auto(); \
 \
-	/* Convert/typecast negative values of n to zero. */ \
-	bli_convert_blas_dim1( *n, n0 ); \
+    /* Convert/typecast negative values of n to zero. */ \
+    bli_convert_blas_dim1( *n, n0 ); \
 \
-	/* If the input increments are negative, adjust the pointers so we can
-	   use positive increments instead. */ \
-	bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
-	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */ \
+    bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
+    bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
 \
-	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
-	( \
-	  blis_conjx, \
-	  BLIS_NO_CONJUGATE, \
-	  n0, \
-	  x0, incx0, \
-	  y0, incy0, \
-	  &rho, \
-	  NULL, \
-	  NULL  \
-	); \
+    /* Call BLIS interface. */ \
+    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+    ( \
+      blis_conjx, \
+      BLIS_NO_CONJUGATE, \
+      n0, \
+      x0, incx0, \
+      y0, incy0, \
+      &rho, \
+      NULL, \
+      NULL  \
+    ); \
 \
-	/* Finalize BLIS. */ \
-	bli_finalize_auto(); \
+    /* Finalize BLIS. */ \
+    bli_finalize_auto(); \
 \
-	return rho; \
+    return rho; \
 }

 #ifdef BLIS_ENABLE_BLAS
@@ -96,41 +96,120 @@ dcomplex zdotc_
       const dcomplex*   y, const f77_int* incy
     )
 {
-	dim_t  n0;
-	dcomplex* x0;
-	dcomplex* y0;
-	inc_t  incx0;
-	inc_t  incy0;
-	dcomplex  rho;
+    dim_t  n0;
+    dcomplex* x0;
+    dcomplex* y0;
+    inc_t  incx0;
+    inc_t  incy0;
+    dcomplex  rho;

-	/* Initialize BLIS. */
-	bli_init_auto();
+    /* Initialize BLIS. */
+    bli_init_auto();

-	/* Convert/typecast negative values of n to zero. */
-	bli_convert_blas_dim1( *n, n0 );
+    /* Convert/typecast negative values of n to zero. */
+    bli_convert_blas_dim1( *n, n0 );

-	/* If the input increments are negative, adjust the pointers so we can
-	   use positive increments instead. */
-	bli_convert_blas_incv( n0, (dcomplex*)x, *incx, x0, incx0 );
-	bli_convert_blas_incv( n0, (dcomplex*)y, *incy, y0, incy0 );
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */
+    bli_convert_blas_incv( n0, (dcomplex*)x, *incx, x0, incx0 );
+    bli_convert_blas_incv( n0, (dcomplex*)y, *incy, y0, incy0 );

-	/* Call BLIS interface. */
-	PASTEMAC2(z,dotv,_ex)
-	(
-	  BLIS_CONJUGATE,
-	  BLIS_NO_CONJUGATE,
-	  n0,
-	  x0, incx0,
-	  y0, incy0,
-	  &rho,
-	  NULL,
-	  NULL
-	);
+    /* Call BLIS interface. */
+    PASTEMAC2(z,dotv,_ex)
+    (
+      BLIS_CONJUGATE,
+      BLIS_NO_CONJUGATE,
+      n0,
+      x0, incx0,
+      y0, incy0,
+      &rho,
+      NULL,
+      NULL
+    );

-	/* Finalize BLIS. */
-	bli_finalize_auto();
+    /* Finalize BLIS. */
+    bli_finalize_auto();
    *ret_val = rho;
-	return rho;
+    return rho;
+}
+#else
+dcomplex zdotc_
+     (
+       const f77_int* n,
+       const dcomplex*   x, const f77_int* incx,
+       const dcomplex*   y, const f77_int* incy
+     )
+{
+    dim_t  n0;
+    dcomplex* x0;
+    dcomplex* y0;
+    inc_t  incx0;
+    inc_t  incy0;
+    dcomplex  rho;
+    /* BLAS zdotc_ wrapper: returns rho as produced by bli_zdotv_zen_int5, called with BLIS_CONJUGATE for the first operand. */
+    /* BLIS auto-initialization is skipped: the call below is left commented out and the kernel is invoked directly. */
+//  bli_init_auto();
+
+    /* Clamp a negative n to zero; otherwise cast it to dim_t. */
+    if ( *n < 0 ) n0 = ( dim_t )0;
+    else              n0 = ( dim_t )(*n);
+
+    /* For a negative increment, start from the element at offset
+       (n0-1)*(-inc); the increment is passed on unchanged (still negative). */
+
+    if ( *incx < 0 )
+    {
+        /* The semantics of negative stride in BLAS are that the vector
+        operand be traversed in reverse order. (Another way to think
+        of this is that negative strides effectively reverse the order
+        of the vector, but without any explicit data movements.) This
+        is also how BLIS interprets negative strides. The difference
+        is that with BLAS, the caller *always* passes in the 0th (i.e.,
+        top-most or left-most) element of the vector, even when the
+        stride is negative. By contrast, in BLIS, negative strides are
+        used *relative* to the vector address as it is given. Thus, in
+        BLIS, if this backwards traversal is desired, the caller *must*
+        pass in the address to the (n-1)th (i.e., the bottom-most or
+        right-most) element along with a negative stride. */
+
+        x0    = ((dcomplex*)x) + (n0-1)*(-*incx);
+        incx0 = ( inc_t )(*incx);
+        /* NOTE(review): when n0 is 0, (n0-1) does not index a valid element; confirm the kernel never reads through x0 then. */
+    }
+    else
+    {
+        x0    = ((dcomplex*)x);
+        incx0 = ( inc_t )(*incx);
+    }
+
+    if ( *incy < 0 )
+    {
+        y0    = ((dcomplex*)y) + (n0-1)*(-*incy);
+        incy0 = ( inc_t )(*incy);
+        /* Same reverse-traversal adjustment as applied to x; the n0 == 0 caveat applies here too. */
+    }
+    else
+    {
+        y0    = ((dcomplex*)y);
+        incy0 = ( inc_t )(*incy);
+    }
+
+    /* Direct kernel call: BLIS_CONJUGATE for the first operand, BLIS_NO_CONJUGATE for the second; rho is passed by address. */
+    bli_zdotv_zen_int5
+    (
+      BLIS_CONJUGATE,
+      BLIS_NO_CONJUGATE,
+      n0,
+      x0, incx0,
+      y0, incy0,
+      &rho,
+      NULL /* last kernel argument left NULL - presumably the context; confirm against DOTV_KER_PROT */
+    );
+
+    /* No finalization either: the call below is commented out, matching the disabled initialization. */
+//  bli_finalize_auto();
+
+    return rho;
 }
 #endif

@@ -157,8 +236,8 @@ float sdot_
    if ( *n < 0 ) n0 = ( dim_t )0;
    else              n0 = ( dim_t )(*n);

-	/* If the input increments are negative, adjust the pointers so we can
-	   use positive increments instead. */ 
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */

    if ( *incx < 0 )
    {
@@ -197,22 +276,22 @@ float sdot_
        incy0 = ( inc_t )(*incy);
    }

-	/* Call BLIS kernel. */
-	bli_sdotv_zen_int10
-	(
-	  BLIS_NO_CONJUGATE,
-	  BLIS_NO_CONJUGATE,
-	  n0,
-	  x0, incx0,
-	  y0, incy0,
-	  &rho,
-	  NULL
-	);
+    /* Call BLIS kernel. */
+    bli_sdotv_zen_int10
+    (
+      BLIS_NO_CONJUGATE,
+      BLIS_NO_CONJUGATE,
+      n0,
+      x0, incx0,
+      y0, incy0,
+      &rho,
+      NULL
+    );

-	/* Finalize BLIS. */
-//	bli_finalize_auto();
+    /* Finalize BLIS. */
+//  bli_finalize_auto();

-	return rho;
+    return rho;
 }

 double ddot_
@@ -222,22 +301,22 @@ double ddot_
       const double*   y, const f77_int* incy
     )
 {
-	dim_t  n0;
-	double* x0;
-	double* y0;
-	inc_t  incx0;
-	inc_t  incy0;
-	double  rho;
+    dim_t  n0;
+    double* x0;
+    double* y0;
+    inc_t  incx0;
+    inc_t  incy0;
+    double  rho;

-	/* Initialize BLIS. */
-//	bli_init_auto();
+    /* Initialize BLIS. */
+//  bli_init_auto();

-	/* Convert/typecast negative values of n to zero. */
+    /* Convert/typecast negative values of n to zero. */
    if ( *n < 0 ) n0 = ( dim_t )0;
    else              n0 = ( dim_t )(*n);

-	/* If the input increments are negative, adjust the pointers so we can
-	   use positive increments instead. */ 
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */

    if ( *incx < 0 )
    {
@@ -276,28 +355,261 @@ double ddot_
        incy0 = ( inc_t )(*incy);
    }

-	/* Call BLIS kernel. */
-	bli_ddotv_zen_int10
-	(
-	  BLIS_NO_CONJUGATE,
-	  BLIS_NO_CONJUGATE,
-	  n0,
-	  x0, incx0,
-	  y0, incy0,
-	  &rho,
-	  NULL
-	);
+    /* Call BLIS kernel. */
+    bli_ddotv_zen_int10
+    (
+      BLIS_NO_CONJUGATE,
+      BLIS_NO_CONJUGATE,
+      n0,
+      x0, incx0,
+      y0, incy0,
+      &rho,
+      NULL
+    );

-	/* Finalize BLIS. */
-//	bli_finalize_auto();
+    /* Finalize BLIS. */
+//  bli_finalize_auto();

-	return rho;
+    return rho;
+}
+
+scomplex cdotu_
+     (
+       const f77_int* n,
+       const scomplex*   x, const f77_int* incx,
+       const scomplex*   y, const f77_int* incy
+     )
+{
+    dim_t  n0;
+    scomplex* x0;
+    scomplex* y0;
+    inc_t  incx0;
+    inc_t  incy0;
+    scomplex  rho;
+    /* BLAS cdotu_ wrapper: returns rho as produced by bli_cdotv_zen_int5, called with BLIS_NO_CONJUGATE for both operands. */
+    /* BLIS auto-initialization is skipped: the call below is left commented out and the kernel is invoked directly. */
+//  bli_init_auto();
+
+    /* Clamp a negative n to zero; otherwise cast it to dim_t. */
+    if ( *n < 0 ) n0 = ( dim_t )0;
+    else              n0 = ( dim_t )(*n);
+
+    /* For a negative increment, start from the element at offset
+       (n0-1)*(-inc); the increment is passed on unchanged (still negative). */
+
+    if ( *incx < 0 )
+    {
+        /* The semantics of negative stride in BLAS are that the vector
+        operand be traversed in reverse order. (Another way to think
+        of this is that negative strides effectively reverse the order
+        of the vector, but without any explicit data movements.) This
+        is also how BLIS interprets negative strides. The difference
+        is that with BLAS, the caller *always* passes in the 0th (i.e.,
+        top-most or left-most) element of the vector, even when the
+        stride is negative. By contrast, in BLIS, negative strides are
+        used *relative* to the vector address as it is given. Thus, in
+        BLIS, if this backwards traversal is desired, the caller *must*
+        pass in the address to the (n-1)th (i.e., the bottom-most or
+        right-most) element along with a negative stride. */
+
+        x0    = ((scomplex*)x) + (n0-1)*(-*incx);
+        incx0 = ( inc_t )(*incx);
+        /* NOTE(review): when n0 is 0, (n0-1) does not index a valid element; confirm the kernel never reads through x0 then. */
+    }
+    else
+    {
+        x0    = ((scomplex*)x);
+        incx0 = ( inc_t )(*incx);
+    }
+
+    if ( *incy < 0 )
+    {
+        y0    = ((scomplex*)y) + (n0-1)*(-*incy);
+        incy0 = ( inc_t )(*incy);
+        /* Same reverse-traversal adjustment as applied to x; the n0 == 0 caveat applies here too. */
+    }
+    else
+    {
+        y0    = ((scomplex*)y);
+        incy0 = ( inc_t )(*incy);
+    }
+
+    /* Direct kernel call: BLIS_NO_CONJUGATE for both operands; rho is passed by address. */
+    bli_cdotv_zen_int5
+    (
+      BLIS_NO_CONJUGATE,
+      BLIS_NO_CONJUGATE,
+      n0,
+      x0, incx0,
+      y0, incy0,
+      &rho,
+      NULL /* last kernel argument left NULL - presumably the context; confirm against DOTV_KER_PROT */
+    );
+
+    /* No finalization either: the call below is commented out, matching the disabled initialization. */
+//  bli_finalize_auto();
+
+    return rho;
+}
+
+dcomplex zdotu_
+     (
+       const f77_int* n,
+       const dcomplex*   x, const f77_int* incx,
+       const dcomplex*   y, const f77_int* incy
+     )
+{
+    dim_t  n0;
+    dcomplex* x0;
+    dcomplex* y0;
+    inc_t  incx0;
+    inc_t  incy0;
+    dcomplex  rho;
+    /* BLAS zdotu_ wrapper: returns rho as produced by bli_zdotv_zen_int5, called with BLIS_NO_CONJUGATE for both operands. */
+    /* BLIS auto-initialization is skipped: the call below is left commented out and the kernel is invoked directly. */
+//  bli_init_auto();
+
+    /* Clamp a negative n to zero; otherwise cast it to dim_t. */
+    if ( *n < 0 ) n0 = ( dim_t )0;
+    else              n0 = ( dim_t )(*n);
+
+    /* For a negative increment, start from the element at offset
+       (n0-1)*(-inc); the increment is passed on unchanged (still negative). */
+
+    if ( *incx < 0 )
+    {
+        /* The semantics of negative stride in BLAS are that the vector
+        operand be traversed in reverse order. (Another way to think
+        of this is that negative strides effectively reverse the order
+        of the vector, but without any explicit data movements.) This
+        is also how BLIS interprets negative strides. The difference
+        is that with BLAS, the caller *always* passes in the 0th (i.e.,
+        top-most or left-most) element of the vector, even when the
+        stride is negative. By contrast, in BLIS, negative strides are
+        used *relative* to the vector address as it is given. Thus, in
+        BLIS, if this backwards traversal is desired, the caller *must*
+        pass in the address to the (n-1)th (i.e., the bottom-most or
+        right-most) element along with a negative stride. */
+
+        x0    = ((dcomplex*)x) + (n0-1)*(-*incx);
+        incx0 = ( inc_t )(*incx);
+        /* NOTE(review): when n0 is 0, (n0-1) does not index a valid element; confirm the kernel never reads through x0 then. */
+    }
+    else
+    {
+        x0    = ((dcomplex*)x);
+        incx0 = ( inc_t )(*incx);
+    }
+
+    if ( *incy < 0 )
+    {
+        y0    = ((dcomplex*)y) + (n0-1)*(-*incy);
+        incy0 = ( inc_t )(*incy);
+        /* Same reverse-traversal adjustment as applied to x; the n0 == 0 caveat applies here too. */
+    }
+    else
+    {
+        y0    = ((dcomplex*)y);
+        incy0 = ( inc_t )(*incy);
+    }
+
+    /* Direct kernel call: BLIS_NO_CONJUGATE for both operands; rho is passed by address. */
+    bli_zdotv_zen_int5
+    (
+      BLIS_NO_CONJUGATE,
+      BLIS_NO_CONJUGATE,
+      n0,
+      x0, incx0,
+      y0, incy0,
+      &rho,
+      NULL /* last kernel argument left NULL - presumably the context; confirm against DOTV_KER_PROT */
+    );
+
+    /* No finalization either: the call below is commented out, matching the disabled initialization. */
+//  bli_finalize_auto();
+
+    return rho;
+}
+
+
+scomplex cdotc_
+     (
+       const f77_int* n,
+       const scomplex*   x, const f77_int* incx,
+       const scomplex*   y, const f77_int* incy
+     )
+{
+    dim_t  n0;
+    scomplex* x0;
+    scomplex* y0;
+    inc_t  incx0;
+    inc_t  incy0;
+    scomplex  rho;
+    /* BLAS cdotc_ wrapper: returns rho as produced by bli_cdotv_zen_int5, called with BLIS_CONJUGATE for the first operand. */
+    /* BLIS auto-initialization is skipped: the call below is left commented out and the kernel is invoked directly. */
+//  bli_init_auto();
+
+    /* Clamp a negative n to zero; otherwise cast it to dim_t. */
+    if ( *n < 0 ) n0 = ( dim_t )0;
+    else              n0 = ( dim_t )(*n);
+
+    /* For a negative increment, start from the element at offset
+       (n0-1)*(-inc); the increment is passed on unchanged (still negative). */
+
+    if ( *incx < 0 )
+    {
+        /* The semantics of negative stride in BLAS are that the vector
+        operand be traversed in reverse order. (Another way to think
+        of this is that negative strides effectively reverse the order
+        of the vector, but without any explicit data movements.) This
+        is also how BLIS interprets negative strides. The difference
+        is that with BLAS, the caller *always* passes in the 0th (i.e.,
+        top-most or left-most) element of the vector, even when the
+        stride is negative. By contrast, in BLIS, negative strides are
+        used *relative* to the vector address as it is given. Thus, in
+        BLIS, if this backwards traversal is desired, the caller *must*
+        pass in the address to the (n-1)th (i.e., the bottom-most or
+        right-most) element along with a negative stride. */
+
+        x0    = ((scomplex*)x) + (n0-1)*(-*incx);
+        incx0 = ( inc_t )(*incx);
+        /* NOTE(review): when n0 is 0, (n0-1) does not index a valid element; confirm the kernel never reads through x0 then. */
+    }
+    else
+    {
+        x0    = ((scomplex*)x);
+        incx0 = ( inc_t )(*incx);
+    }
+
+    if ( *incy < 0 )
+    {
+        y0    = ((scomplex*)y) + (n0-1)*(-*incy);
+        incy0 = ( inc_t )(*incy);
+        /* Same reverse-traversal adjustment as applied to x; the n0 == 0 caveat applies here too. */
+    }
+    else
+    {
+        y0    = ((scomplex*)y);
+        incy0 = ( inc_t )(*incy);
+    }
+
+    /* Direct kernel call: BLIS_CONJUGATE for the first operand, BLIS_NO_CONJUGATE for the second; rho is passed by address. */
+    bli_cdotv_zen_int5
+    (
+      BLIS_CONJUGATE,
+      BLIS_NO_CONJUGATE,
+      n0,
+      x0, incx0,
+      y0, incy0,
+      &rho,
+      NULL /* last kernel argument left NULL - presumably the context; confirm against DOTV_KER_PROT */
+    );
+
+    /* No finalization either: the call below is commented out, matching the disabled initialization. */
+//  bli_finalize_auto();
+
+    return rho;
 }
-#ifdef AOCL_F2C
-INSERT_GENTFUNCDOT_BLAS_CZ_F2C( dot, dotv)
-#else
-INSERT_GENTFUNCDOT_BLAS_CZ( dot, dotv )
-#endif
 #else
 #ifdef AOCL_F2C
 INSERT_GENTFUNCDOT_BLAS_SDC( dot, dotv )
@@ -318,16 +630,16 @@ float PASTEF77(sd,sdot)
       const float*   y, const f77_int* incy
     )
 {
-	return ( float )
-	       (
-	         ( double )(*sb) +
-	         PASTEF77(d,sdot)
-	         (
-	           n,
-	           x, incx,
-	           y, incy
-	         )
-	       );
+    return ( float )
+           (
+             ( double )(*sb) +
+             PASTEF77(d,sdot)
+             (
+               n,
+               x, incx,
+               y, incy
+             )
+           );
 }

 // Input vectors stored in single precision, computed in double precision,
@@ -339,39 +651,39 @@ double PASTEF77(d,sdot)
       const float*   y, const f77_int* incy
     )
 {
-	dim_t   n0;
-	float*  x0;
-	float*  y0;
-	inc_t   incx0;
-	inc_t   incy0;
-	double  rho;
-	dim_t   i;
+    dim_t   n0;
+    float*  x0;
+    float*  y0;
+    inc_t   incx0;
+    inc_t   incy0;
+    double  rho;
+    dim_t   i;

-	/* Initialization of BLIS is not required. */
+    /* Initialization of BLIS is not required. */

-	/* Convert/typecast negative values of n to zero. */
-	bli_convert_blas_dim1( *n, n0 );
+    /* Convert/typecast negative values of n to zero. */
+    bli_convert_blas_dim1( *n, n0 );

-	/* If the input increments are negative, adjust the pointers so we can
-	   use positive increments instead. */
-	bli_convert_blas_incv( n0, (float*)x, *incx, x0, incx0 );
-	bli_convert_blas_incv( n0, (float*)y, *incy, y0, incy0 );
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */
+    bli_convert_blas_incv( n0, (float*)x, *incx, x0, incx0 );
+    bli_convert_blas_incv( n0, (float*)y, *incy, y0, incy0 );

-	rho = 0.0;
+    rho = 0.0;

-	for ( i = 0; i < n0; i++ )
-	{
-		float* chi1 = x0 + (i  )*incx0;
-		float* psi1 = y0 + (i  )*incy0;
+    for ( i = 0; i < n0; i++ )
+    {
+        float* chi1 = x0 + (i  )*incx0;
+        float* psi1 = y0 + (i  )*incy0;

-		bli_ddots( (( double )(*chi1)),
-		           (( double )(*psi1)), rho );
-	}
+        bli_ddots( (( double )(*chi1)),
+                   (( double )(*psi1)), rho );
+    }

-	/* Finalization of BLIS is not required, because initialization was
-	   not required. */
+    /* Finalization of BLIS is not required, because initialization was
+       not required. */

-	return rho;
+    return rho;
 }

 #endif
--- a/kernels/zen/1/bli_dotv_zen_int10.c
+++ b/kernels/zen/1/bli_dotv_zen_int10.c
--- a/kernels/zen/bli_kernels_zen.h
+++ b/kernels/zen/bli_kernels_zen.h
@@ -58,9 +58,11 @@ AXPYV_KER_PROT( double,   d, axpyv_zen_int )
 DOTV_KER_PROT( float,    s, dotv_zen_int )
 DOTV_KER_PROT( double,   d, dotv_zen_int )

-	// dotv (intrinsics, unrolled x10)
-	DOTV_KER_PROT( float,    s, dotv_zen_int10 )
-	DOTV_KER_PROT( double,   d, dotv_zen_int10 )
+// dotv (intrinsics; s/d unrolled x10, c/z unrolled x5)
+DOTV_KER_PROT( float,    s, dotv_zen_int10 )
+DOTV_KER_PROT( double,   d, dotv_zen_int10 )
+DOTV_KER_PROT( scomplex,  c, dotv_zen_int5 )
+DOTV_KER_PROT( dcomplex,  z, dotv_zen_int5 )

 // dotxv (intrinsics)
 DOTXV_KER_PROT( float,    s, dotxv_zen_int )
--- a/kernels/zen2/.gitignore
+++ b/kernels/zen2/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore