diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h
new file mode 100644
index 000000000..7bf97d555
--- /dev/null
+++ b/kernels/armv8a/3/armv8a_asm_utils.h
@@ -0,0 +1,49 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+// Inline-asm label/branch string helpers. Apple's local label requirements: labels are prefixed with "L" there, and with "." on every other target.
+#if defined(__APPLE__)
+#define LABEL(str) "   L" #str": \n\t"   // Emits the label definition "L<str>:".
+#define BEQ(str) "b.eq L" #str"  \n\t"   // Emits "b.eq L<str>" (conditional branch to the label).
+#define BNE(str) "b.ne L" #str"  \n\t"   // Emits "b.ne L<str>" (conditional branch to the label).
+#define BRANCH(str) "b L" #str"  \n\t"   // Emits "b L<str>" (unconditional branch to the label).
+#else // !defined(__APPLE__): same helpers, but with "."-prefixed labels.
+#define LABEL(str) "   ." #str": \n\t"   // Emits the label definition ".<str>:".
+#define BEQ(str) "b.eq ." #str"  \n\t"   // Emits "b.eq .<str>".
+#define BNE(str) "b.ne ." #str"  \n\t"   // Emits "b.ne .<str>".
+#define BRANCH(str) "b ." #str"  \n\t"   // Emits "b .<str>".
+#endif // defined(__APPLE__)
+
diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
index c01c67f5a..251931f7c 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -34,6 +34,7 @@
 */
 
 #include "blis.h"
+#include "armv8a_asm_utils.h"
 
 /*
    o 4x4 Single precision micro-kernel fully functional.
@@ -155,7 +156,7 @@ __asm__ volatile
 " dup  v31.4s, wzr                           \n\t" // Vector for accummulating column 11
 "                                            \n\t"
 " cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
-" beq .SCONSIDERKLEFT                        \n\t"
+BEQ(SCONSIDERKLEFT)
 "                                            \n\t"
 " ldr q0, [x0]                               \n\t"
 " ldr q1, [x0, #16]                          \n\t" // Load a
@@ -168,9 +169,9 @@ __asm__ volatile
 " add x1, x1, #48                            \n\t" //update address of B
 "                                            \n\t"
 " cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one. 
-" beq .SLASTITER                             \n\t" // (as loop is do-while-like).
+BEQ(SLASTITER)                                     // (as loop is do-while-like).
 "                                            \n\t"
-" .SLOOPKITER:                               \n\t" // Body of the k_iter loop.
+LABEL(SLOOPKITER)                                  // Body of the k_iter loop.
 "                                            \n\t"
 " ldr q5, [x0]                               \n\t"
 " fmla v8.4s, v0.4s,v2.s[0]                  \n\t" // Accummulate.
@@ -316,9 +317,9 @@ __asm__ volatile
 "                                            \n\t" //End It 4
 " sub x5,x5,1                                \n\t" // i-=1.
 " cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
-" bne .SLOOPKITER                            \n\t"
+BNE(SLOOPKITER)
 "                                            \n\t" 
-" .SLASTITER:                                \n\t" // Last iteration of k_iter loop.
+LABEL(SLASTITER)                                   // Last iteration of k_iter loop.
 "                                            \n\t" 
 "                                            \n\t"
 " ldr q5, [x0]                               \n\t"
@@ -454,11 +455,11 @@ __asm__ volatile
 " add x0, x0, #96                            \n\t"
 "                                            \n\t" //End It 4
 "                                            \n\t"
-" .SCONSIDERKLEFT:                           \n\t" 
+LABEL(SCONSIDERKLEFT)
 " cmp x6,0                                   \n\t" // If k_left == 0, we are done.
-" beq .SPOSTACCUM                            \n\t" // else, we enter the k_left loop.
+BEQ(SPOSTACCUM)                                    // else, we enter the k_left loop.
 "                                            \n\t"
-" .SLOOPKLEFT:                               \n\t" // Body of the left iterations
+LABEL(SLOOPKLEFT)                                  // Body of the left iterations
 "                                            \n\t"
 " ldr q0, [x0],#16                           \n\t"
 " ldr q1, [x0],#16                           \n\t" // Load a
@@ -497,17 +498,17 @@ __asm__ volatile
 " fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
 "                                            \n\t"
 " cmp x6,0                                   \n\t" // Iterate again.
-" bne .SLOOPKLEFT                            \n\t" // if i!=0.
+BNE(SLOOPKLEFT)                                    // if i!=0.
 "                                            \n\t"
-" .SPOSTACCUM:                               \n\t"
+LABEL(SPOSTACCUM)
 "                                            \n\t"
 " ld1r {v6.4s},[x7]                          \n\t" // Load alpha.
 " ld1r {v7.4s},[x8]                          \n\t" // Load beta
 "                                            \n\t"
 " cmp x13,#1                                 \n\t" // If rs_c != 1 (column-major)
-" bne .SGENSTORED                            \n\t"
+BNE(SGENSTORED)
 "                                            \n\t"
-" .SCOLSTORED:                               \n\t" // C is column-major.
+LABEL(SCOLSTORED)                                  // C is column-major.
 "                                            \n\t"
 " dup  v0.4s, wzr                            \n\t"
 " dup  v1.4s, wzr                            \n\t"
@@ -517,7 +518,7 @@ __asm__ volatile
 " dup  v5.4s, wzr                            \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROCOLSTOREDS1                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROCOLSTOREDS1)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q0, [x2]                               \n\t" //Load column 0 of C
 " ldr q1, [x2, #16]                          \n\t"
@@ -533,7 +534,7 @@ __asm__ volatile
 " fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROCOLSTOREDS1:                     \n\t"
+LABEL(SBETAZEROCOLSTOREDS1)
 "                                            \n\t"
 " fmla v0.4s,v8.4s,v6.s[0]                   \n\t" // Scale by alpha
 " fmla v1.4s,v9.4s,v6.s[0]                   \n\t" // Scale by alpha
@@ -557,7 +558,7 @@ __asm__ volatile
 " dup  v13.4s, wzr                           \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROCOLSTOREDS2                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q8, [x18]                              \n\t" //Load column 3 of C
 " ldr q9, [x18, #16]                         \n\t"
@@ -573,7 +574,7 @@ __asm__ volatile
 " fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
 " fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROCOLSTOREDS2:                     \n\t"
+LABEL(SBETAZEROCOLSTOREDS2)
 "                                            \n\t"
 " fmla v8.4s, v14.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v9.4s, v15.4s,v6.s[0]                 \n\t" // Scale by alpha
@@ -597,7 +598,7 @@ __asm__ volatile
 " dup  v5.4s, wzr                            \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROCOLSTOREDS3                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q0, [x21]                              \n\t" //Load column 6 of C
 " ldr q1, [x21, #16]                         \n\t"
@@ -613,7 +614,7 @@ __asm__ volatile
 " fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROCOLSTOREDS3:                     \n\t"
+LABEL(SBETAZEROCOLSTOREDS3)
 "                                            \n\t"
 " fmla v0.4s,v20.4s,v6.s[0]                  \n\t" // Scale by alpha
 " fmla v1.4s,v21.4s,v6.s[0]                  \n\t" // Scale by alpha
@@ -637,7 +638,7 @@ __asm__ volatile
 " dup  v13.4s, wzr                            \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROCOLSTOREDS4                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q8, [x24]                              \n\t" //Load column 9 of C
 " ldr q9, [x24, #16]                         \n\t"
@@ -653,7 +654,7 @@ __asm__ volatile
 " fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
 " fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROCOLSTOREDS4:                     \n\t"
+LABEL(SBETAZEROCOLSTOREDS4)
 "                                            \n\t"
 " prfm pldl2keep,[x3]                        \n\t"
 " prfm pldl2keep,[x4]                        \n\t"
@@ -673,10 +674,10 @@ __asm__ volatile
 " str q13, [x26, #16]                        \n\t"
 "                                            \n\t"
 "                                            \n\t"
-" b .SEND                                    \n\t" // Done (TODO: this obviously needs to be moved down to remove jump).
+BRANCH(SEND)                                       // Done.
 "                                            \n\t"
 "                                            \n\t"
-" .SGENSTORED:                               \n\t" // C is general-stride stored.
+LABEL(SGENSTORED)                                  // C is general-stride stored.
 "                                            \n\t"
 "                                            \n\t"
 " dup  v0.4s, wzr                            \n\t"
@@ -687,7 +688,7 @@ __asm__ volatile
 " dup  v5.4s, wzr                            \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROGENSTOREDS1                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROGENSTOREDS1)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x2                                \n\t"
 "                                            \n\t"
@@ -729,7 +730,7 @@ __asm__ volatile
 " fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROGENSTOREDS1:                     \n\t"
+LABEL(SBETAZEROGENSTOREDS1)
 "                                            \n\t"
 " fmla v0.4s, v8.4s,v6.s[0]                  \n\t" // Scale by alpha
 " fmla v1.4s, v9.4s,v6.s[0]                  \n\t" // Scale by alpha
@@ -779,7 +780,7 @@ __asm__ volatile
 " dup  v13.4s, wzr                           \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROGENSTOREDS2                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROGENSTOREDS2)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x18                               \n\t"
 "                                            \n\t"
@@ -821,7 +822,7 @@ __asm__ volatile
 " fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
 " fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROGENSTOREDS2:                     \n\t"
+LABEL(SBETAZEROGENSTOREDS2)
 "                                            \n\t"
 " fmla v8.4s, v14.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v9.4s, v15.4s,v6.s[0]                 \n\t" // Scale by alpha
@@ -871,7 +872,7 @@ __asm__ volatile
 " dup  v5.4s, wzr                            \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROGENSTOREDS3                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROGENSTOREDS3)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x21                               \n\t"
 "                                            \n\t"
@@ -913,7 +914,7 @@ __asm__ volatile
 " fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROGENSTOREDS3:                     \n\t"
+LABEL(SBETAZEROGENSTOREDS3)
 "                                            \n\t"
 " fmla v0.4s,v20.4s,v6.s[0]                  \n\t" // Scale by alpha
 " fmla v1.4s,v21.4s,v6.s[0]                  \n\t" // Scale by alpha
@@ -963,7 +964,7 @@ __asm__ volatile
 " dup  v13.4s, wzr                           \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROGENSTOREDS4                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROGENSTOREDS4)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x24                               \n\t"
 "                                            \n\t"
@@ -1005,7 +1006,7 @@ __asm__ volatile
 " fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
 " fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROGENSTOREDS4:                     \n\t"
+LABEL(SBETAZEROGENSTOREDS4)
 "                                            \n\t"
 " prfm pldl2keep,[x3]                        \n\t"
 " prfm pldl2keep,[x4]                        \n\t"
@@ -1050,7 +1051,7 @@ __asm__ volatile
 " st1 {v13.s}[2],[x27],x14                   \n\t" // Store c116  into quad and increment by rs_c.
 " st1 {v13.s}[3],[x27],x14                   \n\t" // Store c147  into quad and increment by rs_c.
 "                                            \n\t"
-" .SEND:                                     \n\t" // Done!
+LABEL(SEND)                                        // Done!
 "                                            \n\t"
 :// output operands (none)
 :// input operands
@@ -1203,7 +1204,7 @@ __asm__ volatile
 "                                            \n\t"
 "                                            \n\t"
 " cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
-" beq .DCONSIDERKLEFT                        \n\t"
+BEQ(DCONSIDERKLEFT)
 "                                            \n\t"
 " ldr q0, [x0]                               \n\t" // Load a
 " ldr q1, [x0, #16]                          \n\t"
@@ -1218,9 +1219,9 @@ __asm__ volatile
 " add x1, x1, #64                            \n\t" //update address of B
 "                                            \n\t"
 " cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one. 
-" beq .DLASTITER                             \n\t" // (as loop is do-while-like).
+BEQ(DLASTITER)                                     // (as loop is do-while-like).
 "                                            \n\t"
-" DLOOP:                                     \n\t" // Body
+LABEL(DLOOP)                                       // Body
 "                                            \n\t"
 " fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 " prfm    PLDL1KEEP, [x1, #448]              \n\t" //512-64=448
@@ -1394,9 +1395,9 @@ __asm__ volatile
 "                                            \n\t"
 " sub x5,x5,1                                \n\t" // i-=1
 " cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
-" bne DLOOP                                  \n\t"
+BNE(DLOOP)
 "                                            \n\t"
-".DLASTITER:                                 \n\t"
+LABEL(DLASTITER)
 "                                            \n\t"
 " fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 " fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
@@ -1554,11 +1555,11 @@ __asm__ volatile
 "                                            \n\t"                  //End it 4
 " add x0, x0, #144                           \n\t"
 "                                            \n\t"
-" .DCONSIDERKLEFT:                           \n\t" 
+LABEL(DCONSIDERKLEFT)
 " cmp x6,0                                   \n\t" // If k_left == 0, we are done.
-" beq .DPOSTACCUM                            \n\t" // else, we enter the k_left loop.
+BEQ(DPOSTACCUM)                                    // else, we enter the k_left loop.
 "                                            \n\t"
-".DLOOPKLEFT:                                \n\t"
+LABEL(DLOOPKLEFT)
 "                                            \n\t"
 " ldr q0, [x0],#16                           \n\t"
 " ldr q1, [x0],#16                           \n\t" // Load a
@@ -1605,17 +1606,17 @@ __asm__ volatile
 " fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
 "                                            \n\t"
 " cmp x6,0                                   \n\t" // Iterate again.
-" bne .DLOOPKLEFT                            \n\t" // if i!=0.
+BNE(DLOOPKLEFT)                                    // if i!=0.
 "                                            \n\t"
-" .DPOSTACCUM:                               \n\t"
+LABEL(DPOSTACCUM)
 "                                            \n\t"
 " ld1r {v6.2d},[x7]                          \n\t" // Load alpha.
 " ld1r {v7.2d},[x8]                          \n\t" // Load beta
 "                                            \n\t"
 " cmp x13,#1                                 \n\t" // If rs_c != 1 (column-major)
-" bne .DGENSTORED                            \n\t"
+BNE(DGENSTORED)
 "                                            \n\t"
-" .DCOLSTORED:                               \n\t" // C is column-major.
+LABEL(DCOLSTORED)                                  // C is column-major.
 "                                            \n\t"
 " dup  v0.2d, xzr                            \n\t"
 " dup  v1.2d, xzr                            \n\t"
@@ -1625,7 +1626,7 @@ __asm__ volatile
 " dup  v5.2d, xzr                            \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROCOLSTOREDS1                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROCOLSTOREDS1)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q0, [x2]                               \n\t" //Load column 0 of C
 " ldr q1, [x2, #16]                          \n\t"
@@ -1642,7 +1643,7 @@ __asm__ volatile
 " fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
 " fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROCOLSTOREDS1:                     \n\t"
+LABEL(DBETAZEROCOLSTOREDS1)
 "                                            \n\t"
 " fmla v0.2d,v8.2d,v6.d[0]                   \n\t" // Scale by alpha
 " fmla v1.2d,v9.2d,v6.d[0]                   \n\t" // Scale by alpha
@@ -1667,7 +1668,7 @@ __asm__ volatile
 " dup  v13.2d, xzr                           \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROCOLSTOREDS2                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q8, [x21]                              \n\t" //Load column 2 of C
 " ldr q9, [x21, #16]                         \n\t"
@@ -1684,7 +1685,7 @@ __asm__ volatile
 " fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
 " fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROCOLSTOREDS2:                     \n\t"
+LABEL(DBETAZEROCOLSTOREDS2)
 "                                            \n\t"
 " fmla v8.2d, v14.2d,v6.d[0]                 \n\t" // Scale by alpha
 " fmla v9.2d, v15.2d,v6.d[0]                 \n\t" // Scale by alpha
@@ -1709,7 +1710,7 @@ __asm__ volatile
 " dup  v5.2d, xzr                            \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROCOLSTOREDS3                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q0, [x23]                              \n\t" //Load column 4 of C
 " ldr q1, [x23, #16]                         \n\t"
@@ -1726,7 +1727,7 @@ __asm__ volatile
 " fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
 " fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROCOLSTOREDS3:                     \n\t"
+LABEL(DBETAZEROCOLSTOREDS3)
 "                                            \n\t"
 " fmla v0.2d,v20.2d,v6.d[0]                  \n\t" // Scale by alpha
 " fmla v1.2d,v21.2d,v6.d[0]                  \n\t" // Scale by alpha
@@ -1751,7 +1752,7 @@ __asm__ volatile
 " dup  v13.2d, xzr                           \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROCOLSTOREDS4                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q8, [x25]                              \n\t" //Load column 6 of C
 " ldr q9, [x25, #16]                         \n\t"
@@ -1768,7 +1769,7 @@ __asm__ volatile
 " fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
 " fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROCOLSTOREDS4:                     \n\t"
+LABEL(DBETAZEROCOLSTOREDS4)
 "                                            \n\t"
 " prfm pldl2keep,[x3]                        \n\t"
 " prfm pldl2keep,[x4]                        \n\t"
@@ -1788,9 +1789,9 @@ __asm__ volatile
 " str q12, [x26, #16]                        \n\t"
 " str q13, [x26, #32]                        \n\t"
 "                                            \n\t"
-" b .DEND                                    \n\t"
+BRANCH(DEND)
 "                                            \n\t"
-" .DGENSTORED:                               \n\t" // C is general-stride stored.
+LABEL(DGENSTORED)                                  // C is general-stride stored.
 "                                            \n\t"
 " dup  v0.2d, xzr                            \n\t"
 " dup  v1.2d, xzr                            \n\t"
@@ -1800,7 +1801,7 @@ __asm__ volatile
 " dup  v5.2d, xzr                            \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROGENSTOREDS1                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROGENSTOREDS1)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x2                                \n\t"
 "                                            \n\t" // Load address of C.
@@ -1827,7 +1828,7 @@ __asm__ volatile
 " fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
 " fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROGENSTOREDS1:                     \n\t"
+LABEL(DBETAZEROGENSTOREDS1)
 "                                            \n\t"
 " fmla v0.2d,v8.2d,v6.d[0]                   \n\t" // Scale by alpha
 " fmla v1.2d,v9.2d,v6.d[0]                   \n\t" // Scale by alpha
@@ -1862,7 +1863,7 @@ __asm__ volatile
 " dup  v13.2d, xzr                           \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROGENSTOREDS2                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROGENSTOREDS2)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x21                               \n\t" // Load address of C.
 "                                            \n\t"
@@ -1889,7 +1890,7 @@ __asm__ volatile
 " fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
 " fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROGENSTOREDS2:                     \n\t"
+LABEL(DBETAZEROGENSTOREDS2)
 "                                            \n\t"
 " fmla v8.2d, v14.2d,v6.d[0]                 \n\t" // Scale by alpha
 " fmla v9.2d, v15.2d,v6.d[0]                 \n\t" // Scale by alpha
@@ -1924,7 +1925,7 @@ __asm__ volatile
 " dup  v5.2d, xzr                            \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROGENSTOREDS3                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROGENSTOREDS3)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x23                               \n\t" // Load address of C.
 "                                            \n\t"
@@ -1951,7 +1952,7 @@ __asm__ volatile
 " fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
 " fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROGENSTOREDS3:                     \n\t"
+LABEL(DBETAZEROGENSTOREDS3)
 "                                            \n\t"
 " fmla v0.2d,v20.2d,v6.d[0]                  \n\t" // Scale by alpha
 " fmla v1.2d,v21.2d,v6.d[0]                  \n\t" // Scale by alpha
@@ -1986,7 +1987,7 @@ __asm__ volatile
 " dup  v13.2d, xzr                           \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROGENSTOREDS4                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROGENSTOREDS4)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x25                               \n\t"
 "                                            \n\t"
@@ -2013,7 +2014,7 @@ __asm__ volatile
 " fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
 " fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROGENSTOREDS4:                     \n\t"
+LABEL(DBETAZEROGENSTOREDS4)
 "                                            \n\t"
 " prfm pldl2keep,[x3]                        \n\t"
 " prfm pldl2keep,[x4]                        \n\t"
@@ -2043,7 +2044,7 @@ __asm__ volatile
 " st1 {v13.d}[0],[x27],x14                   \n\t" // Store c74  into quad and increment by rs_c.
 " st1 {v13.d}[1],[x27],x14                   \n\t" // Store c75  into quad and increment by rs_c.
 "                                            \n\t"
-" .DEND:                                     \n\t" // Done!
+LABEL(DEND)                                        // Done!
 "                                            \n\t"
 :// output operands (none)
 :// input operands