Bugfix for A matrix packing in int8(S8/U8) APIs

- Packing the A matrix by default isn't necessary for row-major matrix data. Also, packing of A appeared to cause regressions and hence wasn't expected to be used. - However, packA is necessary in column-major cases, where a transpose has to be done. This path has been verified. - Hence, when the user explicitly requests packing of A, execution enters the incomplete packA function, which overwrites elements in the buffer on subsequent iterations, leading to accuracy issues. As a fix, this patch changes the PACK tag to UNPACKED at the interface when the user explicitly sets it, ensuring correct execution. [ AMD-Internal : CPUPL - 7193 ]
2026-04-20 15:48:50 +00:00 · 2025-08-22 18:46:19 +05:30
parent 5044b69d3d
commit 6cdab2720c
10 changed files with 68 additions and 10 deletions
--- a/addon/aocl_gemm/aocl_gemm_s8s8s32obf16.c
+++ b/addon/aocl_gemm/aocl_gemm_s8s8s32obf16.c
@@ -158,7 +158,14 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,bfloat16,int32_t,s8s8s32obf16)
 						" not supported.", __FILE__, __LINE__);
 		goto err_hndl;
 	}
-
+	 // A matrix packing is only done in column major case, or when
+    // A matrix is transposed in row major. PackA kernels for row-major
+    // are not supported, hence we set it to unpacked and proceed with GEMM.
+    if ((is_row_major == TRUE) && (mtag_a == PACK)) {
+        mtag_a = UNPACKED;
+    } else if (is_column_major == TRUE && mtag_b == PACK) {
+        mtag_b = UNPACKED;
+    }
 	// From 5-loop function point of view
 	// B matrix needs to be packed in a certain format in order to be loaded
 	// and used in bf16 instrution. As such the mtag_b always needs to be either
--- a/addon/aocl_gemm/aocl_gemm_s8s8s32of32.c
+++ b/addon/aocl_gemm/aocl_gemm_s8s8s32of32.c
@@ -151,7 +151,14 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,float,int32_t,s8s8s32of32)
 						" not supported.", __FILE__, __LINE__);
 		goto err_hndl;
 	}
-
+	 // A matrix packing is only done in column major case, or when
+    // A matrix is transposed in row major. PackA kernels for row-major
+    // are not supported, hence we set it to unpacked and proceed with GEMM.
+    if ((is_row_major == TRUE) && (mtag_a == PACK)) {
+        mtag_a = UNPACKED;
+    } else if (is_column_major == TRUE && mtag_b == PACK) {
+        mtag_b = UNPACKED;
+    }
 	// From 5-loop function point of view
 	// B matrix needs to be packed in a certain format in order to be loaded
 	// and used in bf16 instrution. As such the mtag_b always needs to be either
--- a/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c
+++ b/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c
@@ -151,7 +151,14 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
 						"is not supported.", __FILE__, __LINE__);
 		goto err_hndl;
 	}
-
+	// A matrix packing is only done in column major case, or when
+    // A matrix is transposed in row major. PackA kernels for row-major
+    // are not supported, hence we set it to unpacked and proceed with GEMM.
+    if ((is_row_major == TRUE) && (mtag_a == PACK)) {
+        mtag_a = UNPACKED;
+    } else if (is_column_major == TRUE && mtag_b == PACK) {
+        mtag_b = UNPACKED;
+    }
 	// From 5-loop function point of view
 	// B matrix needs to be packed in a certain format in order to be loaded
 	// and used in bf16 instrution. As such the mtag_b always needs to be either
--- a/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c
+++ b/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c
@@ -151,7 +151,14 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
 						" not supported.", __FILE__, __LINE__);
 		goto err_hndl;
 	}
-
+	 // A matrix packing is only done in column major case, or when
+    // A matrix is transposed in row major. PackA kernels for row-major
+    // are not supported, hence we set it to unpacked and proceed with GEMM.
+    if ((is_row_major == TRUE) && (mtag_a == PACK)) {
+        mtag_a = UNPACKED;
+    } else if (is_column_major == TRUE && mtag_b == PACK) {
+        mtag_b = UNPACKED;
+    }
 	// From 5-loop function point of view
 	// B matrix needs to be packed in a certain format in order to be loaded
 	// and used in bf16 instrution. As such the mtag_b always needs to be either
--- a/addon/aocl_gemm/aocl_gemm_s8s8s32ou8.c
+++ b/addon/aocl_gemm/aocl_gemm_s8s8s32ou8.c
@@ -139,7 +139,12 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,uint8_t,int32_t,s8s8s32ou8)
 					  __FILE__, __LINE__);
 		goto err_hndl;
 	}
-
+	// A matrix packing is done only if the inputs are transposed in a
+    // row major scenario. A matrix packing in row major is not supported,
+    // hence it is changed to unpacked before proceeding with the GEMM.
+    if (mtag_a == PACK) {
+        mtag_a = UNPACKED;
+    }
 	// From 5-loop function point of view
 	// B matrix needs to be packed in a certain format in order to be loaded
 	// and used in bf16 instrution. As such the mtag_b always needs to be either
--- a/addon/aocl_gemm/aocl_gemm_u8s8s32obf16.c
+++ b/addon/aocl_gemm/aocl_gemm_u8s8s32obf16.c
@@ -144,7 +144,12 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,bfloat16,int32_t,u8s8s32obf16)
 					  __FILE__, __LINE__);
 		goto err_hndl;
 	}
-
+	// A matrix packing is done only if the inputs are transposed in a
+    // row major scenario. A matrix packing in row major is not supported,
+    // hence it is changed to unpacked before proceeding with the GEMM.
+    if (mtag_a == PACK) {
+        mtag_a = UNPACKED;
+    }
 	// From 5-loop function point of view
 	// B matrix needs to be packed in a certain format in order to be loaded
 	// and used in bf16 instrution. As such the mtag_b always needs to be either
--- a/addon/aocl_gemm/aocl_gemm_u8s8s32of32.c
+++ b/addon/aocl_gemm/aocl_gemm_u8s8s32of32.c
@@ -139,7 +139,12 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,float,int32_t,u8s8s32of32)
 					  __FILE__, __LINE__);
 		goto err_hndl;
 	}
-
+	// A matrix packing is done only if the inputs are transposed in a
+    // row major scenario. A matrix packing in row major is not supported,
+    // hence it is changed to unpacked before proceeding with the GEMM.
+    if (mtag_a == PACK) {
+        mtag_a = UNPACKED;
+    }
 	// From 5-loop function point of view
 	// B matrix needs to be packed in a certain format in order to be loaded
 	// and used in bf16 instrution. As such the mtag_b always needs to be either
--- a/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c
+++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c
@@ -139,7 +139,12 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
 					  __FILE__, __LINE__);
 		goto err_hndl;
 	}
-
+	// A matrix packing is done only if the inputs are transposed in a
+    // row major scenario. A matrix packing in row major is not supported,
+    // hence it is changed to unpacked before proceeding with the GEMM.
+    if (mtag_a == PACK) {
+        mtag_a = UNPACKED;
+    }
 	// From 5-loop function point of view
 	// B matrix needs to be packed in a certain format in order to be loaded
 	// and used in bf16 instrution. As such the mtag_b always needs to be either
--- a/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c
+++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c
@@ -139,7 +139,12 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
 					  __FILE__, __LINE__);
 		goto err_hndl;
 	}
-
+	// A matrix packing is done only if the inputs are transposed in a
+    // row major scenario. A matrix packing in row major is not supported,
+    // hence it is changed to unpacked before proceeding with the GEMM.
+    if (mtag_a == PACK) {
+        mtag_a = UNPACKED;
+    }
 	// From 5-loop function point of view
 	// B matrix needs to be packed in a certain format in order to be loaded
 	// and used in bf16 instrution. As such the mtag_b always needs to be either
--- a/addon/aocl_gemm/aocl_gemm_u8s8s32ou8.c
+++ b/addon/aocl_gemm/aocl_gemm_u8s8s32ou8.c
@@ -139,7 +139,12 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,uint8_t,int32_t,u8s8s32ou8)
 					  __FILE__, __LINE__);
 		goto err_hndl;
 	}
-
+	// A matrix packing is done only if the inputs are transposed in a
+    // row major scenario. A matrix packing in row major is not supported,
+    // hence it is changed to unpacked before proceeding with the GEMM.
+    if (mtag_a == PACK) {
+        mtag_a = UNPACKED;
+    }
 	// From 5-loop function point of view
 	// B matrix needs to be packed in a certain format in order to be loaded
 	// and used in bf16 instrution. As such the mtag_b always needs to be either