Partial completion of work in L1 APIs

- Computation was only partially completed because BLIS was unable to launch the required number of threads: the rntm returned a thread count greater than the maximum number of threads that could be launched in the subsequent parallel region.
- Added a call to 'omp_get_num_threads' inside the parallel regions to get the actual number of threads spawned. Work is now distributed based on the actual number of threads launched in that region.
AMD-Internal: [CPUPL-3268]
Change-Id: I086ad4b9b644f966b7bab439e43222396f0c2bf0
2026-07-17 09:07:31 +00:00 · 2023-04-26 18:05:56 +05:30
parent 7e50ba669b
commit 828ac8e2dd
3 changed files with 43 additions and 14 deletions
--- a/frame/compat/bla_axpy_amd.c
+++ b/frame/compat/bla_axpy_amd.c
@@ -398,11 +398,17 @@ void daxpy_blis_impl
        // Get the thread ID
        dim_t thread_id = omp_get_thread_num();

-        // Calculate the compute range for the current thread
+        // Get the actual number of threads spawned
+        dim_t nt_use = omp_get_num_threads();
+
+        /*
+          Calculate the compute range for the current thread
+          based on the actual number of threads spawned
+        */
        bli_thread_vector_partition
        (
          n_elem,
-          nt,
+          nt_use,
          &start, &length,
          thread_id
        );
--- a/frame/compat/bla_dot_amd.c
+++ b/frame/compat/bla_dot_amd.c
@@ -466,11 +466,17 @@ double ddot_blis_impl
        // Get the thread ID
        dim_t thread_id = omp_get_thread_num();

-        // Calculate the compute range for the current thread
+        // Get the actual number of threads spawned
+        dim_t nt_use = omp_get_num_threads();
+
+        /*
+          Calculate the compute range for the current thread
+          based on the actual number of threads spawned
+        */
        bli_thread_vector_partition
        (
          n_elem,
-          nt,
+          nt_use,
          &start, &length,
          thread_id
        );
@@ -492,13 +498,18 @@ double ddot_blis_impl
        );
    }

-    // Accumulating the nt thread outputs to rho
-    for ( dim_t i = 0; i < nt; i++ )
-        rho += rho_temp[i];
-
-    // Releasing the allocated memory if it was allocated
-    if( bli_mem_is_alloc(&mem_buf_rho))
+    /*
+      Accumulate the values in rho_temp only when mem is allocated.
+      When the memory cannot be allocated rho_temp will point to
+      rho
+    */
+    if (bli_mem_is_alloc(&mem_buf_rho))
    {
+        // Accumulating the nt thread outputs to rho
+        for (dim_t i = 0; i < nt; i++)
+          rho += rho_temp[i];
+
+        // Releasing the allocated memory if it was allocated
        bli_membrk_release(&rntm, &mem_buf_rho);
    }
 #endif
--- a/frame/compat/bla_scal_amd.c
+++ b/frame/compat/bla_scal_amd.c
@@ -383,11 +383,17 @@ void dscal_blis_impl
        // Get the thread ID
        dim_t thread_id = omp_get_thread_num();

-        // Calculate the compute range for the current thread
+        // Get the actual number of threads spawned
+        dim_t nt_use = omp_get_num_threads();
+
+        /*
+          Calculate the compute range for the current thread
+          based on the actual number of threads spawned
+        */
        bli_thread_vector_partition
        (
          n_elem,
-          nt,
+          nt_use,
          &start, &length,
          thread_id
        );
@@ -563,11 +569,17 @@ void zdscal_blis_impl
        // Get the thread ID
        dim_t thread_id = omp_get_thread_num();

-        // Calculate the compute range for the current thread
+        // Get the actual number of threads spawned
+        dim_t nt_use = omp_get_num_threads();
+
+        /*
+          Calculate the compute range for the current thread
+          based on the actual number of threads spawned
+        */
        bli_thread_vector_partition
        (
          n_elem,
-          nt,
+          nt_use,
          &start, &length,
          thread_id
        );