BLAS Extension API - ?gemm_compute()

- Added support for 2 new APIs: 1. sgemm_compute() 2. dgemm_compute(). These depend on the ?gemm_pack_get_size() and ?gemm_pack() APIs. - ?gemm_compute() takes the packed matrix buffer (represented by the packed matrix identifier) and performs the GEMM operation: C := A * B + beta * C. - Whenever the kernel storage preference and the matrix storage scheme do not match, and the matrix being loaded is not already packed, that matrix is packed on the fly. - Note: If both matrices are packed using the ?gemm_pack() API, it is the user's responsibility to pack only one matrix with the alpha scalar and the other with a unit scalar. - Note: Support is presently limited to a single thread. Both the pack and compute APIs are forced to use n_threads=1. AMD-Internal: [CPUPL-3560] Change-Id: I825d98a0a5038d31668d2a4b84b3ccc204e6c158
2026-04-19 23:28:52 +00:00 · 2023-07-17 12:44:42 +05:30
parent 81161066e5
commit c8f14edcf5
32 changed files with 3623 additions and 20 deletions
--- a/bench/Makefile
+++ b/bench/Makefile
@@ -6,7 +6,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+#  Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -193,7 +193,8 @@ blis: \
       bench_amaxv_blis.x \
       bench_copyv_blis.x \
       bench_swapv_blis.x \
-       bench_axpbyv_blis.x
+       bench_axpbyv_blis.x \
+       bench_gemm_pack_compute_blis.x

 openblas: \
      bench_gemm_openblas.x \
@@ -240,7 +241,8 @@ mkl:  \
      bench_amaxv_mkl.x \
      bench_copyv_mkl.x \
      bench_swapv_mkl.x \
-      bench_axpbyv_mkl.x
+      bench_axpbyv_mkl.x \
+      bench_gemm_pack_compute_mkl.x


 # --Object file rules --
--- a/bench/bench_gemm_pack_compute.c
+++ b/bench/bench_gemm_pack_compute.c
@@ -0,0 +1,930 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+#include "blis.h"
+
+
+// Benchmark application to process AOCL logs generated by the BLIS library.
+#ifndef DT
+#define DT BLIS_DOUBLE
+#endif
+
+#ifndef IND
+#define IND BLIS_NAT
+#endif
+
+#ifndef N_REPEAT
+//#define N_REPEAT 100
+#endif
+
+
+#define AOCL_MATRIX_INITIALISATION
+#define BUFFER_SIZE 256
+
+/* Since BLIS logs are collected at the BLAS interfaces,
+ * the CBLAS interfaces are disabled for this benchmark application.
+ */
+
+#ifdef BLIS_ENABLE_CBLAS
+// #define CBLAS
+#endif
+
+// #define PRINT
+
+int main( int argc, char** argv )
+{
+    obj_t a, b, c;
+    obj_t c_save;
+    obj_t alpha, beta, alpha_one;
+    dim_t m, n, k;
+    dim_t  p_inc = 0; // to keep track of number of inputs
+    num_t dt;
+    //    ind_t    ind;
+    char     dt_ch;
+    int   r, n_repeats;
+    trans_t  transa;
+    trans_t  transb;
+
+    double   dtime;
+    double   dtime_save;
+    double   gflops;
+
+    int packA, packB;
+
+    FILE* fin  = NULL;
+    FILE* fout = NULL;
+
+    n_repeats = N_REPEAT;  // This macro will get from Makefile.
+
+    dt = DT;
+
+    if (argc < 3)
+    {
+        printf("Usage: ./test_gemm_pack_compute_XX.x input.csv output.csv\n");
+        exit(1);
+    }
+    fin = fopen(argv[1], "r");
+    if (fin == NULL)
+    {
+        printf("Error opening the file %s\n", argv[1]);
+        exit(1);
+    }
+    fout = fopen(argv[2], "w");
+    if (fout == NULL)
+    {
+        printf("Error opening output file %s\n", argv[2]);
+        exit(1);
+    }
+  if (argc > 3)
+  {
+    n_repeats = atoi(argv[3]);
+  }
+
+    fprintf(fout, "Dt transa transb identifier m n k alphaR alphaI lda ldb betaR betaI ldc gflops\n");
+
+    // Following variables are needed for scanf to read inputs properly
+    // however they are not used in bench.
+    char api_name[BUFFER_SIZE];       // to store function name, line no present in logs
+    char dummy_buffer[BUFFER_SIZE];
+
+    // Variables extracted from the logs which are used by bench
+    char stor_scheme, transA_c, transB_c, packA_c, packB_c;
+    double alpha_r, beta_r, alpha_i, beta_i;
+    dim_t m_trans, n_trans;
+    inc_t lda, ldb, ldc;
+
+    stor_scheme = 'C'; // By default set it to Column Major
+
+    //{S, D, C, Z} transa, transb, packA, packB, m, n, k, alpha_real,
+    //             alpha_imag, lda ldb, beta_real, beta_imag, ldc,
+    //
+    //             number of threads, execution time, gflops ---> ignored by bench
+    while (fscanf(fin, "%s %c %c %c %c %c " INT_FS INT_FS INT_FS " %lf %lf " INT_FS INT_FS " %lf %lf " INT_FS"[^\n]",
+            api_name, &dt_ch, &transA_c, &transB_c, &packA_c, &packB_c, &m, &n, &k, &alpha_r, &alpha_i,
+            &lda, &ldb, &beta_r, &beta_i, &ldc) == 16)
+    {
+        // Discard any extra data on current line in the input file.
+        fgets(dummy_buffer, BUFFER_SIZE, fin );
+
+        // At BLAS level only column major order is supported.
+        stor_scheme = 'C';
+
+        if (dt_ch == 'D' || dt_ch == 'd') dt = BLIS_DOUBLE;
+        else if (dt_ch == 'S' || dt_ch == 's') dt = BLIS_FLOAT;
+        else
+        {
+            printf("Invalid data type %c\n", dt_ch);
+            continue;
+        }
+
+        if      ( transA_c == 'n' || transA_c == 'N' ) transa = BLIS_NO_TRANSPOSE;
+        else if ( transA_c == 't' || transA_c == 'T' ) transa = BLIS_TRANSPOSE;
+        else if ( transA_c == 'c' || transA_c == 'C' ) transa = BLIS_CONJ_TRANSPOSE;
+        else
+        {
+            printf("Invalid option for transA \n");
+            continue;
+        }
+
+        if      ( transB_c == 'n' || transB_c == 'N' ) transb = BLIS_NO_TRANSPOSE;
+        else if ( transB_c == 't' || transB_c == 'T' ) transb = BLIS_TRANSPOSE;
+        else if ( transB_c == 'c' || transB_c == 'C' ) transb = BLIS_CONJ_TRANSPOSE;
+        else
+        {
+            printf("Invalid option for transB \n");
+            continue;
+        }
+
+        if      ( packA_c == 'p' || packA_c == 'P' ) packA = TRUE;
+        else if ( packA_c == 'u' || packA_c == 'U' ) packA = FALSE;
+        else
+        {
+            printf("Invalid option for packA \n");
+            continue;
+        }
+
+        if      ( packB_c == 'p' || packB_c == 'P') packB = TRUE;
+        else if ( packB_c == 'u' || packB_c == 'U') packB = FALSE;
+        else
+        {
+            printf("Invalid option for packB \n");
+            continue;
+        }
+
+        bli_obj_create( dt, 1, 1, 0, 0, &alpha);
+        bli_obj_create( dt, 1, 1, 0, 0, &beta );
+
+        bli_obj_create( dt, 1, 1, 0, 0, &alpha_one);
+
+        if( (stor_scheme == 'C') || (stor_scheme == 'c') )
+        {
+            // leading dimension should be greater than number of rows
+            // if ((m > lda) || (k > ldb) || (m > ldc)) continue;
+            // Since this bench app is run on logs generated by AOCL trace logs
+            // - we have relaxed the checks on the input parameters.
+
+            // if A is transpose - A(lda x m), lda >= max(1,k)
+            // if A is non-transpose - A (lda x k), lda >= max(1,m)
+            // if B is transpose - B (ldb x k), ldb >= max(1,n)
+            // if B is non-transpose - B (ldb x n), ldb >= max(1,k)
+            //    C is ldc x n - ldc >= max(1, m)
+            //if(transa) lda = k; // We will end up overwriting lda
+            bli_set_dims_with_trans( transa, m, k, &m_trans, &n_trans);
+            bli_obj_create( dt, m_trans, n_trans, 1, lda, &a);
+
+            //if(transb) ldb = n; // we will end up overwriting ldb, ldb >= n
+            bli_set_dims_with_trans( transb, k, n, &m_trans, &n_trans);
+            bli_obj_create( dt, m_trans, n_trans, 1, ldb, &b);
+
+            bli_obj_create( dt, m, n, 1, ldc, &c);
+            bli_obj_create( dt, m, n, 1, ldc, &c_save );
+        }
+        else if( (stor_scheme == 'r') || (stor_scheme == 'R') )
+        {
+            //leading dimension should be greater than number of columns
+            //if ((k > lda) || (n > ldb) || (n > ldc)) continue;
+            // Since this bench app is run on logs generated by AOCL trace logs
+            // - we have relaxed the checks on the input parameters.
+
+            // if A is transpose - A(k x lda), lda >= max(1,m)
+            // if A is non-transpose - A (m x lda), lda >= max(1,k)
+            // if B is transpose - B (n x ldb), ldb >= max(1,k)
+            // if B is non-transpose - B (k x ldb ), ldb >= max(1,n)
+            //    C is m x ldc - ldc >= max(1, n)
+
+            //if(transa) lda = m; // this will overwrite lda
+            bli_set_dims_with_trans(transa, m, k, &m_trans, &n_trans);
+            bli_obj_create( dt, m_trans, n_trans, lda, 1, &a);
+
+            //if(transb) ldb = k; // this will overwrite ldb
+            bli_set_dims_with_trans(transb, k, n, &m_trans, &n_trans);
+            bli_obj_create( dt, m_trans, n_trans, ldb, 1, &b);
+
+            bli_obj_create( dt, m, n, ldc, 1, &c);
+            bli_obj_create( dt, m, n, ldc, 1, &c_save );
+        }
+        else
+        {
+            printf("Invalid storage scheme\n");
+            continue;
+        }
+#ifndef BLIS // Incase if we are using blis interface we don't have to check for col-storage.
+     #ifndef CBLAS
+        if( ( stor_scheme == 'R' ) || ( stor_scheme == 'r' ) )
+        {
+            printf("BLAS APIs doesn't support row-storage: Enable CBLAS\n");
+            continue;
+        }
+     #endif
+#endif
+
+#ifdef AOCL_MATRIX_INITIALISATION
+        bli_randm( &a );
+        bli_randm( &b );
+        bli_randm( &c );
+#endif
+        bli_copym( &c, &c_save );
+
+        bli_obj_set_conjtrans( transa, &a);
+        bli_obj_set_conjtrans( transb, &b);
+
+        bli_setsc( 1.0, 1.0, &alpha_one );
+        bli_setsc( alpha_r, alpha_i, &alpha );
+        bli_setsc( beta_r, beta_i, &beta );
+
+        dtime_save = DBL_MAX;
+
+        for ( r = 0; r < n_repeats; ++r )
+        {
+            bli_copym( &c_save, &c );
+#ifdef PRINT
+            bli_printm( "a", &a, "%4.6f", "" );
+            bli_printm( "b", &b, "%4.6f", "" );
+            bli_printm( "c", &c, "%4.6f", "" );
+#endif
+            dtime = bli_clock();
+
+#ifdef BLIS
+
+            printf( "BLAS Extension APIs don't have a BLIS interface."
+                    "Enable CBLAS or BLAS interface!\n" );
+
+#else
+
+#ifdef CBLAS
+            enum CBLAS_ORDER      cblas_order;
+            enum CBLAS_TRANSPOSE  cblas_transa;
+            enum CBLAS_TRANSPOSE  cblas_transb;
+            enum CBLAS_IDENTIFIER cblas_identifierA;
+            enum CBLAS_IDENTIFIER cblas_identifierB;
+
+            size_t bufSizeA;
+            size_t bufSizeB;
+
+            if ( ( stor_scheme == 'C' ) || ( stor_scheme == 'c' ) )
+              cblas_order = CblasColMajor;
+            else
+              cblas_order = CblasRowMajor;
+
+            if( bli_is_trans( transa ) )
+              cblas_transa = CblasTrans;
+            else if( bli_is_conjtrans( transa ) )
+              cblas_transa = CblasConjTrans;
+            else
+              cblas_transa = CblasNoTrans;
+
+            if( bli_is_trans( transb ) )
+              cblas_transb = CblasTrans;
+            else if( bli_is_conjtrans( transb ) )
+              cblas_transb = CblasConjTrans;
+            else
+              cblas_transb = CblasNoTrans;
+
+            if ( packA )
+              cblas_identifierA = CblasAMatrix;
+
+            if ( packB )
+              cblas_identifierB = CblasBMatrix;
+#else
+            f77_char f77_transa;
+            f77_char f77_transb;
+            f77_char f77_identifierA;
+            f77_char f77_identifierB;
+            f77_int  f77_bufSizeA;
+            f77_int  f77_bufSizeB;
+
+            f77_char f77_packed = 'P';
+            f77_identifierA = 'A';
+            f77_identifierB = 'B';
+            bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
+            bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
+
+            err_t err = BLIS_SUCCESS;
+
+#endif
+            if ( bli_is_float( dt ) )
+            {
+                f77_int  mm     = bli_obj_length( &c );
+                f77_int  kk     = bli_obj_width_after_trans( &a );
+                f77_int  nn     = bli_obj_width( &c );
+
+                float*   alphaonep = bli_obj_buffer( &alpha_one );
+                float*   alphap = bli_obj_buffer( &alpha );
+                float*   ap     = bli_obj_buffer( &a );
+                float*   bp     = bli_obj_buffer( &b );
+                float*   betap  = bli_obj_buffer( &beta );
+                float*   cp     = bli_obj_buffer( &c );
+
+#ifdef CBLAS
+                float* aBuffer;
+                float* bBuffer;
+
+                if ( packA && !packB )
+                {
+                  // Only A is pre-packed.
+                  bufSizeA = cblas_sgemm_pack_get_size( CblasAMatrix,
+                                                        mm,
+                                                        nn,
+                                                        kk );
+                  aBuffer = (float*) bli_malloc_user( bufSizeA, &err );
+
+                  cblas_sgemm_pack( cblas_order,
+                                    CblasAMatrix,
+                                    cblas_transa,
+                                    mm,
+                                    nn,
+                                    kk,
+                                    *alphap,
+                                    ap, lda,
+                                    aBuffer );
+
+                  cblas_sgemm_compute( cblas_order,
+                                       CblasPacked,
+                                       cblas_transb,
+                                       mm,
+                                       nn,
+                                       kk,
+                                       aBuffer, lda,
+                                       bp, ldb,
+                                       *betap,
+                                       cp, ldc );
+
+                  bli_free_user(aBuffer);
+                }
+                else if ( !packA && packB )
+                {
+                  // Only B is pre-packed.
+                  bufSizeB = cblas_sgemm_pack_get_size( CblasBMatrix,
+                                                        mm,
+                                                        nn,
+                                                        kk );
+                  bBuffer = (float*) bli_malloc_user( bufSizeB, &err );
+
+                  cblas_sgemm_pack( cblas_order,
+                                    CblasBMatrix,
+                                    cblas_transb,
+                                    mm,
+                                    nn,
+                                    kk,
+                                    *alphap,
+                                    bp, ldb,
+                                    bBuffer );
+
+                  cblas_sgemm_compute( cblas_order,
+                                       cblas_transa,
+                                       CblasPacked,
+                                       mm,
+                                       nn,
+                                       kk,
+                                       ap, lda,
+                                       bBuffer, ldb,
+                                       *betap,
+                                       cp, ldc );
+
+                  bli_free_user(bBuffer);
+                }
+                else if ( packA && packB )
+                {
+                  // Both A & B are pre-packed.
+                  bufSizeA = cblas_sgemm_pack_get_size( CblasAMatrix,
+                                                        mm,
+                                                        nn,
+                                                        kk );
+                  aBuffer = (float*) bli_malloc_user( bufSizeA, &err );
+
+                  bufSizeB = cblas_sgemm_pack_get_size( CblasBMatrix,
+                                                        mm,
+                                                        nn,
+                                                        kk );
+                  bBuffer = (float*) bli_malloc_user( bufSizeB, &err );
+
+                  cblas_sgemm_pack( cblas_order,
+                                    CblasAMatrix,
+                                    cblas_transa,
+                                    mm,
+                                    nn,
+                                    kk,
+                                    *alphap,
+                                    ap, lda,
+                                    aBuffer );
+
+                  cblas_sgemm_pack( cblas_order,
+                                    CblasBMatrix,
+                                    cblas_transb,
+                                    mm,
+                                    nn,
+                                    kk,
+                                    *alphaonep,
+                                    bp, ldb,
+                                    bBuffer );
+
+                  cblas_sgemm_compute( cblas_order,
+                                       CblasPacked,
+                                       CblasPacked,
+                                       mm,
+                                       nn,
+                                       kk,
+                                       aBuffer, lda,
+                                       bBuffer, ldb,
+                                       *betap,
+                                       cp, ldc );
+
+                  bli_free_user(aBuffer);
+                  bli_free_user(bBuffer);
+                }
+                else
+                {
+                  // Neither A nor B is pre-packed.
+                  cblas_sgemm_compute( cblas_order,
+                                       cblas_transa,
+                                       cblas_transb,
+                                       mm,
+                                       nn,
+                                       kk,
+                                       ap, lda,
+                                       bp, ldb,
+                                       *betap,
+                                       cp, ldc );
+                }
+#else           // -- BLAS API --
+                float* aBuffer;
+                float* bBuffer;
+
+                if ( packA && !packB )
+                {
+                  // Only A is pre-packed.
+                  f77_bufSizeA = sgemm_pack_get_size_( &f77_identifierA,
+                                                       &mm,
+                                                       &nn,
+                                                       &kk );
+                  aBuffer = (float*) bli_malloc_user( f77_bufSizeA, &err );
+
+                  sgemm_pack_( &f77_identifierA,
+                               &f77_transa,
+                               &mm,
+                               &nn,
+                               &kk,
+                               alphap,
+                               ap,
+                               (f77_int*)&lda,
+                               aBuffer );
+
+                  sgemm_compute_( &f77_packed,
+                                  &f77_transb,
+                                  &mm,
+                                  &nn,
+                                  &kk,
+                                  aBuffer, (f77_int*)&lda,
+                                  bp, (f77_int*)&ldb,
+                                  betap,
+                                  cp, (f77_int*)&ldc );
+
+                  bli_free_user( aBuffer );
+                }
+                else if ( !packA && packB )
+                {
+                  // Only B is pre-packed.
+                  f77_bufSizeB = sgemm_pack_get_size_( &f77_identifierB,
+                                                       &mm,
+                                                       &nn,
+                                                       &kk );
+                  bBuffer = (float*) bli_malloc_user( f77_bufSizeB, &err );
+
+                  sgemm_pack_( &f77_identifierB,
+                               &f77_transb,
+                               &mm,
+                               &nn,
+                               &kk,
+                               alphap,
+                               bp,
+                               (f77_int*)&ldb,
+                               bBuffer );
+
+                  sgemm_compute_( &f77_transa,
+                                  &f77_packed,
+                                  &mm,
+                                  &nn,
+                                  &kk,
+                                  ap, (f77_int*)&lda,
+                                  bBuffer, (f77_int*)&ldb,
+                                  betap,
+                                  cp, (f77_int*)&ldc );
+
+                  bli_free_user( bBuffer );
+                }
+                else if ( packA && packB )
+                {
+                  // Both A & B are pre-packed.
+                  f77_bufSizeB = sgemm_pack_get_size_( &f77_identifierB,
+                                                       &mm,
+                                                       &nn,
+                                                       &kk );
+
+                  bBuffer = (float*) bli_malloc_user( f77_bufSizeB, &err );
+
+                  f77_bufSizeA = sgemm_pack_get_size_( &f77_identifierA,
+                                                       &mm,
+                                                       &nn,
+                                                       &kk );
+
+                  aBuffer = (float*) bli_malloc_user( f77_bufSizeA, &err );
+
+                  sgemm_pack_( &f77_identifierA,
+                               &f77_transa,
+                               &mm,
+                               &nn,
+                               &kk,
+                               alphap,
+                               ap,
+                               (f77_int*)&lda,
+                               aBuffer );
+
+                  sgemm_pack_( &f77_identifierB,
+                               &f77_transb,
+                               &mm,
+                               &nn,
+                               &kk,
+                               alphaonep,
+                               bp,
+                               (f77_int*)&ldb,
+                               bBuffer );
+
+                  sgemm_compute_( &f77_packed,
+                                  &f77_packed,
+                                  &mm,
+                                  &nn,
+                                  &kk,
+                                  aBuffer, (f77_int*)&lda,
+                                  bBuffer, (f77_int*)&ldb,
+                                  betap,
+                                  cp, (f77_int*)&ldc );
+
+                  bli_free_user(aBuffer);
+                  bli_free_user(bBuffer);
+                }
+                else
+                {
+                  // Neither A nor B is reordered.
+                  sgemm_compute_( &f77_transa,
+                                  &f77_transb,
+                                  &mm,
+                                  &nn,
+                                  &kk,
+                                  ap, (f77_int*)&lda,
+                                  bp, (f77_int*)&ldb,
+                                  betap,
+                                  cp, (f77_int*)&ldc );
+                }
+#endif
+            }
+            else if ( bli_is_double( dt ) )
+            {
+                f77_int  mm     = bli_obj_length( &c );
+                f77_int  kk     = bli_obj_width_after_trans( &a );
+                f77_int  nn     = bli_obj_width( &c );
+
+                double*  alphap = bli_obj_buffer( &alpha );
+                double*  alphaonep = bli_obj_buffer( &alpha_one );
+                double*  ap     = bli_obj_buffer( &a );
+                double*  bp     = bli_obj_buffer( &b );
+                double*  betap  = bli_obj_buffer( &beta );
+                double*  cp     = bli_obj_buffer( &c );
+
+#ifdef CBLAS
+                double* aBuffer;
+                double* bBuffer;
+
+                if ( packA && !packB )
+                {
+                  // Only A is pre-packed.
+                  bufSizeA = cblas_dgemm_pack_get_size( CblasAMatrix,
+                                                        mm,
+                                                        nn,
+                                                        kk );
+                  aBuffer = (double*) bli_malloc_user( bufSizeA, &err );
+
+                  cblas_dgemm_pack( cblas_order,
+                                    CblasAMatrix,
+                                    cblas_transa,
+                                    mm,
+                                    nn,
+                                    kk,
+                                    *alphap,
+                                    ap, lda,
+                                    aBuffer );
+
+                  cblas_dgemm_compute( cblas_order,
+                                       CblasPacked,
+                                       cblas_transb,
+                                       mm,
+                                       nn,
+                                       kk,
+                                       aBuffer, lda,
+                                       bp, ldb,
+                                       *betap,
+                                       cp, ldc );
+
+                  bli_free_user(aBuffer);
+                }
+                else if ( !packA && packB )
+                {
+                  // Only B is pre-packed.
+                  bufSizeB = cblas_dgemm_pack_get_size( CblasBMatrix,
+                                                        mm,
+                                                        nn,
+                                                        kk );
+
+                  cblas_dgemm_pack( cblas_order,
+                                    CblasBMatrix,
+                                    cblas_transb,
+                                    mm,
+                                    nn,
+                                    kk,
+                                    *alphap,
+                                    bp, ldb,
+                                    bBuffer );
+
+                  cblas_dgemm_compute( cblas_order,
+                                       cblas_transa,
+                                       CblasPacked,
+                                       mm,
+                                       nn,
+                                       kk,
+                                       ap, lda,
+                                       bBuffer, ldb,
+                                       *betap,
+                                       cp, ldc );
+
+                  bli_free_user(bBuffer);
+                }
+                else if ( packA && packB )
+                {
+                  // Both A & B are pre-packed.
+                  bufSizeA = cblas_dgemm_pack_get_size( CblasAMatrix,
+                                                        mm,
+                                                        nn,
+                                                        kk );
+                  aBuffer = (double*) bli_malloc_user( bufSizeA, &err );
+
+                  bufSizeB = cblas_dgemm_pack_get_size( CblasBMatrix,
+                                                        mm,
+                                                        nn,
+                                                        kk );
+                  bBuffer = (double*) bli_malloc_user( bufSizeB, &err );
+
+                  cblas_dgemm_pack( cblas_order,
+                                    CblasAMatrix,
+                                    cblas_transa,
+                                    mm,
+                                    nn,
+                                    kk,
+                                    *alphap,
+                                    ap, lda,
+                                    aBuffer );
+
+                  cblas_dgemm_pack( cblas_order,
+                                    CblasBMatrix,
+                                    cblas_transb,
+                                    mm,
+                                    nn,
+                                    kk,
+                                    *alphap,
+                                    bp, ldb,
+                                    bBuffer );
+
+                  cblas_dgemm_compute( cblas_order,
+                                       CblasPacked,
+                                       CblasPacked,
+                                       mm,
+                                       nn,
+                                       kk,
+                                       aBuffer, lda,
+                                       bBuffer, ldb,
+                                       *betap,
+                                       cp, ldc );
+
+                  bli_free_user(aBuffer);
+                  bli_free_user(bBuffer);
+                }
+                else
+                {
+                  // Neither A nor B is pre-packed.
+                  cblas_dgemm_compute( cblas_order,
+                                       cblas_transa,
+                                       cblas_transb,
+                                       mm,
+                                       nn,
+                                       kk,
+                                       ap, lda,
+                                       bp, ldb,
+                                       *betap,
+                                       cp, ldc );
+                }
+
+#else           // -- BLAS API --
+                double* aBuffer;
+                double* bBuffer;
+
+                if ( packA && !packB )
+                {
+                  // Only A is pre-packed.
+                  f77_bufSizeA = dgemm_pack_get_size_( &f77_identifierA,
+                                                       &mm,
+                                                       &nn,
+                                                       &kk );
+                  aBuffer = (double*) bli_malloc_user( f77_bufSizeA, &err );
+
+                  dgemm_pack_( &f77_identifierA,
+                               &f77_transa,
+                               &mm,
+                               &nn,
+                               &kk,
+                               alphap,
+                               ap,
+                               (f77_int*)&lda,
+                               aBuffer );
+
+                  dgemm_compute_( &f77_packed,
+                                  &f77_transb,
+                                  &mm,
+                                  &nn,
+                                  &kk,
+                                  aBuffer, (f77_int*)&lda,
+                                  bp, (f77_int*)&ldb,
+                                  betap,
+                                  cp, (f77_int*)&ldc );
+
+                  bli_free_user( aBuffer );
+                }
+                else if ( !packA && packB )
+                {
+                  // Only B is pre-packed.
+                  f77_bufSizeB = dgemm_pack_get_size_( &f77_identifierB,
+                                                       &mm,
+                                                       &nn,
+                                                       &kk );
+                  bBuffer = (double*) bli_malloc_user( f77_bufSizeB, &err );
+
+                  dgemm_pack_( &f77_identifierB,
+                               &f77_transb,
+                               &mm,
+                               &nn,
+                               &kk,
+                               alphap,
+                               bp,
+                               (f77_int*)&ldb,
+                               bBuffer );
+
+                  dgemm_compute_( &f77_transa,
+                                  &f77_packed,
+                                  &mm,
+                                  &nn,
+                                  &kk,
+                                  ap, (f77_int*)&lda,
+                                  bBuffer, (f77_int*)&ldb,
+                                  betap,
+                                  cp, (f77_int*)&ldc );
+
+                  bli_free_user( bBuffer );
+                }
+                else if ( packA && packB )
+                {
+                  // Both A & B are pre-packed.
+                  f77_bufSizeA = dgemm_pack_get_size_( &f77_identifierA,
+                                                       &mm,
+                                                       &nn,
+                                                       &kk );
+                  aBuffer = (double*) bli_malloc_user( f77_bufSizeA, &err );
+
+                  f77_bufSizeB = dgemm_pack_get_size_( &f77_identifierB,
+                                                       &mm,
+                                                       &nn,
+                                                       &kk );
+                  bBuffer = (double*) bli_malloc_user( f77_bufSizeB, &err );
+
+                  dgemm_pack_( &f77_identifierA,
+                               &f77_transa,
+                               &mm,
+                               &nn,
+                               &kk,
+                               alphap,
+                               ap,
+                               (f77_int*)&lda,
+                               aBuffer );
+
+                  dgemm_pack_( &f77_identifierB,
+                               &f77_transb,
+                               &mm,
+                               &nn,
+                               &kk,
+                               alphaonep,
+                               bp,
+                               (f77_int*)&ldb,
+                               bBuffer );
+
+                  dgemm_compute_( &f77_packed,
+                                  &f77_packed,
+                                  &mm,
+                                  &nn,
+                                  &kk,
+                                  aBuffer, (f77_int*)&lda,
+                                  bBuffer, (f77_int*)&ldb,
+                                  betap,
+                                  cp, (f77_int*)&ldc );
+
+                  bli_free_user(aBuffer);
+                  bli_free_user(bBuffer);
+                }
+                else
+                {
+                  // Neither A nor B is reordered.
+                  dgemm_compute_( &f77_transa,
+                                  &f77_transb,
+                                  &mm,
+                                  &nn,
+                                  &kk,
+                                  ap, (f77_int*)&lda,
+                                  bp, (f77_int*)&ldb,
+                                  betap,
+                                  cp, (f77_int*)&ldc );
+                }
+#endif
+            }
+#endif
+
+#ifdef PRINT
+            bli_printm( "c compute", &c, "%4.6f", "" );
+#endif
+
+            dtime_save = bli_clock_min_diff( dtime_save, dtime );
+        }
+
+        gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
+
+        if ( bli_is_complex( dt ) ) gflops *= 4.0;
+
+        printf( "data_%cgemm_%s", dt_ch, BLAS );
+
+        p_inc++;
+        printf("( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+               (unsigned long)(p_inc),
+               (unsigned long)m,
+               (unsigned long)n,
+               (unsigned long)k, gflops);
+
+        fprintf (fout, "%c %c %c %c %c %ld %ld %ld %lf %lf %ld %ld %lf %lf %ld %6.3f\n", \
+                 dt_ch, transA_c, transB_c, packA_c, packB_c, m, n, k, alpha_r, alpha_i, lda, ldb, beta_r, beta_i, ldc, gflops);
+
+        fflush(fout);
+
+        bli_obj_free( &alpha );
+        bli_obj_free( &beta );
+
+        bli_obj_free( &a );
+        bli_obj_free( &b );
+        bli_obj_free( &c );
+        bli_obj_free( &c_save );
+    }
+
+    //bli_finalize();
+    fclose(fin);
+    fclose(fout);
+
+    return 0;
+}
--- a/bench/inputgemmpackcompute.txt
+++ b/bench/inputgemmpackcompute.txt
@@ -0,0 +1,92 @@
+sgemm_ S N N P U 1 1 1 1 0 1 1 1 0 1
+sgemm_ S N N P U 2 2 2 1 0 2 2 1 0 2
+sgemm_ S N N P U 3 3 3 1 0 3 3 1 0 3
+sgemm_ S N N P U 4 4 4 1 0 4 4 1 0 4
+sgemm_ S N N P U 5 5 5 1 0 5 5 1 0 5
+sgemm_ S N N P U 6 6 6 1 0 6 6 1 0 6
+sgemm_ S N N P U 7 7 7 1 0 7 7 1 0 7
+sgemm_ S N N P U 8 8 8 1 0 8 8 1 0 8
+sgemm_ S N N P U 9 9 9 1 0 9 9 1 0 9
+sgemm_ S N N P U 10 10 10 1 0 10 10 1 0 10
+sgemm_ S N N P U 20 20 20 1 0 20 20 1 0 20
+sgemm_ S N N P U 30 30 30 1 0 30 30 1 0 30
+sgemm_ S N N P U 40 40 40 1 0 40 40 1 0 40
+sgemm_ S N N P U 50 50 50 1 0 50 50 1 0 50
+sgemm_ S N N P U 60 60 60 1 0 60 60 1 0 60
+sgemm_ S N N P U 70 70 70 1 0 70 70 1 0 70
+sgemm_ S N N P U 80 80 80 1 0 80 80 1 0 80
+sgemm_ S N N P U 90 90 90 1 0 90 90 1 0 90
+sgemm_ S N N P U 100 100 100 1 0 100 100 1 0 100
+sgemm_ S N N P U 200 200 200 1 0 200 200 1 0 200
+sgemm_ S N N P U 300 300 300 1 0 300 300 1 0 300
+sgemm_ S N N P U 400 400 400 1 0 400 400 1 0 400
+sgemm_ S N N P U 500 500 500 1 0 500 500 1 0 500
+dgemm_ D N N P U 1 1 1 1 0 1 1 1 0 1
+dgemm_ D N N P U 2 2 2 1 0 2 2 1 0 2
+dgemm_ D N N P U 3 3 3 1 0 3 3 1 0 3
+dgemm_ D N N P U 4 4 4 1 0 4 4 1 0 4
+dgemm_ D N N P U 5 5 5 1 0 5 5 1 0 5
+dgemm_ D N N P U 6 6 6 1 0 6 6 1 0 6
+dgemm_ D N N P U 7 7 7 1 0 7 7 1 0 7
+dgemm_ D N N P U 8 8 8 1 0 8 8 1 0 8
+dgemm_ D N N P U 9 9 9 1 0 9 9 1 0 9
+dgemm_ D N N P U 10 10 10 1 0 10 10 1 0 10
+dgemm_ D N N P U 20 20 20 1 0 20 20 1 0 20
+dgemm_ D N N P U 30 30 30 1 0 30 30 1 0 30
+dgemm_ D N N P U 40 40 40 1 0 40 40 1 0 40
+dgemm_ D N N P U 50 50 50 1 0 50 50 1 0 50
+dgemm_ D N N P U 60 60 60 1 0 60 60 1 0 60
+dgemm_ D N N P U 70 70 70 1 0 70 70 1 0 70
+dgemm_ D N N P U 80 80 80 1 0 80 80 1 0 80
+dgemm_ D N N P U 90 90 90 1 0 90 90 1 0 90
+dgemm_ D N N P U 100 100 100 1 0 100 100 1 0 100
+dgemm_ D N N P U 200 200 200 1 0 200 200 1 0 200
+dgemm_ D N N P U 300 300 300 1 0 300 300 1 0 300
+dgemm_ D N N P U 400 400 400 1 0 400 400 1 0 400
+dgemm_ D N N P U 500 500 500 1 0 500 500 1 0 500
+sgemm_ S N N U P 1 1 1 1 0 1 1 1 0 1
+sgemm_ S N N U P 2 2 2 1 0 2 2 1 0 2
+sgemm_ S N N U P 3 3 3 1 0 3 3 1 0 3
+sgemm_ S N N U P 4 4 4 1 0 4 4 1 0 4
+sgemm_ S N N U P 5 5 5 1 0 5 5 1 0 5
+sgemm_ S N N U P 6 6 6 1 0 6 6 1 0 6
+sgemm_ S N N U P 7 7 7 1 0 7 7 1 0 7
+sgemm_ S N N U P 8 8 8 1 0 8 8 1 0 8
+sgemm_ S N N U P 9 9 9 1 0 9 9 1 0 9
+sgemm_ S N N U P 10 10 10 1 0 10 10 1 0 10
+sgemm_ S N N U P 20 20 20 1 0 20 20 1 0 20
+sgemm_ S N N U P 30 30 30 1 0 30 30 1 0 30
+sgemm_ S N N U P 40 40 40 1 0 40 40 1 0 40
+sgemm_ S N N U P 50 50 50 1 0 50 50 1 0 50
+sgemm_ S N N U P 60 60 60 1 0 60 60 1 0 60
+sgemm_ S N N U P 70 70 70 1 0 70 70 1 0 70
+sgemm_ S N N U P 80 80 80 1 0 80 80 1 0 80
+sgemm_ S N N U P 90 90 90 1 0 90 90 1 0 90
+sgemm_ S N N U P 100 100 100 1 0 100 100 1 0 100
+sgemm_ S N N U P 200 200 200 1 0 200 200 1 0 200
+sgemm_ S N N U P 300 300 300 1 0 300 300 1 0 300
+sgemm_ S N N U P 400 400 400 1 0 400 400 1 0 400
+sgemm_ S N N U P 500 500 500 1 0 500 500 1 0 500
+dgemm_ D N N U P 1 1 1 1 0 1 1 1 0 1
+dgemm_ D N N U P 2 2 2 1 0 2 2 1 0 2
+dgemm_ D N N U P 3 3 3 1 0 3 3 1 0 3
+dgemm_ D N N U P 4 4 4 1 0 4 4 1 0 4
+dgemm_ D N N U P 5 5 5 1 0 5 5 1 0 5
+dgemm_ D N N U P 6 6 6 1 0 6 6 1 0 6
+dgemm_ D N N U P 7 7 7 1 0 7 7 1 0 7
+dgemm_ D N N U P 8 8 8 1 0 8 8 1 0 8
+dgemm_ D N N U P 9 9 9 1 0 9 9 1 0 9
+dgemm_ D N N U P 10 10 10 1 0 10 10 1 0 10
+dgemm_ D N N U P 20 20 20 1 0 20 20 1 0 20
+dgemm_ D N N U P 30 30 30 1 0 30 30 1 0 30
+dgemm_ D N N U P 40 40 40 1 0 40 40 1 0 40
+dgemm_ D N N U P 50 50 50 1 0 50 50 1 0 50
+dgemm_ D N N U P 60 60 60 1 0 60 60 1 0 60
+dgemm_ D N N U P 70 70 70 1 0 70 70 1 0 70
+dgemm_ D N N U P 80 80 80 1 0 80 80 1 0 80
+dgemm_ D N N U P 90 90 90 1 0 90 90 1 0 90
+dgemm_ D N N U P 100 100 100 1 0 100 100 1 0 100
+dgemm_ D N N U P 200 200 200 1 0 200 200 1 0 200
+dgemm_ D N N U P 300 300 300 1 0 300 300 1 0 300
+dgemm_ D N N U P 400 400 400 1 0 400 400 1 0 400
+dgemm_ D N N U P 500 500 500 1 0 500 500 1 0 500
--- a/frame/3/CMakeLists.txt
+++ b/frame/3/CMakeLists.txt
@@ -1,4 +1,4 @@
-##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.##
+##Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.##

 target_sources("${PROJECT_NAME}"
     PRIVATE
@@ -26,12 +26,13 @@ target_sources("${PROJECT_NAME}"
    ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ukr_oapi.c
    ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ukr_tapi.c
    ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_smart_threading.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute.c
    )
 # Select AMD specific sources for AMD configurations.
-if(${TARGET_ARCH} STREQUAL zen OR 
-   ${TARGET_ARCH} STREQUAL zen2 OR 
+if(${TARGET_ARCH} STREQUAL zen OR
+   ${TARGET_ARCH} STREQUAL zen2 OR
   ${TARGET_ARCH} STREQUAL zen3 OR
-   ${TARGET_ARCH} STREQUAL zen4 OR 
+   ${TARGET_ARCH} STREQUAL zen4 OR
   ${TARGET_ARCH} STREQUAL amdzen)
    target_sources("${PROJECT_NAME}"
    PRIVATE
--- a/frame/3/bli_l3.h
+++ b/frame/3/bli_l3.h
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020-22, Advanced Micro Devices, Inc.
+   Copyright (C) 2020-23, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -105,3 +105,6 @@

 // Smart Threading API's.
 #include "bli_l3_smart_threading.h"
+
+// BLAS Extension API - Compute
+#include "bli_l3_compute.h"
--- a/frame/3/bli_l3_compute.c
+++ b/frame/3/bli_l3_compute.c
@@ -0,0 +1,637 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+/*
+   bli_gemm_compute_init()
+
+   Object-level entry point of the GEMM compute operation. It supplies a
+   default context when none is given, works on a private copy of the
+   runtime, forces single-threaded execution and then hands
+   bli_gemm_compute() to the level-3 compute thread decorator.
+
+   Parameters:
+   - a, b, beta, c : operand objects; forwarded unchanged to the decorator.
+   - cntx          : context to use. If NULL, the context returned by
+                     bli_gks_query_cntx() is used instead.
+   - rntm          : runtime to use. The caller's rntm_t is only read: if
+                     NULL, a local runtime is initialized from the global
+                     settings, otherwise a local copy of *rntm is made.
+*/
+void bli_gemm_compute_init
+(
+    obj_t* a,
+    obj_t* b,
+    obj_t* beta,
+    obj_t* c,
+    cntx_t* cntx,
+    rntm_t* rntm
+)
+{
+    // Argument validation is not implemented yet.
+    if ( bli_error_checking_is_enabled() )
+    {
+        // @todo: Add call to error checking function here
+    }
+
+    // Fall back to the queried context when the caller passed none.
+    if ( !cntx ) cntx = bli_gks_query_cntx();
+
+    // Every runtime update below is applied to a private rntm_t, seeded
+    // either from the caller's runtime or from the global settings.
+    rntm_t rntm_local;
+    if ( rntm ) rntm_local = *rntm;
+    else        bli_rntm_init_from_global( &rntm_local );
+    rntm = &rntm_local;
+
+    // @todo: AOCL Dynamic yet to be implemented for pack-compute APIs.
+    // Once available (under AOCL_DYNAMIC), the optimum number of threads
+    // would be computed and stored in rntm at this point, e.g.:
+    //   bli_nthreads_optimum( a, b, c, BLIS_GEMM, rntm );
+
+    // Only single-threaded execution is supported, so pin the thread
+    // count to one before deriving the ways of parallelism.
+    bli_rntm_set_num_threads( 1, rntm );
+
+    const dim_t m = bli_obj_length( c );
+    const dim_t n = bli_obj_width( c );
+    const dim_t k = bli_obj_width( a );
+
+    bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );
+
+    // Run bli_gemm_compute() through the compute thread decorator.
+    bli_l3_compute_thread_decorator
+    (
+        bli_gemm_compute,
+        BLIS_GEMM,
+        a, b,
+        beta,
+        c,
+        cntx,
+        rntm
+    );
+}
+
+/*
+   bli_gemm_compute()
+
+   Object-level driver of the GEMM compute operation. It extracts the
+   dimensions, buffers and strides from the operand objects, decides
+   whether an operand that is not pre-packed has to be packed on-the-go,
+   records that decision in the runtime and dispatches to the typed
+   (s/d) gemm_compute variant.
+
+   Parameters:
+   - a, b   : input matrix objects; each may be marked as packed
+              (reordered) and/or transposed.
+   - beta   : scalar object; its buffer for the datatype of c is forwarded.
+   - c      : output matrix object; its datatype selects the typed variant.
+   - cntx   : context, queried for the kernel's storage preference.
+   - rntm   : runtime; its pack_a/pack_b flags are read and then overwritten
+              with the final on-the-go packing decision.
+   - thread : thrinfo_t node forwarded to the typed variant.
+
+   Returns:
+   - BLIS_SUCCESS once the typed variant has been invoked.
+   - BLIS_NOT_YET_IMPLEMENTED when the datatype of c is neither float nor
+     double; no computation is performed in that case.
+*/
+err_t bli_gemm_compute
+(
+    obj_t*     a,
+    obj_t*     b,
+    obj_t*     beta,
+    obj_t*     c,
+    cntx_t*    cntx,
+    rntm_t*    rntm,
+    thrinfo_t* thread
+)
+{
+    const num_t  dt     = bli_obj_dt( c );
+    const dim_t  m      = bli_obj_length( c );
+    const dim_t  n      = bli_obj_width( c );
+          dim_t  k;
+
+    void* restrict buf_a = bli_obj_buffer_at_off( a );
+          inc_t    rs_a;
+          inc_t    cs_a;
+
+    void* restrict buf_b = bli_obj_buffer_at_off( b );
+          inc_t    rs_b;
+          inc_t    cs_b;
+
+    // The storage combination of C, A and B is only used to query the
+    // kernel's row/column preference; the typed variant below is always
+    // invoked with BLIS_RRR.
+    stor3_t    stor_id  = bli_obj_stor3_from_strides( c, a, b );
+    const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
+
+    // packedX defines whether matrix X is pre-packed (reordered) or not.
+    const bool packeda = bli_obj_is_packed( a );
+    const bool packedb = bli_obj_is_packed( b );
+
+    // packX defines whether to pack matrix X on-the-go or not. Start from
+    // what the runtime requests; it may be switched on below.
+    bool packa = bli_rntm_pack_a( rntm );
+    bool packb = bli_rntm_pack_b( rntm );
+
+    const bool transa = bli_obj_has_trans( a );
+    const bool transb = bli_obj_has_trans( b );
+
+    // is_col_stored_a is TRUE only when A is column-stored and not
+    // transposed.
+    // NOTE(review): a row-stored, transposed A is not recognized here, so
+    // it is packed on-the-go by a column-preferring kernel - confirm that
+    // this conservative choice is intended.
+    const bool is_col_stored_a = bli_obj_is_col_stored( a ) && !transa;
+
+    // is_row_stored_b is TRUE only when B is row-stored and not
+    // transposed.
+    // NOTE(review): a column-stored, transposed B is not recognized here,
+    // so it is packed on-the-go by a row-preferring kernel - confirm that
+    // this conservative choice is intended.
+    const bool is_row_stored_b = bli_obj_is_row_stored( b ) && !transb;
+
+    // If kernel is row-preferred but B is not row-stored and unpacked,
+    // enable on-the-go packing of B.
+    // Else if kernel is col-preferred but A is not col-stored and unpacked,
+    // enable on-the-go packing of A.
+    if ( row_pref )
+    {
+        if ( !packedb && !is_row_stored_b ) packb = TRUE;
+    }
+    else // if ( col_pref )
+    {
+        if ( !packeda && !is_col_stored_a ) packa = TRUE;
+    }
+
+    // Fetch k and the strides of A, applying an implicit transposition
+    // (swapped dimension and strides) when A is marked as transposed.
+    if ( !transa )
+    {
+        k     = bli_obj_width( a );
+
+        rs_a  = bli_obj_row_stride( a );
+        cs_a  = bli_obj_col_stride( a );
+    }
+    else
+    {
+        k     = bli_obj_length( a );
+
+        rs_a  = bli_obj_col_stride( a );
+        cs_a  = bli_obj_row_stride( a );
+    }
+
+    // Same for the strides of B.
+    if ( !transb )
+    {
+        rs_b = bli_obj_row_stride( b );
+        cs_b = bli_obj_col_stride( b );
+    }
+    else
+    {
+        rs_b = bli_obj_col_stride( b );
+        cs_b = bli_obj_row_stride( b );
+    }
+
+    void* restrict buf_c    = bli_obj_buffer_at_off( c );
+    const inc_t    rs_c     = bli_obj_row_stride( c );
+    const inc_t    cs_c     = bli_obj_col_stride( c );
+
+    void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta );
+
+    // Setting the packing status in rntm.
+    bli_rntm_set_pack_a( packa, rntm );
+    bli_rntm_set_pack_b( packb, rntm );
+
+    if ( bli_is_float( dt ) )
+    {
+        PASTEMAC( s, gemm_compute )
+        (
+          packa,
+          packb,
+          packeda,
+          packedb,
+          m,
+          n,
+          k,
+          buf_a, rs_a, cs_a,
+          buf_b, rs_b, cs_b,
+          buf_beta,
+          buf_c, rs_c, cs_c,
+          BLIS_RRR,     // Using BLIS_RRR since we want to redirect to m kernels.
+          cntx,
+          rntm,
+          thread
+        );
+    }
+    else if ( bli_is_double( dt ) )
+    {
+        PASTEMAC( d, gemm_compute )
+        (
+          packa,
+          packb,
+          packeda,
+          packedb,
+          m,
+          n,
+          k,
+          buf_a, rs_a, cs_a,
+          buf_b, rs_b, cs_b,
+          buf_beta,
+          buf_c, rs_c, cs_c,
+          BLIS_RRR,     // Using BLIS_RRR since we want to redirect to m kernels.
+          cntx,
+          rntm,
+          thread
+        );
+    }
+    else
+    {
+        // No typed compute variant exists for other datatypes. Report that
+        // instead of returning success without having computed anything.
+        return BLIS_NOT_YET_IMPLEMENTED;
+    }
+
+    return BLIS_SUCCESS;
+}
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC( ch, varname ) \
+      ( \
+        bool             packa, \
+        bool             packb, \
+        bool             packeda, \
+        bool             packedb, \
+        dim_t            m, \
+        dim_t            n, \
+        dim_t            k, \
+        void*   restrict a, inc_t rs_a, inc_t cs_a, \
+        void*   restrict b, inc_t rs_b, inc_t cs_b, \
+        void*   restrict beta, \
+        void*   restrict c, inc_t rs_c, inc_t cs_c, \
+        stor3_t          stor_id, \
+        cntx_t* restrict cntx, \
+        rntm_t* restrict rntm, \
+        thrinfo_t* restrict thread \
+      ) \
+{ \
+    const num_t dt = PASTEMAC( ch, type ); \
+\
+    /* If m or n is zero, return immediately. */ \
+    if ( bli_zero_dim2( m, n ) ) return; \
+\
+    /* @todo Add early return for k < 1 or alpha = 0 here. */ \
+\
+    /* Query the context for various blocksizes. */ \
+    const dim_t NR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
+    const dim_t MR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
+    const dim_t NC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
+    const dim_t MC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
+    const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
+\
+    /* @note: Modifications of KC are just a part of optimizations.
+        Such optimizations have been removed for simplicity and will be a part
+        of the optimizations patch. */ \
+    dim_t KC; \
+    KC = KC0; \
+\
+    /* Query the maximum blocksize for NR, which implies a maximum blocksize
+       extension for the final iteration. */ \
+    const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \
+    const dim_t NRE = NRM - NR; \
+\
+    /* Compute partitioning step values for each matrix of each loop. */ \
+    const inc_t jcstep_c = cs_c; \
+    const inc_t jcstep_b = cs_b; \
+\
+    const inc_t jcstep_b_use = k; \
+\
+    const inc_t pcstep_a = cs_a; \
+    const inc_t pcstep_b = rs_b; \
+\
+    const inc_t icstep_c = rs_c; \
+    const inc_t icstep_a = rs_a; \
+\
+    const inc_t pcstep_a_use = ( ( m + MR - 1 ) / MR ) * MR; \
+\
+    const inc_t jrstep_c = cs_c * NR; \
+\
+    PASTECH(ch,gemmsup_ker_ft) \
+               gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
+\
+    ctype* restrict a_00       = a; \
+    ctype* restrict b_00       = b; \
+    ctype* restrict c_00       = c; \
+    ctype* restrict beta_cast  = beta; \
+\
+    /* Make local copies of beta and one scalars to prevent any unnecessary
+       sharing of cache lines between the cores' caches. */ \
+    ctype           beta_local = *beta_cast; \
+    ctype           one_local  = *PASTEMAC(ch,1); \
+\
+    auxinfo_t       aux; \
+    mem_t mem_a = BLIS_MEM_INITIALIZER; \
+    mem_t mem_b = BLIS_MEM_INITIALIZER; \
+\
+    /* Define an array of bszid_t ids, which will act as our substitute for
+       the cntl_t tree. */ \
+    /*                           5thloop  4thloop         packb  3rdloop         packa  2ndloop  1stloop  ukrloop */ \
+    bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC,               BLIS_MC,               BLIS_NR, BLIS_MR, BLIS_KR }; \
+    bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC,               BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
+    bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC,               BLIS_NR, BLIS_MR, BLIS_KR }; \
+    bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
+    bszid_t* restrict bszids; \
+\
+    /* Set the bszids pointer to the correct bszids array above based on which
+       matrices (if any) are being packed. */ \
+\
+    if ( packa ) { if ( packb ) bszids = bszids_packab; \
+                   else         bszids = bszids_packa; } \
+    else         { if ( packb ) bszids = bszids_packb; \
+                   else         bszids = bszids_nopack; } \
+\
+    /* Determine whether we are using more than one thread. */ \
+    const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \
+\
+    thrinfo_t* restrict thread_jc = NULL; \
+    thrinfo_t* restrict thread_pc = NULL; \
+    thrinfo_t* restrict thread_pb = NULL; \
+    thrinfo_t* restrict thread_ic = NULL; \
+    thrinfo_t* restrict thread_pa = NULL; \
+    thrinfo_t* restrict thread_jr = NULL; \
+\
+    /* Grow the thrinfo_t tree. */ \
+    bszid_t*   restrict bszids_jc = bszids; \
+                        thread_jc = thread; \
+    bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
+\
+    /* Compute the JC loop thread range for the current thread. */ \
+    dim_t jc_start, jc_end; \
+    bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
+    const dim_t n_local = jc_end - jc_start; \
+\
+    /* Compute number of primary and leftover components of the JC loop. */ \
+    /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
+    const dim_t jc_left =   n_local % NC; \
+\
+    /* Loop over the n dimension (NC rows/columns at a time). */ \
+    /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
+    for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
+    { \
+        /* Calculate the thread's current JC block dimension. */ \
+        const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
+        const inc_t pcstep_b_use = ( ( nc_cur + NR - 1 ) / NR ) * NR; \
+\
+        ctype* restrict b_jc = b_00 + jj * jcstep_b; \
+        ctype* restrict b_jc_use = b_00 + jj * jcstep_b_use; \
+        ctype* restrict c_jc = c_00 + jj * jcstep_c; \
+\
+        /* Grow the thrinfo_t tree. */ \
+        bszid_t*   restrict bszids_pc = &bszids_jc[1]; \
+                            thread_pc = bli_thrinfo_sub_node( thread_jc ); \
+        bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
+\
+        /* Compute the PC loop thread range for the current thread. */ \
+        const dim_t pc_start = 0, pc_end = k; \
+        const dim_t k_local = k; \
+\
+        /* Compute number of primary and leftover components of the PC loop. */ \
+        /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
+        const dim_t pc_left =   k_local % KC; \
+\
+        /* Loop over the k dimension (KC rows/columns at a time). */ \
+        /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \
+        for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
+        { \
+            /* Calculate the thread's current PC block dimension. */ \
+            const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
+            const inc_t icstep_a_use = kc_cur; \
+\
+            ctype* restrict a_pc = a_00 + pp * pcstep_a; \
+            ctype* restrict b_pc = b_jc + pp * pcstep_b; \
+            ctype* restrict b_pc_use; \
+            ctype* restrict a_pc_use = a_00 + pp * pcstep_a_use; \
+\
+            /* Only apply beta to the first iteration of the pc loop. */ \
+            ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
+\
+            ctype* b_use; \
+            inc_t  rs_b_use, cs_b_use, ps_b_use; \
+\
+            /* Set the bszid_t array and thrinfo_t pointer based on whether
+               we will be packing B. If we won't be packing B, we alias to
+               the _pc variables so that code further down can unconditionally
+               reference the _pb variables. Note that *if* we will be packing
+               B, the thrinfo_t node will have already been created by a
+               previous call to bli_thrinfo_grow(), since bszid values of
+               BLIS_NO_PART cause the tree to grow by two (e.g. to the next
+               bszid that is a normal bszid_t value). */ \
+            bszid_t*   restrict bszids_pb; \
+            if ( packb ) { bszids_pb = &bszids_pc[1]; \
+                           thread_pb = bli_thrinfo_sub_node( thread_pc ); } \
+            else         { bszids_pb = &bszids_pc[0]; \
+                           thread_pb = thread_pc; } \
+\
+            /* Determine the packing buffer and related parameters for matrix
+               B. (If B will not be packed, then b_use will be set to point to
+               b and the _b_use strides will be set accordingly.) Then call
+               the packm sup variant chooser, which will call the appropriate
+               implementation based on the schema deduced from the stor_id. */ \
+\
+            /* packedb == TRUE indicates that B is reordered thus, update the
+               necessary pointers.
+               Else, call packm routine to pack B on-the-go. */ \
+            if ( packedb ) \
+            { \
+                rs_b_use = NR; \
+                cs_b_use = 1; \
+                ps_b_use = kc_cur * NR; \
+                b_pc_use = b_jc_use + pp * pcstep_b_use; \
+            } else \
+            { \
+                PASTEMAC(ch,packm_sup_b) \
+                ( \
+                packb, \
+                BLIS_BUFFER_FOR_B_PANEL,  \
+                stor_id,                  \
+                BLIS_NO_TRANSPOSE, \
+                KC,     NC,       \
+                kc_cur, nc_cur, NR, \
+                &one_local, \
+                b_pc,   rs_b,      cs_b, \
+                &b_use, &rs_b_use, &cs_b_use, \
+                                   &ps_b_use, \
+                cntx, \
+                rntm, \
+                &mem_b, \
+                thread_pb  \
+                ); \
+\
+                b_pc_use = b_use; \
+            } \
+\
+            /* We don't need to embed the panel stride of B within the auxinfo_t
+               object because this variant iterates through B in the jr loop,
+               which occurs here, within the macrokernel, not within the
+               millikernel. */ \
+            bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
+\
+            /* Grow the thrinfo_t tree. */ \
+            bszid_t*   restrict bszids_ic = &bszids_pb[1]; \
+                                thread_ic = bli_thrinfo_sub_node( thread_pb ); \
+            bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
+\
+            /* Compute the IC loop thread range for the current thread. */ \
+            dim_t ic_start, ic_end; \
+            bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
+            const dim_t m_local = ic_end - ic_start; \
+\
+            /* Compute number of primary and leftover components of the IC loop. */ \
+            /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
+            const dim_t ic_left =   m_local % MC; \
+\
+            /* Loop over the m dimension (MC rows at a time). */ \
+            /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
+            for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
+            { \
+                /* Calculate the thread's current IC block dimension. */ \
+                const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
+\
+                ctype* restrict a_ic = a_pc + ii * icstep_a; \
+                ctype* restrict c_ic = c_jc + ii * icstep_c; \
+                ctype* restrict a_ic_use; \
+\
+                ctype* a_use; \
+                inc_t  rs_a_use, cs_a_use, ps_a_use; \
+\
+                /* Set the bszid_t array and thrinfo_t pointer based on whether
+                   we will be packing A. If we won't be packing A, we alias to
+                   the _ic variables so that code further down can unconditionally
+                   reference the _pa variables. Note that *if* we will be packing
+                   A, the thrinfo_t node will have already been created by a
+                   previous call to bli_thrinfo_grow(), since bszid values of
+                   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
+                   bszid that is a normal bszid_t value). */ \
+                bszid_t*   restrict bszids_pa; \
+                if ( packa ) { bszids_pa = &bszids_ic[1]; \
+                               thread_pa = bli_thrinfo_sub_node( thread_ic ); } \
+                else         { bszids_pa = &bszids_ic[0]; \
+                               thread_pa = thread_ic; } \
+\
+                /* Determine the packing buffer and related parameters for matrix
+                   A. (If A will not be packed, then a_use will be set to point to
+                   a and the _a_use strides will be set accordingly.) Then call
+                   the packm sup variant chooser, which will call the appropriate
+                   implementation based on the schema deduced from the stor_id. */ \
+                /* packeda == TRUE indicates that A is reordered thus, update the
+                   necessary pointers.
+                   Else, call packm routine to pack A on-the-go. */ \
+                if ( packeda ) \
+                { \
+                    rs_a_use = 1; \
+                    cs_a_use = MR; \
+                    ps_a_use = MR * kc_cur; \
+                    a_ic_use = a_pc_use + ii * icstep_a_use; \
+                } \
+                else \
+                { \
+                    PASTEMAC(ch,packm_sup_a) \
+                    ( \
+                    packa, \
+                    BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \
+                    stor_id,                 /* a "block of A."                  */ \
+                    BLIS_NO_TRANSPOSE, \
+                    MC,     KC,       /* This "block of A" is (at most) MC x KC. */ \
+                    mc_cur, kc_cur, MR, \
+                    &one_local, \
+                    a_ic,   rs_a,      cs_a, \
+                    &a_use, &rs_a_use, &cs_a_use, \
+                                       &ps_a_use, \
+                    cntx, \
+                    rntm, \
+                    &mem_a, \
+                    thread_pa  \
+                    ); \
+                    /* Alias a_use so that it's clear this is our current block of
+                     matrix A. */ \
+                    a_ic_use = a_use; \
+                } \
+\
+                /* Embed the panel stride of A within the auxinfo_t object. The
+                   millikernel will query and use this to iterate through
+                   micropanels of A (if needed). */ \
+                bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
+\
+                /* Grow the thrinfo_t tree. */ \
+                bszid_t*   restrict bszids_jr = &bszids_pa[1]; \
+                                    thread_jr = bli_thrinfo_sub_node( thread_pa ); \
+                bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
+\
+                /* Compute number of primary and leftover components of the JR loop. */ \
+                dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
+                dim_t jr_left =   nc_cur % NR; \
+\
+                /* An optimization: allow the last jr iteration to contain up to NRE
+                   columns of C and B. (If NRE > NR, the mkernel has agreed to handle
+                   these cases.) Note that this prevents us from declaring jr_iter and
+                   jr_left as const. NOTE: We forgo this optimization when packing B
+                   since packing an extended edge case is not yet supported. */ \
+                if ( !packb && !is_mt ) \
+                if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
+                { \
+                    jr_iter--; jr_left += NR; \
+                } \
+\
+                /* Compute the JR loop thread range for the current thread. */ \
+                dim_t jr_start, jr_end; \
+                bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
+\
+                /* Loop over the n dimension (NR columns at a time). */ \
+                /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
+                for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
+                { \
+                    const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
+\
+                    ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
+                    ctype* restrict c_jr = c_ic     + j * jrstep_c; \
+\
+                    /* Loop over the m dimension (MR rows at a time). */ \
+                    { \
+                        /* Invoke the gemmsup millikernel. */ \
+                        gemmsup_ker \
+                        ( \
+                          BLIS_NO_CONJUGATE, \
+                          BLIS_NO_CONJUGATE, \
+                          mc_cur, \
+                          nr_cur, \
+                          kc_cur, \
+                          &one_local, \
+                          a_ic_use, rs_a_use, cs_a_use, \
+                          b_jr,     rs_b_use, cs_b_use, \
+                          beta_use, \
+                          c_jr,     rs_c,     cs_c, \
+                          &aux, \
+                          cntx  \
+                        ); \
+                    } \
+                } \
+            } \
+\
+            /* NOTE: This barrier is only needed if we are packing B (since
+               that matrix is packed within the pc loop of this variant). */ \
+            if ( packb ) bli_thread_barrier( thread_pb ); \
+        } \
+    } \
+\
+    /* Release any memory that was acquired for packing matrices A and B. */ \
+    PASTEMAC(ch,packm_sup_finalize_mem_a) \
+    ( \
+      packa, \
+      rntm, \
+      &mem_a, \
+      thread_pa  \
+    ); \
+    PASTEMAC(ch,packm_sup_finalize_mem_b) \
+    ( \
+      packb, \
+      rntm, \
+      &mem_b, \
+      thread_pb  \
+    ); \
+}
+
+INSERT_GENTFUNC_BASIC0_SD( gemm_compute )
--- a/frame/3/bli_l3_compute.h
+++ b/frame/3/bli_l3_compute.h
@@ -0,0 +1,80 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+void bli_gemm_compute_init // object-based entry point; presumably what PASTEMAC0( gemm_compute_init ) in bla_gemm_compute.c resolves to
+(
+  obj_t*  a,    // A operand of C := A * B + beta * C
+  obj_t*  b,    // B operand
+  obj_t*  beta, // scalar applied to C (the BLAS layer wraps it as a 1x1 object)
+  obj_t*  c,    // C, the matrix that is updated
+  cntx_t* cntx, // context; the BLAS layer passes NULL
+  rntm_t* rntm  // runtime object; the BLAS layer passes NULL
+);
+
+err_t bli_gemm_compute // returns an err_t status; the codes it can return are not visible from this header
+(
+  obj_t*     a,      // A operand of C := A * B + beta * C
+  obj_t*     b,      // B operand
+  obj_t*     beta,   // scalar applied to C
+  obj_t*     c,      // C, the matrix that is updated
+  cntx_t*    cntx,   // context
+  rntm_t*    rntm,   // runtime object
+  thrinfo_t* thread  // thread info -- NOTE(review): who creates/owns it is not visible here; confirm
+);
+
+// Prototype the typed gemm_compute variants: BLAS-like interfaces with void
+// pointer operands and explicit row/column strides, stamped out via GENTPROT.
+#undef GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC( ch, varname ) \
+      ( \
+        bool             packa,   /* forwarded to packm_sup_a when A is not pre-packed */ \
+        bool             packb,   /* B-side counterpart of packa */ \
+        bool             packeda, /* TRUE: A already is a packed buffer */ \
+        bool             packedb, /* TRUE: B already is a packed buffer */ \
+        dim_t            m, \
+        dim_t            n, \
+        dim_t            k, \
+        void*   restrict a, inc_t rs_a, inc_t cs_a, /* A with its row/column strides */ \
+        void*   restrict b, inc_t rs_b, inc_t cs_b, /* B with its row/column strides */ \
+        void*   restrict beta, \
+        void*   restrict c, inc_t rs_c, inc_t cs_c, /* C with its row/column strides */ \
+        stor3_t          stor_id, /* storage id; the packm schema is deduced from it */ \
+        cntx_t* restrict cntx, \
+        rntm_t* restrict rntm, \
+        thrinfo_t* restrict thread \
+      );
+// NOTE(review): definitions use INSERT_GENTFUNC_BASIC0_SD( gemm_compute ); confirm both macros cover the same datatypes.
+INSERT_GENTPROT_BASIC0( gemm_compute )
--- a/frame/base/bli_param_map.h
+++ b/frame/base/bli_param_map.h
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -84,6 +84,7 @@ BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_t
 	if      ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE;
 	else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE;
 	else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE;
+	else if ( trans == 'p' || trans == 'P' ) *blis_trans = BLIS_PACKED;
 	else
 	{
 		// See comment for bli_param_map_netlib_to_blis_side() above.
--- a/frame/compat/CMakeLists.txt
+++ b/frame/compat/CMakeLists.txt
@@ -30,11 +30,14 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bla_omatcopy.c
 ${CMAKE_CURRENT_SOURCE_DIR}/bla_imatcopy.c
 ${CMAKE_CURRENT_SOURCE_DIR}/bla_omatcopy2.c
 ${CMAKE_CURRENT_SOURCE_DIR}/bla_omatadd.c
+${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack_get_size.c
+${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack.c
+${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_compute.c
 	)

 # Select AMD specific sources for AMD configurations.
-if(${TARGET_ARCH} STREQUAL zen OR 
-${TARGET_ARCH} STREQUAL zen2 OR 
+if(${TARGET_ARCH} STREQUAL zen OR
+${TARGET_ARCH} STREQUAL zen2 OR
 ${TARGET_ARCH} STREQUAL zen3 OR
 ${TARGET_ARCH} STREQUAL zen4 OR
 ${TARGET_ARCH} STREQUAL amdzen)
@@ -49,8 +52,6 @@ ${TARGET_ARCH} STREQUAL amdzen)
        ${CMAKE_CURRENT_SOURCE_DIR}/bla_scal_amd.c
        ${CMAKE_CURRENT_SOURCE_DIR}/bla_swap_amd.c
        ${CMAKE_CURRENT_SOURCE_DIR}/bla_trsm_amd.c
-        ${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack_get_size.c
-        ${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack.c
        )
 else()
        target_sources("${PROJECT_NAME}"
--- a/frame/compat/bla_gemm_compute.c
+++ b/frame/compat/bla_gemm_compute.c
@@ -0,0 +1,285 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
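+// BLAS Extension APIs
+/* ?gemm_compute.c */
+/* BLAS interface to compute the matrix-matrix product C := A * B + beta * C, */
+/* where A and/or B may be a buffer packed beforehand (trans character 'P'). */
+/* Datatypes: s and d (only single and double precision are supported). */
+/* Output: C is overwritten with the gemm result. */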
+
+#include "blis.h"
+
+/* sgemm_compute_blis_impl: single-precision driver for the pack/compute GEMM
+   extension, C := A * B + beta * C. Every matrix is described by a row stride
+   and a column stride. A trans character of 'P'/'p' maps to BLIS_PACKED in
+   bli_param_map_netlib_to_blis_trans(). */
+void sgemm_compute_blis_impl
+(
+    const f77_char* transa,
+    const f77_char* transb,
+    const f77_int*  m,
+    const f77_int*  n,
+    const f77_int*  k,
+    const float*    a, const f77_int* rs_a, const f77_int* cs_a,
+    const float*    b, const f77_int* rs_b, const f77_int* cs_b,
+    const float*    beta,
+          float*    c, const f77_int* rs_c, const f77_int* cs_c
+)
+{
+    /* Bring BLIS up before touching any other framework routine. */
+    bli_init_auto();
+
+    // @todo: Add AOCL DTL trace and input logging for this API.
+
+    /* BLAS-style argument validation. For A and B the leading dimension handed
+       to the checker is the row stride when that is not 1, else the column
+       stride. */
+    PASTEBLACHK(gemm_compute)
+    (
+      MKSTR(s), MKSTR(gemm),
+      transa, transb,
+      m, n, k,
+      ( ( *rs_a != 1 ) ? rs_a : cs_a ),
+      ( ( *rs_b != 1 ) ? rs_b : cs_b ),
+      rs_c, cs_c
+    );
+
+    /* An empty C (m == 0 or n == 0) leaves nothing to compute. */
+    if ( *m == 0 || *n == 0 )
+    {
+        bli_finalize_auto();
+        return;
+    }
+
+    /* Translate the BLAS trans characters into trans_t values. */
+    trans_t trans_a_blis;
+    trans_t trans_b_blis;
+    bli_param_map_netlib_to_blis_trans( *transa, &trans_a_blis );
+    bli_param_map_netlib_to_blis_trans( *transb, &trans_b_blis );
+
+    /* Carry the BLAS integer dimensions over into dim_t. */
+    dim_t dim_m, dim_n, dim_k;
+    bli_convert_blas_dim1( *m, dim_m );
+    bli_convert_blas_dim1( *n, dim_n );
+    bli_convert_blas_dim1( *k, dim_k );
+
+    const num_t dt = BLIS_FLOAT;
+
+    /* Let bli_set_dims_with_trans() derive the dimensions recorded on the A
+       and B objects from (m, k) and (k, n) and the respective trans value. */
+    dim_t rows_a, cols_a;
+    dim_t rows_b, cols_b;
+    bli_set_dims_with_trans( trans_a_blis, dim_m, dim_k, &rows_a, &cols_a );
+    bli_set_dims_with_trans( trans_b_blis, dim_k, dim_n, &rows_b, &cols_b );
+
+    /* Wrap beta in a 1x1 object. */
+    obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
+    bli_obj_init_finish_1x1( dt, (float*)beta, &betao );
+
+    /* Wrap the A, B and C buffers together with their strides. */
+    obj_t ao = BLIS_OBJECT_INITIALIZER;
+    obj_t bo = BLIS_OBJECT_INITIALIZER;
+    obj_t co = BLIS_OBJECT_INITIALIZER;
+    bli_obj_init_finish( dt, rows_a, cols_a, (float*)a, *rs_a, *cs_a, &ao );
+    bli_obj_init_finish( dt, rows_b, cols_b, (float*)b, *rs_b, *cs_b, &bo );
+    bli_obj_init_finish( dt, dim_m,  dim_n,  (float*)c, *rs_c, *cs_c, &co );
+
+    /* Record the requested trans value on A and B. */
+    bli_obj_set_conjtrans( trans_a_blis, &ao );
+    bli_obj_set_conjtrans( trans_b_blis, &bo );
+
+    /* Hand over to the object-based implementation; NULL is passed for both
+       the context and the runtime arguments. */
+    PASTEMAC0( gemm_compute_init )
+    (
+      &ao, &bo,
+      &betao,
+      &co,
+      NULL, NULL
+    );
+
+    /* Tear BLIS down again (mirrors bli_init_auto() above). */
+    bli_finalize_auto();
+}
+
+#ifdef BLIS_ENABLE_BLAS
+/* sgemm_compute_: BLAS-style entry point taking one leading dimension per
+   matrix. Marked BLIS_EXPORT_BLAS so the definition agrees with its prototype
+   in bla_gemm_compute.h and with the definition of dgemm_compute_. */
+BLIS_EXPORT_BLAS void sgemm_compute_
+(
+    const f77_char* transa,
+    const f77_char* transb,
+    const f77_int*  m,
+    const f77_int*  n,
+    const f77_int*  k,
+    const float*    a, const f77_int* lda,
+    const float*    b, const f77_int* ldb,
+    const float*    beta,
+          float*    c, const f77_int* ldc
+)
+{
+    /* Unit row stride for A, B and C; lda/ldb/ldc act as the column strides. */
+    f77_int rs_a = 1;
+    f77_int rs_b = 1;
+    f77_int rs_c = 1;
+    sgemm_compute_blis_impl( transa, transb, m, n, k,
+                             a, &rs_a, lda,
+                             b, &rs_b, ldb,
+                             beta,
+                             c, &rs_c, ldc );
+}
+#endif
+
+/* dgemm_compute_blis_impl: double-precision driver for the pack/compute GEMM
+   extension, C := A * B + beta * C. Every matrix is described by a row stride
+   and a column stride. A trans character of 'P'/'p' maps to BLIS_PACKED in
+   bli_param_map_netlib_to_blis_trans(). */
+void dgemm_compute_blis_impl
+(
+    const f77_char* transa,
+    const f77_char* transb,
+    const f77_int*  m,
+    const f77_int*  n,
+    const f77_int*  k,
+    const double*   a, const f77_int* rs_a, const f77_int* cs_a,
+    const double*   b, const f77_int* rs_b, const f77_int* cs_b,
+    const double*   beta,
+          double*   c, const f77_int* rs_c, const f77_int* cs_c
+)
+{
+    /* Bring BLIS up before touching any other framework routine. */
+    bli_init_auto();
+
+    // @todo: Add AOCL DTL trace and input logging for this API.
+
+    /* BLAS-style argument validation. For A and B the leading dimension handed
+       to the checker is the row stride when that is not 1, else the column
+       stride. */
+    PASTEBLACHK(gemm_compute)
+    (
+      MKSTR(d), MKSTR(gemm),
+      transa, transb,
+      m, n, k,
+      ( ( *rs_a != 1 ) ? rs_a : cs_a ),
+      ( ( *rs_b != 1 ) ? rs_b : cs_b ),
+      rs_c, cs_c
+    );
+
+    /* An empty C (m == 0 or n == 0) leaves nothing to compute. */
+    if ( *m == 0 || *n == 0 )
+    {
+        bli_finalize_auto();
+        return;
+    }
+
+    /* Translate the BLAS trans characters into trans_t values. */
+    trans_t trans_a_blis;
+    trans_t trans_b_blis;
+    bli_param_map_netlib_to_blis_trans( *transa, &trans_a_blis );
+    bli_param_map_netlib_to_blis_trans( *transb, &trans_b_blis );
+
+    /* Carry the BLAS integer dimensions over into dim_t. */
+    dim_t dim_m, dim_n, dim_k;
+    bli_convert_blas_dim1( *m, dim_m );
+    bli_convert_blas_dim1( *n, dim_n );
+    bli_convert_blas_dim1( *k, dim_k );
+
+    const num_t dt = BLIS_DOUBLE;
+
+    /* Let bli_set_dims_with_trans() derive the dimensions recorded on the A
+       and B objects from (m, k) and (k, n) and the respective trans value. */
+    dim_t rows_a, cols_a;
+    dim_t rows_b, cols_b;
+    bli_set_dims_with_trans( trans_a_blis, dim_m, dim_k, &rows_a, &cols_a );
+    bli_set_dims_with_trans( trans_b_blis, dim_k, dim_n, &rows_b, &cols_b );
+
+    /* Wrap beta in a 1x1 object. */
+    obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
+    bli_obj_init_finish_1x1( dt, (double*)beta, &betao );
+
+    /* Wrap the A, B and C buffers together with their strides. */
+    obj_t ao = BLIS_OBJECT_INITIALIZER;
+    obj_t bo = BLIS_OBJECT_INITIALIZER;
+    obj_t co = BLIS_OBJECT_INITIALIZER;
+    bli_obj_init_finish( dt, rows_a, cols_a, (double*)a, *rs_a, *cs_a, &ao );
+    bli_obj_init_finish( dt, rows_b, cols_b, (double*)b, *rs_b, *cs_b, &bo );
+    bli_obj_init_finish( dt, dim_m,  dim_n,  (double*)c, *rs_c, *cs_c, &co );
+
+    /* Record the requested trans value on A and B. */
+    bli_obj_set_conjtrans( trans_a_blis, &ao );
+    bli_obj_set_conjtrans( trans_b_blis, &bo );
+
+    /* Hand over to the object-based implementation (cntx and rntm are NULL). */
+    PASTEMAC0( gemm_compute_init )
+    (
+      &ao, &bo,
+      &betao,
+      &co,
+      NULL, NULL
+    );
+
+    /* Tear BLIS down again (mirrors bli_init_auto() above). */
+    bli_finalize_auto();
+}
+
+#ifdef BLIS_ENABLE_BLAS
+/* dgemm_compute_: BLAS-style entry point taking one leading dimension per
+   matrix. Forwards to dgemm_compute_blis_impl() with a unit row stride. */
+BLIS_EXPORT_BLAS void dgemm_compute_
+(
+    const f77_char* transa,
+    const f77_char* transb,
+    const f77_int*  m,
+    const f77_int*  n,
+    const f77_int*  k,
+    const double*   a, const f77_int* lda,
+    const double*   b, const f77_int* ldb,
+    const double*   beta,
+          double*   c, const f77_int* ldc
+)
+{
+    /* Unit row stride for A, B and C; lda/ldb/ldc become the column strides. */
+    const f77_int unit_rs = 1;
+    dgemm_compute_blis_impl
+    (
+      transa, transb, m, n, k,
+      a, &unit_rs, lda,
+      b, &unit_rs, ldb,
+      beta,
+      c, &unit_rs, ldc
+    );
+}
+#endif
--- a/frame/compat/bla_gemm_compute.h
+++ b/frame/compat/bla_gemm_compute.h
@@ -0,0 +1,72 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// BLAS Extension APIs
+/* ?gemm_compute.h */
+/* Prototypes for the BLAS interface that computes C := A * B + beta * C */
+/* with A and/or B optionally packed beforehand. */
+/* Datatypes: s and d (only single and double precision are supported). */
+/* Two prototypes per datatype: a leading-dimension form, declared only inside */
+/* IF_BLIS_ENABLE_BLAS(), and a form with explicit row and column strides. */
+#undef  GENTPROTRO
+#define GENTPROTRO( ftype, ch, blasname ) \
+\
+IF_BLIS_ENABLE_BLAS(\
+BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) /* leading-dimension form */ \
+     ( \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*   a, const f77_int* lda, \
+       const ftype*   b, const f77_int* ldb, \
+       const ftype*   beta, \
+             ftype*   c, const f77_int* ldc \
+     ); \
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) /* row/column-stride form */ \
+     ( \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*   a, const f77_int* rs_a, const f77_int* cs_a, \
+       const ftype*   b, const f77_int* rs_b, const f77_int* cs_b, \
+       const ftype*   beta, \
+             ftype*   c, const f77_int* rs_c, const f77_int* cs_c \
+     );
+
+INSERT_GENTPROTRO_BLAS( gemm_compute )
--- a/frame/compat/bli_blas.h
+++ b/frame/compat/bli_blas.h
@@ -183,6 +183,7 @@
 #include "bla_trmm.h"
 #include "bla_trsm.h"
 #include "bla_gemmt.h"
+#include "bla_gemm_compute.h"

 #include "bla_gemm_check.h"
 #include "bla_hemm_check.h"
@@ -194,6 +195,7 @@
 #include "bla_trmm_check.h"
 #include "bla_trsm_check.h"
 #include "bla_gemmt_check.h"
+#include "bla_gemm_compute_check.h"

 // -- Batch Extension prototypes --
 #include "bla_gemm_batch.h"
--- a/frame/compat/cblas/src/cblas.h
+++ b/frame/compat/cblas/src/cblas.h
@@ -48,6 +48,8 @@ enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
 enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
 enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
 enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
+enum CBLAS_STORAGE {CblasPacked=151}; /* TransA/TransB value for cblas_?gemm_compute: that operand is a packed buffer */
+enum CBLAS_IDENTIFIER {CblasAMatrix=161, CblasBMatrix=162}; /* names the GEMM operand for cblas_?gemm_pack and cblas_?gemm_pack_get_size */

 #ifdef __cplusplus
 extern "C" {
@@ -993,6 +995,190 @@ BLIS_EXPORT_BLAS f77_int cblas_idamin(f77_int N, const double *X, f77_int incX);
 BLIS_EXPORT_BLAS f77_int cblas_icamin(f77_int N, const void   *X, f77_int incX);
 BLIS_EXPORT_BLAS f77_int cblas_izamin(f77_int N, const void   *X, f77_int incX);

+
+// -- PACK COMPUTE APIs --
+/** \addtogroup INTERFACE CBLAS INTERFACE
+ *  @{
+ */
+
+/**
+* cblas_sgemm_pack_get_size calculates and returns the number of bytes necessary
+* to store the specified matrix after packing.
+*
+* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix.
+* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
+* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
+* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
+* @return The size in bytes required to store the specified matrix after packing.
+*/
+BLIS_EXPORT_BLAS f77_int cblas_sgemm_pack_get_size(enum CBLAS_IDENTIFIER Identifier,
+                const f77_int M, const f77_int N, const f77_int K);
+
+/**
+* cblas_dgemm_pack_get_size calculates and returns the number of bytes necessary
+* to store the specified matrix after packing.
+*
+* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix.
+* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
+* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
+* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
+* @return The size in bytes required to store the specified matrix after packing.
+*/
+BLIS_EXPORT_BLAS f77_int cblas_dgemm_pack_get_size(enum CBLAS_IDENTIFIER Identifier,
+                const f77_int M, const f77_int N, const f77_int K);
+
+/**
+* cblas_sgemm_pack scales by alpha and packs the specified matrix into the
+* allocated buffer. It is imperative to allocate a buffer of type float and size
+* as returned by the cblas_sgemm_pack_get_size() before invoking this routine.
+*
+* @note If both the matrices are to be packed, the user must ensure that only
+* one matrix is packed with the scalar alpha and the other with a unit-scalar.
+*
+* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor.
+* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix.
+* @param[in] Trans Specifies the form of Mat(X) used in the matrix multiplication:
+* if trans = CblasNoTrans, then Mat(X) = X;
+* if trans = CblasTrans, then Mat(X) = \f$X^T\f$;
+* if trans = CblasConjTrans, then Mat(X) = \f$X^H\f$.
+* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
+* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
+* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
+* @param[in] alpha Specifies the scalar alpha.
+* @param[in] src The matrix to be packed.
+* @param[in] ld Specifies the leading dimension of the matrix to be packed.
+* @param[out] dest The buffer to store the scaled and packed matrix.
+* @return None
+*/
+BLIS_EXPORT_BLAS void cblas_sgemm_pack(enum CBLAS_ORDER Order,
+                 enum CBLAS_IDENTIFIER Identifier, enum CBLAS_TRANSPOSE Trans,
+                 const f77_int M, const f77_int N, const f77_int K,
+                 const float alpha, const float *src, const f77_int ld,
+                 float* dest );
+
+/**
+* cblas_dgemm_pack scales by alpha and packs the specified matrix into the
+* allocated buffer. It is imperative to allocate a buffer of type double and
+* size as returned by the cblas_dgemm_pack_get_size() before invoking this
+* routine.
+*
+* @note If both the matrices are to be packed, the user must ensure that only
+* one matrix is packed with the scalar alpha and the other with a unit-scalar.
+*
+* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor.
+* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix.
+* @param[in] Trans Specifies the form of Mat(X) used in the matrix multiplication:
+* if trans = CblasNoTrans, then Mat(X) = X;
+* if trans = CblasTrans, then Mat(X) = \f$X^T\f$;
+* if trans = CblasConjTrans, then Mat(X) = \f$X^H\f$.
+* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
+* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
+* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
+* @param[in] alpha Specifies the scalar alpha.
+* @param[in] src The matrix to be packed.
+* @param[in] ld Specifies the leading dimension of the matrix to be packed.
+* @param[out] dest The buffer to store the scaled and packed matrix.
+* @return None
+*/
+BLIS_EXPORT_BLAS void cblas_dgemm_pack(enum CBLAS_ORDER Order,
+                 enum CBLAS_IDENTIFIER Identifier, enum CBLAS_TRANSPOSE Trans,
+                 const f77_int M, const f77_int N, const f77_int K,
+                 const double alpha, const double *src, const f77_int ld,
+                 double* dest );
+
+/**
+* cblas_sgemm_compute computes the matrix-matrix product where one or both the
+* input matrices are packed and adds this to the scalar-matrix product. This
+* operation is defined as:
+* C := Mat(A) * Mat(B) + beta*C,
+* where,
+* Mat(X) is one of Mat(X) = X, or Mat(X) = \f$X^T\f$, or Mat(X) = \f$X^H\f$,
+* beta is a scalar,
+* A, B and C are matrices:
+* Mat(A) is an mxk matrix, or a packed matrix buffer,
+* Mat(B) is a kxn matrix, or a packed matrix buffer,
+* C is an mxn matrix.
+*
+* @note In case both the matrices are to be packed, the user must ensure that
+* only one matrix is packed with alpha scalar and the other with a unit-scalar,
+* during the packing process.
+*
+* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor.
+* @param[in] TransA Specifies the form of Mat(A) used in the matrix multiplication:
+* if transa = CblasNoTrans, then Mat(A) = A;
+* if transa = CblasTrans, then Mat(A) = \f$A^T\f$;
+* if transa = CblasConjTrans, then Mat(A) = \f$A^H\f$;
+* if transa = CblasPacked, then A matrix is packed and lda is ignored.
+* @param[in] TransB Specifies the form of Mat(B) used in the matrix multiplication:
+* if transb = CblasNoTrans, then Mat(B) = B;
+* if transb = CblasTrans, then Mat(B) = \f$B^T\f$;
+* if transb = CblasConjTrans, then Mat(B) = \f$B^H\f$;
+* if transb = CblasPacked, then B matrix is packed and ldb is ignored.
+* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
+* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
+* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
+* @param[in] A  The array is float matrix A or a buffer with packed matrix A.
+* @param[in] lda Specifies the leading dimension of A.
+* @param[in] B The array is float matrix B or a buffer with packed matrix B.
+* @param[in] ldb Specifies the leading dimension of B.
+* @param[in] beta Specifies the scalar beta.
+* @param[in,out] C The array is float matrix C.
+* @param[in] ldc Specifies the leading dimension of C.
+* @return None
+*/
+BLIS_EXPORT_BLAS void cblas_sgemm_compute(enum CBLAS_ORDER Order,
+                 f77_int TransA, f77_int TransB,
+                 const f77_int M, const f77_int N, const f77_int K,
+                 const float* A, f77_int lda, const float* B, f77_int ldb,
+                 float beta, float* C, f77_int ldc);
+
+/**
+* cblas_dgemm_compute computes the matrix-matrix product where one or both the
+* input matrices are packed and adds this to the scalar-matrix product. This
+* operation is defined as:
+* C := Mat(A) * Mat(B) + beta*C,
+* where,
+* Mat(X) is one of Mat(X) = X, or Mat(X) = \f$X^T\f$, or Mat(X) = \f$X^H\f$,
+* beta is a scalar,
+* A, B and C are matrices:
+* Mat(A) is an mxk matrix, or a packed matrix buffer,
+* Mat(B) is a kxn matrix, or a packed matrix buffer,
+* C is an mxn matrix.
+*
+* @note In case both the matrices are to be packed, the user must ensure that
+* only one matrix is packed with alpha scalar and the other with a unit-scalar,
+* during the packing process.
+*
+* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor.
+* @param[in] TransA Specifies the form of Mat(A) used in the matrix multiplication:
+* if transa = CblasNoTrans, then Mat(A) = A;
+* if transa = CblasTrans, then Mat(A) = \f$A^T\f$;
+* if transa = CblasConjTrans, then Mat(A) = \f$A^H\f$;
+* if transa = CblasPacked, then A matrix is packed and lda is ignored.
+* @param[in] TransB Specifies the form of Mat(B) used in the matrix multiplication:
+* if transb = CblasNoTrans, then Mat(B) = B;
+* if transb = CblasTrans, then Mat(B) = \f$B^T\f$;
+* if transb = CblasConjTrans, then Mat(B) = \f$B^H\f$;
+* if transb = CblasPacked, then B matrix is packed and ldb is ignored.
+* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
+* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
+* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
+* @param[in] A  The array is double matrix A or a buffer with packed matrix A.
+* @param[in] lda Specifies the leading dimension of A.
+* @param[in] B The array is double matrix B or a buffer with packed matrix B.
+* @param[in] ldb Specifies the leading dimension of B.
+* @param[in] beta Specifies the scalar beta.
+* @param[in,out] C The array is double matrix C.
+* @param[in] ldc Specifies the leading dimension of C.
+* @return None
+*/
+BLIS_EXPORT_BLAS void cblas_dgemm_compute(enum CBLAS_ORDER Order,
+                 f77_int TransA, f77_int TransB,
+                 const f77_int M, const f77_int N, const f77_int K,
+                 const double* A, f77_int lda, const double* B, f77_int ldb,
+                 double beta, double* C, f77_int ldc);
+/** @}*/
+
 #ifdef __cplusplus
 }
 #endif
--- a/frame/compat/cblas/src/cblas_dgemm_compute.c
+++ b/frame/compat/cblas/src/cblas_dgemm_compute.c
@@ -0,0 +1,172 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+
+#include "cblas.h"
+#include "cblas_f77.h"
+
+/*
+ * CBLAS entry point for dgemm_compute: C := Mat(A) * Mat(B) + beta * C, where
+ * A and/or B may be a previously packed buffer (TransA/TransB == CblasPacked).
+ *
+ * CblasColMajor: trans flags are forwarded as given; C gets (rs, cs) = (1, ldc).
+ * CblasRowMajor: trans flags of unpacked operands are toggled (N <-> T) and
+ *                C gets (rs, cs) = (ldc, 1). CblasConjTrans acts as CblasTrans.
+ *
+ * Illegal Order/TransA/TransB values are reported through cblas_xerbla().
+ */
+BLIS_EXPORT_BLAS void cblas_dgemm_compute( enum  CBLAS_ORDER Order,
+                                                 f77_int TransA,
+                                                 f77_int TransB,
+                                           const f77_int M, const f77_int N,
+                                           const f77_int K,
+                                           const double* A,       f77_int lda,
+                                           const double* B,       f77_int ldb,
+                                                 double  beta,
+                                                 double* C,       f77_int ldc )
+{
+    char TA, TB;
+#ifdef F77_CHAR
+    F77_CHAR F77_TA, F77_TB;
+#else
+    #define F77_TA &TA
+    #define F77_TB &TB
+#endif
+
+#ifdef F77_INT
+    F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb;
+    F77_INT F77_ldc=ldc;
+#else
+    #define F77_M M
+    #define F77_N N
+    #define F77_K K
+    #define F77_lda lda
+    #define F77_ldb ldb
+    #define F77_ldc ldc
+#endif
+
+    extern int CBLAS_CallFromC;
+    extern int RowMajorStrg;
+    RowMajorStrg = 0;
+    CBLAS_CallFromC = 1;
+
+    if ( Order == CblasColMajor )
+    {
+        if      ( TransA == CblasTrans )     TA='T';
+        else if ( TransA == CblasConjTrans ) TA='T';
+        else if ( TransA == CblasNoTrans )   TA='N';
+        else if ( TransA == CblasPacked )    TA='P';
+        else
+        {
+            cblas_xerbla(2, "cblas_dgemm_compute",
+                            "Illegal TransA setting, %d\n", TransA);
+            goto cleanup;
+        }
+
+        if      ( TransB == CblasTrans )     TB='T';
+        else if ( TransB == CblasConjTrans ) TB='T';
+        else if ( TransB == CblasNoTrans )   TB='N';
+        else if ( TransB == CblasPacked )    TB='P';
+        else
+        {
+            cblas_xerbla(3, "cblas_dgemm_compute",
+                            "Illegal TransB setting, %d\n", TransB);
+            goto cleanup;
+        }
+
+#ifdef F77_CHAR
+        F77_TA = C2F_CHAR(&TA);
+        F77_TB = C2F_CHAR(&TB);
+#endif
+
+        f77_int rs_a = 1;
+        f77_int rs_b = 1;
+        f77_int rs_c = 1;
+
+        F77_dgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda,
+                           B, &rs_b, &F77_ldb, &beta, C, &rs_c, &F77_ldc);
+    }
+    else if ( Order == CblasRowMajor )
+    {
+        RowMajorStrg = 1;
+
+        if      ( TransA == CblasPacked )    TA='P';
+        else if ( TransA == CblasTrans )     TA='N';
+        else if ( TransA == CblasNoTrans )   TA='T';
+        else if ( TransA == CblasConjTrans ) TA='N';
+        else
+        {
+            cblas_xerbla(2, "cblas_dgemm_compute",
+                            "Illegal TransA setting, %d\n", TransA);
+            goto cleanup;
+        }
+
+        if      ( TransB == CblasPacked )    TB='P';
+        else if ( TransB == CblasTrans )     TB='N';
+        else if ( TransB == CblasNoTrans )   TB='T';
+        else if ( TransB == CblasConjTrans ) TB='N';
+        else
+        {
+            cblas_xerbla(3, "cblas_dgemm_compute",
+                            "Illegal TransB setting, %d\n", TransB);
+            goto cleanup;
+        }
+
+#ifdef F77_CHAR
+        F77_TA = C2F_CHAR(&TA);
+        F77_TB = C2F_CHAR(&TB);
+#endif
+
+        f77_int rs_a = 1;
+        f77_int rs_b = 1;
+        f77_int cs_c = 1;
+
+        F77_dgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda,
+                           B, &rs_b, &F77_ldb, &beta, C, &F77_ldc, &cs_c );
+    }
+    else
+    {
+        cblas_xerbla(1, "cblas_dgemm_compute",
+                        "Illegal Order setting, %d\n", Order);
+    }
+
+cleanup:
+    // Clear the call-state globals on every exit path, success included, so
+    // they do not stay set after this call (cblas_dgemm_pack does the same).
+    CBLAS_CallFromC = 0;
+    RowMajorStrg = 0;
+}
+#endif
--- a/frame/compat/cblas/src/cblas_dgemm_pack.c
+++ b/frame/compat/cblas/src/cblas_dgemm_pack.c
@@ -0,0 +1,157 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+
+#include "cblas.h"
+#include "cblas_f77.h"
+
+/*
+ * CBLAS entry point for dgemm_pack: translates the CBLAS enums into the
+ * character flags taken by dgemm_pack_blis_impl() and forwards the call.
+ *
+ *   Order      - CblasColMajor or CblasRowMajor.
+ *   Identifier - CblasAMatrix (forwarded as 'A') or CblasBMatrix ('B'); names
+ *                the operand being packed.
+ *   Trans      - CblasNoTrans, CblasTrans or CblasConjTrans (handled like
+ *                CblasTrans). For CblasRowMajor the flag is toggled before
+ *                it is forwarded.
+ *   M, N, K    - problem dimensions, forwarded unchanged.
+ *   alpha      - scalar, forwarded by address to the packing routine.
+ *   src, ld    - source matrix and its leading dimension.
+ *   dest       - destination buffer for the packed matrix; presumably sized
+ *                via cblas_dgemm_pack_get_size() (TODO confirm).
+ *
+ * Illegal arguments are reported through cblas_xerbla() with the argument
+ * position as info (Order = 1, Identifier = 2, Trans = 3) and nothing is
+ * packed. CBLAS_CallFromC and RowMajorStrg are cleared on every exit path.
+ */
+BLIS_EXPORT_BLAS void cblas_dgemm_pack( enum  CBLAS_ORDER      Order,
+                                        enum  CBLAS_IDENTIFIER Identifier,
+                                        enum  CBLAS_TRANSPOSE  Trans,
+                                        const f77_int M,
+                                        const f77_int N,
+                                        const f77_int K,
+                                        const double  alpha,
+                                        const double* src, const f77_int ld,
+                                              double* dest )
+{
+    char TR;
+    char ID;
+
+#ifdef F77_CHAR
+    F77_CHAR F77_TR;
+    F77_CHAR F77_ID;
+#else
+#define F77_TR &TR
+#define F77_ID &ID
+#endif
+
+#ifdef F77_INT
+    F77_INT F77_M=M, F77_N=N, F77_K=K, F77_ld=ld;
+#else
+#define F77_M M
+#define F77_N N
+#define F77_K K
+#define F77_ld ld
+#endif
+
+    extern int CBLAS_CallFromC;
+    extern int RowMajorStrg;
+    RowMajorStrg = 0;
+    CBLAS_CallFromC = 1;
+
+    // Step 1: validate Order and derive the transpose flag for that layout.
+    if ( Order == CblasColMajor )
+    {
+        if      ( Trans == CblasNoTrans )   TR = 'N';
+        else if ( Trans == CblasTrans )     TR = 'T';
+        else if ( Trans == CblasConjTrans ) TR = 'T';
+        else
+        {
+            cblas_xerbla(3, "cblas_dgemm_pack","Illegal Trans setting, %d\n", Trans);
+            goto cleanup;
+        }
+    }
+    else if ( Order == CblasRowMajor )
+    {
+        RowMajorStrg = 1;
+
+        // Row-major storage: the transpose flag is toggled.
+        if      ( Trans == CblasNoTrans )   TR = 'T';
+        else if ( Trans == CblasTrans )     TR = 'N';
+        else if ( Trans == CblasConjTrans ) TR = 'N';
+        else
+        {
+            cblas_xerbla(3, "cblas_dgemm_pack","Invalid Trans setting, %d\n", Trans);
+            goto cleanup;
+        }
+    }
+    else
+    {
+        cblas_xerbla(1, "cblas_dgemm_pack", "Invalid Order setting, %d\n", Order);
+        goto cleanup;
+    }
+
+    // Step 2: the identifier check is the same for both layouts.
+    if      ( Identifier == CblasAMatrix ) ID = 'A';
+    else if ( Identifier == CblasBMatrix ) ID = 'B';
+    else
+    {
+        // Identifier is the 2nd argument, so it is reported as info = 2.
+        cblas_xerbla(2, "cblas_dgemm_pack","Illegal Identifier setting, %d\n", Identifier);
+        goto cleanup;
+    }
+
+#ifdef F77_CHAR
+    F77_TR = C2F_CHAR(&TR);
+    F77_ID = C2F_CHAR(&ID);
+#endif
+
+    F77_dgemm_pack( F77_ID,
+                    F77_TR,
+                    &F77_M,
+                    &F77_N,
+                    &F77_K,
+                    &alpha,
+                    src, &F77_ld,
+                    dest );
+
+cleanup:
+    CBLAS_CallFromC = 0;
+    RowMajorStrg = 0;
+    return;
+}
+#endif
--- a/frame/compat/cblas/src/cblas_dgemm_pack_get_size.c
+++ b/frame/compat/cblas/src/cblas_dgemm_pack_get_size.c
@@ -0,0 +1,83 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+
+#include "cblas.h"
+#include "cblas_f77.h"
+
+/*
+ * CBLAS entry point for dgemm_pack_get_size: forwards Identifier (as 'A' or
+ * 'B') and M, N, K to dgemm_pack_get_size_blis_impl() and returns its result,
+ * presumably the packed-buffer size in bytes (TODO confirm). An unrecognised
+ * Identifier is reported through cblas_xerbla() and 0 is returned.
+ */
+f77_int cblas_dgemm_pack_get_size( enum  CBLAS_IDENTIFIER Identifier,
+                                   const f77_int M,
+                                   const f77_int N,
+                                   const f77_int K )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_1 );
+
+    // Guard clause: anything other than the A or B identifier is rejected.
+    if ( Identifier != CblasAMatrix && Identifier != CblasBMatrix )
+    {
+        cblas_xerbla( 1, "cblas_dgemm_pack_get_size",
+                         "Illegal CBLAS_IDENTIFIER setting, %d\n", Identifier );
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 );
+        return 0;
+    }
+
+    char mat_id = ( Identifier == CblasAMatrix ) ? 'A' : 'B';
+#ifdef F77_CHAR
+    F77_CHAR F77_ID = C2F_CHAR( &mat_id );
+#else
+    #define F77_ID &mat_id
+#endif
+
+#ifdef F77_INT
+    F77_INT F77_M=M, F77_N=N, F77_K=K;
+#else
+    #define F77_M M
+    #define F77_N N
+    #define F77_K K
+#endif
+
+    const f77_int size_result = F77_dgemm_pack_get_size ( F77_ID, &F77_M, &F77_N, &F77_K );
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 );
+    return size_result;
+}
+#endif
--- a/frame/compat/cblas/src/cblas_f77.h
+++ b/frame/compat/cblas/src/cblas_f77.h
@@ -202,6 +202,14 @@
 #define F77_cgemm_batch  cgemm_batch
 #define F77_zgemm_batch  zgemm_batch

+// -- Pack-Compute APIs: F77_* names resolve directly to the *_blis_impl symbols --
+#define F77_sgemm_pack_get_size  sgemm_pack_get_size_blis_impl
+#define F77_dgemm_pack_get_size  dgemm_pack_get_size_blis_impl
+#define F77_sgemm_pack  sgemm_pack_blis_impl
+#define F77_dgemm_pack  dgemm_pack_blis_impl
+#define F77_sgemm_compute  sgemm_compute_blis_impl
+#define F77_dgemm_compute  dgemm_compute_blis_impl
+
 // (BLIS_ENABLE_NO_UNDERSCORE_API) ends
 #else
 /*
@@ -389,6 +397,14 @@
 #define F77_dgemm_batch  dgemm_batch_
 #define F77_cgemm_batch  cgemm_batch_
 #define F77_zgemm_batch  zgemm_batch_
+
+// -- Pack-Compute APIs: mapped to *_blis_impl names, which take no trailing underscore --
+#define F77_sgemm_pack_get_size  sgemm_pack_get_size_blis_impl
+#define F77_dgemm_pack_get_size  dgemm_pack_get_size_blis_impl
+#define F77_sgemm_pack  sgemm_pack_blis_impl
+#define F77_dgemm_pack  dgemm_pack_blis_impl
+#define F77_sgemm_compute  sgemm_compute_blis_impl
+#define F77_dgemm_compute  dgemm_compute_blis_impl
 #endif

 #endif /*  CBLAS_F77_H */
--- a/frame/compat/cblas/src/cblas_sgemm_compute.c
+++ b/frame/compat/cblas/src/cblas_sgemm_compute.c
@@ -0,0 +1,171 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+
+#include "cblas.h"
+#include "cblas_f77.h"
+
+/*
+ * CBLAS entry point for sgemm_compute: C := Mat(A) * Mat(B) + beta * C, where
+ * A and/or B may be a previously packed buffer (TransA/TransB == CblasPacked).
+ *
+ * CblasColMajor: trans flags are forwarded as given; C gets (rs, cs) = (1, ldc).
+ * CblasRowMajor: trans flags of unpacked operands are toggled (N <-> T) and
+ *                C gets (rs, cs) = (ldc, 1). CblasConjTrans acts as CblasTrans.
+ * Illegal Order/TransA/TransB values are reported through cblas_xerbla().
+ */
+BLIS_EXPORT_BLAS void cblas_sgemm_compute( enum CBLAS_ORDER Order,
+                                                 f77_int TransA,
+                                                 f77_int TransB,
+                                           const f77_int M,
+                                           const f77_int N,
+                                           const f77_int K,
+                                           const float*  A, f77_int lda,
+                                           const float*  B, f77_int ldb,
+                                           float beta,
+                                                 float*  C, f77_int ldc)
+{
+    char TA, TB;
+#ifdef F77_CHAR
+    F77_CHAR F77_TA, F77_TB;
+#else
+    #define F77_TA &TA
+    #define F77_TB &TB
+#endif
+
+#ifdef F77_INT
+    F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb;
+    F77_INT F77_ldc=ldc;
+#else
+    #define F77_M M
+    #define F77_N N
+    #define F77_K K
+    #define F77_lda lda
+    #define F77_ldb ldb
+    #define F77_ldc ldc
+#endif
+
+    extern int CBLAS_CallFromC;
+    extern int RowMajorStrg;
+    RowMajorStrg = 0;
+    CBLAS_CallFromC = 1;
+
+    if ( Order == CblasColMajor )
+    {
+        if      ( TransA == CblasTrans )     TA='T';
+        else if ( TransA == CblasConjTrans ) TA='T';
+        else if ( TransA == CblasNoTrans )   TA='N';
+        else if ( TransA == CblasPacked )    TA='P';
+        else
+        {
+            cblas_xerbla(2, "cblas_sgemm_compute",
+                            "Illegal TransA setting, %d\n", TransA);
+            goto cleanup;
+        }
+
+        if      ( TransB == CblasTrans  )    TB='T';
+        else if ( TransB == CblasConjTrans ) TB='T';
+        else if ( TransB == CblasNoTrans )   TB='N';
+        else if ( TransB == CblasPacked )    TB='P';
+        else
+        {
+            cblas_xerbla(3, "cblas_sgemm_compute",
+                            "Illegal TransB setting, %d\n", TransB);
+            goto cleanup;
+        }
+
+#ifdef F77_CHAR
+        F77_TA = C2F_CHAR(&TA);
+        F77_TB = C2F_CHAR(&TB);
+#endif
+
+        f77_int rs_a = 1;
+        f77_int rs_b = 1;
+        f77_int rs_c = 1;
+
+        F77_sgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda,
+                           B, &rs_b, &F77_ldb, &beta, C, &rs_c, &F77_ldc);
+    }
+    else if ( Order == CblasRowMajor )
+    {
+        RowMajorStrg = 1;
+
+        if      ( TransA == CblasPacked )    TA='P';
+        else if ( TransA == CblasTrans )     TA='N';
+        else if ( TransA == CblasNoTrans )   TA='T';
+        else if ( TransA == CblasConjTrans ) TA='N';
+        else
+        {
+            cblas_xerbla(2, "cblas_sgemm_compute",
+                            "Illegal TransA setting, %d\n", TransA);
+            goto cleanup;
+        }
+
+        if      ( TransB == CblasPacked )    TB='P';
+        else if ( TransB == CblasTrans )     TB='N';
+        else if ( TransB == CblasNoTrans )   TB='T';
+        else if ( TransB == CblasConjTrans ) TB='N';
+        else
+        {
+            cblas_xerbla(3, "cblas_sgemm_compute",
+                            "Illegal TransB setting, %d\n", TransB);
+            goto cleanup;
+        }
+
+#ifdef F77_CHAR
+        F77_TA = C2F_CHAR(&TA);
+        F77_TB = C2F_CHAR(&TB);
+#endif
+
+        f77_int rs_a = 1;
+        f77_int rs_b = 1;
+        f77_int cs_c = 1;
+
+        F77_sgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda,
+                           B, &rs_b, &F77_ldb, &beta, C, &F77_ldc, &cs_c);
+    }
+    else
+    {
+        cblas_xerbla(1, "cblas_sgemm_compute",
+                        "Illegal Order setting, %d\n", Order);
+    }
+
+cleanup:
+    // Clear the call-state globals on every exit path, success included.
+    CBLAS_CallFromC = 0;
+    RowMajorStrg = 0;
+}
+#endif
--- a/frame/compat/cblas/src/cblas_sgemm_pack.c
+++ b/frame/compat/cblas/src/cblas_sgemm_pack.c
@@ -0,0 +1,157 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+
+#include "cblas.h"
+#include "cblas_f77.h"
+
+/*
+ * CBLAS entry point for sgemm_pack: translates the CBLAS enums into the
+ * character flags taken by sgemm_pack_blis_impl() and forwards the call.
+ *
+ *   Order      - CblasColMajor or CblasRowMajor.
+ *   Identifier - CblasAMatrix (forwarded as 'A') or CblasBMatrix ('B'); names
+ *                the operand being packed.
+ *   Trans      - CblasNoTrans, CblasTrans or CblasConjTrans (handled like
+ *                CblasTrans). For CblasRowMajor the flag is toggled before
+ *                it is forwarded.
+ *   M, N, K    - problem dimensions, forwarded unchanged.
+ *   alpha      - scalar, forwarded by address to the packing routine.
+ *   src, ld    - source matrix and its leading dimension.
+ *   dest       - destination buffer for the packed matrix; presumably sized
+ *                via cblas_sgemm_pack_get_size() (TODO confirm).
+ *
+ * Illegal arguments are reported through cblas_xerbla() with the argument
+ * position as info (Order = 1, Identifier = 2, Trans = 3) and nothing is
+ * packed. CBLAS_CallFromC and RowMajorStrg are cleared on every exit path.
+ */
+BLIS_EXPORT_BLAS void cblas_sgemm_pack( enum  CBLAS_ORDER      Order,
+                                        enum  CBLAS_IDENTIFIER Identifier,
+                                        enum  CBLAS_TRANSPOSE  Trans,
+                                        const f77_int M,
+                                        const f77_int N,
+                                        const f77_int K,
+                                        const float   alpha,
+                                        const float*  src, const f77_int ld,
+                                              float*  dest )
+{
+    char TR;
+    char ID;
+
+#ifdef F77_CHAR
+    F77_CHAR F77_TR;
+    F77_CHAR F77_ID;
+#else
+#define F77_TR &TR
+#define F77_ID &ID
+#endif
+
+#ifdef F77_INT
+    F77_INT F77_M=M, F77_N=N, F77_K=K, F77_ld=ld;
+#else
+#define F77_M M
+#define F77_N N
+#define F77_K K
+#define F77_ld ld
+#endif
+
+    extern int CBLAS_CallFromC;
+    extern int RowMajorStrg;
+    RowMajorStrg = 0;
+    CBLAS_CallFromC = 1;
+
+    // Step 1: validate Order and derive the transpose flag for that layout.
+    if ( Order == CblasColMajor )
+    {
+        if      ( Trans == CblasNoTrans )   TR = 'N';
+        else if ( Trans == CblasTrans )     TR = 'T';
+        else if ( Trans == CblasConjTrans ) TR = 'T';
+        else
+        {
+            cblas_xerbla(3, "cblas_sgemm_pack","Illegal Trans setting, %d\n", Trans);
+            goto cleanup;
+        }
+    }
+    else if ( Order == CblasRowMajor )
+    {
+        RowMajorStrg = 1;
+
+        // Row-major storage: the transpose flag is toggled.
+        if      ( Trans == CblasNoTrans )   TR = 'T';
+        else if ( Trans == CblasTrans )     TR = 'N';
+        else if ( Trans == CblasConjTrans ) TR = 'N';
+        else
+        {
+            cblas_xerbla(3, "cblas_sgemm_pack","Invalid Trans setting, %d\n", Trans);
+            goto cleanup;
+        }
+    }
+    else
+    {
+        cblas_xerbla(1, "cblas_sgemm_pack", "Invalid Order setting, %d\n", Order);
+        goto cleanup;
+    }
+
+    // Step 2: the identifier check is the same for both layouts.
+    if      ( Identifier == CblasAMatrix ) ID = 'A';
+    else if ( Identifier == CblasBMatrix ) ID = 'B';
+    else
+    {
+        // Identifier is the 2nd argument, so it is reported as info = 2.
+        cblas_xerbla(2, "cblas_sgemm_pack","Illegal Identifier setting, %d\n", Identifier);
+        goto cleanup;
+    }
+
+#ifdef F77_CHAR
+    F77_TR = C2F_CHAR(&TR);
+    F77_ID = C2F_CHAR(&ID);
+#endif
+
+    F77_sgemm_pack( F77_ID,
+                    F77_TR,
+                    &F77_M,
+                    &F77_N,
+                    &F77_K,
+                    &alpha,
+                    src, &F77_ld,
+                    dest );
+
+cleanup:
+    CBLAS_CallFromC = 0;
+    RowMajorStrg = 0;
+    return;
+}
+#endif
--- a/frame/compat/cblas/src/cblas_sgemm_pack_get_size.c
+++ b/frame/compat/cblas/src/cblas_sgemm_pack_get_size.c
@@ -0,0 +1,83 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+
+#include "cblas.h"
+#include "cblas_f77.h"
+
+/*
+ * CBLAS entry point for sgemm_pack_get_size: forwards Identifier (as 'A' or
+ * 'B') and M, N, K to sgemm_pack_get_size_blis_impl() and returns its result,
+ * presumably the packed-buffer size in bytes (TODO confirm). An unrecognised
+ * Identifier is reported through cblas_xerbla() and 0 is returned.
+ */
+f77_int cblas_sgemm_pack_get_size( enum  CBLAS_IDENTIFIER Identifier,
+                                   const f77_int M,
+                                   const f77_int N,
+                                   const f77_int K )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_1 );
+
+    // Guard clause: anything other than the A or B identifier is rejected.
+    if ( Identifier != CblasAMatrix && Identifier != CblasBMatrix )
+    {
+        cblas_xerbla( 1, "cblas_sgemm_pack_get_size",
+                         "Illegal CBLAS_IDENTIFIER setting, %d\n", Identifier );
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 );
+        return 0;
+    }
+
+    char mat_id = ( Identifier == CblasAMatrix ) ? 'A' : 'B';
+#ifdef F77_CHAR
+    F77_CHAR F77_ID = C2F_CHAR( &mat_id );
+#else
+    #define F77_ID &mat_id
+#endif
+
+#ifdef F77_INT
+    F77_INT F77_M=M, F77_N=N, F77_K=K;
+#else
+    #define F77_M M
+    #define F77_N N
+    #define F77_K K
+#endif
+
+    const f77_int size_result = F77_sgemm_pack_get_size ( F77_ID, &F77_M, &F77_N, &F77_K );
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 );
+    return size_result;
+}
+#endif
--- a/frame/compat/check/CMakeLists.txt
+++ b/frame/compat/check/CMakeLists.txt
@@ -1,4 +1,4 @@
-##Copyright (C) 2020, Advanced Micro Devices, Inc.## 
+##Copyright (C) 2020-23, Advanced Micro Devices, Inc. All rights reserved. ##

 target_sources("${PROJECT_NAME}"
    PRIVATE
@@ -23,8 +23,5 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bla_trmv_check.h
 ${CMAKE_CURRENT_SOURCE_DIR}/bla_trsm_check.h
 ${CMAKE_CURRENT_SOURCE_DIR}/bla_trsv_check.h
 ${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm3m_check.h
+${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_compute_check.h
 	)
-
-
-	
-
--- a/frame/compat/check/bla_gemm_compute_check.h
+++ b/frame/compat/check/bla_gemm_compute_check.h
@@ -0,0 +1,87 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/*
+   Parameter check for the ?gemm_compute() BLAS interface.
+
+   transa/transb are compared (via PASTE_LSAME) against "N", "C", "T" and
+   "P"; "P" marks the operand as packed, in which case its leading dimension
+   is not validated. m, n, k, lda, ldb, rs_c and cs_c are dereferenced, so
+   they must be pointers.
+
+   On the first failed check info is set to one of the codes below,
+   PASTE_XERBLA() is called with the upper-cased routine name built from
+   dt_str and op_str, and the macro executes `return;` in the enclosing
+   function:
+
+     1: transa    2: transb    3: m < 0    4: n < 0    5: k < 0
+     7: lda       9: ldb      12: rs_c / cs_c
+
+   The routine name is formatted with snprintf() bounded by the size of
+   func_str, so an op_str longer than the buffer is truncated instead of
+   being written past its end.
+*/
+#define bla_gemm_compute_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, rs_c, cs_c ) \
+{ \
+	f77_int info = 0; \
+	f77_int nota,  notb; \
+	f77_int conja, conjb; \
+	f77_int ta,    tb; \
+	f77_int packa, packb; \
+	f77_int nrowa, nrowb; \
+\
+	nota  = PASTE_LSAME( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	notb  = PASTE_LSAME( transb, "N", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTE_LSAME( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	conjb = PASTE_LSAME( transb, "C", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTE_LSAME( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	tb    = PASTE_LSAME( transb, "T", (ftnlen)1, (ftnlen)1 ); \
+	packa = PASTE_LSAME( transa, "P", (ftnlen)1, (ftnlen)1 ); \
+	packb = PASTE_LSAME( transb, "P", (ftnlen)1, (ftnlen)1 ); \
+\
+	/* Row counts used as lower bounds for lda/ldb. */ \
+	if ( nota || packa ) { nrowa = *m; } \
+	else        { nrowa = *k; } \
+	if ( notb || packb ) { nrowb = *k; } \
+	else        { nrowb = *n; } \
+\
+	if      ( !nota && !conja && !ta && !packa ) \
+		info = 1; \
+	else if ( !notb && !conjb && !tb && !packb ) \
+		info = 2; \
+	else if ( *m < 0 ) \
+		info = 3; \
+	else if ( *n < 0 ) \
+		info = 4; \
+	else if ( *k < 0 ) \
+		info = 5; \
+	else if ( !packa && *lda < bli_max( 1, nrowa ) ) /* lda is ignored when A is packed. */ \
+		info = 7; \
+	else if ( !packb && *ldb < bli_max( 1, nrowb ) ) /* ldb is ignored when B is packed. */ \
+		info = 9; \
+	else if ( ( *rs_c == 1 && *cs_c < bli_max( 1, *m ) ) || ( *cs_c == 1 && *rs_c < bli_max( 1, *n ) ) ) \
+		info = 12; \
+\
+	if ( info != 0 ) \
+	{ \
+		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
+\
+		/* "%-5s" only sets a minimum width, so bound the write explicitly. */ \
+		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
+\
+		bli_string_mkupper( func_str ); \
+\
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
+\
+		return; \
+	} \
+}
--- a/frame/include/bli_macro_defs.h
+++ b/frame/include/bli_macro_defs.h
@@ -287,6 +287,8 @@
 #define dgemm_batch_ dgemm_batch
 #define cgemm_batch_ cgemm_batch
 #define zgemm_batch_ zgemm_batch
+#define sgemm_compute_ sgemm_compute
+#define dgemm_compute_ dgemm_compute
 #define saxpby_ saxpby
 #define daxpby_ daxpby
 #define caxpby_ caxpby
@@ -391,6 +393,7 @@
 #define dgbmv                     DGBMV
 #define dgemm                     DGEMM
 #define dgemm_batch               DGEMM_BATCH
+#define dgemm_compute             DGEMM_COMPUTE
 #define dgemmt                    DGEMMT
 #define dgemv                     DGEMV
 #define dger                      DGER
@@ -464,6 +467,7 @@
 #define sgbmv                     SGBMV
 #define sgemm                     SGEMM
 #define sgemm_batch               SGEMM_BATCH
+#define sgemm_compute             SGEMM_COMPUTE
 #define sgemmt                    SGEMMT
 #define sgemv                     SGEMV
 #define sger                      SGER
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -470,7 +470,8 @@ typedef enum
 	BLIS_NO_TRANSPOSE      = 0x0,
 	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
 	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
-	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
+	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS,
+	BLIS_PACKED            = BLIS_BITVAL_PACKED_UNSPEC
 } trans_t;

 typedef enum
--- a/frame/thread/CMakeLists.txt
+++ b/frame/thread/CMakeLists.txt
@@ -2,6 +2,8 @@

 target_sources("${PROJECT_NAME}"
    PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute_decor_openmp.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute_decor_single.c
    ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_openmp.c
    ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_pthreads.c
    ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_single.c
--- a/frame/thread/bli_l3_compute_decor.h
+++ b/frame/thread/bli_l3_compute_decor.h
@@ -0,0 +1,67 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L3_COMPUTE_DECOR_H
+#define BLIS_L3_COMPUTE_DECOR_H
+
+// Level-3 compute internal function type.
+// Pointer to a function that receives the operand objects (a, b, beta, c),
+// a context, a runtime object and a thrinfo_t*, and returns an err_t.
+typedef err_t (*l3computeint_t)
+     (
+       obj_t*     a,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm,
+       thrinfo_t* thread
+     );
+
+// Level-3 compute thread decorator prototype.
+// Takes the l3computeint_t function to run (func), an operation family id,
+// and the operand/context/runtime arguments intended for func. The
+// thrinfo_t* argument of func is not part of this signature.
+err_t bli_l3_compute_thread_decorator
+     (
+       l3computeint_t func,
+       opid_t         family,
+       obj_t*         a,
+       obj_t*         b,
+       obj_t*         beta,
+       obj_t*         c,
+       cntx_t*        cntx,
+       rntm_t*        rntm
+     );
+
+#include "bli_l3_compute_decor_single.h"
+#include "bli_l3_compute_decor_openmp.h"
+// #include "bli_l3_compute_decor_pthreads.h"
+
+#endif
--- a/frame/thread/bli_l3_compute_decor_openmp.c
+++ b/frame/thread/bli_l3_compute_decor_openmp.c
@@ -0,0 +1,133 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// @note: Presently MT is not supported, so n_threads has been explicitly
+//  set to 1 during initialization. Thus, even if BLIS is built with OpenMP
+//  support, the compute APIs work as an ST implementation.
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_OPENMP
+
+// Thread entry stub: ignores its argument and returns NULL. The OpenMP
+// decorator in this file does not call it; func is invoked directly from
+// inside the parallel region instead.
+void* bli_l3_compute_thread_entry( void* data_void ) { return NULL; }
+
+// OpenMP implementation of the level-3 compute thread decorator.
+//
+// Opens an OpenMP parallel region with n_threads threads, where n_threads
+// is read from rntm. Inside the region every thread copies the rntm,
+// creates its root thrinfo_t, calls func with the operands and that
+// thrinfo_t, and then frees the thrinfo_t.
+//
+// The family argument is not used in this body. The err_t returned by func
+// is discarded; this function always returns BLIS_SUCCESS.
+err_t bli_l3_compute_thread_decorator
+     (
+       l3computeint_t func,
+       opid_t         family,
+       obj_t*         a,
+       obj_t*         b,
+       obj_t*         beta,
+       obj_t*         c,
+       cntx_t*        cntx,
+       rntm_t*        rntm
+     )
+{
+    // Query the total number of threads from the rntm_t object.
+    const dim_t n_threads = bli_rntm_num_threads( rntm );
+
+    // Check out an array_t from the small block allocator. This is done
+    // with an internal lock to ensure only one application thread accesses
+    // the sba at a time. bli_sba_checkout_array() will also automatically
+    // resize the array_t, if necessary.
+    array_t* restrict array = bli_sba_checkout_array( n_threads );
+
+    // Access the pool_t* for thread 0 and embed it into the rntm. We do
+    // this up-front only so that we have the rntm_t.sba_pool field
+    // initialized and ready for the global communicator creation below.
+    bli_sba_rntm_set_pool( 0, array, rntm );
+
+    // Set the packing block allocator field of the rntm. This will be
+    // inherited by all of the child threads when they make local copies of
+    // the rntm below.
+    bli_pba_rntm_set_pba( rntm );
+
+    // Allocate a global communicator for the root thrinfo_t structures.
+    thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+
+    _Pragma( "omp parallel num_threads(n_threads)" )
+    {
+        // Create a thread-local copy of the master thread's rntm_t. This is
+        // necessary since we want each thread to be able to track its own
+        // small block pool_t as it executes down the function stack.
+        rntm_t           rntm_l = *rntm;
+        rntm_t* restrict rntm_p = &rntm_l;
+
+        // Query the thread's id from OpenMP.
+        const dim_t tid = omp_get_thread_num();
+
+        // Check for a somewhat obscure OpenMP thread-mismatch issue.
+        // NOTE: This calls the same function used for the conventional/large
+        // code path.
+        bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
+
+        // Use the thread id to access the appropriate pool_t* within the
+        // array_t, and use it to set the sba_pool field within the rntm_t.
+        // If the pool_t* element within the array_t is NULL, it will first
+        // be allocated/initialized.
+        bli_sba_rntm_set_pool( tid, array, rntm_p );
+
+        thrinfo_t* thread = NULL;
+
+        // Create the root node of the thread's thrinfo_t structure.
+        bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
+
+        // Run the compute function; its err_t return value is not inspected.
+        func
+        (
+          a,
+          b,
+          beta,
+          c,
+          cntx,
+          rntm_p,
+          thread
+        );
+
+        // Free the current thread's thrinfo_t structure.
+        bli_l3_sup_thrinfo_free( rntm_p, thread );
+    }
+
+    // The global communicator is not freed here. It is expected to have
+    // been released by its chief thread in bli_l3_sup_thrinfo_free(),
+    // called at the end of the parallel region above - TODO confirm.
+
+    // Check the array_t back into the small block allocator. Similar to the
+    // check-out, this is done using a lock embedded within the sba to ensure
+    // mutual exclusion.
+    bli_sba_checkin_array( array );
+
+    return BLIS_SUCCESS;
+}
+
+#endif
--- a/frame/thread/bli_l3_compute_decor_openmp.h
+++ b/frame/thread/bli_l3_compute_decor_openmp.h
@@ -0,0 +1,44 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// Include guard named after this header (bli_l3_compute_decor_openmp.h).
+#ifndef BLIS_L3_COMPUTE_DECOR_OPENMP_H
+#define BLIS_L3_COMPUTE_DECOR_OPENMP_H
+
+// Definitions specific to situations when OpenMP multithreading is enabled.
+#ifdef BLIS_ENABLE_OPENMP
+
+#endif
+
+#endif
+
--- a/frame/thread/bli_l3_compute_decor_single.c
+++ b/frame/thread/bli_l3_compute_decor_single.c
@@ -0,0 +1,87 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifndef BLIS_ENABLE_MULTITHREADING
+
+// Single-threaded implementation of the level-3 compute thread decorator.
+//
+// Checks out a one-element array_t from the small block allocator, wires
+// the sba pool and the packing block allocator into rntm, calls func once
+// on the calling thread, and checks the array_t back in.
+//
+// The family argument is not used in this body. The err_t returned by func
+// is discarded; this function always returns BLIS_SUCCESS.
+err_t bli_l3_compute_thread_decorator
+     (
+       l3computeint_t func,
+       opid_t         family,
+       obj_t*         a,
+       obj_t*         b,
+       obj_t*         beta,
+       obj_t*         c,
+       cntx_t*        cntx,
+       rntm_t*        rntm
+     )
+{
+    // Exactly one thread participates, so a single pool slot (index 0)
+    // is all that is required.
+    array_t* restrict sba_array = bli_sba_checkout_array( 1 );
+
+    bli_sba_rntm_set_pool( 0, sba_array, rntm );
+    bli_pba_rntm_set_pba( rntm );
+
+    // Hand func the global single-threaded thrinfo_t instead of growing a
+    // new thrinfo_t from scratch.
+    func( a, b, beta, c, cntx, rntm, &BLIS_GEMM_SINGLE_THREADED );
+
+    bli_sba_checkin_array( sba_array );
+
+    return BLIS_SUCCESS;
+}
+
+#endif
--- a/frame/thread/bli_l3_compute_decor_single.h
+++ b/frame/thread/bli_l3_compute_decor_single.h
@@ -0,0 +1,43 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L3_COMPUTE_DECOR_SINGLE_H
+#define BLIS_L3_COMPUTE_DECOR_SINGLE_H
+
+// Definitions specific to situations when multithreading is disabled.
+#ifndef BLIS_ENABLE_MULTITHREADING
+
+#endif
+
+#endif
--- a/frame/thread/bli_pack_full_decor_openmp.c
+++ b/frame/thread/bli_pack_full_decor_openmp.c
@@ -54,7 +54,11 @@ void bli_pack_full_thread_decorator
    /* Ensure n_threads is always greater than or equal to 1 */
    /* Passing BLIS_IC_NT and BLIS_JC_NT for pack can lead to n_threads */
    /* becoming negative. In that case, packing is done using 1 thread */
-    n_threads = ( n_threads > 0 ) ? n_threads : 1;
+    // n_threads = ( n_threads > 0 ) ? n_threads : 1;
+
+    // Explicitly setting n_threads = 1 to force packing with only a single
+    // thread.
+    n_threads = 1;

    _Pragma( "omp parallel num_threads(n_threads)" )
    {
--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -60,6 +60,9 @@
 // Include the pack full thread decorator and related definitions and prototypes
 // for the pack code path.
 #include "bli_pack_full_decor.h"
+// Include the level-3 thread decorator and related definitions and prototypes
+// for the compute code path.
+#include "bli_l3_compute_decor.h"

 // Initialization-related prototypes.
 void bli_thread_init( void );