Fixed dynamic dispatch crash issue on non-zen architecture.

This commit fixes the issue for the gemm and copy APIs.

The BLIS binary with dynamic dispatch feature was crashing on non-zen
CPUs (specifically CPUs without AVX2 support).
The crash was caused by unsupported instructions in zen-optimized kernels.
The issue is fixed by calling only reference kernels if the architecture detected at
runtime is not zen, zen2 or zen3.

AMD-Internal: [CPUPL-1930]

Change-Id: Ief57cd457b87542aa1a7bad64dc36c01f0d1a366
This commit is contained in:
Dipal M Zambare
2021-10-27 15:14:33 +05:30
parent d683c224e8
commit ddbdfd0ba4
2 changed files with 119 additions and 20 deletions

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -153,16 +153,37 @@ void scopy_
incy0 = (inc_t)(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN3) || (id == BLIS_ARCH_ZEN2) || (id == BLIS_ARCH_ZEN);
/* Call BLIS kernel */
bli_scopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
if (bamdzen)
{
/* Call BLIS kernel */
bli_scopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(s, copyv, BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
@@ -232,16 +253,38 @@ void dcopy_
incy0 = (inc_t)(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN3) || (id == BLIS_ARCH_ZEN2) || (id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel */
bli_dcopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(d, copyv, BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
/* Call BLIS kernel */
bli_dcopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */

View File

@@ -362,7 +362,63 @@ void dgemm_
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
if((k0 == 1) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb))
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN3) || (id == BLIS_ARCH_ZEN2) || (id == BLIS_ARCH_ZEN);
if (!bamdzen)
{
// This code is duplicated below; however, we don't want to move it out of
// this IF block, as doing so would affect performance on Zen architectures.
// Also, this is a temporary fix which will be replaced later.
const num_t dt = BLIS_DOUBLE;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a);
bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b);
bli_obj_init_finish_1x1(dt, (double *)alpha, &alphao);
bli_obj_init_finish_1x1(dt, (double *)beta, &betao);
bli_obj_init_finish(dt, m0_a, n0_a, (double *)a, rs_a, cs_a, &ao);
bli_obj_init_finish(dt, m0_b, n0_b, (double *)b, rs_b, cs_b, &bo);
bli_obj_init_finish(dt, m0, n0, (double *)c, rs_c, cs_c, &co);
bli_obj_set_conjtrans(blis_transa, &ao);
bli_obj_set_conjtrans(blis_transb, &bo);
// Will call parallelized dgemm code - sup & native
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
(
&alphao,
&ao,
&bo,
&betao,
&co,
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
if((k0 == 1) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb))
{
bli_dgemm_ref_k1_nn( m0, n0, k0,
(double*)alpha,