Merge commit 'e77a7ca2bc65651b5e87a0127e0335733aca2f35' into develop

This commit is contained in:
assistant-librarian[bot]
2025-12-18 21:13:07 +00:00
parent cef729b554
commit 87b8b502e6
93 changed files with 8182 additions and 127 deletions

View File

@@ -286,8 +286,25 @@ struct ThreadwiseTensorSliceTransfer_v7r3
static_for<0, nDst, 1>{}([&](auto i) {
using elm_vector_t = typename remove_cvref_t<decltype(elm_vectors[i])>::type;
elm_vectors(i).template AsType<elm_vector_t>()(I0) =
oob_val ? elm_vectors(i).template AsType<elm_vector_t>()[I0] : elm_vector_t{0};
using scalar_t =
remove_cvref_t<decltype(elm_vectors(i).template AsType<elm_vector_t>()[I0])>;
// This is a bit ugly but necessary to be able to compile f8 instances for grouped
// convolution forward. For some reason for that specific type there is an ambiguity
// in the type resolution for the ternary expression. I added an explicit cast to
// disambiguate and only use it for f8 just in case it affects performance.
if constexpr(std::is_same_v<scalar_t, ck::f8_ocp_t>)
{
elm_vectors(i).template AsType<elm_vector_t>()(I0) =
oob_val ? elm_vector_t{elm_vectors(i).template AsType<elm_vector_t>()[I0]}
: elm_vector_t{0};
}
else
{
elm_vectors(i).template AsType<elm_vector_t>()(I0) =
oob_val ? elm_vectors(i).template AsType<elm_vector_t>()[I0]
: elm_vector_t{0};
}
});
elm_vectors_tuple_(thread_scratch_id)(iAccess) = elm_vectors;