add an example of customized type convert - bfp16_rtn (#869)

* add an example of customized bfp16_rtn

* fixed threadwise_copy

---------

Co-authored-by: Jing Zhang <jizha@amd.com>

[ROCm/composable_kernel commit: 38ada109ea]
This commit is contained in:
zjing14
2023-08-29 12:31:24 -05:00
committed by GitHub
parent 90c7e28a8d
commit f8bcfe60ac
5 changed files with 52 additions and 4 deletions

View File

@@ -92,11 +92,11 @@ struct ReferenceGemm : public device::BaseOperator
ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
}
AccDataType v_c;
CDataType v_c;
arg.c_element_op_(v_c, v_acc);
arg.c_m_n_(m, n) = ck::type_convert<CDataType>(v_c);
arg.c_m_n_(m, n) = v_c;
};
make_ParallelTensorFunctor(