[CK_TILE] layernorm support fused-quant/fused-add (#1604)

* add prenorm/postnorm support, refactor using generate.py

* update README

* update README

* fix format

* update some description and fix format

* update format

* format

* use non-raw for loading

* format and update n4096

* dynamic-quant ready

* update readme

* support fused dynamic-quant

* update fused-quant, with smooth

* update README

* update args

* update some based on comment
This commit is contained in:
carlushuang
2024-10-31 14:54:53 +08:00
committed by GitHub
parent 9a8a52130d
commit c3a4800c5f
61 changed files with 1790 additions and 766 deletions

View File

@@ -301,7 +301,10 @@ struct BlockReduce2D
.get_static_tile_distribution_encoding(),
ReduceDim{}));
return make_static_distributed_tensor<InDataType>(acc_dstr);
auto dst_ = make_static_distributed_tensor<InDataType>(acc_dstr);
// init acc_tensor
tile_elementwise_inout([&](auto& x_) { x_ = type_convert<InDataType>(reduce_init); }, dst_);
return dst_;
}
// return number of pixels each lane need to reduce

View File

@@ -17,14 +17,24 @@ struct BlockReduce2d
CK_TILE_DEVICE constexpr BlockReduce2d() {}
template <typename XDistributedTensor_, typename YDistributedTensor_, typename ReduceFunc>
template <typename XDistributedTensor_,
typename YDistributedTensor_,
typename ReduceFunc,
typename ReducePacksPerXDim = uniform_sequence_gen_t<2, 1>>
CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor,
YDistributedTensor_& y_tensor,
const ReduceFunc& reduce_func)
const ReduceFunc& reduce_func,
ReducePacksPerXDim = {})
{
sweep_tile<XDistributedTensor_>(
[&](auto... idx_) {
constexpr auto idx_0 = make_tuple(make_tuple(idx_[number<0>{}]...)[number<0>{}]);
y_tensor(idx_0) = reduce_func(y_tensor(idx_0), x_tensor[idx_]...);
},
ReducePacksPerXDim{});
#if 0
constexpr auto I0 = number<0>{};
constexpr auto I1 = number<1>{};
constexpr auto spans = XDistributedTensor_::get_distributed_spans();
// FIXME: hard coded to reduce 2nd axis
@@ -42,6 +52,7 @@ struct BlockReduce2d
y_tensor(y_dstr_idx) = y;
});
#endif
}
template <typename XDistributedTensor_>
@@ -63,14 +74,17 @@ struct BlockReduce2d
return tensor;
}
template <typename XDistributedTensor_, typename ReduceFunc>
template <typename XDistributedTensor_,
typename ReduceFunc,
typename ReducePacksPerXDim = uniform_sequence_gen_t<2, 1>>
CK_TILE_DEVICE auto operator()(const XDistributedTensor_& x_tensor,
const ComputeDataType& reduce_init,
const ReduceFunc& reduce_func)
const ReduceFunc& reduce_func,
ReducePacksPerXDim = {})
{
auto y_tensor = MakeYBlockTile<XDistributedTensor_>();
set_tile(y_tensor, reduce_init);
(*this)(x_tensor, y_tensor, reduce_func);
(*this)(x_tensor, y_tensor, reduce_func, ReducePacksPerXDim{});
return y_tensor;
}