mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-20 21:09:08 +00:00
Add 'Permute' device op & example (#408)
* Add example folder for 'DeviceElementwise'
* Re-structure example files
* Move common parts into common.hpp
* Use more strict input
* Add more helper methods in 'DeviceElementwise'
* Use more specific method to write example
* Allow specifying problem through command line argument
* Allow specifying problem 'axes' through command line argument
* Add check to template type argument
* Add transpose_shape() to generalize shape permute
* Generalize transpose utility functions
* Use better name for tensor indices
* Add checks in helper functions
* Remove debug messages
* Refine error message for check_err()
* Generalize variable naming in example code
* Add device op 'DevicePermute'
This device op is a clone of 'DeviceElementwise'
* Use 'DevicePermute' device op in example
* Remove 'elementwise' from identifiers
* Remove 'elementwise' from file paths
* Remove base class of 'DevicePermute'
* Let 'DevicePermute' inherit from 'BaseOperator'
* Add simple type traits to validate device op type
* Add static_assert() to check type constraints
* Create 'DevicePermuteBase' to generate methods
* Use indirect base type to generate methods
* Remove 'is_device_op<>' type traits
* Only accept single-input-single-output for 'DevicePermute'
* Simplify 'DevicePermute' interface
* Re-format 'DeviceElementwise'
* Use CRTP to generate overridden virtual method
* Remove unnecessary include directives
* Distinguish input & output shape in 'DevicePermute'
* Passing 'axes' to 'DevicePermute'
* Use more reasonable return value for Invoker::Run()
* Add 'GridwisePermute' kernel
This kernel is a clone of 'GridwiseElementwise_1D'
* Remove no-longer used type argument
* Check if input/output shape meet the requirement
* Remove no-longer used method
* Remove never-entered-if-clause
* Change problem description for 'DevicePermute'
* Transform descriptor into 3 dimensions
* Add debug code to verify result
* Add comment to indicate template argument location
* Add N/H/WPerBlock template parameter to 'DevicePermute'
* Rename 'GridwisePermute' to 'GridwiseCopy'
* Check tensor descriptor dimensions in 'GridwiseElementwise_1D'
* Add missing include directive
* Add 'BlockSize' parameter to 'DevicePermute'
* Remove no-longer used method
* Add 'BlockToTileMap' for 'GridwiseCopy'
* Use the normal Block2TileMap convention
* Rename 'BlockToTileMap' as 'Block2TileMap'
* Fix most of compilation errors
* Let 'Block2TileMap' map block to 2d coordinate
* Allow data transfer in 'GridwiseCopy'
* Fix wrong output descriptor for 2nd blockwise copy
* Rename 'GridwiseCopy' as 'GridwisePermute'
* Remove '1d' in identifiers
* Remove commented-out codes
* Remove 'MPerThread' template parameter
* Separate template parameters
* Unify variable naming convention
* Use more verbose way to create expressions
* Add template parameter 'InBlockLdsExtraW'
* Release the constraint on In/OutGridDesc
* Use data type directly as template argument
* Re-arrange template arguments for blockwise copy
* Remove no-longer used template parameters
* Embed layout in the variable names
* Add GridwisePermute::CheckValidity()
* Extract local types as template parameters
* Rename local type alias
* Add more template parameters (vector width related)
* Calculate new SrcVectorDim/DstVectorDim after merge descriptor dimensions
* Fill tensor values start from 1
* Re-format example code
* Avoid too-large block id
* Add comment
* Make sure 'SrcVectorDim' is not same as 'DstVectorDim'
* Add check for the 'VectorDim' & 'ScalarPerVector' template params
* Let 'DstVectorDim' equals 'SrcVectorDim' after transpose out grid desc
* Remove no-longer used template parameter 'NPerBlock'
* Fix wrong descriptor creation logics
* Specify problem in each examples
* Use better example name
* Add new example 'example_permute_NxHxW_fp32'
* Add example for demonstrating bundle multiple elems in tensor
* Add support to permute multiple elements together
* Change the default problem size
* Add span<> class template
* Use span<> to generalize check_err() interface
* Fix ambiguous ctor call
* Avoid creating unnecessary objects
* Use helper functions to simplify example code
* Add example for 4xfp16 permute
* Disable failed-to-compile example
* Add check for the NUM_ELEMS_IN_BUNDLE
* Remove redundant parameter in helper lambda function
* Add check for the input tensor type's byte-size
* Check scalar-per-vector with padded length
* Use more verbose name to avoid name collision
* Use fixed 'VectorDim' & 'ScalarPerVector' for LDS
* Embed shape info in name of descriptor constructor
* Rename example folder '36_permute' into '37_permute'
* Avoid using too-large LDS in kernel code
* Remove redundant example
* Use switch() to group similar code
* Add const to the span<> type argument
* Simply initialize tensor with floating point values
* Use fp16 as data type in all examples
* Enlarge tensor size in example
* Enlarge N-dim in example
* Add check for the bundled type in example
* Use stricter error threshold
* Remove global load/store loop in kernel code
* Measure execution time by default
* Use faster device op config for example 'NxHxW_fp16'
* Use faster device op config for example '1xHxW_fp16'
* Use faster device op config for example 'HxWx4_fp16'
* Remove cmd arg parsing logics
* Rename functions
* Extract bundle permutation logic out
* Simplify permute bundle example
* Add Tensor<>::GetElementSpaceSizeInBytes()
* Add Tensor<>::data()
* Use new methods to simplify code
* Use type alias to replace duplicated code
* Use existing method to shorten code
* Allow FillUniformDistribution to accept range argument
* Initialize random values in range
* Add Tensor<>::size()
* Use more meaningful names in permute bundle example
* Use more meaningful names in permute element examples
* Use rangified copy() to copy elements
* Use function return value directly to eliminate variables
* Add to_array() conversion tool to eliminate more variables
* Add Tensor<>::AsSpan<>() to create view of tensor values
* Use AsSpan() to shorten check_err() calls
* Remove no-longer-used 'using' directives
* Move 'using' directive to proper code position
* Remove redundant variables
* Remove useless static_assert()
* Add check for range types
* Declare variable right before first use
* Move long return type as trailing return type
* Add BaseInvokerCRTP<> class template to generate method
* Create new base type for 'DevicePermute' implementations
* Move 'NumDim' template param to the first
* Rename 'DevicePermute' to 'DevicePermuteImpl'
* Add 'noexcept' specifier to CRTP generated method
* Move 'Block2TileMap' definition into 'GridwisePermute'
* Use type alias to reduce code
* Unify naming style in 'DevicePermute'
* Add comments in 'GridwisePermute'
* Rename permute example folder
* Use std::cerr to report error
* Use larger shape in examples
* Rename '38_permute' to '39_permute'
* Make sure we use unsigned type for shape & indices
* Remove opt-ed out assertion
* Remove template BaseInvokerCRTP<>
[ROCm/composable_kernel commit: f584ab0c54]
This commit is contained in:
@@ -15,6 +15,7 @@
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/span.hpp"
|
||||
#include "ck/utility/type.hpp"
|
||||
#include "ck/host_utility/io.hpp"
|
||||
|
||||
@@ -32,7 +33,7 @@ check_err(const std::vector<T>& out,
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
@@ -50,7 +51,7 @@ check_err(const std::vector<T>& out,
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
{
|
||||
std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << out[i] << " != " << ref[i] << std::endl;
|
||||
}
|
||||
res = false;
|
||||
@@ -58,7 +59,7 @@ check_err(const std::vector<T>& out,
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
|
||||
std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
@@ -73,7 +74,7 @@ check_err(const std::vector<T>& out,
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
@@ -94,7 +95,7 @@ check_err(const std::vector<T>& out,
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
{
|
||||
std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
}
|
||||
res = false;
|
||||
@@ -102,22 +103,22 @@ check_err(const std::vector<T>& out,
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
|
||||
std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
typename std::enable_if<std::is_same<T, half_t>::value, bool>::type
|
||||
check_err(const std::vector<T>& out,
|
||||
const std::vector<T>& ref,
|
||||
typename std::enable_if<std::is_same_v<T, half_t>, bool>::type
|
||||
check_err(span<const T> out,
|
||||
span<const T> ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double rtol = 1e-3,
|
||||
double atol = 1e-3)
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
@@ -137,7 +138,7 @@ check_err(const std::vector<T>& out,
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
{
|
||||
std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
}
|
||||
res = false;
|
||||
@@ -145,11 +146,22 @@ check_err(const std::vector<T>& out,
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
|
||||
std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
typename std::enable_if<std::is_same<T, half_t>::value, bool>::type
|
||||
check_err(const std::vector<T>& out,
|
||||
const std::vector<T>& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double rtol = 1e-3,
|
||||
double atol = 1e-3)
|
||||
{
|
||||
return check_err(span<const T>{out}, span<const T>{ref}, msg, rtol, atol);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::enable_if_t<(std::is_integral_v<T> && !std::is_same_v<T, bhalf_t>)
|
||||
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
||||
@@ -194,7 +206,7 @@ check_err(const std::vector<T>& out,
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
std::cout << "max err: " << max_err << std::endl;
|
||||
std::cerr << "max err: " << max_err << std::endl;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,10 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iterator>
|
||||
#include <random>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
@@ -25,6 +28,15 @@ struct FillUniformDistribution
|
||||
std::uniform_real_distribution<float> dis(a_, b_);
|
||||
std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) -> std::void_t<decltype(
|
||||
std::declval<FillUniformDistribution>()(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
|
||||
|
||||
@@ -3,15 +3,16 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/span.hpp"
|
||||
|
||||
template <typename Range>
|
||||
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
|
||||
@@ -235,6 +236,9 @@ auto make_ParallelTensorFunctor(F f, Xs... xs)
|
||||
template <typename T>
|
||||
struct Tensor
|
||||
{
|
||||
using Descriptor = HostTensorDescriptor;
|
||||
using Data = std::vector<T>;
|
||||
|
||||
template <typename X>
|
||||
Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize())
|
||||
{
|
||||
@@ -251,7 +255,7 @@ struct Tensor
|
||||
{
|
||||
}
|
||||
|
||||
Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpaceSize()) {}
|
||||
Tensor(const Descriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpaceSize()) {}
|
||||
|
||||
template <typename OutT>
|
||||
Tensor<OutT> CopyAsType() const
|
||||
@@ -278,9 +282,9 @@ struct Tensor
|
||||
{
|
||||
}
|
||||
|
||||
const std::vector<std::size_t>& GetLengths() const { return mDesc.GetLengths(); }
|
||||
decltype(auto) GetLengths() const { return mDesc.GetLengths(); }
|
||||
|
||||
const std::vector<std::size_t>& GetStrides() const { return mDesc.GetStrides(); }
|
||||
decltype(auto) GetStrides() const { return mDesc.GetStrides(); }
|
||||
|
||||
std::size_t GetNumOfDimension() const { return mDesc.GetNumOfDimension(); }
|
||||
|
||||
@@ -288,6 +292,8 @@ struct Tensor
|
||||
|
||||
std::size_t GetElementSpaceSize() const { return mDesc.GetElementSpaceSize(); }
|
||||
|
||||
std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); }
|
||||
|
||||
void SetZero()
|
||||
{
|
||||
for(auto& v : mData)
|
||||
@@ -425,14 +431,40 @@ struct Tensor
|
||||
return mData[mDesc.GetOffsetFromMultiIndex(idx)];
|
||||
}
|
||||
|
||||
typename std::vector<T>::iterator begin() { return mData.begin(); }
|
||||
typename Data::iterator begin() { return mData.begin(); }
|
||||
|
||||
typename std::vector<T>::iterator end() { return mData.end(); }
|
||||
typename Data::iterator end() { return mData.end(); }
|
||||
|
||||
typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
|
||||
typename Data::pointer data() { return mData.data(); }
|
||||
|
||||
typename std::vector<T>::const_iterator end() const { return mData.end(); }
|
||||
typename Data::const_iterator begin() const { return mData.begin(); }
|
||||
|
||||
HostTensorDescriptor mDesc;
|
||||
std::vector<T> mData;
|
||||
typename Data::const_iterator end() const { return mData.end(); }
|
||||
|
||||
typename Data::const_pointer data() const { return mData.data(); }
|
||||
|
||||
typename Data::size_type size() const { return mData.size(); }
|
||||
|
||||
template <typename U = T>
|
||||
auto AsSpan() const
|
||||
{
|
||||
constexpr std::size_t FromSize = sizeof(T);
|
||||
constexpr std::size_t ToSize = sizeof(U);
|
||||
|
||||
using Element = std::add_const_t<std::remove_reference_t<U>>;
|
||||
return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
|
||||
}
|
||||
|
||||
template <typename U = T>
|
||||
auto AsSpan()
|
||||
{
|
||||
constexpr std::size_t FromSize = sizeof(T);
|
||||
constexpr std::size_t ToSize = sizeof(U);
|
||||
|
||||
using Element = std::remove_reference_t<U>;
|
||||
return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
|
||||
}
|
||||
|
||||
Descriptor mDesc;
|
||||
Data mData;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user