mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 06:49:15 +00:00
CK: removed the api reference (#3571)
* removed the api reference
* updating to the latest rocm-docs-core min version
* fixed a formatting issue with buffer views
* removed reference links from code snippets
* removed reference links from code snippets

---------

Co-authored-by: John Afaganis <john.afaganis@amd.com>
This commit is contained in:
@@ -59,10 +59,6 @@ The key insight is that convolution can be transformed from a complex nested loo
|
||||
|
||||
|
||||
|
||||
.. image:: diagrams/convolution_example.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
|
||||
.. image:: diagrams/convolution_example.svg
|
||||
:alt: Diagram
|
||||
:align: center
|
||||
@@ -88,7 +84,6 @@ Non-overlapping tiles:
|
||||
|
||||
// Original matrix: shape=(6, 6), strides=(6, 1)
|
||||
// Tiled view: shape=(3, 3, 2, 2), strides=(12, 2, 6, 1)
|
||||
// See :ref:`ck_tile_descriptors` for descriptor details
|
||||
using TileDescriptor = TensorDescriptor<
|
||||
Sequence<kNumTiles, kNumTiles, kTileSize, kTileSize>,
|
||||
Sequence<12, 2, 6, 1>
|
||||
@@ -243,7 +238,6 @@ The im2col transformation converts the 4D windows tensor into a 2D matrix suitab
|
||||
>;
|
||||
|
||||
// Step 2: Apply merge transforms to create 2D im2col layout
|
||||
// See :ref:`ck_tile_transforms` for transform operations
|
||||
using Im2colDescriptor = decltype(
|
||||
transform_tensor_descriptor(
|
||||
WindowsDescriptor{},
|
||||
@@ -312,7 +306,6 @@ Combining all components into an optimized convolution implementation:
|
||||
>;
|
||||
|
||||
// Tile distribution for matrix multiplication
|
||||
// See :ref:`ck_tile_tile_distribution` for details
|
||||
using ATileDist = TileDistribution<
|
||||
Sequence<TileM, TileK>,
|
||||
Sequence<BlockM, 1>
|
||||
@@ -327,7 +320,6 @@ Combining all components into an optimized convolution implementation:
|
||||
>;
|
||||
|
||||
// Thread-local accumulator
|
||||
// See :ref:`ck_tile_static_distributed_tensor`
|
||||
StaticDistributedTensor<DataType, CTileDist> c_accumulator;
|
||||
|
||||
// Initialize accumulator
|
||||
@@ -339,7 +331,6 @@ Combining all components into an optimized convolution implementation:
|
||||
// Main GEMM loop over K dimension
|
||||
for (index_t k_tile = 0; k_tile < PatchSize; k_tile += TileK) {
|
||||
// Create tile windows for im2col matrix and kernel
|
||||
// See :ref:`ck_tile_tile_window` for window operations
|
||||
auto a_window = make_tile_window<ATileDist>(
|
||||
input, Im2colDesc{H, W, K},
|
||||
{blockIdx.y * TileM, k_tile}
|
||||
@@ -350,7 +341,7 @@ Combining all components into an optimized convolution implementation:
|
||||
{k_tile, 0}
|
||||
);
|
||||
|
||||
// Load tiles - see :ref:`ck_tile_load_store_traits` for optimization
|
||||
// Load tiles
|
||||
auto a_tile = a_window.load();
|
||||
auto b_tile = b_window.load();
|
||||
|
||||
@@ -476,7 +467,6 @@ CK Tile enables several optimizations for convolution:
|
||||
__shared__ float smem_b[TileK][TileN];
|
||||
|
||||
// Collaborative loading with proper bank conflict avoidance
|
||||
// See :ref:`ck_tile_lds_bank_conflicts` for optimization
|
||||
auto load_tile_to_smem = [&](auto& window, float smem[][TileK]) {
|
||||
#pragma unroll
|
||||
for (index_t i = threadIdx.y; i < TileM; i += blockDim.y) {
|
||||
@@ -560,7 +550,7 @@ This example demonstrates how CK Tile transforms convolution from a memory-bound
|
||||
|
||||
- **Sliding windows** can be efficiently represented using tensor descriptors with appropriate strides
|
||||
- **Im2col transformation** converts convolution to matrix multiplication without data copies
|
||||
- **Tile distribution** enables optimal work distribution across GPU threads (see :ref:`ck_tile_tile_distribution`)
|
||||
- **Tile distribution** enables optimal work distribution across GPU threads (see :ref:`ck_tile_distribution`)
|
||||
- **Multi-channel support** extends naturally through higher-dimensional descriptors
|
||||
- **Performance optimizations** like vectorization and shared memory are seamlessly integrated (see :ref:`ck_tile_gemm_optimization` for similar techniques)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user