External Interface (#304)

* add client example * clean * clean * reorg * clean up profiler * reorg * clea * fix profiler * function for getinstances * update client example * update client example * update client example * update * update example * update Jenkins file * update cmake * update Jenkins
2026-05-13 17:55:48 +00:00 · 2022-06-26 19:39:02 -05:00
parent b653c5eb2e
commit aebd211c36
73 changed files with 2199 additions and 1961 deletions
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -6,43 +6,45 @@ function(add_instance_library INSTANCE_NAME)
 endfunction(add_instance_library INSTANCE_NAME)

 add_subdirectory(gemm)
+add_subdirectory(gemm_splitk)
 add_subdirectory(gemm_bias2d)
 add_subdirectory(gemm_bias_relu)
 add_subdirectory(gemm_bias_relu_add)
 add_subdirectory(gemm_reduce)
 add_subdirectory(gemm_bias_add_reduce)
+add_subdirectory(gemm_add_add_fastgelu)
 add_subdirectory(batched_gemm)
+add_subdirectory(batched_gemm_reduce)
+add_subdirectory(grouped_gemm)
 add_subdirectory(conv1d_fwd)
 add_subdirectory(conv2d_fwd)
 add_subdirectory(conv3d_fwd)
 add_subdirectory(conv2d_fwd_bias_relu)
 add_subdirectory(conv2d_fwd_bias_relu_add)
 add_subdirectory(conv2d_bwd_data)
-add_subdirectory(reduce)
 add_subdirectory(convnd_bwd_data)
-add_subdirectory(grouped_gemm)
 add_subdirectory(conv2d_bwd_weight)
-add_subdirectory(batched_gemm_reduce)
-add_subdirectory(gemm_add_add_fastgelu)
+add_subdirectory(reduce)

 add_library(device_operations STATIC
-    $<TARGET_OBJECTS:device_conv1d_fwd_instance>
-    $<TARGET_OBJECTS:device_batched_gemm_instance>
-    $<TARGET_OBJECTS:device_conv2d_bwd_data_instance>
-    $<TARGET_OBJECTS:device_conv2d_fwd_instance>
-    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_instance>
-    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_add_instance>
    $<TARGET_OBJECTS:device_gemm_instance>
+    $<TARGET_OBJECTS:device_gemm_splitk_instance>
    $<TARGET_OBJECTS:device_gemm_bias_relu_instance>
    $<TARGET_OBJECTS:device_gemm_bias_relu_add_instance>
    $<TARGET_OBJECTS:device_gemm_bias2d_instance>
-    $<TARGET_OBJECTS:device_reduce_instance>
-    $<TARGET_OBJECTS:device_convnd_bwd_data_instance>
-    $<TARGET_OBJECTS:device_grouped_gemm_instance>
-    $<TARGET_OBJECTS:device_conv2d_bwd_weight_instance>
-    $<TARGET_OBJECTS:device_batched_gemm_reduce_instance>
-    $<TARGET_OBJECTS:device_conv3d_fwd_instance>
    $<TARGET_OBJECTS:device_gemm_add_add_fastgelu_instance>
+    $<TARGET_OBJECTS:device_batched_gemm_instance>
+    $<TARGET_OBJECTS:device_batched_gemm_reduce_instance>
+    $<TARGET_OBJECTS:device_grouped_gemm_instance>
+    $<TARGET_OBJECTS:device_conv1d_fwd_instance>
+    $<TARGET_OBJECTS:device_conv2d_fwd_instance>
+    $<TARGET_OBJECTS:device_conv3d_fwd_instance>
+    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_instance>
+    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_add_instance>
+    $<TARGET_OBJECTS:device_conv2d_bwd_data_instance>
+    $<TARGET_OBJECTS:device_convnd_bwd_data_instance>
+    $<TARGET_OBJECTS:device_conv2d_bwd_weight_instance>
+    $<TARGET_OBJECTS:device_reduce_instance>
 )
 add_library(composablekernels::device_operations ALIAS device_operations)

@@ -67,8 +69,8 @@ target_include_directories(device_operations PUBLIC
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/thread>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/element>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>
 )

--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
@@ -44,7 +44,7 @@ using device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
@@ -44,7 +44,7 @@ using device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
@@ -48,7 +48,7 @@ using device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
@@ -49,7 +49,7 @@ using device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
@@ -44,7 +44,7 @@ using device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
@@ -44,7 +44,7 @@ using device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
@@ -53,7 +53,7 @@ using device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
@@ -49,7 +49,7 @@ using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp
@@ -44,7 +44,7 @@ using device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp
@@ -44,7 +44,7 @@ using device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp
@@ -44,7 +44,7 @@ using device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp
@@ -49,7 +49,7 @@ using device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp
@@ -59,7 +59,7 @@ using device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp
@@ -59,7 +59,7 @@ using device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp
@@ -59,7 +59,7 @@ using device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp
@@ -51,7 +51,7 @@ using device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances = std::tuple<
    >;

 void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances{});
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp
@@ -67,9 +67,11 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_in
        >;

 void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances(
-    std::vector<
-        DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>&
-        instances)
+    std::vector<DeviceBatchedGemmReducePtr<PassThrough,
+                                           PassThrough,
+                                           PassThrough,
+                                           DInElementOps,
+                                           DOutElementOps>>& instances)
 {
    add_device_operation_instances(
        instances,
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
@@ -67,9 +67,11 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_in
        >;

 void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances(
-    std::vector<
-        DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>&
-        instances)
+    std::vector<DeviceBatchedGemmReducePtr<PassThrough,
+                                           PassThrough,
+                                           PassThrough,
+                                           DInElementOps,
+                                           DOutElementOps>>& instances)
 {
    add_device_operation_instances(
        instances,
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp
@@ -67,9 +67,11 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_in
        >;

 void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
-    std::vector<
-        DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>&
-        instances)
+    std::vector<DeviceBatchedGemmReducePtr<PassThrough,
+                                           PassThrough,
+                                           PassThrough,
+                                           DInElementOps,
+                                           DOutElementOps>>& instances)
 {
    add_device_operation_instances(
        instances,
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp
@@ -64,9 +64,11 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_in
        >;

 void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances(
-    std::vector<
-        DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>&
-        instances)
+    std::vector<DeviceBatchedGemmReducePtr<PassThrough,
+                                           PassThrough,
+                                           PassThrough,
+                                           DInElementOps,
+                                           DOutElementOps>>& instances)
 {
    add_device_operation_instances(
        instances,
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -28,14 +28,6 @@ set(DEVICE_GEMM_INSTANCE_SOURCE
   device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp;
   device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp;
   device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp;
-   device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp;
-   device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp;
-   device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp;
-   device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp;
-   device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp;
-   device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp;
-   device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp;
-   device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp;
   device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp;
   device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp;
   device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp;
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt
@@ -0,0 +1,15 @@
+set(DEVICE_GEMM_SPLITK_INSTANCE_SOURCE
+   device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp;
+   device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp;
+   device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp;
+   device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp;
+   device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp;
+   device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp;
+   device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp;
+   device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp;
+)
+
+add_library(device_gemm_splitk_instance OBJECT ${DEVICE_GEMM_SPLITK_INSTANCE_SOURCE})
+
+target_compile_features(device_gemm_splitk_instance PUBLIC)
+set_target_properties(device_gemm_splitk_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp
@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances = std::tuple<
    >;

 void add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances{});
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp
@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances = std::tuple<
    >;

 void add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances{});
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
    >;

 void add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances{});
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -83,7 +83,7 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple<
 //     >;

 void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances{});
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp
@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances = std::tuple<
    >;

 void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances{});
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp
@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances = std::tuple<
    >;

 void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances{});
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp
@@ -51,7 +51,7 @@ using device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances = std::tuple<
    >;

 void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances{});
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp
@@ -51,7 +51,7 @@ using device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances = std::tuple<
    >;

 void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+    std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances{});