diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt index b9748aabda..9f4c43338e 100644 --- a/example/65_gemm_multiply_multiply/CMakeLists.txt +++ b/example/65_gemm_multiply_multiply/CMakeLists.txt @@ -22,7 +22,7 @@ foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) add_example_executable(example_moe_gemm1_xdl_pk_i4 moe_gemm1_xdl_pk_i4.cpp) add_example_executable(example_moe_gemm2_xdl_pk_i4 moe_gemm2_xdl_pk_i4.cpp) - if(CK_hip_VERSION VERSION_LESS_EQUAL 6.3.42132) + if(hip_VERSION_FLAT LESS_EQUAL 600342132) set(EXAMPLE_COMPILE_OPTIONS) check_cxx_compiler_flag("-mllvm --amdgpu-enable-max-ilp-scheduling-strategy=1" HAS_MAX_ILP_SCHEDULING_STRATEGY) if(HAS_MAX_ILP_SCHEDULING_STRATEGY) @@ -31,8 +31,7 @@ foreach(gpu IN LISTS GPU_TARGETS) example_compile_options(example_moe_gemm1_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS}) example_compile_options(example_moe_gemm2_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS}) endif() - set(GEMM_OPTIONS) - list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32") + set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32") example_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS}) example_compile_options(example_moe_gemm1_xdl_fp8 PRIVATE ${GEMM_OPTIONS}) example_compile_options(example_moe_gemm2_xdl_fp8 PRIVATE ${GEMM_OPTIONS}) @@ -40,16 +39,25 @@ foreach(gpu IN LISTS GPU_TARGETS) endif() endforeach() -set(GEMM_OPTIONS) -list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32") -set(BLOCKSCALE_GEMM_OPTIONS) +set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32") +set(BLOCKSCALE_GEMM_OPTIONS ) check_cxx_compiler_flag("-mllvm --misched-bottomup=1" HAS_MISCHED_BOTTOMUP) check_cxx_compiler_flag("-mllvm --misched-prera-direction=bottomup" 
HAS_MISCHED_PRERA_DIRECTION) -if(HAS_MISCHED_BOTTOMUP) - list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --schedmodel=0 -mllvm --misched-bottomup=1") -elseif(HAS_MISCHED_PRERA_DIRECTION) - list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --schedmodel=0 -mllvm --misched-prera-direction=bottomup") + +if(hip_VERSION_FLAT LESS 600443483 OR hip_VERSION_FLAT GREATER_EQUAL 700000000) + if(HAS_MISCHED_BOTTOMUP) + list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --schedmodel=0 -mllvm --misched-bottomup=1") + elseif(HAS_MISCHED_PRERA_DIRECTION) + list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --schedmodel=0 -mllvm --misched-prera-direction=bottomup") + endif() +else() + if(HAS_MISCHED_BOTTOMUP) + list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-bottomup=1") + elseif(HAS_MISCHED_PRERA_DIRECTION) + list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-prera-direction=bottomup") + endif() endif() + check_cxx_compiler_flag("-mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupancy-experimental " HAS_MAX_OCCUPANCY_EXPERIMENTAL) if(HAS_MAX_OCCUPANCY_EXPERIMENTAL) list(APPEND BLOCKSCALE_GEMM_OPTIONS -mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupancy-experimental) diff --git a/include/ck/utility/generic_memory_space_atomic.hpp b/include/ck/utility/generic_memory_space_atomic.hpp index ab9cc4199c..011491ffc6 100644 --- a/include/ck/utility/generic_memory_space_atomic.hpp +++ b/include/ck/utility/generic_memory_space_atomic.hpp @@ -32,6 +32,22 @@ __device__ float atomic_add(float* p_dst, const float& x) return atomicAdd(p_dst, x); } 
+template <> +__device__ unsigned short atomic_add<unsigned short>(unsigned short* p_dst, const unsigned short& x) +{ + // Use atomicAdd with unsigned int + return static_cast<unsigned short>( + atomicAdd(reinterpret_cast<unsigned int*>(p_dst), static_cast<unsigned int>(x))); +} + +template <> +__device__ _Float16 atomic_add<_Float16>(_Float16* p_dst, const _Float16& x) +{ + // Use atomicAdd with unsigned int + return static_cast<_Float16>( + atomicAdd(reinterpret_cast<unsigned int*>(p_dst), static_cast<unsigned int>(x))); +} + template <> __device__ double atomic_add(double* p_dst, const double& x) { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_preshuffle.inc b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_preshuffle.inc index b44d60deaf..b987519082 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_preshuffle.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_preshuffle.inc @@ -10,27 +10,11 @@ namespace instance { #if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8)) -using GemmF8F8BF16InstanceVector = - std::vector>>&; +using GemmF8F8BF16InstanceVector = std::vector>>&; -using GemmF8F8F16InstanceVector = - std::vector>>&; +using GemmF8F8F16InstanceVector = std::vector>>&; void add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma32x32_mn_instances( GemmF8F8BF16InstanceVector& instances); @@ -48,7 +32,7 @@ void add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_p3_instances GemmF8F8BF16InstanceVector& instances); void add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_p4_instances( - GemmF8F8BF16InstanceVector& instances); + GemmF8F8BF16InstanceVector& instances); void add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_p5_instances( GemmF8F8BF16InstanceVector& instances); @@ -84,7 +68,7 @@ void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_defau GemmF8F8F16InstanceVector& instances); void 
add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances( - GemmF8F8F16InstanceVector& instances); + GemmF8F8F16InstanceVector& instances); void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances( GemmF8F8F16InstanceVector& instances); void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances(