From e66d307e1302efa9a34a89aac253fe5e1d0a3aed Mon Sep 17 00:00:00 2001
From: Kawrakow <iwankawrakow@gmail.com>
Date: Thu, 16 Oct 2025 11:31:03 +0300
Subject: [PATCH] Better argsort (CPU) (#835)

* Better argsort (CPU)

* Minor

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 ggml/src/CMakeLists.txt      |  4 +--
 ggml/src/ggml.c              |  6 +++-
 ggml/src/iqk/iqk_cpu_ops.cpp | 57 ++++++++++++++++++++++++++++++++++++
 ggml/src/iqk/iqk_cpu_ops.h   | 25 ++++++++++++++++
 4 files changed, 89 insertions(+), 3 deletions(-)
 create mode 100644 ggml/src/iqk/iqk_cpu_ops.cpp
 create mode 100644 ggml/src/iqk/iqk_cpu_ops.h
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 1c9fcc24..707ab6f2 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -256,8 +256,8 @@ if (GGML_BLAS)
     endif()
 endif()
 
-set (GGML_SOURCES_IQK iqk/iqk_quantize.cpp)
-set (GGML_HEADERS_IQK iqk/iqk_config.h)
+set (GGML_SOURCES_IQK iqk/iqk_quantize.cpp iqk/iqk_cpu_ops.cpp)
+set (GGML_HEADERS_IQK iqk/iqk_config.h iqk/iqk_cpu_ops.h)
 if (GGML_IQK_MUL_MAT)
     message(STATUS "Using optimized iqk matrix multiplications")
     add_compile_definitions(GGML_USE_IQK_MULMAT)
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1869d19b..bfd5e41e 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -12,6 +12,7 @@
 #include "ggml.h"
 #include "ggml-aarch64.h"
 #include "iqk/iqk_quantize.h"
+#include "iqk/iqk_cpu_ops.h"
 #if GGML_USE_IQK_MULMAT
 #include "iqk/iqk_mul_mat.h"
 #include "iqk/iqk_config.h"
@@ -9408,6 +9409,7 @@ struct ggml_tensor * ggml_argsort(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
 
     ggml_set_op_params_i32(result, 0, (int32_t) order);
+    ggml_set_op_params_i32(result, 1, (int32_t) a->ne[0]);
 
     result->op   = GGML_OP_ARGSORT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -9446,6 +9448,7 @@ struct ggml_tensor * ggml_top_k(
     GGML_ASSERT(a->ne[0] >= k);
 
     struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
+    ggml_set_op_params_i32(result, 1, k);
 
     result = ggml_view_4d(ctx, result,
                 k, result->ne[1], result->ne[2], result->ne[3],
@@ -19942,7 +19945,8 @@ static void ggml_compute_forward_argsort(
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_argsort_f32(params, dst);
+                iqk_argsort(dst, params->ith, params->nth);
+                //ggml_compute_forward_argsort_f32(params, dst);
             } break;
         default:
             {
diff --git a/ggml/src/iqk/iqk_cpu_ops.cpp b/ggml/src/iqk/iqk_cpu_ops.cpp
new file mode 100644
index 00000000..74de1479
--- /dev/null
+++ b/ggml/src/iqk/iqk_cpu_ops.cpp
@@ -0,0 +1,57 @@
+//
+// Copyright (C) 2025 Iwan Kawrakow
+// MIT license
+// SPDX-License-Identifier: MIT
+//
+
+#include "iqk_cpu_ops.h"
+#include "ggml.h"
+
+#include <cstdint>
+#include <vector>
+#include <algorithm>
+
+void iqk_grouped_top_k([[maybe_unused]] ggml_tensor * dst, [[maybe_unused]] int ith, [[maybe_unused]] int nth) { // empty stub: does nothing, all arguments are ignored
+}
+
+// Per-row argsort of the F32 tensor dst->src[0] into the I32 tensor dst. Thread ith of nth
+// handles one contiguous chunk of rows. op_params[0] holds the sort order, op_params[1] the
+// number of leading entries per row that must end up correctly ordered.
+void iqk_argsort(ggml_tensor * dst, int ith, int nth) {
+    auto src = dst->src[0];
+    GGML_ASSERT(dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    auto nrows = ggml_nrows(src);
+    auto npt   = (nrows + nth - 1)/nth; // rows per thread, rounded up
+    auto first = npt*ith;
+    auto last  = std::min(first + npt, nrows);
+    if (last <= first) return;
+    auto order = (ggml_sort_order)dst->op_params[0];
+    int ne00 = src->ne[0];
+    int nk   = dst->op_params[1];
+    if (nk <= 0) nk = ne00; // count missing or invalid: fall back to sorting the whole row
+    // The scratch buffer persists per thread and only ever grows, so it may be longer than
+    // the current row; every range below must stop at row_end rather than aux.end().
+    thread_local std::vector<std::pair<float,int>> aux;
+    if ((int)aux.size() < ne00) aux.resize(ne00);
+    const auto row_end = aux.begin() + ne00;
+
+    for (auto ir = first; ir < last; ++ir) {
+        auto data = (const float *)((const char *)src->data + ir*src->nb[1]);
+        for (int j = 0; j < ne00; ++j) aux[j] = {data[j], j};
+        if (nk < ne00) { // only the first nk entries get ordered; the rest are unspecified
+            if (order == GGML_SORT_ORDER_DESC) {
+                std::partial_sort(aux.begin(), aux.begin() + nk, row_end, std::greater<std::pair<float,int>>{});
+            } else {
+                std::partial_sort(aux.begin(), aux.begin() + nk, row_end);
+            }
+        } else if (order == GGML_SORT_ORDER_DESC) {
+            std::sort(aux.begin(), row_end, std::greater<std::pair<float,int>>{});
+        } else {
+            std::sort(aux.begin(), row_end);
+        }
+        auto y = (int32_t *)((char *)dst->data + ir*dst->nb[1]);
+        for (int j = 0; j < ne00; ++j) y[j] = aux[j].second;
+    }
+}
+
diff --git a/ggml/src/iqk/iqk_cpu_ops.h b/ggml/src/iqk/iqk_cpu_ops.h
new file mode 100644
index 00000000..c83d8061
--- /dev/null
+++ b/ggml/src/iqk/iqk_cpu_ops.h
@@ -0,0 +1,25 @@
+//
+// Copyright (C) 2025 Iwan Kawrakow
+// MIT license
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+#include <stdint.h>
+#include <stdbool.h>
+#include "iqk_config.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_tensor;
+
+void iqk_grouped_top_k(struct ggml_tensor * dst, int ith, int nth);
+// Writes per-row argsort indices of dst->src[0] (F32) into dst (I32); thread ith of nth handles its share of rows.
+void iqk_argsort(struct ggml_tensor * dst, int ith, int nth);
+
+#ifdef __cplusplus
+}
+#endif
+
+