Better argsort (CPU) (#835)

* Better argsort (CPU)

* Minor

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Commit e66d307e13 (parent f7adde1043)
Authored by Kawrakow, 2025-10-16 11:31:03 +03:00; committed by GitHub
4 changed files with 89 additions and 3 deletions

CMakeLists.txt

@@ -256,8 +256,8 @@ if (GGML_BLAS)
 endif()
 endif()
-set (GGML_SOURCES_IQK iqk/iqk_quantize.cpp)
-set (GGML_HEADERS_IQK iqk/iqk_config.h)
+set (GGML_SOURCES_IQK iqk/iqk_quantize.cpp iqk/iqk_cpu_ops.cpp)
+set (GGML_HEADERS_IQK iqk/iqk_config.h iqk/iqk_cpu_ops.h)
 if (GGML_IQK_MUL_MAT)
     message(STATUS "Using optimized iqk matrix multiplications")
     add_compile_definitions(GGML_USE_IQK_MULMAT)

ggml.c

@@ -12,6 +12,7 @@
#include "ggml.h"
#include "ggml-aarch64.h"
#include "iqk/iqk_quantize.h"
#include "iqk/iqk_cpu_ops.h"
#if GGML_USE_IQK_MULMAT
#include "iqk/iqk_mul_mat.h"
#include "iqk/iqk_config.h"
@@ -9408,6 +9409,7 @@ struct ggml_tensor * ggml_argsort(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);

     ggml_set_op_params_i32(result, 0, (int32_t) order);
+    ggml_set_op_params_i32(result, 1, (int32_t) a->ne[0]);

     result->op   = GGML_OP_ARGSORT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -9446,6 +9448,7 @@ struct ggml_tensor * ggml_top_k(
     GGML_ASSERT(a->ne[0] >= k);

     struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
+    ggml_set_op_params_i32(result, 1, k);

     result = ggml_view_4d(ctx, result,
                 k, result->ne[1], result->ne[2], result->ne[3],
@@ -19942,7 +19945,8 @@ static void ggml_compute_forward_argsort(
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_argsort_f32(params, dst);
+                iqk_argsort(dst, params->ith, params->nth);
+                //ggml_compute_forward_argsort_f32(params, dst);
             } break;
         default:
             {
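
For context: ggml_argsort() now records the row length ne[0] in op_params[1] (first ggml.c hunk above), and ggml_top_k() overwrites that slot with k (second hunk), so the new CPU kernel can tell when only the first k entries per row are actually needed. A minimal usage sketch, with ctx and logits as hypothetical names that are not part of this commit:

// ggml_top_k() builds a descending argsort node and stores k in op_params[1];
// with this commit the CPU kernel (iqk_argsort) then falls back to
// std::partial_sort instead of a full sort whenever k < ne[0].
struct ggml_tensor * top8 = ggml_top_k(ctx, logits, 8);
// top8 is a k-wide view of the argsort result: for each row of logits,
// the indices of its 8 largest values, in descending order.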

iqk/iqk_cpu_ops.cpp (new file)

@@ -0,0 +1,57 @@
//
// Copyright (C) 2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#include "iqk_cpu_ops.h"
#include "ggml.h"
#include <cstdint>
#include <vector>
#include <algorithm>
void iqk_grouped_top_k([[maybe_unused]] ggml_tensor * dst, [[maybe_unused]] int ith, [[maybe_unused]] int nth) {
}
void iqk_argsort(ggml_tensor * dst, int ith, int nth) {
auto src = dst->src[0];
GGML_ASSERT(dst->type == GGML_TYPE_I32);
GGML_ASSERT(src->type == GGML_TYPE_F32);
auto nrows = ggml_nrows(src);
auto npt = (nrows + nth - 1)/nth;
auto first = npt*ith;
auto last = std::min(first + npt, nrows);
if (last <= first) return;
auto order = (ggml_sort_order)dst->op_params[0];
int nk = dst->op_params[1];
int ne00 = src->ne[0];
thread_local std::vector<std::pair<float,int>> aux;
if ((int)aux.size() < ne00) aux.resize(ne00);
for (int ir = first; ir < last; ++ir) {
auto data = (const float *)((const char *)src->data + ir*src->nb[1]);
for (int j = 0; j < ne00; ++j) aux[j] = {data[j], j};
if (nk < ne00) {
if (order == GGML_SORT_ORDER_DESC) {
std::partial_sort(aux.begin(), aux.begin() + nk, aux.end(), std::greater<std::pair<float,int>>{});
} else {
std::partial_sort(aux.begin(), aux.begin() + nk, aux.end());
}
} else {
if (order == GGML_SORT_ORDER_DESC) {
std::sort(aux.begin(), aux.end(), std::greater<std::pair<float,int>>{});
} else {
std::sort(aux.begin(), aux.end());
}
}
auto y = (int32_t *)((char *)dst->data + ir*dst->nb[1]);
for (int j = 0; j < ne00; ++j) y[j] = aux[j].second;
}
}

iqk/iqk_cpu_ops.h (new file)

@@ -0,0 +1,25 @@
//
// Copyright (C) 2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#pragma once

#include <stdint.h>
#include <stdbool.h>

#include "iqk_config.h"

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_tensor;

void iqk_grouped_top_k(struct ggml_tensor * dst, int ith, int nth);
void iqk_argsort(struct ggml_tensor * dst, int ith, int nth);

#ifdef __cplusplus
}
#endif
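
Below is a standalone sketch (not part of the commit) of the pair-sort approach used in iqk_argsort above: pair each value with its index, sort the pairs, and keep only the indices, switching to std::partial_sort when just the top-k entries matter. std::partial_sort does roughly N*log(k) comparisons versus N*log(N) for a full sort, which is the main practical win for ggml_top_k on long rows.

// topk_sketch.cpp (hypothetical file name); compile with: g++ -std=c++17 topk_sketch.cpp
#include <algorithm>
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

int main() {
    std::vector<float> row = {0.1f, 2.5f, -1.0f, 0.7f};
    int k = 2;                                    // only the two largest entries are needed

    // Pair each value with its index, the same way iqk_argsort fills its aux buffer
    std::vector<std::pair<float,int>> aux(row.size());
    for (int j = 0; j < (int)row.size(); ++j) aux[j] = {row[j], j};

    // Descending top-k: afterwards aux[0..k) holds the k largest (value, index) pairs
    std::partial_sort(aux.begin(), aux.begin() + k, aux.end(),
                      std::greater<std::pair<float,int>>{});

    for (int j = 0; j < k; ++j) printf("%d ", aux[j].second);    // prints: 1 3
    printf("\n");
    return 0;
}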