From e66d307e1302efa9a34a89aac253fe5e1d0a3aed Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Thu, 16 Oct 2025 11:31:03 +0300
Subject: [PATCH] Better argsort (CPU) (#835)

* Better argsort (CPU)

* Minor

---------

Co-authored-by: Iwan Kawrakow
---
Review notes (ignored by git am, not part of the commit message):
* iqk_argsort sorts only the first ne[0] entries of its per-thread scratch
  buffer, and treats a non-positive or too large op_params[1] as a full sort.
* Leading whitespace of context lines was reconstructed; use
  `git apply --ignore-whitespace` if a hunk is rejected.
* The `index` blob id of iqk_cpu_ops.cpp predates the changes listed above.

 ggml/src/CMakeLists.txt      |  4 +--
 ggml/src/ggml.c              |  6 +++-
 ggml/src/iqk/iqk_cpu_ops.cpp | 79 ++++++++++++++++++++++++++++++++++++
 ggml/src/iqk/iqk_cpu_ops.h   | 25 ++++++++++++++++
 4 files changed, 111 insertions(+), 3 deletions(-)
 create mode 100644 ggml/src/iqk/iqk_cpu_ops.cpp
 create mode 100644 ggml/src/iqk/iqk_cpu_ops.h

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 1c9fcc24..707ab6f2 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -256,8 +256,8 @@ if (GGML_BLAS)
     endif()
 endif()
 
-set (GGML_SOURCES_IQK iqk/iqk_quantize.cpp)
-set (GGML_HEADERS_IQK iqk/iqk_config.h)
+set (GGML_SOURCES_IQK iqk/iqk_quantize.cpp iqk/iqk_cpu_ops.cpp)
+set (GGML_HEADERS_IQK iqk/iqk_config.h iqk/iqk_cpu_ops.h)
 if (GGML_IQK_MUL_MAT)
     message(STATUS "Using optimized iqk matrix multiplications")
     add_compile_definitions(GGML_USE_IQK_MULMAT)
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1869d19b..bfd5e41e 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -12,6 +12,7 @@
 #include "ggml.h"
 #include "ggml-aarch64.h"
 #include "iqk/iqk_quantize.h"
+#include "iqk/iqk_cpu_ops.h"
 #if GGML_USE_IQK_MULMAT
 #include "iqk/iqk_mul_mat.h"
 #include "iqk/iqk_config.h"
@@ -9408,6 +9409,7 @@ struct ggml_tensor * ggml_argsort(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
 
     ggml_set_op_params_i32(result, 0, (int32_t) order);
+    ggml_set_op_params_i32(result, 1, (int32_t) a->ne[0]);
 
     result->op   = GGML_OP_ARGSORT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -9446,6 +9448,7 @@ struct ggml_tensor * ggml_top_k(
     GGML_ASSERT(a->ne[0] >= k);
 
     struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
+    ggml_set_op_params_i32(result, 1, k);
 
     result = ggml_view_4d(ctx, result,
                 k, result->ne[1], result->ne[2], result->ne[3],
@@ -19942,7 +19945,8 @@ static void ggml_compute_forward_argsort(
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_argsort_f32(params, dst);
+                iqk_argsort(dst, params->ith, params->nth);
+                //ggml_compute_forward_argsort_f32(params, dst);
             } break;
         default:
             {
diff --git a/ggml/src/iqk/iqk_cpu_ops.cpp b/ggml/src/iqk/iqk_cpu_ops.cpp
new file mode 100644
index 00000000..74de1479
--- /dev/null
+++ b/ggml/src/iqk/iqk_cpu_ops.cpp
@@ -0,0 +1,79 @@
+//
+// Copyright (C) 2025 Iwan Kawrakow
+// MIT license
+// SPDX-License-Identifier: MIT
+//
+
+#include "iqk_cpu_ops.h"
+#include "ggml.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <utility>
+#include <vector>
+
+// Currently a no-op: the body is empty and all parameters are unused.
+void iqk_grouped_top_k([[maybe_unused]] ggml_tensor * dst, [[maybe_unused]] int ith, [[maybe_unused]] int nth) {
+}
+
+// Row-wise argsort of the F32 tensor dst->src[0] into the I32 tensor dst.
+//
+//   dst->op_params[0]  sort order (ggml_sort_order)
+//   dst->op_params[1]  number of leading positions per row that must be in
+//                      sorted order; the remaining positions still receive the
+//                      other indices of the row, but in unspecified order
+//   ith, nth           index of the calling thread and total thread count;
+//                      rows are split into nth contiguous chunks
+//
+// Values are compared as (value, index) pairs, so equal values are ordered by
+// their index (descending index for GGML_SORT_ORDER_DESC).
+void iqk_argsort(ggml_tensor * dst, int ith, int nth) {
+
+    auto src = dst->src[0];
+    GGML_ASSERT(dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+
+    auto nrows = ggml_nrows(src);
+    auto npt   = (nrows + nth - 1)/nth;
+    auto first = npt*ith;
+    auto last  = std::min(first + npt, nrows);
+    if (last <= first) return;
+
+    auto order = (ggml_sort_order)dst->op_params[0];
+    int  ne00  = src->ne[0];
+    int  nk    = dst->op_params[1];
+    // An unset (zero) or out-of-range count means "sort the whole row".
+    if (nk <= 0 || nk > ne00) nk = ne00;
+
+    // The per-thread scratch buffer only ever grows, so it can be longer than
+    // the current row. Sort [row_begin, row_end) rather than up to aux.end():
+    // otherwise stale entries left by an earlier, wider row would be sorted
+    // into the result.
+    thread_local std::vector<std::pair<float,int>> aux;
+    if ((int)aux.size() < ne00) aux.resize(ne00);
+    const auto row_begin = aux.begin();
+    const auto row_end   = row_begin + ne00;
+
+    for (auto ir = first; ir < last; ++ir) {
+        auto data = (const float *)((const char *)src->data + ir*src->nb[1]);
+        for (int j = 0; j < ne00; ++j) aux[j] = {data[j], j};
+        if (nk < ne00) {
+            if (order == GGML_SORT_ORDER_DESC) {
+                std::partial_sort(row_begin, row_begin + nk, row_end, std::greater<std::pair<float,int>>{});
+            } else {
+                std::partial_sort(row_begin, row_begin + nk, row_end);
+            }
+        } else {
+            if (order == GGML_SORT_ORDER_DESC) {
+                std::sort(row_begin, row_end, std::greater<std::pair<float,int>>{});
+            } else {
+                std::sort(row_begin, row_end);
+            }
+        }
+        auto y = (int32_t *)((char *)dst->data + ir*dst->nb[1]);
+        for (int j = 0; j < ne00; ++j) y[j] = aux[j].second;
+    }
+
+}
+
diff --git a/ggml/src/iqk/iqk_cpu_ops.h b/ggml/src/iqk/iqk_cpu_ops.h
new file mode 100644
index 00000000..c83d8061
--- /dev/null
+++ b/ggml/src/iqk/iqk_cpu_ops.h
@@ -0,0 +1,25 @@
+//
+// Copyright (C) 2025 Iwan Kawrakow
+// MIT license
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+#include <stdint.h>
+#include <stdbool.h>
+#include "iqk_config.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_tensor;
+
+void iqk_grouped_top_k(struct ggml_tensor * dst, int ith, int nth);
+
+void iqk_argsort(struct ggml_tensor * dst, int ith, int nth);
+
+#ifdef __cplusplus
+}
+#endif
+
+