Better argsort (CPU) (#835)

* Better argsort (CPU)

* Minor

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Commit e66d307e13 (parent f7adde1043)
Authored by Kawrakow, 2025-10-16 11:31:03 +03:00; committed by GitHub
4 changed files with 89 additions and 3 deletions

CMakeLists.txt

@@ -256,8 +256,8 @@ if (GGML_BLAS)
 endif()
 endif()
-set (GGML_SOURCES_IQK iqk/iqk_quantize.cpp)
-set (GGML_HEADERS_IQK iqk/iqk_config.h)
+set (GGML_SOURCES_IQK iqk/iqk_quantize.cpp iqk/iqk_cpu_ops.cpp)
+set (GGML_HEADERS_IQK iqk/iqk_config.h iqk/iqk_cpu_ops.h)
 if (GGML_IQK_MUL_MAT)
     message(STATUS "Using optimized iqk matrix multiplications")
     add_compile_definitions(GGML_USE_IQK_MULMAT)

ggml.c

@@ -12,6 +12,7 @@
#include "ggml.h"
#include "ggml-aarch64.h"
#include "iqk/iqk_quantize.h"
#include "iqk/iqk_cpu_ops.h"
#if GGML_USE_IQK_MULMAT
#include "iqk/iqk_mul_mat.h"
#include "iqk/iqk_config.h"
@@ -9408,6 +9409,7 @@ struct ggml_tensor * ggml_argsort(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);

     ggml_set_op_params_i32(result, 0, (int32_t) order);
+    ggml_set_op_params_i32(result, 1, (int32_t) a->ne[0]);

     result->op   = GGML_OP_ARGSORT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -9446,6 +9448,7 @@ struct ggml_tensor * ggml_top_k(
     GGML_ASSERT(a->ne[0] >= k);

     struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
+    ggml_set_op_params_i32(result, 1, k);

     result = ggml_view_4d(ctx, result,
                 k, result->ne[1], result->ne[2], result->ne[3],
@@ -19942,7 +19945,8 @@ static void ggml_compute_forward_argsort(
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_argsort_f32(params, dst);
+                iqk_argsort(dst, params->ith, params->nth);
+                //ggml_compute_forward_argsort_f32(params, dst);
             } break;
         default:
             {
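
For context: ggml_argsort() now records the row length ne[0] in op_params[1] (first ggml.c hunk above), and ggml_top_k() overwrites that slot with k (second hunk), so the new CPU kernel can tell when only the first k entries per row are actually needed. A minimal usage sketch, with ctx and logits as hypothetical names that are not part of this commit:

// ggml_top_k() builds a descending argsort node and stores k in op_params[1];
// with this commit the CPU kernel (iqk_argsort) then falls back to
// std::partial_sort instead of a full sort whenever k < ne[0].
struct ggml_tensor * top8 = ggml_top_k(ctx, logits, 8);
// top8 is a k-wide view of the argsort result: for each row of logits,
// the indices of its 8 largest values, in descending order.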

iqk/iqk_cpu_ops.cpp (new file)

@@ -0,0 +1,57 @@
//
// Copyright (C) 2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#include "iqk_cpu_ops.h"
#include "ggml.h"
#include <cstdint>
#include <vector>
#include <algorithm>
void iqk_grouped_top_k([[maybe_unused]] ggml_tensor * dst, [[maybe_unused]] int ith, [[maybe_unused]] int nth) {
}
void iqk_argsort(ggml_tensor * dst, int ith, int nth) {
auto src = dst->src[0];
GGML_ASSERT(dst->type == GGML_TYPE_I32);
GGML_ASSERT(src->type == GGML_TYPE_F32);
auto nrows = ggml_nrows(src);
auto npt = (nrows + nth - 1)/nth;
auto first = npt*ith;
auto last = std::min(first + npt, nrows);
if (last <= first) return;
auto order = (ggml_sort_order)dst->op_params[0];
int nk = dst->op_params[1];
int ne00 = src->ne[0];
thread_local std::vector<std::pair<float,int>> aux;
if ((int)aux.size() < ne00) aux.resize(ne00);
for (int ir = first; ir < last; ++ir) {
auto data = (const float *)((const char *)src->data + ir*src->nb[1]);
for (int j = 0; j < ne00; ++j) aux[j] = {data[j], j};
if (nk < ne00) {
if (order == GGML_SORT_ORDER_DESC) {
std::partial_sort(aux.begin(), aux.begin() + nk, aux.end(), std::greater<std::pair<float,int>>{});
} else {
std::partial_sort(aux.begin(), aux.begin() + nk, aux.end());
}
} else {
if (order == GGML_SORT_ORDER_DESC) {
std::sort(aux.begin(), aux.end(), std::greater<std::pair<float,int>>{});
} else {
std::sort(aux.begin(), aux.end());
}
}
auto y = (int32_t *)((char *)dst->data + ir*dst->nb[1]);
for (int j = 0; j < ne00; ++j) y[j] = aux[j].second;
}
}

iqk/iqk_cpu_ops.h (new file)

@@ -0,0 +1,25 @@
//
// Copyright (C) 2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#pragma once

#include <stdint.h>
#include <stdbool.h>

#include "iqk_config.h"

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_tensor;

void iqk_grouped_top_k(struct ggml_tensor * dst, int ith, int nth);
void iqk_argsort(struct ggml_tensor * dst, int ith, int nth);

#ifdef __cplusplus
}
#endif
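
Below is a standalone sketch (not part of the commit) of the pair-sort approach used in iqk_argsort above: pair each value with its index, sort the pairs, and keep only the indices, switching to std::partial_sort when just the top-k entries matter. std::partial_sort does roughly N*log(k) comparisons versus N*log(N) for a full sort, which is the main practical win for ggml_top_k on long rows.

// topk_sketch.cpp (hypothetical file name); compile with: g++ -std=c++17 topk_sketch.cpp
#include <algorithm>
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

int main() {
    std::vector<float> row = {0.1f, 2.5f, -1.0f, 0.7f};
    int k = 2;                                    // only the two largest entries are needed

    // Pair each value with its index, the same way iqk_argsort fills its aux buffer
    std::vector<std::pair<float,int>> aux(row.size());
    for (int j = 0; j < (int)row.size(); ++j) aux[j] = {row[j], j};

    // Descending top-k: afterwards aux[0..k) holds the k largest (value, index) pairs
    std::partial_sort(aux.begin(), aux.begin() + k, aux.end(),
                      std::greater<std::pair<float,int>>{});

    for (int j = 0; j < k; ++j) printf("%d ", aux[j].second);    // prints: 1 3
    printf("\n");
    return 0;
}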