Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-04-25 17:09:22 +00:00).

Commit message — fused MoE routing ops:
* Fuse sigmoid+add+grouped_topk+get_rows (CPU)
* Fix CPU + CUDA, but CUDA is somehow not 100% correct: I get a slightly different PPL (lower!)
* Minor
* Fuse sigmoid+add+topk+get_rows (CUDA)
* Fuse sigmoid+add+topk+get_rows (CPU)
* Fuse topk+view+get_rows+reshape+softmax (CPU)
* Fuse topk+view+get_rows+reshape+softmax (CUDA)
* CPU: turn off the OpenAI top-k fusing for now. Something is not right and I don't see the bug. On the CPU one doesn't gain much, if anything, so not a big loss.
* Also fuse sum_rows and div

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>

File: 20 lines, 716 B (rendered as plaintext).
//
// Copyright (C) 2023-2024 The ggml authors
// Copyright (C) 2024 Iwan Kawrakow
//
// MIT license
// SPDX-License-Identifier: MIT
//
#pragma once

#include "common.cuh"

// Op entry points follow the ggml CUDA backend convention: `dst` carries the
// op parameters and its source tensors (dst->src[...]), while `ctx` supplies
// the device/stream state.
// NOTE(review): the implementations live in the corresponding .cu file and
// are not visible here; per-op descriptions below are inferred from the names
// and the fused-top-k commit context — confirm against the definitions.

// Argsort of the source tensor of `dst` (order taken from the op params).
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

// Argsort variant with a threshold — presumably limits the sorted output to
// entries passing a cutoff; TODO confirm exact semantics in the .cu file.
void ggml_cuda_op_argsort_thresh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

// Grouped top-k selection, used for MoE expert routing.
void ggml_cuda_op_grouped_topk(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

// Fused expert-selection path for BailingMoE-v2 style routing.
// `dst` and `topk` are the two result tensors of the fused op chain
// (routing weights and selected expert indices, respectively — verify).
void cuda_bailingmoev2_experts(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * topk);

// Fused expert-selection path for GLM-4.5-MoE style routing
// (sigmoid+add+topk+get_rows fusion per the commit notes).
void cuda_glm45moe_experts(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * topk);

// Fused expert-selection path for gpt-oss ("openai") style routing
// (topk+view+get_rows+reshape+softmax fusion per the commit notes).
// Note the different parameter roles: `topk` and `softmax` result tensors.
void cuda_openai_experts(ggml_backend_cuda_context & ctx, ggml_tensor * topk, ggml_tensor * softmax);