Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-04-26 01:19:20 +00:00)
* Trying to implement quantized fmoe - not working yet
* This works, but is slower than the non-working version
* quantize_mmq_q8_1_id
* Minor

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
39 lines · 1.5 KiB · Plaintext
//
// Copyright (C) 2023-2024 The ggml authors
// Copyright (C) 2024 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#pragma once

#include "common.cuh"
#include "mmq.cuh"

#include <cstdint>
#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128

// Rows are padded to a multiple of MATRIX_ROW_PADDING before quantization, so
// the padding must be divisible by the number of values a quantization block
// covers, or the last block of a row could read past the end of the buffer.
static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access.");
static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");
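
// Worked check of the two asserts above, assuming MATRIX_ROW_PADDING (defined
// in common.cuh) is 512:
//   512 % 256       == 0
//   512 % (4 * 128) == 0   (the factor 4 matches the 4 floats each MMQ
//                           quantization thread processes)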

// Common signature for quantize_row_q8_1_cuda and quantize_mmq_q8_1_cuda
// below, so a caller can select a launcher at runtime.
typedef void (*quantize_cuda_t)(
    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
    const ggml_type type_x, cudaStream_t stream);

// Quantize kx1 x channels rows of kx0 floats from x into q8_1 blocks at vy,
// with each row padded to kx0_padded values.
void quantize_row_q8_1_cuda(
    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
    const ggml_type type_x, cudaStream_t stream);
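
// Illustrative sizing sketch, not part of this header; assumes QK8_1 and
// block_q8_1 are visible through ggml's CUDA headers. One block_q8_1 is
// produced per QK8_1 values of the padded row, for every row of every channel.
static inline size_t example_q8_1_buffer_size(int64_t kx0_padded, int64_t kx1, int64_t channels) {
    return (size_t)(kx0_padded / QK8_1) * sizeof(block_q8_1) * (size_t)(kx1 * channels);
}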

// Same inputs as quantize_row_q8_1_cuda, but writes the q8_1 layout expected
// by the MMQ (quantized matrix multiplication) kernels; type_x selects the
// layout to produce.
void quantize_mmq_q8_1_cuda(
    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
    const ggml_type type_x, cudaStream_t stream);
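
// Illustrative only, not part of this header: a caller might route through the
// quantize_cuda_t typedef like this (the use_mmq flag is hypothetical).
static inline quantize_cuda_t example_pick_quantizer(bool use_mmq) {
    return use_mmq ? quantize_mmq_q8_1_cuda : quantize_row_q8_1_cuda;
}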

// MMQ variant used for indirect (expert-routed, fused MoE) matrix
// multiplication: row_mapping tells the kernel which source rows to quantize.
void quantize_mmq_q8_1_id_cuda(
    const float * x, void * vy, const char * row_mapping, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded,
    const ggml_type type_x, cudaStream_t stream);

// For now only applicable for tensors with ne[1] = 1, ne[3] = 1, and useful if ne[2] > 1
void quantize_tensor_q8_1_cuda(const struct ggml_tensor * src, void * vy, const enum ggml_type type, cudaStream_t stream);
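
// Illustrative guard, not part of this header: mirrors the shape constraint
// stated in the comment above before taking the tensor-level path.
static inline bool example_can_quantize_tensor(const struct ggml_tensor * src) {
    return src->ne[1] == 1 && src->ne[3] == 1;
}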