[fix]: fix precision

This commit is contained in:
KMSorSMS
2025-11-07 14:56:05 +00:00
parent 32fab532c6
commit 7b88bb3d39
3 changed files with 30 additions and 51 deletions

View File

@@ -5,7 +5,9 @@
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>
#include "amx_config.hpp"
#include "amx_utils.hpp"
#include "llama.cpp/ggml-impl.h"
#include "pack.hpp"
@@ -46,41 +48,16 @@ struct BufferAImpl {
assert(ith == 0 && nth == 1);
for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
__m512 amax_v0 = _mm512_setzero_ps();
__m512 amax_v1 = _mm512_setzero_ps();
__m512 amax_v2 = _mm512_setzero_ps();
__m512 amax_v3 = _mm512_setzero_ps();
__m512 amax_v4 = _mm512_setzero_ps();
__m512 amax_v5 = _mm512_setzero_ps();
__m512 amax_v6 = _mm512_setzero_ps();
__m512 amax_v7 = _mm512_setzero_ps();
for (int j = 0; j < k; j += 128) {
__m512 f0, f1, f2, f3, f4, f5, f6, f7;
avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j + 0), &f0, &f1);
avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j + 32), &f2, &f3);
avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j + 64), &f4, &f5);
avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j + 96), &f6, &f7);
amax_v0 = vector_abs_max(amax_v0, f0);
amax_v1 = vector_abs_max(amax_v1, f1);
amax_v2 = vector_abs_max(amax_v2, f2);
amax_v3 = vector_abs_max(amax_v3, f3);
amax_v4 = vector_abs_max(amax_v4, f4);
amax_v5 = vector_abs_max(amax_v5, f5);
amax_v6 = vector_abs_max(amax_v6, f6);
amax_v7 = vector_abs_max(amax_v7, f7);
float amax = 0.0f;
for (int j = 0; j < k; j += 32) {
__m512 f0, f1;
avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j), &f0, &f1);
amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
}
amax_v0 = vector_abs_max(amax_v0, amax_v1);
amax_v2 = vector_abs_max(amax_v2, amax_v3);
amax_v4 = vector_abs_max(amax_v4, amax_v5);
amax_v6 = vector_abs_max(amax_v6, amax_v7);
amax_v0 = vector_abs_max(amax_v0, amax_v2);
amax_v4 = vector_abs_max(amax_v4, amax_v6);
amax_v0 = vector_abs_max(amax_v0, amax_v4);
float amax = _mm512_reduce_max_ps(amax_v0);
d[m_begin + i] = amax / ((1 << 7) - 1);
}
}
int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {

View File

@@ -1180,9 +1180,9 @@ struct GemmKernel224Int4 {
}
static void load_a(dt* a, size_t lda) {
#ifdef __AMX__
_tile_stream_loadd(0, a, lda);
_tile_stream_loadd(1, offset_pointer(a, lda * TILE_M), lda);
#ifdef HAVE_AMX
_tile_loadd(0, a, lda);
_tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
#else
(void)a;
(void)lda;

View File

@@ -29,7 +29,10 @@
#include "../../cpu_backend/worker_pool.h"
#include "../moe-tp.hpp"
#include "la/amx.hpp"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
template <class T>
class AMX_MOE_TP {
@@ -261,6 +264,8 @@ class AMX_MOE_TP {
~AMX_MOE_TP() {
// shared_mem_buffer_numa.dealloc(this);
}
// pack and quant the weights
void pack_weights() {}
void load_weights() {
auto pool = config_.pool->get_subpool(tp_part_idx);
const uint64_t* physical_to_logical_map = (const uint64_t*)config_.physical_to_logical_map;
@@ -269,7 +274,7 @@ class AMX_MOE_TP {
config_.expert_num, nullptr,
[this, physical_to_logical_map](int expert_id) {
// printf("Load layer %d [%d/%d]\n", config_.layer_idx, expert_id, config_.expert_num);
uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_id);
uint64_t logical_expert_id = expert_id;
{
size_t scale_size = config_.intermediate_size * sizeof(float);
size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size) - scale_size;
@@ -307,7 +312,7 @@ class AMX_MOE_TP {
std::cout << "Loading from " << prefix << std::endl;
for (int task_id = 0; task_id < config_.expert_num * mat_type_all * mat_split; task_id++) {
int64_t expert_idx = task_id / (mat_type_all * mat_split);
uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
uint64_t logical_expert_id = expert_idx;
uint8_t mat_class = (task_id % (mat_type_all * mat_split)) / mat_split;
uint8_t mat_split_idex = task_id % mat_split;
if (mat_class == 0) { // the up matrix
@@ -341,32 +346,30 @@ class AMX_MOE_TP {
}
pool->do_work_stealing_job(
nth * config_.expert_num, nullptr,
[this, nth, physical_to_logical_map](int task_id) {
[this, nth](int task_id) {
int64_t expert_idx = task_id / nth;
uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
int ith = task_id % nth;
// gate part
gate_bb_[logical_expert_id]->from_mat(
(ggml_bf16_t*)config_.gate_proj + logical_expert_id * config_.intermediate_size * config_.hidden_size,
ith, nth);
gate_bb_[expert_idx]->from_mat(
(ggml_bf16_t*)config_.gate_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith,
nth);
// up part
up_bb_[logical_expert_id]->from_mat(
(ggml_bf16_t*)config_.up_proj + logical_expert_id * config_.intermediate_size * config_.hidden_size,
ith, nth);
up_bb_[expert_idx]->from_mat(
(ggml_bf16_t*)config_.up_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith,
nth);
},
nullptr);
nth = T::recommended_nth(config_.hidden_size);
pool->do_work_stealing_job(
nth * config_.expert_num, nullptr,
[this, nth, physical_to_logical_map](int task_id) {
[this, nth](int task_id) {
int64_t expert_idx = task_id / nth;
uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
int ith = task_id % nth;
// down part
down_bb_[logical_expert_id]->from_mat(
(ggml_bf16_t*)config_.down_proj + logical_expert_id * config_.hidden_size * config_.intermediate_size,
ith, nth);
down_bb_[expert_idx]->from_mat(
(ggml_bf16_t*)config_.down_proj + expert_idx * config_.hidden_size * config_.intermediate_size, ith,
nth);
// printf("load down, expert %ld, ith %d, total nth %d\n", expert_idx, ith, nth);
},
nullptr);
@@ -378,9 +381,8 @@ class AMX_MOE_TP {
if (config_.save) {
pool->do_work_stealing_job(
config_.expert_num * mat_type_all, nullptr,
[this, physical_to_logical_map](int task_id) {
[this](int task_id) {
int64_t expert_idx = task_id / mat_type_all;
expert_idx = expert_map(physical_to_logical_map, expert_idx);
uint8_t mat_class = task_id % mat_type_all;
if (mat_class == 0) { // the up matrix
size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size);