Feat - add kimi 2.5 Vision (#1280)

* port kimi 25-vision from upstream * feat(clip): add support for Kimi K2.5 vision model
2026-04-29 19:01:47 +00:00 · 2026-02-19 04:15:03 -03:00
parent 04cf685e82
commit 51df09be8a
5 changed files with 453 additions and 140 deletions
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2279,6 +2279,7 @@ extern "C" {
    enum ggml_scale_mode {
        GGML_SCALE_MODE_NEAREST  = 0,
        GGML_SCALE_MODE_BILINEAR = 1,
+        GGML_SCALE_MODE_BICUBIC  = 2,

        GGML_SCALE_MODE_COUNT
    };
--- a/ggml/src/ggml-cuda/upscale.cu
+++ b/ggml/src/ggml-cuda/upscale.cu
@@ -22,6 +22,70 @@ static __global__ void upscale_f32(const float * x, float * dst,
    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
 }

+namespace bicubic_interpolation {
+// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
+__device__ const float a = -0.75f; // use alpha = -0.75 (same as PyTorch)
+
+static __device__ float weight1(float x) { return ((a + 2) * x - (a + 3)) * x * x + 1; };
+static __device__ float weight2(float x) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; };
+
+static __device__ float bicubic(float p0, float p1, float p2, float p3, float x) {
+    const float w0 = weight2(x + 1);
+    const float w1 = weight1(x + 0);
+    const float w2 = weight1(1 - x);
+    const float w3 = weight2(2 - x);
+    return p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
+};
+} // namespace bicubic_interpolation
+
+static __global__ void upscale_f32_bicubic(const float * x, float * dst,
+        const int nb00, const int nb01, const int nb02, const int nb03,
+        const int ne00_src, const int ne01_src,
+        const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
+        const float sf0, const float sf1, const float sf2, const float sf3,
+        const float pixel_offset) {
+    using bicubic_interpolation::bicubic;
+
+    const int64_t index              = threadIdx.x + blockIdx.x * blockDim.x;
+    const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
+
+    if (index >= dst_total_elements) {
+        return;
+    }
+
+    const int i10_dst = index % ne10_dst;
+    const int i11_dst = (index / ne10_dst) % ne11_dst;
+    const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst;
+    const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst);
+
+    const int i02_src = (int)(i12_dst / sf2);
+    const int i03_src = (int)(i13_dst / sf3);
+
+    const float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset;
+    const int y0_src    = (int)floorf(y_src_f);
+    const float dy      = y_src_f - (float)y0_src;
+
+    const float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset;
+    const int x0_src    = (int)floorf(x_src_f);
+    const float dx      = x_src_f - (float)x0_src;
+
+    const char * x_base = (const char *)x + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03;
+
+    auto load = [=](int x_off, int y_off) -> float {
+        int i00_src = max(0, min(x0_src + x_off, ne00_src - 1));
+        int i01_src = max(0, min(y0_src + y_off, ne01_src - 1));
+        return *(const float *)(x_base + (int64_t)i00_src * nb00 + (int64_t)i01_src * nb01);
+    };
+
+    const float result = bicubic(
+        bicubic(load(-1,-1), load(0,-1), load(1,-1), load(2,-1), dx),
+        bicubic(load(-1, 0), load(0, 0), load(1, 0), load(2, 0), dx),
+        bicubic(load(-1, 1), load(0, 1), load(1, 1), load(2, 1), dx),
+        bicubic(load(-1, 2), load(0, 2), load(1, 2), load(2, 2), dx), dy);
+
+    dst[index] = result;
+}
+
 static void upscale_f32_cuda(const float * x, float * dst,
        const int nb00, const int nb01, const int nb02, const int nb03,
        const int ne10, const int ne11, const int ne12, const int ne13,
@@ -33,6 +97,18 @@ static void upscale_f32_cuda(const float * x, float * dst,
    upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
 }

+static void upscale_f32_bicubic_cuda(const float * x, float * dst,
+        const int nb00, const int nb01, const int nb02, const int nb03,
+        const int ne00_src, const int ne01_src,
+        const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
+        const float sf0, const float sf1, const float sf2, const float sf3,
+        const float pixel_offset, cudaStream_t stream) {
+    const int64_t dst_size   = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
+    const int64_t num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+
+    upscale_f32_bicubic<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
+}
+
 void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
@@ -42,10 +118,26 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

-    const float sf0 = (float)dst->ne[0]/src0->ne[0];
-    const float sf1 = (float)dst->ne[1]/src0->ne[1];
-    const float sf2 = (float)dst->ne[2]/src0->ne[2];
+    const int mode_flags = dst->op_params[0];
+    const ggml_scale_mode mode = (ggml_scale_mode)(mode_flags & 0xFF);
+
+    float sf0 = (float)dst->ne[0]/src0->ne[0];
+    float sf1 = (float)dst->ne[1]/src0->ne[1];
+    float sf2 = (float)dst->ne[2]/src0->ne[2];
    const float sf3 = (float)dst->ne[3]/src0->ne[3];

-    upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
+    float pixel_offset = 0.5f;
+    if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
+        sf0          = dst->ne[0] > 1 && src0->ne[0] > 1 ? (float)(dst->ne[0] - 1) / (src0->ne[0] - 1) : sf0;
+        sf1          = dst->ne[1] > 1 && src0->ne[1] > 1 ? (float)(dst->ne[1] - 1) / (src0->ne[1] - 1) : sf1;
+        pixel_offset = 0.0f;
+    }
+
+    if (mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR) {
+        upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
+    } else if (mode == GGML_SCALE_MODE_BICUBIC) {
+        upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                                 src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+                                 sf0, sf1, sf2, sf3, pixel_offset, stream);
+    }
 }
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -21042,6 +21042,26 @@ static void ggml_compute_forward_pool_2d(

 // ggml_compute_forward_upscale

+#ifndef GGML_CLAMP
+#define GGML_CLAMP(x, min, max) ((x) < (min) ? (min) : ((x) > (max) ? (max) : (x)))
+#endif
+
+static inline float ggml_bicubic_weight1(float x, float a) {
+    return ((a + 2.0f) * x - (a + 3.0f)) * x * x + 1.0f;
+}
+
+static inline float ggml_bicubic_weight2(float x, float a) {
+    return ((a * x - 5.0f * a) * x + 8.0f * a) * x - 4.0f * a;
+}
+
+static inline float ggml_bicubic_interp(float p0, float p1, float p2, float p3, float x, float a) {
+    const float w0 = ggml_bicubic_weight2(x + 1.0f, a);
+    const float w1 = ggml_bicubic_weight1(x + 0.0f, a);
+    const float w2 = ggml_bicubic_weight1(1.0f - x, a);
+    const float w3 = ggml_bicubic_weight2(2.0f - x, a);
+    return p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
+}
+
 static void ggml_compute_forward_upscale_f32(
    const struct ggml_compute_params * params,
    struct ggml_tensor * dst) {
@@ -21055,29 +21075,78 @@ static void ggml_compute_forward_upscale_f32(

    GGML_TENSOR_UNARY_OP_LOCALS

-    const float sf0 = (float)ne0/src0->ne[0];
-    const float sf1 = (float)ne1/src0->ne[1];
-    const float sf2 = (float)ne2/src0->ne[2];
-    const float sf3 = (float)ne3/src0->ne[3];
+    float sf0 = (float)ne0/src0->ne[0];
+    float sf1 = (float)ne1/src0->ne[1];
+    float sf2 = (float)ne2/src0->ne[2];
+    float sf3 = (float)ne3/src0->ne[3];
+    float pixel_offset = 0.5f;

-    // TODO: optimize
+    const int32_t mode_flags = ((int32_t *)dst->op_params)[0];
+    const int32_t mode = (mode_flags & 0xFF);

-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        const int64_t i03 = i3 / sf3;
-        for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-            const int64_t i02 = i2 / sf2;
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                const int64_t i01 = i1 / sf1;
-                for (int64_t i0 = 0; i0 < ne0; i0++) {
-                    const int64_t i00 = i0 / sf0;
+    if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
+        pixel_offset = 0.0f;
+        sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
+        sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
+    }

-                    const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                          float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
+    if (mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR) {
+        for (int64_t i3 = 0; i3 < ne3; i3++) {
+            const int64_t i03 = i3 / sf3;
+            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+                const int64_t i02 = i2 / sf2;
+                for (int64_t i1 = 0; i1 < ne1; i1++) {
+                    const int64_t i01 = i1 / sf1;
+                    for (int64_t i0 = 0; i0 < ne0; i0++) {
+                        const int64_t i00 = i0 / sf0;

-                    *y = *x;
+                        const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
+
+                        *y = *x;
+                    }
                }
            }
        }
+    } else if (mode == GGML_SCALE_MODE_BICUBIC) {
+        // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
+        const float a = -0.75f; // use alpha = -0.75 (same as PyTorch)
+
+        for (int64_t i3 = 0; i3 < ne3; i3++) {
+            const int64_t i03 = i3 / sf3;
+            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+                const int64_t i02 = i2 / sf2;
+                for (int64_t i1 = 0; i1 < ne1; i1++) {
+                    const float y = ((float)i1 + pixel_offset) / sf1 - pixel_offset;
+                    const int64_t y0 = (int64_t)floorf(y);
+                    const float dy = y - (float)y0;
+
+                    for (int64_t i0 = 0; i0 < ne0; i0++) {
+                        const float x = ((float)i0 + pixel_offset) / sf0 - pixel_offset;
+                        const int64_t x0 = (int64_t)floorf(x);
+                        const float dx = x - (float)x0;
+                        float p_vals[4];
+
+                        for (int iy = -1; iy <= 2; iy++) {
+                            float row_vals[4];
+                            for (int ix = -1; ix <= 2; ix++) {
+                                int64_t idx_x = GGML_CLAMP(x0 + ix, 0, ne00 - 1);
+                                int64_t idx_y = GGML_CLAMP(y0 + iy, 0, ne01 - 1);
+
+                                row_vals[ix + 1] = *(const float *)((const char *)src0->data + idx_x*nb00 + idx_y*nb01 + i02*nb02 + i03*nb03);
+                            }
+                            p_vals[iy + 1] = ggml_bicubic_interp(row_vals[0], row_vals[1], row_vals[2], row_vals[3], dx, a);
+                        }
+                        const float val = ggml_bicubic_interp(p_vals[0], p_vals[1], p_vals[2], p_vals[3], dy, a);
+
+                        float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+                        *y_dst = val;
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ABORT("unsupported upscale mode");
    }
 }