mirror of
https://github.com/kvcache-ai/sglang.git
synced 2026-06-30 19:57:52 +00:00
374 lines
14 KiB
C++
374 lines
14 KiB
C++
/*****************************************************************************************
|
|
* Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved.
|
|
* Copyright (C) 2025 Intel Corporation, All rights reserved.
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice, this
|
|
* list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from
|
|
* this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
****************************************************************************************/
|
|
#include "common.h"
|
|
#include "vec.h"
|
|
|
|
// [NOTE] Preprocessor Optimization
|
|
// 1. this file is apple-to-apple to `Qwen2VLImageProcessorFast`.
|
|
// 2. `out_dtype` set to torch.bfloat16 skips outplace dtype conversion.
|
|
// 3. skip all redundant memory copy and dtype conversion.
|
|
// 4. TODO: rewrite `_upsample_bicubic2d_aa`.
|
|
//
|
|
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers
|
|
// /models/qwen2_vl/image_processing_qwen2_vl_fast.py
|
|
//
|
|
namespace {
|
|
|
|
template <typename scalar_t>
|
|
inline void normalize(
|
|
scalar_t* __restrict__ out,
|
|
const uint8_t* __restrict__ input,
|
|
const std::vector<float>& image_mean,
|
|
const std::vector<float>& image_std,
|
|
int64_t channel,
|
|
int64_t temporal_patch_size,
|
|
int64_t patch_size,
|
|
int64_t stride_ch,
|
|
int64_t stride_pt,
|
|
int64_t stride_ph) {
|
|
TORCH_CHECK(false, "normalize: scalar path not implemented.");
|
|
}
|
|
|
|
#if defined(CPU_CAPABILITY_AVX512)
|
|
template <>
|
|
inline void normalize<float>(
|
|
float* __restrict__ out,
|
|
const uint8_t* __restrict__ input,
|
|
const std::vector<float>& image_mean,
|
|
const std::vector<float>& image_std,
|
|
int64_t channel,
|
|
int64_t temporal_patch_size,
|
|
int64_t patch_size,
|
|
int64_t stride_ch,
|
|
int64_t stride_pt,
|
|
int64_t stride_ph) {
|
|
// we do vectorization on patch_size dim
|
|
assert(patch_size == 16);
|
|
|
|
// loop last 4 dimensions:
|
|
// {channel, patch_t(repeated), patch_h, patch_w}
|
|
for (int64_t c = 0; c < channel; ++c) {
|
|
__m512 vmean = _mm512_set1_ps(image_mean[c]);
|
|
__m512 vrstd = _mm512_set1_ps(1.f / image_std[c]);
|
|
|
|
float* __restrict__ out_ptr = out + c * temporal_patch_size * patch_size * patch_size;
|
|
#pragma GCC unroll 4
|
|
for (int64_t ph = 0; ph < patch_size; ++ph) {
|
|
__m128i u8 = _mm_loadu_si128((const __m128i*)(input + c * stride_ch + /* pt */ 0 * stride_pt + ph * stride_ph));
|
|
__m512 x = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(u8));
|
|
x = _mm512_mul_ps(_mm512_sub_ps(x, vmean), vrstd);
|
|
#pragma GCC unroll 2
|
|
for (int64_t pt = 0; pt < temporal_patch_size; ++pt) {
|
|
_mm512_storeu_ps(out_ptr + pt * patch_size * patch_size + ph * patch_size, x);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <>
|
|
inline void normalize<at::BFloat16>(
|
|
at::BFloat16* __restrict__ out,
|
|
const uint8_t* __restrict__ input,
|
|
const std::vector<float>& image_mean,
|
|
const std::vector<float>& image_std,
|
|
int64_t channel,
|
|
int64_t temporal_patch_size,
|
|
int64_t patch_size,
|
|
int64_t stride_ch,
|
|
int64_t stride_pt,
|
|
int64_t stride_ph) {
|
|
// we do vectorization on patch_size dim
|
|
assert(patch_size == 16);
|
|
|
|
// loop last 4 dimensions:
|
|
// {channel, patch_t(repeated), patch_h, patch_w}
|
|
for (int64_t c = 0; c < channel; ++c) {
|
|
__m512 vmean = _mm512_set1_ps(image_mean[c]);
|
|
__m512 vrstd = _mm512_set1_ps(1.f / image_std[c]);
|
|
|
|
at::BFloat16* __restrict__ out_ptr = out + c * temporal_patch_size * patch_size * patch_size;
|
|
#pragma GCC unroll 4
|
|
for (int64_t ph = 0; ph < patch_size; ++ph) {
|
|
__m128i u8 = _mm_loadu_si128((const __m128i*)(input + c * stride_ch + /* pt */ 0 * stride_pt + ph * stride_ph));
|
|
__m512 x = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(u8));
|
|
x = _mm512_mul_ps(_mm512_sub_ps(x, vmean), vrstd);
|
|
__m256i x16 = (__m256i)_mm512_cvtneps_pbh(x);
|
|
#pragma GCC unroll 2
|
|
for (int64_t pt = 0; pt < temporal_patch_size; ++pt) {
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i*>(out_ptr + pt * patch_size * patch_size + ph * patch_size), x16);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
template <typename scalar_t>
|
|
void rescale_and_normalize_kernel_impl(
|
|
scalar_t* __restrict__ out,
|
|
const uint8_t* __restrict__ input,
|
|
const std::vector<float>& image_mean,
|
|
const std::vector<float>& image_std,
|
|
int64_t grid_t,
|
|
int64_t grid_h,
|
|
int64_t grid_w,
|
|
int64_t merge_size,
|
|
int64_t channel,
|
|
int64_t temporal_patch_size,
|
|
int64_t patch_size) {
|
|
// [NOTE]: temporal patching uses repeat on last image
|
|
//
|
|
// input : {grid_t, patch_t, channel, grid_h, merge_h, patch_h, grid_w, merge_w, patch_w}
|
|
// out : {grid_t, grid_h, grid_w, merge_h, merge_w, channel, patch_t, patch_h, patch_w}
|
|
//
|
|
int64_t height = grid_h * merge_size * patch_size;
|
|
int64_t width = grid_w * merge_size * patch_size;
|
|
|
|
int64_t stride_gt = /* temporal_patch_size */ 1 * channel * height * width;
|
|
int64_t stride_gh = merge_size * patch_size * width;
|
|
int64_t stride_gw = merge_size * patch_size;
|
|
int64_t stride_mh = patch_size * width;
|
|
int64_t stride_mw = patch_size;
|
|
int64_t stride_ch = height * width;
|
|
int64_t stride_pt = channel * height * width;
|
|
int64_t stride_ph = width;
|
|
int64_t stride_grid = channel * temporal_patch_size * patch_size * patch_size;
|
|
|
|
// parallel on first 5 dims, aka, grids
|
|
at::parallel_for(0, grid_t * grid_h * grid_w * merge_size * merge_size, 0, [&](int64_t begin, int64_t end) {
|
|
int64_t gt{0}, gh{0}, gw{0}, mh{0}, mw{0};
|
|
data_index_init(begin, gt, grid_t, gh, grid_h, gw, grid_w, mh, merge_size, mw, merge_size);
|
|
|
|
for (int64_t i = begin; i < end; ++i) {
|
|
normalize<scalar_t>(
|
|
out + i * stride_grid,
|
|
input + gt * stride_gt + gh * stride_gh + gw * stride_gw + mh * stride_mh + mw * stride_mw,
|
|
image_mean,
|
|
image_std,
|
|
channel,
|
|
temporal_patch_size,
|
|
patch_size,
|
|
stride_ch,
|
|
stride_pt,
|
|
stride_ph);
|
|
|
|
// move to the next index
|
|
data_index_step(gt, grid_t, gh, grid_h, gw, grid_w, mh, merge_size, mw, merge_size);
|
|
}
|
|
});
|
|
}
|
|
|
|
} // anonymous namespace
|
|
|
|
void check_input_image(const at::Tensor& image) {
|
|
TORCH_CHECK(image.scalar_type() == at::kByte, "expect image to be uint8.");
|
|
TORCH_CHECK(image.dim() == 3, "expect image to be CHW.");
|
|
}
|
|
|
|
// https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
|
|
std::pair<int64_t, int64_t>
|
|
smart_resize(int64_t height, int64_t width, int64_t factor, int64_t min_pixels, int64_t max_pixels) {
|
|
// aspect ratio check
|
|
int64_t mx = std::max(height, width);
|
|
int64_t mn = std::min(height, width);
|
|
|
|
TORCH_CHECK(static_cast<double>(mx) / mn <= 200.0, "absolute aspect ratio must be smaller than 200");
|
|
|
|
// round to nearest multiple of factor
|
|
auto round_to_factor = [&](int64_t x) {
|
|
return static_cast<int64_t>(std::round(static_cast<double>(x) / factor)) * factor;
|
|
};
|
|
|
|
int64_t h_bar = round_to_factor(height);
|
|
int64_t w_bar = round_to_factor(width);
|
|
|
|
int64_t area = h_bar * w_bar;
|
|
|
|
if (area > max_pixels) {
|
|
double beta = std::sqrt((1.0 * height * width) / max_pixels);
|
|
h_bar = std::max(factor, (static_cast<int64_t>(std::floor(height / beta / factor)) * factor));
|
|
w_bar = std::max(factor, (static_cast<int64_t>(std::floor(width / beta / factor)) * factor));
|
|
} else if (area < min_pixels) {
|
|
double beta = std::sqrt((double)min_pixels / (height * width));
|
|
h_bar = static_cast<int64_t>(std::ceil(height * beta / factor)) * factor;
|
|
w_bar = static_cast<int64_t>(std::ceil(width * beta / factor)) * factor;
|
|
}
|
|
|
|
return {h_bar, w_bar};
|
|
}
|
|
|
|
// do rescale and normalize
|
|
// from `resized_image` to `pixel_values`
|
|
void rescale_and_normalize_image(
|
|
at::Tensor& pixel_values,
|
|
const at::Tensor& image,
|
|
double rescale_factor,
|
|
c10::ArrayRef<double> image_mean,
|
|
c10::ArrayRef<double> image_std,
|
|
int64_t grid_t,
|
|
int64_t grid_h,
|
|
int64_t grid_w,
|
|
int64_t merge_size,
|
|
int64_t channel,
|
|
int64_t temporal_patch_size,
|
|
int64_t patch_size,
|
|
int64_t grid_offset,
|
|
int64_t grid_stride) {
|
|
// update mean and std
|
|
std::vector<float> mean_vec(channel), std_vec(channel);
|
|
for (int64_t c = 0; c < channel; ++c) {
|
|
mean_vec[c] = static_cast<float>(image_mean[c] * (1 / rescale_factor));
|
|
std_vec[c] = static_cast<float>(image_std[c] * (1 / rescale_factor));
|
|
}
|
|
|
|
AT_DISPATCH_FLOATING_TYPES_AND(at::kBFloat16, pixel_values.scalar_type(), "rescale_and_normalize_image", [&] {
|
|
rescale_and_normalize_kernel_impl<scalar_t>(
|
|
pixel_values.data_ptr<scalar_t>() + grid_offset * grid_stride,
|
|
image.data_ptr<uint8_t>(),
|
|
mean_vec,
|
|
std_vec,
|
|
grid_t,
|
|
grid_h / merge_size,
|
|
grid_w / merge_size,
|
|
merge_size,
|
|
channel,
|
|
temporal_patch_size,
|
|
patch_size);
|
|
});
|
|
}
|
|
|
|
std::tuple<at::Tensor, at::Tensor> image_preprocess_cpu(
|
|
at::TensorList images,
|
|
bool do_convert_rgb,
|
|
bool do_resize,
|
|
int64_t shortest_edge,
|
|
int64_t longest_edge,
|
|
const std::string& interpolation,
|
|
bool do_rescale,
|
|
double rescale_factor,
|
|
bool do_normalize,
|
|
c10::ArrayRef<double> image_mean,
|
|
c10::ArrayRef<double> image_std,
|
|
int64_t patch_size,
|
|
int64_t temporal_patch_size,
|
|
int64_t merge_size,
|
|
bool disable_grouping,
|
|
at::ScalarType out_dtype) {
|
|
RECORD_FUNCTION("sgl_kernel::image_preprocess_cpu", std::vector<c10::IValue>({}));
|
|
|
|
// TODO: lift C++ kernel limitations
|
|
TORCH_CHECK(interpolation == "bicubic", "image_preprocess_cpu: support only bicubic mode.");
|
|
TORCH_CHECK(do_rescale && do_normalize, "image_preprocess_cpu: support only do_rescale and do_normalize.");
|
|
TORCH_CHECK(disable_grouping, "image_preprocess_cpu: support only disable_grouping.");
|
|
|
|
// support only float32 or bfloat16 as output
|
|
TORCH_CHECK(
|
|
out_dtype == at::kFloat || out_dtype == at::kBFloat16,
|
|
"image_preprocess_cpu: support only float32 and bfloat16 as pixel_values dtype.");
|
|
|
|
int64_t batch_size = images.size();
|
|
int64_t channel = image_mean.size();
|
|
CHECK_GT(batch_size, 0);
|
|
CHECK_EQ(channel, image_std.size());
|
|
CHECK_EQ(channel, 3);
|
|
|
|
const at::Tensor& first_image = images[0];
|
|
const auto options = first_image.options();
|
|
at::Tensor pixel_values = at::empty({}, options.dtype(out_dtype));
|
|
at::Tensor image_grid_thw = at::empty({batch_size, channel}, options.dtype(at::kLong));
|
|
|
|
// index type use int64_t
|
|
int64_t* image_grid_thw_data = image_grid_thw.data_ptr<int64_t>();
|
|
|
|
// resized image sizes and global grid offset
|
|
std::vector<std::pair<int64_t, int64_t>> image_sizes(batch_size);
|
|
std::vector<int64_t> grid_offsets(batch_size + 1, 0);
|
|
|
|
// Stage 1: compute resized shapes and fill in `image_grid_thw`
|
|
for (int64_t idx = 0; idx < batch_size; ++idx) {
|
|
const auto& image = images[idx];
|
|
check_input_image(image);
|
|
|
|
auto [resized_h, resized_w] =
|
|
smart_resize(image.size(-2), image.size(-1), patch_size * merge_size, shortest_edge, longest_edge);
|
|
|
|
image_sizes[idx] = {resized_h, resized_w};
|
|
|
|
// temporal dimension for image is 1
|
|
int64_t grid_t = div_up((int64_t)1, temporal_patch_size);
|
|
int64_t grid_h = div_up(resized_h, patch_size);
|
|
int64_t grid_w = div_up(resized_w, patch_size);
|
|
|
|
// fill in image_grid_thw
|
|
image_grid_thw_data[idx * 3 + 0] = grid_t;
|
|
image_grid_thw_data[idx * 3 + 1] = grid_h;
|
|
image_grid_thw_data[idx * 3 + 2] = grid_w;
|
|
|
|
// fill in global grid offset
|
|
grid_offsets[idx + 1] = grid_offsets[idx] + grid_t * grid_h * grid_w;
|
|
}
|
|
|
|
// last element holds the total sum of grids
|
|
int64_t grid_size = grid_offsets[batch_size];
|
|
int64_t grid_stride = channel * temporal_patch_size * patch_size * patch_size;
|
|
// allocate memory
|
|
pixel_values.resize_({grid_size, grid_stride});
|
|
|
|
// Stage 2: compute `pixel_values`
|
|
for (int64_t idx = 0; idx < batch_size; ++idx) {
|
|
const auto& image = images[idx];
|
|
int64_t resized_h = image_sizes[idx].first;
|
|
int64_t resized_w = image_sizes[idx].second;
|
|
auto resized_image = at::_upsample_bicubic2d_aa(
|
|
image.unsqueeze(0),
|
|
{resized_h, resized_w},
|
|
/* align_corners */ false);
|
|
|
|
rescale_and_normalize_image(
|
|
pixel_values,
|
|
resized_image,
|
|
rescale_factor,
|
|
image_mean,
|
|
image_std,
|
|
/* grid_t */ image_grid_thw_data[idx * 3 + 0],
|
|
/* grid_h */ image_grid_thw_data[idx * 3 + 1],
|
|
/* grid_w */ image_grid_thw_data[idx * 3 + 2],
|
|
merge_size,
|
|
channel,
|
|
temporal_patch_size,
|
|
patch_size,
|
|
grid_offsets[idx],
|
|
grid_stride);
|
|
}
|
|
|
|
return std::make_tuple(pixel_values, image_grid_thw);
|
|
}
|