diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp
index f3a4c90e80..0842524820 100644
--- a/src/cpp/src/visual_language/vision_encoder.cpp
+++ b/src/cpp/src/visual_language/vision_encoder.cpp
@@ -309,35 +309,71 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
     std::vector<std::vector<clip_image_u8>> imgs = ::slice_image(source, max_slice_nums, scale_resolution, patch_size, never_split);
     std::vector<std::vector<ov::Tensor>> results;
     std::vector<std::vector<ImageSize>> sizes;
+    const size_t channels = 3;
 
     std::vector<std::vector<clip_image_f32>> preprocessed{imgs.size()};
-    size_t max_h = 0, max_w = 0, n_images = 0;
-    std::transform(imgs.begin(), imgs.end(), preprocessed.begin(), [&ctx_clip, &max_h, &max_w, &n_images](const std::vector<clip_image_u8>& row) {
+    size_t max_h = 0, max_w = 0, n_images = 0, max_size = 0;
+    std::transform(imgs.begin(), imgs.end(), preprocessed.begin(), [&ctx_clip, &max_h, &max_w, &max_size, &n_images](const std::vector<clip_image_u8>& row) {
         std::vector<clip_image_f32> processed_row{row.size()};
-        std::transform(row.begin(), row.end(), processed_row.begin(), [&ctx_clip, &max_h, &max_w, &n_images](const clip_image_u8& raw) {
+        std::transform(row.begin(), row.end(), processed_row.begin(), [&ctx_clip, &max_h, &max_w, &max_size, &n_images](const clip_image_u8& raw) {
             clip_image_f32 im = clip_image_preprocess(ctx_clip, raw);
-            max_h = std::max(size_t(im.ny), max_h);
-            max_w = std::max(size_t(im.nx), max_w);
+            if (size_t(im.ny) * size_t(im.nx) > max_size) {
+                max_size = size_t(im.ny) * size_t(im.nx);
+                max_h = size_t(im.ny);
+                max_w = size_t(im.nx);
+            }
             ++n_images;
             return im;
         });
         return processed_row;
     });
-    ov::Tensor batched_images{ov::element::f32, {n_images, 3, max_h, max_w}};
-    float* batched_data = batched_images.data<float>();
-    const clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0);
-    std::copy(resized_preprocessed.buf.begin(), resized_preprocessed.buf.end(), batched_data);
+    ov::Tensor pixel_values{ov::element::f32, {n_images, channels, patch_size, max_size / patch_size}};
+    size_t d3_all_pixel = pixel_values.get_shape().at(3);
+    float* pixel_value_data = pixel_values.data<float>();
+
+    // image chw to 1*c*kernel*hw/kernel and padding zero
+    clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0);
+    size_t img_h = resized_preprocessed.ny;
+    size_t img_w = resized_preprocessed.nx;
+    ov::Tensor clip_img{ov::element::f32, {1, channels, img_h, img_w}, resized_preprocessed.buf.data()};
+    ov::Tensor clip_pixel_values = preprocess_for_encoder(clip_img, patch_size);
+
+    float* clip_value_data = clip_pixel_values.data<float>();
+    size_t batch_pixel = 1;
+    size_t d3_clip_pixel = clip_pixel_values.get_shape().at(3);
+    for (size_t c_idx = 0; c_idx < channels; ++c_idx) {
+        for (size_t k_idx = 0; k_idx < patch_size; k_idx++) {
+            std::copy(clip_value_data, clip_value_data + d3_clip_pixel, pixel_value_data);
+            clip_value_data += d3_clip_pixel;
+            pixel_value_data += d3_all_pixel;
+        }
+    }
+
     if (1 < preprocessed.size()) {
         for (size_t row = 1; row < preprocessed.size(); ++row) {
             size_t n_slices = preprocessed.at(row).size();
             for (size_t col = 0; col < n_slices; ++col) {
-                const clip_image_f32& elem = preprocessed.at(row).at(col);
-                std::copy(elem.buf.begin(), elem.buf.end(), batched_data + ((row - 1) * n_slices + col + 1) * 3 * max_h * max_w);
+                clip_image_f32& elem = preprocessed.at(row).at(col);
+                img_h = elem.ny;
+                img_w = elem.nx;
+                ov::Tensor clip_img{ov::element::f32, {1, channels, img_h, img_w}, elem.buf.data()};
+                ov::Tensor clip_pixel_values = preprocess_for_encoder(clip_img, patch_size);
+
+                d3_clip_pixel = clip_pixel_values.get_shape().at(3);
+                clip_value_data = clip_pixel_values.data<float>();
+                pixel_value_data = pixel_values.data<float>() + batch_pixel * channels * patch_size * d3_all_pixel;
+                for (size_t c_idx = 0; c_idx < channels; ++c_idx) {
+                    for (size_t k_idx = 0; k_idx < patch_size; k_idx++) {
+                        std::copy(clip_value_data, clip_value_data + d3_clip_pixel, pixel_value_data);
+                        clip_value_data += d3_clip_pixel;
+                        pixel_value_data += d3_all_pixel;
+                    }
+                }
+
+                batch_pixel++;
             }
         }
     }
-    ov::Tensor pixel_values = preprocess_for_encoder(batched_images, patch_size);
     encoder.set_tensor("pixel_values", pixel_values);
     ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape().at(0), 1, max_h / patch_size * max_w / patch_size}};