from typing import Optional, Union

from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_transforms import convert_to_rgb, resize, to_channel_dimension_format
from transformers.image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
    is_scaled_image,
    make_flat_list_of_images,
    to_numpy_array,
)
from transformers.utils import TensorType, logging


logger = logging.get_logger(__name__)


class VectorLLMImageProcessor(BaseImageProcessor):
    """Turn input images into a ``pixel_values`` batch.

    Each image goes through the following optional steps, in order:

    1. RGB conversion (``do_convert_rgb``).
    2. Resizing (``do_resize``): an optional square pre-resize to
       ``pre_resize_size`` followed by a square resize to ``resized_size``.
    3. Rescaling by ``rescale_factor`` (``do_rescale``).
    4. Normalization with ``image_mean`` / ``image_std`` (``do_normalize``).
    5. Conversion to the requested channel layout.

    Constructor arguments are the defaults; most of them can be overridden
    per call in :meth:`preprocess`.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = False,
        image_mean=None,
        image_std=None,
        do_convert_rgb: bool = True,
        pre_resize_size: Optional[int] = 432,
        resized_size: int = 128,
        patch_size: int = 16,
        **kwargs,
    ) -> None:
        """Store the default preprocessing configuration.

        Args:
            do_resize: Whether to resize images.
            resample: Resampling filter passed to ``resize``.
            do_rescale: Whether to multiply pixel values by ``rescale_factor``.
            rescale_factor: Scale applied when ``do_rescale`` is set.
            do_normalize: Whether to normalize with ``image_mean``/``image_std``.
            image_mean: Mean used for normalization; required when
                ``do_normalize`` is set.
            image_std: Standard deviation used for normalization; required
                when ``do_normalize`` is set.
            do_convert_rgb: Whether to convert images to RGB first.
            pre_resize_size: Side length of the intermediate square resize, or
                ``None`` to skip that intermediate step.
            resized_size: Side length of the final square resize.
            patch_size: Stored on the processor; it is not read by the
                preprocessing methods of this class.
            **kwargs: Forwarded to ``BaseImageProcessor.__init__``.
        """
        super().__init__(**kwargs)
        self.pre_resize_size = pre_resize_size
        self.resized_size = resized_size
        self.patch_size = patch_size
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_convert_rgb = do_convert_rgb

    def _preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean=None,
        image_std=None,
        pre_resize_size: Optional[int] = None,
        resized_size: Optional[int] = None,
        do_convert_rgb: Optional[bool] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """Apply the preprocessing pipeline and return a list of arrays.

        Unlike :meth:`preprocess`, this method does not fall back to the
        instance defaults: every option is used exactly as passed.

        Args:
            images: One image or a (possibly nested) collection of images.
            do_resize: Whether to resize.
            resample: Resampling filter passed to ``resize``.
            do_rescale: Whether to rescale by ``rescale_factor``.
            rescale_factor: Scale applied when ``do_rescale`` is set.
            do_normalize: Whether to normalize.
            image_mean: Normalization mean.
            image_std: Normalization standard deviation.
            pre_resize_size: Side of the intermediate square resize; ``None``
                skips it.
            resized_size: Side of the final square resize.
            do_convert_rgb: Whether to convert images to RGB first.
            data_format: Channel layout of the returned arrays.
            input_data_format: Channel layout of the inputs; inferred from the
                first image when ``None``.

        Returns:
            A list with one processed array per input image; empty if no
            images were supplied.

        Raises:
            ValueError: If an enabled step is missing the argument it needs.
        """
        # Fail fast with a clear message instead of handing None to the
        # transform helpers.
        if do_resize and resized_size is None:
            raise ValueError("`resized_size` must be specified if `do_resize` is `True`.")
        if do_rescale and rescale_factor is None:
            raise ValueError("`rescale_factor` must be specified if `do_rescale` is `True`.")
        if do_normalize and (image_mean is None or image_std is None):
            raise ValueError("`image_mean` and `image_std` must both be specified if `do_normalize` is `True`.")

        images = make_flat_list_of_images(images)

        # The checks below index images[0]; an empty batch has nothing to
        # process, so return early rather than raise IndexError.
        if len(images) == 0:
            return []

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        images = [to_numpy_array(image) for image in images]

        # Only the first image is inspected for this warning.
        if do_rescale and is_scaled_image(images[0]):
            logger.warning_once(
                "Input images already look rescaled. Set do_rescale=False to avoid double rescaling."
            )

        if input_data_format is None:
            # The layout inferred from the first image is used for all images.
            input_data_format = infer_channel_dimension_format(images[0])

        processed_images = []
        for image in images:
            if do_resize:
                if pre_resize_size is not None:
                    image = resize(
                        image,
                        size=(pre_resize_size, pre_resize_size),
                        resample=resample,
                        input_data_format=input_data_format,
                    )
                image = resize(
                    image,
                    size=(resized_size, resized_size),
                    resample=resample,
                    input_data_format=input_data_format,
                )

            if do_rescale:
                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)

            if do_normalize:
                image = self.normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    input_data_format=input_data_format,
                )

            image = to_channel_dimension_format(
                image,
                data_format,
                input_channel_dim=input_data_format,
            )
            processed_images.append(image)

        return processed_images

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean=None,
        image_std=None,
        pre_resize_size: Optional[int] = None,
        resized_size: Optional[int] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """Preprocess images and wrap them in a ``BatchFeature``.

        Any option left as ``None`` falls back to the value stored on the
        instance. Because ``None`` means "use the default", passing
        ``pre_resize_size=None`` here cannot disable the intermediate resize;
        configure that on the instance instead.

        Args:
            images: One image or a (possibly nested) collection of images.
            do_resize: Override for ``self.do_resize``.
            resample: Override for ``self.resample``.
            do_rescale: Override for ``self.do_rescale``.
            rescale_factor: Override for ``self.rescale_factor``.
            do_normalize: Override for ``self.do_normalize``.
            image_mean: Override for ``self.image_mean``.
            image_std: Override for ``self.image_std``.
            pre_resize_size: Override for ``self.pre_resize_size``.
            resized_size: Override for ``self.resized_size``.
            do_convert_rgb: Override for ``self.do_convert_rgb``.
            return_tensors: Tensor type passed to ``BatchFeature``.
            data_format: Channel layout of the returned arrays.
            input_data_format: Channel layout of the inputs; inferred when
                ``None``.

        Returns:
            A ``BatchFeature`` whose ``"pixel_values"`` entry holds the
            processed images.

        Raises:
            ValueError: If an enabled step is missing the argument it needs.
        """
        do_resize = self.do_resize if do_resize is None else do_resize
        resample = self.resample if resample is None else resample
        do_rescale = self.do_rescale if do_rescale is None else do_rescale
        rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
        do_normalize = self.do_normalize if do_normalize is None else do_normalize
        image_mean = self.image_mean if image_mean is None else image_mean
        image_std = self.image_std if image_std is None else image_std
        pre_resize_size = self.pre_resize_size if pre_resize_size is None else pre_resize_size
        resized_size = self.resized_size if resized_size is None else resized_size
        do_convert_rgb = self.do_convert_rgb if do_convert_rgb is None else do_convert_rgb

        images = self._preprocess(
            images=images,
            do_resize=do_resize,
            resample=resample,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            pre_resize_size=pre_resize_size,
            resized_size=resized_size,
            do_convert_rgb=do_convert_rgb,
            data_format=data_format,
            input_data_format=input_data_format,
        )

        return BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)