# File size: 6,087 Bytes
# bcc6605
from typing import Optional, Union
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_transforms import convert_to_rgb, resize, to_channel_dimension_format
from transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_flat_list_of_images,
to_numpy_array,
)
from transformers.utils import TensorType, logging
# Module-level logger created through the `logging` helper imported from transformers.utils.
logger = logging.get_logger(__name__)
class VectorLLMImageProcessor(BaseImageProcessor):
    """Image processor that produces fixed-size square ``pixel_values``.

    For every input image the following steps are applied in order, each one
    controlled by its own flag:

    1. optional conversion to RGB (``do_convert_rgb``),
    2. optional square resize (``do_resize``): first to
       ``pre_resize_size`` x ``pre_resize_size`` when that value is not
       ``None``, then to ``resized_size`` x ``resized_size``,
    3. optional rescaling by ``rescale_factor`` (``do_rescale``),
    4. optional normalization with ``image_mean`` / ``image_std``
       (``do_normalize``),
    5. conversion to the requested channel layout (``data_format``).

    Args:
        do_resize: Whether to resize images.
        resample: Resampling filter used for both resize steps.
        do_rescale: Whether to multiply pixel values by ``rescale_factor``.
        rescale_factor: Factor applied when ``do_rescale`` is true.
        do_normalize: Whether to normalize with ``image_mean``/``image_std``.
        image_mean: Mean passed to ``normalize``; required if normalizing.
        image_std: Standard deviation passed to ``normalize``; required if
            normalizing.
        do_convert_rgb: Whether to convert images to RGB first.
        pre_resize_size: Side length of the intermediate square resize, or
            ``None`` to skip the intermediate step.
        resized_size: Side length of the final square resize.
        patch_size: Stored on the instance; it is not read by the
            preprocessing code in this class.
        **kwargs: Forwarded to ``BaseImageProcessor.__init__``.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = False,
        image_mean=None,
        image_std=None,
        do_convert_rgb: bool = True,
        pre_resize_size: Optional[int] = 432,
        resized_size: int = 128,
        patch_size: int = 16,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.pre_resize_size = pre_resize_size
        self.resized_size = resized_size
        self.patch_size = patch_size
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_convert_rgb = do_convert_rgb

    def _preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean=None,
        image_std=None,
        pre_resize_size: Optional[int] = None,
        resized_size: Optional[int] = None,
        do_convert_rgb: Optional[bool] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """Apply the preprocessing pipeline to ``images`` and return a list of arrays.

        Unlike :meth:`preprocess`, this method does not fall back to the
        instance defaults: a flag left as ``None`` is treated as false.

        Args:
            images: One image or a (possibly nested) collection of images.
            do_resize: Whether to resize.
            resample: Resampling filter for the resize steps.
            do_rescale: Whether to rescale by ``rescale_factor``.
            rescale_factor: Factor used when rescaling.
            do_normalize: Whether to normalize.
            image_mean: Mean used when normalizing.
            image_std: Standard deviation used when normalizing.
            pre_resize_size: Side length of the intermediate square resize;
                ``None`` skips that step.
            resized_size: Side length of the final square resize.
            do_convert_rgb: Whether to convert to RGB first.
            data_format: Channel layout of the returned arrays. ``None``
                keeps the layout of the input.
            input_data_format: Channel layout of the input. When ``None`` it
                is inferred from the first image.

        Returns:
            A list with one processed array per input image.

        Raises:
            ValueError: If no image is provided, if ``do_normalize`` is set
                without both ``image_mean`` and ``image_std``, or if
                ``do_resize`` is set without ``resized_size``.
        """
        # Fail early with an explicit message instead of handing ``None`` to
        # the normalize / resize helpers.
        if do_normalize and (image_mean is None or image_std is None):
            raise ValueError(
                "`image_mean` and `image_std` must both be specified if `do_normalize` is `True`."
            )
        if do_resize and resized_size is None:
            raise ValueError("`resized_size` must be specified if `do_resize` is `True`.")

        images = make_flat_list_of_images(images)
        if not images:
            # The checks below index ``images[0]``; an empty batch would
            # otherwise surface as a bare IndexError.
            raise ValueError("No images were provided to preprocess.")

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]
        images = [to_numpy_array(image) for image in images]

        if do_rescale and is_scaled_image(images[0]):
            logger.warning_once(
                "Input images already look rescaled. Set do_rescale=False to avoid double rescaling."
            )

        if input_data_format is None:
            # All images are assumed to share the layout of the first one.
            input_data_format = infer_channel_dimension_format(images[0])

        processed_images = []
        for image in images:
            if do_resize:
                if pre_resize_size is not None:
                    # Intermediate square resize before the final one.
                    image = resize(
                        image,
                        size=(pre_resize_size, pre_resize_size),
                        resample=resample,
                        input_data_format=input_data_format,
                    )
                image = resize(
                    image,
                    size=(resized_size, resized_size),
                    resample=resample,
                    input_data_format=input_data_format,
                )
            if do_rescale:
                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
            if do_normalize:
                image = self.normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    input_data_format=input_data_format,
                )
            if data_format is not None:
                # ``None`` means "keep the input layout", so only convert
                # when a target layout was actually requested.
                image = to_channel_dimension_format(
                    image,
                    data_format,
                    input_channel_dim=input_data_format,
                )
            processed_images.append(image)
        return processed_images

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean=None,
        image_std=None,
        pre_resize_size: Optional[int] = None,
        resized_size: Optional[int] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """Preprocess ``images`` and wrap the result in a ``BatchFeature``.

        Every option left as ``None`` falls back to the value configured on
        the instance. Because ``None`` selects the instance default,
        ``pre_resize_size=None`` cannot be used here to disable the
        intermediate resize for a single call; configure it on the instance
        instead.

        Args:
            images: One image or a collection of images.
            do_resize: Overrides ``self.do_resize``.
            resample: Overrides ``self.resample``.
            do_rescale: Overrides ``self.do_rescale``.
            rescale_factor: Overrides ``self.rescale_factor``.
            do_normalize: Overrides ``self.do_normalize``.
            image_mean: Overrides ``self.image_mean``.
            image_std: Overrides ``self.image_std``.
            pre_resize_size: Overrides ``self.pre_resize_size``.
            resized_size: Overrides ``self.resized_size``.
            do_convert_rgb: Overrides ``self.do_convert_rgb``.
            return_tensors: Tensor type handed to ``BatchFeature``.
            data_format: Channel layout of the output images.
            input_data_format: Channel layout of the input images; inferred
                when ``None``.

        Returns:
            A ``BatchFeature`` whose ``"pixel_values"`` entry holds the
            processed images.

        Raises:
            ValueError: Propagated from :meth:`_preprocess` when the resolved
                arguments are inconsistent or no image is provided.
        """
        do_resize = self.do_resize if do_resize is None else do_resize
        resample = self.resample if resample is None else resample
        do_rescale = self.do_rescale if do_rescale is None else do_rescale
        rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
        do_normalize = self.do_normalize if do_normalize is None else do_normalize
        image_mean = self.image_mean if image_mean is None else image_mean
        image_std = self.image_std if image_std is None else image_std
        pre_resize_size = self.pre_resize_size if pre_resize_size is None else pre_resize_size
        resized_size = self.resized_size if resized_size is None else resized_size
        do_convert_rgb = self.do_convert_rgb if do_convert_rgb is None else do_convert_rgb

        images = self._preprocess(
            images=images,
            do_resize=do_resize,
            resample=resample,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            pre_resize_size=pre_resize_size,
            resized_size=resized_size,
            do_convert_rgb=do_convert_rgb,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        return BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
|