Provide an HF-compatible interface for `mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
Source code in vllm/transformers_utils/processors/pixtral.py
class MistralCommonImageProcessor:
    """
    Adapter that exposes
    `mistral_common.tokens.tokenizers.multimodal.ImageEncoder`
    through an HF-style image-processor interface.

    Instances are callable and return a `BatchFeature`; the patch layout for
    a given image size is available via `get_number_of_image_patches`.
    """

    def __init__(self, mm_encoder: ImageEncoder) -> None:
        # Every method of this class delegates to the wrapped encoder.
        self.mm_encoder = mm_encoder

    def __call__(
        self,
        images: ImageInput,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Encode one image or a list of images.

        Args:
            images: A single image or a `list` of images. Any value that is
                not a `list` is treated as exactly one image.
            return_tensors: Passed to `BatchFeature` as `tensor_type`.
            **kwargs: Accepted but not used by this method.

        Returns:
            A `BatchFeature` built from a dict whose `"images"` entry holds
            one tensor per input image, in input order.
        """
        batch = images if isinstance(images, list) else [images]

        # Each image is wrapped in an ImageChunk, run through the encoder,
        # and the encoder output's `.image` is converted to a tensor.
        encoded = [
            torch.tensor(self.mm_encoder(ImageChunk(image=item)).image)
            for item in batch
        ]

        return BatchFeature({"images": encoded}, tensor_type=return_tensors)

    def get_number_of_image_patches(
        self,
        height: int,
        width: int,
    ) -> tuple[int, int, int]:
        """
        Report the layout the encoder assigns to a `height` x `width` image.

        Args:
            height: Image height passed to `Image.new`.
            width: Image width passed to `Image.new`.

        Returns:
            `(ncols * nrows, nrows, ncols)`, where `(ncols, nrows)` is the
            pair returned by the encoder's `_image_to_num_tokens`.
        """
        # A blank RGB image of the requested size is created only so the
        # encoder can be queried; note the size tuple is (width, height).
        blank = Image.new("RGB", (width, height))
        cols, rows = self.mm_encoder._image_to_num_tokens(blank)
        return cols * rows, rows, cols
|