Provides an HF-compatible interface for `mistral_common.tokens.tokenizers.multimodal.AudioEncoder`.
Source code in vllm/transformers_utils/processors/voxtral.py
class MistralCommonFeatureExtractor:
    """
    Adapter exposing a HF-style feature-extractor surface on top of
    `mistral_common.tokens.tokenizers.multimodal.AudioEncoder`.

    The wrapped encoder is kept on `self.audio_encoder`; the sampling rate,
    frame rate and streaming flag are all read from its `audio_config`.
    """

    def __init__(self, audio_encoder: AudioEncoder) -> None:
        self.audio_encoder = audio_encoder

    @property
    def sampling_rate(self):
        """Sampling rate taken from the wrapped encoder's audio config."""
        config = self.audio_encoder.audio_config
        return config.sampling_rate

    @property
    def frame_rate(self):
        """Frame rate taken from the wrapped encoder's audio config."""
        config = self.audio_encoder.audio_config
        return config.frame_rate

    def _to_padded_tensor(self, audio):
        """Flatten one clip to float32, pad it unless streaming, and wrap
        the result in a tensor."""
        waveform = np.asarray(audio, dtype=np.float32).ravel()
        if self.audio_encoder.audio_config.is_streaming:
            # Streaming configs skip the encoder's padding step.
            return torch.tensor(waveform)
        padded = self.audio_encoder.pad(waveform, self.sampling_rate)
        return torch.tensor(padded)

    def __call__(
        self,
        audios: AudioInput,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Convert one clip or a list of clips into a `BatchFeature`.

        Args:
            audios: A single clip, or a `list` of clips. Anything that is
                not a `list` instance is treated as one clip.
            return_tensors: Forwarded to `BatchFeature` as `tensor_type`.
            **kwargs: Accepted but not used.

        Returns:
            A `BatchFeature` whose `"audio_arrays"` entry holds one
            flattened float32 tensor per clip.
        """
        batch = audios if isinstance(audios, list) else [audios]
        tensors: list[torch.Tensor] = [
            self._to_padded_tensor(clip) for clip in batch
        ]
        features = {"audio_arrays": tensors}
        return BatchFeature(features, tensor_type=return_tensors)

    def get_num_audio_tokens(self, audio_length: int) -> int:
        """
        Return the number of audio tokens for `audio_length`.

        Computed as the ceiling of `audio_length` divided by
        `sampling_rate // frame_rate`.
        NOTE(review): `audio_length` is presumably a sample count - confirm
        against callers.
        """
        samples_per_frame = self.sampling_rate // self.frame_rate
        return ceil(audio_length / samples_per_frame)
|