Skip to content

vllm.transformers_utils.configs.openvla

OpenVLA configuration support.

OpenVLA checkpoints use a custom model_type and nest the language model configuration under text_config. This shim lets vLLM load the checkpoint configuration without executing Hugging Face remote code.

OpenVLAConfig

Bases: PretrainedConfig

Configuration class for OpenVLA models.

Source code in vllm/transformers_utils/configs/openvla.py
class OpenVLAConfig(PretrainedConfig):
    """Checkpoint configuration for OpenVLA models.

    Stores the vision-related settings given to the constructor and keeps
    the language model configuration as a nested ``LlamaConfig`` on
    ``text_config``.

    Args:
        timm_model_ids: Stored on ``timm_model_ids``. A falsy value
            (``None`` or an empty list) selects the two built-in default
            identifiers.
        timm_override_act_layers: Stored on ``timm_override_act_layers``.
            A falsy value selects ``[None, None]``.
        image_sizes: Stored on ``image_sizes``. A falsy value selects
            ``[224, 224]``.
        use_fused_vision_backbone: Stored unchanged.
        image_token_index: Stored unchanged.
        n_action_bins: Stored unchanged.
        text_config: ``None`` builds a default ``LlamaConfig``; a ``dict``
            is shallow-copied and expanded into ``LlamaConfig``. In both of
            those cases ``architectures`` defaults to
            ``["LlamaForCausalLM"]``. Any other value is stored as given.
        **kwargs: Forwarded to ``PretrainedConfig.__init__`` once
            ``architectures`` has been defaulted to
            ``["OpenVLAForActionPrediction"]``.
    """

    model_type = "openvla"

    def __init__(
        self,
        timm_model_ids: list[str] | None = None,
        timm_override_act_layers: list[str | None] | None = None,
        image_sizes: list[int] | None = None,
        use_fused_vision_backbone: bool = True,
        image_token_index: int = 32000,
        n_action_bins: int = 256,
        text_config: dict[str, Any] | LlamaConfig | None = None,
        **kwargs: Any,
    ) -> None:
        # Only supply the architecture when the caller did not name one.
        if "architectures" not in kwargs:
            kwargs["architectures"] = ["OpenVLAForActionPrediction"]
        super().__init__(**kwargs)

        # Falsy list arguments (None or empty) fall back to the defaults.
        if not timm_model_ids:
            timm_model_ids = [
                "vit_large_patch14_reg4_dinov2.lvd142m",
                "vit_so400m_patch14_siglip_224",
            ]
        self.timm_model_ids = timm_model_ids

        if not timm_override_act_layers:
            timm_override_act_layers = [None, None]
        self.timm_override_act_layers = timm_override_act_layers

        self.image_sizes = image_sizes if image_sizes else [224, 224]
        self.use_fused_vision_backbone = use_fused_vision_backbone
        self.image_token_index = image_token_index
        self.n_action_bins = n_action_bins

        if isinstance(text_config, dict):
            # Work on a copy so the caller's mapping is never mutated.
            llama_kwargs = dict(text_config)
            if "architectures" not in llama_kwargs:
                llama_kwargs["architectures"] = ["LlamaForCausalLM"]
            text_config = LlamaConfig(**llama_kwargs)
        elif text_config is None:
            text_config = LlamaConfig(architectures=["LlamaForCausalLM"])
        # Any other value (e.g. an existing LlamaConfig) is kept as given.
        self.text_config = text_config