vllm.config.attention ¶

AttentionConfig ¶

Configuration for attention mechanisms in vLLM.

Source code in vllm/config/attention.py

@config
class AttentionConfig:
    """Configuration for attention mechanisms in vLLM."""

    # --- Backend selection -------------------------------------------------
    backend: AttentionBackendEnum | None = None
    """Attention backend to use. Use "auto" or None for automatic selection."""

    flash_attn_version: Literal[2, 3, 4] | None = None
    """Force vllm to use a specific flash-attention version (2, 3, or 4).
    Only valid when using the flash-attention backend."""

    use_prefill_decode_attention: bool = False
    """Use separate prefill and decode kernels for attention instead of
    the unified triton kernel."""

    # --- CUDA graph split limits ---------------------------------------------
    flash_attn_max_num_splits_for_cuda_graph: int = 32
    """Flash Attention max number splits for cuda graph decode."""

    tq_max_kv_splits_for_cuda_graph: int = 32
    """TurboQuant max NUM_KV_SPLITS for cuda graph decode.
    Fixes the split count so grid dimensions are constant across captures,
    and buffers can be pre-allocated to avoid inflating the memory estimate."""

    # --- FlashInfer / MLA / quantization options -----------------------------
    use_trtllm_attention: bool | None = None
    """If set to True/False, use or don't use the TRTLLM attention backend
    in flashinfer. If None, auto-detect the attention backend in flashinfer."""

    disable_flashinfer_q_quantization: bool = False
    """If set, when using fp8 kv, do not quantize Q to fp8."""

    mla_prefill_backend: MLAPrefillBackendEnum | None = None
    """MLA prefill backend to use. If None, will be selected automatically.
    Valid options: FLASH_ATTN (FA3/FA4), FLASHINFER, TRTLLM_RAGGED."""

    use_prefill_query_quantization: bool = False
    """If set, quantize query for attention in prefill."""

    use_fp4_indexer_cache: bool = False
    """If set, use fp4 indexer cache for dsv32 family model (not support yet)"""

    use_non_causal: bool = False
    """Whether to use non-causal (bidirectional) attention."""

    # --- Flex attention tile / block sizes -----------------------------------
    flex_attn_block_m: int | None = None
    """Triton kernel BLOCK_M tile size for flex attention.
    Must be a power of 2 >= 16. If None and VLLM_BATCH_INVARIANT=1,
    defaults to 16."""

    flex_attn_block_n: int | None = None
    """Triton kernel BLOCK_N tile size for flex attention.
    Must be a power of 2 >= 16. If None and VLLM_BATCH_INVARIANT=1,
    defaults to 16."""

    flex_attn_q_block_size: int | None = None
    """Logical Q block size for the flex attention block mask.
    Must be a power of 2 and divisible by flex_attn_block_m.
    If None, uses the default (16 on PyTorch >= 2.9, 128 otherwise)."""

    flex_attn_kv_block_size: int | None = None
    """Logical KV block size for the flex attention block mask.
    Must be a power of 2 and divisible by flex_attn_block_n.
    If None, uses the default (kv_cache_block_size on PyTorch >= 2.9,
    128 otherwise)."""

    def compute_hash(self) -> str:
        """
        Return a hash identifying the configs that affect the structure
        of the computation graph from input ids/embeddings to the final
        hidden states (anything before the input ids/embeddings or after
        the final hidden states is out of scope).

        The hash is built from this object's hash factors; an empty
        ignore set is passed, so no factor is excluded by name.
        """
        # Imported lazily, as in the rest of this method's history.
        from vllm.config.utils import get_hash_factors, hash_factors

        return hash_factors(get_hash_factors(self, set()))

    @field_validator("backend", mode="before")
    @classmethod
    def validate_backend_before(cls, value: Any) -> Any:
        """Convert a string `backend` value into its enum member.

        Non-string values are returned unchanged. The string "auto"
        (in any letter case) maps to None, which requests automatic
        backend selection; any other string is upper-cased and looked
        up by name in `AttentionBackendEnum`.
        """
        if not isinstance(value, str):
            return value
        if value.lower() == "auto":
            return None
        member_name = value.upper()
        return AttentionBackendEnum[member_name]

    @field_validator("mla_prefill_backend", mode="before")
    @classmethod
    def validate_mla_prefill_backend_before(cls, value: Any) -> Any:
        """Convert a string `mla_prefill_backend` value into its enum member.

        Strings are upper-cased and looked up by name in
        `MLAPrefillBackendEnum`; every other value passes through as is.
        """
        return MLAPrefillBackendEnum[value.upper()] if isinstance(value, str) else value

backend `class-attribute` `instance-attribute` ¶

backend: AttentionBackendEnum | None = None

Attention backend to use. Use "auto" or None for automatic selection.

disable_flashinfer_q_quantization `class-attribute` `instance-attribute` ¶

disable_flashinfer_q_quantization: bool = False

If set, when using fp8 kv, do not quantize Q to fp8.

flash_attn_max_num_splits_for_cuda_graph `class-attribute` `instance-attribute` ¶

flash_attn_max_num_splits_for_cuda_graph: int = 32

Flash Attention max number splits for cuda graph decode.

flash_attn_version `class-attribute` `instance-attribute` ¶

flash_attn_version: Literal[2, 3, 4] | None = None

Force vllm to use a specific flash-attention version (2, 3, or 4). Only valid when using the flash-attention backend.

flex_attn_block_m `class-attribute` `instance-attribute` ¶

flex_attn_block_m: int | None = None

Triton kernel BLOCK_M tile size for flex attention. Must be a power of 2 >= 16. If None and VLLM_BATCH_INVARIANT=1, defaults to 16.

flex_attn_block_n `class-attribute` `instance-attribute` ¶

flex_attn_block_n: int | None = None

Triton kernel BLOCK_N tile size for flex attention. Must be a power of 2 >= 16. If None and VLLM_BATCH_INVARIANT=1, defaults to 16.

flex_attn_kv_block_size `class-attribute` `instance-attribute` ¶

flex_attn_kv_block_size: int | None = None

Logical KV block size for the flex attention block mask. Must be a power of 2 and divisible by flex_attn_block_n. If None, uses the default (kv_cache_block_size on PyTorch >= 2.9, 128 otherwise).

flex_attn_q_block_size `class-attribute` `instance-attribute` ¶

flex_attn_q_block_size: int | None = None

Logical Q block size for the flex attention block mask. Must be a power of 2 and divisible by flex_attn_block_m. If None, uses the default (16 on PyTorch >= 2.9, 128 otherwise).

mla_prefill_backend `class-attribute` `instance-attribute` ¶

mla_prefill_backend: MLAPrefillBackendEnum | None = None

MLA prefill backend to use. If None, will be selected automatically. Valid options: FLASH_ATTN (FA3/FA4), FLASHINFER, TRTLLM_RAGGED.

tq_max_kv_splits_for_cuda_graph `class-attribute` `instance-attribute` ¶

tq_max_kv_splits_for_cuda_graph: int = 32

TurboQuant max NUM_KV_SPLITS for cuda graph decode. Fixes the split count so grid dimensions are constant across captures, and buffers can be pre-allocated to avoid inflating the memory estimate.

use_fp4_indexer_cache `class-attribute` `instance-attribute` ¶

use_fp4_indexer_cache: bool = False

If set, use the fp4 indexer cache for dsv32 family models (not yet supported).

use_non_causal `class-attribute` `instance-attribute` ¶

use_non_causal: bool = False

Whether to use non-causal (bidirectional) attention.

use_prefill_decode_attention `class-attribute` `instance-attribute` ¶

use_prefill_decode_attention: bool = False

Use separate prefill and decode kernels for attention instead of the unified triton kernel.

use_prefill_query_quantization `class-attribute` `instance-attribute` ¶

use_prefill_query_quantization: bool = False

If set, quantize query for attention in prefill.

use_trtllm_attention `class-attribute` `instance-attribute` ¶

use_trtllm_attention: bool | None = None

If set to True/False, use or don't use the TRTLLM attention backend in flashinfer. If None, auto-detect the attention backend in flashinfer.

compute_hash ¶

compute_hash() -> str

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Source code in vllm/config/attention.py

def compute_hash(self) -> str:
    """
    Return a hash identifying the configs that affect the structure
    of the computation graph from input ids/embeddings to the final
    hidden states (anything before the input ids/embeddings or after
    the final hidden states is out of scope).

    The hash is built from this object's hash factors; an empty
    ignore set is passed, so no factor is excluded by name.
    """
    from vllm.config.utils import get_hash_factors, hash_factors

    return hash_factors(get_hash_factors(self, set()))

validate_backend_before `classmethod` ¶

validate_backend_before(value: Any) -> Any

Enable parsing of the backend enum type from string.

The special value "auto" is treated as None, which triggers automatic backend selection.

Source code in vllm/config/attention.py

@field_validator("backend", mode="before")
@classmethod
def validate_backend_before(cls, value: Any) -> Any:
    """Convert a string `backend` value into its enum member.

    Non-string values are returned unchanged. The string "auto"
    (in any letter case) maps to None, which requests automatic
    backend selection; any other string is upper-cased and looked
    up by name in `AttentionBackendEnum`.
    """
    if not isinstance(value, str):
        return value
    if value.lower() == "auto":
        return None
    member_name = value.upper()
    return AttentionBackendEnum[member_name]

validate_mla_prefill_backend_before `classmethod` ¶

validate_mla_prefill_backend_before(value: Any) -> Any

Enable parsing of the mla_prefill_backend enum type from string.

Source code in vllm/config/attention.py

@field_validator("mla_prefill_backend", mode="before")
@classmethod
def validate_mla_prefill_backend_before(cls, value: Any) -> Any:
    """Convert a string `mla_prefill_backend` value into its enum member.

    Strings are upper-cased and looked up by name in
    `MLAPrefillBackendEnum`; every other value passes through as is.
    """
    return MLAPrefillBackendEnum[value.upper()] if isinstance(value, str) else value

vllm.config.attention ¶

AttentionConfig ¶

backend class-attribute instance-attribute ¶

disable_flashinfer_q_quantization class-attribute instance-attribute ¶

flash_attn_max_num_splits_for_cuda_graph class-attribute instance-attribute ¶

flash_attn_version class-attribute instance-attribute ¶

flex_attn_block_m class-attribute instance-attribute ¶

flex_attn_block_n class-attribute instance-attribute ¶

flex_attn_kv_block_size class-attribute instance-attribute ¶

flex_attn_q_block_size class-attribute instance-attribute ¶

mla_prefill_backend class-attribute instance-attribute ¶

tq_max_kv_splits_for_cuda_graph class-attribute instance-attribute ¶

use_fp4_indexer_cache class-attribute instance-attribute ¶

use_non_causal class-attribute instance-attribute ¶

use_prefill_decode_attention class-attribute instance-attribute ¶

use_prefill_query_quantization class-attribute instance-attribute ¶

use_trtllm_attention class-attribute instance-attribute ¶

compute_hash ¶

validate_backend_before classmethod ¶

validate_mla_prefill_backend_before classmethod ¶

backend `class-attribute` `instance-attribute` ¶

disable_flashinfer_q_quantization `class-attribute` `instance-attribute` ¶

flash_attn_max_num_splits_for_cuda_graph `class-attribute` `instance-attribute` ¶

flash_attn_version `class-attribute` `instance-attribute` ¶

flex_attn_block_m `class-attribute` `instance-attribute` ¶

flex_attn_block_n `class-attribute` `instance-attribute` ¶

flex_attn_kv_block_size `class-attribute` `instance-attribute` ¶

flex_attn_q_block_size `class-attribute` `instance-attribute` ¶

mla_prefill_backend `class-attribute` `instance-attribute` ¶

tq_max_kv_splits_for_cuda_graph `class-attribute` `instance-attribute` ¶

use_fp4_indexer_cache `class-attribute` `instance-attribute` ¶

use_non_causal `class-attribute` `instance-attribute` ¶

use_prefill_decode_attention `class-attribute` `instance-attribute` ¶

use_prefill_query_quantization `class-attribute` `instance-attribute` ¶

use_trtllm_attention `class-attribute` `instance-attribute` ¶

validate_backend_before `classmethod` ¶

validate_mla_prefill_backend_before `classmethod` ¶