Skip to content

vllm.config.quantization

QuantSpec

Quantization spec for one layer kind (linear or MoE).

None for either weight or activation means the method class falls back to its own default for that side (typically inherited from the checkpoint, or unquantized for online quantization).

Source code in vllm/config/quantization.py
@config
class QuantSpec:
    """Quantization spec for one layer kind (linear or MoE).

    The spec has two independent sides, `weight` and `activation`, and both
    default to `None`. `None` on either side means the method class falls
    back to its own default for that side (typically inherited from the
    checkpoint, or unquantized for online).
    """

    weight: QuantKeyField = None
    """Weight quantization key, or a name from QUANT_KEY_NAMES."""

    activation: QuantKeyField = None
    """Activation quantization key, or a name from QUANT_KEY_NAMES."""

activation class-attribute instance-attribute

activation: QuantKeyField = None

Activation quantization key, or a name from QUANT_KEY_NAMES.

weight class-attribute instance-attribute

weight: QuantKeyField = None

Weight quantization key, or a name from QUANT_KEY_NAMES.

QuantizationConfigArgs

User-facing quantization configuration.

See docs/features/quantization/online.md for the schema and shorthand string forms accepted on linear and moe.

Source code in vllm/config/quantization.py
@config
class QuantizationConfigArgs:
    """User-facing quantization configuration.

    `linear` and `moe` each take a `QuantSpec` (or `None`); both also accept
    shorthand strings, which `_coerce_spec` expands before validation. See
    `docs/features/quantization/online.md` for the schema and the accepted
    shorthand string forms.
    """

    linear: QuantSpec | None = None
    """Spec applied to ``LinearBase`` layers."""

    moe: QuantSpec | None = None
    """Spec applied to ``FusedMoE`` layers."""

    ignore: list[str] = Field(default_factory=list)
    """Layers to skip quantization for."""

    @field_validator("linear", "moe", mode="before")
    @classmethod
    def _coerce_spec(cls, v: Any, info: ValidationInfo) -> Any:
        """Expand a string given for `linear` / `moe` into a spec.

        Non-string values are handed back untouched. A string that is a key
        of `_ONLINE_SHORTHANDS` resolves to that shorthand's spec for the
        field being validated; any other string is passed through
        `_coerce_quant_key` and used as the weight key of a new `QuantSpec`.

        Raises:
            ValueError: the string names an online shorthand whose entry has
                no spec (`None`) for the field being validated.
        """
        if isinstance(v, str):
            field_name = info.field_name
            assert field_name is not None

            # Not an online shorthand: treat the string as a weight key.
            if v not in _ONLINE_SHORTHANDS:
                return QuantSpec(weight=_coerce_quant_key(v))

            # Online shorthand: pick the spec matching the validated field.
            resolved = getattr(_ONLINE_SHORTHANDS[v], field_name)
            if resolved is not None:
                return resolved
            raise ValueError(
                f"online shorthand {v!r} does not define a {field_name} spec"
            )

        return v

ignore class-attribute instance-attribute

ignore: list[str] = Field(default_factory=list)

Layers to skip quantization for.

linear class-attribute instance-attribute

linear: QuantSpec | None = None

Spec applied to LinearBase layers.

moe class-attribute instance-attribute

moe: QuantSpec | None = None

Spec applied to FusedMoE layers.

resolve_quantization_config

resolve_quantization_config(
    quantization: str | None,
    quantization_config: dict[str, Any]
    | QuantizationConfigArgs
    | None,
) -> QuantizationConfigArgs | None

Resolve --quantization shorthand and --quantization-config into a QuantizationConfigArgs.

quantization is a CLI shorthand that desugars into a base config via _ONLINE_SHORTHANDS. quantization_config is a dict or pre-built args object. When both are given, fields explicitly set in quantization_config take precedence over the shorthand.

Source code in vllm/config/quantization.py
def resolve_quantization_config(
    quantization: str | None,
    quantization_config: dict[str, Any] | QuantizationConfigArgs | None,
) -> QuantizationConfigArgs | None:
    """Build a QuantizationConfigArgs from `--quantization` and
    `--quantization-config`.

    Args:
        quantization: CLI shorthand. When it is one of
            `ONLINE_QUANT_SHORTHAND_NAMES` it desugars into a base config
            looked up in `_ONLINE_SHORTHANDS`.
        quantization_config: A dict of `QuantizationConfigArgs` keyword
            arguments, or an already-built args object.

    Returns:
        `None` when `quantization` is set to something outside
        `ONLINE_QUANT_SHORTHAND_NAMES` and no `quantization_config` is given.
        Otherwise the shorthand's base config, the given config, or, when
        both are present, a new config in which values from
        `quantization_config` take precedence over the shorthand.

    Raises:
        ValueError: `quantization_config` is given together with a
            `quantization` value that is not an online shorthand.
    """
    if quantization is not None:
        if quantization not in ONLINE_QUANT_SHORTHAND_NAMES:
            # Not an online shorthand: there is nothing to resolve here, and
            # an accompanying quantization_config is rejected.
            if quantization_config is None:
                return None
            raise ValueError(
                f"quantization_config is only supported when quantization is "
                f"one of {sorted(ONLINE_QUANT_SHORTHAND_NAMES)}, "
                f"got quantization={quantization!r}"
            )

    shorthand_args = None
    if quantization:
        shorthand_args = _ONLINE_SHORTHANDS.get(quantization)

    if quantization_config is None:
        return shorthand_args

    overrides = (
        QuantizationConfigArgs(**quantization_config)
        if isinstance(quantization_config, dict)
        else quantization_config
    )

    if shorthand_args is None:
        return overrides

    # The merge is truthiness-based: a falsy override (for instance an empty
    # `ignore` list) falls back to the shorthand's value for that field.
    merged = {
        "linear": overrides.linear or shorthand_args.linear,
        "moe": overrides.moe or shorthand_args.moe,
        "ignore": overrides.ignore or shorthand_args.ignore,
    }
    return QuantizationConfigArgs(**merged)